diff --git a/data_analysis/github_issues_analysis/analysis.ipynb b/data_analysis/github_issues_analysis/analysis.ipynb
new file mode 100644
index 0000000..8271f32
--- /dev/null
+++ b/data_analysis/github_issues_analysis/analysis.ipynb
@@ -0,0 +1,1320 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Data curation of GitHub Issues\n",
+ "\n",
+ "## Preprocessing:\n",
+ " 1- filtering automated text\n",
+ "\n",
+ " 2- filtering non-English text (TODO)\n",
+ "\n",
+ " 3- filtering events from bots\n",
+ "\n",
+ " 4- filtering based on number of users (keep issues with one user only if text length is at least 200 and at most 7000 characters)\n",
+ " \n",
+ " 5- filtering based on number of events (overlaps with previous filter)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import datasets\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "\n",
+ "from utils import (merge_text_columns, remove_bot_comments,\n",
+ " strip_automated_email_text)\n",
+ "\n",
+ "\n",
+ "def get_percentiles(ds, x=[0, 25, 50, 90, 95, 100], text_col=\"text_size\"):\n",
+ " df = pd.DataFrame(\n",
+ " {\n",
+ " \"percentile\": x,\n",
+ " \"user_count\": [int(np.percentile(ds[\"user_count\"], i)) for i in x],\n",
+ " \"event_count\": [int(np.percentile(ds[\"event_count\"], i)) for i in x],\n",
+ " \"text_size\": [int(np.percentile(ds[text_col], i)) for i in x],\n",
+ " }\n",
+ " )\n",
+ " return df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Using custom data configuration bigcode--subset-github-issues-64ef5cdc6c7e0107\n",
+ "Found cached dataset json (/Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "Dataset({\n",
+ " features: ['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events'],\n",
+ " num_rows: 10000\n",
+ "})"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data = datasets.load_dataset(\"bigcode/subset-github-issues\", split=\"train\")\n",
+ "data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Data preprocessing: \n",
+ "\n",
+ "- reformat column name as \"text\" for both description and comments\n",
+ "- remove automated text\n",
+ "- replace usernames\n",
+ "- add number of users and events, and total size of text in the issue (text in comments/description..)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-fd9da4a2ae411309.arrow\n",
+ "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-067b3f10662b791d.arrow\n",
+ "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-3978f2e94b3a379f.arrow\n",
+ "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-c0ca3a0c29be9300.arrow\n",
+ "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-eaf26ba4f8d67172.arrow\n"
+ ]
+ }
+ ],
+ "source": [
+ "data = (\n",
+ " data.map(merge_text_columns)\n",
+ " .map(strip_automated_email_text)\n",
+ " .map(lambda x: {\"user_count\": len(set(event[\"author\"] for event in x[\"events\"]))})\n",
+ " .map(lambda x: {\"event_count\": len(x[\"events\"])})\n",
+ " .map(lambda x: {\"text_size\": sum([len(event[\"text\"]) for event in x[\"events\"]])})\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Dataset({\n",
+ " features: ['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events', 'user_count', 'event_count', 'text_size'],\n",
+ " num_rows: 10000\n",
+ "})"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Removal of events from bots"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-6d9deeb91d0716c0.arrow\n",
+ "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-183f69a510704c03.arrow\n",
+ "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-a34eb93828948a16.arrow\n",
+ "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-9fe60754066fa34b.arrow\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Percentage of issues modified by the bot filter: 29.73%\n",
+ "Removal of 14.74% of issues entirely generated by bots\n",
+ "Removal of: 17.25% events generated by bots\n"
+ ]
+ }
+ ],
+ "source": [
+ "dataset = data.map(remove_bot_comments)\n",
+ "# new event count\n",
+ "dataset = dataset.map(lambda x: {\"event_count_no_bots\": len(x[\"events\"])})\n",
+ "# filter out issues entirely generated by bots\n",
+ "dataset_no_bots = dataset.filter(lambda x: not x[\"bot_issue\"])\n",
+ "# update text size\n",
+ "dataset_no_bots = dataset_no_bots.map(lambda x: {\"text_size_no_bots\": sum([len(event[\"text\"]) for event in x[\"events\"]])})\n",
+ "\n",
+ "# let's see how many issues are modified by the bot filter\n",
+ "modified_by_bot = sum(dataset[\"modified_by_bot\"])\n",
+ "print(f\"Percentage of issues modified by the bot filter: {modified_by_bot * 100 / len(dataset):.2f}%\")\n",
+ "\n",
+ "# let's see how many issues are deleted\n",
+ "print(f\"Removal of {(len(dataset) - len(dataset_no_bots)) * 100 / len(dataset):.2f}% of issues entirely generated by bots\")\n",
+ "\n",
+ "# let's see how many events are deleted\n",
+ "old_number_events = sum(dataset[\"event_count\"])\n",
+ "new_number_events = sum(dataset_no_bots[\"event_count_no_bots\"])\n",
+ "print(f\"Removal of: {(old_number_events - new_number_events) * 100 / old_number_events:.2f}% events generated by bots\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-d27764cd36891200.arrow\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " percentile | \n",
+ " user_count | \n",
+ " event_count | \n",
+ " text_size | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 25 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 2480 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 50 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 4544 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 90 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 12339 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 95 | \n",
+ " 2 | \n",
+ " 4 | \n",
+ " 16387 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 100 | \n",
+ " 4 | \n",
+ " 37 | \n",
+ " 88469 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " percentile user_count event_count text_size\n",
+ "0 0 1 1 0\n",
+ "1 25 1 2 2480\n",
+ "2 50 1 2 4544\n",
+ "3 90 2 3 12339\n",
+ "4 95 2 4 16387\n",
+ "5 100 4 37 88469"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# percentiles of user_count, event_count and text_size for the issues entirely generated by bots\n",
+ "bots_dataset = dataset.filter(lambda x: x[\"bot_issue\"])\n",
+ "get_percentiles(bots_dataset)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "After manual inspection: bot comments are usually long, full of links, and not very useful to the conversation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 140,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " percentile | \n",
+ " user_count | \n",
+ " event_count | \n",
+ " text_size | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 25 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 141 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 50 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 479 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 90 | \n",
+ " 4 | \n",
+ " 8 | \n",
+ " 3016 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 95 | \n",
+ " 4 | \n",
+ " 12 | \n",
+ " 4903 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 100 | \n",
+ " 77 | \n",
+ " 192 | \n",
+ " 279048 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " percentile user_count event_count text_size\n",
+ "0 0 1 1 0\n",
+ "1 25 1 2 141\n",
+ "2 50 2 3 479\n",
+ "3 90 4 8 3016\n",
+ "4 95 4 12 4903\n",
+ "5 100 77 192 279048"
+ ]
+ },
+ "execution_count": 140,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# no bots dataset\n",
+ "get_percentiles(dataset_no_bots, text_col=\"text_size_no_bots\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Statistics about number of users/authors and events in issues"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 139,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " percentile | \n",
+ " user_count | \n",
+ " event_count | \n",
+ " text_size | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 25 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 141 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 50 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 479 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 90 | \n",
+ " 4 | \n",
+ " 8 | \n",
+ " 3016 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 95 | \n",
+ " 4 | \n",
+ " 12 | \n",
+ " 4903 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 96 | \n",
+ " 5 | \n",
+ " 13 | \n",
+ " 5653 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 100 | \n",
+ " 77 | \n",
+ " 192 | \n",
+ " 279048 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " percentile user_count event_count text_size\n",
+ "0 0 1 1 0\n",
+ "1 25 1 2 141\n",
+ "2 50 2 3 479\n",
+ "3 90 4 8 3016\n",
+ "4 95 4 12 4903\n",
+ "5 96 5 13 5653\n",
+ "6 100 77 192 279048"
+ ]
+ },
+ "execution_count": 139,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# no bots dataset\n",
+ "get_percentiles(dataset_no_bots, x=[0, 25, 50, 90, 95, 96, 100], text_col=\"text_size_no_bots\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We want to keep issues with at least 2 users; for those with only one user, we analyze the text size to decide whether to keep them."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 142,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "09f9e41ea0034aa2aad659fa30da3510",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%| | 0/9 [00:00, ?ba/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "22.34% of data removed\n"
+ ]
+ }
+ ],
+ "source": [
+ "ds_user_1 = dataset_no_bots.filter(lambda x: x[\"user_count\"] < 2)\n",
+ "print(f\"{len(ds_user_1) * 100 / len(dataset)}% of data removed\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def print_issue(events):\n",
+ " for event in events:\n",
+ " print(\"-\" * 75)\n",
+ " print(f\"author: {event['author']}, {event['action']} {event['type']}: {event['title']}\")\n",
+ " print(f\"text: {event['text']}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 143,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "a26ebb72e0134d1fbb048cd1f5046636",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%| | 0/3 [00:00, ?ba/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "Dataset({\n",
+ " features: ['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events', 'user_count', 'event_count', 'text_size', 'bot_issue', 'modified_by_bot', 'event_count_no_bots', 'text_size_no_bots'],\n",
+ " num_rows: 371\n",
+ "})"
+ ]
+ },
+ "execution_count": 143,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "short_issues = ds_user_1.filter(lambda x: x[\"text_size_no_bots\"] < 200 and x[\"text_size_no_bots\"] > 100)\n",
+ "short_issues"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 144,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "7563cc55d0984150b677a4dcd9ed99fc",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%| | 0/3 [00:00, ?ba/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "Dataset({\n",
+ " features: ['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events', 'user_count', 'event_count', 'text_size', 'bot_issue', 'modified_by_bot', 'event_count_no_bots', 'text_size_no_bots'],\n",
+ " num_rows: 6\n",
+ "})"
+ ]
+ },
+ "execution_count": 144,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "long_issues = ds_user_1.filter(lambda x: x[\"text_size\"] > 6000 and x[\"text_size\"] < 7000)\n",
+ "long_issues"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 150,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "---------------------------------------------------------------------------\n",
+ "author: rtnpro, opened issue: Improve get API for Channels store\n",
+ "text: Support ``filter``, ``order_by``, ``limit``, ``sort`` queries when fetching Channel entries from stores.\n",
+ "---------------------------------------------------------------------------\n",
+ "author: rtnpro, closed issue: None\n",
+ "text: \n"
+ ]
+ }
+ ],
+ "source": [
+ "print_issue(short_issues[109][\"events\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "After inspecting some issues with a text size above the 96th percentile (7000 characters), we can see that they are mostly of bad quality, e.g. long training logs.\n",
+ "\n",
+ "As for short issues, 200 characters (25th percentile) seems like a good threshold."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 72,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "35e7c7102c2541ef80e6b00ffe633327",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%| | 0/3 [00:00, ?ba/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Issues kept: 47.45%\n"
+ ]
+ }
+ ],
+ "source": [
+ "res = ds_user_1.filter(lambda x: x[\"text_size\"] >= 200 and x[\"text_size\"] <= 7000)\n",
+ "print(f\"Issues kept: {len(res)*100/len(ds_user_1):.2f}%\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 80,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-e6de90f06ace1559.arrow\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "event counst in one user dataset {1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 21}\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "Dataset({\n",
+ " features: ['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events', 'user_count', 'event_count', 'text_size', 'bot_issue', 'modified_by_bot', 'event_count_no_bots'],\n",
+ " num_rows: 2\n",
+ "})"
+ ]
+ },
+ "execution_count": 80,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "print(f\"event counst in one user dataset {set(ds_user_1['event_count'])}\")\n",
+ "# get samples with at least 10 events\n",
+ "res = ds_user_1.filter(lambda x: x[\"event_count\"] >= 10)\n",
+ "res"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print_issue(res[1][\"events\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Issues with one user and more than 10 events are mostly of bad quality or come from bots missed by the bot filter"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Filtering based on number of users"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-df9824c6551f7818.arrow\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "0b169699e9774bf9a7de843f4280fae9",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%| | 0/9 [00:00, ?ba/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "removal of: 13.78% of issues vs 26.20% with users number only filter\n",
+ "removal of: 26.49% of issues compared to the original dataset\n"
+ ]
+ }
+ ],
+ "source": [
+ "from functools import partial\n",
+ "\n",
+ "def filter_based_users(example, minimum=200, maximum=700):\n",
+ " \"\"\" We filter out files with only one user, except if the size\n",
+ " of text in comments is between `minimum` and `maximum` characters (and the issue has at most 10 events).\n",
+ " \"\"\"\n",
+ " if example[\"user_count\"] >= 2:\n",
+ " return True\n",
+ " else:\n",
+ " if example[\"text_size_no_bots\"] >= minimum and example[\"text_size_no_bots\"] <= maximum and example[\"event_count\"] <= 10:\n",
+ " return True\n",
+ " return False\n",
+ "\n",
+ "initial_filter = dataset_no_bots.filter(lambda x: x[\"user_count\"] >= 2)\n",
+ "x = (len(dataset_no_bots) - len(initial_filter)) * 100 / len(dataset_no_bots)\n",
+ "\n",
+ "data_filter_users = dataset_no_bots.filter(partial(filter_based_users, minimum=200, maximum=7000))\n",
+ "print(f\"removal of: {(len(dataset_no_bots) - len(data_filter_users)) * 100 / len(dataset_no_bots):.2f}% of issues vs {x:.2f}% with users number only filter\")\n",
+ "print(f\"removal of: {(len(dataset) - len(data_filter_users)) * 100 / len(dataset):.2f}% of issues compared to the original dataset\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Filtering based on number of events/comments"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We run this filtering after the filtering based on the number of users & bots.\n",
+ "\n",
+ "We follow the same approach as above."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 84,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " percentile | \n",
+ " user_count | \n",
+ " event_count | \n",
+ " text_size | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 25 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 326 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 50 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 779 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 90 | \n",
+ " 4 | \n",
+ " 9 | \n",
+ " 4121 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 95 | \n",
+ " 5 | \n",
+ " 12 | \n",
+ " 6618 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 100 | \n",
+ " 77 | \n",
+ " 192 | \n",
+ " 329077 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " percentile user_count event_count text_size\n",
+ "0 0 1 1 0\n",
+ "1 25 2 2 326\n",
+ "2 50 2 3 779\n",
+ "3 90 4 9 4121\n",
+ "4 95 5 12 6618\n",
+ "5 100 77 192 329077"
+ ]
+ },
+ "execution_count": 84,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_percentiles(data_filter_users)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 173,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "a757f8f3035342209f1c7556d91e676c",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%| | 0/8 [00:00, ?ba/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "removal of: 4.611617467011291% of issues\n"
+ ]
+ }
+ ],
+ "source": [
+ "data_filter_events_1 = data_filter_users.filter(lambda x: x[\"event_count\"] <= 1)\n",
+ "print(f\"removal of: {len(data_filter_events_1) * 100 / len(data_filter_users)}% of issues\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 184,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "---------------------------------------------------------------------------\n",
+ "author: ndmeiri, opened issue: Add documentation\n",
+ "text: The inline documentation in LGSideMenuController.h is incomplete. For example, the properties associated with these getters are undocumented.\n",
+ "```\n",
+ "- (UIViewController *)rootViewController;\n",
+ "- (UIView *)leftView;\n",
+ "- (UIView *)rightView;\n",
+ "```\n",
+ "\n",
+ "Would you please consider documenting these properties and other members of LGSideMenuController?\n"
+ ]
+ }
+ ],
+ "source": [
+ "print_issue(data_filter_events_1[23][\"events\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This case is handled already by the number of users filter"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 153,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "ed2d850c06094a13bd87ee05569a36fa",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%| | 0/8 [00:00, ?ba/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "removal of: 19.46% of issues\n"
+ ]
+ }
+ ],
+ "source": [
+ "data_filter_events = data_filter_users.filter(lambda x: x[\"event_count\"] == 2)\n",
+ "print(f\"removal of: {len(data_filter_events) * 100 / len(dataset)}% of issues\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 156,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " percentile | \n",
+ " user_count | \n",
+ " event_count | \n",
+ " text_size | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 25 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 59 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 50 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 244 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 90 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1088 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 95 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1672 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 100 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 15082 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " percentile user_count event_count text_size\n",
+ "0 0 1 2 0\n",
+ "1 25 2 2 59\n",
+ "2 50 2 2 244\n",
+ "3 90 2 2 1088\n",
+ "4 95 2 2 1672\n",
+ "5 100 2 2 15082"
+ ]
+ },
+ "execution_count": 156,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_percentiles(data_filter_events, text_col=\"text_size_no_bots\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 154,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "d2bd6d1bf4bf4152885d3d5e287a59c5",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%| | 0/2 [00:00, ?ba/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "Dataset({\n",
+ " features: ['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events', 'user_count', 'event_count', 'text_size', 'bot_issue', 'modified_by_bot', 'event_count_no_bots', 'text_size_no_bots'],\n",
+ " num_rows: 75\n",
+ "})"
+ ]
+ },
+ "execution_count": 154,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# filter on text size\n",
+ "data_filter_text_size = data_filter_events.filter(lambda x: x[\"text_size_no_bots\"] <= 50 and x[\"text_size_no_bots\"] >= 30)\n",
+ "data_filter_text_size"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 157,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "---------------------------------------------------------------------------\n",
+ "author: Dan12, opened issue: Project 3 report: fixed formatting\n",
+ "text: Tables were displaying incorrectly.\n",
+ "---------------------------------------------------------------------------\n",
+ "author: sampsyo, created comment: None\n",
+ "text: Thanks!\n"
+ ]
+ }
+ ],
+ "source": [
+ "print_issue(data_filter_text_size[41][\"events\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Short files are of good quality (we already removed the bad ones with one user in previous filter)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 166,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "a5767b4dd3894add83c61d25480aa5ba",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%| | 0/2 [00:00, ?ba/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "Dataset({\n",
+ " features: ['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events', 'user_count', 'event_count', 'text_size', 'bot_issue', 'modified_by_bot', 'event_count_no_bots', 'text_size_no_bots'],\n",
+ " num_rows: 4\n",
+ "})"
+ ]
+ },
+ "execution_count": 166,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "long_issues = data_filter_events.filter(lambda x: x[\"text_size_no_bots\"] > 7000 and x[\"text_size_no_bots\"] < 9000)\n",
+ "long_issues"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print_issue(long_issues[1][\"events\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Long files also look ok => we don't use this filter as most poor quality files were removed by the previous filter\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def print_issue(events):\n",
+ " for event in events:\n",
+ " print(\"-\" * 75)\n",
+ " print(f\"author: {event['author']}, {event['action']} {event['type']}: {event['title']}\")\n",
+ " print(f\"text: {event['text']}\")\n",
+ "\n",
+ "def print_events(events):\n",
+ " event_text = \"\"\n",
+ " for event in events:\n",
+ " event_metadata= f\"author: {event['author']}, {event['action']} {event['type']}: {event['title']}\"\n",
+ " event_text += f\"\\n{event_metadata}\\n{event['text']}\\n{'-' * 75}\\n\"\n",
+ " return event_text\n",
+ "\n",
+ "def print_issues(dataset_tf, n=20, col=\"events\"):\n",
+ " all_issues = \"\"\n",
+ " for i in range(n):\n",
+ " delim = \"=\" * 60 + f\" Issue {i} \"+ \"=\" * 60 + \"\\n\"\n",
+ " issue = print_events(dataset_tf[i][col])\n",
+ " all_issues += delim + issue\n",
+ " return all_issues"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.4"
+ },
+ "orig_nbformat": 4,
+ "vscode": {
+ "interpreter": {
+ "hash": "fd8fde6f83dada9276d12fdb71d773558994168ed1b3bea457b8db38c02aa2e1"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/data_analysis/github_issues_analysis/utils.py b/data_analysis/github_issues_analysis/utils.py
new file mode 100644
index 0000000..dae8d2b
--- /dev/null
+++ b/data_analysis/github_issues_analysis/utils.py
@@ -0,0 +1,144 @@
+import re
+
+import datasets
+import regex
+import torch
+from transformers import pipeline
+
+GITHUB_EMAILS = [  # whole-text patterns; _strip_automated_email_text keeps groups 1 and 3
+    re.compile(pattern, re.DOTALL)  # DOTALL: "." also matches newlines
+    for pattern in [
+        "(.*)From:.+Reply to this email directly.+view it on GitHub(.*)\n?(.*)",
+        "(.*)On.+notifications@github.com.+wrote:.+Reply to this email directly.+view it on GitHub(.*)\n?(.*)",
+        "(.*)Signed-off-by: .+<.+>(.*?)\n?(.*)",
+    ]
+]
+GITHUB_EMAIL_DATE = re.compile(r"\d+/\d+/\d+ \d{2}:\d{2} [AP]M.+wrote")  # raw string: "\d" is not a valid str escape
+GITHUB_EMAIL_LINEBREAK = re.compile("_{20,}")  # run of 20 or more underscores
+
+
+# Author-name patterns used by remove_bot_comments() to detect bot accounts.
+BOT_AUTHORS = [  # exact author names (case-sensitive comparison)
+    "Apache-HBase",
+    "AutorestCI",
+    "CLAassistant",
+    "cmsbuild",
+    "codecov-io",
+    "codecov-commenter",
+    "coveralls",
+    "danger-public",
+    "dnfclas",
+    "msftclas",
+    "PyDocTeur",
+    "SparkQA",
+    "karma-pr-reporter",
+    "claassistantio",
+    "probot-stale",
+]
+# substrings searched case-insensitively anywhere in the author name
+BOT_KEYWORDS = ["[bot]", "botmanager", "bors-", "jenkins", "k8s-", "-test-", "travis"]
+
+BOT_SUFFIXES = [  # name endings; kept lowercase because authors are lowercased first
+    "-automaton",
+    "-automation",
+    "-benchmark",
+    "-build",
+    "-deployer",
+    "-cloud",
+    "bot",
+    "-ci",
+    "-linter",
+    "-teamcity",
+    "-test",
+    "-testing",
+    "-service-account",
+]
+
+
+def merge_text_columns(example):
+    """Fold each event's `comment`/`description` pair into one `text` field.
+
+    An event carries either a description (the body written when the issue is
+    opened) or a comment (a reply), never both, so the first non-empty of the
+    two (comment checked first) becomes `text`; "" is used when both are empty.
+    `example["events"]` is replaced in place and the same `example` is returned.
+    """
+
+    def _merged(event):
+        body = event["comment"] or event["description"] or ""
+        kept = {k: v for k, v in event.items() if k != "comment" and k != "description"}
+        kept["text"] = body
+        return kept
+
+    example["events"] = [_merged(event) for event in example["events"]]
+    return example
+
+
+def _strip_automated_email_text(text):
+    """Removes text auto-generated when users post in issues via email reply.
+
+    Returns the stripped text with the email boilerplate removed ("" for empty
+    or None input). A whole-text match of one of GITHUB_EMAILS is tried first;
+    otherwise a line-by-line scan drops the span from the email header line up
+    to the last footer/quoted (">") line, keeping the content around it.
+    """
+    if not text:
+        return ""
+    text = text.strip()
+    # first pattern that matches wins; groups 1 and 3 are the content to keep
+    m = next(filter(None, (p.match(text) for p in GITHUB_EMAILS)), None)
+    if m:
+        text = m.group(1) + m.group(3)
+    else:
+        lines = text.split("\n")
+        start, end = 0, -1  # defaults remove nothing: lines[:0] + lines[0:]
+        for i, line in enumerate(lines):
+            line = line.strip()
+            if "notifications@github.com" in line or GITHUB_EMAIL_DATE.search(line):
+                start = i
+            if "Reply to this email directly" in line:
+                # a footer ending in ":" also drops the next line (presumably the link)
+                end = i + 1 if line.endswith(":") else i
+            if line.startswith(">"):
+                end = i  # quoted text in replies is removed too
+        # fix: with no end marker after `start`, lines[:start] + lines[end + 1:]
+        # overlapped and duplicated the leading lines; clamp so nothing repeats
+        end = max(end, start - 1)
+        text = "\n".join(lines[:start] + lines[end + 1 :])
+    return GITHUB_EMAIL_LINEBREAK.sub("", text).strip()
+
+
+def strip_automated_email_text(example):
+    """Strip email-reply boilerplate from the `text` field of every event."""
+    # relies on merge_text_columns() having already produced the `text` field
+    cleaned = []
+    for event in example["events"]:
+        event = dict(event)  # shallow copy; other fields pass through as-is
+        if "text" in event:
+            event["text"] = _strip_automated_email_text(event["text"])
+        cleaned.append(event)
+    example["events"] = cleaned
+    return example
+
+
+def remove_bot_comments(example):
+    """Discard auto comments from issues based on author pattern matching.
+
+    Drops every event whose author contains a BOT_KEYWORDS entry, ends with a
+    BOT_SUFFIXES entry (both case-insensitive) or equals a BOT_AUTHORS name.
+    Adds `bot_issue` (no event left) and `modified_by_bot` (any event dropped)
+    to `example`, replaces `example["events"]` and returns the same dict.
+    """
+    def _is_bot(author):
+        name = author.lower()  # lowered once per event
+        return (
+            any(keyword.lower() in name for keyword in BOT_KEYWORDS)
+            # fix: lower the suffix too, else "-Service-Account" never matches
+            or any(name.endswith(suffix.lower()) for suffix in BOT_SUFFIXES)
+            or author in BOT_AUTHORS
+        )
+    kept = [event for event in example["events"] if not _is_bot(event["author"])]
+    example["bot_issue"] = not kept
+    example["modified_by_bot"] = len(kept) != len(example["events"])
+    example["events"] = kept
+    return example