diff --git a/data_analysis/github_issues_analysis/analysis.ipynb b/data_analysis/github_issues_analysis/analysis.ipynb new file mode 100644 index 0000000..8271f32 --- /dev/null +++ b/data_analysis/github_issues_analysis/analysis.ipynb @@ -0,0 +1,1320 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data curation of GitHub Issues\n", + "\n", + "## Preprocessing:\n", + " 1- filtering automated text\n", + "\n", + " 2- filtering non-English text (TODO)\n", + "\n", + " 3- filtering events from bots\n", + "\n", + " 4- filtering based on number of users (keep issues with one user only if text length is larger than 400 and smaller than 7000)\n", + " \n", + " 5- filtering based on number of events (overlaps with previous filter)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import datasets\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from utils import (merge_text_columns, remove_bot_comments,\n", + " strip_automated_email_text)\n", + "\n", + "\n", + "def get_percentiles(ds, x=[0, 25, 50, 90, 95, 100], text_col=\"text_size\"):\n", + " df = pd.DataFrame(\n", + " {\n", + " \"percentile\": x,\n", + " \"user_count\": [int(np.percentile(ds[\"user_count\"], i)) for i in x],\n", + " \"event_count\": [int(np.percentile(ds[\"event_count\"], i)) for i in x],\n", + " \"text_size\": [int(np.percentile(ds[text_col], i)) for i in x],\n", + " }\n", + " )\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using custom data configuration bigcode--subset-github-issues-64ef5cdc6c7e0107\n", + "Found cached dataset json (/Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)\n" + ] + }, + { + "data": { + "text/plain": 
[ + "Dataset({\n", + " features: ['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events'],\n", + " num_rows: 10000\n", + "})" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = datasets.load_dataset(\"bigcode/subset-github-issues\", split=\"train\")\n", + "data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data preprocessing: \n", + "\n", + "- reformat column name as \"text\" for both description and comments\n", + "- remove automated text\n", + "- replace usernames\n", + "- add number of users and events, and total size of text in the issue (text in comments/description..)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-fd9da4a2ae411309.arrow\n", + "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-067b3f10662b791d.arrow\n", + "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-3978f2e94b3a379f.arrow\n", + "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-c0ca3a0c29be9300.arrow\n", + "Loading cached processed dataset at 
/Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-eaf26ba4f8d67172.arrow\n" + ] + } + ], + "source": [ + "data = (\n", + " data.map(merge_text_columns)\n", + " .map(strip_automated_email_text)\n", + " .map(lambda x: {\"user_count\": len(set(event[\"author\"] for event in x[\"events\"]))})\n", + " .map(lambda x: {\"event_count\": len(x[\"events\"])})\n", + " .map(lambda x: {\"text_size\": sum([len(event[\"text\"]) for event in x[\"events\"]])})\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events', 'user_count', 'event_count', 'text_size'],\n", + " num_rows: 10000\n", + "})" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Removal of events from bots" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-6d9deeb91d0716c0.arrow\n", + "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-183f69a510704c03.arrow\n", + "Loading cached processed dataset at 
/Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-a34eb93828948a16.arrow\n", + "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-9fe60754066fa34b.arrow\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Percentage of issues modified by the bot filter: 29.73%\n", + "Removal of 14.74% of issues entirely generated by bots\n", + "Removal of: 17.25% events generated by bots\n" + ] + } + ], + "source": [ + "dataset = data.map(remove_bot_comments)\n", + "# new event count\n", + "dataset = dataset.map(lambda x: {\"event_count_no_bots\": len(x[\"events\"])})\n", + "# filter out issues entirely generated by bots\n", + "dataset_no_bots = dataset.filter(lambda x: not x[\"bot_issue\"])\n", + "# update text size\n", + "dataset_no_bots = dataset_no_bots.map(lambda x: {\"text_size_no_bots\": sum([len(event[\"text\"]) for event in x[\"events\"]])})\n", + "\n", + "# let's see how many issues are modified by the bot filter\n", + "modified_by_bot = sum(dataset[\"modified_by_bot\"])\n", + "print(f\"Percentage of issues modified by the bot filter: {modified_by_bot * 100 / len(dataset):.2f}%\")\n", + "\n", + "# let's see hwo many issues are deleted\n", + "print(f\"Removal of {(len(dataset) - len(dataset_no_bots)) * 100 / len(dataset):.2f}% of issues entirely generated by bots\")\n", + "\n", + "# let's see how many events are deleted\n", + "old_number_events = sum(dataset[\"event_count\"])\n", + "new_number_events = sum(dataset_no_bots[\"event_count_no_bots\"])\n", + "print(f\"Removal of: {(old_number_events - new_number_events) * 100 / old_number_events:.2f}% events generated by bots\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + 
"metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-d27764cd36891200.arrow\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
percentileuser_countevent_counttext_size
00110
125122480
250124544
3902312339
4952416387
510043788469
\n", + "
" + ], + "text/plain": [ + " percentile user_count event_count text_size\n", + "0 0 1 1 0\n", + "1 25 1 2 2480\n", + "2 50 1 2 4544\n", + "3 90 2 3 12339\n", + "4 95 2 4 16387\n", + "5 100 4 37 88469" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# percentiles of the dataset of issues generated by bots dataset user_count, event_count and text_size\n", + "bots_dataset = dataset.filter(lambda x: x[\"bot_issue\"])\n", + "get_percentiles(bots_dataset)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After manual inspection: comments are usually long full of links & not very useful to the conversation" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
percentileuser_countevent_counttext_size
00110
12512141
25023479
390483016
4954124903
510077192279048
\n", + "
" + ], + "text/plain": [ + " percentile user_count event_count text_size\n", + "0 0 1 1 0\n", + "1 25 1 2 141\n", + "2 50 2 3 479\n", + "3 90 4 8 3016\n", + "4 95 4 12 4903\n", + "5 100 77 192 279048" + ] + }, + "execution_count": 140, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# no bots dataset\n", + "get_percentiles(dataset_no_bots, text_col=\"text_size_no_bots\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Statistics about number of users/authors and events in issues" + ] + }, + { + "cell_type": "code", + "execution_count": 139, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
percentileuser_countevent_counttext_size
00110
12512141
25023479
390483016
4954124903
5965135653
610077192279048
\n", + "
" + ], + "text/plain": [ + " percentile user_count event_count text_size\n", + "0 0 1 1 0\n", + "1 25 1 2 141\n", + "2 50 2 3 479\n", + "3 90 4 8 3016\n", + "4 95 4 12 4903\n", + "5 96 5 13 5653\n", + "6 100 77 192 279048" + ] + }, + "execution_count": 139, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# no bots dataset\n", + "get_percentiles(dataset_no_bots, x=[0, 25, 50, 90, 95, 96, 100], text_col=\"text_size_no_bots\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We want to keep issues with at least two 2 users, for those with one user, we analyze the text size to see if we keep them or not." + ] + }, + { + "cell_type": "code", + "execution_count": 142, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "09f9e41ea0034aa2aad659fa30da3510", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/9 [00:00 100)\n", + "short_issues" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7563cc55d0984150b677a4dcd9ed99fc", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/3 [00:00 6000 and x[\"text_size\"] < 7000)\n", + "long_issues" + ] + }, + { + "cell_type": "code", + "execution_count": 150, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---------------------------------------------------------------------------\n", + "author: rtnpro, opened issue: Improve get API for Channels store\n", + "text: Support ``filter``, ``order_by``, ``limit``, ``sort`` queries when fetching Channel entries from stores.\n", + "---------------------------------------------------------------------------\n", + "author: rtnpro, closed issue: None\n", + "text: \n" + ] + } + ], + "source": [ + "print_issue(short_issues[109][\"events\"])" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "After visualizing some files with text size higher than 96th percentile (7000 characters), we can see that they are mostly of bad quality like long training logs.\n", + "\n", + "As for short issues 200 (25th percentile) seems like a good threshold" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "35e7c7102c2541ef80e6b00ffe633327", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/3 [00:00= 200 and x[\"text_size\"] <= 7000)\n", + "print(f\"Issues kept: {len(res)*100/len(ds_user_1):.2f}%\")" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-e6de90f06ace1559.arrow\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "event counst in one user dataset {1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 21}\n" + ] + }, + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events', 'user_count', 'event_count', 'text_size', 'bot_issue', 'modified_by_bot', 'event_count_no_bots'],\n", + " num_rows: 2\n", + "})" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(f\"event counst in one user dataset {set(ds_user_1['event_count'])}\")\n", + "# get samples with more than 20 events\n", + "res = ds_user_1.filter(lambda x: x[\"event_count\"] >= 10)\n", + "res" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print_issue(res[1][\"events\"])" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "An issue with one user and more than 10 events is mostly of bad quality or missed bots" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Filtering based on number of users" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-df9824c6551f7818.arrow\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0b169699e9774bf9a7de843f4280fae9", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/9 [00:00= 2:\n", + " return True\n", + " else:\n", + " if example[\"text_size_no_bots\"] >= minimum and example[\"text_size_no_bots\"] <= maximum and example[\"event_count\"] <= 10:\n", + " return True\n", + " return False\n", + "\n", + "initial_filter = dataset_no_bots.filter(lambda x: x[\"user_count\"] >= 2)\n", + "x = (len(dataset_no_bots) - len(initial_filter)) * 100 / len(dataset_no_bots)\n", + "\n", + "data_filter_users = dataset_no_bots.filter(partial(filter_based_users, minimum=200, maximum=7000))\n", + "print(f\"removal of: {(len(dataset_no_bots) - len(data_filter_users)) * 100 / len(dataset_no_bots):.2f}% of issues vs {x:.2f}% with users number only filter\")\n", + "print(f\"removal of: {(len(dataset) - len(data_filter_users)) * 100 / len(dataset):.2f}% of issues compared to the original dataset\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Filtering based on number of events/comments" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We run this filtering after the filtering based on the number of users & bots.\n", + "\n", 
+ "We follow the same approach as above." + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
percentileuser_countevent_counttext_size
00110
12522326
25023779
390494121
4955126618
510077192329077
\n", + "
" + ], + "text/plain": [ + " percentile user_count event_count text_size\n", + "0 0 1 1 0\n", + "1 25 2 2 326\n", + "2 50 2 3 779\n", + "3 90 4 9 4121\n", + "4 95 5 12 6618\n", + "5 100 77 192 329077" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_percentiles(data_filter_users)" + ] + }, + { + "cell_type": "code", + "execution_count": 173, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a757f8f3035342209f1c7556d91e676c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/8 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
percentileuser_countevent_counttext_size
00120
1252259
25022244
390221088
495221672
51002215082
\n", + "" + ], + "text/plain": [ + " percentile user_count event_count text_size\n", + "0 0 1 2 0\n", + "1 25 2 2 59\n", + "2 50 2 2 244\n", + "3 90 2 2 1088\n", + "4 95 2 2 1672\n", + "5 100 2 2 15082" + ] + }, + "execution_count": 156, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_percentiles(data_filter_events, text_col=\"text_size_no_bots\")" + ] + }, + { + "cell_type": "code", + "execution_count": 154, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d2bd6d1bf4bf4152885d3d5e287a59c5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/2 [00:00= 30)\n", + "data_filter_text_size" + ] + }, + { + "cell_type": "code", + "execution_count": 157, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---------------------------------------------------------------------------\n", + "author: Dan12, opened issue: Project 3 report: fixed formatting\n", + "text: Tables were displaying incorrectly.\n", + "---------------------------------------------------------------------------\n", + "author: sampsyo, created comment: None\n", + "text: Thanks!\n" + ] + } + ], + "source": [ + "print_issue(data_filter_text_size[41][\"events\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Short files are of good quality (we already removed the bad ones with one user in previous filter)" + ] + }, + { + "cell_type": "code", + "execution_count": 166, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a5767b4dd3894add83c61d25480aa5ba", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/2 [00:00 7000 and x[\"text_size_no_bots\"] < 9000)\n", + "long_issues" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print_issue(long_issues[1][\"events\"])" + 
] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Long files also look ok => we don't use this filter as most poor quality files were removed by the previous filter\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def print_issue(events):\n", + " for event in events:\n", + " print(\"-\" * 75)\n", + " print(f\"author: {event['author']}, {event['action']} {event['type']}: {event['title']}\")\n", + " print(f\"text: {event['text']}\")\n", + "\n", + "def print_events(events):\n", + " event_text = \"\"\n", + " for event in events:\n", + " event_metadata= f\"author: {event['author']}, {event['action']} {event['type']}: {event['title']}\"\n", + " event_text += f\"\\n{event_metadata}\\n{event['text']}\\n{'-' * 75}\\n\"\n", + " return event_text\n", + "\n", + "def print_issues(dataset_tf, n=20, col=\"events\"):\n", + " all_issues = \"\"\n", + " for i in range(n):\n", + " delim = \"=\" * 60 + f\" Issue {i} \"+ \"=\" * 60 + \"\\n\"\n", + " issue = print_events(dataset_tf[i][col])\n", + " all_issues += delim + issue\n", + " return all_issues" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "fd8fde6f83dada9276d12fdb71d773558994168ed1b3bea457b8db38c02aa2e1" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/data_analysis/github_issues_analysis/utils.py b/data_analysis/github_issues_analysis/utils.py new file mode 100644 index 0000000..dae8d2b --- /dev/null +++ b/data_analysis/github_issues_analysis/utils.py @@ -0,0 +1,144 @@ +import re + +import datasets +import regex +import torch 
from transformers import pipeline

# Patterns matched against the *whole* event text (re.DOTALL, anchored at the
# start by `.match`).  `_strip_automated_email_text` keeps only groups 1 and 3
# of a match, i.e. the text before the automated part and the trailing group.
GITHUB_EMAILS = [
    re.compile(pattern, re.DOTALL)
    for pattern in [
        "(.*)From:.+Reply to this email directly.+view it on GitHub(.*)\n?(.*)",
        "(.*)On.+notifications@github.com.+wrote:.+Reply to this email directly.+view it on GitHub(.*)\n?(.*)",
        "(.*)Signed-off-by: .+<.+>(.*?)\n?(.*)",
    ]
]
# Raw string: "\d" in a normal string literal is an invalid escape sequence.
GITHUB_EMAIL_DATE = re.compile(r"\d+/\d+/\d+ \d{2}:\d{2} [AP]M.+wrote")
# A run of 20 or more underscores, removed from the text as a page-break line.
GITHUB_EMAIL_LINEBREAK = re.compile("_{20,}")


# Author names compared for exact (case-sensitive) equality.
BOT_AUTHORS = [
    "Apache-HBase",
    "AutorestCI",
    "CLAassistant",
    "cmsbuild",
    "codecov-io",
    "codecov-commenter",
    "coveralls",
    "danger-public",
    "dnfclas",
    "msftclas",
    "PyDocTeur",
    "SparkQA",
    "karma-pr-reporter",
    "danger-public",
    "claassistantio",
    "probot-stale",
]

# Substrings searched for anywhere in the author name, case-insensitively.
BOT_KEYWORDS = ["[bot]", "botmanager", "bors-", "jenkins", "k8s-", "-test-", "travis"]

# Suffixes tested against the end of the author name, case-insensitively.
BOT_SUFFIXES = [
    "-automaton",
    "-automation",
    "-benchmark",
    "-build",
    "-deployer",
    "-cloud",
    "bot",
    "-ci",
    "-linter",
    "-teamcity",
    "-test",
    "-testing",
    "-Service-Account",
]


def merge_text_columns(example):
    """Combines description and comment to one column (text)

    Descriptions are issue-level text (body of text when opening an issue),
    comments are replies to the parent issue or one of its comments.
    We merge them as an event cannot have both at the same time.

    Args:
        example: a dataset row whose "events" entry is a list of dicts, each
            holding a "comment" and a "description" key.

    Returns:
        The same `example`, with "events" replaced by new dicts in which the
        two text keys are dropped and a "text" key is added.  "text" is the
        comment when it is non-empty, otherwise the description, otherwise "".
    """
    text_columns = ("comment", "description")
    merged_events = []
    for old_event in example["events"]:
        new_event = {
            key: value for key, value in old_event.items() if key not in text_columns
        }
        new_event["text"] = old_event["comment"] or old_event["description"] or ""
        merged_events.append(new_event)
    example["events"] = merged_events
    return example


def _strip_automated_email_text(text):
    """Removes text auto-generated when users post in issues via email reply

    Args:
        text: the event text; may be empty or None.

    Returns:
        The stripped text with the automated e-mail parts removed, or "" when
        `text` is empty or None.
    """
    if not text:
        return ""
    text = text.strip()

    # try to extract with regex directly; `match` is bound up front so an
    # empty pattern list falls through to the line-based path
    match = None
    for pattern in GITHUB_EMAILS:
        match = pattern.match(text)
        if match:
            break

    if match:
        text = match.group(1) + match.group(3)
    else:
        # if no exact matches, apply matching line by line and
        # get potential content before/after automated email text
        lines = text.split("\n")
        # NOTE(review): with these defaults a footer or quoted line found
        # without a header line (start == 0) discards every line up to it,
        # and a header line found without footer/quote (end == -1) makes
        # `lines[end + 1 :]` re-append the whole text.  Left unchanged because
        # fixing it alters the filtered dataset -- confirm intended behaviour.
        start, end = 0, -1
        for i, line in enumerate(lines):
            line = line.strip()
            if "notifications@github.com" in line or GITHUB_EMAIL_DATE.search(line):
                start = i
            if "Reply to this email directly" in line:
                end = i + 1 if line.endswith(":") else i
            if line.startswith(">"):
                # remove quoted text in replies
                end = i
        text = "\n".join(lines[:start] + lines[end + 1 :])
    # remove page break line
    return GITHUB_EMAIL_LINEBREAK.sub("", text).strip()


def strip_automated_email_text(example):
    """Removes auto-generated text from emails in Github issues

    Args:
        example: a dataset row whose "events" entry is a list of dicts.

    Returns:
        The same `example`; every event is rebuilt with its "text" value
        passed through `_strip_automated_email_text`, other keys untouched.
    """
    # assumes merge_text_columns() was already applied on dataset
    example["events"] = [
        {
            key: _strip_automated_email_text(value) if key == "text" else value
            for key, value in event.items()
        }
        for event in example["events"]
    ]
    return example


def _is_bot_author(author):
    """Return True when `author` matches a bot keyword, suffix or exact name."""
    author_lower = author.lower()
    return (
        any(keyword.lower() in author_lower for keyword in BOT_KEYWORDS)
        # Lower-case the suffixes as well: the author is lower-cased, so an
        # entry such as "-Service-Account" could otherwise never match.
        or author_lower.endswith(tuple(suffix.lower() for suffix in BOT_SUFFIXES))
        or author in BOT_AUTHORS
    )


def remove_bot_comments(example):
    """Discard auto comments from issues based on author pattern matching

    Args:
        example: a dataset row whose "events" entry is a list of dicts, each
            holding an "author" key.

    Returns:
        The same `example` with bot-authored events removed from "events" and
        two flags added: "bot_issue" (True when no event is left) and
        "modified_by_bot" (True when at least one event was removed).
    """
    # assumes single `text' field rather than comment/description
    filtered_events = []
    modified = False
    for event in example["events"]:
        # A missing author cannot be pattern-matched; treat it as non-bot
        # instead of failing on `.lower()`.
        if _is_bot_author(event["author"] or ""):
            modified = True
        else:
            filtered_events.append(event)
    example["events"] = filtered_events
    example["bot_issue"] = len(filtered_events) == 0
    example["modified_by_bot"] = modified
    return example