diff --git a/data_analysis/github_issues_analysis/analysis.ipynb b/data_analysis/github_issues_analysis/analysis.ipynb
new file mode 100644
index 0000000..8271f32
--- /dev/null
+++ b/data_analysis/github_issues_analysis/analysis.ipynb
@@ -0,0 +1,1320 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Data curation of GitHub Issues\n",
+    "\n",
+    "## Preprocessing:\n",
+    "  1- filtering automated text\n",
+    "\n",
+    "  2- filtering non-English text (TODO)\n",
+    "\n",
+    "  3- filtering events from bots\n",
+    "\n",
+    "  4- filtering based on number of users (keep issues with one user only if the text length is between 200 and 7000 characters and the issue has at most 10 events)\n",
+    "  \n",
+    "  5- filtering based on number of events (overlaps with previous filter)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import datasets\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "\n",
+    "from utils import (merge_text_columns, remove_bot_comments,\n",
+    "                   strip_automated_email_text)\n",
+    "\n",
+    "\n",
+    "def get_percentiles(ds, x=[0, 25, 50, 90, 95, 100], text_col=\"text_size\"):\n",
+    "    df = pd.DataFrame(\n",
+    "        {\n",
+    "            \"percentile\": x,\n",
+    "            \"user_count\": [int(np.percentile(ds[\"user_count\"], i)) for i in x],\n",
+    "            \"event_count\": [int(np.percentile(ds[\"event_count\"], i)) for i in x],\n",
+    "            \"text_size\": [int(np.percentile(ds[text_col], i)) for i in x],\n",
+    "        }\n",
+    "    )\n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Using custom data configuration bigcode--subset-github-issues-64ef5cdc6c7e0107\n",
+      "Found cached dataset json (/Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "Dataset({\n",
+       "    features: ['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events'],\n",
+       "    num_rows: 10000\n",
+       "})"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data = datasets.load_dataset(\"bigcode/subset-github-issues\", split=\"train\")\n",
+    "data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Data preprocessing: \n",
+    "\n",
+    "- reformat column name as \"text\" for both description and comments\n",
+    "- remove automated text\n",
+    "- replace usernames\n",
+    "- add number of users and events, and total size of text in the issue (text in comments/description..)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-fd9da4a2ae411309.arrow\n",
+      "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-067b3f10662b791d.arrow\n",
+      "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-3978f2e94b3a379f.arrow\n",
+      "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-c0ca3a0c29be9300.arrow\n",
+      "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-eaf26ba4f8d67172.arrow\n"
+     ]
+    }
+   ],
+   "source": [
+    "data = (\n",
+    "    data.map(merge_text_columns)\n",
+    "    .map(strip_automated_email_text)\n",
+    "    .map(lambda x: {\"user_count\": len(set(event[\"author\"] for event in x[\"events\"]))})\n",
+    "    .map(lambda x: {\"event_count\": len(x[\"events\"])})\n",
+    "    .map(lambda x: {\"text_size\": sum([len(event[\"text\"]) for event in x[\"events\"]])})\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Dataset({\n",
+       "    features: ['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events', 'user_count', 'event_count', 'text_size'],\n",
+       "    num_rows: 10000\n",
+       "})"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Removal of events from bots"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-6d9deeb91d0716c0.arrow\n",
+      "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-183f69a510704c03.arrow\n",
+      "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-a34eb93828948a16.arrow\n",
+      "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-9fe60754066fa34b.arrow\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Percentage of issues modified by the bot filter: 29.73%\n",
+      "Removal of 14.74% of issues entirely generated by bots\n",
+      "Removal of: 17.25% events generated by bots\n"
+     ]
+    }
+   ],
+   "source": [
+    "dataset = data.map(remove_bot_comments)\n",
+    "# new event count\n",
+    "dataset = dataset.map(lambda x: {\"event_count_no_bots\": len(x[\"events\"])})\n",
+    "# filter out issues entirely generated by bots\n",
+    "dataset_no_bots = dataset.filter(lambda x: not x[\"bot_issue\"])\n",
+    "# update text size\n",
+    "dataset_no_bots = dataset_no_bots.map(lambda x: {\"text_size_no_bots\": sum([len(event[\"text\"]) for event in x[\"events\"]])})\n",
+    "\n",
+    "# let's see how many issues are modified by the bot filter\n",
+    "modified_by_bot = sum(dataset[\"modified_by_bot\"])\n",
+    "print(f\"Percentage of issues modified by the bot filter: {modified_by_bot * 100 / len(dataset):.2f}%\")\n",
+    "\n",
+    "# let's see how many issues are deleted\n",
+    "print(f\"Removal of {(len(dataset) - len(dataset_no_bots)) * 100 / len(dataset):.2f}% of issues entirely generated by bots\")\n",
+    "\n",
+    "# let's see how many events are deleted\n",
+    "old_number_events = sum(dataset[\"event_count\"])\n",
+    "new_number_events = sum(dataset_no_bots[\"event_count_no_bots\"])\n",
+    "print(f\"Removal of: {(old_number_events - new_number_events) * 100 / old_number_events:.2f}% events generated by bots\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-d27764cd36891200.arrow\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>percentile</th>\n",
+       "      <th>user_count</th>\n",
+       "      <th>event_count</th>\n",
+       "      <th>text_size</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>25</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2480</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>50</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2</td>\n",
+       "      <td>4544</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>90</td>\n",
+       "      <td>2</td>\n",
+       "      <td>3</td>\n",
+       "      <td>12339</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>95</td>\n",
+       "      <td>2</td>\n",
+       "      <td>4</td>\n",
+       "      <td>16387</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>100</td>\n",
+       "      <td>4</td>\n",
+       "      <td>37</td>\n",
+       "      <td>88469</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   percentile  user_count  event_count  text_size\n",
+       "0           0           1            1          0\n",
+       "1          25           1            2       2480\n",
+       "2          50           1            2       4544\n",
+       "3          90           2            3      12339\n",
+       "4          95           2            4      16387\n",
+       "5         100           4           37      88469"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# percentiles of user_count, event_count and text_size for the issues entirely generated by bots\n",
+    "bots_dataset = dataset.filter(lambda x: x[\"bot_issue\"])\n",
+    "get_percentiles(bots_dataset)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "After manual inspection: bot comments are usually long, full of links, and not very useful to the conversation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 140,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>percentile</th>\n",
+       "      <th>user_count</th>\n",
+       "      <th>event_count</th>\n",
+       "      <th>text_size</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>25</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2</td>\n",
+       "      <td>141</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>50</td>\n",
+       "      <td>2</td>\n",
+       "      <td>3</td>\n",
+       "      <td>479</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>90</td>\n",
+       "      <td>4</td>\n",
+       "      <td>8</td>\n",
+       "      <td>3016</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>95</td>\n",
+       "      <td>4</td>\n",
+       "      <td>12</td>\n",
+       "      <td>4903</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>100</td>\n",
+       "      <td>77</td>\n",
+       "      <td>192</td>\n",
+       "      <td>279048</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   percentile  user_count  event_count  text_size\n",
+       "0           0           1            1          0\n",
+       "1          25           1            2        141\n",
+       "2          50           2            3        479\n",
+       "3          90           4            8       3016\n",
+       "4          95           4           12       4903\n",
+       "5         100          77          192     279048"
+      ]
+     },
+     "execution_count": 140,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# no bots dataset\n",
+    "get_percentiles(dataset_no_bots, text_col=\"text_size_no_bots\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Statistics about number of users/authors and events in issues"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 139,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>percentile</th>\n",
+       "      <th>user_count</th>\n",
+       "      <th>event_count</th>\n",
+       "      <th>text_size</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>25</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2</td>\n",
+       "      <td>141</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>50</td>\n",
+       "      <td>2</td>\n",
+       "      <td>3</td>\n",
+       "      <td>479</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>90</td>\n",
+       "      <td>4</td>\n",
+       "      <td>8</td>\n",
+       "      <td>3016</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>95</td>\n",
+       "      <td>4</td>\n",
+       "      <td>12</td>\n",
+       "      <td>4903</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>96</td>\n",
+       "      <td>5</td>\n",
+       "      <td>13</td>\n",
+       "      <td>5653</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>100</td>\n",
+       "      <td>77</td>\n",
+       "      <td>192</td>\n",
+       "      <td>279048</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   percentile  user_count  event_count  text_size\n",
+       "0           0           1            1          0\n",
+       "1          25           1            2        141\n",
+       "2          50           2            3        479\n",
+       "3          90           4            8       3016\n",
+       "4          95           4           12       4903\n",
+       "5          96           5           13       5653\n",
+       "6         100          77          192     279048"
+      ]
+     },
+     "execution_count": 139,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# no bots dataset\n",
+    "get_percentiles(dataset_no_bots, x=[0, 25, 50, 90, 95, 96, 100], text_col=\"text_size_no_bots\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We want to keep issues with at least 2 users. For those with only one user, we analyze the text size to decide whether to keep them or not."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 142,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "09f9e41ea0034aa2aad659fa30da3510",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/9 [00:00<?, ?ba/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "22.34% of data removed\n"
+     ]
+    }
+   ],
+   "source": [
+    "ds_user_1 = dataset_no_bots.filter(lambda x: x[\"user_count\"] < 2)\n",
+    "print(f\"{len(ds_user_1) * 100 / len(dataset)}% of data removed\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def print_issue(events):\n",
+    "    for event in events:\n",
+    "        print(\"-\" * 75)\n",
+    "        print(f\"author: {event['author']}, {event['action']} {event['type']}: {event['title']}\")\n",
+    "        print(f\"text: {event['text']}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 143,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a26ebb72e0134d1fbb048cd1f5046636",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/3 [00:00<?, ?ba/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "Dataset({\n",
+       "    features: ['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events', 'user_count', 'event_count', 'text_size', 'bot_issue', 'modified_by_bot', 'event_count_no_bots', 'text_size_no_bots'],\n",
+       "    num_rows: 371\n",
+       "})"
+      ]
+     },
+     "execution_count": 143,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "short_issues = ds_user_1.filter(lambda x: x[\"text_size_no_bots\"] < 200 and x[\"text_size_no_bots\"] > 100)\n",
+    "short_issues"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 144,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7563cc55d0984150b677a4dcd9ed99fc",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/3 [00:00<?, ?ba/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "Dataset({\n",
+       "    features: ['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events', 'user_count', 'event_count', 'text_size', 'bot_issue', 'modified_by_bot', 'event_count_no_bots', 'text_size_no_bots'],\n",
+       "    num_rows: 6\n",
+       "})"
+      ]
+     },
+     "execution_count": 144,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "long_issues = ds_user_1.filter(lambda x: x[\"text_size\"] > 6000 and x[\"text_size\"] < 7000)\n",
+    "long_issues"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 150,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "---------------------------------------------------------------------------\n",
+      "author: rtnpro, opened issue: Improve get API for Channels store\n",
+      "text: Support ``filter``, ``order_by``, ``limit``, ``sort`` queries when fetching Channel entries from stores.\n",
+      "---------------------------------------------------------------------------\n",
+      "author: rtnpro, closed issue: None\n",
+      "text: \n"
+     ]
+    }
+   ],
+   "source": [
+    "print_issue(short_issues[109][\"events\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "After inspecting some issues with a text size above roughly the 96th percentile (~7000 characters), we can see that they are mostly of bad quality, e.g. long training logs.\n",
+    "\n",
+    "As for short issues, 200 characters (around the 25th percentile) seems like a good threshold."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 72,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "35e7c7102c2541ef80e6b00ffe633327",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/3 [00:00<?, ?ba/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Issues kept: 47.45%\n"
+     ]
+    }
+   ],
+   "source": [
+    "res = ds_user_1.filter(lambda x: x[\"text_size\"] >= 200 and x[\"text_size\"] <= 7000)\n",
+    "print(f\"Issues kept: {len(res)*100/len(ds_user_1):.2f}%\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 80,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-e6de90f06ace1559.arrow\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "event counst in one user dataset {1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 21}\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "Dataset({\n",
+       "    features: ['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events', 'user_count', 'event_count', 'text_size', 'bot_issue', 'modified_by_bot', 'event_count_no_bots'],\n",
+       "    num_rows: 2\n",
+       "})"
+      ]
+     },
+     "execution_count": 80,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "print(f\"event counst in one user dataset {set(ds_user_1['event_count'])}\")\n",
+    "# get samples with at least 10 events\n",
+    "res = ds_user_1.filter(lambda x: x[\"event_count\"] >= 10)\n",
+    "res"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print_issue(res[1][\"events\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Issues with one user and more than 10 events are mostly of bad quality or come from bots missed by the bot filter."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Filtering based on number of users"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-df9824c6551f7818.arrow\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "0b169699e9774bf9a7de843f4280fae9",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/9 [00:00<?, ?ba/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "removal of: 13.78% of issues vs 26.20% with users number only filter\n",
+      "removal of: 26.49% of issues compared to the original dataset\n"
+     ]
+    }
+   ],
+   "source": [
+    "from functools import partial\n",
+    "\n",
+    "def filter_based_users(example, minimum=200, maximum=700):\n",
+    "    \"\"\" We filter out issues with only one user, unless the size of their text\n",
+    "    (bot events excluded) is between `minimum` and `maximum` characters and\n",
+    "    they have at most 10 events.\n",
+    "    \"\"\"\n",
+    "    if example[\"user_count\"] >= 2:\n",
+    "        return True\n",
+    "    else:\n",
+    "        if example[\"text_size_no_bots\"] >= minimum and example[\"text_size_no_bots\"] <= maximum and example[\"event_count\"] <= 10:\n",
+    "            return True\n",
+    "        return False\n",
+    "\n",
+    "initial_filter = dataset_no_bots.filter(lambda x: x[\"user_count\"] >= 2)\n",
+    "x = (len(dataset_no_bots) - len(initial_filter)) * 100 / len(dataset_no_bots)\n",
+    "\n",
+    "data_filter_users = dataset_no_bots.filter(partial(filter_based_users, minimum=200, maximum=7000))\n",
+    "print(f\"removal of: {(len(dataset_no_bots) - len(data_filter_users)) * 100 / len(dataset_no_bots):.2f}% of issues vs {x:.2f}% with users number only filter\")\n",
+    "print(f\"removal of: {(len(dataset) - len(data_filter_users)) * 100 / len(dataset):.2f}% of issues compared to the original dataset\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Filtering based on number of events/comments"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We run this filtering after the filtering based on the number of users & bots.\n",
+    "\n",
+    "We follow the same approach as above."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 84,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>percentile</th>\n",
+       "      <th>user_count</th>\n",
+       "      <th>event_count</th>\n",
+       "      <th>text_size</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>25</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "      <td>326</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>50</td>\n",
+       "      <td>2</td>\n",
+       "      <td>3</td>\n",
+       "      <td>779</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>90</td>\n",
+       "      <td>4</td>\n",
+       "      <td>9</td>\n",
+       "      <td>4121</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>95</td>\n",
+       "      <td>5</td>\n",
+       "      <td>12</td>\n",
+       "      <td>6618</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>100</td>\n",
+       "      <td>77</td>\n",
+       "      <td>192</td>\n",
+       "      <td>329077</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   percentile  user_count  event_count  text_size\n",
+       "0           0           1            1          0\n",
+       "1          25           2            2        326\n",
+       "2          50           2            3        779\n",
+       "3          90           4            9       4121\n",
+       "4          95           5           12       6618\n",
+       "5         100          77          192     329077"
+      ]
+     },
+     "execution_count": 84,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "get_percentiles(data_filter_users)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 173,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a757f8f3035342209f1c7556d91e676c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/8 [00:00<?, ?ba/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "removal of: 4.611617467011291% of issues\n"
+     ]
+    }
+   ],
+   "source": [
+    "data_filter_events_1 = data_filter_users.filter(lambda x: x[\"event_count\"] <= 1)\n",
+    "print(f\"removal of: {len(data_filter_events_1) * 100 / len(data_filter_users)}% of issues\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 184,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "---------------------------------------------------------------------------\n",
+      "author: ndmeiri, opened issue: Add documentation\n",
+      "text: The inline documentation in LGSideMenuController.h is incomplete. For example, the properties associated with these getters are undocumented.\n",
+      "```\n",
+      "- (UIViewController *)rootViewController;\n",
+      "- (UIView *)leftView;\n",
+      "- (UIView *)rightView;\n",
+      "```\n",
+      "\n",
+      "Would you please consider documenting these properties and other members of LGSideMenuController?\n"
+     ]
+    }
+   ],
+   "source": [
+    "print_issue(data_filter_events_1[23][\"events\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This case is already handled by the number-of-users filter."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 153,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ed2d850c06094a13bd87ee05569a36fa",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/8 [00:00<?, ?ba/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "removal of: 19.46% of issues\n"
+     ]
+    }
+   ],
+   "source": [
+    "data_filter_events = data_filter_users.filter(lambda x: x[\"event_count\"] == 2)\n",
+    "print(f\"removal of: {len(data_filter_events) * 100 / len(dataset)}% of issues\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 156,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>percentile</th>\n",
+       "      <th>user_count</th>\n",
+       "      <th>event_count</th>\n",
+       "      <th>text_size</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>25</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "      <td>59</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>50</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "      <td>244</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>90</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1088</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>95</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1672</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>100</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "      <td>15082</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   percentile  user_count  event_count  text_size\n",
+       "0           0           1            2          0\n",
+       "1          25           2            2         59\n",
+       "2          50           2            2        244\n",
+       "3          90           2            2       1088\n",
+       "4          95           2            2       1672\n",
+       "5         100           2            2      15082"
+      ]
+     },
+     "execution_count": 156,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "get_percentiles(data_filter_events, text_col=\"text_size_no_bots\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 154,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d2bd6d1bf4bf4152885d3d5e287a59c5",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/2 [00:00<?, ?ba/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "Dataset({\n",
+       "    features: ['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events', 'user_count', 'event_count', 'text_size', 'bot_issue', 'modified_by_bot', 'event_count_no_bots', 'text_size_no_bots'],\n",
+       "    num_rows: 75\n",
+       "})"
+      ]
+     },
+     "execution_count": 154,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# filter on text size\n",
+    "data_filter_text_size = data_filter_events.filter(lambda x: x[\"text_size_no_bots\"] <= 50 and x[\"text_size_no_bots\"] >= 30)\n",
+    "data_filter_text_size"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 157,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "---------------------------------------------------------------------------\n",
+      "author: Dan12, opened issue: Project 3 report: fixed formatting\n",
+      "text: Tables were displaying incorrectly.\n",
+      "---------------------------------------------------------------------------\n",
+      "author: sampsyo, created comment: None\n",
+      "text: Thanks!\n"
+     ]
+    }
+   ],
+   "source": [
+    "print_issue(data_filter_text_size[41][\"events\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Short issues are of good quality (the poor-quality single-user ones were already removed by the previous filter)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 166,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a5767b4dd3894add83c61d25480aa5ba",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/2 [00:00<?, ?ba/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "Dataset({\n",
+       "    features: ['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events', 'user_count', 'event_count', 'text_size', 'bot_issue', 'modified_by_bot', 'event_count_no_bots', 'text_size_no_bots'],\n",
+       "    num_rows: 4\n",
+       "})"
+      ]
+     },
+     "execution_count": 166,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "long_issues = data_filter_events.filter(lambda x: x[\"text_size_no_bots\"] > 7000 and x[\"text_size_no_bots\"] < 9000)\n",
+    "long_issues"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print_issue(long_issues[1][\"events\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Long issues also look OK, so we do not apply this filter: most poor-quality issues were already removed by the previous filter.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def print_issue(events):\n",
+    "    \"\"\"Print each event: a 75-dash rule, its author/action/type/title line, then its text.\"\"\"\n",
+    "    for ev in events:\n",
+    "        header = f\"author: {ev['author']}, {ev['action']} {ev['type']}: {ev['title']}\"\n",
+    "        print(\"-\" * 75, header, f\"text: {ev['text']}\", sep=\"\\n\")\n",
+    "\n",
+    "def print_events(events):\n",
+    "    \"\"\"Return (not print) all events as one string, each followed by a 75-dash rule.\"\"\"\n",
+    "    def render(ev):\n",
+    "        meta = f\"author: {ev['author']}, {ev['action']} {ev['type']}: {ev['title']}\"\n",
+    "        return f\"\\n{meta}\\n{ev['text']}\\n{'-' * 75}\\n\"\n",
+    "    return \"\".join(render(ev) for ev in events)\n",
+    "\n",
+    "def print_issues(dataset_tf, n=20, col=\"events\"):\n",
+    "    \"\"\"Return the first `n` issues (fewer if the dataset is shorter) as one banner-separated string.\"\"\"\n",
+    "    # cap n at the dataset size: indexing past the last row would raise on a small subset\n",
+    "    n_shown = min(n, len(dataset_tf))\n",
+    "    bar = \"=\" * 60\n",
+    "    chunks = (f\"{bar}   Issue {i}   {bar}\\n\" + print_events(dataset_tf[i][col]) for i in range(n_shown))\n",
+    "    return \"\".join(chunks)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.4"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "fd8fde6f83dada9276d12fdb71d773558994168ed1b3bea457b8db38c02aa2e1"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/data_analysis/github_issues_analysis/utils.py b/data_analysis/github_issues_analysis/utils.py
new file mode 100644
index 0000000..dae8d2b
--- /dev/null
+++ b/data_analysis/github_issues_analysis/utils.py
@@ -0,0 +1,144 @@
+import re
+
+import datasets
+import regex
+import torch
+from transformers import pipeline
+
+# Patterns tried from the start of a message (DOTALL: "." also spans newlines); groups 1 and 3 are kept.
+GITHUB_EMAILS = [
+    re.compile(pattern, re.DOTALL) for pattern in (
+        "(.*)From:.+Reply to this email directly.+view it on GitHub(.*)\n?(.*)",
+        "(.*)On.+notifications@github.com.+wrote:.+Reply to this email directly.+view it on GitHub(.*)\n?(.*)",
+        "(.*)Signed-off-by: .+<.+>(.*?)\n?(.*)",
+    )
+]
+GITHUB_EMAIL_DATE = re.compile(r"\d+/\d+/\d+ \d{2}:\d{2} [AP]M.+wrote")  # raw string: "\d" is not a valid str escape
+GITHUB_EMAIL_LINEBREAK = re.compile("_{20,}")  # a run of 20 or more underscores
+
+
+# Exact, case-sensitive account names of known bots (compared with == in remove_bot_comments()).
+BOT_AUTHORS = [
+    "Apache-HBase",
+    "AutorestCI",
+    "CLAassistant",
+    "cmsbuild",
+    "codecov-io",
+    "codecov-commenter",
+    "coveralls",
+    "danger-public",
+    "dnfclas",
+    "msftclas",
+    "PyDocTeur",
+    "SparkQA",
+    "karma-pr-reporter",
+    "claassistantio",
+    "probot-stale",
+]
+# Substrings that mark a bot anywhere in the author name (compared case-insensitively).
+BOT_KEYWORDS = ["[bot]", "botmanager", "bors-", "jenkins", "k8s-", "-test-", "travis"]
+# Name endings that mark a bot; kept lower-case because the author name is lower-cased before endswith().
+BOT_SUFFIXES = [
+    "-automaton",
+    "-automation",
+    "-benchmark",
+    "-build",
+    "-deployer",
+    "-cloud",
+    "bot",
+    "-ci",
+    "-linter",
+    "-teamcity",
+    "-test",
+    "-testing",
+    "-service-account",
+]
+
+
+def merge_text_columns(example):
+    """Fold each event's `comment` and `description` into a single `text` field.
+
+    A description is the body written when an issue is opened; a comment is a
+    reply to the issue or to another comment. An event never holds both, so the
+    comment is used when non-empty, else the description, else "".
+    """
+    dropped = ("comment", "description")
+    merged_events = []
+    for old in example["events"]:
+        comment, description = old["comment"], old["description"]
+        # every other key is carried over untouched
+        merged = {key: value for key, value in old.items() if key not in dropped}
+        merged["text"] = comment or description or ""
+        merged_events.append(merged)
+    example["events"] = merged_events
+    return example
+
+
+def _strip_automated_email_text(text):
+    """Removes text auto-generated when users post in issues via email reply.
+
+    Empty/None input gives "". If a GITHUB_EMAILS pattern matches the stripped text,
+    its groups 1 and 3 are kept; otherwise the lines from the last e-mail header line
+    through the last footer/quoted line are cut. Runs of 20+ underscores are removed.
+    """
+    if not text:
+        return ""
+    text = text.strip()
+    # try to extract with regex directly; the first pattern that matches wins
+    m = next(filter(None, (p.match(text) for p in GITHUB_EMAILS)), None)
+    if m:
+        text = m.group(1) + m.group(3)
+    else:
+        # no exact match: scan line by line for the span of automated text to cut
+        lines = text.split("\n")
+        start, end = None, -1
+        for i, line in enumerate(map(str.strip, lines)):
+            if "notifications@github.com" in line or GITHUB_EMAIL_DATE.search(line):
+                start = i
+            if "Reply to this email directly" in line:
+                end = i + 1 if line.endswith(":") else i
+            if line.startswith(">"):
+                end = i  # remove quoted text in replies
+        if start is None:
+            start = 0  # no header line: cut from the top through `end` (nothing when end is -1)
+        elif end < start:
+            end = start  # nothing to cut after the header: drop that line only, slices must not overlap
+        text = "\n".join(lines[:start] + lines[end + 1 :])
+    # remove page break line
+    return GITHUB_EMAIL_LINEBREAK.sub("", text).strip()
+
+
+def strip_automated_email_text(example):
+    """Strips e-mail reply boilerplate from the `text` of every event of an issue."""
+    # relies on the single `text` field produced by merge_text_columns()
+    cleaned_events = []
+    for event in example["events"]:
+        cleaned = dict(event)
+        if "text" in cleaned:
+            cleaned["text"] = _strip_automated_email_text(cleaned["text"])
+        cleaned_events.append(cleaned)
+    example["events"] = cleaned_events
+    return example
+
+
+def remove_bot_comments(example):
+    """Discard auto comments from issues based on author pattern matching.
+
+    Drops events whose author matches BOT_KEYWORDS, BOT_SUFFIXES or BOT_AUTHORS, then
+    sets `bot_issue` (no event is left) and `modified_by_bot` (an event was dropped).
+    """
+    filtered_events = []
+    for event in example["events"]:
+        author = event["author"] or ""  # an event without an author is kept instead of raising
+        name = author.lower()
+        is_bot = (
+            any(kw.lower() in name for kw in BOT_KEYWORDS)
+            or any(name.endswith(s.lower()) for s in BOT_SUFFIXES)  # mixed-case suffixes match too
+            or author in BOT_AUTHORS  # exact, case-sensitive
+        )
+        if not is_bot:
+            filtered_events.append(event)
+    example["bot_issue"] = len(filtered_events) == 0
+    example["modified_by_bot"] = len(filtered_events) != len(example["events"])
+    example["events"] = filtered_events
+    return example