# Data curation of GitHub Issues

## Preprocessing:
  1- filtering automated text

  2- filtering non-English text (TODO)

  3- filtering events from bots

  4- filtering based on number of users (keep issues with one user only if the text length is at least 200 and at most 7000 characters)
  
  5- filtering based on number of events (overlaps with previous filter)

In [3]:
import datasets
import numpy as np
import pandas as pd

from utils import (merge_text_columns, remove_bot_comments,
                   strip_automated_email_text)


def get_percentiles(ds, x=(0, 25, 50, 90, 95, 100), text_col="text_size"):
    """Summarize user, event and text-size distributions at given percentiles.

    Args:
        ds: Dataset-like object; indexing it with "user_count",
            "event_count" and ``text_col`` must each yield a sequence of
            numbers.
        x: Percentiles (0-100) to evaluate. The default is an immutable
            tuple so it cannot be altered between calls.
        text_col: Name of the column holding the text size.

    Returns:
        A pandas DataFrame with one row per percentile and the columns
        "percentile", "user_count", "event_count" and "text_size". The last
        column is always named "text_size", whichever ``text_col`` was read.
    """
    percentiles = list(x)

    def column_percentiles(column):
        # Read the column once instead of once per percentile, then
        # truncate every percentile value to an int.
        values = ds[column]
        return [int(np.percentile(values, q)) for q in percentiles]

    return pd.DataFrame(
        {
            "percentile": percentiles,
            "user_count": column_percentiles("user_count"),
            "event_count": column_percentiles("event_count"),
            "text_size": column_percentiles(text_col),
        }
    )

In [4]:
data = datasets.load_dataset("bigcode/subset-github-issues", split="train")
data

Using custom data configuration bigcode--subset-github-issues-64ef5cdc6c7e0107
Found cached dataset json (/Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


Dataset({
    features: ['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events'],
    num_rows: 10000
})

## Data preprocessing: 

- reformat column name as "text" for both description and comments
- remove automated text
- replace usernames
- add number of users and events, and total size of text in the issue (text in comments/description..)

In [5]:
# Clean the events, then attach per-issue statistics as new columns.
data = (
    # merge_text_columns / strip_automated_email_text come from utils; their
    # exact behavior is not visible in this notebook.
    data.map(merge_text_columns)
    .map(strip_automated_email_text)
    # user_count: number of distinct event authors in the issue
    .map(lambda x: {"user_count": len(set(event["author"] for event in x["events"]))})
    # event_count: total number of events in the issue
    .map(lambda x: {"event_count": len(x["events"])})
    # text_size: summed character length of the "text" field over all events
    .map(lambda x: {"text_size": sum([len(event["text"]) for event in x["events"]])})
)

Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-fd9da4a2ae411309.arrow
Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-067b3f10662b791d.arrow
Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-3978f2e94b3a379f.arrow
Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-c0ca3a0c29be9300.arrow
Loading 

In [6]:
data

Dataset({
    features: ['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events', 'user_count', 'event_count', 'text_size'],
    num_rows: 10000
})

## Removal of events from bots

In [6]:
# remove_bot_comments comes from utils; presumably it drops bot events and adds
# the "bot_issue" / "modified_by_bot" columns read below (TODO confirm).
dataset = data.map(remove_bot_comments)
# event count after bot events were removed
dataset = dataset.map(lambda x: {"event_count_no_bots": len(x["events"])})
# filter out issues entirely generated by bots
dataset_no_bots = dataset.filter(lambda x: not x["bot_issue"])
# recompute the text size on the remaining events
dataset_no_bots = dataset_no_bots.map(lambda x: {"text_size_no_bots": sum([len(event["text"]) for event in x["events"]])})

# let's see how many issues are modified by the bot filter
modified_by_bot = sum(dataset["modified_by_bot"])
print(f"Percentage of issues modified by the bot filter: {modified_by_bot * 100 / len(dataset):.2f}%")

# let's see how many issues are deleted
print(f"Removal of {(len(dataset) - len(dataset_no_bots)) * 100 / len(dataset):.2f}% of issues entirely generated by bots")

# let's see how many events are deleted; the new total only covers the issues
# kept in dataset_no_bots, so every event of a dropped issue counts as removed
old_number_events = sum(dataset["event_count"])
new_number_events = sum(dataset_no_bots["event_count_no_bots"])
print(f"Removal of: {(old_number_events - new_number_events) * 100 / old_number_events:.2f}% events generated by bots")

Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-6d9deeb91d0716c0.arrow
Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-183f69a510704c03.arrow
Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-a34eb93828948a16.arrow
Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-9fe60754066fa34b.arrow


Percentage of issues modified by the bot filter: 29.73%
Removal of 14.74% of issues entirely generated by bots
Removal of: 17.25% events generated by bots


In [7]:
# percentiles of the dataset of issues generated by bots dataset user_count, event_count and text_size
bots_dataset = dataset.filter(lambda x: x["bot_issue"])
get_percentiles(bots_dataset)

Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-d27764cd36891200.arrow


Unnamed: 0,percentile,user_count,event_count,text_size
0,0,1,1,0
1,25,1,2,2480
2,50,1,2,4544
3,90,2,3,12339
4,95,2,4,16387
5,100,4,37,88469


After manual inspection: these comments are usually long, full of links, and not very useful to the conversation.

In [140]:
# no bots dataset
get_percentiles(dataset_no_bots, text_col="text_size_no_bots")

Unnamed: 0,percentile,user_count,event_count,text_size
0,0,1,1,0
1,25,1,2,141
2,50,2,3,479
3,90,4,8,3016
4,95,4,12,4903
5,100,77,192,279048


## Statistics about number of users/authors and events in issues

In [139]:
# no bots dataset
get_percentiles(dataset_no_bots, x=[0, 25, 50, 90, 95, 96, 100], text_col="text_size_no_bots")

Unnamed: 0,percentile,user_count,event_count,text_size
0,0,1,1,0
1,25,1,2,141
2,50,2,3,479
3,90,4,8,3016
4,95,4,12,4903
5,96,5,13,5653
6,100,77,192,279048


We want to keep issues with at least two users. For issues with a single user, we analyze the text size to decide whether to keep them.

In [142]:
ds_user_1 = dataset_no_bots.filter(lambda x: x["user_count"] < 2)
print(f"{len(ds_user_1) * 100 / len(dataset)}% of data removed")

  0%|          | 0/9 [00:00<?, ?ba/s]

22.34% of data removed


In [24]:
def print_issue(events):
    """Print each event of an issue: a 75-dash rule, a header line, the text."""
    rule = "-" * 75
    for entry in events:
        print(rule)
        headline = f"author: {entry['author']}, {entry['action']} {entry['type']}: {entry['title']}"
        print(headline)
        print(f"text: {entry['text']}")

In [143]:
short_issues = ds_user_1.filter(lambda x: x["text_size_no_bots"] < 200 and x["text_size_no_bots"] > 100)
short_issues

  0%|          | 0/3 [00:00<?, ?ba/s]

Dataset({
    features: ['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events', 'user_count', 'event_count', 'text_size', 'bot_issue', 'modified_by_bot', 'event_count_no_bots', 'text_size_no_bots'],
    num_rows: 371
})

In [144]:
# single-user issues whose text size is strictly between 6000 and 7000
# NOTE(review): this reads "text_size" (computed before bot removal) while the
# short_issues cell reads "text_size_no_bots" - confirm which one is intended
long_issues = ds_user_1.filter(lambda x: x["text_size"] > 6000 and x["text_size"] < 7000)
long_issues

  0%|          | 0/3 [00:00<?, ?ba/s]

Dataset({
    features: ['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events', 'user_count', 'event_count', 'text_size', 'bot_issue', 'modified_by_bot', 'event_count_no_bots', 'text_size_no_bots'],
    num_rows: 6
})

In [150]:
print_issue(short_issues[109]["events"])

---------------------------------------------------------------------------
author: rtnpro, opened issue: Improve get API for Channels store
text: Support ``filter``, ``order_by``, ``limit``, ``sort`` queries when fetching Channel entries from stores.
---------------------------------------------------------------------------
author: rtnpro, closed issue: None
text: 


After visualizing some files with text size higher than 96th percentile (7000 characters), we can see that they are mostly of bad quality like long training logs.

As for short issues, 200 characters (25th percentile) seems like a good threshold.

In [72]:
# share of single-user issues whose text size lies in [200, 7000]
# NOTE(review): this reads "text_size" (computed before bot removal), whereas
# filter_based_users checks "text_size_no_bots" - confirm which one is intended
res = ds_user_1.filter(lambda x: x["text_size"] >= 200 and x["text_size"] <= 7000)
print(f"Issues kept: {len(res)*100/len(ds_user_1):.2f}%")

  0%|          | 0/3 [00:00<?, ?ba/s]

Issues kept: 47.45%


In [80]:
# distinct event counts found among single-user issues
print(f"event counst in one user dataset {set(ds_user_1['event_count'])}")
# get samples with at least 10 events
res = ds_user_1.filter(lambda x: x["event_count"] >= 10)
res

Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-e6de90f06ace1559.arrow


event counst in one user dataset {1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 21}


Dataset({
    features: ['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events', 'user_count', 'event_count', 'text_size', 'bot_issue', 'modified_by_bot', 'event_count_no_bots'],
    num_rows: 2
})

In [None]:
print_issue(res[1]["events"])

Issues with one user and more than 10 events are mostly of bad quality or come from bots that the filter missed.

## Filtering based on number of users

In [9]:
from functools import partial

def filter_based_users(example, minimum=200, maximum=700, max_events=10):
    """Decide whether an issue survives the number-of-users filter.

    Issues with at least two users are always kept. A single-user issue is
    kept only when its bot-free text size lies in ``[minimum, maximum]``
    (both bounds inclusive) and it has at most ``max_events`` events.

    Args:
        example: Mapping with "user_count", "text_size_no_bots" and
            "event_count" entries.
        minimum: Smallest accepted text size, in characters.
        maximum: Largest accepted text size, in characters.
            NOTE(review): the default of 700 looks like a typo for the 7000
            this notebook passes explicitly; it is kept unchanged for
            backward compatibility - confirm.
        max_events: Largest accepted event count for a single-user issue.

    Returns:
        True if the issue should be kept, False otherwise.
    """
    if example["user_count"] >= 2:
        return True
    # Single-user issue: keep it only if it is neither too short nor too
    # long, and not an unusually long one-person thread.
    return bool(
        minimum <= example["text_size_no_bots"] <= maximum
        and example["event_count"] <= max_events
    )

# baseline: drop every single-user issue, whatever its text size
initial_filter = dataset_no_bots.filter(lambda x: x["user_count"] >= 2)
# percentage of bot-free issues that the baseline would remove
x = (len(dataset_no_bots) - len(initial_filter)) * 100 / len(dataset_no_bots)

# actual filter: single-user issues survive if their text size is in [200, 7000]
data_filter_users = dataset_no_bots.filter(partial(filter_based_users, minimum=200, maximum=7000))
# removal relative to the bot-free dataset, next to the baseline percentage
print(f"removal of: {(len(dataset_no_bots) - len(data_filter_users)) * 100 / len(dataset_no_bots):.2f}% of issues vs {x:.2f}% with users number only filter")
# removal relative to `dataset`, i.e. before bot-generated issues were dropped
print(f"removal of: {(len(dataset) - len(data_filter_users)) * 100 / len(dataset):.2f}% of issues compared to the original dataset")

Loading cached processed dataset at /Users/loubnabenallal/.cache/huggingface/datasets/bigcode___json/bigcode--subset-github-issues-64ef5cdc6c7e0107/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-df9824c6551f7818.arrow


  0%|          | 0/9 [00:00<?, ?ba/s]

removal of: 13.78% of issues vs 26.20% with users number only filter
removal of: 26.49% of issues compared to the original dataset


## Filtering based on number of events/comments

We run this filtering after the filtering based on the number of users & bots.

We follow the same approach as above.

In [84]:
get_percentiles(data_filter_users)

Unnamed: 0,percentile,user_count,event_count,text_size
0,0,1,1,0
1,25,2,2,326
2,50,2,3,779
3,90,4,9,4121
4,95,5,12,6618
5,100,77,192,329077


In [173]:
# issues with at most one event; nothing is removed here - the message reports
# the share of data_filter_users that such a filter would drop
data_filter_events_1 = data_filter_users.filter(lambda x: x["event_count"] <= 1)
print(f"removal of: {len(data_filter_events_1) * 100 / len(data_filter_users)}% of issues")

  0%|          | 0/8 [00:00<?, ?ba/s]

removal of: 4.611617467011291% of issues


In [184]:
print_issue(data_filter_events_1[23]["events"])

---------------------------------------------------------------------------
author: ndmeiri, opened issue: Add documentation
text: The inline documentation in LGSideMenuController.h is incomplete. For example, the properties associated with these getters are undocumented.
```
- (UIViewController *)rootViewController;
- (UIView *)leftView;
- (UIView *)rightView;
```

Would you please consider documenting these properties and other members of LGSideMenuController?


This case is handled already by the number of users filter

In [153]:
# issues with exactly two events; nothing is removed here either
# NOTE(review): the share is taken over len(dataset), whereas the <= 1 event
# cell divides by len(data_filter_users) - confirm the intended denominator
data_filter_events = data_filter_users.filter(lambda x: x["event_count"] == 2)
print(f"removal of: {len(data_filter_events) * 100 / len(dataset)}% of issues")

  0%|          | 0/8 [00:00<?, ?ba/s]

removal of: 19.46% of issues


In [156]:
get_percentiles(data_filter_events, text_col="text_size_no_bots")

Unnamed: 0,percentile,user_count,event_count,text_size
0,0,1,2,0
1,25,2,2,59
2,50,2,2,244
3,90,2,2,1088
4,95,2,2,1672
5,100,2,2,15082


In [154]:
# filter on text size
data_filter_text_size = data_filter_events.filter(lambda x: x["text_size_no_bots"] <= 50 and x["text_size_no_bots"] >= 30)
data_filter_text_size

  0%|          | 0/2 [00:00<?, ?ba/s]

Dataset({
    features: ['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events', 'user_count', 'event_count', 'text_size', 'bot_issue', 'modified_by_bot', 'event_count_no_bots', 'text_size_no_bots'],
    num_rows: 75
})

In [157]:
print_issue(data_filter_text_size[41]["events"])

---------------------------------------------------------------------------
author: Dan12, opened issue: Project 3 report: fixed formatting
text: Tables were displaying incorrectly.
---------------------------------------------------------------------------
author: sampsyo, created comment: None
text: Thanks!


Short files are of good quality (we already removed the bad ones with one user in previous filter)

In [166]:
long_issues = data_filter_events.filter(lambda x: x["text_size_no_bots"] > 7000 and x["text_size_no_bots"] < 9000)
long_issues

  0%|          | 0/2 [00:00<?, ?ba/s]

Dataset({
    features: ['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events', 'user_count', 'event_count', 'text_size', 'bot_issue', 'modified_by_bot', 'event_count_no_bots', 'text_size_no_bots'],
    num_rows: 4
})

In [None]:
print_issue(long_issues[1]["events"])

Long files also look ok => we don't use this filter as most poor quality files were removed by the previous filter


In [None]:
def print_issue(events):
    """Print each event of an issue: a 75-dash rule, a header line, the text."""
    divider = "-" * 75
    for item in events:
        print(divider)
        summary = f"author: {item['author']}, {item['action']} {item['type']}: {item['title']}"
        print(summary)
        print(f"text: {item['text']}")

def print_events(events):
    """Return the events of one issue rendered as a single string.

    Each event contributes a blank line, a header line (author, action,
    type, title), its text, and a closing 75-dash rule.
    """
    rule = "-" * 75
    chunks = []
    for entry in events:
        header = f"author: {entry['author']}, {entry['action']} {entry['type']}: {entry['title']}"
        chunks.append(f"\n{header}\n{entry['text']}\n{rule}\n")
    return "".join(chunks)

def print_issues(dataset_tf, n=20, col="events"):
    """Return the rendered text of the first ``n`` issues as one string.

    Every issue is introduced by an "=" banner carrying its index, followed
    by the output of ``print_events`` for the ``col`` entry of that row.
    """
    pad = "=" * 60
    sections = []
    for idx in range(n):
        banner = f"{pad}   Issue {idx}   {pad}\n"
        sections.append(banner + print_events(dataset_tf[idx][col]))
    return "".join(sections)