## Import files and Initialization

In [129]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

# One-off merge step, kept for provenance: the train and test CSVs were
# concatenated into the single raw file that is loaded below. Uncomment to
# rebuild that file from its two parts.
# NOTE(review): `to_csv` is called here without `index=False`; that is
# presumably where the extra `Unnamed: 0.1` column in the merged file comes
# from -- confirm before regenerating.
#
# train_dataset_file_name = 'github-labels-top3-803k-train'
# test_dataset_file_name = 'github-labels-top3-803k-test'
# train_dataset = pd.read_csv(f'../dataset/raw/{train_dataset_file_name}.csv')
# test_dataset = pd.read_csv(f'../dataset/raw/{test_dataset_file_name}.csv')
#
# dataset = pd.concat([train_dataset, test_dataset], ignore_index=True)
# dataset.to_csv('../dataset/raw/github-labels-top3-803k.csv')

# Base name of the merged raw dataset; it is also reused when naming the
# preprocessed output files.
dataset_file_name = 'github-labels-top3-803k'
raw_dataset_path = f'../dataset/raw/{dataset_file_name}.csv'
dataset = pd.read_csv(raw_dataset_path)

# Directory that receives the preprocessed CSV files.
output_path = '../dataset/preprocess'

## Explore the Data

In [130]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 803417 entries, 0 to 803416
Data columns (total 9 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   Unnamed: 0.1              803417 non-null  int64 
 1   Unnamed: 0                803417 non-null  int64 
 2   issue_url                 803417 non-null  object
 3   issue_label               803417 non-null  object
 4   issue_created_at          803417 non-null  object
 5   issue_author_association  803417 non-null  object
 6   repository_url            803417 non-null  object
 7   issue_title               803417 non-null  object
 8   issue_body                723629 non-null  object
dtypes: int64(2), object(7)
memory usage: 55.2+ MB
None


### get insights into `issue_label`

In [131]:
# Inspect the target column: which classes exist and how they are distributed.
issue_label = dataset['issue_label']

label_classes = issue_label.unique()
label_shares = issue_label.value_counts(normalize=True)  # relative frequencies
print(label_classes)
print(label_shares)


['bug' 'enhancement' 'question']
issue_label
bug            0.499605
enhancement    0.413953
question       0.086442
Name: proportion, dtype: float64


### gain insights into the rows where `issue_body` is null

In [132]:
# Keep only the rows whose body is missing.
series_issue_body = dataset['issue_body']
body_is_missing = series_issue_body.isna()
dataset_body_null = dataset.loc[body_is_missing]

# Show just the titles of the first 30 such rows.
print(dataset_body_null['issue_title'].head(30))

1      "pcopy invite" and "pcopy paste abc:" does not...
2      UI: Modal overlay is half transparent, shouldn...
11                                          Help bubbles
17                   Move force-create flag to post-page
26     If command is extremely quick to process, .tmp...
35               Eliminating the need for create schemas
47                     Catch DiscordHTTPError and re-try
50                                  Upgrade weapon level
70                            Support reading from stdin
72     Add bar chart to staking panel for total ATMOS...
73             Add total subsidy number to staking panel
77     Snowflake connection not being properly parsed...
86     [Bug] Adding and removing notifications can so...
88     [Bug] Starting Flight or Noclip for the first ...
93                           Improving Settings.test.tsx
96         Mac - detect if VeraCrypt volumes are mounted
111                                Anomaly in login code
113                            

## Start processing the training data

#### concatenate `issue_title` and `issue_body` into one feature: `text`.

In [133]:
# Rows without a body should contribute only their title: replace the missing
# values with empty strings so the concatenation below does not produce NaN.
dataset['issue_body'] = dataset['issue_body'].fillna('')
dataset['text'] = dataset['issue_title'] + " " + dataset['issue_body']

# info() prints its report itself and returns None, so it is not wrapped in
# print() (which would append a stray "None" to the output).
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 803417 entries, 0 to 803416
Data columns (total 10 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   Unnamed: 0.1              803417 non-null  int64 
 1   Unnamed: 0                803417 non-null  int64 
 2   issue_url                 803417 non-null  object
 3   issue_label               803417 non-null  object
 4   issue_created_at          803417 non-null  object
 5   issue_author_association  803417 non-null  object
 6   repository_url            803417 non-null  object
 7   issue_title               803417 non-null  object
 8   issue_body                803417 non-null  object
 9   text                      803417 non-null  object
dtypes: int64(2), object(8)
memory usage: 61.3+ MB
None


#### replace tabs and line breaks in `text` with spaces, collapsing each run of whitespace into a single space

In [134]:
# Every run of tabs, newlines, carriage returns and spaces becomes one space.
whitespace_run = r'[\t\n\r ]+'
dataset['text'] = dataset['text'].replace(to_replace=whitespace_run, value=' ', regex=True)

# Peek at the first 30 cleaned texts.
print(dataset['text'].values[:30])

["Welcome screen on every editor window is very tedious I just discovered Gitlens and find the functionality useful, thank you to all who contribute. I have about a dozen editor windows open, and the install process added a Gitlens welcome tab to each and every one of them. Combined with the snowflake effect, all of the sudden VScode was consuming 300-400% cpu and my fan was raging, as soon as I hunted them all down everything was back to fine. The welcome note content is great (although putting it on _all_ the windows is a bit much, don't know how much control you have on that). But overall it was a bit of a sour first-use experience, just wanted to provide that feedback."
 '"pcopy invite" and "pcopy paste abc:" does not check if clipboard exists '
 "UI: Modal overlay is half transparent, shouldn't be "
 'Make the loading screen scale with browser window size Currently the loading wheel is a fixed size in pixels, but it would be better to specify it in terms of percentage of the brows

#### categorize the `issue_label`

In [135]:
# Map each label string to a numeric code. The encoder is fitted on a
# single-column frame (double brackets keep it two-dimensional) and the
# resulting codes are stored as float64.
encoder = OrdinalEncoder()
labels = dataset[['issue_label']]
encoded_labels = encoder.fit_transform(labels)
dataset['issue_label'] = encoded_labels

# First ten encoded labels.
print(dataset['issue_label'].head(10))

0    0.0
1    0.0
2    0.0
3    1.0
4    0.0
5    1.0
6    0.0
7    1.0
8    1.0
9    1.0
Name: issue_label, dtype: float64


#### generate subsets of the whole dataset, which will be used to train our model

1. We will take stratified samples of 0.1%, 1%, 5%, and 100% of the dataset to compare the accuracy

In [157]:
def write_data(dataset, percent):
    """Write a stratified sample of ``dataset`` to a CSV file under ``output_path``.

    Args:
        dataset: DataFrame containing an ``issue_label`` column, which is used
            to stratify the sample.
        percent: Size of the sample, forwarded to ``train_test_split`` as
            ``test_size`` (e.g. ``0.01`` for 1%).

    The file is named ``{dataset_file_name}-{percent * 100}%.csv``. Class
    proportions and a summary of the sample are printed before writing.
    """
    # Round before formatting: a bare ``percent*100`` can carry float noise
    # (0.07*100 == 7.000000000000001) that would leak into the file name.
    percent_label = round(percent * 100, 6)
    data_name = f'{output_path}/{dataset_file_name}-{percent_label}%.csv'

    # Stratify on the label so the sample keeps the class proportions of the
    # full dataset; the other part of the split is discarded.
    _, output = train_test_split(
        dataset, test_size=percent, stratify=dataset['issue_label'], random_state=42)

    print(output.issue_label.value_counts(normalize=True))
    # info() prints its report itself and returns None; wrapping it in print()
    # would append a stray "None" to the output.
    output.info()
    # NOTE: the DataFrame index is written as the first (unnamed) CSV column.
    output.to_csv(data_name)

# Write one stratified sample file per fraction of the full dataset.
sample_fractions = [0.001, 0.01, 0.05]
for percent in sample_fractions:
    write_data(dataset, percent)

issue_label
0.0    0.500000
1.0    0.414179
2.0    0.085821
Name: proportion, dtype: float64
<class 'pandas.core.frame.DataFrame'>
Index: 804 entries, 115477 to 5869
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0.1              804 non-null    int64  
 1   Unnamed: 0                804 non-null    int64  
 2   issue_url                 804 non-null    object 
 3   issue_label               804 non-null    float64
 4   issue_created_at          804 non-null    object 
 5   issue_author_association  804 non-null    object 
 6   repository_url            804 non-null    object 
 7   issue_title               804 non-null    object 
 8   issue_body                804 non-null    object 
 9   text                      804 non-null    object 
dtypes: float64(1), int64(2), object(7)
memory usage: 69.1+ KB
None
issue_label
0.0    0.499564
1.0    0.413939
2.0    0.086497
Name: proport