## Import files and Initialization

In [38]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split

# Directory that receives the preprocessed CSV files written at the end of
# the notebook.
output_path = '../dataset/preprocess'

# Base name (without extension) of the raw training CSV.
train_dataset_file_name = 'github-labels-top3-803k-train'
raw_csv_path = '../dataset/raw/' + train_dataset_file_name + '.csv'

# NOTE(review): the loaded frame contains an `Unnamed: 0` column, which
# presumably is an index saved with the CSV -- confirm before relying on it.
train_dataset = pd.read_csv(raw_csv_path)

# Non-null count of every column, to see how much data is missing.
print('train dataset counts:', train_dataset.count(), sep='\n')

train dataset counts:
Unnamed: 0                  722899
issue_url                   722899
issue_label                 722899
issue_created_at            722899
issue_author_association    722899
repository_url              722899
issue_title                 722899
issue_body                  651027
dtype: int64


## Start processing the training data

#### Step1: drop rows with an empty or NaN `issue_body` or `issue_title`

In [39]:
# Rows with a missing (NaN) title or body are dropped first.
train_dataset = train_dataset.dropna(subset=['issue_title', 'issue_body'])

# A field holding only whitespace is as empty as a missing one. The check is
# applied to the title as well as the body so that both columns named in
# this step are actually cleaned.
has_title = train_dataset['issue_title'].str.strip() != ''
has_body = train_dataset['issue_body'].str.strip() != ''
train_dataset = train_dataset[has_title & has_body]

print(train_dataset.count())

Unnamed: 0                  650659
issue_url                   650659
issue_label                 650659
issue_created_at            650659
issue_author_association    650659
repository_url              650659
issue_title                 650659
issue_body                  650659
dtype: int64


#### Step2: drop rows whose label is not one of [bug, enhancement, question]

In [40]:
# Label values a row must carry in order to be kept.
labels = ['bug', 'enhancement', 'question']

# Boolean mask: True where the row's label is one of the accepted values.
has_known_label = train_dataset['issue_label'].isin(labels)
train_dataset = train_dataset.loc[has_known_label]

print(train_dataset.count())

Unnamed: 0                  650659
issue_url                   650659
issue_label                 650659
issue_created_at            650659
issue_author_association    650659
repository_url              650659
issue_title                 650659
issue_body                  650659
dtype: int64


#### Step3: concatenate `issue_title` and `issue_body` into a single column, `issue_data`.

In [41]:
# Title and body joined by a single space form the combined text column.
issue_text = train_dataset['issue_title'] + " " + train_dataset['issue_body']

# Append the combined column, then discard the two columns it was built from.
train_dataset = (
    train_dataset
    .assign(issue_data=issue_text)
    .drop(columns=['issue_title', 'issue_body'])
)

print(train_dataset.count())

Unnamed: 0                  650659
issue_url                   650659
issue_label                 650659
issue_created_at            650659
issue_author_association    650659
repository_url              650659
issue_data                  650659
dtype: int64


#### Step4: replace tabs and line breaks in `issue_data` with spaces, then collapse repeated whitespace into a single space

In [42]:
# Any run of tabs, line feeds, carriage returns and spaces becomes one space.
whitespace_run = r'[\t\n\r ]+'
train_dataset['issue_data'] = train_dataset['issue_data'].replace(
    to_replace=whitespace_run, value=' ', regex=True
)

# Show the first cleaned entry as a quick sanity check.
print(train_dataset['issue_data'].values[:1])

["Welcome screen on every editor window is very tedious I just discovered Gitlens and find the functionality useful, thank you to all who contribute. I have about a dozen editor windows open, and the install process added a Gitlens welcome tab to each and every one of them. Combined with the snowflake effect, all of the sudden VScode was consuming 300-400% cpu and my fan was raging, as soon as I hunted them all down everything was back to fine. The welcome note content is great (although putting it on _all_ the windows is a bit much, don't know how much control you have on that). But overall it was a bit of a sour first-use experience, just wanted to provide that feedback."]


#### Step5: split the data into training, validation and test

1. 85% of the data for training, of which 15% is held out as validation data (about 72% training and 13% validation overall)
2. 15% test data

In [43]:
# Both splits hold out the same fraction and use the same seed.
split_options = dict(test_size=0.15, random_state=42)

# First carve the test set out of the full data, then carve the validation
# set out of what remains.
train_and_validation, test = train_test_split(train_dataset, **split_options)
train, validation = train_test_split(train_and_validation, **split_options)

def write_data(data_frame, suffix):
    """Write ``data_frame`` to a CSV file under ``output_path`` and print its name.

    The file is named ``github-labels-<N>k-<suffix>.csv``, where ``<N>`` is the
    number of rows divided by 1000, rounded down.

    Args:
        data_frame: pandas DataFrame to save.
        suffix: Text identifying the data set (for example ``'train'``); it
            becomes the last part of the file name.
    """
    # to_csv does not create missing directories, so make sure the target
    # directory exists before writing.
    os.makedirs(output_path, exist_ok=True)

    data_counts = len(data_frame)
    data_name = f'{output_path}/github-labels-{data_counts // 1000}k-{suffix}.csv'
    data_frame.to_csv(data_name)
    print(f'write file: {data_name}')

# Write the whole processed data set first, followed by the three splits.
for frame, suffix in (
    (train_dataset, 'all'),
    (train, 'train'),
    (validation, 'validation'),
    (test, 'test'),
):
    write_data(frame, suffix)

write file: ../dataset/preprocess/github-labels-650k-all.csv
write file: ../dataset/preprocess/github-labels-470k-train.csv
write file: ../dataset/preprocess/github-labels-82k-validation.csv
write file: ../dataset/preprocess/github-labels-97k-test.csv
