## Import files and Initialization

In [None]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

# train_dataset_file_name = 'github-labels-top3-803k-train'
# test_dataset_file_name = 'github-labels-top3-803k-test'
# train_dataset = pd.read_csv(f'../dataset/raw/{train_dataset_file_name}.csv')
# test_dataset = pd.read_csv(f'../dataset/raw/{test_dataset_file_name}.csv')

# dataset = pd.concat([train_dataset, test_dataset], ignore_index=True)
# dataset.to_csv('../dataset/raw/github-labels-top3-803k.csv')

# Directory that receives every preprocessed CSV written further down.
output_path = '../dataset/preprocess'

# Base name (without extension) of the combined raw CSV; the same name is
# reused as the prefix of the generated sample files.
dataset_file_name = 'github-labels-top3-803k'
dataset = pd.read_csv(f'../dataset/raw/{dataset_file_name}.csv')

## Explore the Data

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
dataset.info()

### Get insights into `issue_label`

In [None]:
# Keep the label column at hand for the two summaries below.
issue_label = dataset.issue_label

# Distinct label values, and the share of rows each label accounts for.
label_values = issue_label.unique()
label_shares = issue_label.value_counts(normalize=True)

print(label_values)
print(label_shares)


### Gain insights into the rows where `issue_body` is null

In [None]:
# Select the rows that have no issue body by means of a boolean mask.
series_issue_body = dataset['issue_body']
body_missing = series_issue_body.isna()
dataset_body_null = dataset.loc[body_missing]

# Only the titles are of interest here: show the first 30 of them.
print(dataset_body_null['issue_title'].head(30))

## start processing the training data

#### concatenate `issue_title` and `issue_body` into one feature: `text`.

In [None]:
# A missing value on either side of the concatenation would turn the whole
# `text` cell into NaN. Missing bodies are therefore replaced in place, and
# missing titles are substituted with '' for the concatenation.
dataset.issue_body = dataset.issue_body.fillna('')
dataset['text'] = dataset.issue_title.fillna('') + " " + dataset.issue_body

# info() prints the summary itself and returns None, so it is not wrapped in
# print() (that would add a stray "None" line).
dataset.info()

#### Replace tabs and line breaks in `text` with spaces, collapsing repeated whitespace into a single space

In [None]:
# Every run of tabs, line breaks and spaces is collapsed into one space.
dataset['text'] = dataset['text'].replace(
    to_replace=r'[\t\n\r ]+', value=' ', regex=True)

# Peek at the first 30 cleaned texts.
print(dataset['text'].values[:30])

#### categorize the `issue_label`

In [None]:
# The label column is selected with double brackets so the encoder receives a
# one-column DataFrame; each label is then replaced by its encoded value.
labels = dataset[['issue_label']]
encoder = OrdinalEncoder()
dataset['issue_label'] = encoder.fit_transform(labels)

# First ten encoded labels.
print(dataset['issue_label'][:10])

#### Generate subsets of the whole dataset, which will be used to train our model

1. We will take stratified samples of 0.1%, 1%, 5%, 10%, and 100% of the data to compare the accuracy

In [None]:
def write_data(dataset, percent):
    """Write a stratified sample of ``dataset`` to a CSV file.

    The target file is ``{output_path}/{dataset_file_name}-{percent*100}%.csv``,
    built from the module-level ``output_path`` and ``dataset_file_name``.
    The label distribution and a summary of the written frame are printed
    before the file is saved.

    Args:
        dataset: DataFrame with an ``issue_label`` column, which is used for
            stratification.
        percent: Fraction of the rows to keep (e.g. ``0.01`` for 1%).
            ``1.0`` writes the whole frame without sampling.
    """
    from pathlib import Path

    data_name = f'{output_path}/{dataset_file_name}-{percent*100}%.csv'
    # to_csv does not create missing directories, so make sure the target
    # directory exists before any work is done.
    Path(data_name).parent.mkdir(parents=True, exist_ok=True)

    if percent == 1.0:
        output = dataset
    else:
        # Stratified split keeps the issue_label proportions the same; the
        # "test" side of the split (of size `percent`) is the sample we keep.
        _, output = train_test_split(
            dataset, test_size=percent, stratify=dataset['issue_label'], random_state=42)
    print(output.issue_label.value_counts(normalize=True))
    # info() prints by itself and returns None; print() would add a "None" line.
    output.info()
    output.to_csv(data_name)

# Fractions of the full dataset to export; 1.0 writes the whole dataset.
sample_fractions = (0.001, 0.01, 0.05, 0.1, 1.0)
for percent in sample_fractions:
    write_data(dataset, percent)