# Import required libraries

In [25]:
#!pip install simpletransformers
from simpletransformers.classification import (
    MultiLabelClassificationModel, MultiLabelClassificationArgs
)
from sklearn.metrics import hamming_loss
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, classification_report
)
import warnings
import json
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# Presets

- SAVE_MODEL_PATH is the path where the output will be saved after training

- DATA_SET_PATH is the path to the github-issues dataset

- MAX_TRAINING_EXAMPLES is the amount of issues that we sample randomly, default is 100,000

- TRAINING_EPOCHS is the epochs that we run for training, default is 5

- labels are the github labels that will be sampled with an exact match

- SHOULD_TRAIN is an indicator wether to run the training task or not, if set to False then it means that a pre-trained model is being loaded for evaluation tasks, (requires LOAD_MODEL_PATH to be set)

- LOAD_MODEL_PATH is the path of pretraining to use. This can be a path on the disk or name of a default pretraining which is a part of huggingface library which will be automatically downloaded, default is 'roberta-base' 

In [2]:
#PRESETS
SAVE_MODEL_PATH = "outputs" # The directory to save the model to after training
DATA_SET_PATH = 'github-issues.csv'
MAX_TRAINING_EXAMPLES = 100000 # Sampleset
TRAINING_EPOCHS = 5
labels = ["bug", "enhancement", "question"] #The labels that we will be working on

SHOULD_TRAIN = True
LOAD_MODEL_PATH = 'roberta-base'

In [None]:
#Load Data. The dataset contains issues with their labels alongwith a language identifier
#from google.colab import drive
#drive.mount('/content/drive')
issueData = pd.read_csv(DATA_SET_PATH)

# Filtering

- Filter out english issues

- Make a feature by combining title and body

- Add column for each class representing its presence of abence (multi-label mapping)



In [4]:
#Extracts data that are in 'en' / english
issueData = issueData[issueData['language'] == 'en'].reset_index()


#Concatenate the body and title of the issues to be used as a single feature
issueData['title_body_concat'] = issueData['body'].astype(str) + " " + issueData['title']
issueData['title_body_concat'] = issueData["title_body_concat"].str.lower()

def multiLabelEncode(labelStrings, column):
    outputList = [0]*len(labelStrings)
    for i in range(0,len(labelStrings)):
        labelList = json.loads(labelStrings[i])
        for j in labelList:
            labelname = j['name:']
            labelname=labelname.replace(" ",'').replace(":","").replace("/","").replace("type","").replace("kind","")
            if (labelname == column):
                outputList[i] = 1
                break
    return outputList

#Generate columns for every label (TODO: Iterate over labels to assign these)
issueData = issueData.assign(bug=lambda x: multiLabelEncode(x['labels'], 'bug'))
issueData = issueData.assign(enhancement=lambda x: multiLabelEncode(x['labels'], 'enhancement'))
issueData = issueData.assign(question=lambda x: multiLabelEncode(x['labels'], 'question'))

cols = issueData.columns

#Print the number of instances in our dataset
for label in labels:
    print(label + ": " + str(len(issueData[issueData[label]==1])))

label_count = len(labels)

#Shuffle the dataset randomly
issueData = issueData.sample(frac=1).reset_index(drop=True)

bug: 117768
enhancement: 87620
question: 17957


# Sampling

Sample MAX_TRAINING_EXAMPLES randomly form the dataset, then filter out the labels from above

In [5]:
randomSample = issueData.sample(n = MAX_TRAINING_EXAMPLES)

slashedData = []

for label in labels:
    slashedData.append(randomSample[randomSample[label]==1])

# Make a concatenated dataframe for working
df = pd.concat(slashedData)

for label in labels:
    print(label + ": " + str(len(df[df[label]==1])))

bug: 18374
enhancement: 13982
question: 3016


# Further preprocessing
Further preprocessing includes resolving any imbalance
- First we find the label with the largest sample count
- Then for every label we check if it has a sample count that is less than half of the samples of the largest label
- If any such label exists, we oversample it equal to the sample set of the largest label

Then we prepare the data by generating khot encoding for the transformer model

In [6]:
# Solve any label imbalance by over sampling #
largest_sampled_label = labels[0]
for label in labels:
    currentLabelSampleCount = len(df[df[label]==1])
    largestLabelSampleCount = len(df[df[largest_sampled_label]==1])
    if (currentLabelSampleCount > largestLabelSampleCount):
        largest_sampled_label = label

#Check if any label has less than half of the largest label's sample count and oversample
largestLabelSampleCount = len(df[df[largest_sampled_label]==1])
for label in labels:
    currentLabelSampleCount = len(df[df[label]==1])
    if (currentLabelSampleCount < (largestLabelSampleCount*0.5)):
        samplesToAdd = largestLabelSampleCount - currentLabelSampleCount
        samplesInOriginalSet = len(issueData[issueData[label]==1])
        if ((samplesToAdd + currentLabelSampleCount) > samplesInOriginalSet):
            samplesToAdd = samplesInOriginalSet - currentLabelSampleCount
        df = df.append(issueData[issueData[label]==1].sample(n = samplesToAdd))
#

#Shuffle the dataframe again
df = df.sample(frac=1).reset_index(drop=True)

total_sample = 0
for label in labels:
    print(label + ": " + str(len(df[df[label]==1])))
    total_sample+= len(df[df[label]==1])

#Convert the labels into one hot encoding vector form
df['k_hot_labels'] = list(df[labels].values)

bug: 18738
enhancement: 14768
question: 17957


# Split data

In [7]:
#Split dataframes into two frames for training and testing with a threshold of 0.8, 0.2
df = df[['title_body_concat','k_hot_labels']]
msk = np.random.rand(len(df)) < 0.8
train = df[msk]
test = df[~msk]
train.columns = ["text", "labels"]
test.columns = ["text", "labels"]

# Model training

In [10]:
# Preparing train data
train_df = train
eval_df = test

# Optional model configuration
model_args = MultiLabelClassificationArgs(
    num_train_epochs=TRAINING_EPOCHS,
    overwrite_output_dir=True,
    save_model_every_epoch=False,
    save_eval_checkpoints=False,
    save_steps=-1,
    output_dir=SAVE_MODEL_PATH
)

# Create a MultiLabelClassificationModel


model = MultiLabelClassificationModel(
    "roberta",
    LOAD_MODEL_PATH,
    num_labels=label_count,
    use_cuda=True,
    args = model_args
)


# Train the model
if (SHOULD_TRAIN==True):
    model.train_model(train_df)

# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(
    eval_df
)

print("LRAP Evaluation:-")
print(result)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForMultiLabelSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForMultiLabelSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'c

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

  0%|          | 0/39602 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/4951 [00:00<?, ?it/s]

  model.parameters(), args.max_grad_norm


Running Epoch 1 of 5:   0%|          | 0/4951 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/4951 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/4951 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/4951 [00:00<?, ?it/s]

  0%|          | 0/9967 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1246 [00:00<?, ?it/s]

LRAP Evaluation:-
{'LRAP': 0.8896107153606863, 'eval_loss': 0.4543792268539365}


In [11]:
# Make predictions with the model
test_list = eval_df['text'].tolist()
test_list_labels = eval_df['labels'].tolist()
predictions, raw_outputs = model.predict(test_list)

  0%|          | 0/9967 [00:00<?, ?it/s]

  0%|          | 0/1246 [00:00<?, ?it/s]

In [12]:
result_df = pd.DataFrame(predictions, columns=labels)
original_df = pd.DataFrame(test_list_labels, columns = labels)

# Classification matrix for every label

In [24]:
for label in labels:
    print("* " + label)
    print("Confusion matrix")
    print(confusion_matrix(original_df[label],result_df[label]))



* bug
Confusion matrix
[[5564  716]
 [ 699 2988]]
* enhancement
Confusion matrix
[[6223  728]
 [ 723 2293]]
* question
Confusion matrix
[[5739  636]
 [ 715 2877]]


# Print hamming loss for every label

In [14]:
for label in labels:
    print("Hamming loss " + label + ": " +str(hamming_loss(original_df[label],result_df[label])))


Hamming loss bug: 0.14196849603692185
Hamming loss enhancement: 0.1455804153707234
Hamming loss question: 0.13554730611016355


# Classification report
Print all the evaluation metrices for the model

In [27]:
original_onehot = []
predicted_onehot = []


original_onehot = list(original_df[labels].values)
predicted_onehot = list(result_df[labels].values)
print(classification_report(original_onehot,predicted_onehot, zero_division=0, target_names=labels))

              precision    recall  f1-score   support

         bug       0.81      0.81      0.81      3687
 enhancement       0.76      0.76      0.76      3016
    question       0.82      0.80      0.81      3592

   micro avg       0.80      0.79      0.79     10295
   macro avg       0.79      0.79      0.79     10295
weighted avg       0.80      0.79      0.79     10295
 samples avg       0.80      0.80      0.80     10295

