# Evaluate the fine-tuned model
In this notebook, we evaluate the previously fine-tuned model on the test data.

In [None]:
import pprint
import random
import gensim
import torch
import pandas as pd
import numpy as np
import transformers
from sklearn.metrics import recall_score, precision_score, f1_score, matthews_corrcoef
from transformers import BertTokenizer, BertForSequenceClassification

# One shared seed for every random number generator touched by this notebook.
seed = 42

# Seed Python's `random`, NumPy, and PyTorch (CPU, current CUDA device, and all
# CUDA devices) with the same value, in that order.
for seed_rng in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_rng(seed)

# Request deterministic cuDNN kernels, then let transformers apply its own
# seeding on top.
torch.backends.cudnn.deterministic = True
transformers.set_seed(seed)

In [None]:
def preprocess(row):
    """Return the single-line text representation of one issue.

    The issue title and body are converted to strings and joined with a
    space, runs of whitespace are collapsed via gensim's
    ``strip_multiple_whitespaces``, and any remaining Windows or Unix line
    breaks are replaced by a space.

    Args:
        row: Mapping-like record providing ``'issue_title'`` and
            ``'issue_body'`` entries.

    Returns:
        The cleaned text as a ``str``.
    """
    title, body = str(row['issue_title']), str(row['issue_body'])
    collapsed = gensim.parsing.preprocessing.strip_multiple_whitespaces(title + " " + body)
    return collapsed.replace('\r\n', ' ').replace('\n', ' ')

# Integer class id for each supported issue label.
_LABEL_TO_ID = {'bug': 0, 'enhancement': 1, 'question': 2}


def labelnum(row):
    """Map the textual issue label of a record to its integer class id.

    Args:
        row: Mapping-like record providing an ``'issue_label'`` entry.

    Returns:
        ``0`` for ``'bug'``, ``1`` for ``'enhancement'``, ``2`` for
        ``'question'``.

    Raises:
        ValueError: If the label is not one of the three supported values.
            ``ValueError`` derives from ``Exception``, so callers that
            catch ``Exception`` keep working.
    """
    label = row['issue_label']
    try:
        return _LABEL_TO_ID[label]
    except (KeyError, TypeError):
        # KeyError: unknown label; TypeError: unhashable value in the column.
        # Name the offending value so bad rows are easy to locate.
        raise ValueError(
            'no such type: {!r} (expected one of {})'.format(label, sorted(_LABEL_TO_ID))
        ) from None

In [None]:
# Full evaluation: run the fine-tuned model over the complete test CSV, derive
# the predicted class per issue, and write the metrics to ../data/results.csv.
MODEL_PATH = '../models/nlbse/'

# Prefer the GPU, but fall back to the CPU so the cell still runs (slowly) on
# machines without CUDA instead of crashing.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = BertForSequenceClassification.from_pretrained(MODEL_PATH)
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)

# Move the model to the target device once, not on every loop iteration.
model.to(device)
model.eval()

df = pd.read_csv('../data/github-labels-top3-803k-test.csv')

df['text_no_newlines'] = df.apply(preprocess, axis=1)
df['label'] = df.apply(labelnum, axis=1)

X = df['text_no_newlines'].values
y_test = df['label'].astype(int).values

data_len = len(df)
y_probs = []
with torch.no_grad():
    # Issues are scored one at a time, each truncated to at most 128 tokens.
    for X_row in X:
        inputs = tokenizer(X_row, padding=True, truncation=True, max_length=128, return_tensors='pt').to(device)
        outputs = model(**inputs)
        # First output element is taken to be the logits (no labels are passed);
        # softmax over dim 1 turns it into per-class probabilities.
        probs = outputs[0].softmax(1).cpu().detach().numpy()
        y_probs.append(probs)

        # Progress report every 1000 processed issues.
        if len(y_probs) % 1000 == 0:
            print('{}/{}'.format(len(y_probs), data_len))

# Predicted class = index of the highest probability for each issue.
y_pred = [y_prob.argmax() for y_prob in y_probs]

# Class names in class-id order (matches the ids assigned by labelnum).
class_names = ['bug', 'enhancement', 'question']
class_ids = list(range(len(class_names)))

metrics = {
    'model': 'seBERT',
    'mcc': matthews_corrcoef(y_true=y_test, y_pred=y_pred),
    'micro_f1': f1_score(y_true=y_test, y_pred=y_pred, average='micro'),
    'micro_precision': precision_score(y_true=y_test, y_pred=y_pred, average='micro'),
    'micro_recall': recall_score(y_true=y_test, y_pred=y_pred, average='micro'),
    'macro_f1': f1_score(y_true=y_test, y_pred=y_pred, average='macro'),
    'macro_precision': precision_score(y_true=y_test, y_pred=y_pred, average='macro'),
    'macro_recall': recall_score(y_true=y_test, y_pred=y_pred, average='macro'),
}

# Per-class scores: one pass per metric with average=None returns one value
# per requested label, in the order of `class_ids`.
per_class_scores = [
    ('precision', precision_score(y_true=y_test, y_pred=y_pred, average=None, labels=class_ids)),
    ('recall', recall_score(y_true=y_test, y_pred=y_pred, average=None, labels=class_ids)),
    ('f1', f1_score(y_true=y_test, y_pred=y_pred, average=None, labels=class_ids)),
]
for metric_name, scores in per_class_scores:
    for class_id, class_name in enumerate(class_names):
        metrics['{}_{}'.format(metric_name, class_name)] = scores[class_id]

results = [metrics]

pprint.pprint(results)

result_df = pd.DataFrame(results)
result_df.to_csv('../data/results.csv', index=False)

# Smaller sample evaluation
We are randomly drawing 1000 instances from the test data here.

In [None]:
# Quick evaluation: same procedure as the full evaluation, but on a random
# sample of 1000 test issues so that it finishes quickly.
MODEL_PATH = '../models/nlbse/'

# Prefer the GPU, but fall back to the CPU so the cell still runs on machines
# without CUDA instead of crashing.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = BertForSequenceClassification.from_pretrained(MODEL_PATH)
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)

# Move the model to the target device once, not on every loop iteration.
model.to(device)
model.eval()

# this is a small sample for quickly running the model
# random_state pins the drawn rows, so the sample does not depend on how much
# of the global NumPy random state earlier cells have already consumed.
df = pd.read_csv('../data/github-labels-top3-803k-test.csv').sample(n=1000, random_state=seed)

df['text_no_newlines'] = df.apply(preprocess, axis=1)
df['label'] = df.apply(labelnum, axis=1)

X = df['text_no_newlines'].values
y_test = df['label'].astype(int).values

y_probs = []
with torch.no_grad():
    # Issues are scored one at a time, each truncated to at most 128 tokens.
    for X_row in X:
        inputs = tokenizer(X_row, padding=True, truncation=True, max_length=128, return_tensors='pt').to(device)
        outputs = model(**inputs)
        # First output element is taken to be the logits (no labels are passed);
        # softmax over dim 1 turns it into per-class probabilities.
        probs = outputs[0].softmax(1).cpu().detach().numpy()
        y_probs.append(probs)

# Predicted class = index of the highest probability for each issue.
y_pred = [y_prob.argmax() for y_prob in y_probs]

# Class names in class-id order (matches the ids assigned by labelnum).
class_names = ['bug', 'enhancement', 'question']
class_ids = list(range(len(class_names)))

metrics = {
    'model': 'seBERT',
    'mcc': matthews_corrcoef(y_true=y_test, y_pred=y_pred),
    'micro_f1': f1_score(y_true=y_test, y_pred=y_pred, average='micro'),
    'micro_precision': precision_score(y_true=y_test, y_pred=y_pred, average='micro'),
    'micro_recall': recall_score(y_true=y_test, y_pred=y_pred, average='micro'),
    'macro_f1': f1_score(y_true=y_test, y_pred=y_pred, average='macro'),
    'macro_precision': precision_score(y_true=y_test, y_pred=y_pred, average='macro'),
    'macro_recall': recall_score(y_true=y_test, y_pred=y_pred, average='macro'),
}

# Per-class scores: one pass per metric with average=None returns one value
# per requested label, in the order of `class_ids`.
per_class_scores = [
    ('precision', precision_score(y_true=y_test, y_pred=y_pred, average=None, labels=class_ids)),
    ('recall', recall_score(y_true=y_test, y_pred=y_pred, average=None, labels=class_ids)),
    ('f1', f1_score(y_true=y_test, y_pred=y_pred, average=None, labels=class_ids)),
]
for metric_name, scores in per_class_scores:
    for class_id, class_name in enumerate(class_names):
        metrics['{}_{}'.format(metric_name, class_name)] = scores[class_id]

results = [metrics]

pprint.pprint(results)

# Written to a separate file: ../data/results.csv holds the metrics of the full
# test-set evaluation and must not be overwritten by this 1000-row sample run.
result_df = pd.DataFrame(results)
result_df.to_csv('../data/results_sample.csv', index=False)