In [1]:
import os
import sys

# Put the parent directory on the import path (once) so that modules located
# there can be imported by the statements that follow.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

from utils import load, dump, load_ft_data
from evaluate import eval
from os import getcwd
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from build import categories
import fasttext
import re

In [2]:
# Project root, expressed as the parent of the current working directory.
# All data and model paths used in this notebook are built from it.
root = f'{getcwd()}/..'

## Let's try applying the fasttext classifier on our annotated Cowait data

In [3]:
# Load the annotated test set with the project's `load` helper. The cells that
# follow iterate over it and read each entry's 'title' and 'category'.
data = load(f'{root}/data/testset.json')

In [4]:
# Load the fastText model binary from the project's models directory.
model = fasttext.load_model(f"{root}/models/fasttext_base.bin")

# Prefix that fastText puts in front of every label by default. Naming it
# replaces the unexplained hard-coded slice of 9 characters.
_LABEL_PREFIX = '__label__'


def predict(text):
    """Return the model's top predicted category for ``text`` as a bare name.

    Args:
        text: A single title to classify.

    Returns:
        The highest-ranked label, with the ``__label__`` prefix removed when
        it is present.
    """
    # fastText's predict() raises ValueError on input containing '\n', so fold
    # any line breaks into spaces before classifying.
    single_line = text.replace('\n', ' ')
    # predict() returns (labels, probabilities); take the first (top) label.
    top_label = model.predict(single_line)[0][0]
    if top_label.startswith(_LABEL_PREFIX):
        return top_label[len(_LABEL_PREFIX):]
    return top_label



In [5]:
# Classify every test-set title once and pair the prediction with the
# annotated category, then split the pairs into the two parallel lists.
labelled = [(pr['category'], predict(pr['title'])) for pr in data]

y_true = [annotated for annotated, _ in labelled]
y_pred = [predicted for _, predicted in labelled]

In [6]:
# NOTE: `eval` here is the project helper imported from `evaluate`; it shadows
# Python's builtin of the same name. The output below shows a per-class report
# followed by a matrix of counts (presumably the confusion matrix).
eval(y_true, y_pred)

                precision    recall  f1-score   support

      fix-bugs       0.17      0.66      0.28        76
  new-features       0.42      0.26      0.32       148
 documentation       0.58      0.37      0.45        52
non-functional       0.46      0.19      0.27       228

      accuracy                           0.30       504
     macro avg       0.41      0.37      0.33       504
  weighted avg       0.42      0.30      0.30       504

[[ 50  12   2  12]
 [ 85  38   5  20]
 [ 11   4  19  18]
 [141  37   7  43]]


## Why is the model so eager to predict the fix-bugs category?

Hypothesis: the model was trained on data with an uneven class distribution.

In [9]:
# Read the fastText training file into titles and labels (presumably two
# parallel lists — the label list is counted per category further down).
title_data, label_data = load_ft_data(f'{root}/data/fasttext_data.train')

In [10]:
# For each category, print its abbreviated name (first three characters),
# how many training labels it has, and its share of all training labels.
tot = len(label_data)
per_category = [(cat, label_data.count(cat)) for cat in categories]

for cat, m in per_category:
    print(f'{cat:.3}: \t{m} \t({m/tot:.2f})')

fix: 	185640 	(0.48)
new: 	73010 	(0.19)
doc: 	42090 	(0.11)
non: 	84780 	(0.22)


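The hypothesis holds: the training data is indeed very unevenly distributed. Almost half of the training labels (48%) are fix-bugs, whereas only about 15% of the annotated test set (76 of 504 items) belongs to that category.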

## Evaluating the model on the validation set

In [11]:
# Run the same evaluation on the fastText validation split: load the titles
# and labels, predict a category for every title, and score the result.
val_titles, val_labels = load_ft_data(f'{root}/data/fasttext_data.valid')

val_pairs = list(zip(val_titles, val_labels))
y_true = [label for _, label in val_pairs]
y_pred = [predict(title) for title, _ in val_pairs]

# `eval` is the project helper imported from `evaluate`, not the builtin.
eval(y_true, y_pred)

                precision    recall  f1-score   support

      fix-bugs       0.95      0.98      0.96      2411
  new-features       0.93      0.96      0.95       970
 documentation       0.94      0.92      0.93       549
non-functional       0.96      0.88      0.92      1070

      accuracy                           0.95      5000
     macro avg       0.94      0.93      0.94      5000
  weighted avg       0.95      0.95      0.95      5000

[[2353   25   11   22]
 [  27  934    1    8]
 [  27    7  504   11]
 [  71   39   19  941]]
