In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import plot_roc_curve, classification_report, f1_score
from preprocessing import sum_java_words, concat_loc_sum

In [2]:
# Load the raw MSR dataset and separate the label column from the remaining columns.
#
# The original encoding name 'ANSI' is only an alias of the Windows-only 'mbcs' codec,
# so on Linux/macOS the read fails with "LookupError: unknown encoding". Naming the
# code page explicitly keeps the notebook runnable on every platform.
# NOTE(review): assumes the CSV was saved in Windows-1252 (the usual Western "ANSI"
# code page) — confirm against the machine that produced the file.
msr_data = pd.read_csv('data/msr/raw_msr_dataset.csv', encoding='cp1252')
y_msr = msr_data['class']                # labels used as the classification target
msr = msr_data.drop(columns=['class'])   # every column except the label

## Training the kNN model on MSR data and evaluating it on a held-out 20% of the same dataset

In [3]:
# Hold out 20% of the MSR data for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    msr, y_msr, train_size=0.8, random_state=33, shuffle=True
)

# Bag-of-words counts over the 'token' column. The vocabulary (at most 1000 terms)
# is learned from the training split only and reused unchanged for the test split.
msr_vectorizer = CountVectorizer(max_features=1000)
bow_train = msr_vectorizer.fit_transform(X_train['token'])

# CountVectorizer.get_feature_names() is deprecated since scikit-learn 1.0 and removed
# in 1.2. Prefer get_feature_names_out() when it exists and fall back on older
# releases, resolving the vocabulary once instead of once per DataFrame.
if hasattr(msr_vectorizer, 'get_feature_names_out'):
    msr_feature_names = msr_vectorizer.get_feature_names_out()
else:
    msr_feature_names = msr_vectorizer.get_feature_names()

sparse_matrix_train = pd.DataFrame(bow_train.toarray(), columns=msr_feature_names)
# concat_loc_sum is imported from the local preprocessing module; presumably it
# appends extra columns taken from the raw frame to the count matrix — TODO confirm.
X_train_count = concat_loc_sum(sparse_matrix_train, X_train)

bow_test = msr_vectorizer.transform(X_test['token'])
sparse_matrix_test = pd.DataFrame(bow_test.toarray(), columns=msr_feature_names)
X_test_count = concat_loc_sum(sparse_matrix_test, X_test)

# Fit a 20-nearest-neighbours classifier and score it on the held-out split.
msr_model = KNeighborsClassifier(n_neighbors=20)
msr_model.fit(X_train_count, y_train)
preds = msr_model.predict(X_test_count)

print(classification_report(y_test, preds))
print('f1', f1_score(y_test, preds))

              precision    recall  f1-score   support

           0       0.77      0.95      0.85       283
           1       0.93      0.71      0.80       278

    accuracy                           0.83       561
   macro avg       0.85      0.83      0.83       561
weighted avg       0.85      0.83      0.83       561

f1 0.8032786885245903


### Evaluating MSR model on new data

In [4]:
# Read the second ("new") dataset, then split it into the label column and the rest.
new_data = pd.read_csv('data/new/raw_new_dataset.csv')
new = new_data.drop('class', axis=1)   # every column except the label
y_new = new_data.loc[:, 'class']       # labels used to score the predictions

In [5]:
# Encode the new dataset with the vocabulary already fitted on the MSR training
# split (transform only — the vectorizer is not refitted).
X_new = msr_vectorizer.transform(new['token'])

# CountVectorizer.get_feature_names() is deprecated since scikit-learn 1.0 and removed
# in 1.2; use get_feature_names_out() when available, fall back on older releases.
if hasattr(msr_vectorizer, 'get_feature_names_out'):
    new_feature_names = msr_vectorizer.get_feature_names_out()
else:
    new_feature_names = msr_vectorizer.get_feature_names()

sparse_matrix_new = pd.DataFrame(X_new.toarray(), columns=new_feature_names)
# Pass `new` (label column already dropped) rather than `new_data`, mirroring how
# X_train / X_test are passed for the MSR data, so the 'class' column can never be
# picked up while the feature matrix is assembled.
X_new_count = concat_loc_sum(sparse_matrix_new, new)

# Score the MSR-trained model on the unseen dataset.
new_preds = msr_model.predict(X_new_count)
print(classification_report(y_new, new_preds))
print('f1', f1_score(y_new, new_preds))

              precision    recall  f1-score   support

           0       0.50      0.97      0.66       724
           1       0.64      0.06      0.11       737

    accuracy                           0.51      1461
   macro avg       0.57      0.51      0.38      1461
weighted avg       0.57      0.51      0.38      1461

f1 0.10918114143920596


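This looks like overfitting to the MSR dataset: the model reaches an F1 of about 0.80 on the held-out MSR split but only about 0.11 on the new data, where it predicts class 0 for almost every sample (recall for class 1 is 0.06).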