In [26]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import pandas as pd

In [27]:
# 'ANSI' is only registered as a codec alias (for 'mbcs') on Windows, so
# passing it to read_csv raises LookupError on Linux/macOS. 'cp1252' is the
# code page "ANSI" denotes on Western-locale Windows and is available on every
# platform.
# NOTE(review): assumes the CSV was written on a Western-locale Windows
# machine -- confirm, and switch the constant if another code page was used.
MSR_CSV_ENCODING = 'cp1252'

msr_data = pd.read_csv('../msr_results/tokenslockeywordskeywordcount.csv',
                       encoding=MSR_CSV_ENCODING)

# Preview the first ten entries of the 'token' column.
msr_data['token'].head(10)

0    'test  partial  frame  write  assert  false  i...
1    'inherit  wf  false  parent  libs2  parent  li...
2    'nn_host/  get  default  uri  set  tet  get  d...
3    'end_  points  is_  security_  enabled  get  c...
4    'p1  p2  should  fail  test  client  retries  ...
5    'action  num  test  coord  action  get  pretty...
6    'standard  charsets  get  content  assert  fal...
7    'get  id  assert  equals  get  status  execute...
8    'byte  param  should  handle  null  byte  para...
9    '<property><name>hello</name><value>world</val...
Name: token, dtype: object

### Training on the original MSR data (80/20 train/test split)

In [28]:
# Labels are taken from the 'class' column of the MSR dataframe.
y_msr = msr_data['class']

# Hold out 20% of the MSR rows for testing; the split is seeded so the same
# rows land in each partition on every run.
X_train, X_test, y_train, y_test = train_test_split(msr_data['token'],
                                                    y_msr, train_size=0.8,
                                                    random_state=33, shuffle=True)

# Bag-of-words features. The vocabulary is learned from the training split
# only and then reused unchanged for the held-out split.
msr_vectorizer = CountVectorizer()
X_train_cv = msr_vectorizer.fit_transform(X_train)
X_test_cv = msr_vectorizer.transform(X_test)

# Seed the forest as well: without random_state the bootstrap samples and
# feature sub-sampling differ between runs, so the report below would not be
# reproducible under Restart & Run All even though the split is fixed.
msr_model = RandomForestClassifier(random_state=33)
msr_model.fit(X_train_cv, y_train)
preds = msr_model.predict(X_test_cv)

print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.93      0.98      0.95       283
           1       0.97      0.92      0.95       278

    accuracy                           0.95       561
   macro avg       0.95      0.95      0.95       561
weighted avg       0.95      0.95      0.95       561



### Evaluating the MSR-trained random forest on the new dataset

In [29]:
# Load the second dataset; it is only used for evaluation here, nothing is
# re-fitted on it.
new_data = pd.read_csv('../new_data/dataset.csv')

# Encode the new token strings with the vocabulary already fitted on the MSR
# training split (transform only, no fit).
X_new = msr_vectorizer.transform(new_data['token'])
y_new = new_data['class']

# Score the MSR-trained random forest on the new data.
new_preds = msr_model.predict(X_new)
rf_new_report = classification_report(y_new, new_preds)
print(rf_new_report)

              precision    recall  f1-score   support

           0       0.51      0.98      0.67       724
           1       0.76      0.08      0.14       737

    accuracy                           0.52      1461
   macro avg       0.64      0.53      0.41      1461
weighted avg       0.64      0.52      0.40      1461



### Repeating the experiment with MultinomialNB

In [30]:
# Train a multinomial naive Bayes classifier on the same bag-of-words
# training matrix used for the random forest (fit returns the estimator).
nb_model = MultinomialNB().fit(X_train_cv, y_train)

# Evaluate on the held-out MSR split.
preds = nb_model.predict(X_test_cv)
nb_test_report = classification_report(y_test, preds)
print(nb_test_report)

              precision    recall  f1-score   support

           0       0.94      0.96      0.95       283
           1       0.96      0.94      0.95       278

    accuracy                           0.95       561
   macro avg       0.95      0.95      0.95       561
weighted avg       0.95      0.95      0.95       561



In [31]:
# Score the MSR-trained naive Bayes model on the new dataset's features.
new_preds = nb_model.predict(X_new)
nb_new_report = classification_report(y_new, new_preds)
print(nb_new_report)

              precision    recall  f1-score   support

           0       0.63      0.53      0.58       724
           1       0.60      0.70      0.65       737

    accuracy                           0.62      1461
   macro avg       0.62      0.62      0.61      1461
weighted avg       0.62      0.62      0.61      1461

