In [16]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
import pandas as pd

In [17]:
msr_data = pd.read_csv('../msr_results/tokenslockeywordskeywordcount.csv', encoding = 'ANSI')

msr_data['token'][:10]

0    'test  partial  frame  write  assert  false  i...
1    'inherit  wf  false  parent  libs2  parent  li...
2    'nn_host/  get  default  uri  set  tet  get  d...
3    'end_  points  is_  security_  enabled  get  c...
4    'p1  p2  should  fail  test  client  retries  ...
5    'action  num  test  coord  action  get  pretty...
6    'standard  charsets  get  content  assert  fal...
7    'get  id  assert  equals  get  status  execute...
8    'byte  param  should  handle  null  byte  para...
9    '<property><name>hello</name><value>world</val...
Name: token, dtype: object

### Training on old data

In [22]:
y_msr = msr_data['class']

X_train, X_test, y_train, y_test = train_test_split(msr_data['token'], 
                                                    y_msr, train_size=0.8, 
                                                    random_state=33, shuffle=True)

msr_vectorizer = CountVectorizer()
tf = TfidfTransformer()

X_train_cv = msr_vectorizer.fit_transform(X_train)
X_train_tf = tf.fit_transform(X_train_cv)

X_test_cv = msr_vectorizer.transform(X_test)
X_test_tf = tf.transform(X_test_cv)

msr_model = RandomForestClassifier()
msr_model.fit(X_train_tf, y_train)
preds = msr_model.predict(X_test_tf)

print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.91      0.99      0.95       283
           1       0.98      0.91      0.94       278

    accuracy                           0.95       561
   macro avg       0.95      0.95      0.95       561
weighted avg       0.95      0.95      0.95       561



### Evaluating on new data 

In [23]:
new_data = pd.read_csv('../new_data/dataset.csv')
y_new = new_data['class']

X_new_cv = msr_vectorizer.transform(new_data['token'])
X_new_tf = tf.transform(X_new_cv)

new_preds = msr_model.predict(X_new_tf)
print(classification_report(y_new, new_preds))

              precision    recall  f1-score   support

           0       0.51      0.90      0.65       724
           1       0.62      0.16      0.26       737

    accuracy                           0.53      1461
   macro avg       0.57      0.53      0.46      1461
weighted avg       0.57      0.53      0.45      1461



### Testing with MultinomialNB 

In [24]:
nb_model = MultinomialNB()
nb_model.fit(X_train_tf, y_train)
preds = nb_model.predict(X_test_tf)

print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.95      0.89      0.92       283
           1       0.89      0.95      0.92       278

    accuracy                           0.92       561
   macro avg       0.92      0.92      0.92       561
weighted avg       0.92      0.92      0.92       561



In [25]:
new_preds = nb_model.predict(X_new_tf)
print(classification_report(y_new, new_preds))

              precision    recall  f1-score   support

           0       0.65      0.43      0.52       724
           1       0.58      0.77      0.66       737

    accuracy                           0.60      1461
   macro avg       0.62      0.60      0.59      1461
weighted avg       0.62      0.60      0.59      1461



### Testing SGDClassifier

In [26]:
sgd_model = SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)
sgd_model.fit(X_train_tf, y_train)
preds = nb_model.predict(X_test_tf)

print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.95      0.89      0.92       283
           1       0.89      0.95      0.92       278

    accuracy                           0.92       561
   macro avg       0.92      0.92      0.92       561
weighted avg       0.92      0.92      0.92       561



In [27]:
new_preds = sgd_model.predict(X_new_tf)
print(classification_report(y_new, new_preds))

              precision    recall  f1-score   support

           0       0.52      0.81      0.64       724
           1       0.60      0.27      0.37       737

    accuracy                           0.54      1461
   macro avg       0.56      0.54      0.51      1461
weighted avg       0.56      0.54      0.50      1461

