In [29]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score
import pandas as pd

### Training on MSR data

In [5]:
msr_data = pd.read_csv('data/msr_dataset.csv', encoding = 'ANSI')
y_msr = msr_data['class']
msr = msr_data.drop(columns=['class'])

In [6]:
X_train, X_test, y_train, y_test = train_test_split(msr_data['token'], 
                                                    y_msr, train_size=0.8, 
                                                    random_state=33, shuffle=True)

text_clf1 = Pipeline([
    ('vectorizer', TfidfVectorizer(norm='l1', use_idf=False, 
                                   token_pattern=r'(?u)\b[A-Za-z]+\b', stop_words='english')),
    ('model', RandomForestClassifier())])

text_clf1.fit(X_train, y_train)
preds = text_clf1.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.92      0.98      0.95       283
           1       0.98      0.91      0.94       278

    accuracy                           0.94       561
   macro avg       0.95      0.94      0.94       561
weighted avg       0.95      0.94      0.94       561



### Evaluating on new data 

In [7]:
new_data = pd.read_csv('data/new_dataset.csv')
y_new = new_data['class']

new_preds = text_clf1.predict(new_data['token'])
print(classification_report(y_new, new_preds))

              precision    recall  f1-score   support

           0       0.52      0.87      0.65       724
           1       0.61      0.20      0.31       737

    accuracy                           0.53      1461
   macro avg       0.57      0.54      0.48      1461
weighted avg       0.57      0.53      0.48      1461



### Testing MultinomialNB model with TfIdf approach

In [43]:
text_clf2 = Pipeline([
    ('vectorizer', TfidfVectorizer(norm='l1', token_pattern=r'(?u)\b[A-Za-z]+\b',
                                   stop_words='english', ngram_range=(1,20))),
    ('model', MultinomialNB())])

text_clf2.fit(X_train, y_train)
preds = text_clf2.predict(X_test)

print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.91      0.81      0.86       283
           1       0.83      0.92      0.87       278

    accuracy                           0.86       561
   macro avg       0.87      0.87      0.86       561
weighted avg       0.87      0.86      0.86       561



In [44]:
new_preds = text_clf2.predict(new_data['token'])
print(classification_report(y_new, new_preds))
print('f1', f1_score(y_new, new_preds))

              precision    recall  f1-score   support

           0       0.69      0.55      0.61       724
           1       0.63      0.76      0.69       737

    accuracy                           0.66      1461
   macro avg       0.66      0.66      0.65      1461
weighted avg       0.66      0.66      0.65      1461

f1 0.6912669126691267


### Testing SGDClassifier

In [31]:
text_clf3 = Pipeline([
    ('vectorizer', TfidfVectorizer(norm='l1', use_idf=False, 
                                   token_pattern=r'(?u)\b[A-Za-z]+\b', stop_words='english')),
    ('model', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None))])

text_clf3.fit(X_train, y_train)
new_preds = text_clf2.predict(new_data['token'])
print(classification_report(y_new, new_preds))
print(f1_score(y_new, new_preds))

              precision    recall  f1-score   support

           0       0.68      0.51      0.58       724
           1       0.61      0.76      0.68       737

    accuracy                           0.64      1461
   macro avg       0.64      0.64      0.63      1461
weighted avg       0.64      0.64      0.63      1461

0.6783767413688673
