In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score
import preprocessing as pp
import pandas as pd

### Training RF on MSR data

In [2]:
msr_data = pd.read_csv('data/msr_dataset.csv', encoding = 'ANSI')
msr_data['processed_token'] = pp.preprocess_tokens(msr_data)
y_msr = msr_data['class']
msr = msr_data.drop(columns=['class'])

X_train, X_test, y_train, y_test = train_test_split(msr_data['processed_token'], 
                                                    y_msr, train_size=0.8, 
                                                    random_state=33, shuffle=True)

In [3]:
text_clf1 = Pipeline([
    ('vectorizer', TfidfVectorizer(norm='l1', use_idf=False, 
                                   token_pattern=r'(?u)\b[A-Za-z]+\b', stop_words='english')),
    ('model', RandomForestClassifier())])

text_clf1.fit(X_train, y_train)
preds = text_clf1.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.93      0.98      0.95       283
           1       0.97      0.92      0.95       278

    accuracy                           0.95       561
   macro avg       0.95      0.95      0.95       561
weighted avg       0.95      0.95      0.95       561



### Evaluating on new data 

In [4]:
new_data = pd.read_csv('data/new_dataset.csv')
new_data['processed_token'] = pp.preprocess_tokens(new_data)
y_new = new_data['class']

In [5]:
new_preds = text_clf1.predict(new_data['processed_token'])
print(classification_report(y_new, new_preds))
print('f1', f1_score(y_new, new_preds))

              precision    recall  f1-score   support

           0       0.51      0.92      0.65       724
           1       0.62      0.13      0.22       737

    accuracy                           0.52      1461
   macro avg       0.56      0.52      0.44      1461
weighted avg       0.56      0.52      0.43      1461

f1 0.21700223713646533


### Testing MultinomialNB model with TfIdf approach

In [8]:
text_clf2 = Pipeline([
    ('vectorizer', TfidfVectorizer(norm='l1', token_pattern=r'(?u)\b[A-Za-z]+\b',
                                   stop_words='english', ngram_range=(1,2))),
    ('model', MultinomialNB())])

text_clf2.fit(X_train, y_train)
preds = text_clf2.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.93      0.83      0.87       283
           1       0.84      0.94      0.89       278

    accuracy                           0.88       561
   macro avg       0.88      0.88      0.88       561
weighted avg       0.89      0.88      0.88       561



In [9]:
new_preds = text_clf2.predict(new_data['token'])
print(classification_report(y_new, new_preds))
print('f1', f1_score(y_new, new_preds))

              precision    recall  f1-score   support

           0       0.75      0.47      0.58       724
           1       0.62      0.84      0.71       737

    accuracy                           0.66      1461
   macro avg       0.68      0.66      0.65      1461
weighted avg       0.68      0.66      0.65      1461

f1 0.7137119908204246


### Testing SGDClassifier

In [31]:
text_clf3 = Pipeline([
    ('vectorizer', TfidfVectorizer(norm='l1', use_idf=False, 
                                   token_pattern=r'(?u)\b[A-Za-z]+\b', stop_words='english')),
    ('model', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None))])

text_clf3.fit(X_train, y_train)
new_preds = text_clf2.predict(new_data['token'])
print(classification_report(y_new, new_preds))
print(f1_score(y_new, new_preds))

              precision    recall  f1-score   support

           0       0.68      0.51      0.58       724
           1       0.61      0.76      0.68       737

    accuracy                           0.64      1461
   macro avg       0.64      0.64      0.63      1461
weighted avg       0.64      0.64      0.63      1461

0.6783767413688673
