In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

In [3]:
ds_train = pd.read_csv ('/content/drive/MyDrive/Colab Notebooks/ds_train.csv')
ds_test = pd.read_csv ('/content/drive/MyDrive/Colab Notebooks/ds_test.csv')

In [4]:
y_train = ds_train['label']
y_test = ds_test['label']

In [5]:
tfidf_v = TfidfVectorizer()
bow_v = CountVectorizer()


In [6]:
X_train_t = tfidf_v.fit_transform(ds_train['clean_text'])
X_test_t = tfidf_v.transform(ds_test['clean_text'])

X_train_b = bow_v.fit_transform(ds_train['clean_text'])
X_test_b = bow_v.transform(ds_test['clean_text'])

In [7]:
def te_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy_s = accuracy_score(y_test, y_pred)
    precision_s = precision_score(y_test, y_pred)
    recall_s = recall_score(y_test, y_pred)
    f1_s = f1_score(y_test, y_pred)
    confusion_m = confusion_matrix(y_test, y_pred)
    c_report = classification_report(y_test, y_pred)

    print(f'Accuracy: {accuracy_s}')
    print(f'Precision: {precision_s}')
    print(f'Recall: {recall_s}')
    print(f'F1 Score: {f1_s}')
    print('Confusion Matrix:')
    print(confusion_m)
    print('Classification Report:')
    print(c_report)

    return accuracy_s, precision_s, recall_s, f1_s, confusion_m, c_report

In [9]:
print("TF-IDF Models")

# Logistic Regression
print("Logistic Regression")
logr_model = LogisticRegression()
te_model(logr_model, X_train_t, y_train, X_test_t, y_test)

# Random Forest
print("Random Forest")
ranf_model = RandomForestClassifier()
te_model(ranf_model, X_train_t, y_train, X_test_t, y_test)

# Gradient Boosting
print("Gradient Boosting")
gradb_model = GradientBoostingClassifier()
te_model(gradb_model, X_train_t, y_train, X_test_t, y_test)

# SVM
print("SVM")
svm_model = SVC(kernel='linear', probability=True)
te_model(svm_model, X_train_t, y_train, X_test_t, y_test)

TF-IDF Models
Logistic Regression
Accuracy: 0.879
Precision: 0.877760944103341
Recall: 0.88064
F1 Score: 0.8791981150912503
Confusion Matrix:
[[10967  1533]
 [ 1492 11008]]
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.88      0.88     12500
           1       0.88      0.88      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000

Random Forest
Accuracy: 0.8466
Precision: 0.8580282621270969
Recall: 0.83064
F1 Score: 0.8441120279663429
Confusion Matrix:
[[10782  1718]
 [ 2117 10383]]
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.86      0.85     12500
           1       0.86      0.83      0.84     12500

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85    

(0.87548,
 0.8818647791066634,
 0.86712,
 0.8744302367794765,
 array([[11048,  1452],
        [ 1661, 10839]]),
 '              precision    recall  f1-score   support\n\n           0       0.87      0.88      0.88     12500\n           1       0.88      0.87      0.87     12500\n\n    accuracy                           0.88     25000\n   macro avg       0.88      0.88      0.88     25000\nweighted avg       0.88      0.88      0.88     25000\n')

In [16]:
print("BoW Models")

# Logistic Regression
print("Logistic Regression")
logr_model = LogisticRegression(max_iter=1000)
te_model(logr_model, X_train_b, y_train, X_test_b, y_test)

# Random Forest
print("Random Forest")
ranf_model = RandomForestClassifier(n_estimators=100)
te_model(ranf_model, X_train_b, y_train, X_test_b, y_test)

# Gradient Boosting
print("Gradient Boosting")
gradb_model = GradientBoostingClassifier(n_estimators=100)
te_model(gradb_model, X_train_b, y_train, X_test_b, y_test)

# SVM
print("SVM")
svm_model = SVC(kernel='linear', probability=True)
te_model(svm_model, X_train_b, y_train, X_test_b, y_test)

BoW Models
Logistic Regression
Accuracy: 0.8598
Precision: 0.864967946117017
Recall: 0.85272
F1 Score: 0.8588003061676672
Confusion Matrix:
[[10836  1664]
 [ 1841 10659]]
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.87      0.86     12500
           1       0.86      0.85      0.86     12500

    accuracy                           0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000

Random Forest
Accuracy: 0.84492
Precision: 0.8497606879208242
Recall: 0.838
F1 Score: 0.8438393684295322
Confusion Matrix:
[[10648  1852]
 [ 2025 10475]]
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.85      0.85     12500
           1       0.85      0.84      0.84     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0

(0.84,
 0.8472222222222222,
 0.8296,
 0.8383185125303153,
 array([[10630,  1870],
        [ 2130, 10370]]),
 '              precision    recall  f1-score   support\n\n           0       0.83      0.85      0.84     12500\n           1       0.85      0.83      0.84     12500\n\n    accuracy                           0.84     25000\n   macro avg       0.84      0.84      0.84     25000\nweighted avg       0.84      0.84      0.84     25000\n')