# Model Training 1
In this notebook we try different ML classifiers and clustering models using Bag of Word and Tf-idf encodings


## Imports

In [29]:
!pip install -q textblob
!python -m textblob.download_corpora

[0m[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
Finished.


In [30]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob.classifiers import NaiveBayesClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

# import custom helper module
import importlib
import helpers
importlib.reload(helpers)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<module 'helpers' from '/notebooks/helpers.py'>

## Load preprocessed dataset

In [31]:
X_train, X_test, y_train, y_test = helpers.load_dataset("training_data_lowercase.csv", force_reload=True)

# print message
helpers.print_text(X_train, y_train)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

No pickle file found. Loading and cleaning dataset.
Cleaning Text


100%|██████████| 34152/34152 [00:00<00:00, 124698.47it/s]


Removing Stopwords


100%|██████████| 34152/34152 [00:02<00:00, 14142.54it/s]


Removing short sentences


100%|██████████| 34152/34152 [00:00<00:00, 777594.92it/s]
100%|██████████| 34152/34152 [00:00<00:00, 818775.02it/s]


Lemmatizing Text


100%|██████████| 33904/33904 [01:11<00:00, 475.25it/s]


----------------------------------------------------------------------------------------------------
[4005] angry leftist catch video steal student trump hatdemand school make stop wear hat f fing freedom speech --> 0
----------------------------------------------------------------------------------------------------
(27123,) (6781,) (27123,) (6781,)


## Train TextBlob NB Classifier
Very long, to run on PaperSpace

In [32]:
# Let's train a first textblop NaiveBayes classifier using the raw text as input

# let's pack our train dataset into a list of tuples (text, label)
train_data = list(zip(X_train, y_train))

cl = NaiveBayesClassifier(train_data)

cl.show_informative_features(10)

Most Informative Features
         contains(video) = True                0 : 1      =    134.6 : 1.0
           contains(dem) = True                0 : 1      =     96.0 : 1.0
        contains(turkey) = True                1 : 0      =     79.8 : 1.0
           contains(gop) = True                0 : 1      =     79.6 : 1.0
        contains(racist) = True                0 : 1      =     75.3 : 1.0
           contains(cop) = True                0 : 1      =     72.5 : 1.0
         contains(sarah) = True                0 : 1      =     64.1 : 1.0
     contains(coalition) = True                1 : 0      =     61.6 : 1.0
        contains(brexit) = True                1 : 0      =     54.2 : 1.0
         contains(audio) = True                0 : 1      =     53.3 : 1.0


In [33]:
# Evaluate classifier

# Zip the test data like the training data
test_data = list(zip(X_test, y_test))

# Evaluate the classifier
accuracy = cl.accuracy(test_data)
print(f"TextBlob Accuracy: {accuracy}")



TextBlob Accuracy: 0.9321633977289485


: 

## MultinomialNB using BoW

In [20]:
# vectorize data
X_train_bow, X_test_bow = helpers.vectorize_bow(X_train, X_test)

# Initialize the Multinomial Naive Bayes model
nb_classifier = MultinomialNB()

# Train the model on vectorized data
nb_classifier.fit(X_train_bow, y_train)




In [21]:
# Predict labels on the test set
y_pred = nb_classifier.predict(X_test_bow)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"MultinomialNB Accuracy: {accuracy:.3f}")

# Print classification report
print("Classificatoin Report:\n", classification_report(y_test, y_pred))

MultinomialNB Accuracy: 0.943
Classificatoin Report:
               precision    recall  f1-score   support

           0       0.93      0.96      0.94      3515
           1       0.95      0.93      0.94      3316

    accuracy                           0.94      6831
   macro avg       0.94      0.94      0.94      6831
weighted avg       0.94      0.94      0.94      6831



## MultinomialNB using TF-idf

In [27]:
# vectorize data using tfidf
X_train_tfidf, X_test_tfidf = helpers.vectorize_tfidf(X_train, X_test)

# Initialize the Multinomial Naive Bayes model
tfidf_nb_classifier = MultinomialNB()

# Train the model on vectorized data
tfidf_nb_classifier.fit(X_train_tfidf, y_train)



In [28]:
# Predict labels on the test set
y_pred = tfidf_nb_classifier.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"MultinomialNB Accuracy: {accuracy:.3f}")

# Print classification report
print("Classificatoin Report:\n", classification_report(y_test, y_pred))

MultinomialNB Accuracy: 0.939
Classificatoin Report:
               precision    recall  f1-score   support

           0       0.92      0.97      0.94      3515
           1       0.96      0.91      0.93      3316

    accuracy                           0.94      6831
   macro avg       0.94      0.94      0.94      6831
weighted avg       0.94      0.94      0.94      6831



## RandomForrest using Tf-idf

In [25]:
# Try Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# Initialize Random Forest
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
rf_classifier.fit(X_train_tfidf, y_train)

# Predict on the test data
y_pred_rf = rf_classifier.predict(X_test_tfidf)

# Evaluate accuracy
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf * 100:.2f}%")
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 93.38%
              precision    recall  f1-score   support

           0       0.94      0.93      0.94      3515
           1       0.93      0.94      0.93      3316

    accuracy                           0.93      6831
   macro avg       0.93      0.93      0.93      6831
weighted avg       0.93      0.93      0.93      6831



## K-Means with 2 clusters

In [26]:
# Try K-Means
from sklearn.cluster import KMeans

# Initialize K-Means
kmeans = KMeans(n_clusters=2,random_state=100)

# Train the classifier
kmeans.fit(X_train_tfidf)

# Predict on the test data
y_pred_k = kmeans.predict(X_test_tfidf)

# Evaluate accuracy
accuracy_k = accuracy_score(y_test, y_pred_k)
print(f"K-Means Accuracy: {accuracy_k * 100:.2f}%")
print("K-Means Classification Report:")
print(classification_report(y_test, y_pred_k))

  super()._check_params_vs_input(X, default_n_init=10)


K-Means Accuracy: 73.88%
K-Means Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.51      0.67      3515
           1       0.65      0.98      0.78      3316

    accuracy                           0.74      6831
   macro avg       0.81      0.75      0.73      6831
weighted avg       0.81      0.74      0.72      6831



## K-Nearest Neighbors

In [27]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

# Train the Classifier
knn.fit(X_train_tfidf, y_train)

# Predict on the test data
y_pred_knn = knn.predict(X_test_tfidf)

# Evaluate accuracy
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f"K-Means Accuracy: {accuracy_knn * 100:.2f}%")
print("K-Means Classification Report:")
print(classification_report(y_test, y_pred_knn))

K-Means Accuracy: 52.64%
K-Means Classification Report:
              precision    recall  f1-score   support

           0       0.52      1.00      0.68      3515
           1       0.99      0.02      0.05      3316

    accuracy                           0.53      6831
   macro avg       0.75      0.51      0.37      6831
weighted avg       0.75      0.53      0.38      6831



: 