# Model Training 1
In this notebook we try different ML classifiers and clustering models using Bag of Word and Tf-idf encodings


## Imports

In [22]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob.classifiers import NaiveBayesClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

# import custom helper module
import importlib
import helpers
importlib.reload(helpers)


<module 'helpers' from 'c:\\Development\\_repos\\Ironhack\\Projects\\ik-p3-nlp\\helpers.py'>

## Load preprocessed dataset

In [23]:
X_train, X_test, y_train, y_test = helpers.load_dataset("TRAINING_DATA.txt"#)

# print message
helpers.print_text(X_train, y_train)

No pickle file found. Loading and cleaning dataset.


100%|██████████| 14924/14924 [00:01<00:00, 14747.05it/s]
100%|██████████| 14924/14924 [00:06<00:00, 2290.36it/s]

----------------------------------------------------------------------------------------------------
[3251] primero servicios incluso permite bitcoin interaccion moneda renminbi suena divertido varias veces marlo acerco embistio cabeza estomago leta causando leta toser respirar manera voluntaria marlo lidiar hecho solo programa aun habia sido recogido temporada mas shades of blue tambien ser productor ejecutivo ryan seacrest adi hasak escribiendo tambien trabajo oficina gato apagarlas casa da miedo lleguen aquellos productores ve hongo flacido entonces pone enorme --> 0
----------------------------------------------------------------------------------------------------





## Train TextBlob NB Classifier
Very long, to run on PaperSpace

In [29]:
# Let's train a first textblop NaiveBayes classifier using the raw text as input

# let's pack our train dataset into a list of tuples (text, label)
train_data = list(zip(X_train, y_train))

cl = NaiveBayesClassifier(train_data)

cl.show_informative_features(10)

Most Informative Features
           contains(aun) = True                1 : 0      =      7.7 : 1.0
       contains(autobús) = True                1 : 0      =      7.7 : 1.0
           contains(out) = True                0 : 1      =      7.7 : 1.0
           contains(box) = True                1 : 0      =      7.4 : 1.0
       contains(felices) = True                1 : 0      =      7.0 : 1.0
           contains(mas) = True                1 : 0      =      6.6 : 1.0
         contains(aquel) = True                1 : 0      =      6.3 : 1.0
      contains(cuidados) = True                1 : 0      =      6.3 : 1.0
         contains(cómic) = True                0 : 1      =      6.3 : 1.0
     contains(necesitas) = True                1 : 0      =      5.7 : 1.0


In [None]:
# Evaluate classifier
accuracy = cl.accuracy(X_test, y_test)
print(f"Accuracy: {accuracy:.2f}")


## MultinomialNB using BoW

In [33]:
# Initialize CountVectorizer
vectorizer = CountVectorizer()

# fit and transform messages
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Initialize the Multinomial Naive Bayes model
nb_classifier = MultinomialNB()

# Train the model on vectorized data
nb_classifier.fit(X_train_bow, y_train)




In [34]:
# Predict labels on the test set
y_pred = nb_classifier.predict(X_test_bow)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.3f}")

# Print classification report
print("Classificatoin Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.374
Classificatoin Report:
               precision    recall  f1-score   support

           0       0.39      0.43      0.41      2240
           1       0.36      0.32      0.34      2238

    accuracy                           0.37      4478
   macro avg       0.37      0.37      0.37      4478
weighted avg       0.37      0.37      0.37      4478



## MultinomialNB using TF-idf

In [35]:
# Initialise Tfidf vectorizer
tfidf_vectorizer = TfidfVectorizer(min_df = 1)

# fit vectorizer on train data, then apply it to test data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()

# Initialize the Multinomial Naive Bayes model
tfidf_nb_classifier = MultinomialNB()

# Train the model on vectorized data
tfidf_nb_classifier.fit(X_train_tfidf, y_train)



In [36]:
# Predict labels on the test set
y_pred = tfidf_nb_classifier.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.3f}")

# Print classification report
print("Classificatoin Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.367
Classificatoin Report:
               precision    recall  f1-score   support

           0       0.38      0.44      0.41      2240
           1       0.34      0.29      0.32      2238

    accuracy                           0.37      4478
   macro avg       0.36      0.37      0.36      4478
weighted avg       0.36      0.37      0.36      4478



## RandomForrest using Tf-idf

In [39]:
# Try Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# Initialize Random Forest
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
rf_classifier.fit(X_train_tfidf, y_train)

# Predict on the test data
y_pred_rf = rf_classifier.predict(X_test_tfidf)

# Evaluate accuracy
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf * 100:.2f}%")
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 34.23%
              precision    recall  f1-score   support

           0       0.33      0.31      0.32      2240
           1       0.35      0.38      0.36      2238

    accuracy                           0.34      4478
   macro avg       0.34      0.34      0.34      4478
weighted avg       0.34      0.34      0.34      4478



## K-Means with 2 clusters

In [38]:
# Try K-Means
from sklearn.cluster import KMeans

# Initialize K-Means
kmeans = KMeans(n_clusters=2,random_state=100)

# Train the classifier
kmeans.fit(X_train_tfidf)

# Predict on the test data
y_pred_k = kmeans.predict(X_test_tfidf)

# Evaluate accuracy
accuracy_k = accuracy_score(y_test, y_pred_k)
print(f"K-Means Accuracy: {accuracy_k * 100:.2f}%")
print("K-Means Classification Report:")
print(classification_report(y_test, y_pred_k))

  super()._check_params_vs_input(X, default_n_init=10)


K-Means Accuracy: 48.86%
K-Means Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.96      0.65      2240
           1       0.29      0.02      0.03      2238

    accuracy                           0.49      4478
   macro avg       0.39      0.49      0.34      4478
weighted avg       0.39      0.49      0.34      4478



## K-Nearest Neighbors

In [40]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

# Train the Classifier
knn.fit(X_train_tfidf, y_train)

# Predict on the test data
y_pred_knn = knn.predict(X_test_tfidf)

# Evaluate accuracy
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f"K-Means Accuracy: {accuracy_knn * 100:.2f}%")
print("K-Means Classification Report:")
print(classification_report(y_test, y_pred_knn))

K-Means Accuracy: 49.71%
K-Means Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.99      0.66      2240
           1       0.11      0.00      0.00      2238

    accuracy                           0.50      4478
   macro avg       0.30      0.50      0.33      4478
weighted avg       0.30      0.50      0.33      4478

