# Models Training 2
In this notebook we are using ngrams when encoding words using BoW or Tfidf, to try to retain some context for a better classification

## Imports

In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob.classifiers import NaiveBayesClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

# import custom helper module
import importlib
import helpers
importlib.reload(helpers)


<module 'helpers' from 'c:\\Development\\_repos\\Ironhack\\Projects\\ik-p3-nlp\\helpers.py'>

## Load preprocessed dataset

In [3]:
X_train, X_test, y_train, y_test = helpers.load_dataset("TRAINING_DATA.txt")

# print message
helpers.print_text(X_train, y_train)


No pickle file found. Loading and cleaning dataset.


100%|██████████| 14924/14924 [00:00<00:00, 34073.12it/s]
100%|██████████| 14924/14924 [00:06<00:00, 2269.47it/s]

----------------------------------------------------------------------------------------------------
[1203] golpe mina cayó valor después encuesta frente dólar dólares nivel bajo marzo frente euro ahora probable cuentan respuestas campaña --> 0
----------------------------------------------------------------------------------------------------





## MultinomialNB using BoW and Ngrams

In [45]:
# Initialize CountVectorizer
ngrange = (2, 5)
max_feat = 50
vectorizer = CountVectorizer(ngram_range=ngrange, max_features=max_feat)

# fit and transform messages
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Initialize the Multinomial Naive Bayes model
nb_classifier = MultinomialNB()

# Train the model on vectorized data
nb_classifier.fit(X_train_bow, y_train)

# Predict labels on the test set
y_pred = nb_classifier.predict(X_test_bow)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.3f}")

# Print classification report
print("Classificatoin Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.497
Classificatoin Report:
               precision    recall  f1-score   support

           0       0.50      0.93      0.65      2240
           1       0.47      0.06      0.11      2238

    accuracy                           0.50      4478
   macro avg       0.49      0.50      0.38      4478
weighted avg       0.49      0.50      0.38      4478



## MultinomialNB using TF-idf and Ngrams

In [47]:
# Initialise Tfidf vectorizer
tfidf_vectorizer = TfidfVectorizer(min_df = 1, ngram_range=ngrange, max_features=max_feat)

# fit vectorizer on train data, then apply it to test data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()

# Initialize the Multinomial Naive Bayes model
tfidf_nb_classifier = MultinomialNB()

# Train the model on vectorized data
tfidf_nb_classifier.fit(X_train_tfidf, y_train)

# Predict labels on the test set
y_pred = tfidf_nb_classifier.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.3f}")

# Print classification report
print("Classificatoin Report:\n", classification_report(y_test, y_pred))



Accuracy: 0.497
Classificatoin Report:
               precision    recall  f1-score   support

           0       0.50      0.93      0.65      2240
           1       0.47      0.06      0.11      2238

    accuracy                           0.50      4478
   macro avg       0.49      0.50      0.38      4478
weighted avg       0.49      0.50      0.38      4478



## RandomForrest using Tf-idf

In [48]:
# Try Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# Initialize Random Forest
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
rf_classifier.fit(X_train_tfidf, y_train)

# Predict on the test data
y_pred_rf = rf_classifier.predict(X_test_tfidf)

# Evaluate accuracy
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf * 100:.2f}%")
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 50.40%
              precision    recall  f1-score   support

           0       0.53      0.07      0.13      2240
           1       0.50      0.93      0.65      2238

    accuracy                           0.50      4478
   macro avg       0.52      0.50      0.39      4478
weighted avg       0.52      0.50      0.39      4478



## K-Means with 2 clusters

In [49]:
# Try K-Means
from sklearn.cluster import KMeans

# Initialize K-Means
kmeans = KMeans(n_clusters=2,random_state=100)

# Train the classifier
kmeans.fit(X_train_tfidf)

# Predict on the test data
y_pred_k = kmeans.predict(X_test_tfidf)

# Evaluate accuracy
accuracy_k = accuracy_score(y_test, y_pred_k)
print(f"K-Means Accuracy: {accuracy_k * 100:.2f}%")
print("K-Means Classification Report:")
print(classification_report(y_test, y_pred_k))

  super()._check_params_vs_input(X, default_n_init=10)


K-Means Accuracy: 49.98%
K-Means Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.99      0.66      2240
           1       0.48      0.01      0.02      2238

    accuracy                           0.50      4478
   macro avg       0.49      0.50      0.34      4478
weighted avg       0.49      0.50      0.34      4478



## K-Nearest Neighbors

In [50]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

# Train the Classifier
knn.fit(X_train_tfidf, y_train)

# Predict on the test data
y_pred_knn = knn.predict(X_test_tfidf)

# Evaluate accuracy
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f"K-Means Accuracy: {accuracy_knn * 100:.2f}%")
print("K-Means Classification Report:")
print(classification_report(y_test, y_pred_knn))

K-Means Accuracy: 49.55%
K-Means Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.92      0.65      2240
           1       0.47      0.07      0.13      2238

    accuracy                           0.50      4478
   macro avg       0.48      0.50      0.39      4478
weighted avg       0.48      0.50      0.39      4478

