# Models Training 2
In this notebook we are using ngrams when encoding words using BoW or Tfidf, to try to retain some context for a better classification

## Imports

In [6]:
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.naive_bayes import BernoulliNB
# from sklearn.naive_bayes import ComplementNB
# from sklearn.ensemble import StackingClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.cluster import KMeans
# from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# import custom helper module
import importlib
import helpers
importlib.reload(helpers)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<module 'helpers' from '/notebooks/helpers.py'>

## Load preprocessed dataset

In [3]:
X_train, X_test, y_train, y_test = helpers.load_dataset("training_data_clean.csv")

# print message
helpers.print_text(X_train, y_train)

Loading split dataset from pickle files
----------------------------------------------------------------------------------------------------
[22466] us congress secure health benefit coal miner --> 1
----------------------------------------------------------------------------------------------------


## MultinomialNB using BoW and Ngrams

In [6]:
# vectorise data using BoW
X_train_bow = helpers.vectorize_bow(X_train)
X_test_bow = helpers.vectorize_bow(X_test)


In [4]:

# Initialize the Multinomial Naive Bayes model
nb_classifier = MultinomialNB()

# Train the model on vectorized data
nb_classifier.fit(X_train_bow, y_train)

# Predict labels on the test set
y_pred = nb_classifier.predict(X_test_bow)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.3f}")

# Print classification report
print("Classificatoin Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.942
Classificatoin Report:
               precision    recall  f1-score   support

           0       0.93      0.96      0.94      3515
           1       0.96      0.92      0.94      3316

    accuracy                           0.94      6831
   macro avg       0.94      0.94      0.94      6831
weighted avg       0.94      0.94      0.94      6831



## MultinomialNB using TF-idf and Ngrams

In [4]:
# vectorize data using tfidf
X_train_tfidf = helpers.vectorize_tfidf(X_train)
X_test_tfidf = helpers.vectorize_tfidf(X_test)

In [5]:
# Initialize the Multinomial Naive Bayes model
tfidf_nb_classifier = MultinomialNB()

# Train the model on vectorized data
tfidf_nb_classifier.fit(X_train_tfidf, y_train)

# Predict labels on the test set
y_pred = tfidf_nb_classifier.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.3f}")

# Print classification report
print("Classificatoin Report:\n", classification_report(y_test, y_pred))



Accuracy: 0.938
Classificatoin Report:
               precision    recall  f1-score   support

           0       0.92      0.96      0.94      3515
           1       0.95      0.92      0.93      3316

    accuracy                           0.94      6831
   macro avg       0.94      0.94      0.94      6831
weighted avg       0.94      0.94      0.94      6831



## RandomForrest using Tf-idf

In [None]:
# Try Random Forest Classifier

# Initialize Random Forest
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
rf_classifier.fit(X_train_tfidf, y_train)

# Predict on the test data
y_pred_rf = rf_classifier.predict(X_test_tfidf)

# Evaluate accuracy
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf * 100:.2f}%")
print(classification_report(y_test, y_pred_rf))

## K-Means with 2 clusters

In [None]:
# Try K-Means

# Initialize K-Means
kmeans = KMeans(n_clusters=2, random_state=100)

# Train the classifier
kmeans.fit(X_train_tfidf)

# Predict on the test data
y_pred_k = kmeans.predict(X_test_tfidf)

# Evaluate accuracy
accuracy_k = accuracy_score(y_test, y_pred_k)
print(f"K-Means Accuracy: {accuracy_k * 100:.2f}%")
print("K-Means Classification Report:")
print(classification_report(y_test, y_pred_k))

## K-Nearest Neighbors

In [6]:


knn = KNeighborsClassifier()

# Train the Classifier
knn.fit(X_train_tfidf, y_train)

# Predict on the test data
y_pred_knn = knn.predict(X_test_tfidf)

# Evaluate accuracy
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f"K-Means Accuracy: {accuracy_knn * 100:.2f}%")
print("K-Means Classification Report:")
print(classification_report(y_test, y_pred_knn))

K-Means Accuracy: 51.93%
K-Means Classification Report:
              precision    recall  f1-score   support

           0       0.52      1.00      0.68      3515
           1       1.00      0.01      0.02      3316

    accuracy                           0.52      6831
   macro avg       0.76      0.50      0.35      6831
weighted avg       0.75      0.52      0.36      6831



## KNN with different algorithms

In [7]:
# Try K-Nearest Neighbours
knn = KNeighborsClassifier(n_neighbors = 2, algorithm='ball_tree')

# Train the Classifier
knn.fit(X_train_tfidf, y_train)

# Predict on the test data
y_pred_knn = knn.predict(X_test_tfidf)

# Evaluate accuracy
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f"KNN Accuracy: {accuracy_knn * 100:.2f}%")
print("KNN Classification Report:")
print(classification_report(y_test, y_pred_knn))

In [None]:
# Try K-Nearest Neighbours

knn = KNeighborsClassifier(n_neighbors = 5, algorithm='kd_tree')

# Train the Classifier
knn.fit(X_train_tfidf, y_train)

# Predict on the test data
y_pred_knn = knn.predict(X_test_tfidf)

# Evaluate accuracy
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f"KNN Accuracy: {accuracy_knn * 100:.2f}%")
print("KNN Classification Report:")
print(classification_report(y_test, y_pred_knn))

## Logistic Regression

In [7]:
# Train a logistic regression classifier
clf = LogisticRegression()
clf.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test_tfidf)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report
print(classification_report(y_test, y_pred))

ValueError: X has 95337 features, but LogisticRegression is expecting 316536 features as input.

## SVC

In [6]:
# Train the SVM model
svm_model = SVC(kernel='linear', C=1.0, random_state=42)
svm_model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"SVM Accuracy: {accuracy}")

# Print a detailed classification report
print(classification_report(y_test, y_pred))

## Stacking models as ensamble method

In [None]:
# Define base models
base_estimators = [
    ('multinomial', MultinomialNB()),
    ('bernoulli', BernoulliNB()),
    ('complement', ComplementNB()),
    ('randomforest', RandomForestClassifier(n_estimators=100, random_state=42))
]

# Meta-Learner
meta_classifier = LogisticRegression(max_iter=1000)

# Stacking Classifier
stacked_clf = StackingClassifier(
    estimators=base_estimators, 
    final_estimator=meta_classifier
)

# Train the stacking classifier
stacked_clf.fit(X_train_tfidf, y_train)

# Test the stacked model
y_test_pred = stacked_clf.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy}")
print(classification_report(y_test, y_test_pred))