# Initial Model

In [None]:
# Import libraries
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')


In [21]:
# Load df_cleaned.csv into the notebook-wide DataFrame `df`.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
DATA_CSV_PATH = '/Users/blankajarmoszko/PycharmProjects/thesis/data/df_cleaned.csv'
df = pd.read_csv(DATA_CSV_PATH)

In [None]:
# Preview the first rows of the freshly loaded frame
df.head()

In [None]:
# Function to map stars to sentiment
def map_sentiment(stars_received):
    """Map a star rating to a sentiment class id.

    Returns 0 for ratings up to and including 3, 1 for ratings above 3 and
    up to and including 4, and 2 for everything else. A NaN rating fails
    both comparisons and therefore also yields 2.
    """
    return 0 if stars_received <= 3 else (1 if stars_received <= 4 else 2)
# Derive the three-class sentiment label (0, 1, 2) from the star rating
df['sentiment'] = df['star_rating'].apply(map_sentiment)

# The counts are per sentiment class, so the header says so
sentiment_counts = df['sentiment'].value_counts()
print("Number of rows per sentiment class:")
print(sentiment_counts)

# Plot the class distribution; Series.value_counts replaces the deprecated
# top-level pd.value_counts and the counts are computed only once
plt.figure()
sentiment_counts.plot.bar(title="Sentiment distribution in df")
plt.xlabel("Sentiment")
plt.ylabel("No. of rows in df")
plt.show()

In [None]:
# Preview again: the frame now also carries the 'sentiment' column
df.head()

In [None]:
# Discard rows whose 'cleaned_text' is NaN, then renumber the survivors 0..n-1
# (the old index is dropped rather than kept as a column)
df = df.dropna(subset=['cleaned_text']).reset_index(drop=True)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
train_data, test_data, train_labels, test_labels = train_test_split(
    df['cleaned_text'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)


In [None]:
# Preview after the NaN rows were dropped and the index was reset
df.head()

In [None]:
# bag of words
# Vectorize the text using Bag-of-Words
# The vocabulary is learned from the training texts only; the test texts are
# then encoded with that same fitted vectorizer.
vectorizer_bow = CountVectorizer()
X_train_vectorized_bow = vectorizer_bow.fit_transform(train_data)
X_test_vectorized_bow = vectorizer_bow.transform(test_data)


In [None]:
# Vectorize the text using TF-IDF
# Default TfidfVectorizer settings; fitted on the training texts and reused
# unchanged for the test texts.
# NOTE(review): `vectorizer`, `X_train_tfidf` and `X_test_tfidf` are assigned
# again further down with explicit max_df / ngram_range settings.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(train_data)
X_test_tfidf = vectorizer.transform(test_data)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV

# TF-IDF features feeding a one-vs-rest wrapped multinomial Naive Bayes.
# NOTE(review): the Naive Bayes models trained later in this notebook are plain
# MultinomialNB without the one-vs-rest wrapper — confirm the tuned values transfer.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(MultinomialNB(
        fit_prior=True, class_prior=None))),
])

# Candidate values, searched jointly (3 x 3 x 3 = 27 combinations)
parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'clf__estimator__alpha': (1e-1,1e-2, 1e-3)
}

# Exhaustive search with 2-fold cross-validation on two workers
grid_search_tune = GridSearchCV(pipeline, parameters, cv=2, n_jobs=2, verbose=3)
grid_search_tune.fit(train_data, train_labels)

# Report the winning combination itself (previously the whole pipeline `steps`
# list was printed under this header) together with its cross-validation score
print("Best parameters set:")
print(grid_search_tune.best_params_)
print("Best cross-validation score:", grid_search_tune.best_score_)

In [None]:
# Vectorize the text using TF-IDF
# Rebuilds `vectorizer`, `X_train_tfidf` and `X_test_tfidf` with an explicit
# max_df and a 1- to 3-gram range; every later cell that uses X_*_tfidf sees
# these versions.
# NOTE(review): presumably these are the values chosen by the grid search above — confirm.
vectorizer = TfidfVectorizer(max_df=0.5, ngram_range=(1,3))
X_train_tfidf = vectorizer.fit_transform(train_data)
X_test_tfidf = vectorizer.transform(test_data)

### Bigrams and Uni-bigrams with TFIDF

In [None]:
import nltk
from nltk import word_tokenize, ngrams

# necessary for this to work
def convert_to_list(text):
    """Split comma-separated text into a token list.

    Values that are already lists are returned unchanged, so re-running this
    cell (which overwrites df['cleaned_text'] in place) does not fail.
    """
    if isinstance(text, list):
        return text
    return text.split(',')
df["cleaned_text"] = df["cleaned_text"].apply(convert_to_list)

# Generate bigrams (tuples of two consecutive tokens)
df['bigrams'] = df['cleaned_text'].apply(lambda x: list(ngrams(x, 2)))

# Convert bigrams back to text. Each pair is glued with '_' so it stays a single
# token: TfidfVectorizer's default word tokenizer would split a space-joined
# pair back into two unigrams, silently turning the "bigram" features into
# unigram features. Tokens are stripped so stray spaces cannot break the glue.
df['bigrams_text'] = df['bigrams'].apply(
    lambda x: ' '.join('_'.join(tok.strip() for tok in gram) for gram in x)
)

# Use only bigrams for TF-IDF vectorization
df['combined_text'] = df['bigrams_text']

# Split the dataset (same test_size / seed as the unigram split)
X_train_bi, X_test_bi, y_train_bi, y_test_bi = train_test_split(df['combined_text'], df['sentiment'], test_size=0.2, random_state=42)

# TF-IDF vectorization: fit on the training part only, reuse for the test part
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf_bi = tfidf_vectorizer.fit_transform(X_train_bi)
X_test_tfidf_bi = tfidf_vectorizer.transform(X_test_bi)


In [None]:
# Generate the bigram half of the uni+bigram representation
# (expects df['cleaned_text'] to hold token lists at this point)
df['uni_bigrams'] = df['cleaned_text'].apply(lambda x: list(ngrams(x, 2)))

# Convert the pairs back to text. Each pair is glued with '_' so it stays one
# token: TfidfVectorizer's default word tokenizer would split a space-joined
# pair back into two unigrams, which would only double-count the unigrams.
df['uni_bigrams_text'] = df['uni_bigrams'].apply(
    lambda x: ' '.join('_'.join(tok.strip() for tok in gram) for gram in x)
)

# Combine unigrams and the glued bigrams for TF-IDF vectorization
# (this overwrites the bigram-only 'combined_text' column)
df['combined_text'] = df['cleaned_text'].apply(lambda x: ' '.join(x)) + ' ' + df['uni_bigrams_text']

# Split the dataset (same test_size / seed as the other splits)
X_train_uni, X_test_uni, y_train_uni, y_test_uni = train_test_split(df['combined_text'], df['sentiment'], test_size=0.2, random_state=42)

# TF-IDF vectorization: fit on the training part only, reuse for the test part
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf_uni = tfidf_vectorizer.fit_transform(X_train_uni)
X_test_tfidf_uni = tfidf_vectorizer.transform(X_test_uni)

### Get Misclassified Results

In [None]:
def get_wrong_reviews(model, x_test, test_labels, name):
    """Save the misclassified test rows of `df` to a CSV file.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    x_test : feature matrix to predict on; must be in the same row order as
        ``test_labels``.
    test_labels : pandas Series of true labels whose index holds the labels of
        the corresponding rows in the global ``df``
        (assumes ``df`` was not re-indexed after the split — TODO confirm).
    name : file stem of the CSV written to the missclassified_data folder.

    The written rows are the original ``df`` rows plus a 'Predicted Label'
    column. Nothing is returned; "Done" is printed when the file is written.
    """
    # Predict on the features that were passed in (previously the global
    # X_test_tfidf was used regardless of the argument)
    y_pred = np.asarray(model.predict(x_test))

    # Boolean mask over test-set positions where prediction and truth differ
    wrong_mask = np.asarray(test_labels) != y_pred

    # Map test-set positions back to df rows through the index labels carried
    # by test_labels; positions in the test split are not positions in df
    misclassified_entries = df.loc[test_labels.index[wrong_mask]].copy()

    # Add predicted labels to the DataFrame
    misclassified_entries['Predicted Label'] = y_pred[wrong_mask]

    # Save DataFrame with only misclassified entries as csv
    file_path = f"/Users/blankajarmoszko/PycharmProjects/thesis/models/missclassified_data/{name}.csv"
    misclassified_entries.to_csv(file_path, index=False)
    print("Done")




## Naive Bayes 

### Naive Bayes w/ BoW

In [None]:
# Fit multinomial Naive Bayes (default settings) on the Bag-of-Words counts
naive_bayes_model_bow = MultinomialNB().fit(X_train_vectorized_bow, train_labels)

# Score both splits up front, then report
test_predictions = naive_bayes_model_bow.predict(X_test_vectorized_bow)
train_predictions = naive_bayes_model_bow.predict(X_train_vectorized_bow)
test_accuracy = accuracy_score(test_labels, test_predictions)
train_accuracy = accuracy_score(train_labels, train_predictions)

print("Naive Bayes w/ BOW: ")
# Held-out performance
print(f"Test Accuracy: {test_accuracy}")
print(classification_report(test_labels, test_predictions))

# Training performance, for comparison against the held-out numbers
print(f"Train Accuracy: {train_accuracy}")
print(classification_report(train_labels, train_predictions))

# Confusion matrices: rows are true classes, columns are predicted classes
print("\nConfusion Matrix for Train Set:")
print(confusion_matrix(train_labels, train_predictions))

print("\nConfusion Matrix for Test Set:")
print(confusion_matrix(test_labels, test_predictions))


### Naive Bayes with tfidf

In [None]:
# Inspect the held-out labels produced by the train/test split
test_labels

In [None]:
# Fit multinomial Naive Bayes with smoothing alpha=0.01 on the TF-IDF features
naive_bayes_model_tf = MultinomialNB(alpha=0.01).fit(X_train_tfidf, train_labels)

# Score both splits up front, then report
test_predictions = naive_bayes_model_tf.predict(X_test_tfidf)
train_predictions = naive_bayes_model_tf.predict(X_train_tfidf)
test_accuracy = accuracy_score(test_labels, test_predictions)
train_accuracy = accuracy_score(train_labels, train_predictions)

print("Naive Bayes w/ tfidf: ")
# Held-out performance
print(f"Test Accuracy: {test_accuracy}")
print(classification_report(test_labels, test_predictions))

# Training performance, for comparison against the held-out numbers
print(f"Train Accuracy: {train_accuracy}")
print(classification_report(train_labels, train_predictions))

# Confusion matrices: rows are true classes, columns are predicted classes
print("\nConfusion Matrix for Train Set:")
print(confusion_matrix(train_labels, train_predictions))

print("\nConfusion Matrix for Test Set:")
print(confusion_matrix(test_labels, test_predictions))

### Naive Bayes with tfidf and bigrams

In [None]:
# Fit multinomial Naive Bayes (default settings) on the bigram TF-IDF features
naive_bayes_model_tf_bi = MultinomialNB().fit(X_train_tfidf_bi, y_train_bi)

# Score both halves of the bigram split
test_predictions = naive_bayes_model_tf_bi.predict(X_test_tfidf_bi)
train_predictions = naive_bayes_model_tf_bi.predict(X_train_tfidf_bi)
test_accuracy = accuracy_score(y_test_bi, test_predictions)
train_accuracy = accuracy_score(y_train_bi, train_predictions)

# Held-out performance
print(f"Test Accuracy: {test_accuracy}")
print(classification_report(y_test_bi, test_predictions))

# Training performance, for comparison against the held-out numbers
print(f"Train Accuracy: {train_accuracy}")
print(classification_report(y_train_bi, train_predictions))

### Naive Bayes with tfidf and uni-bigrams

In [None]:
# Fit multinomial Naive Bayes (default settings) on the uni+bigram TF-IDF features
naive_bayes_model_tf_uni = MultinomialNB().fit(X_train_tfidf_uni, y_train_uni)

# Score both halves of the uni+bigram split
test_predictions = naive_bayes_model_tf_uni.predict(X_test_tfidf_uni)
train_predictions = naive_bayes_model_tf_uni.predict(X_train_tfidf_uni)
test_accuracy = accuracy_score(y_test_uni, test_predictions)
train_accuracy = accuracy_score(y_train_uni, train_predictions)

# Held-out performance
print(f"Test Accuracy: {test_accuracy}")
print(classification_report(y_test_uni, test_predictions))

# Training performance, for comparison against the held-out numbers
print(f"Train Accuracy: {train_accuracy}")
print(classification_report(y_train_uni, train_predictions))

## SVM

## SVM w/ bow

In [None]:
# Classifier - Algorithm - SVM
from sklearn import model_selection, naive_bayes, svm

# Linear-kernel SVM trained on the Bag-of-Words counts
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto').fit(X_train_vectorized_bow, train_labels)

# Predict the held-out texts
predictions_SVM = SVM.predict(X_test_vectorized_bow)

# Accuracy in percent; arguments given as (true labels, predictions) —
# accuracy is symmetric, so the value is the same as before
print("SVM Accuracy Score -> ", accuracy_score(test_labels, predictions_SVM)*100)

## SVM w/ tfidf

In [None]:
# Classifier - Algorithm - SVM
from sklearn import model_selection, naive_bayes, svm

# Linear-kernel SVM trained on the TF-IDF features (replaces the BoW-trained `SVM`)
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto').fit(X_train_tfidf, train_labels)

# Predict the held-out texts
predictions_SVM = SVM.predict(X_test_tfidf)

# Accuracy in percent; arguments given as (true labels, predictions) —
# accuracy is symmetric, so the value is the same as before
print("SVM Accuracy Score -> ", accuracy_score(test_labels, predictions_SVM)*100)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Classes that actually occur, in sorted order. The sentiment label has three
# values (0, 1, 2), so the previous two-entry ['Negative', 'Positive'] tick
# labels did not match the matrix.
class_labels = sorted(set(test_labels) | set(predictions_SVM))

# Obtain the confusion matrix with an explicit row/column order so the tick
# labels below are guaranteed to line up with it
cm = confusion_matrix(test_labels, predictions_SVM, labels=class_labels)

# Plot the confusion matrix as a heatmap (rows: true class, columns: predicted class)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix - SVM Classifier")
plt.show()


In [None]:
# Training-set accuracy (percent) of the most recently fitted `SVM`
train_predictions_SVM = SVM.predict(X_train_tfidf)
print("TRAIN SVM Accuracy Score -> ", accuracy_score(train_labels, train_predictions_SVM)*100)

### SVM with random search hyperparameter tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# Search space sampled by the randomized search
param_distributions = {
    'C': uniform(loc=0.1, scale=100),  # regularization strength, drawn from [0.1, 100.1]
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # kernel type
    'degree': [2, 3, 4],  # polynomial degree (only relevant for 'poly')
    'gamma': ['scale', 'auto']  # kernel coefficient (for 'rbf', 'poly', and 'sigmoid')
}

# Untuned estimator that the search clones for every candidate
svm_classifier = svm.SVC()

# 20 sampled candidates, each scored by 5-fold cross-validated accuracy
random_search = RandomizedSearchCV(
    svm_classifier,
    param_distributions,
    n_iter=20,
    cv=5,
    scoring='accuracy',
    random_state=42,
)
random_search.fit(X_train_tfidf, train_labels)

# Winning hyperparameters
best_params = random_search.best_params_
print("Best Parameters:", best_params)

# Evaluate the refitted winner on the held-out set
best_model = random_search.best_estimator_
predictions = best_model.predict(X_test_tfidf)
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy Score:", accuracy)


## Logistic Regression

### Logistic Regression w/ BoW

In [None]:
# Fit logistic regression (default settings) on the Bag-of-Words counts
logistic_regression_model_bow = LogisticRegression().fit(X_train_vectorized_bow, train_labels)

# Score both splits up front, then report
test_predictions = logistic_regression_model_bow.predict(X_test_vectorized_bow)
train_predictions = logistic_regression_model_bow.predict(X_train_vectorized_bow)
test_accuracy = accuracy_score(test_labels, test_predictions)
train_accuracy = accuracy_score(train_labels, train_predictions)

print("Logistic Regression w/ BOW")
# Held-out performance
print(f"Test Accuracy: {test_accuracy}")
print(classification_report(test_labels, test_predictions))

# Training performance, for comparison against the held-out numbers
print(f"Train Accuracy: {train_accuracy}")
print(classification_report(train_labels, train_predictions))

# Confusion matrices: rows are true classes, columns are predicted classes
print("\nConfusion Matrix for Train Set:")
print(confusion_matrix(train_labels, train_predictions))

print("\nConfusion Matrix for Test Set:")
print(confusion_matrix(test_labels, test_predictions))


### Logistic Regression w/ tfidf

In [None]:
# Fit logistic regression (default settings) on the TF-IDF features
logistic_regression_model_tfidf = LogisticRegression().fit(X_train_tfidf, train_labels)

# Score both splits up front, then report
test_predictions = logistic_regression_model_tfidf.predict(X_test_tfidf)
train_predictions = logistic_regression_model_tfidf.predict(X_train_tfidf)
test_accuracy = accuracy_score(test_labels, test_predictions)
train_accuracy = accuracy_score(train_labels, train_predictions)

print("Logistic Regression w/ tfidf")
# Held-out performance
print(f"Test Accuracy: {test_accuracy}")
print(classification_report(test_labels, test_predictions))

# Training performance, for comparison against the held-out numbers
print(f"Train Accuracy: {train_accuracy}")
print(classification_report(train_labels, train_predictions))

# Confusion matrices: rows are true classes, columns are predicted classes
print("\nConfusion Matrix for Train Set:")
print(confusion_matrix(train_labels, train_predictions))

print("\nConfusion Matrix for Test Set:")
print(confusion_matrix(test_labels, test_predictions))

### Tuning Logistic Regression with TFIDF

In [None]:
import warnings
warnings.filterwarnings('ignore')


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix

# Define the Logistic Regression model
logistic_regression_model = LogisticRegression()

# Define hyperparameters to tune (2 x 2 x 2 = 8 combinations)
param_grid = {
    'C': [0.1, 1],  # Regularization parameter
    'penalty': ['l1', 'l2'],  # Penalty norm
    'solver': ['liblinear', 'saga']  # Optimization algorithm
}

# Perform Grid Search with 5-fold cross-validation on all available cores
grid_search = GridSearchCV(estimator=logistic_regression_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train_tfidf, train_labels)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# The winner, already refitted on the whole training set by the search
best_model = grid_search.best_estimator_

# Predict each split exactly once; the earlier version recomputed the same
# test predictions three times
train_predictions = best_model.predict(X_train_tfidf)
test_predictions = best_model.predict(X_test_tfidf)
predictions = test_predictions  # kept under its old name as well

# Evaluate the best model on the held-out set
accuracy = accuracy_score(test_labels, predictions)
print(f"Accuracy: {accuracy}")
print(classification_report(test_labels, predictions))

# Train and test accuracy side by side
train_accuracy = accuracy_score(train_labels, train_predictions)
test_accuracy = accuracy
print(f"Train Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")

# Confusion matrix on the held-out set (rows: true class, columns: predicted class)
conf_matrix = confusion_matrix(test_labels, test_predictions)
print("Confusion Matrix:")
print(conf_matrix)

In [None]:
# Best Parameters: {'C': 1, 'penalty': 'l2', 'solver': 'saga'}

In [None]:
# Hyperparameters recorded from the grid search
best_params = {'C': 1, 'penalty': 'l2', 'solver': 'saga'}

# Train a Logistic Regression classifier with those settings. The dict must be
# unpacked into keyword arguments: passed positionally it is taken as the
# `penalty` argument and C / solver are silently left at their defaults.
logistic_regression_model_tfidf = LogisticRegression(**best_params)
logistic_regression_model_tfidf.fit(X_train_tfidf, train_labels)
print("Tunned Logistic Regression w/ tfidf")
print("Best params:", best_params)

# Make predictions on the test set
test_predictions = logistic_regression_model_tfidf.predict(X_test_tfidf)
# Evaluate the model
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f"Test Accuracy: {test_accuracy}")
print(classification_report(test_labels, test_predictions))

# Make predictions on the train set
train_predictions = logistic_regression_model_tfidf.predict(X_train_tfidf)
# Evaluate the model
train_accuracy = accuracy_score(train_labels, train_predictions)
print(f"Train Accuracy: {train_accuracy}")
print(classification_report(train_labels, train_predictions))

# Print confusion matrices for train and test (rows: true class, columns: predicted class)
print("\nConfusion Matrix for Train Set:")
print(confusion_matrix(train_labels, train_predictions))

print("\nConfusion Matrix for Test Set:")
print(confusion_matrix(test_labels, test_predictions))

### Logistic Regression with TFIDF and bigrams

In [None]:
# Fit logistic regression (default settings) on the bigram TF-IDF features
model = LogisticRegression().fit(X_train_tfidf_bi, y_train_bi)

# Score both halves of the bigram split
test_predictions = model.predict(X_test_tfidf_bi)
train_predictions = model.predict(X_train_tfidf_bi)
test_accuracy = accuracy_score(y_test_bi, test_predictions)
train_accuracy = accuracy_score(y_train_bi, train_predictions)

# Held-out performance
print("\nTest Set:")
print(f"Test Accuracy: {test_accuracy}")
print(classification_report(y_test_bi, test_predictions))

# Training performance, for comparison against the held-out numbers
print("Train Set:")
print(f"Train Accuracy: {train_accuracy}")
print(classification_report(y_train_bi, train_predictions))


### Logistic Regression with TFIDF and uni-bigrams

In [None]:
# Fit logistic regression (default settings) on the uni+bigram TF-IDF features
model = LogisticRegression().fit(X_train_tfidf_uni, y_train_uni)

# Score both halves of the uni+bigram split
test_predictions = model.predict(X_test_tfidf_uni)
train_predictions = model.predict(X_train_tfidf_uni)
test_accuracy = accuracy_score(y_test_uni, test_predictions)
train_accuracy = accuracy_score(y_train_uni, train_predictions)

# Held-out performance
print("\nTest Set:")
print(f"Test Accuracy: {test_accuracy}")
print(classification_report(y_test_uni, test_predictions))

# Training performance, for comparison against the held-out numbers
print("Train Set:")
print(f"Train Accuracy: {train_accuracy}")
print(classification_report(y_train_uni, train_predictions))

## Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier

### Random Forest with BoW

In [None]:
# 50-tree random forest with a fixed seed, trained on the Bag-of-Words counts
random_forest_model_bofw = RandomForestClassifier(n_estimators=50, random_state=42)
random_forest_model_bofw = random_forest_model_bofw.fit(X_train_vectorized_bow, train_labels)

# Score both splits up front, then report
test_predictions = random_forest_model_bofw.predict(X_test_vectorized_bow)
train_predictions = random_forest_model_bofw.predict(X_train_vectorized_bow)
test_accuracy = accuracy_score(test_labels, test_predictions)
train_accuracy = accuracy_score(train_labels, train_predictions)

# Held-out performance
print("\nTest Set:")
print(f"Test Accuracy: {test_accuracy}")
print(classification_report(test_labels, test_predictions))

# Training performance, for comparison against the held-out numbers
print("Train Set:")
print(f"Train Accuracy: {train_accuracy}")
print(classification_report(train_labels, train_predictions))

### Random Forest with tfidf

In [16]:
# 50-tree random forest with a fixed seed, trained on the TF-IDF features
random_forest_model_tfidf = RandomForestClassifier(n_estimators=50, random_state=42)
random_forest_model_tfidf = random_forest_model_tfidf.fit(X_train_tfidf, train_labels)
print("Random Forest tfidf")

# Score both splits up front, then report
test_predictions = random_forest_model_tfidf.predict(X_test_tfidf)
train_predictions = random_forest_model_tfidf.predict(X_train_tfidf)
test_accuracy = accuracy_score(test_labels, test_predictions)
train_accuracy = accuracy_score(train_labels, train_predictions)

# Held-out performance
print("\nTest Set:")
print(f"Test Accuracy: {test_accuracy}")
print(classification_report(test_labels, test_predictions))

# Training performance, for comparison against the held-out numbers
print("Train Set:")
print(f"Train Accuracy: {train_accuracy}")
print(classification_report(train_labels, train_predictions))

# Confusion matrices: rows are true classes, columns are predicted classes
print("\nConfusion Matrix for Train Set:")
print(confusion_matrix(train_labels, train_predictions))

print("\nConfusion Matrix for Test Set:")
print(confusion_matrix(test_labels, test_predictions))

Random Forest tfidf

Test Set:
Test Accuracy: 0.5707845873135129
              precision    recall  f1-score   support

           0       0.65      0.65      0.65      2767
           1       0.48      0.23      0.31      2941
           2       0.55      0.79      0.65      3609

    accuracy                           0.57      9317
   macro avg       0.56      0.56      0.54      9317
weighted avg       0.56      0.57      0.54      9317
Train Set:
Train Accuracy: 0.9975582268970699
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11034
           1       1.00      1.00      1.00     11868
           2       1.00      1.00      1.00     14366

    accuracy                           1.00     37268
   macro avg       1.00      1.00      1.00     37268
weighted avg       1.00      1.00      1.00     37268


Confusion Matrix for Train Set:
[[10995    33     6]
 [   13 11840    15]
 [    4    20 14342]]

Confusion Matrix for Test Set:

### Tuning Random Forest

In [15]:
from sklearn.model_selection import GridSearchCV

# Previously recorded result:
# Best Parameters: {'n_estimators': 75, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': None}
# NOTE(review): n_estimators=75 is not among the candidates below, so that
# record must come from a different search — confirm its origin.

# Define the parameter grid to search (3 x 3 x 3 x 3 x 2 = 162 combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# Create a Random Forest classifier
random_forest = RandomForestClassifier(random_state=42)

# 162 candidates x 5 folds is 810 forest fits; run them on all cores instead
# of serially (the selected parameters do not depend on n_jobs)
grid_search = GridSearchCV(estimator=random_forest, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Train the model
grid_search.fit(X_train_tfidf, train_labels)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# The search already refits the winning configuration (same random_state=42)
# on the full training set, so reuse that model instead of training it again
random_forest_model_tfidf = grid_search.best_estimator_

# Make predictions on the test set
test_predictions = random_forest_model_tfidf.predict(X_test_tfidf)

# Evaluate the model on the test set
test_accuracy = accuracy_score(test_labels, test_predictions)
print("\nTest Set:")
print(f"Test Accuracy: {test_accuracy}")
print(classification_report(test_labels, test_predictions))

# Make predictions on the train set
train_predictions = random_forest_model_tfidf.predict(X_train_tfidf)

# Evaluate the model on the train set
train_accuracy = accuracy_score(train_labels, train_predictions)
print("Train Set:")
print(f"Train Accuracy: {train_accuracy}")
print(classification_report(train_labels, train_predictions))

KeyboardInterrupt: 

In [14]:
# Build the Tuned Random Forest model.
# The hyperparameters live in one place only: the dict that is printed below is
# the same one the model is built from, so the two cannot drift apart.
best_params = {'n_estimators': 75, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': None}
random_forest_model_tfidf = RandomForestClassifier(**best_params, random_state=42)

# Train the model
random_forest_model_tfidf.fit(X_train_tfidf, train_labels)
print("Tuned Random Forest tfidf")
print("Best Params: ", best_params)

# Make predictions on the test set
test_predictions = random_forest_model_tfidf.predict(X_test_tfidf)
# Evaluate the model on the test set
test_accuracy = accuracy_score(test_labels, test_predictions)
print("\nTest Set:")
print(f"Test Accuracy: {test_accuracy}")
print(classification_report(test_labels, test_predictions))

# Make predictions on the train set
train_predictions = random_forest_model_tfidf.predict(X_train_tfidf)

# Evaluate the model on the train set
train_accuracy = accuracy_score(train_labels, train_predictions)
print("Train Set:")
print(f"Train Accuracy: {train_accuracy}")
print(classification_report(train_labels, train_predictions))

# Print confusion matrices for train and test (rows: true class, columns: predicted class)
print("\nConfusion Matrix for Train Set:")
print(confusion_matrix(train_labels, train_predictions))

print("\nConfusion Matrix for Test Set:")
print(confusion_matrix(test_labels, test_predictions))

Tuned Random Forest tfidf
Best Params:  {'n_estimators': 75, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': None}

Test Set:
Test Accuracy: 0.5699259418267683
              precision    recall  f1-score   support

           0       0.78      0.54      0.64      2767
           1       0.59      0.15      0.24      2941
           2       0.51      0.93      0.66      3609

    accuracy                           0.57      9317
   macro avg       0.63      0.54      0.51      9317
weighted avg       0.62      0.57      0.52      9317
Train Set:
Train Accuracy: 0.6676236986154341
              precision    recall  f1-score   support

           0       0.90      0.62      0.74     11034
           1       0.88      0.33      0.47     11868
           2       0.56      0.99      0.71     14366

    accuracy                           0.67     37268
   macro avg       0.78      0.64      0.64     37268
weighted avg       0.76      0.67      0.64     372

### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Weak learner: a decision tree limited to depth 2
base_estimator = DecisionTreeClassifier(max_depth=2)

# AdaBoost over that weak learner, 50 boosting rounds, fitted on the TF-IDF features
classifier = AdaBoostClassifier(base_estimator, n_estimators=50).fit(X_train_tfidf, train_labels)

# Predict both splits
train_preds = classifier.predict(X_train_tfidf)
test_preds = classifier.predict(X_test_tfidf)

# Training-set report
print("Training Set:")
print("Classification Report:")
print(classification_report(train_labels, train_preds))
print("Confusion Matrix:")
print(confusion_matrix(train_labels, train_preds))

# Held-out report
print("\nTest Set:")
print("Classification Report:")
print(classification_report(test_labels, test_preds))
print("Confusion Matrix:")
print(confusion_matrix(test_labels, test_preds))


In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Define the base estimator (its max_depth is set by the grid below)
base_estimator = DecisionTreeClassifier()

# Define the AdaBoostClassifier with the base estimator
classifier = AdaBoostClassifier(estimator=base_estimator)

# Define the parameter grid to search (3 x 3 x 3 = 27 combinations)
param_grid = {
    'n_estimators': [20, 30, 50],  # Try different numbers of estimators
    'estimator__max_depth': [1, 2, 3],  # Try different depths for the decision trees
    'learning_rate': [0.1, 0.5, 1.0]  # Try different learning rates
}

# Perform grid search with 5-fold cross-validation on all available cores
grid_search = GridSearchCV(classifier, param_grid, cv=5, n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train_tfidf, train_labels)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# The search already refits the winning configuration on the full training
# set. Fitting it again only costs time and, with no random_state set, would
# replace it with a differently seeded model.
best_classifier = grid_search.best_estimator_
best_classifier



### Tuned AdaBoost

In [None]:
# Weak learner: a decision tree limited to depth 3
base_estimator = DecisionTreeClassifier(max_depth=3)

# AdaBoost over that weak learner: 50 boosting rounds, learning rate 0.5,
# fitted on the TF-IDF features
classifier = AdaBoostClassifier(base_estimator, n_estimators=50, learning_rate=0.5).fit(X_train_tfidf, train_labels)

# Predict both splits
train_preds = classifier.predict(X_train_tfidf)
test_preds = classifier.predict(X_test_tfidf)

# Training-set report
print("Training Set:")
print("Classification Report:")
print(classification_report(train_labels, train_preds))
print("Confusion Matrix:")
print(confusion_matrix(train_labels, train_preds))

# Held-out report
print("\nTest Set:")
print("Classification Report:")
print(classification_report(test_labels, test_preds))
print("Confusion Matrix:")
print(confusion_matrix(test_labels, test_preds))

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Define the base estimator (its max_depth is set by the grid below)
base_estimator = DecisionTreeClassifier()

# Define the AdaBoostClassifier with the base estimator; the learning rate is
# fixed at 0.5 for this second, narrower search
classifier = AdaBoostClassifier(estimator=base_estimator, learning_rate=0.5)
# Result of the earlier search:
# Best Parameters: {'estimator__max_depth': 3, 'learning_rate': 0.5, 'n_estimators': 50}

# Define the parameter grid to search (3 x 3 = 9 combinations)
param_grid = {
    'n_estimators': [40,50,100],  # Try different numbers of estimators
    'estimator__max_depth': [3,5,6]
}

# Perform grid search with 5-fold cross-validation on all available cores
grid_search = GridSearchCV(classifier, param_grid, cv=5, n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train_tfidf, train_labels)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# The search already refits the winning configuration on the full training
# set. Fitting it again only costs time and, with no random_state set, would
# replace it with a differently seeded model.
best_classifier = grid_search.best_estimator_
best_classifier

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours with default settings, fitted on the TF-IDF features
knn_classifier = KNeighborsClassifier().fit(X_train_tfidf, train_labels)

# Predict both splits
train_preds = knn_classifier.predict(X_train_tfidf)
test_preds = knn_classifier.predict(X_test_tfidf)

# Training-set report
print("Training Set:")
print("Classification Report:")
print(classification_report(train_labels, train_preds))
print("Confusion Matrix:")
print(confusion_matrix(train_labels, train_preds))

# Held-out report
print("\nTest Set:")
print("Classification Report:")
print(classification_report(test_labels, test_preds))
print("Confusion Matrix:")
print(confusion_matrix(test_labels, test_preds))

In [None]:
# Tuning KNN

from sklearn.model_selection import GridSearchCV
# weights, distance params
# Define the parameter grid to search (only the neighbour count is tuned here)
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11]  # Adjust the range as needed
}

# Initialize the KNN classifier
knn_classifier = KNeighborsClassifier()

# Initialize GridSearchCV: 5-fold cross-validated accuracy per candidate
grid_search = GridSearchCV(estimator=knn_classifier, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the grid search to the data
grid_search.fit(X_train_tfidf, train_labels)

# Best parameter found during grid search
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# The search already refits the winning configuration on the entire training
# set, so best_estimator_ is used directly instead of being fitted again
best_knn_classifier = grid_search.best_estimator_

# Predictions on training set
train_preds = best_knn_classifier.predict(X_train_tfidf)

# Predictions on test set
test_preds = best_knn_classifier.predict(X_test_tfidf)

# Classification report and confusion matrix for training set
print("Training Set:")
print("Classification Report:")
print(classification_report(train_labels, train_preds))
print("Confusion Matrix:")
print(confusion_matrix(train_labels, train_preds))

# Classification report and confusion matrix for test set
print("\nTest Set:")
print("Classification Report:")
print(classification_report(test_labels, test_preds))
print("Confusion Matrix:")
print(confusion_matrix(test_labels, test_preds))
