In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist
from nltk.corpus import stopwords
import string
from wordcloud import WordCloud

import html
import contractions

import re

from IPython.display import display

import seaborn as sns

from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix
# from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import precision_score, f1_score, roc_auc_score, log_loss

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


from pathlib import Path

# Seed passed as random_state to the train/test split and to the classifiers
# below so that results are repeatable.
SEED = 1979

# When True, the tuning cells run their grid searches; when False, they fall
# back to hard-coded parameter values instead.
do_grids = True

In [None]:
# Load the preprocessed reviews and discard the 'Unnamed: 0' column
# (presumably an index column written out by an earlier to_csv — verify).
df = pd.read_csv('../data/preprocessed.csv').drop(columns='Unnamed: 0')

In [None]:
%store -r engineered_features

# Modeling

## Stop words

In [None]:
# Every punctuation character plus a few extra punctuation-like tokens.
punctuation_list = list(string.punctuation) + ['', '``', "''", '...']

# Make sure NLTK's stop-word corpus is available locally.
nltk.download('stopwords', quiet=True)

# Heavy list: the full NLTK English stop-word list.
stop_list_heavy = stopwords.words('english')

# Light list: only positions 0-43 and 60-63 of the NLTK list are treated as
# stop words. Per the original notes, positions 44-59 (be/have/do verbs) and
# 64-178 (prepositions / subordinate conjunctions / modals) are left out of it.
# NOTE(review): these hard-coded positions depend on the installed NLTK corpus
# version — confirm they still select the intended words.
stop_list_light = stop_list_heavy[:44] + stop_list_heavy[60:64]

# Both lists also get the punctuation characters and some miscellaneous
# leftover tokens.
stop_list_light += [*string.punctuation, '', 'll', 're', 've', 'ha', 'wa', '``', "''"]
stop_list_heavy += [*string.punctuation, '', 'll', 're', 've', 'ha', 'wa', '``', "''"]

In [None]:
# The 'method' column is the label the models will predict.
df['target'] = df['method']

In [None]:
# Hold out 20% of the rows for testing; features are the raw review text plus
# the engineered feature columns.
X_train, X_test, y_train, y_test = train_test_split(
    df[['review'] + engineered_features],
    df['target'],
    test_size=0.2,
    random_state=SEED,
)

In [None]:
# Baseline to compare future cross-validation scores against: the share of the
# most frequent class in the training labels. `.max()` is used rather than
# `[1]`, which would look up the class labelled 1 (or, for non-integer labels,
# rely on deprecated positional fallback) instead of the plurality class.
plurality_cv = round(y_train.value_counts(normalize=True).max(), 4)
# show the class breakdown of the training labels
round(y_train.value_counts(normalize=True), 4)

----------------------------------

## Preprocess data

In [None]:
# Settings intended for the TF-IDF text vectorizer.
# None means the vocabulary size is not capped.
max_features = None
# Stop-word list chosen for the vectorizer.
# NOTE(review): verify that the TfidfVectorizer is actually given this value.
stop_words = stop_list_light
# Use unigrams, bigrams and trigrams.
ngram_range = (1,3)

In [None]:
# TF-IDF features for the review text. `stop_words` is now passed through: it
# was configured alongside `max_features` and `ngram_range` but never handed
# to the vectorizer, so the stop-word list was silently ignored.
text_preprocessor = TfidfVectorizer(
    max_features=max_features,
    stop_words=stop_words,
    ngram_range=ngram_range
)

# Standardise the engineered numerical features.
numerical_preprocessor = StandardScaler()

# Route the 'review' column (passed as a single column name, so the vectorizer
# receives 1-D text) to TF-IDF and the engineered columns to the scaler.
preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_preprocessor, 'review'),
        ('numerical', numerical_preprocessor, engineered_features)
    ]
)

In [None]:
def run_model_1(model):
    """Fit `model` behind the shared preprocessor and report test performance.

    Builds a two-step Pipeline (module-level `preprocessor` followed by
    `model`), fits it on the module-level `X_train` / `y_train`, prints the
    training and test scores plus the macro-averaged F1 on `X_test` / `y_test`,
    and plots the test-set confusion matrix.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn-compatible classifier used as the final pipeline
        step. It is fitted in place.

    Returns
    -------
    None
        Results are printed and plotted only.
    """
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    pipeline.fit(X_train, y_train)

    # Predict the test set once and reuse the predictions for every metric
    # (previously the test set was transformed and predicted three times).
    y_pred = pipeline.predict(X_test)

    train_score = pipeline.score(X_train, y_train)
    accuracy = accuracy_score(y_test, y_pred)
    f1_macro = f1_score(y_test, y_pred, average='macro')

    # For the classifiers used here, Pipeline.score is mean accuracy, so the
    # test score is the same number as `accuracy`.
    print(f"Training Score: {round(train_score, 4)}     \nTest Score:     {round(accuracy, 4)}")
    print(f"\nAccuracy Score: {round(accuracy, 4)}")
    print(f"F1 Macro Score: {round(f1_macro, 4)}")

    # Confusion matrix on the test set, with rows/columns in the order of the
    # fitted pipeline's classes.
    cm = confusion_matrix(y_test, y_pred, labels=pipeline.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=pipeline.classes_)
    fig, ax = plt.subplots(figsize=(8, 6))
    disp.plot(cmap='Greens', ax=ax, xticks_rotation='vertical')
    plt.show()

In [None]:
def tune_model_1(model, param_grid, cv=5, scoring='f1_macro'):
    """Grid-search `model` behind the shared preprocessor and return the best parameters.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn-compatible classifier used as the final
        ('model') pipeline step.
    param_grid : dict or list of dict
        Grid passed to GridSearchCV. Keys address pipeline steps, e.g.
        'model__max_depth'.
    cv : int, default 5
        Cross-validation setting forwarded to GridSearchCV.
    scoring : str, default 'f1_macro'
        Metric used to rank parameter combinations.

    Returns
    -------
    dict
        `best_params_` of the search fitted on the module-level
        `X_train` / `y_train`.
    """
    pipeline = Pipeline([('preprocessor', preprocessor), ('model', model)])
    gridsearch = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=cv, scoring=scoring)
    gridsearch.fit(X_train, y_train)
    return gridsearch.best_params_

## Decision tree

## Tuning the decision tree model

In [None]:
%%time

# Grid-search the decision tree when do_grids is set; otherwise use the
# hard-coded values (presumably the result of an earlier search — verify).
if do_grids:
    best_params_ = tune_model_1(
        DecisionTreeClassifier(random_state=SEED),
        param_grid={
            'model__criterion': ['gini', 'entropy'],
            'model__max_depth': [10, 20, None],
            'model__min_samples_leaf': [1, 2, 3]
        },
    )
else:
    # A real dict (not its string form) so both branches give the same type
    # and the result can be unpacked into set_params(**best_params_).
    best_params_ = {'model__criterion': 'gini', 'model__max_depth': 20, 'model__min_samples_leaf': 1}
print(best_params_)

## Decision tree — tuned

## Random forest

## Tuning the random forest model

In [None]:
%%time

# Grid-search the random forest when do_grids is set; otherwise use the
# hard-coded values (presumably the result of an earlier search — verify).
if do_grids:
    best_params_ = tune_model_1(
        RandomForestClassifier(random_state=SEED),
        param_grid={
            'model__criterion': ['gini', 'entropy'],
            'model__max_depth': [10, 20, None],
            'model__min_samples_leaf': [1, 2, 3]
        },
    )
else:
    # A real dict (not its string form) so both branches give the same type
    # and the result can be unpacked into set_params(**best_params_).
    best_params_ = {'model__criterion': 'gini', 'model__max_depth': None, 'model__min_samples_leaf': 1}
print(best_params_)

## Random forest — tuned

## Logistic regression

## Tuning the logistic regression model

In [None]:
%%time

# Grid-search the logistic regression when do_grids is set; otherwise use the
# hard-coded values (presumably the result of an earlier search — verify).
if do_grids:
    best_params_ = tune_model_1(
        LogisticRegression(random_state=SEED, max_iter=1000),
        param_grid={
            'model__C': [0.001, 0.01, 0.1, 1, 10, 100],  # inverse regularization strength (smaller = stronger)
            'model__penalty': ['l1', 'l2'],  # 'l1' (lasso-style) or 'l2' (ridge-style) penalty
            'model__solver': ['liblinear', 'saga']  # optimisation algorithm
        },
    )
else:
    # A real dict (not its string form) so both branches give the same type
    # and the result can be unpacked into set_params(**best_params_).
    best_params_ = {'model__C': 10, 'model__penalty': 'l2', 'model__solver': 'liblinear'}
print(best_params_)

## Logistic regression — tuned

## Bagged trees

## Tuning the bagged trees model

In [None]:
%%time

# Grid-search the bagged trees when do_grids is set; otherwise use the
# hard-coded values (presumably the result of an earlier search — verify).
if do_grids:
    best_params_ = tune_model_1(
        BaggingClassifier(random_state=SEED, estimator=DecisionTreeClassifier()),
        param_grid={
            'model__n_estimators': [10, 50, 100],  # number of base estimators (decision trees here)
            'model__max_samples': [0.5, 0.7, 1.0],  # fraction of samples drawn for each base estimator
            'model__max_features': [0.5, 0.7, 1.0],  # fraction of features drawn for each base estimator
            'model__estimator__max_depth': [None, 5, 10]  # max depth of each decision tree
        },
    )
else:
    # A real dict (not its string form) so both branches give the same type
    # and the result can be unpacked into set_params(**best_params_).
    best_params_ = {
        'model__estimator__max_depth': None,
        'model__max_features': 0.7,
        'model__max_samples': 1.0,
        'model__n_estimators': 100,
    }
print(best_params_)

## Bagged trees — tuned