In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist
from nltk.corpus import stopwords
import string
from wordcloud import WordCloud

import html
import contractions

import re

from IPython.display import display

import seaborn as sns

from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix
# from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import precision_score, f1_score, roc_auc_score, log_loss

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


from pathlib import Path

# Seed passed as random_state to the train/test split and to the seeded models below.
SEED = 1979

# When True, the tuning cells run their grid searches; when False, they only
# print a hard-coded parameter dictionary instead.
do_grids = True

In [2]:
# Read the preprocessed reviews and discard the 'Unnamed: 0' column
# (presumably an index that was written out when the CSV was saved).
df = pd.read_csv('../data/preprocessed.csv').drop(columns='Unnamed: 0')

In [3]:
# Restore `engineered_features` from IPython's %store database. It is used below
# as the list of numeric column names selected alongside 'review'.
# NOTE(review): presumably saved by an earlier notebook via `%store engineered_features`;
# this cell fails on a machine where that has not been run — confirm.
%store -r engineered_features

# Modeling

## Stop words

In [4]:
# Punctuation tokens: every single punctuation character plus a few
# multi-character tokens.
punctuation_list = list(string.punctuation)
punctuation_list += ['', '``', "''", '...']

# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords', quiet=True)

# "Heavy" list: starts from the complete NLTK English stop-word list.
stop_list_heavy = stopwords.words('english')

# "Light" list: only NLTK entries 0-43 and 60-63 are treated as stop words.
# The entries left out of it (and therefore kept in the text) are:
#   44-59  be/have/do verbs
#   64-178 prepositions/subordinate conjunctions/modals
# NOTE(review): these positions depend on the ordering of the installed NLTK
# corpus — verify after any nltk_data upgrade.
stop_list_light = stop_list_heavy[:44] + stop_list_heavy[60:64]

# Both lists additionally filter punctuation characters and a handful of
# miscellaneous leftover tokens.
_extra_tokens = list(string.punctuation) + ['', 'll', 're', 've', 'ha', 'wa', '``', "''"]
stop_list_light += _extra_tokens
stop_list_heavy += _extra_tokens

In [5]:
# [upper bound of the low-rating group, lower bound of the high-rating group]
rating_threshold = [4, 6]

# Count how many reviews fall into each group.
low_mask = df.rating <= rating_threshold[0]
high_mask = df.rating >= rating_threshold[1]
print(int(low_mask.sum()), int(high_mask.sum()))

7249 12948


In [6]:
# Remove the reviews whose rating lies strictly between the two thresholds.
is_between = df.rating.gt(rating_threshold[0]) & df.rating.lt(rating_threshold[1])
df.drop(index=df.index[is_between], inplace=True)

# Binary label: 1 when the rating reaches the upper threshold, otherwise 0.
df['target'] = df.rating.ge(rating_threshold[1]).astype('int64')

In [7]:
# Features are the raw review text plus the engineered numeric columns.
feature_columns = ['review'] + engineered_features

# Hold out 20% of the rows for testing; seeded so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_columns],
    df['target'],
    test_size=0.2,
    random_state=SEED,
)

In [8]:
# Relative frequency of each target class in the training labels.
class_share = y_train.value_counts(normalize=True)

# Share of class 1, stored as the reference value for comparing future
# model cross-validation scores.
plurality_cv = round(class_share.loc[1], 4)

# Show the rounded sentiment breakdown as the cell output.
class_share.round(4)

target
1    0.6416
0    0.3584
Name: proportion, dtype: float64

----------------------------------

## Preprocess data

In [10]:
# Settings for the text vectoriser built in the next cell.
# Vocabulary size limit handed to TfidfVectorizer; None leaves it uncapped.
max_features = None
# Stop-word list selected for this run (the "light" list built above).
# NOTE(review): confirm the vectoriser cell actually receives this value.
stop_words = stop_list_light
# n-gram sizes handed to TfidfVectorizer: unigrams through trigrams.
ngram_range = (1,3)

In [11]:
# TF-IDF features for the review text. The stop-word list chosen in the
# configuration cell is passed explicitly; it was previously defined there but
# never handed to the vectoriser, so no stop words were being removed.
text_preprocessor = TfidfVectorizer(
    max_features=max_features,
    ngram_range=ngram_range,
    stop_words=stop_words
)

# Standardise the engineered numeric columns.
numerical_preprocessor = StandardScaler()

# Route the 'review' column to TF-IDF and the engineered columns to the scaler.
# 'review' is given as a plain string so the vectoriser receives a 1-D column
# of documents.
preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_preprocessor, 'review'),
        ('numerical', numerical_preprocessor, engineered_features)
    ]
)

## Decision tree tuning

In [12]:
%%time

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', DecisionTreeClassifier(random_state=SEED))
])

if do_grids == True:

    param_grid = {'model__criterion': ['gini', 'entropy'], 
            'model__max_depth': [10, 20, None],
            'model__min_samples_leaf': [1, 2, 3]
           }

    gridsearch = GridSearchCV(estimator=pipeline, param_grid = param_grid, cv=5, scoring='accuracy')

    gridsearch.fit(X_train,  y_train)
    gridsearch.best_params_
    print(gridsearch.best_params_,'\n')
else:
    print("{'model__criterion': 'gini', 'model__max_depth': 20, 'model__min_samples_leaf': 2}")

KeyboardInterrupt: 

## Random forest tuning

In [None]:
%%time
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=SEED))
])

if do_grids == True:

    param_grid = {'model__criterion': ['gini', 'entropy'], 
            'model__max_depth': [10, 20, None],
            'model__min_samples_leaf': [1, 2, 3]
           }

    gridsearch = GridSearchCV(estimator=pipeline, param_grid = param_grid, scoring='accuracy')

    gridsearch.fit(X_train,  y_train)
    gridsearch.best_params_
    print(gridsearch.best_params_,'\n')
else:
    print("{'model__criterion': 'gini', 'model__max_depth': None, 'model__min_samples_leaf': 1}")

## Adaboost tuning

In [None]:
%%time
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', AdaBoostClassifier(estimator=DecisionTreeClassifier()))
])

if do_grids == True:

    param_grid = {'model__n_estimators': [50, 100, 200],  # Number of estimators (weak learners)
            'model__learning_rate': [0.1, 0.5, 1.0],  # Learning rate for the updates
            'model__estimator__max_depth': [1, 2, 3]  # Max depth of the weak learners (Decision Trees)
           }

    gridsearch = GridSearchCV(estimator=pipeline, param_grid = param_grid, scoring='accuracy')

    gridsearch.fit(X_train,  y_train)
    gridsearch.best_params_
    print(gridsearch.best_params_,'\n')
else:
    print("{'model__estimator__max_depth': 1, 'model__learning_rate': 1.0, 'model__n_estimators': 50}")

## Logistic regression tuning

Beware this one took over 6 minutes to tune on the light set.

In [None]:
%%time

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(max_iter=1000))
])

if do_grids == True:

    param_grid = {
        'model__C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization parameter
        'model__penalty': ['l1', 'l2'],  # Regularization penalty ('l1' for Lasso, 'l2' for Ridge)
        'model__solver': ['liblinear', 'saga']  # Algorithm to use in the optimization problem
    }

    gridsearch = GridSearchCV(estimator=pipeline, param_grid = param_grid, cv=5, scoring='accuracy')

    gridsearch.fit(X_train,  y_train)
    gridsearch.best_params_
    print(gridsearch.best_params_,'\n')
else:
    print("{'model__C': 10, 'model__penalty': 'l2', 'model__solver': 'liblinear'}")

## Bagged trees tuning

Beware this took over 12 minutes to tune on the light set.

In [None]:
%%time

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', BaggingClassifier(estimator=DecisionTreeClassifier()))
])

if do_grids == True:
    
    param_grid = {
        'model__n_estimators': [10, 50, 100],  # Number of base estimators (decision trees in this case)
        'model__max_samples': [0.5, 0.7, 1.0],  # Sample size for each base estimator
        'model__max_features': [0.5, 0.7, 1.0],  # Number of features to consider for each base estimator
        'model__estimator__max_depth': [None, 5, 10]  # Max depth of the decision trees
    }

    gridsearch = GridSearchCV(estimator=pipeline, param_grid = param_grid, scoring='accuracy')

    gridsearch.fit(X_train,  y_train)
    gridsearch.best_params_
    print(gridsearch.best_params_,'\n')
else:
    print("{'model__estimator__max_depth': None, 'model__max_features': 0.7, \
    'model__max_samples': 1.0, 'model__n_estimators': 100} ")

## Gradient boost tuning

The gradient boosting grid search simply took too long to run, so this model is not tuned here.

## XGB tuning

Based on a 30-second run with a single parameter combination, a full grid search could take nearly an hour on the light set.