In [1]:
import io
import os
import pickle

import numpy as np
import pandas as pd
from google.cloud import storage
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils.class_weight import compute_class_weight

In [2]:
# Fetch the pickled, pre-processed DataFrame from Google Cloud Storage.
# NOTE(review): unpickling can execute arbitrary code -- only load objects
# from a bucket whose contents are trusted.
bucket_name, blob_name = "bf-review-nlp", "processed_data/processed_df.pickle"

storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(blob_name)

# Download the blob into memory and deserialise it from an in-memory buffer,
# so nothing is written to local disk.
pickle_data = blob.download_as_bytes()
pickle_file = io.BytesIO(pickle_data)
df_raw = pickle.load(pickle_file)

In [3]:
# Trim dataset for runtime: keep a 5% subsample of the rows, stratified on the
# "overall" column so the subsample keeps the class proportions of df_raw.
# (StratifiedShuffleSplit is imported in the notebook's import cell.)
X_raw = df_raw["stemmedText"]
y_raw = df_raw["overall"]
stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.05, random_state=42)
# Only the "test" side of the single split (the 5%) is kept; the other 95% of
# the positions is discarded.
_, idx = next(stratified_split.split(X_raw, y_raw))

df = df_raw.iloc[idx]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 79122 entries, 256187 to 892717
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   overall      79122 non-null  category
 1   verified     79122 non-null  bool    
 2   style        65261 non-null  object  
 3   reviewText   79122 non-null  object  
 4   stemmedText  79122 non-null  object  
dtypes: bool(1), category(1), object(3)
memory usage: 2.6+ MB


In [4]:
# Parse the report into a DataFrame
def csv_report(report, name):
    """Write the per-class rows of a text classification report to ``<name>.csv``.

    Parameters
    ----------
    report : str
        Text laid out as a header line, a blank line, one row per class
        (``label precision recall f1-score support``), a blank line, and then
        summary rows (accuracy / averages).
    name : str
        Output path without extension; ``".csv"`` is appended.

    Notes
    -----
    Only the four metric columns are written. The class label is dropped, so
    the CSV index is the positional row number (0, 1, ...) in report order.
    Metric values are written exactly as they appear in the report text.
    """
    lines = report.split('\n')

    data = []
    # Per-class rows start after the header and the blank line that follows it,
    # and end at the next blank line. Stopping there (instead of slicing off a
    # fixed number of trailing lines) stays correct regardless of how many
    # summary rows the report has.
    for line in lines[2:]:
        if not line.strip():
            break
        # Split from the right so that a label containing whitespace
        # (e.g. "very good") stays in one piece; then discard the label.
        data.append(line.rsplit(maxsplit=4)[1:])

    columns = ['precision', 'recall', 'f1-score', 'support']
    df = pd.DataFrame(data, columns=columns)

    output_csv_path = name+".csv"
    df.to_csv(output_csv_path)

In [5]:
# Build bag-of-words features (unigrams + bigrams, 1000 most frequent terms)
# and a stratified 80/20 train/test split.
#
# The split is computed BEFORE fitting the vectoriser, and the vocabulary is
# learned from the training documents only. Fitting on the full dataset would
# let the test documents influence which 1000 n-grams are kept (data leakage).
texts = df['stemmedText']
y = df["overall"]

# StratifiedShuffleSplit uses its first argument only for the number of
# samples, so splitting on the raw text yields the same positions as splitting
# on the feature matrix.
stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(stratified_split.split(texts, y))

vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=1000)
vectorizer.fit(texts.iloc[train_idx])

# X covers every row of df (same row order), encoded with the train-only vocabulary.
X = vectorizer.transform(texts)
X_train = X[train_idx]
X_test = X[test_idx]
y_train = y.iloc[train_idx]
y_test = y.iloc[test_idx]

In [6]:
# Initialise and run model using a grid search to find parameters, and evaluating using f1-score with cross-validation.
# We could do a much more comprehensive parameter search, but due to computation time on my machine, we'll keep it small.
rf = RandomForestClassifier(random_state=42)
rf_params = {'max_depth': [None, 10],
             'min_samples_split': [10, 20],
             'min_samples_leaf': [4, 8]}

rf_grid = GridSearchCV(rf, rf_params, cv=5, scoring="f1_macro")
rf_grid.fit(X_train, y_train)
# Kept as a separate name so the same parameters can be re-used by later cells.
rf_best_params = rf_grid.best_params_

# GridSearchCV refits by default (refit=True): after the search it trains a
# clone of `rf` (same random_state) with the best parameters on all of
# X_train. That fitted model is exposed as best_estimator_, so training a
# second identical forest by hand would only cost time.
best_rf = rf_grid.best_estimator_
rf_pred = best_rf.predict(X_test)
rf_report = classification_report(y_test, rf_pred)
csv_report(rf_report, "rf")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
# Counter the class imbalance with per-class weights rather than oversampling:
# it is cheaper, and the dataset is already large (and slow to train on).
# A fresh grid search would be ideal, but to save computation time the
# parameters found for the unweighted forest are re-used.
#
# NOTE: `class_weights` follows the order of `y_train.unique()`, i.e. order of
# first appearance in y_train -- not sorted label order. The dict below pairs
# each label with its own weight, so the forest is unaffected by that order.
class_weights = compute_class_weight(
    'balanced',
    classes=y_train.unique(),
    y=y_train,
)
class_weight_dict = {
    label: weight
    for label, weight in zip(y_train.unique(), class_weights)
}

rf_weighted = RandomForestClassifier(
    class_weight=class_weight_dict,
    random_state=42,
    **rf_best_params,
)
rf_weighted.fit(X_train, y_train)

# Evaluate on the held-out split and persist the per-class metrics.
rf_wighted_pred = rf_weighted.predict(X_test)
rf_weighted_report = classification_report(y_test, rf_wighted_pred)
csv_report(rf_weighted_report, "rf_weighted")

In [8]:
# Initialise and run model using a grid search to find parameters, and evaluating using f1-score with cross-validation.
nb = MultinomialNB()
nb_params = {'alpha': [0.1, 1, 10],
             'fit_prior': [True, False]}

nb_grid = GridSearchCV(nb, nb_params, cv=5, scoring="f1_macro")
nb_grid.fit(X_train, y_train)
# Kept as a separate name so the same parameters can be re-used by later cells.
nb_best_params = nb_grid.best_params_

# GridSearchCV refits by default (refit=True), so best_estimator_ is already a
# MultinomialNB with the best parameters trained on all of X_train; fitting
# another one by hand would be redundant.
best_nb = nb_grid.best_estimator_
nb_pred = best_nb.predict(X_test)
nb_report = classification_report(y_test, nb_pred)
csv_report(nb_report, "nb")

In [9]:
# Naive Bayes with class priors derived from the "balanced" class weights.
#
# MultinomialNB reads `class_prior` positionally, in the order of its fitted
# `classes_` attribute (the sorted unique labels). The weights are therefore
# computed here for the labels in sorted order -- weights ordered by first
# appearance in y_train would be attached to the wrong classes.
nb_sorted_classes = np.unique(y_train)
nb_balanced_weights = compute_class_weight('balanced', classes=nb_sorted_classes, y=y_train)
# Normalise so the priors form a probability distribution. This rescales every
# class by the same factor, so it does not change which class is predicted.
nb_class_prior = nb_balanced_weights / nb_balanced_weights.sum()

nb_weighted = MultinomialNB(**nb_best_params, class_prior=nb_class_prior)
nb_weighted.fit(X_train, y_train)
nb_weighted_pred = nb_weighted.predict(X_test)
nb_weighted_report = classification_report(y_test, nb_weighted_pred)
csv_report(nb_weighted_report, "nb_weighted")