# Overview

This notebook is used to train a random forest model to predict the rating of a Hostelworld review based on the sentiment of the text in the review.

An ordinal regression model might perform better than a random forest, but there is no well-supported Python package for it (note that the package mord is not maintained). The link below could be worth investigating further; otherwise, we would need to find a way to train the model manually.
https://www.statsmodels.org/dev/examples/notebooks/generated/ordinal_regression.html

# Import Modules

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler,MinMaxScaler
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from joblib import dump, load
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error

# Read in Data and Prepare

In [None]:
# Load the message-level data: each row is one message within a review plus its sentiment label.
df = pd.read_csv('data/message_df_labelled.csv')

# Fold the MIXED and NEUTRAL labels into a single OTHER bucket.
is_mixed_or_neutral = df['rating'].isin(["MIXED", "NEUTRAL"])
df.loc[is_mixed_or_neutral, "rating"] = 'OTHER'

# Discard rows that have no label, then append one indicator column per remaining label.
df = df[df['rating'].notna()]
rating_dummies = pd.get_dummies(df['rating'])
df = pd.concat([df, rating_dummies], axis=1)
df.head()

In [3]:
# `df` contains each message within a review together with the sentiment of that message.
# Aggregate to one row per review so that we get a count of how many positive, negative
# and other (mixed/neutral) sentences each review contains; these counts are used to
# predict the rating of the review. The remaining columns only provide information about
# each review in case we need it, and are excluded when training.
#
# Aggregations are given by name ('min', 'sum') instead of the Python builtins: recent
# pandas versions warn that builtins passed to .agg() will stop being mapped to the
# equivalent groupby reductions, and the string form keeps the current behaviour.
df_agg = (
    df.groupby('review_id')
    .agg(
        # descriptive columns: 'min' collapses each review's rows to a single value
        review_id=('review_id', 'min'),
        city=('city', 'min'),
        country=('country', 'min'),
        hostel_id=('hostel_id', 'min'),
        hostel_url=('hostel_url', 'min'),
        hostel_name=('hostel_name', 'min'),
        hostel_overall_rating=('hostel_overall_rating', 'min'),
        review_text=('review_text', 'min'),
        review_rating=('review_rating', 'min'),
        # was misspelled as `eviewer_nationality`
        reviewer_nationality=('reviewer_nationality', 'min'),
        reviewer_gender=('reviewer_gender', 'min'),
        reviewer_age_group=('reviewer_age_group', 'min'),
        stay_date=('stay_date', 'min'),
        # number of message rows in the review (non-null hostel_id values)
        total_messages=('hostel_id', 'count'),
        # per-review sentence counts from the sentiment indicator columns
        count_negative_sentences=('NEGATIVE', 'sum'),
        count_positive_sentences=('POSITIVE', 'sum'),
        count_other_sentences=('OTHER', 'sum'),
    )
    .reset_index(drop=True)
)

In [4]:
# Columns carried forward. The identifying/text columns stay for now so that
# test-set predictions can be inspected alongside the review they belong to.
cols = [
    'review_id',
    'review_text',
    'hostel_name',
    'count_positive_sentences',
    'count_negative_sentences',
    'count_other_sentences',
    'review_rating',
    'review_rating_scaled',
]

# Min-max scale the response variable (review rating) into the 0-1 range for training.
scaler = MinMaxScaler()
rating_as_column = df_agg[['review_rating']].to_numpy()
df_agg['review_rating_scaled'] = scaler.fit_transform(rating_as_column)

df_agg = df_agg[cols]

# Everything except the scaled response goes into X; Y is the scaled response alone.
predictors = [name for name in df_agg.columns if name != "review_rating_scaled"]
X = df_agg[predictors]
Y = df_agg[['review_rating_scaled']]

# Random Forest Model

In [5]:
# Create train and test sets: 20% of the reviews are held out, with a fixed seed so the
# split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=222)

# Keep a copy of the test rows with all descriptive columns (review id, text, hostel
# name, ...) so the test-set predictions can be inspected next to the original review.
x_test_original = x_test.copy()
# Attach the scaled target of the test rows explicitly from y_test. Assigning the full
# single-column DataFrame `Y` here only produced the right rows via implicit index alignment.
x_test_original['review_rating_scaled'] = y_test['review_rating_scaled']

# Only the sentence counts are model inputs; drop the identifying/descriptive columns.
non_feature_cols = ["review_id", "hostel_name", 'review_rating', 'review_text']
predictors = [x for x in x_train.columns if x not in non_feature_cols]
x_train = x_train[predictors]
x_test = x_test[predictors]
x_train_original = x_train.copy()  # kept so feature names are available for the importance plot

In [None]:
# Hyperparameter tuning with a randomized search over the grid defined below.

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 8)]
# Number of features to consider at every split.
# 1.0 means "all features": it replaces the former 'auto' option, which meant the same
# thing for RandomForestRegressor but is no longer accepted by recent scikit-learn.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree (None lets trees grow until the leaves are pure/minimal)
max_depth = [int(x) for x in np.linspace(10, 110, num = 8)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Use the random grid to search for best hyperparameters.
# First create the base model to tune; it is seeded so the search is reproducible.
rf = RandomForestRegressor(random_state=42)
# Random search of parameters, using 3 fold cross validation,
# sampling 15 different combinations (n_iter), and using all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 15, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model. The target is passed as a 1-D array because a
# single-column DataFrame makes scikit-learn emit a DataConversionWarning.
rf_random.fit(x_train, y_train['review_rating_scaled'].to_numpy())
rf_best_parameters = rf_random.best_params_

In [None]:
# A random forest only predicts values between the min and max target it sees in the
# training set, which is why it works well for this task compared to some other models
# that may predict more or less than the scale of ratings (0-10).

# Train the final model with the tuned hyperparameters; seeded so results are reproducible.
regression_model = RandomForestRegressor(random_state=0, **rf_best_parameters)

# Fit and predict on plain arrays consistently. Fitting on an array but predicting on a
# DataFrame makes scikit-learn warn about a feature-name mismatch.
regression_model.fit(x_train.to_numpy(), y_train['review_rating_scaled'].to_numpy())
y_pred = regression_model.predict(x_test.to_numpy())

# Both metrics are computed on the 0-1 scaled rating, not on the original rating scale.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: %f" % (rmse))
print("R2: %f" % (r2_score(y_test, y_pred)))

In [8]:
# Persist the fitted model to disk, then read it back so the following cells use the saved copy.
model_path = 'rating_prediction_model.joblib'
dump(regression_model, model_path)
regression_model = load(model_path)

# Look at Results

In [None]:
# Compare the range of the actual (scaled) ratings in the test set with the range of the
# predictions. y_test is a single-column DataFrame, so select the column to print scalars
# rather than a Series repr.
actual_ratings = y_test['review_rating_scaled']
print(f'Minimum rating in test set {actual_ratings.min()}')
print(f'Maximum rating in test set {actual_ratings.max()}')  # label previously said "Minimum"
print(f'Minimum rating in test set predictions {y_pred.min()}')
print(f'Maximum rating in test set predictions {y_pred.max()}')

In [None]:
# Check the distribution of predicted ratings (predictions are on the 0-1 scaled rating).
fig, ax = plt.subplots()
ax.hist(y_pred)
ax.set(
    title='Distribution of predicted ratings (test set)',
    xlabel='Predicted rating (scaled)',
    ylabel='Number of reviews',
)
plt.show()

In [None]:
# Horizontal bar chart of the model's feature importances, ordered from least
# important (bottom) to most important (top).
importances = regression_model.feature_importances_
features = x_train_original.columns
indices = np.argsort(importances)  # positions sorted by ascending importance

bar_positions = range(len(indices))
bar_labels = [features[i] for i in indices]

plt.barh(bar_positions, importances[indices], color='b', align='center')
plt.yticks(bar_positions, bar_labels)
plt.title('Feature Importances')
plt.xlabel('Relative Importance')
plt.show()

In [None]:
# Plot the relationship between predictions and actuals with a scatter plot.
# Both axes are on the 0-1 scaled rating.
fig, ax = plt.subplots()
ax.scatter(y_test['review_rating_scaled'], y_pred, c="blue")
ax.set(
    title='Predicted vs actual rating (test set)',
    xlabel='Actual rating (scaled)',
    ylabel='Predicted rating (scaled)',
)
plt.show()

In [None]:
# Put the predictions next to the test-set reviews so they can be inspected row by row.
x_test_original = x_test_original.assign(pred=y_pred)
x_test_original