# Import packages

In [1]:
import pandas as pd
pd.set_option("display.max_columns", 50)
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from sklearn import feature_extraction, model_selection, naive_bayes, preprocessing, ensemble, metrics
import numpy as np
import pickle

# Read dataset 

In [2]:
reviews = pd.read_csv('../datasets/hotel_reviews.csv')

# Extract columns required

In [3]:
modeling_data = reviews[['reviews_rating', 'reviews_text', 'reviews_title']]

# Remove nulls - Atleast one of title and review should have a value and rating should not be null

In [4]:
modeling_data = modeling_data[((modeling_data.reviews_text.notnull())|(modeling_data.reviews_title.notnull()))\
                             &(modeling_data.reviews_rating.notnull())]

In [5]:
modeling_data.shape

(10000, 3)

# remove punctuation, convert to lowercase, remove stop words and do lemmatization

In [6]:
def text_process(text):
    if type(text) == float:
        return ""
    else:
        nopunc = [i.lower() for i in text if (i not in string.punctuation)&(i.isdigit()==False)]
        nopunc_text = ''.join(nopunc)
        lemmatizer = WordNetLemmatizer()
        nonpunc_nonstopwords_lemma_text = [lemmatizer.lemmatize(i) for i in nopunc_text.split() if i not in stopwords.words('english')]
        text = " ".join(nonpunc_nonstopwords_lemma_text)
        return text

# Transform columns reviews_text and reviews_title using the function text_process

In [7]:
modeling_data['reviews_text'] = modeling_data.apply(lambda row: text_process(row['reviews_text']), axis=1)
modeling_data['reviews_title'] = modeling_data.apply(lambda row: text_process(row['reviews_title']), axis=1)

# Build scoring function

In [8]:
def get_scores(preds,true):
    accuracy = {
            'mae' : np.round(metrics.mean_absolute_error(true, preds),2),
    'rmse': np.round(metrics.mean_squared_error(true, preds),2),
    'mape':np.round(metrics.mean_absolute_percentage_error(true, preds),2),
    }
    return accuracy
    
    

# Bag of words - baseline model

In [9]:
train_data = pd.DataFrame(modeling_data.apply(lambda x: x['reviews_text']+' '+x['reviews_title'], axis=1), columns = ['text'])
vectorizer_bow = feature_extraction.text.CountVectorizer()

trained_bow_transformer = vectorizer_bow.fit(train_data['text'])

train_data = trained_bow_transformer.transform(train_data['text'])

train_data.columns =  trained_bow_transformer.get_feature_names()

# Train bow model

In [10]:
y = modeling_data['reviews_rating'].to_numpy()

In [11]:
x_train, x_test, y_train, y_test = model_selection.train_test_split(train_data, y, test_size=0.3, random_state=20)

In [12]:
model_bow = ensemble.RandomForestRegressor(max_depth=20)

In [13]:
model_bow.fit(x_train, y_train)

RandomForestRegressor(max_depth=20)

In [14]:
predictions = model_bow.predict(x_test)

In [15]:
get_scores(predictions, y_test)

{'mae': 0.67, 'rmse': 0.77, 'mape': 0.26}

# TFIDF (advanced bow)

In [16]:
train_data = pd.DataFrame(modeling_data.apply(lambda x: x['reviews_text']+' '+x['reviews_title'], axis=1), columns = ['text'])
vectorizer_tfidf = feature_extraction.text.TfidfVectorizer(ngram_range=(1,2), max_df=0.9)

trained_tf_idf_transformer = vectorizer_tfidf.fit(train_data['text'])

train_data = trained_tf_idf_transformer.transform(train_data['text'])

train_data.columns =  trained_tf_idf_transformer.get_feature_names()

# Train TFIDF model 

In [17]:
y = modeling_data['reviews_rating'].to_numpy()

In [18]:
x_train, x_test, y_train, y_test = model_selection.train_test_split(train_data, y, test_size=0.3, random_state=20)

In [19]:
model_tfidf = ensemble.RandomForestRegressor(max_depth=20)

In [20]:
model_tfidf.fit(x_train, y_train)

RandomForestRegressor(max_depth=20)

In [21]:
predictions = model_tfidf.predict(x_test)

In [22]:
get_scores(predictions, y_test)

{'mae': 0.68, 'rmse': 0.79, 'mape': 0.26}

# Saving models

In [23]:
# saving bow forest model
with open('../models/model_bow.pkl', 'wb') as file:
    pickle.dump(model_bow, file)
    
# saving bow vectorizer
with open('../models/transformer_bow.pkl', 'wb') as file:
    pickle.dump(trained_bow_transformer, file)

In [24]:
# saving tfidf forest model
with open('../models/model_tfidf.pkl', 'wb') as file:
    pickle.dump(model_tfidf, file)
    
# saving tfidf vectorizer
with open('../models/transformer_tfidf.pkl', 'wb') as file:
    pickle.dump(trained_tf_idf_transformer, file)

# Conclusion - 

we dont see a major jump in accuracy metrics by changing the underlying models. Even though it was expected TFIDF to improve the accuracy, I believe due to the increase in the number of sparse features, our accuracy went down a bit (curse of dimensionality)

As next steps, we can try more advanced encodings like:
1- word2vec
2- BERT

Due to long training times, I am not building these at the momemt 

Also I have only used random forest model since it is the quickest to train

We can also try models like naive bayes in the future

I believe if we were to change the output variable from a continuious variable to a categorical avariable like ratings less than 1, rating between 1 and 2 and so on, we can expect a significant jump in accuracy. Since this was not in scope of our problem, I did not try it

Moving on, we will deploy our 2 models