In [1]:
#NOTE: nltk, ggplot and missingno were not installed using pip, so they are installed directly on the cluster. Also, all these libraries are explained in the other notebook 'Sentiment'
#Importing Libraries
import pandas as pd
import numpy as np
import scipy as sci
import seaborn as sns
import matplotlib.pyplot as plt
#%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer

import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# Fetch the NLTK 'stopwords' corpus so that stopwords.words() below can read it.
nltk.download('stopwords')
# English stop words, held in a set.
stop_words = set(stopwords.words('english'))
# Lemmatizer instance.
# NOTE(review): neither stop_words nor wordnet_lemmatizer is referenced in the
# cells visible here — confirm whether they are still needed.
wordnet_lemmatizer = WordNetLemmatizer()

In [2]:

# Reading data from the source: load the hotel-review CSV with Spark, cast the
# numeric columns to proper types, then convert the result to a pandas DataFrame
# (`df`) that the rest of the notebook works on.
from pyspark.sql.types import FloatType, IntegerType

# File location and type
file_location = "/FileStore/tables/Hotel_Reviews.csv"
file_type = "csv"

# CSV options
infer_schema = "false"  # no schema inference; the numeric columns are cast explicitly below
first_row_is_header = "true"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
reviews_spark = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load(file_location)
)

# Columns to convert to their respective data types
integer = ['Additional_Number_of_Scoring', 'Review_Total_Negative_Word_Counts', 'Total_Number_of_Reviews', 'Review_Total_Positive_Word_Counts', 'Total_Number_of_Reviews_Reviewer_Has_Given']
float_val = ['Average_Score', 'Reviewer_Score', 'lat', 'lng']

# Map each column name to its target type. Float entries are applied last so a
# name listed in both lists ends up as float, as with the original two `if`s.
target_types = {name: IntegerType() for name in integer}
target_types.update({name: FloatType() for name in float_val})

# Cast everything in a single select() rather than a withColumn() loop: every
# withColumn() call adds another projection to the query plan. Iterating over
# the existing columns keeps the original column order. The loop variable is
# called `name` so it cannot shadow pyspark.sql.functions.col.
reviews_spark = reviews_spark.select([
    reviews_spark[name].cast(target_types[name]).alias(name)
    if name in target_types
    else reviews_spark[name]
    for name in reviews_spark.columns
])

df = reviews_spark.toPandas()  # Converting the Spark DataFrame to a pandas DataFrame


In [3]:
# Preview the first rows of df to check the loaded columns and values.
df.head()

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,8/3/2017,7.7,Hotel Arena,Russia,I am so angry that i made this post available...,397,1403,Only the park outside of the hotel was beauti...,11,7,2.9,"[' Leisure trip ', ' Couple ', ' Duplex Double...",0 days,52.360577,4.915968
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,8/3/2017,7.7,Hotel Arena,Ireland,No Negative,0,1403,No real complaints the hotel was great great ...,105,7,7.5,"[' Leisure trip ', ' Couple ', ' Duplex Double...",0 days,52.360577,4.915968
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/31/2017,7.7,Hotel Arena,Australia,Rooms are nice but for elderly a bit difficul...,42,1403,Location was good and staff were ok It is cut...,21,9,7.1,"[' Leisure trip ', ' Family with young childre...",3 days,52.360577,4.915968
3,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/31/2017,7.7,Hotel Arena,United Kingdom,My room was dirty and I was afraid to walk ba...,210,1403,Great location in nice surroundings the bar a...,26,1,3.8,"[' Leisure trip ', ' Solo traveler ', ' Duplex...",3 days,52.360577,4.915968
4,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/24/2017,7.7,Hotel Arena,New Zealand,You When I booked with your company on line y...,140,1403,Amazing location and building Romantic setting,8,3,6.7,"[' Leisure trip ', ' Couple ', ' Suite ', ' St...",10 days,52.360577,4.915968


In [4]:
# Combined review text per row: positive review, a single space, negative review.
# Vectorised string concatenation replaces df.apply(..., axis=1), which invoked a
# Python lambda once for every row of the frame.
# NOTE(review): if a review were missing, this yields a missing value for that
# row instead of raising as the lambda did — confirm the data has no null reviews.
df['all_review'] = df['Positive_Review'] + ' ' + df['Negative_Review']

In [5]:
# The dataset is fairly large, and we want the code to run quickly because of the time limit on the kernel, so I decided to train the model on 20% of the data and validate it on the remaining 80%. The validation set (80%) is split into three parts, and we compare the validation statistics of each part separately. This is my usual validation strategy when the dataset is large or I do not have enough computational resources.

In [6]:
# Keep 20% of the rows for training, then carve the remaining 80% into three
# evaluation sets (test1, test2, test3) using the same seed throughout.
from sklearn.model_selection import train_test_split

train, holdout = train_test_split(df, test_size=0.8, random_state=42)
test1, remainder = train_test_split(holdout, test_size=0.67, random_state=42)
test2, test3 = train_test_split(remainder, test_size=0.5, random_state=42)

# Report the (rows, columns) shape of every split, one per line.
for part in (train, test1, test2, test3):
    print(part.shape)

In [7]:
# Fit the TF-IDF vectorizer on the training reviews only, then apply that same
# fitted vectorizer to each test split to produce the feature matrices for sklearn.
from sklearn.feature_extraction.text import TfidfVectorizer

t = TfidfVectorizer(max_features=10000)
train_feats = t.fit_transform(train['all_review'])
test_feats1, test_feats2, test_feats3 = [
    t.transform(split['all_review']) for split in (test1, test2, test3)
]

In [8]:
#Model fitting: feature preparation is done, so we now move on to the regressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [9]:
# Gradient-boosted regression trees trained on the TF-IDF features to predict
# Reviewer_Score: 150 boosting stages, trees of depth 5, learning rate 0.1.
gbdt = GradientBoostingRegressor(
    max_depth=5,
    learning_rate=0.1,
    n_estimators=150,
    random_state=42,  # fixed seed (same value as the train/test splits) so re-runs fit the same model
)
gbdt.fit(train_feats, train['Reviewer_Score'])

In [10]:
# Evaluating the performance: score the training rows ("in bag") and each of
# the three held-out splits with the fitted model.
pred_inbag = gbdt.predict(train_feats)
pred_test1, pred_test2, pred_test3 = [
    gbdt.predict(feats) for feats in (test_feats1, test_feats2, test_feats3)
]

In [11]:
# Mean absolute error on the in-bag (training) data and on the three
# out-of-bag splits, collected into one table for comparison.
mae_pairs = [
    (train, pred_inbag),
    (test1, pred_test1),
    (test2, pred_test2),
    (test3, pred_test3),
]
MAEs = pd.DataFrame({
    'data': ['in_bag', 'out_bag1', 'out_bag2', 'out_bag3'],
    'MAE': [
        mean_absolute_error(part['Reviewer_Score'], pred)
        for part, pred in mae_pairs
    ],
})
MAEs

Unnamed: 0,data,MAE
0,in_bag,0.886415
1,out_bag1,0.917057
2,out_bag2,0.911851
3,out_bag3,0.915324


In [12]:
# NOTE(review): presumably pandas is pinned to this old release for the `ggplot`
# import in the next cell — confirm.
# pandas was already imported in the first cell, so a different version installed
# here may not take effect until the kernel is restarted — TODO verify on the cluster.
!pip install pandas==0.19.2

In [13]:
# Bar chart of the mean absolute error (MAE) for each data split.
# Only the names that the plotting cells use are imported, instead of
# `from ggplot import *`, so the origin of every name is explicit and the star
# import cannot overwrite other names in the notebook namespace.
from ggplot import aes, geom_bar, ggplot, ggtitle, theme_bw

p = (
    ggplot(MAEs, aes(x='data', weight='MAE'))
    + geom_bar()
    + theme_bw()
    + ggtitle('Mean Absolute Error of GBDT models')
)
print(p)

In [14]:
# Root mean squared error per split: the square root of sklearn's mean squared
# error, for the in-bag (training) data and the three out-of-bag splits.
rmse_pairs = [
    (train, pred_inbag),
    (test1, pred_test1),
    (test2, pred_test2),
    (test3, pred_test3),
]
RMSEs = pd.DataFrame({
    'data': ['in_bag', 'out_bag1', 'out_bag2', 'out_bag3'],
    'RMSE': [
        mean_squared_error(part['Reviewer_Score'], pred) ** 0.5
        for part, pred in rmse_pairs
    ],
})

In [15]:
#Root Mean Squared Errors: display the RMSE table, one row per data split
RMSEs

Unnamed: 0,data,RMSE
0,in_bag,1.16035
1,out_bag1,1.210926
2,out_bag2,1.200709
3,out_bag3,1.20996


In [16]:
# Bar chart of the root mean squared error for each data split
rmse_mapping = aes(x='data', weight='RMSE')
p = (
    ggplot(RMSEs, rmse_mapping)
    + geom_bar()
    + theme_bw()
    + ggtitle('Rooted Mean Squared Error of GBDT models')
)
print(p)

In [17]:
# Check the top 5 most important words
# get_feature_names() was deprecated and later removed from scikit-learn in
# favour of get_feature_names_out(); use the new method when the installed
# version has it and fall back to the old one otherwise.
if hasattr(t, 'get_feature_names_out'):
    words = t.get_feature_names_out()
else:
    words = t.get_feature_names()
importance = gbdt.feature_importances_
impordf = pd.DataFrame({'Word': words, 'Importance': importance})
# Highest importance first; ties are broken alphabetically by word.
impordf = impordf.sort_values(['Importance', 'Word'], ascending=[False, True])
impordf.head(5)

Unnamed: 0,Word,Importance
5869,negative,0.209593
5941,no,0.067875
5973,not,0.057698
6679,positive,0.041718
5981,nothing,0.040536


In [18]:
#Writing the word-importance table to a CSV file, without the index column
impordf.to_csv('Most_important_words.csv',index=False)