# Importing Dependencies

In [None]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.metrics import f1_score, classification_report, accuracy_score,confusion_matrix
from sklearn.model_selection import GridSearchCV

In [39]:
#Reading the data
df = pd.read_csv('concated.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,HotelName,HotelPrice,HotelAddress,HotelRatings,AuthorName,Authorlocation,Review,title,Service,Cleanliness,Overall,Value,Sleep Quality,Rooms,Location,"Business service (e.g., internet access)",Check in / front desk
0,0,Hilton Garden Inn Baltimore White Marsh,$135 - $193*,"5015 Campbell Blvd, Baltimore, MD 21236","{'Service': '5', 'Cleanliness': '5', 'Overall'...",Nadine R,"Jacksonville, Florida","Usually stay near the airport, but this trip w...",“Great place and location”,5.0,5.0,5.0,5.0,5.0,5.0,5.0,,
1,1,Hilton Garden Inn Baltimore White Marsh,$135 - $193*,"5015 Campbell Blvd, Baltimore, MD 21236","{'Service': '4', 'Cleanliness': '4', 'Overall'...",blt3116,"Chillicothe, Ohio",Stayed at this Hilton for 2 nights. It was lik...,"“Nice stay, nice surroundings”",4.0,4.0,4.0,4.0,4.0,4.0,5.0,,
2,2,Hilton Garden Inn Baltimore White Marsh,$135 - $193*,"5015 Campbell Blvd, Baltimore, MD 21236","{'Service': '5', 'Cleanliness': '5', 'Overall'...",ExSpec5,Northern Virginia,"Stayed there one night, December 16, on the wa...",“Perfect for an overnight just off I-95”,5.0,5.0,4.0,3.0,3.0,5.0,5.0,,
3,3,Hilton Garden Inn Baltimore White Marsh,$135 - $193*,"5015 Campbell Blvd, Baltimore, MD 21236","{'Service': '5', 'Cleanliness': '5', 'Overall'...",kevinlynch3,Raleigh,I just stayed here last weekend and have alrea...,“Great Hotel”,5.0,5.0,5.0,4.0,5.0,5.0,5.0,,
4,4,Hilton Garden Inn Baltimore White Marsh,$135 - $193*,"5015 Campbell Blvd, Baltimore, MD 21236","{'Service': '5', 'Cleanliness': '5', 'Overall'...",nobumponalog,"Belmont, MA",My mother who is 90 and I stayed one night on ...,“good room for handicapped person”,5.0,5.0,5.0,4.0,5.0,5.0,5.0,,


In [40]:
#Droping the unwanted columns
df.drop(['HotelRatings','HotelAddress','HotelName','Unnamed: 0','Authorlocation','AuthorName'],axis=1,inplace=True)

In [41]:
df.head()

Unnamed: 0,HotelPrice,Review,title,Service,Cleanliness,Overall,Value,Sleep Quality,Rooms,Location,"Business service (e.g., internet access)",Check in / front desk
0,$135 - $193*,"Usually stay near the airport, but this trip w...",“Great place and location”,5.0,5.0,5.0,5.0,5.0,5.0,5.0,,
1,$135 - $193*,Stayed at this Hilton for 2 nights. It was lik...,"“Nice stay, nice surroundings”",4.0,4.0,4.0,4.0,4.0,4.0,5.0,,
2,$135 - $193*,"Stayed there one night, December 16, on the wa...",“Perfect for an overnight just off I-95”,5.0,5.0,4.0,3.0,3.0,5.0,5.0,,
3,$135 - $193*,I just stayed here last weekend and have alrea...,“Great Hotel”,5.0,5.0,5.0,4.0,5.0,5.0,5.0,,
4,$135 - $193*,My mother who is 90 and I stayed one night on ...,“good room for handicapped person”,5.0,5.0,5.0,4.0,5.0,5.0,5.0,,


In [42]:
#checking the null values
(df.isnull().sum(axis=0)/len(df))*100

HotelPrice                                   0.000000
Review                                       0.000000
title                                        0.000000
Service                                      9.054203
Cleanliness                                  9.181028
Overall                                      0.000000
Value                                        9.029691
Sleep Quality                               47.947068
Rooms                                       17.773840
Location                                    22.571790
Business service (e.g., internet access)    89.300052
Check in / front desk                       82.523484
dtype: float64

In [43]:
#Droping columns with nulls with high percentage
df.drop(['Sleep Quality','Business service (e.g., internet access)','Check in / front desk'],axis=1,inplace=True)

In [48]:
#Droping over all rating with null values
df.drop(df[df.Overall == '0.0'].index, inplace = True) 

In [49]:
df.Overall.value_counts()

5.0    615476
4.0    463650
3.0    195121
2.0    103418
1.0     99173
Name: Overall, dtype: int64

In [51]:
df["ReviewDetails"]=df['title']+" "+df["Review"]

In [9]:
df.head()

Unnamed: 0,HotelPrice,Review,title,Service,Cleanliness,Overall,Value,Rooms,Location,ReviewDetails
0,$135 - $193*,"Usually stay near the airport, but this trip w...",“Great place and location”,5.0,5.0,5.0,5.0,5.0,5.0,“Great place and location” Usually stay near t...
1,$135 - $193*,Stayed at this Hilton for 2 nights. It was lik...,"“Nice stay, nice surroundings”",4.0,4.0,4.0,4.0,4.0,5.0,"“Nice stay, nice surroundings” Stayed at this ..."
2,$135 - $193*,"Stayed there one night, December 16, on the wa...",“Perfect for an overnight just off I-95”,5.0,5.0,4.0,3.0,5.0,5.0,“Perfect for an overnight just off I-95” Staye...
3,$135 - $193*,I just stayed here last weekend and have alrea...,“Great Hotel”,5.0,5.0,5.0,4.0,5.0,5.0,“Great Hotel” I just stayed here last weekend ...
4,$135 - $193*,My mother who is 90 and I stayed one night on ...,“good room for handicapped person”,5.0,5.0,5.0,4.0,5.0,5.0,“good room for handicapped person” My mother w...


# Removing Stop words and performing Stemming

In [52]:
stemmer = PorterStemmer()
words=stopwords.words("english")
df['cleaned'] = df['ReviewDetails'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in words]).lower())

In [53]:
df['cleaned'].head()

0    great place locat usual stay near airport trip...
1    nice stay nice surround stay hilton night it l...
2    perfect overnight i stay one night decemb way ...
3    great hotel i stay last weekend alreadi plan s...
4    good room handicap person my mother i stay one...
Name: cleaned, dtype: object

In [54]:
df['Cleaned_with_price'] = df['cleaned']+" "+df['HotelPrice']

In [55]:
df['Cleaned_with_price'].to_pickle("cleaned.pkl")

In [56]:
df_cleaned=pd.read_pickle("cleaned.pkl")

### Splitting to test and train 

In [44]:
df['Overall'] =df['Overall'].astype('str')


In [57]:
X_train, X_test, Y_train, Y_test = train_test_split(df_cleaned,df['Overall'], test_size = 0.10, random_state = 42)

In [58]:
X_train[0]

'great place locat usual stay near airport trip busi north side balto the traffic crazi stay elsewher thi hotel perfect hot breakfast wonder recommend get breakfast packag room onli hotel area restaur bar thi area whole lot good restaur compar stay near columbia pikesvil it next white marsh mall huge complex shop includ ikea the bed fabul adjust side firm $135 - $193*'

# TF-IDF

In [71]:
#TF-IDF on Training set
vectorizer=TfidfVectorizer(min_df=0.1,stop_words="english",sublinear_tf=True,ngram_range=(1,1),norm='l2')
final_features_train = vectorizer.fit(X_train)
final_features_train = vectorizer.transform(X_train)
final_features_train.shape

(1329154, 112)

In [77]:
#TF-IDF on Test set
final_features_test = vectorizer.transform(X_test)
final_features_test.shape

(147684, 112)

### Applying Grid Search to avoid Overfitting

In [72]:
RANDOM_STATE = 100

In [60]:
param_test = {
'max_depth':[4,5,6,10,15,20,25,30,35,50,100],
'learning_rate': [0.0001, 0.001, 0.01,0.5]
}

In [61]:
gsearch2 = GridSearchCV(estimator = xgb.XGBClassifier(learning_rate=0.1, n_estimators=1, max_depth=5, gamma=0, subsample=0.8, colsample_bytree=0.8,objective= 'multi:softmax', num_class = 6 ,scale_pos_weight=1), 
param_grid = param_test, scoring='accuracy', cv=5)
gsearch2.fit(final_features_train,Y_train)
gsearch2.best_params_ , gsearch2.best_score_


({'learning_rate': 0.01, 'max_depth': 50}, 0.6157480583465943)

# Training  XG Boost Model

In [73]:
xgb_model = xgb.XGBClassifier(max_depth=50, random_state=RANDOM_STATE,
learning_rate=0.01, colsample_bytree=.7, gamma=0, alpha=0,objective='multi:softmax', eta=0.3,subsample=0.8).fit(final_features_train, Y_train)

In [74]:
xgb_prediction = xgb_model.predict(final_features_train)

In [75]:
print('training score:', f1_score(Y_train, xgb_model.predict(final_features_train), average='macro'))

training score: 0.9913874307309687


In [76]:
print(classification_report(Y_train, xgb_model.predict(final_features_train)))

              precision    recall  f1-score   support

         1.0       0.99      0.99      0.99     89392
         2.0       1.00      0.98      0.99     93182
         3.0       1.00      0.99      0.99    175566
         4.0       1.00      0.99      0.99    417315
         5.0       0.99      1.00      0.99    553699

    accuracy                           0.99   1329154
   macro avg       0.99      0.99      0.99   1329154
weighted avg       0.99      0.99      0.99   1329154



# Model Evaluation

In [78]:
xgb_prediction = xgb_model.predict(final_features_test)

In [79]:
print('validation score:', f1_score(Y_test, xgb_prediction, average='macro'))
print(classification_report(Y_test, xgb_prediction))

validation score: 0.7303109519016657
              precision    recall  f1-score   support

         1.0       0.78      0.76      0.77      9781
         2.0       0.90      0.51      0.65     10236
         3.0       0.84      0.55      0.66     19555
         4.0       0.73      0.74      0.74     46335
         5.0       0.78      0.91      0.84     61777

    accuracy                           0.77    147684
   macro avg       0.80      0.69      0.73    147684
weighted avg       0.78      0.77      0.76    147684



In [80]:
accuracy = accuracy_score(Y_test, xgb_prediction)
accuracy

0.7710246201348826

# Confusion Matrix

In [81]:
#Confusion Matrix
confusion_matrix(Y_test, xgb_prediction)

array([[ 7387,   272,   452,   573,  1097],
       [ 1141,  5188,   897,  1581,  1429],
       [  553,   238, 10762,  5330,  2672],
       [  197,    59,   571, 34447, 11061],
       [  206,    24,   171,  5292, 56084]], dtype=int64)

# Test with a new review.

In [82]:
new_review = ['The room was dump; there was a small wardrobe which we could not use as it was not pleasant (to me). I will have rather place a open wardrobe.']
#TF-IDF on Test set
final_features_new = vectorizer.transform(new_review)
type(final_features_new)
#final_features_new=final_features_new.toarray()
#final_features_new.shape

scipy.sparse.csr.csr_matrix

In [84]:
pred = xgb_model.predict(final_features_new)
print("Rating for the given review",pred)

['1.0']
