<h1 id="tocheading">Table of Contents</h1>
<div id="toc"></div>

In [1]:
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

# Hotel Review Data

This project uses the hotel review data vectorized in the previous phase to build descriptive and predictive models for hotel ratings. First, the data split to train and test sets is loaded as follows:

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import timeit
import warnings
warnings.filterwarnings('ignore')

In [3]:
train_df = pd.read_csv('train_dataframe.csv')
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13651 entries, 0 to 13650
Columns: 2587 entries, Additional_Number_of_Scoring to Reviewer_Score
dtypes: float64(2587)
memory usage: 269.4 MB


In [4]:
train_df.head()

Unnamed: 0,Additional_Number_of_Scoring,Average_Score,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,days_since_review,lat,lng,Review_Month,...,p_working,p_world,p_worth,p_wouldn,p_year,p_years,p_yes,p_young,p_yummy,Reviewer_Score
0,220.0,9.1,20.0,902.0,21.0,1.0,275.0,51.494308,-0.175558,11.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1190.0,7.5,5.0,5180.0,23.0,6.0,481.0,51.514879,-0.16065,4.0,...,0.0,0.0,0.0,0.0,0.0,0.425849,0.0,0.0,0.0,1.0
2,299.0,8.3,81.0,1361.0,27.0,4.0,672.0,51.521009,-0.123097,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,87.0,9.0,17.0,355.0,13.0,7.0,412.0,51.499749,-0.161524,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,317.0,7.6,14.0,1458.0,0.0,1.0,499.0,51.516114,-0.174952,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
test_df = pd.read_csv('test_dataframe.csv')

In [6]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3413 entries, 0 to 3412
Columns: 2587 entries, Additional_Number_of_Scoring to Reviewer_Score
dtypes: float64(2587)
memory usage: 67.4 MB


In [7]:
#train_df = train_df.sample(frac=0.1, random_state=42)
#test_df = test_df.sample(frac=0.1, random_state=42)

In [8]:
X_train = train_df.iloc[:,:-1]
X_train.head()

Unnamed: 0,Additional_Number_of_Scoring,Average_Score,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,days_since_review,lat,lng,Review_Month,...,p_worked,p_working,p_world,p_worth,p_wouldn,p_year,p_years,p_yes,p_young,p_yummy
0,220.0,9.1,20.0,902.0,21.0,1.0,275.0,51.494308,-0.175558,11.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1190.0,7.5,5.0,5180.0,23.0,6.0,481.0,51.514879,-0.16065,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.425849,0.0,0.0,0.0
2,299.0,8.3,81.0,1361.0,27.0,4.0,672.0,51.521009,-0.123097,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,87.0,9.0,17.0,355.0,13.0,7.0,412.0,51.499749,-0.161524,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,317.0,7.6,14.0,1458.0,0.0,1.0,499.0,51.516114,-0.174952,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
y_train = train_df.iloc[:,-1]
y_train.head()

0    1.0
1    1.0
2    0.0
3    1.0
4    0.0
Name: Reviewer_Score, dtype: float64

In [10]:
X_test = test_df.iloc[:,:-1]
y_test = test_df.iloc[:,-1]

# Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=42).fit(X_train, y_train)
print('Train Accuracy: ', lr.score(X_train, y_train))
print('Test Accuracy: ', lr.score(X_test, y_test))
print('Computation Time: ', timeit.timeit)

Train Accuracy:  0.8272654018020658
Test Accuracy:  0.7896278933489599
Computation Time:  <function timeit at 0x0000019EB28CA318>


In [12]:
coef_df = pd.DataFrame({'Feature':X_train.columns, 'Coefficient': lr.coef_[0]})
coef_df.sort_values(by='Coefficient', ascending=False, inplace=True)

In [13]:
word_df = coef_df[coef_df.Feature.str.contains('^[n,p]_')]
word_df

Unnamed: 0,Feature,Coefficient
2522,p_upgraded,2.059033
1719,p_amazing,2.037435
1978,p_fantastic,1.781315
1228,n_perfect,1.706760
2308,p_professional,1.698139
...,...,...
1561,n_tiny,-1.797868
722,n_dated,-1.839281
753,n_dirty,-2.228388
1366,n_room,-2.232058


In [14]:
twenty_df = word_df.head(20).append(word_df.tail(20))
twenty_df

Unnamed: 0,Feature,Coefficient
2522,p_upgraded,2.059033
1719,p_amazing,2.037435
1978,p_fantastic,1.781315
1228,n_perfect,1.70676
2308,p_professional,1.698139
1972,p_fabulous,1.64479
2005,p_friendliness,1.591903
2271,p_perfect,1.561669
1955,p_excellent,1.513672
850,n_fault,1.504318


## Standard Scaling

In [15]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)

In [16]:
 
    +from sklearn.linear_model import LogisticRegression
scale_lr = LogisticRegression(solver='lbfgs', random_state=42, max_iter=200).fit(X_train_scale, y_train)
print('Train Accuracy: ', scale_lr.score(X_train_scale, y_train))
print('Test Accuracy: ', scale_lr.score(X_test_scale, y_test))
print('Computation Time: ', timeit.timeit())

SyntaxError: invalid syntax (<ipython-input-16-f7848ce8478c>, line 1)

In [None]:
coef_df = pd.DataFrame({'Feature':X_train.columns, 'Coefficeint': scale_lr.coef_[0]})
coef_df.sort_values(by='Coefficeint', ascending=False, inplace=True)
word_df = coef_df[coef_df.Feature.str.contains('^[n,p]_')]
twenty_df = word_df.head(20).append(word_df.tail(20))
twenty_df

## Principle Component Analysis

In [None]:
# principle component analysis
from sklearn.decomposition import PCA
pca = PCA(n_components=500)
X_train_scale_pca = pca.fit_transform(X_train_scale)
X_test_scale_pca = pca.transform(X_test_scale)

In [None]:
from sklearn.linear_model import LogisticRegression
pca_lr = LogisticRegression(solver='lbfgs', random_state=42, max_iter=200).fit(X_train_scale_pca, y_train)
print('Train Accuracy: ', pca_lr.score(X_train_scale_pca, y_train))
print('Test Accuracy: ', pca_lr.score(X_test_scale_pca, y_test))
print('Computation Time: ', timeit.timeit())

In [None]:
# + improved test accuracy 
# - lost interpreability

# K Nearest Neighbor

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=30).fit(X_train_scale_pca, y_train)
print(knn.score(X_train_scale_pca, y_train))
print(knn.score(X_test_scale_pca, y_test))
print(timeit.timeit())

## Sampling

In [None]:
remainder_sample_df = train_df.sample(frac=0.1, random_state=42)
test_sample_df = test_df.sample(frac=0.1, random_state=42)

In [None]:
X_remainder_sample = remainder_sample_df.iloc[:,:-1]
y_remainder_sample = remainder_sample_df.iloc[:,-1]
X_test_sample = test_sample_df.iloc[:,:-1]
y_test_sample = test_sample_df.iloc[:,-1]

In [None]:
# split data to training data and validation data
from sklearn.model_selection import train_test_split
X_train_sample, X_valid_sample, y_train_sample, y_valid_sample = train_test_split(X_remainder_sample, y_remainder_sample, test_size=0.25, random_state=42)

In [None]:
# scale data with standard scaler
scaler = StandardScaler()
X_train_sample_scale = scaler.fit_transform(X_train_sample)
X_valid_sample_scale = scaler.transform(X_valid_sample)
X_test_sample_scale = scaler.transform(X_test_sample)

In [None]:
# principle component analysis
from sklearn.decomposition import PCA
pca = PCA(n_components=200)
X_train_sample_scale_pca = pca.fit_transform(X_train_sample_scale)
X_valid_sample_scale_pca = pca.transform(X_valid_sample_scale)
X_test_sample_scale_pca = pca.transform(X_test_sample_scale)

In [None]:
%%time
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=30).fit(X_train_sample_scale_pca, y_train_sample)
print(knn.score(X_train_sample_scale_pca, y_train_sample))
print(knn.score(X_valid_sample_scale_pca, y_valid_sample))
print(knn.score(X_test_sample_scale_pca, y_test_sample))
print(timeit.timeit())

In [None]:
+ computation time 
- test accuracy 

## Hyperparameter Optimization

In [None]:
train_accuracies = []
valid_accuracies = []

for n in range(1,30):
    knn = KNeighborsClassifier(n_neighbors=n).fit(X_train_sample_scale_pca, y_train_sample)
    train_accuracies.append(knn.score(X_train_sample_scale_pca, y_train_sample))
    valid_accuracies.append(knn.score(X_valid_sample_scale_pca, y_valid_sample))    

plt.plot(range(1,30), train_accuracies)
plt.plot(range(1,30), valid_accuracies)
plt.legend(['train', 'valid'])
xlabel('Number of Neighbors')
ylabel('Classification Accuracy')


In [None]:
best_n = np.argmax(valid_accuracies)
range(1,30)[best_n]

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier().fit(X_train_scale_pca, y_train)

print(dt.score(X_train_scale_pca, y_train))
print(dt.score(X_test_scale_pca, y_test))

In [None]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier().fit(X_train_sample_scale_pca, y_train_sample)

print(dt.score(X_train_sample_scale_pca, y_train_sample))
print(dt.score(X_valid_sample_scale_pca, y_valid_sample))

## Hyperparameter Optimization

In [None]:
train_accuracies = []
valid_accuracies = []

param_range = range(1,20)

for n in param_range:
    dt = DecisionTreeClassifier(max_depth=n).fit(X_train_sample_scale_pca, y_train_sample)
    train_accuracies.append(dt.score(X_train_sample_scale_pca, y_train_sample))
    valid_accuracies.append(dt.score(X_valid_sample_scale_pca, y_valid_sample))    

plt.plot(param_range, train_accuracies)
plt.plot(param_range, valid_accuracies)
plt.legend(['train', 'valid'])
xlabel('Number of Neighbors')
ylabel('Classification Accuracy')
print(range(1,30)[np.argmax(valid_accuracies)])

In [None]:
dt = DecisionTreeClassifier()
scores = cross_val_score(lr, X_remainder_sample, y_remainder_sample, cv = 5)
print(scores)

In [None]:
from sklearn.model_selection import cross_val_score
cv_accuracies = []

param_range = range(1,20)

for n in param_range:
    dt = DecisionTreeClassifier(max_depth=n)
    cv_accuracies.append((cross_val_score(dt, X_remainder_sample, y_remainder_sample, cv = 5)).mean())

plt.plot(param_range, cv_accuracies)


In [None]:
best_param = param_range[np.argmax(cv_accuracies)]
print(best_param)
print(cv_accuracies[np.argmax(cv_accuracies)])

In [None]:
from sklearn.metrics import confusion_matrix
dt = DecisionTreeClassifier(max_depth=best_param).fit(X_train_sample_scale_pca, y_train_sample)
y_pred = dt.predict(X_test_sample_scale_pca)
confusion_matrix(y_test_sample, y_pred)

In [None]:
X_train.Positive_Ratio = (X_train.Review_Total_Positive_Word_Counts - X_train.Review_Total_Negative_Word_Counts) / \
(X_train.Review_Total_Positive_Word_Counts + X_train.Review_Total_Negative_Word_Counts)
X_train.Positive_Ratio

In [None]:
# add new feature to sample set
X_remainder_sample.Positive_Ratio = (X_remainder_sample.Review_Total_Positive_Word_Counts - X_remainder_sample.Review_Total_Negative_Word_Counts) / \
(X_remainder_sample.Review_Total_Positive_Word_Counts + X_remainder_sample.Review_Total_Negative_Word_Counts)
X_remainder_sample.Positive_Ratio

# scale data with standard scaler
scaler = StandardScaler()
X_remainder_sample_scale = scaler.fit_transform(X_remainder_sample)


'''
# principle component analysis
from sklearn.decomposition import PCA
pca = PCA(n_components=2587 )
X_train_sample_scale_pca = pca.fit_transform(X_train_sample_scale)
X_valid_sample_scale_pca = pca.transform(X_valid_sample_scale)
X_test_sample_scale_pca = pca.transform(X_test_sample_scale)
'''

In [None]:
cv_accuracies = []
param_range = range(1,20)
for n in param_range:
    dt = DecisionTreeClassifier(max_depth=n)
    cv_accuracies.append((cross_val_score(dt, X_remainder_sample, y_remainder_sample, cv = 5)).mean())


plt.plot(param_range, cv_accuracies)
print(range(1,20)[np.argmax(cv_accuracies)])