### Open File

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [2]:
# Read the review feature table; pandas infers gzip compression from the '.gz' suffix.
df = pd.read_csv(filepath_or_buffer='./data/df_features.gz')

In [3]:
# Keep only the rows where `Dist_Center` is present; rows with a missing value are discarded.
df = df[df['Dist_Center'].notna()]

In [4]:
# Display the column index so the feature lists below can be chosen from it.
df.columns

Index(['Hotel_Address', 'Additional_Number_of_Scoring', 'Review_Date',
       'Average_Score', 'Hotel_Name', 'Reviewer_Nationality',
       'Review_Total_Negative_Word_Counts', 'Total_Number_of_Reviews',
       'Review_Total_Positive_Word_Counts',
       'Total_Number_of_Reviews_Reviewer_Has_Given', 'Reviewer_Score',
       'days_since_review', 'lat', 'lng', 'Diff', 'Diff_Percentage',
       'Review_Month', 'Review_Year', 'Country', 'City', 'Pet', 'Purpose',
       'Whom', 'Room', 'Length', 'Device', 'Room_Recode', 'Nationality_Recode',
       'Length_Recode', 'Close_Landmarks', 'Dist_Center', 'Dist_Airport',
       'Dist_Train', 'Price', 'Stars', 'Length_N', 'Reservation_ADR',
       'food_Neg', 'staff_Neg', 'location_Neg', 'value_Neg', 'comfort_Neg',
       'room_Neg', 'facilities_Neg', 'cleanliness_Neg', 'food_Pos',
       'staff_Pos', 'location_Pos', 'value_Pos', 'comfort_Pos', 'room_Pos',
       'facilities_Pos', 'cleanliness_Pos', 'food_Neg_Hotel',
       'staff_Neg_Hotel', 'loca

### Select Features for the Model

In [5]:
# Restrict the modelling data to reviews whose Reviewer_Score is strictly below 7,
# then display the resulting (rows, columns) shape.
df_select = df.loc[df['Reviewer_Score'].lt(7)]
df_select.shape

(86309, 69)

In [109]:
# Target column predicted by the models below.
y_col = 'location_Neg'

# Columns passed to pd.get_dummies further down.
x_categorical = [
    'Review_Month',
    'City',
    'Pet',
    'Purpose',
    'Whom',
    'Room_Recode',
    'Nationality_Recode',
    'Length_Recode',
    'Stars',
]

# Columns that are standardised further down: six stand-alone measures followed by the
# sixteen '<aspect>_<Neg|Pos>_Hotel' columns (all Neg columns first, then all Pos columns).
# NOTE(review): 'location_Neg_Hotel' shares its stem with the target 'location_Neg' --
# confirm it cannot leak the row's own label.
x_numerical = [
    'Average_Score',
    'Total_Number_of_Reviews_Reviewer_Has_Given',
    'Close_Landmarks',
    'Dist_Center',
    'Dist_Airport',
    'Dist_Train',
] + [
    '{}_{}_Hotel'.format(aspect, polarity)
    for polarity in ('Neg', 'Pos')
    for aspect in ('food', 'staff', 'location', 'value', 'comfort', 'room', 'facilities', 'cleanliness')
]

# Full predictor list: categorical columns first, numerical columns second.
x_col = [*x_categorical, *x_numerical]

In [110]:
# Balance the target by downsampling every class to the size of the rarest one.
# The class frequencies are computed once, and the per-class samples are collected in a
# list and concatenated in a single call instead of growing a frame inside the loop
# (starting from None). Row order and content match the incremental version: classes
# are visited in value_counts() order and each sample uses random_state=1.
class_counts = df_select[y_col].value_counts()
minclass = class_counts.min()
classes = list(class_counts.index)
df_balance = pd.concat(
    [
        df_select[df_select[y_col] == label].sample(minclass, random_state=1)
        for label in classes
    ]
)

In [111]:
# Draw a reproducible working sample of at most 10,000 rows from the balanced frame.
# DataFrame.sample raises ValueError when n exceeds the number of rows (sampling is
# without replacement), so the request is capped at the frame length; with 10,000 or
# more rows available the result is the same as asking for exactly 10,000.
df_model = df_balance.sample(n=min(10000, len(df_balance)), random_state=1)

In [112]:
# Standardise each numerical column: subtract its mean, then divide by its standard
# deviation (np.mean / np.std evaluated per column over every row of df_model).
# NOTE(review): this runs before the train/test split, so held-out rows contribute to
# the scaling statistics -- confirm that is acceptable.
X_numerical = df_model[x_numerical]
X_numerical_std = X_numerical.apply(lambda col: col.sub(np.mean(col)).div(np.std(col)))

In [113]:
# One-hot encode the categorical predictors.
# Missing values must be labelled BEFORE encoding: get_dummies ignores NaN, so a row with
# a missing category becomes all zeros, which with drop_first=True is indistinguishable
# from the dropped baseline level. Calling fillna on the encoded frame (as was done
# previously) can never create a 'Not Available' level. Only text (object) columns are
# filled, so columns that get_dummies passes through as numbers keep their dtype.
categorical_raw = df_model[x_categorical]
text_cols = categorical_raw.select_dtypes(include=['object']).columns
categorical_filled = categorical_raw.fillna({col: 'Not Available' for col in text_cols})
X_categorical = pd.get_dummies(categorical_filled, prefix_sep='_', drop_first=True)

In [114]:
# Assemble the design matrix (standardised numerical columns followed by the encoded
# categorical columns, aligned on the row index) and pull out the target series.
X = pd.concat(
    [X_numerical_std, X_categorical],
    axis='columns',
    sort=False,
)
y = df_model.loc[:, y_col]

In [115]:
from sklearn.model_selection import train_test_split

# Reproducible split (random_state=100) that holds out 75% of the rows for testing and
# leaves 25% for training.
# NOTE(review): a 75% test share is unusually large -- confirm it is intentional and not
# a swapped train/test proportion.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.75,
    random_state=100,
)

In [116]:
# Sanity check: show the test shapes first, then the train shapes, for features and target.
X_test.shape, y_test.shape, X_train.shape, y_train.shape

((7500, 59), (7500,), (2500, 59), (2500,))

In [117]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB

from sklearn.metrics import classification_report, accuracy_score, cohen_kappa_score, f1_score, precision_score, recall_score

In [118]:
# Candidate estimators, all seeded with random_state=1 so repeated runs are comparable.
log = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    random_state=1,
)
gbt = GradientBoostingClassifier(
    learning_rate=0.1,
    max_depth=2,
    random_state=1,
)
rf = RandomForestClassifier(
    n_estimators=25,
    max_depth=4,
    random_state=1,
)
dt = DecisionTreeClassifier(
    max_depth=4,
    random_state=1,
)
svm = SVC(
    C=0.5,
    random_state=1,
)

In [119]:
# (display name, estimator) pairs; the same list feeds the per-model loop and the
# VotingClassifier below.
classifiers = list(zip(
    ['Logistic Regression', 'Gradient Boosted Trees', 'Random Forest', 'Decision Trees', 'SVM'],
    [log, gbt, rf, dt, svm],
))

In [120]:
# Fit every candidate on the training split and report its accuracy on the test split.
for name, model in classifiers:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # accuracy_score expects (y_true, y_pred). Accuracy is symmetric, so the printed
    # value is unchanged, but the conventional order keeps this loop correct if another
    # (asymmetric) metric is substituted later.
    acc = accuracy_score(y_test, y_pred)
    print('{} Accuracy: {:.3f}'.format(name, acc))

Logistic Regression Accuracy: 0.571
Gradient Boosted Trees Accuracy: 0.565
Random Forest Accuracy: 0.559
Decision Trees Accuracy: 0.547
SVM Accuracy: 0.567


In [121]:
# Combine the candidates in a hard-voting ensemble (each estimator casts one vote with
# its predicted label) and report the ensemble's accuracy on the test split.
vc = VotingClassifier(estimators=classifiers, voting='hard')
vc.fit(X_train, y_train)
y_pred = vc.predict(X_test)
# accuracy_score expects (y_true, y_pred); the value is the same either way for
# accuracy, but the documented order is used for consistency with other metrics.
acc = accuracy_score(y_test, y_pred)
print('{} Accuracy: {:.3f}'.format('Voting Classifier', acc))

Voting Classifier Accuracy: 0.571


### Models

In [98]:
# k-nearest-neighbours model with 5 neighbours, passed to the `evaluate` helper.
# NOTE(review): `evaluate` is not defined in the visible cells, and this section's
# execution counts (In [98]-[103]) precede those of the cells above -- confirm the
# helper still exists and that this section survives Restart & Run All.
# The printed output shows two numbers each for "Test" and "Train"; presumably
# accuracy | Cohen's kappa -- TODO confirm against the helper.
pred_knn = evaluate(KNeighborsClassifier(n_neighbors=5))

Test : 0.5209 | 0.0419
Train: 0.7176 | 0.4352


In [99]:
# Gradient-boosted trees (learning_rate=0.1, max_depth=2, seeded) passed to `evaluate`.
# NOTE(review): `evaluate` is defined outside the visible cells; its return value is
# presumably the model's predictions -- confirm.
pred_gbt = evaluate(GradientBoostingClassifier(learning_rate=0.1, max_depth=2, random_state=1))

Test : 0.5625 | 0.1250
Train: 0.6648 | 0.3296


In [100]:
# Random forest (25 trees, max_depth=4, seeded) passed to `evaluate`.
# NOTE(review): `evaluate` is defined outside the visible cells; its return value is
# presumably the model's predictions -- confirm.
pred_rf = evaluate(RandomForestClassifier(n_estimators = 25, max_depth = 4, random_state=1))

Test : 0.5620 | 0.1240
Train: 0.6500 | 0.3000


In [101]:
# Logistic regression (lbfgs solver, up to 1000 iterations, seeded) passed to `evaluate`.
# NOTE(review): `evaluate` is defined outside the visible cells; its return value is
# presumably the model's predictions -- confirm.
pred_log = evaluate(LogisticRegression(solver='lbfgs', max_iter=1000, random_state=1))

Test : 0.5704 | 0.1408
Train: 0.6108 | 0.2216


In [102]:
# Single decision tree (max_depth=4, seeded) passed to `evaluate`.
# NOTE(review): `evaluate` is defined outside the visible cells; its return value is
# presumably the model's predictions -- confirm.
pred_tree = evaluate(DecisionTreeClassifier(max_depth=4, random_state=1))

Test : 0.5563 | 0.1124
Train: 0.6200 | 0.2402


In [103]:
# Support-vector classifier (C=0.5, seeded) passed to `evaluate`.
# NOTE(review): `evaluate` is defined outside the visible cells; its return value is
# presumably the model's predictions -- confirm.
pred_svm = evaluate(SVC(C=0.5, random_state=1))

Test : 0.5636 | 0.1272
Train: 0.6292 | 0.2584
