In [1]:
import pandas as pd
import numpy as np
import datetime as dt

In [2]:
df = pd.read_csv('./data/df_features.gz')

I fix the date because when Pandas opens the file it reads it inconrectly

In [3]:
df['Review_Date'] = df['Review_Date'].apply(lambda x: dt.datetime.strptime(x,'%Y-%m-%d'))

In [4]:
df['Review_Month'] = df.Review_Month.apply(lambda x: str(x))

I delete the rows without a distance to city center (because in the origin hotel don't have latitude and longitude)

In [5]:
df = df.dropna(subset=['Distance'])

### Create Predicted Category for final models

We create from 4 up to 2 categories.
We can check the results for the models in this different categories

##### 4 Categories

In [6]:
tmp = []
for i in df.Diff:
    if i < -2:
        tmp.append('Worse')
    elif i < -0.5:
        tmp.append('Bad')
    elif i < 0.5:
        tmp.append('Average')
    elif i >= 0.5:
        tmp.append('Good')

df['Diff_Recode'] = tmp
df.Diff_Recode.value_counts()

Good       244622
Average    118854
Bad         92901
Worse       56093
Name: Diff_Recode, dtype: int64

##### 3 Categories

In [7]:
tmp = []
for i in df.Diff:
    if i < -0.6:
        tmp.append('Bad')
    elif i < 0.6:
        tmp.append('Average')
    elif i >= 0.6:
        tmp.append('Better')

df['Diff_Recode'] = tmp
df.Diff_Recode.value_counts()

Better     215560
Average    156602
Bad        140308
Name: Diff_Recode, dtype: int64

##### 2 categories

In [8]:
tmp = []
for i in df.Diff:
    if i < -0.6:
        tmp.append('Bad')
    elif i >= -0.6:
        tmp.append('Good')

df['Diff_Recode'] = tmp
df.Diff_Recode.value_counts()

Good    372162
Bad     140308
Name: Diff_Recode, dtype: int64

### Balance Categories and UK

In [9]:
nationalities = list(df.Nationality_Recode.value_counts().index)

In [10]:
balanced1 = None

for i in nationalities:
    tmp = df[df.Nationality_Recode == i]
    n = 30000
    if len(tmp) < 30000:
        n = len(tmp)
    tmp = tmp.sample(n)
    balanced1 = pd.concat([balanced1, tmp])

In [11]:
minclass = np.min(balanced1.Diff_Recode.value_counts())
classes = list(balanced1.Diff_Recode.value_counts().index)
minclass, classes

(59685, ['Good', 'Bad'])

In [12]:
balanced2 = None

for i in classes:
    tmp = balanced1[balanced1.Diff_Recode == i].sample(minclass)
    balanced2 = pd.concat([balanced2, tmp])

### Start

In [13]:
df2 = balanced2.sample(n=50000, random_state=1)

In [14]:
y_col = 'Diff_Recode'
x_col = ['Review_Month', 'City', 'Pet', 'Purpose', 'Whom', 'Room_Recode', 'Nationality_Recode', 'Length_Recode',
         'Average_Score', 'Total_Number_of_Reviews_Reviewer_Has_Given', 'Distance']

In [15]:
X = df2[x_col]
y = df2[y_col]

In [16]:
X = X.fillna('Not Available')
X.columns

Index(['Review_Month', 'City', 'Pet', 'Purpose', 'Whom', 'Room_Recode',
       'Nationality_Recode', 'Length_Recode', 'Average_Score',
       'Total_Number_of_Reviews_Reviewer_Has_Given', 'Distance'],
      dtype='object')

In [17]:
X_normal = X[['Average_Score', 'Total_Number_of_Reviews_Reviewer_Has_Given', 'Distance']]

In [18]:
X_dummy = pd.get_dummies(X[['Review_Month', 'City', 'Pet', 'Purpose', 'Whom', 'Room_Recode',
       'Nationality_Recode', 'Length_Recode']], prefix_sep='_', drop_first=True)

In [19]:
X = pd.concat([X_normal, X_dummy], axis=1, sort=False)

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=100)

In [21]:
X_test.shape, y_test.shape, X_train.shape, y_train.shape

((10000, 50), (10000,), (40000, 50), (40000,))

### KNN

In [75]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, cohen_kappa_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import PolynomialFeatures

for i in range(6,7):
    clf = KNeighborsClassifier(n_neighbors=i)
    clf.fit(X_train, y_train)
    train_knn = clf.predict(X_train)
    pred_knn = clf.predict(X_test)
    acc = accuracy_score(pred_knn, y_test)
    kap = cohen_kappa_score(pred_knn, y_test)
    f1s = f1_score(pred_knn, y_test, pos_label='Bad')
    pre = precision_score(pred_knn, y_test, pos_label='Bad')
    rec = recall_score(pred_knn, y_test, pos_label='Bad')
    print('Neighbours:', i, '|',
          'Accuracy:', f'{acc:.3f}', '|',
          'Kappa:', f'{kap:.3f}', '|',
          'F1-Score:', f'{f1s:.3f}', '|',
          'Precision:', f'{pre:.3f}', '|',
          'Recall:', f'{rec:.3f}', '|',
         )

Neighbours: 6 | Accuracy: 0.537 | Kappa: 0.076 | F1-Score: 0.595 | Precision: 0.688 | Recall: 0.524 |


### Gradient Boosting

In [76]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix

clf = GradientBoostingClassifier(learning_rate=0.01, max_depth=4)
clf.fit(X_train, y_train)
train_gbt = clf.predict(X_train)
pred_gbt = clf.predict(X_test)
accuracy_score(pred_gbt, y_test), cohen_kappa_score(pred_gbt, y_test)

(0.5849, 0.169320930767192)

### Random Forest

In [77]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators = 500, max_depth = 10)
clf.fit(X_train, y_train)
train_rf = clf.predict(X_train)
pred_rf = clf.predict(X_test)
accuracy_score(pred_rf, y_test), cohen_kappa_score(pred_rf, y_test)

(0.5895, 0.17924287844740983)

In [36]:
mat = clf.predict_proba(X_test)

for threshold in np.arange(0.30, 0.55, 0.05):
    pred_rf = np.array(['Bad' if i > threshold else 'Good' for i in mat[:,0]])
    acc = accuracy_score(pred_rf, y_test)
    kap = cohen_kappa_score(pred_rf, y_test)
    f1s = f1_score(pred_rf, y_test, pos_label='Bad')
    pre = precision_score(pred_rf, y_test, pos_label='Bad')
    rec = recall_score(pred_rf, y_test, pos_label='Bad')
    mea = (acc + kap + f1s) / 3
    print('Accuracy:', f'{acc:.3f}', '|',
          'Kappa:', f'{kap:.3f}', '|',
          'F1-Score:', f'{f1s:.3f}', '|',
          'Precision:', f'{pre:.3f}', '|',
          'Recall:', f'{rec:.3f}', '|',
          'Mean:', f'{mea:.3f}', '|',
          'Threshold:', f'{threshold:.2f}'
         )

Accuracy: 0.505 | Kappa: 0.020 | F1-Score: 0.665 | Precision: 0.993 | Recall: 0.500 | Mean: 0.397 | Threshold: 0.30
Accuracy: 0.524 | Kappa: 0.057 | F1-Score: 0.669 | Precision: 0.972 | Recall: 0.510 | Mean: 0.416 | Threshold: 0.35
Accuracy: 0.540 | Kappa: 0.087 | F1-Score: 0.664 | Precision: 0.920 | Recall: 0.520 | Mean: 0.430 | Threshold: 0.40
Accuracy: 0.567 | Kappa: 0.138 | F1-Score: 0.652 | Precision: 0.820 | Recall: 0.541 | Mean: 0.452 | Threshold: 0.45
Accuracy: 0.590 | Kappa: 0.180 | F1-Score: 0.595 | Precision: 0.610 | Recall: 0.581 | Mean: 0.455 | Threshold: 0.50
Accuracy: 0.570 | Kappa: 0.136 | F1-Score: 0.438 | Precision: 0.339 | Recall: 0.620 | Mean: 0.382 | Threshold: 0.55


### XGBoost

In [78]:
import xgboost as xgb

clf = xgb.XGBClassifier(objective="binary:logistic", n_estimators = 3000)
clf.fit(X_train, y_train)
train_xgb = clf.predict(X_train)
pred_xgb = clf.predict(X_test)
accuracy_score(pred_xgb, y_test), cohen_kappa_score(pred_xgb, y_test)

(0.5496, 0.0993167285519797)

### Logistic Regresion

In [79]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, cohen_kappa_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import PolynomialFeatures

clf = LogisticRegression(solver='lbfgs', max_iter=1000)
clf.fit(X_train, y_train)
train_log = clf.predict(X_train)
pred_log = clf.predict(X_test)
print('Accuracy:', accuracy_score(pred_log, y_test), ' | ',
      'Kappa:',round(cohen_kappa_score(pred_log, y_test),4), ' | ',
      'F1-Score:',round(f1_score(pred_log, y_test, pos_label='Bad'),4), ' | ',
      'Precision:',round(precision_score(pred_log, y_test, pos_label='Bad'),4), ' | ',
      'Recall:',round(recall_score(pred_log, y_test, pos_label='Bad'),4)
     )

Accuracy: 0.59  |  Kappa: 0.1797  |  F1-Score: 0.5814  |  Precision: 0.5756  |  Recall: 0.5873


In [40]:
mat = clf.predict_proba(X_test)

for threshold in np.arange(0.30, 0.55, 0.05):
    pred_log = np.array(['Bad' if i > threshold else 'Good' for i in mat[:,0]])
    acc = accuracy_score(pred_log, y_test)
    kap = cohen_kappa_score(pred_log, y_test)
    f1s = f1_score(pred_log, y_test, pos_label='Bad')
    pre = precision_score(pred_log, y_test, pos_label='Bad')
    rec = recall_score(pred_log, y_test, pos_label='Bad')
    mea = (acc + kap + f1s) / 3
    print('Accuracy:', f'{acc:.3f}', '|',
          'Kappa:', f'{kap:.3f}', '|',
          'F1-Score:', f'{f1s:.3f}', '|',
          'Precision:', f'{pre:.3f}', '|',
          'Recall:', f'{rec:.3f}', '|',
          'Mean:', f'{mea:.3f}', '|',
          'Threshold:', f'{threshold:.2f}'
         )

Accuracy: 0.507 | Kappa: 0.024 | F1-Score: 0.665 | Precision: 0.987 | Recall: 0.501 | Mean: 0.398 | Threshold: 0.30
Accuracy: 0.533 | Kappa: 0.074 | F1-Score: 0.668 | Precision: 0.951 | Recall: 0.515 | Mean: 0.425 | Threshold: 0.35
Accuracy: 0.558 | Kappa: 0.121 | F1-Score: 0.657 | Precision: 0.858 | Recall: 0.533 | Mean: 0.445 | Threshold: 0.40
Accuracy: 0.585 | Kappa: 0.171 | F1-Score: 0.634 | Precision: 0.726 | Recall: 0.562 | Mean: 0.463 | Threshold: 0.45
Accuracy: 0.590 | Kappa: 0.180 | F1-Score: 0.581 | Precision: 0.576 | Recall: 0.587 | Mean: 0.450 | Threshold: 0.50
Accuracy: 0.580 | Kappa: 0.156 | F1-Score: 0.489 | Precision: 0.407 | Recall: 0.613 | Mean: 0.408 | Threshold: 0.55


### Neural Network

In [80]:
import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping

predictors = X_train.to_numpy()
df_target = [1 if i=='Good' else 0 for i in y_train]
target = to_categorical(df_target)
pred_data = X_test.to_numpy()

early_stopping_monitor = EarlyStopping(patience=2)
n_cols = predictors.shape[1]
input_shape = (n_cols,)

In [81]:
model = Sequential()
model.add(Dense(100, activation='relu', input_shape=input_shape))
model.add(Dense(100, activation='relu'))
model.add(Dense(2, activation='softmax'))

In [82]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(predictors, target, epochs=20, validation_split=0.3, callbacks=[early_stopping_monitor], verbose=False)

<keras.callbacks.callbacks.History at 0x15990dbb4e0>

In [83]:
predictions_test = list(model.predict(pred_data)[:,1])
predictions_train = list(model.predict(predictors)[:,1])
pred_net = ['Good' if round(i) else 'Bad' for i in predictions_test]
train_net = ['Good' if round(i) else 'Bad' for i in predictions_train]
print('Accuracy:',accuracy_score(pred_net, y_test), '|', 
      'Kappa:', round(cohen_kappa_score(pred_net,y_test),4), '|', 
      'F1:',round(f1_score(pred_net, y_test, pos_label='Bad'),4))

Accuracy: 0.5605 | Kappa: 0.1246 | F1: 0.6296


In [86]:
df_models_test = pd.DataFrame(list(zip(pred_log, pred_knn, pred_xgb, pred_gbt, pred_net, pred_rf)), 
                         columns=['logistic','knn','xgb','gbt','nnet','rf']) 
df_models_test.head()

Unnamed: 0,logistic,knn,xgb,gbt,nnet,rf
0,Bad,Good,Bad,Bad,Bad,Bad
1,Bad,Bad,Bad,Good,Bad,Bad
2,Bad,Bad,Bad,Good,Bad,Bad
3,Bad,Good,Bad,Bad,Bad,Bad
4,Bad,Bad,Bad,Bad,Bad,Bad


In [87]:
df_models_train = pd.DataFrame(list(zip(train_log, train_knn, train_xgb, train_gbt, train_net, train_rf)), 
                         columns=['logistic','knn','xgb','gbt','nnet','rf']) 
df_models_train.head()

Unnamed: 0,logistic,knn,xgb,gbt,nnet,rf
0,Good,Good,Good,Good,Good,Good
1,Good,Bad,Bad,Good,Bad,Good
2,Good,Good,Good,Good,Good,Good
3,Good,Good,Good,Good,Good,Good
4,Bad,Bad,Bad,Bad,Bad,Bad


### Gradient Boosting

In [144]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix

clf = GradientBoostingClassifier(learning_rate=0.01, max_depth=4)
clf.fit(X_train, y_train)
pred_gbt = clf.predict(X_test)
accuracy_score(pred_gbt, y_test), cohen_kappa_score(pred_gbt, y_test)

(0.5849, 0.169320930767192)

### Random Forest

In [139]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators = 500, max_depth = 10)
clf.fit(X_train_2, y_train)
train_rf = clf.predict(X_train_2)
pred_rf = clf.predict(X_test_2)
accuracy_score(pred_rf, y_test), cohen_kappa_score(pred_rf, y_test)

(0.5496, 0.0993167285519797)

### Logistic Regresion

In [140]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, cohen_kappa_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import PolynomialFeatures

clf = LogisticRegression(solver='lbfgs', max_iter=1000)
clf.fit(X_train_2, y_train)
train_log = clf.predict(X_train_2)
pred_log = clf.predict(X_test_2)
print('Accuracy:', accuracy_score(pred_log, y_test), ' | ',
      'Kappa:',round(cohen_kappa_score(pred_log, y_test),4), ' | ',
      'F1-Score:',round(f1_score(pred_log, y_test, pos_label='Bad'),4), ' | ',
      'Precision:',round(precision_score(pred_log, y_test, pos_label='Bad'),4), ' | ',
      'Recall:',round(recall_score(pred_log, y_test, pos_label='Bad'),4)
     )

Accuracy: 0.5496  |  Kappa: 0.0993  |  F1-Score: 0.5499  |  Precision: 0.5562  |  Recall: 0.5437
