In [1]:
import pandas as pd
import numpy as np
import datetime as dt

In [2]:
df = pd.read_csv('./data/df_features.gz')

I fix the date because when Pandas opens the file it reads it inconrectly

In [3]:
df['Review_Date'] = df['Review_Date'].apply(lambda x: dt.datetime.strptime(x,'%Y-%m-%d'))

In [4]:
df['Review_Month'] = df.Review_Month.apply(lambda x: str(x))

I delete the rows without a distance to city center (because in the origin hotel don't have latitude and longitude)

In [5]:
df = df.dropna(subset=['Distance'])

### Create Predicted Category for final models

We create from 4 up to 2 categories.
We can check the results for the models in this different categories

##### 4 Categories

In [6]:
tmp = []
for i in df.Diff:
    if i < -2:
        tmp.append('Worse')
    elif i < -0.5:
        tmp.append('Bad')
    elif i < 0.5:
        tmp.append('Average')
    elif i >= 0.5:
        tmp.append('Good')

df['Diff_Recode'] = tmp
df.Diff_Recode.value_counts()

Good       244622
Average    118854
Bad         92901
Worse       56093
Name: Diff_Recode, dtype: int64

##### 3 Categories

In [7]:
tmp = []
for i in df.Diff:
    if i < -0.6:
        tmp.append('Bad')
    elif i < 0.6:
        tmp.append('Average')
    elif i >= 0.6:
        tmp.append('Better')

df['Diff_Recode'] = tmp
df.Diff_Recode.value_counts()

Better     215560
Average    156602
Bad        140308
Name: Diff_Recode, dtype: int64

##### 2 categories

In [8]:
tmp = []
for i in df.Diff:
    if i < -0.6:
        tmp.append('Bad')
    elif i >= -0.6:
        tmp.append('Good')

df['Diff_Recode'] = tmp
df.Diff_Recode.value_counts()

Good    372162
Bad     140308
Name: Diff_Recode, dtype: int64

### Balance Categories and UK

In [9]:
nationalities = list(df.Nationality_Recode.value_counts().index)

In [10]:
balanced1 = None

for i in nationalities:
    tmp = df[df.Nationality_Recode == i]
    n = 30000
    if len(tmp) < 30000:
        n = len(tmp)
    tmp = tmp.sample(n)
    balanced1 = pd.concat([balanced1, tmp])

In [11]:
minclass = np.min(balanced1.Diff_Recode.value_counts())
classes = list(balanced1.Diff_Recode.value_counts().index)
minclass, classes

(59691, ['Good', 'Bad'])

In [12]:
balanced2 = None

for i in classes:
    tmp = balanced1[balanced1.Diff_Recode == i].sample(minclass)
    balanced2 = pd.concat([balanced2, tmp])

### Start

In [13]:
df2 = balanced2.sample(n=50000, random_state=1)

In [14]:
y_col = 'Diff_Recode'
x_col = ['Review_Month', 'City', 'Pet', 'Purpose', 'Whom', 'Room_Recode', 'Nationality_Recode', 'Length_Recode',
         'Average_Score', 'Total_Number_of_Reviews_Reviewer_Has_Given', 'Distance']

In [15]:
X = df2[x_col]
y = df2[y_col]

In [16]:
X = X.fillna('Not Available')
X.columns

Index(['Review_Month', 'City', 'Pet', 'Purpose', 'Whom', 'Room_Recode',
       'Nationality_Recode', 'Length_Recode', 'Average_Score',
       'Total_Number_of_Reviews_Reviewer_Has_Given', 'Distance'],
      dtype='object')

In [17]:
X_normal = X[['Average_Score', 'Total_Number_of_Reviews_Reviewer_Has_Given', 'Distance']]

In [18]:
X_dummy = pd.get_dummies(X[['Review_Month', 'City', 'Pet', 'Purpose', 'Whom', 'Room_Recode',
       'Nationality_Recode', 'Length_Recode']], prefix_sep='_', drop_first=True)

In [19]:
X = pd.concat([X_normal, X_dummy], axis=1, sort=False)

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=100)

In [25]:
X_test.shape, y_test.shape, X_train.shape, y_train.shape

((10000, 50), (10000,), (40000, 50), (40000,))

### KNN

In [119]:
from sklearn.neighbors import KNeighborsClassifier

for i in range(1,10):
    clf = KNeighborsClassifier(n_neighbors=i)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    acc = accuracy_score(pred, y_test)
    kap = cohen_kappa_score(pred, y_test)
    f1s = f1_score(pred, y_test, pos_label='Bad')
    pre = precision_score(pred, y_test, pos_label='Bad')
    rec = recall_score(pred, y_test, pos_label='Bad')
    print('Neighbours:', i, '|',
          'Accuracy:', f'{acc:.3f}', '|',
          'Kappa:', f'{kap:.3f}', '|',
          'F1-Score:', f'{f1s:.3f}', '|',
          'Precision:', f'{pre:.3f}', '|',
          'Recall:', f'{rec:.3f}', '|',
         )

Neighbours: 1 | Accuracy: 0.522 | Kappa: 0.045 | F1-Score: 0.520 | Precision: 0.517 | Recall: 0.524 |
Neighbours: 2 | Accuracy: 0.515 | Kappa: 0.029 | F1-Score: 0.611 | Precision: 0.761 | Recall: 0.511 |
Neighbours: 3 | Accuracy: 0.523 | Kappa: 0.047 | F1-Score: 0.522 | Precision: 0.519 | Recall: 0.525 |
Neighbours: 4 | Accuracy: 0.527 | Kappa: 0.054 | F1-Score: 0.597 | Precision: 0.700 | Recall: 0.521 |
Neighbours: 5 | Accuracy: 0.532 | Kappa: 0.065 | F1-Score: 0.531 | Precision: 0.527 | Recall: 0.534 |
Neighbours: 6 | Accuracy: 0.524 | Kappa: 0.048 | F1-Score: 0.581 | Precision: 0.658 | Recall: 0.520 |
Neighbours: 7 | Accuracy: 0.532 | Kappa: 0.064 | F1-Score: 0.529 | Precision: 0.524 | Recall: 0.534 |
Neighbours: 8 | Accuracy: 0.536 | Kappa: 0.072 | F1-Score: 0.582 | Precision: 0.644 | Recall: 0.531 |
Neighbours: 9 | Accuracy: 0.536 | Kappa: 0.072 | F1-Score: 0.530 | Precision: 0.523 | Recall: 0.538 |


### Gradient Boosting

In [111]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix

clf = GradientBoostingClassifier(learning_rate=0.01, max_depth=4)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
accuracy_score(pred, y_test), cohen_kappa_score(pred, y_test)

(0.5857, 0.17144255471039005)

### Random Forest

In [114]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators = 500, max_depth = 10)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
accuracy_score(pred, y_test), cohen_kappa_score(pred, y_test)

(0.5851, 0.17022588895226465)

In [115]:
mat = clf.predict_proba(X_test)

for threshold in np.arange(0.30, 0.55, 0.01):
    pred = np.array(['Bad' if i > threshold else 'Good' for i in mat[:,0]])
    acc = accuracy_score(pred, y_test)
    kap = cohen_kappa_score(pred, y_test)
    f1s = f1_score(pred, y_test, pos_label='Bad')
    pre = precision_score(pred, y_test, pos_label='Bad')
    rec = recall_score(pred, y_test, pos_label='Bad')
    mea = (acc + kap + f1s) / 3
    print('Accuracy:', f'{acc:.3f}', '|',
          'Kappa:', f'{kap:.3f}', '|',
          'F1-Score:', f'{f1s:.3f}', '|',
          'Precision:', f'{pre:.3f}', '|',
          'Recall:', f'{rec:.3f}', '|',
          'Mean:', f'{mea:.3f}', '|',
          'Threshold:', f'{threshold:.2f}'
         )

Accuracy: 0.508 | Kappa: 0.015 | F1-Score: 0.670 | Precision: 0.996 | Recall: 0.505 | Mean: 0.398 | Threshold: 0.30
Accuracy: 0.509 | Kappa: 0.017 | F1-Score: 0.670 | Precision: 0.992 | Recall: 0.505 | Mean: 0.399 | Threshold: 0.31
Accuracy: 0.512 | Kappa: 0.023 | F1-Score: 0.671 | Precision: 0.990 | Recall: 0.507 | Mean: 0.402 | Threshold: 0.32
Accuracy: 0.515 | Kappa: 0.028 | F1-Score: 0.671 | Precision: 0.986 | Recall: 0.508 | Mean: 0.405 | Threshold: 0.33
Accuracy: 0.518 | Kappa: 0.034 | F1-Score: 0.671 | Precision: 0.982 | Recall: 0.510 | Mean: 0.408 | Threshold: 0.34
Accuracy: 0.521 | Kappa: 0.040 | F1-Score: 0.672 | Precision: 0.977 | Recall: 0.512 | Mean: 0.411 | Threshold: 0.35
Accuracy: 0.525 | Kappa: 0.048 | F1-Score: 0.672 | Precision: 0.973 | Recall: 0.514 | Mean: 0.415 | Threshold: 0.36
Accuracy: 0.529 | Kappa: 0.056 | F1-Score: 0.673 | Precision: 0.967 | Recall: 0.516 | Mean: 0.419 | Threshold: 0.37
Accuracy: 0.534 | Kappa: 0.066 | F1-Score: 0.673 | Precision: 0.959 | Re

### XGBoost

In [210]:
import xgboost as xgb

clf = xgb.XGBClassifier(objective="binary:logistic", n_estimators = 3000)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
accuracy_score(pred, y_test), cohen_kappa_score(pred, y_test)

(0.5455, 0.09102843263062732)

### Logistic Regresion

In [85]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, cohen_kappa_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import PolynomialFeatures

clf = LogisticRegression(solver='lbfgs', max_iter=1000)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print('Accuracy:', accuracy_score(pred, y_test), ' | ',
      'Kappa:',round(cohen_kappa_score(pred, y_test),4), ' | ',
      'F1-Score:',round(f1_score(pred, y_test, pos_label='Bad'),4), ' | ',
      'Precision:',round(precision_score(pred, y_test, pos_label='Bad'),4), ' | ',
      'Recall:',round(recall_score(pred, y_test, pos_label='Bad'),4)
     )

Accuracy: 0.5852  |  Kappa: 0.1705  |  F1-Score: 0.5721  |  Precision: 0.5533  |  Recall: 0.5923


In [110]:
mat = clf.predict_proba(X_test)

for threshold in np.arange(0.30, 0.55, 0.01):
    pred = np.array(['Bad' if i > threshold else 'Good' for i in mat[:,0]])
    acc = accuracy_score(pred, y_test)
    kap = cohen_kappa_score(pred, y_test)
    f1s = f1_score(pred, y_test, pos_label='Bad')
    pre = precision_score(pred, y_test, pos_label='Bad')
    rec = recall_score(pred, y_test, pos_label='Bad')
    mea = (acc + kap + f1s) / 3
    print('Accuracy:', f'{acc:.3f}', '|',
          'Kappa:', f'{kap:.3f}', '|',
          'F1-Score:', f'{f1s:.3f}', '|',
          'Precision:', f'{pre:.3f}', '|',
          'Recall:', f'{rec:.3f}', '|',
          'Mean:', f'{mea:.3f}', '|',
          'Threshold:', f'{threshold:.2f}'
         )

Accuracy: 0.513 | Kappa: 0.024 | F1-Score: 0.671 | Precision: 0.990 | Recall: 0.507 | Mean: 0.403 | Threshold: 0.300
Accuracy: 0.516 | Kappa: 0.029 | F1-Score: 0.671 | Precision: 0.984 | Recall: 0.509 | Mean: 0.405 | Threshold: 0.310
Accuracy: 0.520 | Kappa: 0.038 | F1-Score: 0.672 | Precision: 0.979 | Recall: 0.511 | Mean: 0.410 | Threshold: 0.320
Accuracy: 0.526 | Kappa: 0.050 | F1-Score: 0.673 | Precision: 0.974 | Recall: 0.514 | Mean: 0.417 | Threshold: 0.330
Accuracy: 0.531 | Kappa: 0.060 | F1-Score: 0.674 | Precision: 0.966 | Recall: 0.517 | Mean: 0.421 | Threshold: 0.340
Accuracy: 0.537 | Kappa: 0.073 | F1-Score: 0.675 | Precision: 0.957 | Recall: 0.521 | Mean: 0.428 | Threshold: 0.350
Accuracy: 0.543 | Kappa: 0.085 | F1-Score: 0.674 | Precision: 0.944 | Recall: 0.525 | Mean: 0.434 | Threshold: 0.360
Accuracy: 0.549 | Kappa: 0.096 | F1-Score: 0.674 | Precision: 0.930 | Recall: 0.528 | Mean: 0.440 | Threshold: 0.370
Accuracy: 0.554 | Kappa: 0.106 | F1-Score: 0.671 | Precision: 0.

### Neural Network

In [120]:
import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping

predictors = X_train.to_numpy()
df_target = [1 if i=='Good' else 0 for i in y_train]
target = to_categorical(df_target)
pred_data = X_test.to_numpy()

early_stopping_monitor = EarlyStopping(patience=2)
n_cols = predictors.shape[1]
input_shape = (n_cols,)

Using TensorFlow backend.


In [169]:
model = Sequential()
model.add(Dense(100, activation='relu', input_shape=input_shape))
model.add(Dense(100, activation='relu'))
model.add(Dense(2, activation='softmax'))

In [170]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(predictors, target, epochs=20, validation_split=0.3, callbacks=[early_stopping_monitor], verbose=False)

<keras.callbacks.callbacks.History at 0x2b3ba85cf28>

In [171]:
predictions = list(model.predict(pred_data)[:,1])
pred = ['Good' if round(i) else 'Bad' for i in predictions]
print('Accuracy:',accuracy_score(pred, y_test), '|', 
      'Kappa:', round(cohen_kappa_score(pred,y_test),4), '|', 
      'F1:',round(f1_score(pred,y_test, pos_label='Bad'),4))

Accuracy: 0.5667 | Kappa: 0.1338 | F1: 0.5254
