In [119]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import datetime as dt

%matplotlib inline

In [120]:
df = pd.read_csv('./data/df_features.gz')

In [121]:
df['Review_Date'] = df['Review_Date'].apply(lambda x: dt.datetime.strptime(x,'%Y-%m-%d'))

In [122]:
df['Review_Month'] = df.Review_Month.apply(lambda x: str(x))

In [123]:
df = df.dropna(subset=['Distance'])

### Config

In [124]:
# 4 categories

tmp = []
for i in df.Diff:
    if i < -2:
        tmp.append('Worse')
    elif i < -0.5:
        tmp.append('Bad')
    elif i < 0.5:
        tmp.append('Average')
    elif i >= 0.5:
        tmp.append('Good')

df['Diff_Recode'] = tmp
df.Diff_Recode.value_counts()/len(df.Diff_Recode)

Good       0.477339
Average    0.231924
Bad        0.181281
Worse      0.109456
Name: Diff_Recode, dtype: float64

In [125]:
# 3 categories

tmp = []
for i in df.Diff:
    if i < -0.6:
        tmp.append('Bad')
    elif i < 0.6:
        tmp.append('Average')
    elif i >= 0.6:
        tmp.append('Better')

df['Diff_Recode'] = tmp
df.Diff_Recode.value_counts()

Better     215560
Average    156602
Bad        140308
Name: Diff_Recode, dtype: int64

In [126]:
# 2 categories

tmp = []
for i in df.Diff:
    if i < -0.6:
        tmp.append('Bad')
    elif i >= -0.6:
        tmp.append('Good')

df['Diff_Recode'] = tmp

### First Train / Test

In [127]:
df_test = df.sample(10000, random_state=1)
df_test['test'] = 1
df_test.shape

(10000, 36)

In [128]:
df_train = df.drop(df_test.index)
df_train['test'] = 0
df_train.shape

(502470, 36)

### Balance Categories and UK

In [129]:
nationalities = list(df_train.Nationality_Recode.value_counts().index)

In [130]:
balanced1 = None

for i in nationalities:
    tmp = df_train[df_train.Nationality_Recode == i]
    n = 30000
    if len(tmp) < 30000:
        n = len(tmp)
    tmp = tmp.sample(n)
    balanced1 = pd.concat([balanced1, tmp])
balanced1.shape

(207278, 36)

In [131]:
minclass = np.min(balanced1.Diff_Recode.value_counts())
classes = list(balanced1.Diff_Recode.value_counts().index)
minclass, classes

(59170, ['Good', 'Bad'])

In [132]:
balanced2 = None

for i in classes:
    tmp = balanced1[balanced1.Diff_Recode == i].sample(minclass)
    balanced2 = pd.concat([balanced2, tmp])
balanced2.shape

(118340, 36)

### Start

In [133]:
df_balanced = balanced2.sample(50000, random_state=1)
df2 = pd.concat([df_balanced, df_test])

In [134]:
y_col = 'Diff_Recode'
x_col_dummy = ['Review_Month', 'City', 'Pet', 'Purpose', 'Whom', 'Room_Recode', 'Nationality_Recode', 'Length_Recode']
x_col_normal = ['Average_Score', 'Total_Number_of_Reviews_Reviewer_Has_Given', 'Distance', 'test']

In [135]:
X_dummy = df2[x_col_dummy]
X_normal = df2[x_col_normal]
y = df2[y_col]

In [136]:
X_dummy = X_dummy.fillna('Not Available')
X_dummy = pd.get_dummies(X_dummy, prefix_sep='_', drop_first=True)

In [137]:
df_final = pd.concat([y, X_normal, X_dummy], axis=1, sort=False)
df.shape

(512470, 35)

In [138]:
df_test = df_final[df_final.test==1]
df_train = df_final[df_final.test==0]
df_test.shape, df_train.shape

((10000, 53), (50000, 53))

In [139]:
y_train = df_train[y_col]
X_train = df_train.drop([y_col,'test'], axis=1)
y_test = df_test[y_col]
X_test = df_test.drop([y_col,'test'], axis=1)
y_train.shape, X_train.shape, y_test.shape, X_test.shape

((50000,), (50000, 51), (10000,), (10000, 51))

In [140]:
X_test.shape, y_test.shape, X_train.shape, y_train.shape

((10000, 51), (10000,), (50000, 51), (50000,))

### KNN

In [65]:
from sklearn.neighbors import KNeighborsClassifier

for i in range(1,10):
    clf = KNeighborsClassifier(n_neighbors=i)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    acc = accuracy_score(pred, y_test)
    kap = cohen_kappa_score(pred, y_test)
    f1s = f1_score(pred, y_test, pos_label='Bad')
    pre = precision_score(pred, y_test, pos_label='Bad')
    rec = recall_score(pred, y_test, pos_label='Bad')
    print('Neighbours:', i, '|',
          'Accuracy:', f'{acc:.3f}', '|',
          'Kappa:', f'{kap:.3f}', '|',
          'F1-Score:', f'{f1s:.3f}', '|',
          'Precision:', f'{pre:.3f}', '|',
          'Recall:', f'{rec:.3f}', '|',
         )

Neighbours: 1 | Accuracy: 0.525 | Kappa: 0.028 | F1-Score: 0.366 | Precision: 0.499 | Recall: 0.289 |
Neighbours: 2 | Accuracy: 0.417 | Kappa: 0.024 | F1-Score: 0.411 | Precision: 0.742 | Recall: 0.284 |
Neighbours: 3 | Accuracy: 0.541 | Kappa: 0.043 | F1-Score: 0.371 | Precision: 0.493 | Recall: 0.297 |
Neighbours: 4 | Accuracy: 0.463 | Kappa: 0.045 | F1-Score: 0.411 | Precision: 0.683 | Recall: 0.294 |
Neighbours: 5 | Accuracy: 0.554 | Kappa: 0.067 | F1-Score: 0.385 | Precision: 0.509 | Recall: 0.310 |
Neighbours: 6 | Accuracy: 0.483 | Kappa: 0.048 | F1-Score: 0.406 | Precision: 0.643 | Recall: 0.296 |
Neighbours: 7 | Accuracy: 0.553 | Kappa: 0.059 | F1-Score: 0.379 | Precision: 0.497 | Recall: 0.306 |
Neighbours: 8 | Accuracy: 0.499 | Kappa: 0.053 | F1-Score: 0.403 | Precision: 0.617 | Recall: 0.299 |
Neighbours: 9 | Accuracy: 0.558 | Kappa: 0.064 | F1-Score: 0.379 | Precision: 0.492 | Recall: 0.309 |


### Gradient Boosting

In [32]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix

clf = GradientBoostingClassifier(learning_rate=0.01, max_depth=4)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
accuracy_score(pred, y_test), cohen_kappa_score(pred, y_test)

(0.6119, 0.14570635226429862)

### Random Forest

In [51]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix

clf = RandomForestClassifier(n_estimators = 500, max_depth = 10)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
accuracy_score(pred, y_test), cohen_kappa_score(pred, y_test)

(0.6183, 0.14880396364329207)

### XGBoost

In [210]:
import xgboost as xgb

clf = xgb.XGBClassifier(objective="binary:logistic", n_estimators = 3000)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
accuracy_score(pred, y_test), cohen_kappa_score(pred, y_test)

(0.5455, 0.09102843263062732)

### Logistic Regresion

In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, cohen_kappa_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import PolynomialFeatures

clf = LogisticRegression(solver='lbfgs', max_iter=1000)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print('Accuracy:', accuracy_score(pred, y_test), ' | ',
      'Kappa:',round(cohen_kappa_score(pred, y_test),4), ' | ',
      'F1-Score:',round(f1_score(pred, y_test, pos_label='Bad'),4), ' | ',
      'Precision:',round(precision_score(pred, y_test, pos_label='Bad'),4), ' | ',
      'Recall:',round(recall_score(pred, y_test, pos_label='Bad'),4)
     )

Accuracy: 0.6276  |  Kappa: 0.1548  |  F1-Score: 0.4207  |  Precision: 0.4927  |  Recall: 0.367


In [63]:
mat = clf.predict_proba(X_test)

for threshold in np.arange(0.35, 0.55, 0.01):
    pred = np.array(['Bad' if i > threshold else 'Good' for i in mat[:,0]])
    acc = accuracy_score(pred, y_test)
    kap = cohen_kappa_score(pred, y_test)
    f1s = f1_score(pred, y_test, pos_label='Bad')
    pre = precision_score(pred, y_test, pos_label='Bad')
    rec = recall_score(pred, y_test, pos_label='Bad')
    mea = (acc + kap + f1s) / 3
    print('Accuracy:', f'{acc:.3f}', '|',
          'Kappa:', f'{kap:.3f}', '|',
          'F1-Score:', f'{f1s:.3f}', '|',
          'Precision:', f'{pre:.3f}', '|',
          'Recall:', f'{rec:.3f}', '|',
          'Mean:', f'{mea:.3f}', '|',
          'Threshold:', f'{threshold:.2f}'
         )

Accuracy: 0.334 | Kappa: 0.034 | F1-Score: 0.442 | Precision: 0.962 | Recall: 0.287 | Mean: 0.270 | Threshold: 0.35
Accuracy: 0.351 | Kappa: 0.044 | F1-Score: 0.446 | Precision: 0.951 | Recall: 0.291 | Mean: 0.280 | Threshold: 0.36
Accuracy: 0.372 | Kappa: 0.058 | F1-Score: 0.451 | Precision: 0.940 | Recall: 0.297 | Mean: 0.294 | Threshold: 0.37
Accuracy: 0.390 | Kappa: 0.065 | F1-Score: 0.452 | Precision: 0.916 | Recall: 0.300 | Mean: 0.302 | Threshold: 0.38
Accuracy: 0.412 | Kappa: 0.077 | F1-Score: 0.455 | Precision: 0.895 | Recall: 0.305 | Mean: 0.315 | Threshold: 0.39
Accuracy: 0.431 | Kappa: 0.085 | F1-Score: 0.455 | Precision: 0.867 | Recall: 0.309 | Mean: 0.324 | Threshold: 0.40
Accuracy: 0.455 | Kappa: 0.096 | F1-Score: 0.457 | Precision: 0.835 | Recall: 0.314 | Mean: 0.336 | Threshold: 0.41
Accuracy: 0.476 | Kappa: 0.106 | F1-Score: 0.457 | Precision: 0.804 | Recall: 0.320 | Mean: 0.347 | Threshold: 0.42
Accuracy: 0.498 | Kappa: 0.115 | F1-Score: 0.456 | Precision: 0.766 | Re

### Neural Network

In [177]:
import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping

predictors = X_train.to_numpy()
df_target = [1 if i=='Good' else 0 for i in y_train]
target = to_categorical(df_target)
pred_data = X_test.to_numpy()

early_stopping_monitor = EarlyStopping(patience=2)
n_cols = predictors.shape[1]
input_shape = (n_cols,)

In [220]:
model = Sequential()
model.add(Dense(100, activation='relu', input_shape=input_shape))
model.add(Dense(100, activation='relu'))
model.add(Dense(2, activation='softmax'))

In [228]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(predictors, target, epochs=20, validation_split=0.3, callbacks=[early_stopping_monitor], verbose=False)

<keras.callbacks.callbacks.History at 0x29d51cafe10>

In [229]:
predictions = list(model.predict(pred_data)[:,1])
pred = ['Good' if round(i) else 'Bad' for i in predictions]
print('Accuracy:',accuracy_score(pred, y_test), '|', 
      'Kappa:', round(cohen_kappa_score(pred,y_test),4), '|', 
      'F1:',round(f1_score(pred,y_test, pos_label='Bad'),4))

Accuracy: 0.5874 | Kappa: 0.0965 | F1: 0.3918


In [210]:
mat = model.predict(pred_data)

for threshold in np.arange(0.4, 0.5, 0.02):
    pred = np.array(['Bad' if i > threshold else 'Good' for i in mat[:,0]])
    acc = accuracy_score(pred, y_test)
    kap = cohen_kappa_score(pred, y_test)
    f1s = f1_score(pred, y_test, pos_label='Bad')
    pre = precision_score(pred, y_test, pos_label='Bad')
    rec = recall_score(pred, y_test, pos_label='Bad')
    mea = (acc + kap + f1s) / 3
    print('Accuracy:', f'{acc:.3f}', '|',
          'Kappa:', f'{kap:.3f}', '|',
          'F1-Score:', f'{f1s:.3f}', '|',
          'Precision:', f'{pre:.3f}', '|',
          'Recall:', f'{rec:.3f}', '|',
          'Mean:', f'{mea:.3f}', '|',
          'Threshold:', f'{threshold:.2f}'
         )

Accuracy: 0.403 | Kappa: 0.057 | F1-Score: 0.442 | Precision: 0.862 | Recall: 0.297 | Mean: 0.301 | Threshold: 0.40
Accuracy: 0.440 | Kappa: 0.068 | F1-Score: 0.439 | Precision: 0.798 | Recall: 0.303 | Mean: 0.315 | Threshold: 0.42
Accuracy: 0.487 | Kappa: 0.089 | F1-Score: 0.439 | Precision: 0.730 | Recall: 0.314 | Mean: 0.338 | Threshold: 0.44
Accuracy: 0.533 | Kappa: 0.104 | F1-Score: 0.432 | Precision: 0.647 | Recall: 0.324 | Mean: 0.356 | Threshold: 0.46
Accuracy: 0.580 | Kappa: 0.124 | F1-Score: 0.424 | Precision: 0.565 | Recall: 0.340 | Mean: 0.376 | Threshold: 0.48
