### Open File

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from matplotlib import pyplot as plt

In [2]:
# Load the feature table from the gzip-compressed CSV.
df = pd.read_csv('./data/df_features.gz')

The review date is parsed explicitly, because pandas loads it from the CSV as plain text rather than as a datetime.

In [3]:
# Parse 'Review_Date' (formatted YYYY-MM-DD) into datetimes with one vectorised call.
# Unlike a per-row strptime, this can be re-run safely once the column is already
# datetime, and missing values become NaT instead of raising.
df['Review_Date'] = pd.to_datetime(df['Review_Date'], format='%Y-%m-%d')

In [4]:
# Store the review month as text; it is used as a categorical predictor later on.
df['Review_Month'] = df['Review_Month'].map(str)

Rows without a distance to the city center are dropped (in the source data these hotels have no latitude and longitude).

In [5]:
# Keep only the rows that have a value for the distance to the city center.
df = df[df['Dist_Center'].notna()]

### Create Predicted Category for final models (2 categories)

In [6]:
# Binary target: 'Bad' when Diff is below -0.6, 'Good' otherwise
# (a missing Diff compares False and therefore also maps to 'Good').
category = np.where(df.Diff < -0.6, 'Bad', 'Good')
df.loc[:, 'Diff_Recode'] = category
# Share of each class in the data set.
df.Diff_Recode.value_counts() / len(df)

Good    0.726212
Bad     0.273788
Name: Diff_Recode, dtype: float64

### Balance Nationalities and / or Categories

In [7]:
def balance_df(Balance_Nationality, Balance_Category, data=None,
               max_per_nationality=20000, random_state=1):
    """Down-sample the reviews so nationalities and/or target classes are balanced.

    Parameters
    ----------
    Balance_Nationality : bool
        If true, keep at most ``max_per_nationality`` randomly chosen rows for
        each value of ``Nationality_Recode``.
    Balance_Category : bool
        If true, randomly down-sample every ``Diff_Recode`` class to the size
        of the smallest class. This runs after the nationality step.
    data : pandas.DataFrame, optional
        Frame to balance. Defaults to the notebook-level ``df``.
    max_per_nationality : int, default 20000
        Upper bound on the number of rows kept per nationality.
    random_state : int, default 1
        Seed passed to every ``DataFrame.sample`` call.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified. An empty input yields
        an empty result.
    """
    source = df if data is None else data

    balanced = source.copy()
    if Balance_Nationality:
        # Visit nationalities from most to least frequent and sample each one,
        # then concatenate once instead of growing a frame inside the loop.
        per_nationality = []
        for value in source.Nationality_Recode.value_counts().index:
            group = source[source.Nationality_Recode == value]
            keep = min(len(group), max_per_nationality)
            per_nationality.append(group.sample(keep, random_state=random_state))
        if per_nationality:
            balanced = pd.concat(per_nationality)
        else:
            balanced = source.iloc[0:0].copy()

    if Balance_Category:
        class_counts = balanced.Diff_Recode.value_counts()
        if class_counts.empty:
            # Nothing to balance (no rows or no labelled rows).
            return balanced
        smallest = class_counts.min()
        per_class = [
            balanced[balanced.Diff_Recode == label].sample(smallest, random_state=random_state)
            for label in class_counts.index
        ]
        balanced = pd.concat(per_class)

    return balanced

In [8]:
# Cap the number of rows per nationality, then equalise the sizes of the target classes.
df_balanced = balance_df(Balance_Nationality=True, Balance_Category=True)

### Prepare Data to run Models

In [18]:
# Restrict the modelling data to the rows whose City is 'Barcelona'.
df_model = df_balanced.loc[df_balanced['City'] == 'Barcelona']

In [19]:
# Predictor columns, grouped by how they are encoded later, and the target column.
x_categorical = [
    'Review_Month',
    'City',
    'Pet',
    'Purpose',
    'Whom',
    'Room_Recode',
    'Nationality_Recode',
    'Length_Recode',
]
x_numerical = [
    'Average_Score',
    'Total_Number_of_Reviews_Reviewer_Has_Given',
    'Close_Landmarks',
    'Dist_Center',
    'Dist_Train',
    'Dist_Airport',
]
x_col = [*x_categorical, *x_numerical]
y_col = 'Diff_Recode'

In [20]:
# Z-score each numerical predictor (population standard deviation, ddof=0).
# NOTE(review): the mean/std are computed over all of df_model, i.e. before the
# train/test split further down.
X_numerical = df_model.loc[:, x_numerical]
X_numerical_std = X_numerical.apply(lambda col: col.sub(col.mean()).div(col.std(ddof=0)))

In [21]:
# One-hot encode the categorical predictors, dropping the first level of each as baseline.
# Missing values are labelled *before* encoding: the dummy matrix itself never
# contains NaN, so filling it afterwards had no effect and rows with a missing
# category were indistinguishable from the dropped baseline level.
X_categorical = pd.get_dummies(
    df_model[x_categorical].fillna('Not Available'),
    prefix_sep='_',
    drop_first=True,
)

In [22]:
# Design matrix (standardised numerical columns followed by the dummies) and target vector.
X = pd.concat((X_numerical_std, X_categorical), axis=1, sort=False)
y = df_model.loc[:, y_col]

Split into Train and Test

In [23]:
from sklearn.model_selection import train_test_split

# 50/50 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=100,
)

In [24]:
# Shapes of the four parts of the split, in the order test X, test y, train X, train y.
tuple(part.shape for part in (X_test, y_test, X_train, y_train))

((5809, 44), (5809,), (5808, 44), (5808,))

## MODELS

### KNN

In [25]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, cohen_kappa_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import PolynomialFeatures

# k-nearest-neighbours (k=5): fit on the training half, then predict both halves.
clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(X_train, y_train)
train_knn, pred_knn = clf.predict(X_train), clf.predict(X_test)

# Print accuracy | Cohen's kappa for the test half first, then the training half.
for _split, _truth, _guess in (('Test :', y_test, pred_knn), ('Train:', y_train, train_knn)):
    print(_split, f'{accuracy_score(_guess, _truth):.4f}', '|', f'{cohen_kappa_score(_guess, _truth):.4f}')

Test : 0.5323 | 0.0601
Train: 0.7061 | 0.4075


### Gradient Boosting

In [27]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix

# Gradient-boosted trees: learning rate 0.01, depth-4 trees, fixed seed.
clf = GradientBoostingClassifier(
    learning_rate=0.01,
    max_depth=4,
    random_state=1,
)
clf.fit(X_train, y_train)
train_gbt, pred_gbt = clf.predict(X_train), clf.predict(X_test)

# Print accuracy | Cohen's kappa for the test half first, then the training half.
for _split, _truth, _guess in (('Test :', y_test, pred_gbt), ('Train:', y_train, train_gbt)):
    print(_split, f'{accuracy_score(_guess, _truth):.4f}', '|', f'{cohen_kappa_score(_guess, _truth):.4f}')

Test : 0.5710 | 0.1337
Train: 0.6090 | 0.2057


### Random Forest

In [28]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: 200 trees of depth at most 6, fixed seed.
clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=6,
    random_state=1,
)
clf.fit(X_train, y_train)
train_rf, pred_rf = clf.predict(X_train), clf.predict(X_test)

# Print accuracy | Cohen's kappa for the test half first, then the training half.
for _split, _truth, _guess in (('Test :', y_test, pred_rf), ('Train:', y_train, train_rf)):
    print(_split, f'{accuracy_score(_guess, _truth):.4f}', '|', f'{cohen_kappa_score(_guess, _truth):.4f}')

Test : 0.5712 | 0.1291
Train: 0.6553 | 0.2960


### XGBoosting

In [29]:
import xgboost as xgb

# Recent XGBoost releases only accept integer class labels 0..n-1, so the string
# labels are encoded explicitly and the predictions decoded back to labels.
# np.unique returns the classes sorted, so 'Bad' -> 0 and 'Good' -> 1.
xgb_classes, y_train_codes = np.unique(y_train, return_inverse=True)

clf = xgb.XGBClassifier(objective="binary:logistic", n_estimators = 5, max_depth=5, random_state=1)
clf.fit(X_train, y_train_codes)
# Map the predicted class indices back to the original labels so that downstream
# cells keep receiving 'Good'/'Bad' strings.
train_xgb = xgb_classes[np.asarray(clf.predict(X_train), dtype=int)]
pred_xgb = xgb_classes[np.asarray(clf.predict(X_test), dtype=int)]
print('Test :',  f'{accuracy_score(pred_xgb, y_test):.4f}', '|', f'{cohen_kappa_score(pred_xgb, y_test):.4f}')
print('Train:',  f'{accuracy_score(train_xgb, y_train):.4f}', '|', f'{cohen_kappa_score(train_xgb, y_train):.4f}')

Test : 0.5715 | 0.1387
Train: 0.6269 | 0.2471


### Logistic Regression

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, cohen_kappa_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import PolynomialFeatures

# Logistic regression with the L-BFGS solver and up to 1000 iterations.
clf = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    random_state=1,
)
clf.fit(X_train, y_train)
train_log, pred_log = clf.predict(X_train), clf.predict(X_test)

# Print accuracy | Cohen's kappa for the test half first, then the training half.
for _split, _truth, _guess in (('Test :', y_test, pred_log), ('Train:', y_train, train_log)):
    print(_split, f'{accuracy_score(_guess, _truth):.4f}', '|', f'{cohen_kappa_score(_guess, _truth):.4f}')

Test : 0.5824 | 0.1572
Train: 0.5921 | 0.1734


### Decision Trees

In [31]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree limited to depth 5, fixed seed.
clf = DecisionTreeClassifier(max_depth=5, random_state=1)
clf.fit(X_train, y_train)
train_tree, pred_tree = clf.predict(X_train), clf.predict(X_test)

# Print accuracy | Cohen's kappa for the test half first, then the training half.
for _split, _truth, _guess in (('Test :', y_test, pred_tree), ('Train:', y_train, train_tree)):
    print(_split, f'{accuracy_score(_guess, _truth):.4f}', '|', f'{cohen_kappa_score(_guess, _truth):.4f}')

Test : 0.5445 | 0.1020
Train: 0.5775 | 0.1710


### SVM

In [32]:
from sklearn.svm import SVC

# Support vector classifier with regularisation strength C=0.5, fixed seed.
clf = SVC(C=0.5, random_state=1)
clf.fit(X_train, y_train)
train_svm, pred_svm = clf.predict(X_train), clf.predict(X_test)

# Print accuracy | Cohen's kappa for the test half first, then the training half.
for _split, _truth, _guess in (('Test :', y_test, pred_svm), ('Train:', y_train, train_svm)):
    print(_split, f'{accuracy_score(_guess, _truth):.4f}', '|', f'{cohen_kappa_score(_guess, _truth):.4f}')

Test : 0.5850 | 0.1615
Train: 0.6486 | 0.2875


### Naive Bayes

In [33]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB

# Bernoulli naive Bayes with its default settings.
clf = BernoulliNB()
clf.fit(X_train, y_train)
train_nb, pred_nb = clf.predict(X_train), clf.predict(X_test)

# Print accuracy | Cohen's kappa for the test half first, then the training half.
for _split, _truth, _guess in (('Test :', y_test, pred_nb), ('Train:', y_train, train_nb)):
    print(_split, f'{accuracy_score(_guess, _truth):.4f}', '|', f'{cohen_kappa_score(_guess, _truth):.4f}')

Test : 0.5636 | 0.1201
Train: 0.5801 | 0.1515


### Neural Network

In [34]:
import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping

# Network inputs as plain arrays; the target is one-hot encoded with 'Good' as class 1.
predictors = X_train.to_numpy()
pred_data = X_test.to_numpy()
df_target = [int(label == 'Good') for label in y_train]
target = to_categorical(df_target)

# One input unit per predictor column.
n_cols = predictors.shape[1]
input_shape = (n_cols,)

# Early stopping with a patience of two epochs.
early_stopping_monitor = EarlyStopping(patience=2)

Using TensorFlow backend.


In [35]:
# Feed-forward classifier: 100 -> 500 -> 500 ReLU units, followed by a 2-way softmax.
model = Sequential([
    Dense(100, activation='relu', input_shape=input_shape),
    Dense(500, activation='relu'),
    Dense(500, activation='relu'),
    Dense(2, activation='softmax'),
])

In [36]:
# Adam optimiser with categorical cross-entropy; 30% of the training rows are
# held out as validation data and the early-stopping callback is attached.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.fit(
    predictors,
    target,
    epochs=20,
    validation_split=0.3,
    callbacks=[early_stopping_monitor],
    verbose=False,
)

<keras.callbacks.callbacks.History at 0x277e1044488>

In [37]:
# Column 1 of the softmax output is the predicted probability of the 'Good' class.
predictions_train = list(model.predict(predictors)[:,1])
predictions_test = list(model.predict(pred_data)[:,1])

# A probability that rounds to a non-zero value is labelled 'Good', otherwise 'Bad'.
_as_label = lambda prob: 'Good' if round(prob) else 'Bad'
pred_net = list(map(_as_label, predictions_test))
train_net = list(map(_as_label, predictions_train))

# Print accuracy | Cohen's kappa for the test half first, then the training half.
for _split, _truth, _guess in (('Test :', y_test, pred_net), ('Train:', y_train, train_net)):
    print(_split, f'{accuracy_score(_guess, _truth):.4f}', '|', f'{cohen_kappa_score(_guess, _truth):.4f}')

Test : 0.5622 | 0.1246
Train: 0.6016 | 0.2038


### Summary of Predictions

In [38]:
# Side-by-side table of the true test labels and each model's test predictions.
_columns = ['TEST','logistic','knn','xgb','gbt','nnet','rf','svm', 'tree']
_values = (y_test, pred_log, pred_knn, pred_xgb, pred_gbt, pred_net, pred_rf, pred_svm, pred_tree)
df_models_test = pd.DataFrame({name: list(values) for name, values in zip(_columns, _values)})
df_models_test.head()

Unnamed: 0,TEST,logistic,knn,xgb,gbt,nnet,rf,svm,tree
0,Good,Bad,Good,Bad,Bad,Good,Bad,Good,Bad
1,Good,Bad,Good,Good,Good,Bad,Good,Good,Bad
2,Good,Good,Bad,Good,Good,Good,Good,Good,Bad
3,Bad,Bad,Good,Good,Good,Good,Good,Good,Good
4,Good,Good,Good,Good,Good,Good,Good,Good,Bad


## STACKING

Predictions using a Random Forest trained on the original features plus the predictions of the other models.
I had to split the test set in two to create the new train/test sets; otherwise I would overfit, because the base models' predictions on the original training set are optimistically biased compared with their predictions on the test set.

First I append the predictions of the model to the dataset

In [39]:
# Work on copies so the original test split stays untouched.
X_2, y_2 = X_test.copy(), y_test.copy()

In [40]:
# Add the logistic-regression test predictions as a new feature column.
X_2 = X_2.assign(logistic=pred_log)

In [41]:
# Add the KNN test predictions as a new feature column.
X_2 = X_2.assign(knn=pred_knn)

In [42]:
# Add the SVM test predictions as a new feature column.
X_2 = X_2.assign(svm=pred_svm)

In [43]:
# Add the neural-network test predictions as a new feature column.
X_2 = X_2.assign(nnet=pred_net)

In [44]:
# Add the decision-tree test predictions as a new feature column.
X_2 = X_2.assign(tree=pred_tree)

In [45]:
# Add the naive-Bayes test predictions as a new feature column.
X_2 = X_2.assign(nb=pred_nb)

In [46]:
# Preview the first two rows of the six columns that were just appended.
X_2.iloc[:2, -6:]

Unnamed: 0,logistic,knn,svm,nnet,tree,nb
352460,Bad,Good,Good,Good,Bad,Bad
319057,Bad,Good,Good,Bad,Bad,Good


In [47]:
# Recode the six stacked prediction columns from 'Good'/'Bad' labels to 1/0.
# The columns are addressed by name instead of by position, the result is stored
# as integers, and values that are already 1 stay 1, so re-running this cell
# no longer resets every stacked feature to zero.
meta_columns = ['logistic', 'knn', 'svm', 'nnet', 'tree', 'nb']
X_2[meta_columns] = X_2[meta_columns].isin(['Good', 1]).astype(int)

In [48]:
# Second-level split: 80% of the former test set trains the stacked model, 20% evaluates it.
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    X_2,
    y_2,
    test_size=0.2,
    random_state=100,
)

### Stacked Random Forest

In [55]:
from sklearn.ensemble import RandomForestClassifier

# Stacked random forest (50 trees, depth at most 5) on the features plus base-model predictions.
clf = RandomForestClassifier(
    n_estimators=50,
    max_depth=5,
    random_state=1,
)
clf.fit(X_train_2, y_train_2)
train_rf_2, pred_rf_2 = clf.predict(X_train_2), clf.predict(X_test_2)

# Print accuracy | Cohen's kappa for the second-level test set first, then its training set.
for _split, _truth, _guess in (('Test :', y_test_2, pred_rf_2), ('Train:', y_train_2, train_rf_2)):
    print(_split, f'{accuracy_score(_guess, _truth):.4f}', '|', f'{cohen_kappa_score(_guess, _truth):.4f}')

Test : 0.5972 | 0.1926
Train: 0.6193 | 0.2335
