### Open File

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from matplotlib import pyplot as plt

In [2]:
# Load the review feature table from the .gz CSV file in ./data.
df = pd.read_csv('./data/df_features.gz')

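Parse `Review_Date` into real datetimes, because pandas reads the column from the CSV as plain strings. `Review_Month` is cast to a string so that it can be used as a categorical feature.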

In [3]:
# Vectorised parse of the 'YYYY-MM-DD' review dates (they arrive as strings).
# Unlike a row-wise strptime, this is also safe to re-run on already-parsed values.
df['Review_Date'] = pd.to_datetime(df['Review_Date'], format='%Y-%m-%d')
# The month is used as a categorical predictor later on, so store it as text.
df['Review_Month'] = df['Review_Month'].astype(str)

Drop the rows that have no distance to the city centre (in the source data these hotels have no latitude/longitude, so the distance could not be computed).

In [4]:
# Keep only the reviews whose hotel has a known distance to the city centre.
df = df[df['Dist_Center'].notna()]

### Create Predicted Category for final models (2 categories)

In [5]:
# Per-hotel summary statistics (count, mean, std, min, quartiles, max) of Diff,
# with Hotel_Address turned back into a regular column so it can be used as a join key.
diff_hotels = df.groupby('Hotel_Address')['Diff'].describe().reset_index()

In [6]:
# Attach the per-hotel Diff statistics to every review row (join key: Hotel_Address).
# NOTE(review): the result is written back to `df`, so running this cell twice merges the
# statistics again and the duplicated columns get suffixed — run it once per fresh kernel.
# NOTE(review): these statistics are computed from Diff over ALL rows, and the Category
# target is also derived from Diff; using them as features leaks target information
# from rows that later land in the test split — confirm this is acceptable.
df = pd.merge(df, diff_hotels, on='Hotel_Address')

In [7]:
# Binary target: a review is 'Bad' when its Diff is below -2, otherwise 'Good'.
category = np.where(df.Diff < -2, 'Bad', 'Good')
df['Category'] = category
# Share of each class over the whole data set.
df.Category.value_counts() / len(df)

Good    0.890544
Bad     0.109456
Name: Category, dtype: float64

In [8]:
# Inspect the column names now available in the working frame.
df.columns

Index(['Hotel_Address', 'Additional_Number_of_Scoring', 'Review_Date',
       'Average_Score', 'Hotel_Name', 'Reviewer_Nationality',
       'Review_Total_Negative_Word_Counts', 'Total_Number_of_Reviews',
       'Review_Total_Positive_Word_Counts',
       'Total_Number_of_Reviews_Reviewer_Has_Given', 'Reviewer_Score',
       'days_since_review', 'lat', 'lng', 'Diff', 'Diff_Percentage',
       'Review_Month', 'Review_Year', 'Country', 'City', 'Pet', 'Purpose',
       'Whom', 'Room', 'Length', 'Device', 'Room_Recode', 'Nationality_Recode',
       'Length_Recode', 'Close_Landmarks', 'Dist_Center', 'Dist_Airport',
       'Dist_Train', 'Price', 'Stars', 'Length_N', 'Reservation_ADR',
       'food_Neg', 'staff_Neg', 'location_Neg', 'value_Neg', 'comfort_Neg',
       'room_Neg', 'facilities_Neg', 'cleanliness_Neg', 'food_Pos',
       'staff_Pos', 'location_Pos', 'value_Pos', 'comfort_Pos', 'room_Pos',
       'facilities_Pos', 'cleanliness_Pos', 'food_Neg_Hotel',
       'staff_Neg_Hotel', 'loca

### Balance Nationalities and / or Categories

In [9]:
def balance_df(Balance_Nationality, Balance_Category, max_per_nationality=20000,
               random_state=1, data=None):
    """Return a down-sampled copy of the review frame.

    Parameters
    ----------
    Balance_Nationality : bool
        When true, keep at most ``max_per_nationality`` randomly chosen rows for
        every value of ``Nationality_Recode`` (rows whose nationality is missing
        are dropped, because ``value_counts`` ignores them).
    Balance_Category : bool
        When true, down-sample every ``Category`` class to the size of the
        smallest class. This step runs after the nationality step.
    max_per_nationality : int, default 20000
        Row cap applied per nationality.
    random_state : int, default 1
        Seed passed to every ``DataFrame.sample`` call.
    data : pandas.DataFrame, optional
        Frame to balance. Defaults to the notebook-level ``df`` so existing
        calls keep working.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is never modified. With both flags false it is
        a plain copy of the input.
    """
    source = df if data is None else data
    balanced = source.copy()

    if Balance_Nationality:
        # Collect one sample per nationality and concatenate once, instead of
        # growing a frame inside the loop.
        per_nationality = []
        for nationality in source.Nationality_Recode.value_counts().index:
            rows = source[source.Nationality_Recode == nationality]
            n_rows = min(len(rows), max_per_nationality)
            per_nationality.append(rows.sample(n_rows, random_state=random_state))
        # No nationality at all (empty input): return an empty frame with the same columns.
        balanced = pd.concat(per_nationality) if per_nationality else source.iloc[0:0].copy()

    if Balance_Category:
        class_sizes = balanced.Category.value_counts()
        if not class_sizes.empty:
            smallest = class_sizes.min()
            balanced = pd.concat([
                balanced[balanced.Category == label].sample(smallest, random_state=random_state)
                for label in class_sizes.index
            ])

    return balanced

In [10]:
# Balance first by reviewer nationality, then by Good/Bad class.
df_balanced = balance_df(Balance_Nationality=True, Balance_Category=True)

### Prepare Data to run Models

In [11]:
# Model on a reproducible random subset of at most 20,000 balanced rows.
# Capping n at the frame length avoids a ValueError when fewer rows survive balancing.
df_model = df_balanced.sample(n=min(20000, len(df_balanced)), random_state=1)

In [12]:
# Categorical predictors (one-hot encoded further down).
x_categorical = [
    'Review_Month', 'City', 'Pet', 'Purpose', 'Whom',
    'Room_Recode', 'Nationality_Recode', 'Length_Recode',
]

# Numerical predictors, assembled group by group; the order is kept exactly as before
# because it determines the column order of the model matrix.
_topics = ['food', 'staff', 'location', 'value', 'comfort', 'room', 'facilities', 'cleanliness']
x_numerical = (
    # hotel / reviewer level measurements
    ['Average_Score', 'Total_Number_of_Reviews_Reviewer_Has_Given', 'Close_Landmarks',
     'Dist_Center', 'Dist_Train', 'Dist_Airport']
    # per-hotel topic columns: all negative ones first, then all positive ones
    + [f'{topic}_{polarity}_Hotel' for polarity in ('Neg', 'Pos') for topic in _topics]
    # per-hotel summary statistics of Diff
    + ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
)

x_col = x_categorical + x_numerical
y_col = 'Category'

In [13]:
X_numerical = df_model[x_numerical]
# z-score every column: subtract its mean and divide by its population std (ddof=0).
# NOTE(review): the statistics come from the whole sample, i.e. before the train/test
# split, so the test half influences the scaling.
X_numerical_std = (X_numerical - X_numerical.mean()) / X_numerical.std(ddof=0)

In [14]:
# One-hot encode the categorical predictors, dropping the first level of each variable.
# get_dummies never produces NaN, so the fillna('Not Available') that used to follow it
# had no effect and has been removed; the encoded matrix is unchanged.
# NOTE(review): a missing category is therefore encoded as all zeros for that variable.
# If an explicit 'Not Available' level is wanted, fill df_model[x_categorical] BEFORE
# encoding — confirm the intent.
X_categorical = pd.get_dummies(df_model[x_categorical], prefix_sep='_', drop_first=True)

In [15]:
# Design matrix: standardised numerical columns followed by the dummy columns.
X = pd.concat([X_numerical_std, X_categorical], axis=1, sort=False)
# Target vector: the 'Good' / 'Bad' label.
y = df_model[y_col]

Split into Train and Test

In [16]:
from sklearn.model_selection import train_test_split
# 50/50 split with a fixed seed.
# NOTE(review): no stratify=y is passed, so the class balance of each half is only approximate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=100)

In [17]:
# Sanity check on the split: rows and columns of each half.
X_test.shape, y_test.shape, X_train.shape, y_train.shape

((10000, 74), (10000,), (10000, 74), (10000,))

## MODELS

### KNN

In [18]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, cohen_kappa_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import PolynomialFeatures

# k-nearest-neighbours baseline with k = 5.
clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(X_train, y_train)
train_knn = clf.predict(X_train)
pred_knn = clf.predict(X_test)

# Print "accuracy | Cohen's kappa" for the test half first, then for the training half.
for split_name, predicted, actual in (('Test :', pred_knn, y_test), ('Train:', train_knn, y_train)):
    print(split_name, f'{accuracy_score(predicted, actual):.4f}', '|', f'{cohen_kappa_score(predicted, actual):.4f}')

Test : 0.5583 | 0.1165
Train: 0.7059 | 0.4117


### Gradient Boosting

In [19]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix

# Gradient-boosted trees: small learning rate, depth-4 trees, fixed seed.
clf = GradientBoostingClassifier(learning_rate=0.01, max_depth=4, random_state=1)
clf.fit(X_train, y_train)
train_gbt = clf.predict(X_train)
pred_gbt = clf.predict(X_test)

# Print "accuracy | Cohen's kappa" for the test half first, then for the training half.
for split_name, predicted, actual in (('Test :', pred_gbt, y_test), ('Train:', train_gbt, y_train)):
    print(split_name, f'{accuracy_score(predicted, actual):.4f}', '|', f'{cohen_kappa_score(predicted, actual):.4f}')

Test : 0.6115 | 0.2237
Train: 0.6266 | 0.2538


### Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: 50 trees of depth at most 5, fixed seed.
clf = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=1)
clf.fit(X_train, y_train)
train_rf = clf.predict(X_train)
pred_rf = clf.predict(X_test)

# Print "accuracy | Cohen's kappa" for the test half first, then for the training half.
for split_name, predicted, actual in (('Test :', pred_rf, y_test), ('Train:', train_rf, y_train)):
    print(split_name, f'{accuracy_score(predicted, actual):.4f}', '|', f'{cohen_kappa_score(predicted, actual):.4f}')

Test : 0.6017 | 0.2039
Train: 0.6230 | 0.2464


### XGBoost

In [21]:
import xgboost as xgb

# XGBoost with a logistic objective: 5 boosting rounds, depth-5 trees, fixed seed.
# NOTE(review): y_train holds the strings 'Good' / 'Bad'; some xgboost releases expect
# integer-encoded labels — verify against the installed version.
clf = xgb.XGBClassifier(objective="binary:logistic", n_estimators=5, max_depth=5, random_state=1)
clf.fit(X_train, y_train)
train_xgb = clf.predict(X_train)
pred_xgb = clf.predict(X_test)

# Print "accuracy | Cohen's kappa" for the test half first, then for the training half.
for split_name, predicted, actual in (('Test :', pred_xgb, y_test), ('Train:', train_xgb, y_train)):
    print(split_name, f'{accuracy_score(predicted, actual):.4f}', '|', f'{cohen_kappa_score(predicted, actual):.4f}')

Test : 0.6050 | 0.2103
Train: 0.6502 | 0.3005


### Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, cohen_kappa_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import PolynomialFeatures

# Logistic regression; max_iter is raised to 1000 for the lbfgs solver.
clf = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=1)
clf.fit(X_train, y_train)
train_log = clf.predict(X_train)
pred_log = clf.predict(X_test)

# Print "accuracy | Cohen's kappa" for the test half first, then for the training half.
for split_name, predicted, actual in (('Test :', pred_log, y_test), ('Train:', train_log, y_train)):
    print(split_name, f'{accuracy_score(predicted, actual):.4f}', '|', f'{cohen_kappa_score(predicted, actual):.4f}')

Test : 0.6159 | 0.2319
Train: 0.6265 | 0.2528


### Decision Trees

In [23]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree limited to depth 5, fixed seed.
clf = DecisionTreeClassifier(max_depth=5, random_state=1)
clf.fit(X_train, y_train)
train_tree = clf.predict(X_train)
pred_tree = clf.predict(X_test)

# Print "accuracy | Cohen's kappa" for the test half first, then for the training half.
for split_name, predicted, actual in (('Test :', pred_tree, y_test), ('Train:', train_tree, y_train)):
    print(split_name, f'{accuracy_score(predicted, actual):.4f}', '|', f'{cohen_kappa_score(predicted, actual):.4f}')

Test : 0.6030 | 0.2062
Train: 0.6226 | 0.2453


### SVM

In [24]:
from sklearn.svm import SVC

# Support-vector classifier with regularisation strength C = 0.5.
clf = SVC(C=0.5, random_state=1)
clf.fit(X_train, y_train)
train_svm = clf.predict(X_train)
pred_svm = clf.predict(X_test)

# Print "accuracy | Cohen's kappa" for the test half first, then for the training half.
for split_name, predicted, actual in (('Test :', pred_svm, y_test), ('Train:', train_svm, y_train)):
    print(split_name, f'{accuracy_score(predicted, actual):.4f}', '|', f'{cohen_kappa_score(predicted, actual):.4f}')

Test : 0.6099 | 0.2202
Train: 0.6644 | 0.3290


### Naive Bayes

In [25]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB

# Bernoulli naive Bayes with default settings.
# NOTE(review): BernoulliNB binarises its inputs, so the standardised numerical columns
# are presumably reduced to above/below-threshold indicators — confirm this is intended.
clf = BernoulliNB()
clf.fit(X_train, y_train)
train_nb = clf.predict(X_train)
pred_nb = clf.predict(X_test)

# Print "accuracy | Cohen's kappa" for the test half first, then for the training half.
for split_name, predicted, actual in (('Test :', pred_nb, y_test), ('Train:', train_nb, y_train)):
    print(split_name, f'{accuracy_score(predicted, actual):.4f}', '|', f'{cohen_kappa_score(predicted, actual):.4f}')

Test : 0.5964 | 0.1929
Train: 0.5920 | 0.1839


### Neural Network

In [26]:
import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping

# Feature matrices as plain NumPy arrays.
predictors = X_train.to_numpy()
pred_data = X_test.to_numpy()

# Encode the label as 1 = 'Good', 0 = anything else, then one-hot it.
df_target = [int(label == 'Good') for label in y_train]
target = to_categorical(df_target)

# Early stopping with a patience of 2 epochs.
early_stopping_monitor = EarlyStopping(patience=2)

# The network input width equals the number of feature columns.
n_cols = predictors.shape[1]
input_shape = (n_cols,)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [27]:
# Fully connected network: 100 -> 500 -> 500 ReLU units, then a 2-way softmax.
model = Sequential([
    Dense(100, activation='relu', input_shape=input_shape),
    Dense(500, activation='relu'),
    Dense(500, activation='relu'),
    Dense(2, activation='softmax'),
])

Instructions for updating:
Colocations handled automatically by placer.


In [28]:
# The target is one-hot with two columns, hence categorical cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# Hold out 30% of the training rows for validation; the early-stopping callback
# (patience=2) may end training before the 20 epochs are used up.
# NOTE(review): no random seed is set for keras in this notebook, so results may vary between runs.
model.fit(predictors, target, epochs=20, validation_split=0.3, callbacks=[early_stopping_monitor], verbose=False)

Instructions for updating:
Use tf.cast instead.


<keras.callbacks.callbacks.History at 0x27830a57ec8>

In [29]:
# Column 1 of the softmax output is the score for the class encoded as 1 ('Good').
predictions_test = list(model.predict(pred_data)[:, 1])
predictions_train = list(model.predict(predictors)[:, 1])

# Round each score to 0 or 1 and map it back to the string labels.
pred_net = ['Good' if round(score) else 'Bad' for score in predictions_test]
train_net = ['Good' if round(score) else 'Bad' for score in predictions_train]

# Print "accuracy | Cohen's kappa" for the test half first, then for the training half.
for split_name, predicted, actual in (('Test :', pred_net, y_test), ('Train:', train_net, y_train)):
    print(split_name, f'{accuracy_score(predicted, actual):.4f}', '|', f'{cohen_kappa_score(predicted, actual):.4f}')

Test : 0.6040 | 0.2081
Train: 0.6512 | 0.3023


### Summary of Predictions

In [30]:
# True test labels next to each model's test predictions, one row per test observation.
# NOTE(review): the naive Bayes predictions (pred_nb) are not included in this table.
model_columns = {
    'TEST': y_test,
    'logistic': pred_log,
    'knn': pred_knn,
    'xgb': pred_xgb,
    'gbt': pred_gbt,
    'nnet': pred_net,
    'rf': pred_rf,
    'svm': pred_svm,
    'tree': pred_tree,
}
# Build the rows positionally so the result gets a fresh 0..n-1 index.
df_models_test = pd.DataFrame(list(zip(*model_columns.values())), columns=list(model_columns))
df_models_test.head()

Unnamed: 0,TEST,logistic,knn,xgb,gbt,nnet,rf,svm,tree
0,Good,Good,Bad,Bad,Good,Good,Bad,Good,Good
1,Good,Good,Good,Good,Good,Good,Good,Good,Bad
2,Good,Bad,Bad,Bad,Bad,Good,Bad,Bad,Good
3,Bad,Good,Good,Bad,Bad,Good,Bad,Bad,Bad
4,Bad,Bad,Good,Bad,Bad,Bad,Good,Bad,Bad


## STACKING

Predictions using a Random Forest, adding the predictions of the other models to the original dataset as extra features.
I had to split the test set in two to create the new train/test set; otherwise I would overfit, because the base models' predictions on the original training set are biased compared with their predictions on the test set.

First I append the predictions of the model to the dataset

In [31]:
# Work on copies so the original test split stays untouched when prediction columns are added.
X_2 = X_test.copy()
y_2 = y_test.copy()

In [32]:
# Stacking feature: logistic-regression predictions on the test rows.
X_2['logistic'] = pred_log

In [33]:
# Stacking feature: k-nearest-neighbours predictions on the test rows.
X_2['knn'] = pred_knn

In [34]:
# Stacking feature: SVM predictions on the test rows.
X_2['svm'] = pred_svm

In [35]:
# Stacking feature: neural-network predictions on the test rows.
X_2['nnet'] = pred_net

In [36]:
# Stacking feature: decision-tree predictions on the test rows.
X_2['tree'] = pred_tree

In [37]:
# Stacking feature: naive-Bayes predictions on the test rows.
# NOTE(review): the rf, xgb and gbt predictions are not added as stacking features.
X_2['nb'] = pred_nb

In [38]:
# Peek at the six prediction columns that were just appended (they are the last six columns).
X_2.iloc[:,-6:].head(2)

Unnamed: 0,logistic,knn,svm,nnet,tree,nb
483844,Good,Bad,Good,Good,Good,Good
49888,Good,Good,Good,Good,Bad,Good


In [39]:
# Encode the appended model predictions as 1 = 'Good', 0 = 'Bad'.
# The columns are addressed by name rather than by position (-6:), so adding or
# re-ordering columns cannot silently change which ones get encoded.
# Accepting 1 as well as 'Good' makes the cell idempotent: re-running it on values
# that are already encoded leaves them unchanged instead of turning them all into 0.
stacked_cols = ['logistic', 'knn', 'svm', 'nnet', 'tree', 'nb']
X_2[stacked_cols] = X_2[stacked_cols].isin(['Good', 1]).astype(int)

In [40]:
# Second-level split: 80% of the original test rows train the stacked model, 20% evaluate it.
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_2, y_2, test_size=0.2, random_state=100)

### Stacked Random Forest

In [44]:
from sklearn.ensemble import RandomForestClassifier

# Stacked model: random forest (110 trees, depth at most 5) on the original features
# plus the encoded base-model predictions.
clf = RandomForestClassifier(n_estimators=110, max_depth=5, random_state=1)
clf.fit(X_train_2, y_train_2)
train_rf_2 = clf.predict(X_train_2)
pred_rf_2 = clf.predict(X_test_2)

# Print "accuracy | Cohen's kappa" for the stacked test split first, then for the stacked training split.
for split_name, predicted, actual in (('Test :', pred_rf_2, y_test_2), ('Train:', train_rf_2, y_train_2)):
    print(split_name, f'{accuracy_score(predicted, actual):.4f}', '|', f'{cohen_kappa_score(predicted, actual):.4f}')

Test : 0.6180 | 0.2371
Train: 0.6355 | 0.2710


In [42]:
# Confusion table for the stacked model on its TRAINING split:
# rows are the predicted labels, columns the true Category.
pd.crosstab(train_rf_2, y_train_2)

Category,Bad,Good
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Bad,2662,1576
Good,1335,2427
