### Open File

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from matplotlib import pyplot as plt

In [2]:
# Load the engineered review features (gzip-compressed CSV).
features_path = './data/df_features.gz'
df = pd.read_csv(features_path)

In [4]:
# List the available columns before any preprocessing.
df.columns

Index(['Hotel_Address', 'Additional_Number_of_Scoring', 'Review_Date',
       'Average_Score', 'Hotel_Name', 'Reviewer_Nationality',
       'Negative_Review', 'Review_Total_Negative_Word_Counts',
       'Total_Number_of_Reviews', 'Positive_Review',
       'Review_Total_Positive_Word_Counts',
       'Total_Number_of_Reviews_Reviewer_Has_Given', 'Reviewer_Score',
       'days_since_review', 'lat', 'lng', 'Diff', 'Review_Month',
       'Review_Year', 'Country', 'City', 'Pet', 'Purpose', 'Whom', 'Room',
       'Length', 'Device', 'Room_Recode', 'Nationality_Recode',
       'Length_Recode', 'Close_Landmarks', 'Dist_Center', 'Dist_Airport',
       'Dist_Train', 'Price', 'Stars'],
      dtype='object')

Parse `Review_Date` into real datetimes, because `read_csv` loads the column as plain strings.

In [3]:
# Vectorised parse of the ISO dates. Unlike the row-wise strptime call, this
# can be re-run on an already converted column without raising, and missing
# values become NaT instead of raising a TypeError.
df['Review_Date'] = pd.to_datetime(df['Review_Date'], format='%Y-%m-%d')

In [4]:
# Treat the month as a categorical label rather than a number.
df['Review_Month'] = df['Review_Month'].map(str)

Drop the rows that have no distance to the city center (these hotels have no latitude/longitude in the source data).

In [5]:
# Keep only the reviews whose hotel has a distance to the city center.
# Rows with a missing Price are not dropped here (that filter is disabled).
df = df[df['Dist_Center'].notna()]

### Create Predicted Category for final models (2 categories)

In [6]:
# Ratio of the reviewer's score to the hotel's average score.
df['Diff_Percentage'] = df['Reviewer_Score'].div(df['Average_Score'])

In [7]:
# A review is 'Bad' when its score is below 70% of the hotel's average,
# otherwise 'Good' (this includes rows where the ratio is missing).
category = np.where(df['Diff_Percentage'] < 0.70, 'Bad', 'Good')
df['Category'] = category
# Share of each class in the full data set.
df['Category'].value_counts() / len(df)

Good    0.917609
Bad     0.082391
Name: Category, dtype: float64

### Balance Nationalities and / or Categories

In [8]:
def balance_df(Balance_Nationality, Balance_Category, data=None,
               max_per_nationality=20000, random_state=1):
    """Return a copy of the review data balanced by nationality and/or category.

    Parameters
    ----------
    Balance_Nationality : bool
        When true, keep at most ``max_per_nationality`` randomly sampled rows
        for every value of ``Nationality_Recode``. Rows whose nationality is
        missing are left out, because ``value_counts`` skips NaN.
    Balance_Category : bool
        When true, down-sample every ``Category`` value to the size of the
        rarest one. This runs after the nationality step.
    data : pandas.DataFrame, optional
        Frame to balance. Defaults to the notebook-level ``df``.
    max_per_nationality : int, default 20000
        Upper bound on the rows kept per nationality.
    random_state : int, default 1
        Seed passed to every ``DataFrame.sample`` call.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is never modified. Rows keep their original
        index labels.
    """
    if data is None:
        data = df  # fall back to the frame prepared earlier in the notebook

    balanced = data.copy()

    if Balance_Nationality:
        # Collect the per-nationality samples and concatenate once, instead of
        # growing a frame inside the loop.
        samples = []
        for nationality in data.Nationality_Recode.value_counts().index:
            group = data[data.Nationality_Recode == nationality]
            keep = min(len(group), max_per_nationality)
            samples.append(group.sample(keep, random_state=random_state))
        # pd.concat rejects an empty list, so return an empty frame explicitly.
        balanced = pd.concat(samples) if samples else data.iloc[0:0].copy()

    if Balance_Category:
        class_sizes = balanced.Category.value_counts()
        if len(class_sizes):
            smallest = int(class_sizes.min())
            balanced = pd.concat([
                balanced[balanced.Category == label].sample(smallest, random_state=random_state)
                for label in class_sizes.index
            ])
        else:
            balanced = balanced.iloc[0:0].copy()

    return balanced

In [9]:
# Balance on both axes: cap each nationality, then equalise the categories.
df_balanced = balance_df(
    Balance_Nationality=True,
    Balance_Category=True,
)

### Prepare Data to run Models

In [10]:
# Work on a random subset of at most 20000 rows to keep training fast.
# Capping at the rows available stops sample() from raising when the
# balanced frame is smaller than the requested size.
df_model = df_balanced.sample(n=min(20000, len(df_balanced)), random_state=1)

In [11]:
# Predictors that will be one-hot encoded.
x_categorical = [
    'Review_Month',
    'City',
    'Pet',
    'Purpose',
    'Whom',
    'Room_Recode',
    'Nationality_Recode',
    'Length_Recode',
]
# Predictors that will be standardised.
x_numerical = [
    'Average_Score',
    'Total_Number_of_Reviews_Reviewer_Has_Given',
    'Close_Landmarks',
    'Dist_Center',
    'Dist_Train',
    'Dist_Airport',
]
x_col = [*x_categorical, *x_numerical]
# Target column.
y_col = 'Category'

In [12]:
def _standardize(column):
    """Centre a column on its mean and scale it by its standard deviation."""
    return (column - np.mean(column)) / np.std(column)


X_numerical = df_model[x_numerical]
# Column-wise z-scores, computed over the whole modelling sample.
X_numerical_std = X_numerical.apply(_standardize)

In [13]:
# Label missing categories BEFORE encoding. The indicator columns produced by
# get_dummies never contain NaN, so filling afterwards could not label them:
# a row with a missing category was encoded as all zeros, exactly like the
# dropped first level.
categorical_filled = df_model[x_categorical].fillna('Not Available')
X_categorical = pd.get_dummies(categorical_filled, prefix_sep='_', drop_first=True)

In [14]:
# Design matrix: standardised numeric columns followed by the dummy columns.
feature_blocks = [X_numerical_std, X_categorical]
X = pd.concat(feature_blocks, axis=1, sort=False)
# Target labels for the same rows.
y = df_model[y_col]

Split into Train and Test

In [15]:
from sklearn.model_selection import train_test_split
# 50/50 hold-out split with a fixed seed.
base_split = train_test_split(X, y, test_size=0.5, random_state=100)
X_train, X_test, y_train, y_test = base_split

In [16]:
# Sanity check: shapes of the test and train partitions.
tuple(part.shape for part in (X_test, y_test, X_train, y_train))

((10000, 50), (10000,), (10000, 50), (10000,))

## MODELS

### KNN

In [17]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, cohen_kappa_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import PolynomialFeatures

# k-nearest-neighbours baseline (k=5).
clf = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
pred_knn = clf.predict(X_test)
train_knn = clf.predict(X_train)
# Report accuracy | Cohen's kappa per split; both metrics are symmetric in their arguments.
for split_name, y_true, y_hat in (('Test :', y_test, pred_knn), ('Train:', y_train, train_knn)):
    print(split_name, f'{accuracy_score(y_true, y_hat):.4f}', '|', f'{cohen_kappa_score(y_true, y_hat):.4f}')

Test : 0.5721 | 0.1441
Train: 0.7223 | 0.4446


### Gradient Boosting

In [18]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix

# Gradient-boosted trees with a small learning rate and shallow trees.
gbt_params = dict(learning_rate=0.01, max_depth=4, random_state=1)
clf = GradientBoostingClassifier(**gbt_params).fit(X_train, y_train)
pred_gbt = clf.predict(X_test)
train_gbt = clf.predict(X_train)
# Report accuracy | Cohen's kappa per split; both metrics are symmetric in their arguments.
for split_name, y_true, y_hat in (('Test :', y_test, pred_gbt), ('Train:', y_train, train_gbt)):
    print(split_name, f'{accuracy_score(y_true, y_hat):.4f}', '|', f'{cohen_kappa_score(y_true, y_hat):.4f}')

Test : 0.6140 | 0.2299
Train: 0.6220 | 0.2429


### Random Forest

In [19]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: 50 trees of depth <= 5.
rf_params = dict(n_estimators=50, max_depth=5, random_state=1)
clf = RandomForestClassifier(**rf_params).fit(X_train, y_train)
pred_rf = clf.predict(X_test)
train_rf = clf.predict(X_train)
# Report accuracy | Cohen's kappa per split; both metrics are symmetric in their arguments.
for split_name, y_true, y_hat in (('Test :', y_test, pred_rf), ('Train:', y_train, train_rf)):
    print(split_name, f'{accuracy_score(y_true, y_hat):.4f}', '|', f'{cohen_kappa_score(y_true, y_hat):.4f}')

Test : 0.6176 | 0.2365
Train: 0.6335 | 0.2662


### XGBoost

In [20]:
import xgboost as xgb

# XGBoost with 5 boosting rounds of depth-5 trees.
# NOTE(review): recent xgboost releases appear to reject string class labels;
# confirm against the installed version before re-running.
clf = xgb.XGBClassifier(
    objective="binary:logistic",
    n_estimators=5,
    max_depth=5,
    random_state=1,
)
clf.fit(X_train, y_train)
pred_xgb = clf.predict(X_test)
train_xgb = clf.predict(X_train)
# Report accuracy | Cohen's kappa per split; both metrics are symmetric in their arguments.
for split_name, y_true, y_hat in (('Test :', y_test, pred_xgb), ('Train:', y_train, train_xgb)):
    print(split_name, f'{accuracy_score(y_true, y_hat):.4f}', '|', f'{cohen_kappa_score(y_true, y_hat):.4f}')

Test : 0.6207 | 0.2427
Train: 0.6396 | 0.2784


### Logistic Regression

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, cohen_kappa_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import PolynomialFeatures

# Logistic regression; max_iter is raised to 1000 for the lbfgs solver.
logit_params = dict(solver='lbfgs', max_iter=1000, random_state=1)
clf = LogisticRegression(**logit_params).fit(X_train, y_train)
pred_log = clf.predict(X_test)
train_log = clf.predict(X_train)
# Report accuracy | Cohen's kappa per split; both metrics are symmetric in their arguments.
for split_name, y_true, y_hat in (('Test :', y_test, pred_log), ('Train:', y_train, train_log)):
    print(split_name, f'{accuracy_score(y_true, y_hat):.4f}', '|', f'{cohen_kappa_score(y_true, y_hat):.4f}')

Test : 0.6235 | 0.2467
Train: 0.6273 | 0.2547


### Decision Trees

In [22]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree, depth capped at 5.
clf = DecisionTreeClassifier(max_depth=5, random_state=1).fit(X_train, y_train)
pred_tree = clf.predict(X_test)
train_tree = clf.predict(X_train)
# Report accuracy | Cohen's kappa per split; both metrics are symmetric in their arguments.
for split_name, y_true, y_hat in (('Test :', y_test, pred_tree), ('Train:', y_train, train_tree)):
    print(split_name, f'{accuracy_score(y_true, y_hat):.4f}', '|', f'{cohen_kappa_score(y_true, y_hat):.4f}')

Test : 0.5972 | 0.1970
Train: 0.6145 | 0.2275


### SVM

In [23]:
from sklearn.svm import SVC

# Support vector classifier with C=0.5 and the default kernel.
clf = SVC(C=0.5, random_state=1).fit(X_train, y_train)
pred_svm = clf.predict(X_test)
train_svm = clf.predict(X_train)
# Report accuracy | Cohen's kappa per split; both metrics are symmetric in their arguments.
for split_name, y_true, y_hat in (('Test :', y_test, pred_svm), ('Train:', y_train, train_svm)):
    print(split_name, f'{accuracy_score(y_true, y_hat):.4f}', '|', f'{cohen_kappa_score(y_true, y_hat):.4f}')

Test : 0.6264 | 0.2534
Train: 0.6728 | 0.3452


### Naive Bayes

In [24]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB

# Bernoulli naive Bayes with default settings.
clf = BernoulliNB().fit(X_train, y_train)
pred_nb = clf.predict(X_test)
train_nb = clf.predict(X_train)
# Report accuracy | Cohen's kappa per split; both metrics are symmetric in their arguments.
for split_name, y_true, y_hat in (('Test :', y_test, pred_nb), ('Train:', y_train, train_nb)):
    print(split_name, f'{accuracy_score(y_true, y_hat):.4f}', '|', f'{cohen_kappa_score(y_true, y_hat):.4f}')

Test : 0.6110 | 0.2221
Train: 0.6007 | 0.2013


### Neural Network

In [25]:
import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping

# Keras takes plain arrays: features as ndarrays, labels one-hot encoded
# from 0/1 codes (1 = 'Good').
predictors = X_train.to_numpy()
pred_data = X_test.to_numpy()
df_target = [int(label == 'Good') for label in y_train]
target = to_categorical(df_target)

# Input width of the network and the early-stopping callback (patience of 2 epochs).
n_cols = predictors.shape[1]
input_shape = (n_cols,)
early_stopping_monitor = EarlyStopping(patience=2)

Using TensorFlow backend.


In [26]:
# Feed-forward classifier: three ReLU hidden layers (100, 500, 500 units)
# followed by a 2-unit softmax output.
model = Sequential([
    Dense(100, activation='relu', input_shape=input_shape),
    Dense(500, activation='relu'),
    Dense(500, activation='relu'),
    Dense(2, activation='softmax'),
])

In [27]:
# Train for up to 20 epochs, holding out 30% of the training rows for
# validation; the early-stopping callback may end training sooner.
fit_options = dict(
    epochs=20,
    validation_split=0.3,
    callbacks=[early_stopping_monitor],
    verbose=False,
)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(predictors, target, **fit_options)

<keras.callbacks.callbacks.History at 0x23d37c454c8>

In [28]:
def _labels_from_probabilities(probabilities):
    """Turn P('Good') values into 'Good'/'Bad' labels by rounding each one."""
    labels = []
    for probability in probabilities:
        labels.append('Good' if round(probability) else 'Bad')
    return labels


# Column 1 of the softmax output is the probability of the 'Good' class.
predictions_test = list(model.predict(pred_data)[:, 1])
predictions_train = list(model.predict(predictors)[:, 1])
pred_net = _labels_from_probabilities(predictions_test)
train_net = _labels_from_probabilities(predictions_train)
# Report accuracy | Cohen's kappa per split; both metrics are symmetric in their arguments.
for split_name, y_true, y_hat in (('Test :', y_test, pred_net), ('Train:', y_train, train_net)):
    print(split_name, f'{accuracy_score(y_true, y_hat):.4f}', '|', f'{cohen_kappa_score(y_true, y_hat):.4f}')

Test : 0.6097 | 0.2163
Train: 0.6343 | 0.2703


### Summary of Predictions

In [29]:
# Side-by-side view of the true test labels and each model's test predictions.
summary_columns = {
    'TEST': y_test,
    'logistic': pred_log,
    'knn': pred_knn,
    'xgb': pred_xgb,
    'gbt': pred_gbt,
    'nnet': pred_net,
    'rf': pred_rf,
    'svm': pred_svm,
    'tree': pred_tree,
}
df_models_test = pd.DataFrame(
    list(zip(*summary_columns.values())),
    columns=list(summary_columns),
)
df_models_test.head()

Unnamed: 0,TEST,logistic,knn,xgb,gbt,nnet,rf,svm,tree
0,Good,Bad,Bad,Bad,Bad,Good,Bad,Bad,Bad
1,Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad
2,Good,Good,Good,Good,Good,Good,Good,Good,Good
3,Bad,Good,Bad,Good,Good,Good,Bad,Good,Bad
4,Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad


## STACKING

Predictions using a Random Forest, adding the predictions of the other models to the original dataset.
I had to split the test set in two to create the new train/test sets; otherwise I would overfit, because the predictions on my original training set are biased relative to those on the test set.

First I append the predictions of the model to the dataset

In [30]:
# Work on copies so the original hold-out set stays untouched.
X_2, y_2 = X_test.copy(), y_test.copy()

In [31]:
# Meta-feature: logistic-regression prediction on the hold-out rows.
X_2 = X_2.assign(logistic=pred_log)

In [32]:
# Meta-feature: KNN prediction on the hold-out rows.
X_2 = X_2.assign(knn=pred_knn)

In [33]:
# Meta-feature: SVM prediction on the hold-out rows.
X_2 = X_2.assign(svm=pred_svm)

In [34]:
# Meta-feature: neural-network prediction on the hold-out rows.
X_2 = X_2.assign(nnet=pred_net)

In [35]:
# Meta-feature: XGBoost prediction on the hold-out rows.
X_2 = X_2.assign(xgb=pred_xgb)

In [36]:
# Meta-feature: naive-Bayes prediction on the hold-out rows.
X_2 = X_2.assign(nb=pred_nb)

In [37]:
# Peek at the first two rows of the six appended prediction columns.
X_2.iloc[:2, -6:]

Unnamed: 0,logistic,knn,svm,nnet,xgb,nb
468162,Bad,Bad,Bad,Good,Bad,Bad
102164,Bad,Bad,Bad,Bad,Bad,Bad


In [38]:
# Recode the stacked predictions as 1 ('Good') / 0 (anything else).
# The columns are addressed by name rather than by position, and columns that
# are already numeric are skipped. Re-running the cell therefore no longer
# turns every meta-feature into 0 (an encoded 1 is not equal to 'Good').
meta_cols = ['logistic', 'knn', 'svm', 'nnet', 'xgb', 'nb']
for meta_col in meta_cols:
    if not pd.api.types.is_numeric_dtype(X_2[meta_col]):
        X_2[meta_col] = (X_2[meta_col] == 'Good').astype(int)

In [39]:
# Split the augmented hold-out set 80/20 for the stacked model.
stack_split = train_test_split(X_2, y_2, test_size=0.2, random_state=100)
X_train_2, X_test_2, y_train_2, y_test_2 = stack_split

### Stacked Random Forest

In [40]:
from sklearn.ensemble import RandomForestClassifier

# Meta-learner: random forest over the original features plus the base-model predictions.
stacked_rf_params = dict(n_estimators=100, max_depth=5, random_state=1)
clf = RandomForestClassifier(**stacked_rf_params).fit(X_train_2, y_train_2)
pred_rf_2 = clf.predict(X_test_2)
train_rf_2 = clf.predict(X_train_2)
# Report accuracy | Cohen's kappa per split; both metrics are symmetric in their arguments.
for split_name, y_true, y_hat in (('Test :', y_test_2, pred_rf_2), ('Train:', y_train_2, train_rf_2)):
    print(split_name, f'{accuracy_score(y_true, y_hat):.4f}', '|', f'{cohen_kappa_score(y_true, y_hat):.4f}')

Test : 0.6470 | 0.2938
Train: 0.6432 | 0.2866


In [41]:
# Confusion table on the stacked training set: predictions in rows, true labels in columns.
pd.crosstab(index=train_rf_2, columns=y_train_2)

Category,Bad,Good
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Bad,2569,1466
Good,1388,2577


In [42]:
# Rank the stacked model's inputs by their random-forest importance.
feature_importance = pd.DataFrame({
    'Feature': X_train_2.columns,
    'Importance': clf.feature_importances_,
}).sort_values('Importance', ascending=False)
feature_importance.head(10)

Unnamed: 0,Feature,Importance
54,xgb,0.159235
52,svm,0.142699
50,logistic,0.139236
0,Average_Score,0.090929
53,nnet,0.087761
55,nb,0.083108
1,Total_Number_of_Reviews_Reviewer_Has_Given,0.038469
4,Dist_Train,0.032536
51,knn,0.031389
35,Nationality_Recode_Middle east,0.029018
