### Open File

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from matplotlib import pyplot as plt

In [2]:
df = pd.read_csv('./data/df_features.gz')

I fix the date because when Pandas opens the file it reads it incorrectly

In [3]:
df['Review_Date'] = df['Review_Date'].apply(lambda x: dt.datetime.strptime(x,'%Y-%m-%d'))

In [4]:
df['Review_Month'] = df.Review_Month.apply(lambda x: str(x))

I delete the rows without a distance to city center (because in the origin hotel don't have latitude and longitude)

In [5]:
df = df.dropna(subset=['Dist_Center'])
df = df.dropna(subset=['Price'])
df = df.dropna(subset=['Stars'])

### Create Predicted Category for final models (2 categories)

In [6]:
df['Diff_Percentage'] = df.Reviewer_Score / df.Average_Score

In [7]:
category = np.array(['Bad' if i < 0.70 else 'Good' for i in df.Diff_Percentage])
df.loc[:, 'Category'] = category
df.Category.value_counts() / len(df) 

Good    0.918884
Bad     0.081116
Name: Category, dtype: float64

### Balance Nationalities and / or Categories

In [8]:
def balance_df(Balance_Nationality, Balance_Category):
    df_balance_country = df.copy()
    if Balance_Nationality:
        df_balance_country = None
        for i in list(df.Nationality_Recode.value_counts().index):
            nationality = df[df.Nationality_Recode == i]
            n = 20000
            if len(nationality) < 20000:
                n = len(nationality)
            nationality = nationality.sample(n, random_state=1)
            df_balance_country = pd.concat([df_balance_country, nationality])
    
    df_balance_class = df_balance_country.copy()
    if Balance_Category:
        df_balance_class = None
        minclass = np.min(df_balance_country.Category.value_counts())
        classes = list(df_balance_country.Category.value_counts().index)
        for i in classes:
            selected_class = df_balance_country[df_balance_country.Category == i].sample(minclass, random_state=1)
            df_balance_class = pd.concat([df_balance_class, selected_class])
    
    return(df_balance_class)

In [9]:
df_balanced = balance_df(Balance_Nationality=True, Balance_Category=False)

### Prepare Data to run Models

In [10]:
df_model = df_balanced.sample(n=20000, random_state=1)

In [11]:
x_categorical = ['Review_Month', 'City', 'Pet', 'Purpose', 'Whom', 'Room_Recode', 'Nationality_Recode', 'Length_Recode','Stars']
x_numerical = ['Average_Score', 'Total_Number_of_Reviews_Reviewer_Has_Given', 'Close_Landmarks', 'Dist_Center', 
               'Dist_Train', 'Dist_Airport', 'Price']
x_col = x_categorical + x_numerical
y_col = 'Reviewer_Score'

In [12]:
X_numerical = df_model[x_numerical]
X_numerical_std = X_numerical.apply(lambda x: ((x-np.mean(x)) / np.std(x)))

In [13]:
X_categorical = pd.get_dummies(df_model[x_categorical], prefix_sep='_', drop_first=True)
X_categorical = X_categorical.fillna('Not Available')

In [14]:
X = pd.concat([X_numerical_std, X_categorical], axis=1, sort=False)
y = df_model[y_col]

Split into Train and Test

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=100)

In [16]:
X_test.shape, y_test.shape, X_train.shape, y_train.shape

((10000, 54), (10000,), (10000, 54), (10000,))

## MODELS

In [50]:
from sklearn.metrics import mean_absolute_error
from scipy.stats import linregress

def error_metrics(pred_train, pred_test, y_train, y_test, train=True, test=True):
    if test:
        print('Test  -> R^2:', f'{linregress(pred_test, y_test)[2]**2:.4f}', '|', 
                       'MAE:', f'{mean_absolute_error(pred_test, y_test):.4f}'
             )
    if train:
        print('Train -> R^2:', f'{linregress(pred_train, y_train)[2]**2:.4f}', '|', 
                       'MAE:', f'{mean_absolute_error(pred_train, y_train):.4f}'
             ) 

### KNN

In [52]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
from scipy.stats import linregress

clf = KNeighborsRegressor(n_neighbors=10)
clf.fit(X_train, y_train)
train_knn = clf.predict(X_train)
pred_knn = clf.predict(X_test)
error_metrics(train_knn, pred_knn, y_train, y_test)

Test  -> R^2: 0.0831 | MAE: 1.2228
Train -> R^2: 0.2241 | MAE: 1.1243


### Gradient Boosting

In [54]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from scipy.stats import linregress

clf = GradientBoostingRegressor(learning_rate=0.01, max_depth=4, random_state=1)
clf.fit(X_train, y_train)
train_gbt = clf.predict(X_train)
pred_gbt = clf.predict(X_test)
error_metrics(train_gbt, pred_gbt, y_train, y_test)

Test  -> R^2: 0.1431 | MAE: 1.2090
Train -> R^2: 0.1524 | MAE: 1.2167


### Random Forest

In [55]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from scipy.stats import linregress

clf = RandomForestRegressor(n_estimators = 50, max_depth = 5, random_state=1)
clf.fit(X_train, y_train)
train_rf = clf.predict(X_train)
pred_rf = clf.predict(X_test)
error_metrics(train_rf, pred_rf, y_train, y_test)

Test  -> R^2: 0.1430 | MAE: 1.1761
Train -> R^2: 0.1664 | MAE: 1.1746


### XGBoosting

In [56]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from scipy.stats import linregress

clf = xgb.XGBRegressor(objective="reg:squarederror", n_estimators = 5, max_depth=5, random_state=1)
clf.fit(X_train, y_train)
train_xgb = clf.predict(X_train)
pred_xgb = clf.predict(X_test)
error_metrics(train_xgb, pred_xgb, y_train, y_test)

Test  -> R^2: 0.1379 | MAE: 1.7947
Train -> R^2: 0.1746 | MAE: 1.7766


### Decision Trees

In [57]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from scipy.stats import linregress

clf = DecisionTreeRegressor(max_depth=5, random_state=1)
clf.fit(X_train, y_train)
train_tree = clf.predict(X_train)
pred_tree = clf.predict(X_test)
error_metrics(train_tree, pred_tree, y_train, y_test)

Test  -> R^2: 0.1253 | MAE: 1.1857
Train -> R^2: 0.1456 | MAE: 1.1842


### SVM

In [None]:
from sklearn.svm import SVR

clf = SVR(kernel='poly', C=1e3, degree=3)
clf.fit(X_train, y_train)
train_svr = clf.predict(X_train)
pred_svr = clf.predict(X_test)
error_metrics(train_svr, pred_svr, y_train, y_test)

### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB

clf = BernoulliNB()
clf.fit(X_train, y_train)
train_nb = clf.predict(X_train)
pred_nb = clf.predict(X_test)
error_metrics(train_nb, pred_nb, y_train, y_test)

### Resum Respostes

In [29]:
df_models_test = pd.DataFrame(list(zip(y_test, pred_log, pred_knn, pred_xgb, pred_gbt, pred_net, pred_rf, 
                                       pred_svm, pred_tree)), 
                         columns=['TEST','logistic','knn','xgb','gbt','nnet','rf','svm', 'tree']) 
df_models_test.head()

Unnamed: 0,TEST,logistic,knn,xgb,gbt,nnet,rf,svm,tree
0,Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad
1,Good,Good,Good,Good,Good,Good,Good,Good,Good
2,Bad,Good,Good,Good,Good,Bad,Good,Good,Good
3,Good,Good,Bad,Bad,Bad,Bad,Bad,Bad,Bad
4,Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad


## STACKING

Predictions using Random Forest adding the predicions of other models to the original dataset.
I had to split the test set in 2 to create the new train/test set, otherwise i fall into overfitting because my original training set is biased vs the test set

First I append the predictions of the model to the dataset

In [30]:
X_2 = X_test.copy()
y_2 = y_test.copy()

In [31]:
X_2['logistic'] = pred_log

In [32]:
X_2['knn'] = pred_knn

In [33]:
X_2['svm'] = pred_svm

In [34]:
X_2['nnet'] = pred_net

In [35]:
X_2['tree'] = pred_tree

In [36]:
X_2['nb'] = pred_nb

In [37]:
X_2.iloc[:,-6:].head(2)

Unnamed: 0,logistic,knn,svm,nnet,tree,nb
65357,Bad,Bad,Bad,Bad,Bad,Bad
247493,Good,Good,Good,Good,Good,Good


In [38]:
X_2.iloc[:,-6:] = X_2.iloc[:,-6:].apply(lambda x: [1 if i=='Good' else 0 for i in x])

In [39]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_2, y_2, test_size=0.2, random_state=100)

### Stacked Random Forest

In [40]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators = 100, max_depth = 5, random_state=1)
clf.fit(X_train_2, y_train_2)
train_rf_2 = clf.predict(X_train_2)
pred_rf_2 = clf.predict(X_test_2)
print('Test :',  f'{accuracy_score(pred_rf_2, y_test_2):.4f}', '|', f'{cohen_kappa_score(pred_rf_2, y_test_2):.4f}')
print('Train:',  f'{accuracy_score(train_rf_2, y_train_2):.4f}', '|', f'{cohen_kappa_score(train_rf_2, y_train_2):.4f}')

Test : 0.6700 | 0.3402
Train: 0.6703 | 0.3405


In [43]:
pd.crosstab(train_rf_2, y_train_2)

Category,Bad,Good
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Bad,2664,1299
Good,1339,2698
