### Open File

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from matplotlib import pyplot as plt

In [2]:
df = pd.read_csv('./data/df_features.gz')

I fix the date because when Pandas opens the file it reads it incorrectly

In [3]:
df['Review_Date'] = df['Review_Date'].apply(lambda x: dt.datetime.strptime(x,'%Y-%m-%d'))
df['Review_Month'] = df.Review_Month.apply(lambda x: str(x))

I delete the rows without a distance to city center (because in the origin hotel don't have latitude and longitude)

In [4]:
df = df.dropna(subset=['Dist_Center'])
#df = df.dropna(subset=['Reservation_ADR'])

### Create Predicted Category for final models (2 categories)

In [5]:
diff_hotels = df[['Hotel_Address','Diff']].groupby('Hotel_Address').describe()
diff_hotels = diff_hotels.Diff.reset_index()

In [6]:
df = pd.merge(df, diff_hotels, on='Hotel_Address')

In [7]:
category = np.array(['Bad' if i < 7 else 'Good' for i in df.Reviewer_Score])
df.loc[:, 'Category'] = category
df.Category.value_counts() / len(df) 

Good    0.831582
Bad     0.168418
Name: Category, dtype: float64

In [8]:
df.columns

Index(['Hotel_Address', 'Additional_Number_of_Scoring', 'Review_Date',
       'Average_Score', 'Hotel_Name', 'Reviewer_Nationality',
       'Review_Total_Negative_Word_Counts', 'Total_Number_of_Reviews',
       'Review_Total_Positive_Word_Counts',
       'Total_Number_of_Reviews_Reviewer_Has_Given', 'Reviewer_Score',
       'days_since_review', 'lat', 'lng', 'Diff', 'Diff_Percentage',
       'Review_Month', 'Review_Year', 'Country', 'City', 'Pet', 'Purpose',
       'Whom', 'Room', 'Length', 'Device', 'Room_Recode', 'Nationality_Recode',
       'Length_Recode', 'Close_Landmarks', 'Dist_Center', 'Dist_Airport',
       'Dist_Train', 'Price', 'Stars', 'Length_N', 'Reservation_ADR',
       'food_Neg', 'staff_Neg', 'location_Neg', 'value_Neg', 'comfort_Neg',
       'room_Neg', 'facilities_Neg', 'cleanliness_Neg', 'food_Pos',
       'staff_Pos', 'location_Pos', 'value_Pos', 'comfort_Pos', 'room_Pos',
       'facilities_Pos', 'cleanliness_Pos', 'food_Neg_Hotel',
       'staff_Neg_Hotel', 'loca

### Balance Nationalities and / or Categories

In [9]:
def balance_df(Balance_Nationality, Balance_Category):
    df_balance_country = df.copy()
    if Balance_Nationality:
        df_balance_country = None
        for i in list(df.Nationality_Recode.value_counts().index):
            nationality = df[df.Nationality_Recode == i]
            n = 20000
            if len(nationality) < 20000:
                n = len(nationality)
            nationality = nationality.sample(n, random_state=1)
            df_balance_country = pd.concat([df_balance_country, nationality])
    
    df_balance_class = df_balance_country.copy()
    if Balance_Category:
        df_balance_class = None
        minclass = np.min(df_balance_country.Category.value_counts())
        classes = list(df_balance_country.Category.value_counts().index)
        for i in classes:
            selected_class = df_balance_country[df_balance_country.Category == i].sample(minclass, random_state=1)
            df_balance_class = pd.concat([df_balance_class, selected_class])
    
    return(df_balance_class)

In [10]:
df_balanced = balance_df(Balance_Nationality=True, Balance_Category=True)

### Prepare Data to run Models

In [11]:
df_model = df_balanced.sample(n=20000, random_state=1)

In [15]:
x_categorical = ['Review_Month', 'City', 'Pet', 'Purpose', 'Whom', 'Room_Recode', 'Nationality_Recode', 'Length_Recode']
x_numerical = ['Average_Score', 'Total_Number_of_Reviews_Reviewer_Has_Given', 'Close_Landmarks', 'Dist_Center', 
               'Dist_Train', 'Dist_Airport','food_Neg_Hotel','staff_Neg_Hotel', 'location_Neg_Hotel', 'value_Neg_Hotel',
               'comfort_Neg_Hotel', 'room_Neg_Hotel', 'facilities_Neg_Hotel','cleanliness_Neg_Hotel', 
               'food_Pos_Hotel', 'staff_Pos_Hotel','location_Pos_Hotel', 'value_Pos_Hotel', 'comfort_Pos_Hotel',
               'room_Pos_Hotel', 'facilities_Pos_Hotel', 'cleanliness_Pos_Hotel','count', 'std', 'mean', 'min', '25%', 
               '50%', '75%', 'max']
x_col = x_categorical + x_numerical
y_col = 'Category'

In [16]:
X_numerical = df_model[x_numerical]
X_numerical_std = X_numerical.apply(lambda x: ((x-np.mean(x)) / np.std(x)))

In [17]:
X_categorical = pd.get_dummies(df_model[x_categorical], prefix_sep='_', drop_first=True)
X_categorical = X_categorical.fillna('Not Available')

In [18]:
X = pd.concat([X_numerical_std, X_categorical], axis=1, sort=False)
y = df_model[y_col]

Split into Train and Test

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=100)

In [20]:
X_test.shape, y_test.shape, X_train.shape, y_train.shape

((10000, 74), (10000,), (10000, 74), (10000,))

## MODELS

In [21]:
X_test.columns

Index(['Average_Score', 'Total_Number_of_Reviews_Reviewer_Has_Given',
       'Close_Landmarks', 'Dist_Center', 'Dist_Train', 'Dist_Airport',
       'food_Neg_Hotel', 'staff_Neg_Hotel', 'location_Neg_Hotel',
       'value_Neg_Hotel', 'comfort_Neg_Hotel', 'room_Neg_Hotel',
       'facilities_Neg_Hotel', 'cleanliness_Neg_Hotel', 'food_Pos_Hotel',
       'staff_Pos_Hotel', 'location_Pos_Hotel', 'value_Pos_Hotel',
       'comfort_Pos_Hotel', 'room_Pos_Hotel', 'facilities_Pos_Hotel',
       'cleanliness_Pos_Hotel', 'count', 'std', 'mean', 'min', '25%', '50%',
       '75%', 'max', 'Review_Month_10', 'Review_Month_11', 'Review_Month_12',
       'Review_Month_2', 'Review_Month_3', 'Review_Month_4', 'Review_Month_5',
       'Review_Month_6', 'Review_Month_7', 'Review_Month_8', 'Review_Month_9',
       'City_Barcelona', 'City_London', 'City_Milan', 'City_Paris',
       'City_Vienna', 'Purpose_Leisure trip',
       'Whom_Family with older children', 'Whom_Family with young children',
       'Whom_

### Random Forest

In [86]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB, BernoulliNB

from sklearn.metrics import accuracy_score, cohen_kappa_score
import random
from random import sample

In [89]:
def check_model(variable, X_train, X_test, y_train, y_test):
    clf = xgb.XGBClassifier(objective="binary:logistic", n_estimators = 7, max_depth=4, random_state=1)
    clf = RandomForestClassifier(n_estimators = 50, max_depth = 4, random_state=1)
    clf = BernoulliNB()
    clf = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=1)
    
    clf.fit(X_train, y_train)
    prediccion = clf.predict(X_test)
    acc_score = accuracy_score(prediccion, y_test)
    # print(variable, acc_score)
    return(variable, acc_score)

In [90]:
# FULL MODEL

score = check_model('', X_train, X_test, y_train, y_test)[1]
varout = []
varin = list(X_test.columns)

import random

# BACKWARD ELIMINATION DELETE FIRST RANDOM VARIABLE THAT INCRESES THE ACCURACY)

for n in range(len(varin)):
    max_score = score
    max_feature = []
    
    random.seed(1)
    for i in sample(varin, len(varin)):
        var_test = varin.copy()
        var_test.remove(i)
        X_train_vartest = X_train[var_test]
        X_test_vartest = X_test[var_test]
        check = check_model(i, X_train_vartest, X_test_vartest, y_train, y_test)
        if check[1] > max_score:
            max_feature = check[0]
            max_score = check[1] 
            varin.remove(max_feature)   
            varout.append(max_feature)
            print(n, 'Original Score:', score, '| New score:', max_score, '| Variable to remove:', max_feature)
            break
    
    if max_score > score:
        score = max_score
    else:
        print('End of process')
        break

0 Original Score: 0.6535 | New score: 0.6556 | Variable to remove: value_Pos_Hotel
1 Original Score: 0.6556 | New score: 0.6562 | Variable to remove: Whom_Group
2 Original Score: 0.6562 | New score: 0.6564 | Variable to remove: Review_Month_2
3 Original Score: 0.6564 | New score: 0.6575 | Variable to remove: cleanliness_Neg_Hotel
4 Original Score: 0.6575 | New score: 0.6584 | Variable to remove: room_Pos_Hotel
5 Original Score: 0.6584 | New score: 0.6585 | Variable to remove: location_Neg_Hotel
6 Original Score: 0.6585 | New score: 0.6586 | Variable to remove: Length_Recode_Stayed 2 nights
7 Original Score: 0.6586 | New score: 0.6587 | Variable to remove: Room_Recode_Studio
8 Original Score: 0.6587 | New score: 0.6589 | Variable to remove: Length_Recode_Stayed 9+ nights
9 Original Score: 0.6589 | New score: 0.6592 | Variable to remove: value_Neg_Hotel
10 Original Score: 0.6592 | New score: 0.66 | Variable to remove: Nationality_Recode_Middle east
11 Original Score: 0.66 | New score: 0.