De manera similar a lo que ocurre con las clases de nuestra variable respuesta, existe otra variable importante y con influencia en las respuestas: la nacionalidad, con un claro peso de una sola nacionalidad (UK, ~50 % de las respuestas). A pesar de no ser imprescindible ni tan crítico como en el caso de la variable respuesta, también se ha probado a balancear esta variable para ver si mejoraban los resultados del modelo.

### Open File

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from matplotlib import pyplot as plt

In [2]:
# Load the feature table; pandas infers gzip compression from the '.gz' suffix.
df = pd.read_csv('./data/df_features.gz')

### Create Predicted Category for final models (2 categories)

I create summary-statistic columns for each hotel because each hotel behaves differently and it is important to take that into account.

In [None]:
# Per-hotel summary statistics (count, mean, std, min, quartiles, max) of the
# 'Diff' column, with 'Hotel_Address' turned back into a regular column.
diff_hotels = (
    df[['Hotel_Address', 'Diff']]
    .groupby('Hotel_Address')
    .describe()['Diff']
    .reset_index()
)

In [None]:
# Attach the per-hotel statistics to every review of that hotel
# (default inner join on 'Hotel_Address').
df = df.merge(diff_hotels, on='Hotel_Address')

In [None]:
# Binary target: scores strictly below 7 are 'Bad', everything else is 'Good'.
# np.where evaluates the comparison on the whole column at once instead of
# looping over the scores in Python; the resulting labels are identical.
category = np.where(df.Reviewer_Score < 7, 'Bad', 'Good')
df.loc[:, 'Category'] = category
# Share of each class, to see how imbalanced the target is.
df.Category.value_counts() / len(df)

### Balance Nationalities and / or Categories

In [None]:
# Number of reviews per nationality group, before any balancing.
df.Nationality_Recode.value_counts()

In [6]:
def balance_df(df, Balance_Nationality, Balance_Category, cut):
    """Return a downsampled copy of ``df`` with balanced groups.

    The two balancing steps are optional and applied in this order:

    1. Nationality: every ``Nationality_Recode`` group is capped at ``cut``
       rows (groups smaller than ``cut`` are kept whole).
    2. Category: every ``Category`` class is downsampled to the size of the
       smallest class, measured after step 1.

    Sampling always uses ``random_state=1``, so the result is reproducible.
    Rows whose grouping column is missing are dropped by the corresponding
    step, because ``value_counts`` ignores missing values.

    Args:
        df: DataFrame holding the ``Nationality_Recode`` and/or ``Category``
            columns (only the columns of the enabled steps are read).
        Balance_Nationality: if true, cap each nationality at ``cut`` rows.
        Balance_Category: if true, equalise the class sizes.
        cut: maximum number of rows kept per nationality; ignored when
            ``Balance_Nationality`` is false.

    Returns:
        A new DataFrame; ``df`` itself is never modified. When an enabled
        step finds no groups at all (for example ``df`` is empty) an empty
        frame with the columns of ``df`` is returned instead of raising.
    """
    if Balance_Nationality:
        # Groups are visited in value_counts order (largest first), which
        # also fixes the row order of the result.
        groups = (
            df[df.Nationality_Recode == name]
            for name in df.Nationality_Recode.value_counts().index
        )
        samples = [
            group.sample(min(cut, len(group)), random_state=1)
            for group in groups
        ]
        # Concatenate once; pd.concat raises on an empty list, so fall back
        # to an empty frame that keeps the original columns.
        balanced = pd.concat(samples) if samples else df.iloc[:0].copy()
    else:
        balanced = df.copy()

    if Balance_Category:
        class_sizes = balanced.Category.value_counts()
        if class_sizes.empty:
            return balanced.iloc[:0].copy()
        smallest = int(class_sizes.min())
        samples = [
            balanced[balanced.Category == label].sample(smallest, random_state=1)
            for label in class_sizes.index
        ]
        balanced = pd.concat(samples)

    return balanced

In [7]:
# Step 1: equalise the two categories on the full data. Nationality balancing
# is disabled here, so the value passed as ``cut`` is not used.
df_balanced_1 = balance_df(df, Balance_Nationality=False, Balance_Category=True, cut=10000)
# Step 2: cap every nationality at 1.5x the median group size of step 1, then
# equalise the categories again on the capped data.
df_balanced_2 = balance_df(df_balanced_1, Balance_Nationality=True, Balance_Category=True, 
                         cut=int(np.median(df_balanced_1.Nationality_Recode.value_counts())*1.5))
df_balanced_2.shape

(77270, 78)

In [8]:
# Class sizes after balancing.
df_balanced_2.Category.value_counts()

Bad     38635
Good    38635
Name: Category, dtype: int64

In [9]:
# Rows per nationality group after balancing.
df_balanced_2.Nationality_Recode.value_counts()

North America          11829
UK & Ireland           11810
Western Europe         11809
Middle east            11761
Eastern Europe          9058
Asia & Pacific          7869
Oceania                 7555
Sub-Saharian Africa     2023
South/Latin America     1614
China                   1127
Arab States              815
Name: Nationality_Recode, dtype: int64

### Prepare Data for Modeling

Subset a small fraction to run the first models

In [10]:
# Work on a fixed-seed random subset of 10,000 rows.
df_model = df_balanced_2.sample(n=10000, random_state=1)

In [11]:
# Predictors that are dummy-encoded further down.
x_categorical = ['Review_Month','City','Pet','Purpose','Whom','Room_Recode','Nationality_Recode','Length_Recode','Stars']
# Predictors that are standardised further down. The last eight names
# ('count' ... 'max') are the per-hotel describe() statistics of 'Diff'
# merged into df earlier in the notebook.
x_numerical = ['Average_Score', 'Total_Number_of_Reviews_Reviewer_Has_Given', 'Close_Landmarks', 'Dist_Center', 
               'Dist_Train', 'Dist_Airport','food_Neg_Hotel','staff_Neg_Hotel', 'location_Neg_Hotel', 'value_Neg_Hotel',
               'comfort_Neg_Hotel', 'room_Neg_Hotel', 'facilities_Neg_Hotel','cleanliness_Neg_Hotel', 
               'food_Pos_Hotel', 'staff_Pos_Hotel','location_Pos_Hotel', 'value_Pos_Hotel', 'comfort_Pos_Hotel',
               'room_Pos_Hotel', 'facilities_Pos_Hotel', 'cleanliness_Pos_Hotel','Price','Reservation_ADR',
               'count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
# All predictor columns, categorical first.
x_col = x_categorical + x_numerical
# Target column.
y_col = 'Category'

In [12]:
# Standardise every numerical feature: subtract the column mean and divide by
# the population standard deviation (ddof=0).
# NOTE(review): mean/std are computed on the whole sample, i.e. before the
# train/test split performed later -- confirm this is intended.
X_numerical = df_model[x_numerical]
X_numerical_std = X_numerical.apply(lambda col: (col - col.mean()) / col.std(ddof=0))

Create binary (dummy) variables from the categorical variables

In [13]:
# Cast the month to string so it is dummy-encoded: by default get_dummies only
# encodes object/category columns and passes numeric ones through unchanged.
df_model['Review_Month'] = df_model['Review_Month'].astype(str)
# Label missing values BEFORE encoding. get_dummies ignores NaN, so filling
# afterwards cannot create a 'Not Available' level; such rows would silently
# look like the reference level removed by drop_first.
X_categorical = pd.get_dummies(
    df_model[x_categorical].fillna('Not Available'),
    prefix_sep='_',
    drop_first=True,
)

Merge numerical and categorical variables

In [14]:
# Target vector and design matrix (standardised numerical columns first,
# dummy columns after), aligned on the index of df_model.
y = df_model[y_col]
X = pd.concat([X_numerical_std, X_categorical], axis='columns', sort=False)

Split into Train and Test

In [15]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# same class ratio, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)
X_test.shape, y_test.shape, X_train.shape, y_train.shape

((2000, 79), (2000,), (8000, 79), (8000,))

### Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score, precision_score, recall_score, make_scorer

In [17]:
# Baseline random forest with default hyper-parameters. The seed is fixed so
# the fitted model and its scores are reproducible, like the sampling and the
# train/test split elsewhere in this notebook.
clf = RandomForestClassifier(random_state=1)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

In [18]:
# scikit-learn metrics take (y_true, y_pred). Accuracy, kappa and F1 give the
# same value in either order, but precision and recall do not, so every call
# uses the documented order. 'Bad' is treated as the positive class.
print('Accuracy: ', accuracy_score(y_test, pred))
print('Kappa:    ', cohen_kappa_score(y_test, pred))
print('F1-Score: ', f1_score(y_test, pred, pos_label='Bad'))
print('Precision:', precision_score(y_test, pred, pos_label='Bad'))
print('Recall:   ', recall_score(y_test, pred, pos_label='Bad'))

Accuracy:  0.617
Kappa:     0.23399999999999999
F1-Score:  0.6162324649298598
Precision: 0.615


In [19]:
# Confusion table: rows are the predicted labels, columns the true labels.
pd.crosstab(pred, y_test)

Category,Bad,Good
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Bad,615,381
Good,385,619
