### Open File

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from matplotlib import pyplot as plt

In [2]:
df = pd.read_csv('./data/df_features.gz')

I delete the rows without a distance to city center (because in the origin hotel don't have latitude and longitude)

In [3]:
df = df.dropna(subset=['Dist_Center'])
# df = df.dropna(subset=['Reservation_ADR'])

### Create Predicted Category for final models (2 categories)

In [4]:
diff_hotels = df[['Hotel_Address','Diff']].groupby('Hotel_Address').describe()
diff_hotels = diff_hotels.Diff.reset_index()

In [5]:
df = pd.merge(df, diff_hotels, on='Hotel_Address')

In [6]:
category = np.array(['Bad' if i < 7 else 'Good' for i in df.Reviewer_Score])
df.loc[:, 'Category'] = category
df.Category.value_counts() / len(df) 

Good    0.831582
Bad     0.168418
Name: Category, dtype: float64

In [7]:
df.columns

Index(['Hotel_Address', 'Additional_Number_of_Scoring', 'Review_Date',
       'Average_Score', 'Hotel_Name', 'Reviewer_Nationality',
       'Review_Total_Negative_Word_Counts', 'Total_Number_of_Reviews',
       'Review_Total_Positive_Word_Counts',
       'Total_Number_of_Reviews_Reviewer_Has_Given', 'Reviewer_Score',
       'days_since_review', 'lat', 'lng', 'Diff', 'Diff_Percentage',
       'Review_Month', 'Review_Year', 'Country', 'City', 'Pet', 'Purpose',
       'Whom', 'Room', 'Length', 'Device', 'Room_Recode', 'Nationality_Recode',
       'Length_Recode', 'Close_Landmarks', 'Dist_Center', 'Dist_Airport',
       'Dist_Train', 'Price', 'Stars', 'Length_N', 'Reservation_ADR',
       'food_Neg', 'staff_Neg', 'location_Neg', 'value_Neg', 'comfort_Neg',
       'room_Neg', 'facilities_Neg', 'cleanliness_Neg', 'food_Pos',
       'staff_Pos', 'location_Pos', 'value_Pos', 'comfort_Pos', 'room_Pos',
       'facilities_Pos', 'cleanliness_Pos', 'food_Neg_Hotel',
       'staff_Neg_Hotel', 'loca

### Select Hotel

In [19]:
df.Hotel_Name.value_counts().head()

Britannia International Hotel Canary Wharf           4789
Strand Palace Hotel                                  4256
Park Plaza Westminster Bridge London                 4169
Copthorne Tara Hotel London Kensington               3578
DoubleTree by Hilton Hotel London Tower of London    3212
Name: Hotel_Name, dtype: int64

In [146]:
df_Hotel = df[df.Hotel_Name=='Britannia International Hotel Canary Wharf']

In [147]:
df_Hotel.Reviewer_Score.describe()

count    4789.000000
mean        6.826644
std         2.025480
min         2.500000
25%         5.400000
50%         7.100000
75%         8.300000
max        10.000000
Name: Reviewer_Score, dtype: float64

In [148]:
df_Hotel.Category.value_counts()

Good    2512
Bad     2277
Name: Category, dtype: int64

In [161]:
df.describe()

Unnamed: 0,Additional_Number_of_Scoring,Average_Score,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,lat,lng,Diff,...,facilities_Pos_Hotel,cleanliness_Pos_Hotel,count,mean,std,min,25%,50%,75%,max
count,512470.0,512470.0,512470.0,512470.0,512470.0,512470.0,512470.0,512470.0,512470.0,512470.0,...,512470.0,512470.0,512470.0,512470.0,512470.0,512470.0,512470.0,512470.0,512470.0,512470.0
mean,500.118391,8.397934,18.541864,2747.504902,17.765052,7.152272,8.395594,49.442439,2.823803,-0.002339,...,0.226761,0.194419,914.838874,-0.002339,1.49724,-5.588449,-0.795955,0.350036,1.192276,1.602178
std,501.419262,0.549133,29.693695,2322.698454,21.789025,11.028943,1.63817,3.466325,4.579425,1.525948,...,0.072677,0.059396,893.183937,0.121667,0.280036,0.694523,0.416756,0.221606,0.251901,0.548241
min,1.0,5.2,0.0,43.0,0.0,1.0,2.5,41.328376,-0.369758,-7.0,...,0.041667,0.0,8.0,-1.347368,0.42771,-7.0,-3.0,-1.4,0.1,0.2
25%,169.0,8.1,2.0,1161.0,5.0,1.0,7.5,48.214662,-0.143372,-0.8,...,0.174028,0.152866,312.0,-0.066016,1.319523,-6.1,-1.0,0.2,1.0,1.2
50%,343.0,8.4,9.0,2134.0,11.0,3.0,8.8,51.499981,0.010607,0.4,...,0.210412,0.186047,651.0,-0.001136,1.512393,-5.7,-0.8,0.4,1.2,1.6
75%,666.0,8.8,23.0,3633.0,22.0,8.0,9.6,51.516288,4.834443,1.1,...,0.267637,0.231061,1154.0,0.077416,1.692351,-5.2,-0.5,0.5,1.4,1.9
max,2682.0,9.8,408.0,16670.0,395.0,355.0,10.0,52.400181,16.429233,3.6,...,0.69863,0.692308,4789.0,0.762162,2.75428,-0.9,0.4,1.6,2.3,3.6


### Balance Nationalities and / or Categories

In [149]:
def balance_df(df, Balance_Nationality, Balance_Category):
    df_balance_country = df.copy()
    if Balance_Nationality:
        df_balance_country = None
        for i in list(df.Nationality_Recode.value_counts().index):
            nationality = df[df.Nationality_Recode == i]
            n = 20000
            if len(nationality) < 20000:
                n = len(nationality)
            nationality = nationality.sample(n, random_state=1)
            df_balance_country = pd.concat([df_balance_country, nationality])
    
    df_balance_class = df_balance_country.copy()
    if Balance_Category:
        df_balance_class = None
        minclass = np.min(df_balance_country.Category.value_counts())
        classes = list(df_balance_country.Category.value_counts().index)
        for i in classes:
            selected_class = df_balance_country[df_balance_country.Category == i].sample(minclass, random_state=1)
            df_balance_class = pd.concat([df_balance_class, selected_class])
    
    return(df_balance_class)

In [150]:
df_model = balance_df(df_Hotel, Balance_Nationality=False, Balance_Category=False)

### Prepare Data to run Models

In [151]:
x_categorical = ['Review_Month', 'Pet', 'Purpose', 'Whom', 'Room_Recode', 'Nationality_Recode', 'Length_Recode',
                 'Stars']
x_numerical = ['Total_Number_of_Reviews_Reviewer_Has_Given','count', 'mean', 'std', 'min', '25%', 
               '50%', '75%', 'max']
x_col = x_categorical + x_numerical
y_col = 'Category'

In [152]:
X_numerical = df_model[x_numerical]
X_numerical_std = X_numerical.apply(lambda x: ((x-np.mean(x)) / np.std(x)))

In [153]:
X_categorical = pd.get_dummies(df_model[x_categorical], prefix_sep='_', drop_first=True)
X_categorical = X_categorical.fillna('Not Available')

In [154]:
X = pd.concat([X_numerical_std, X_categorical], axis=1, sort=False)
y = df_model[y_col]

Split into Train and Test

In [155]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)

In [156]:
X_test.shape, y_test.shape, X_train.shape, y_train.shape

((1198, 36), (1198,), (3591, 36), (3591,))

## MODELS

### Evaluate Model

In [157]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB

from sklearn.metrics import classification_report, accuracy_score, cohen_kappa_score, f1_score, precision_score, recall_score

In [158]:
def evaluate(model):
    clf = model
    clf.fit(X_train, y_train)
    train = clf.predict(X_train)
    pred = clf.predict(X_test)
    print('Test :', f'{accuracy_score(pred, y_test):.4f}', '|', f'{cohen_kappa_score(pred, y_test):.4f}')
    print('Train:', f'{accuracy_score(train, y_train):.4f}', '|', f'{cohen_kappa_score(train, y_train):.4f}')
    return(pred)

### KNN

In [159]:
pred_knn = evaluate(KNeighborsClassifier(n_neighbors=5))

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

### Gradient Boosted Trees

In [160]:
pred_gbt = evaluate(GradientBoostingClassifier(learning_rate=0.05, max_depth=5, random_state=1))

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

### Random Forest

In [130]:
pred_rf = evaluate(RandomForestClassifier(n_estimators = 100, max_depth = 5, random_state=1))

Test : 0.8667 | 0.0000
Train: 0.8765 | 0.0000


### XGBoosting

In [131]:
pred_xgb = evaluate(xgb.XGBClassifier(objective="binary:logistic", n_estimators = 7, max_depth=4, random_state=1))

Test : 0.8667 | 0.0000
Train: 0.8775 | 0.0135


### Logistic Regresion

In [132]:
pred_log = evaluate(LogisticRegression(solver='lbfgs', max_iter=500, random_state=1))

Test : 0.8677 | 0.0124
Train: 0.8762 | -0.0006


### Decision Trees

In [133]:
pred_tree = evaluate(DecisionTreeClassifier(max_depth=4, random_state=1))

Test : 0.8639 | 0.0047
Train: 0.8775 | 0.0396


### SVM

In [134]:
pred_svm = evaluate(SVC(C=0.5, random_state=1))

Test : 0.8667 | 0.0000
Train: 0.8765 | 0.0000


### Naive Bayes

In [135]:
pred_nb = evaluate(BernoulliNB())

Test : 0.8648 | 0.0267
Train: 0.8740 | 0.0321


## ENSEMBLE

In [76]:
modelos = [('Logistic', LogisticRegression(solver='lbfgs', max_iter=1500, random_state=1)), 
           ('Random Forest', RandomForestClassifier(n_estimators = 70, max_depth = 5, random_state=1)),
           ('XGB', xgb.XGBClassifier(objective="binary:logistic", n_estimators = 7, max_depth=4, random_state=1)),
           ('GBT', GradientBoostingClassifier(learning_rate=0.04, max_depth=3, random_state=1))
          ]

pred = pd.DataFrame(columns=['Logistic','Random Forest','XGB','GBT'])
prob = pd.DataFrame(columns=['Logistic','Random Forest','XGB','GBT'])

for i in modelos:
    clf = i[1]
    clf.fit(X_train, y_train)
    train_ens = clf.predict(X_train)
    pred_ens = clf.predict(X_test)
    prob_ens = clf.predict_proba(X_test)

    pred[i[0]] = pred_ens
    prob[i[0]] = prob_ens[:,0]

print('Done!')

Done!


In [77]:
prob_final = prob.apply(lambda x: np.mean(x), axis=1)
prob_final = ['Good' if i < 0.5 else 'Bad' for i in prob_final]
print('Probability :',  
      f'{accuracy_score(prob_final, y_test):.4f}', '|', 
      f'{cohen_kappa_score(prob_final, y_test):.4f}'
     )

Probability : 0.5735 | 0.1407
