In [167]:
import pandas as pd
import numpy as np

In [168]:
df = pd.read_csv('./data/df_features.gz')

In [169]:
df.columns

Index(['Hotel_Address', 'Additional_Number_of_Scoring', 'Review_Date',
       'Average_Score', 'Hotel_Name', 'Reviewer_Nationality',
       'Review_Total_Negative_Word_Counts', 'Total_Number_of_Reviews',
       'Review_Total_Positive_Word_Counts',
       'Total_Number_of_Reviews_Reviewer_Has_Given', 'Reviewer_Score',
       'days_since_review', 'lat', 'lng', 'Diff', 'Diff_Percentage',
       'Review_Month', 'Review_Year', 'Country', 'City', 'Pet', 'Purpose',
       'Whom', 'Room', 'Length', 'Device', 'Room_Recode', 'Nationality_Recode',
       'Length_Recode', 'Close_Landmarks', 'Dist_Center', 'Dist_Airport',
       'Dist_Train', 'Price', 'Stars', 'Length_N', 'Reservation_ADR',
       'food_Neg', 'staff_Neg', 'location_Neg', 'value_Neg', 'comfort_Neg',
       'room_Neg', 'facilities_Neg', 'cleanliness_Neg', 'food_Pos',
       'staff_Pos', 'location_Pos', 'value_Pos', 'comfort_Pos', 'room_Pos',
       'facilities_Pos', 'cleanliness_Pos', 'food_Neg_Hotel',
       'staff_Neg_Hotel', 'loca

### Hotel Prices

Check if there are hotels without price and the total amount of these

In [317]:
df_price = df[['Hotel_Address','Hotel_Name','Reservation_ADR','Price']].groupby(['Hotel_Address','Hotel_Name']).min()

In [318]:
df_price[df_price.Price.isnull() == True].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Reservation_ADR,Price
Hotel_Address,Hotel_Name,Unnamed: 2_level_1,Unnamed: 3_level_1
1 Place Vend me 1st arr 75001 Paris France,H tel De Vend me,,
10 rue Saint Hyacinthe 1st arr 75001 Paris France,Hotel des Tuileries Relais du Silence,,
108 Baker Street Westminster Borough London W1U 6LJ United Kingdom,Park Plaza Sherlock Holmes London,,
12 Osborn Street Aldgate East Tower Hamlets London E1 6TE United Kingdom,Arbor City,,
12 rue Saint Roch 1st arr 75001 Paris France,Hotel Louvre Montana,,


In [319]:
df_price[df_price.Price.isnull() == True].shape

(45, 2)

Extract and classify the useful features. Later on I create a DataFrame with these useful features

In [320]:
X_feature = ['Hotel_Address','Average_Score', 'Hotel_Name', 'City', 'Close_Landmarks', 'Dist_Center', 
              'Dist_Airport','Dist_Train', 'Stars', 'food_Neg_Hotel', 'staff_Neg_Hotel', 'location_Neg_Hotel', 
              'value_Neg_Hotel','comfort_Neg_Hotel', 'room_Neg_Hotel', 'facilities_Neg_Hotel', 'cleanliness_Neg_Hotel',
              'food_Pos_Hotel', 'staff_Pos_Hotel','location_Pos_Hotel', 'value_Pos_Hotel', 'comfort_Pos_Hotel',
              'room_Pos_Hotel', 'facilities_Pos_Hotel', 'cleanliness_Pos_Hotel']
y_feature = ['Price']
features = X_feature + y_feature

In [321]:
df_hotels = df[features].groupby(['Hotel_Address','Hotel_Name']).min().reset_index()

(1494, 26)

I create the Training set for modeling. During the process I must create binary variables for the categorical ones

In [322]:
df_hotels_full = df_hotels.dropna(subset=['Price'])
df_hotels_full.shape

(1449, 26)

In [323]:
X_num = df_hotels_full[['Average_Score', 'Close_Landmarks', 'Dist_Center', 'Dist_Airport','Dist_Train',
                        'food_Neg_Hotel', 'staff_Neg_Hotel', 'location_Neg_Hotel', 'value_Neg_Hotel',
                        'comfort_Neg_Hotel', 'room_Neg_Hotel', 'facilities_Neg_Hotel', 'cleanliness_Neg_Hotel',
                        'food_Pos_Hotel', 'staff_Pos_Hotel','location_Pos_Hotel', 'value_Pos_Hotel', 
                        'comfort_Pos_Hotel','room_Pos_Hotel', 'facilities_Pos_Hotel', 'cleanliness_Pos_Hotel']]
X_fct = pd.get_dummies(df_hotels_full[['City','Stars']], prefix_sep='_', drop_first=False)


In [324]:
X = pd.concat([X_num, X_fct], axis=1, sort=False)
y = df_hotels_full['Price']

In [325]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

### MODELS

In [347]:
# Function to find the optimal hyperparameters through Bayesian Optimization
def bayesian(space, X, y, modelo, nevals):
    
    def objective(space):        
        global best_score
        model = modelo(**space)   
        cv =  KFold(n_splits = 5, random_state = 1, shuffle = True)
        score = -cross_val_score(model, X, y, cv = cv, verbose = False).mean()
        if (score < best_score):
            best_score = score
        return score

    start = time.time()
    rstate = np.random.RandomState(1)
    best = fmin(
      objective, 
      space = space,
      algo = tpe.suggest, 
      max_evals = nevals,
      trials = Trials()
      # rstate = rstate
    )

    print("Hyperopt search took %.2f seconds for 200 candidates" % ((time.time() - start)))
    print("Best score: %.4f " % (-best_score))
    print("Best space: ", space_eval(params, best))
    return(space_eval(params, best))

In [348]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import pearsonr, linregress
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KDTree
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold
import xgboost as xgb
from sklearn.svm import SVR

import time
from hyperopt import hp, fmin, tpe, rand, STATUS_OK, Trials, space_eval
from collections import Counter

### Random Forest

In [255]:
params = {'bootstrap':         hp.choice('bootstrap',[True, False]),
          'max_features':      hp.choice('max_features',['auto', 'sqrt']),
          'n_estimators':      hp.choice('n_estimators',[50, 100, 150, 200, 250])}

best_score = 1
rf_params = bayesian(params, X_train, y_train, RandomForestRegressor, 10)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [04:22<00:00, 26.21s/trial, best loss: -0.7006081361402992]
Hyperopt search took 262.13 seconds for 200 candidates
Best score: 0.7006 
Best space:  {'bootstrap': False, 'max_features': 'sqrt', 'n_estimators': 250}


### Gradient Boosted Trees

In [249]:
params = {'learning_rate':     hp.choice('learning_rate',[0.0001, 0.00025, 0.0005, 0.00075, 0.001, 0.0025, 0.005, 
                                                          0.0075, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.50]), 
          'n_estimators':      hp.choice('n_estimators', range(1,400)),
          'max_depth':         hp.choice('max_depth',range(1,20)),
          'min_samples_split': hp.choice('min_samples_split',np.linspace(0.01, 1.0, 10, endpoint=True)),
          'min_samples_leaf':  hp.choice('min_samples_leaf',np.linspace(0.01, 0.5, 50, endpoint=True)), 
          'subsample':         hp.choice('subsample',[1]), 
          'max_features':      hp.choice('max_features',['sqrt'])}

best_score = 1
gbt_params = bayesian(params, X_train, y_train, GradientBoostingRegressor, 25)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:40<00:00,  1.63s/trial, best loss: -0.6634335411676455]
Hyperopt search took 40.90 seconds for 200 candidates
Best score: 0.6634 
Best space:  {'learning_rate': 0.05, 'max_depth': 9, 'max_features': 'sqrt', 'min_samples_leaf': 0.03, 'min_samples_split': 0.23, 'n_estimators': 391, 'subsample': 1}


### XGB

In [258]:
params = {'learning_rate':    hp.choice('learning_rate',[0.0001, 0.00025, 0.0005, 0.00075, 0.001, 0.0025, 0.005, 0.0075, 
                                                         0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75]), 
          'max_depth':        hp.choice('max_depth',range(1,20)),
          'min_child_weight': hp.choice('min_child_weight',np.linspace(0.01, 1.0, 100, endpoint=True)),
          'gamma':            hp.choice('gamma',np.linspace(0.01, 1.0, 100, endpoint=True)), 
          'colsample_bytree': hp.choice('colsample_bytree',np.linspace(0.0, 1, 101, endpoint=True)), 
          'n_estimators':     hp.choice('n_estimators', range(1,200))}

best_score = 1
xgb_params = bayesian(params, X_train, y_train, xgb.XGBRegressor, 25)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [02:02<00:00,  4.91s/trial, best loss: -0.6946310156969048]
Hyperopt search took 123.13 seconds for 200 candidates
Best score: 0.6946 
Best space:  {'colsample_bytree': 0.9, 'gamma': 0.73, 'learning_rate': 0.075, 'max_depth': 3, 'min_child_weight': 0.99, 'n_estimators': 135}


### Final Model

My final model is the Random Forest

In [301]:
clf = RandomForestRegressor(**rf_params)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print('R2:  ', round(pearsonr(pred, y_test)[0]**2,4))
print('RMSE:', round(np.sqrt(np.mean((pred-y_test)**2)),2))

R2:   0.7073
RMSE: 68.52


Prediction of NA. I must create and format the dataframe with NA and the same features than the training set.

In [431]:
df_hotels_na = df_hotels[df_hotels.Price.isnull() == True]

In [432]:
X_num = df_hotels_na[['Average_Score', 'Close_Landmarks', 'Dist_Center', 'Dist_Airport','Dist_Train',
                        'food_Neg_Hotel', 'staff_Neg_Hotel', 'location_Neg_Hotel', 'value_Neg_Hotel',
                        'comfort_Neg_Hotel', 'room_Neg_Hotel', 'facilities_Neg_Hotel', 'cleanliness_Neg_Hotel',
                        'food_Pos_Hotel', 'staff_Pos_Hotel','location_Pos_Hotel', 'value_Pos_Hotel', 
                        'comfort_Pos_Hotel','room_Pos_Hotel', 'facilities_Pos_Hotel', 'cleanliness_Pos_Hotel']]
X_fct = pd.get_dummies(df_hotels_na[['City','Stars']], prefix_sep='_', drop_first=False)
X_na = pd.concat([X_num, X_fct], axis=1, sort=False)

In [433]:
np.setdiff1d(X.columns,X_na.columns)

array(['City_Amsterdam', 'Stars_Pension', 'Stars_hotel de 3 estrellas'],
      dtype=object)

In [434]:
X_na['City_Amsterdam'] = 0
X_na['Stars_Pension'] = 0
X_na['Stars_hotel de 3 estrellas'] = 0

In [442]:
df_hotels_na.Price = clf.predict(X_na)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


### Add Predictions to the Original DataFrame

In [444]:
df_add = df_hotels_na[['Hotel_Address','Price']]

In [445]:
df_na = df[df.Price.isna() == True]
df_full = df[df.Price.isna() == False]
df_na.shape, df_full.shape

((15898, 69), (499840, 69))

In [446]:
df_na = df_na.drop(['Price'], axis=1)
df_na_full = pd.merge(df_na, df_add[['Hotel_Address','Price']], on='Hotel_Address')
df_na_full = df_na_full[df_full.columns]

Also fix reservations without Length_N

In [447]:
df.loc[df['Length_N'].isna() == True, 'Length_N'] = np.mean(df.Length_N)

In [448]:
df_new = pd.concat([df_full, df_na_full])
df_new['Reservation_ADR'] = df_new.Length_N * df_new.Price

#### Save Results

In [452]:
df_new.to_csv("./data/df_features.gz", index_label=False, compression="gzip")