In [13]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import numpy as np

In [11]:
pickle_path = "final_data.pck"

df = pd.read_pickle(pickle_path)

df.dropna()

categorical_features = ['host_response_time', 'neighbourhood_cleansed', 'room_type', 'season']
df = pd.get_dummies(data, columns=categorical_features)
X_train, X_test, y_train, y_test= train_test_split(
    df.drop(['price'], axis = 1), #explanatory
    df[['price']], #response
    test_size=0.2, #hold out size
    random_state=42
    )

In [16]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

param_grid = {'learning_rate': [0.01,0.1,0.5], #alias eta, Step size shrinkage used in update to prevents overfitting.  
    'n_estimators': [10, 20, 50, 100],
    'subsample': [0.5, 0.8, 1], #Subsample ratio of the training instances
    'max_depth': [3, 5, 10],
    'colsample_bytree': [0.5, 1] #colsample_bytree is the subsample ratio of columns when constructing each tree. Subsampling occurs once for every tree constructed.
    }

#instantiate XGBRegressor 
gbm = xgb.XGBRegressor(seed=42, objective='reg:squarederror')
grid_mse = GridSearchCV(estimator=gbm,
                        param_grid=param_grid,
                        scoring='neg_mean_squared_error', 
                        cv=3, 
                        verbose=1, 
                        n_jobs=-1)
#fit  GridSearchCV 
grid_mse.fit(X_train, y_train)

print("Best parameters found: ",grid_mse.best_params_) #best_params_
print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_))) #best_score_

#extract the estimator best_estimator_ 
gbm_ins = grid_mse.best_estimator_ #best_estimator_

# Predict the test set labels 'y_pred'
y_pred_xgb = gbm_ins.predict(X_test)

# Evaluate the test set RMSE
rmse_test = mean_squared_error(y_test, y_pred_xgb, squared=False)
print(rmse_test)

r2_xgb = r2_score(y_test, y_pred_xgb)
print("R2: {:.2f}".format(r2_xgb))

Fitting 3 folds for each of 216 candidates, totalling 648 fits
Best parameters found:  {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 10, 'subsample': 1}
Lowest RMSE found:  39762.42258549929
36483.1496361904
R2: 0.06




In [18]:

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor


hyper_grid = {'n_estimators': [100, 200, 300, 500, 1000, 2000,3000,5000],
               'max_features': [2,4,8,9,12],
               'min_samples_split': [5,10, 20,30, 40]}

#reinstantiate RandomForestRegressor regressor with empty parameter set
forest_model_cv = RandomForestRegressor()

# Instantiate the GridSearchCV with forest_model_cv  as estimator
forest = GridSearchCV(estimator = forest_model_cv, param_grid = hyper_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

forest.fit(X_train, y_train.values.ravel()) #values.ravel() flattened array expected by RandomForestRegressor


Fitting 3 folds for each of 200 candidates, totalling 600 fits
30623.927043752195
R2: 0.34




In [22]:
print(forest.best_params_)

forest_model_opt= forest.best_estimator_
y_pred_forest = forest_model_opt.predict(X_test)

rmse_forest = mean_squared_error(y_test, y_pred_forest, squared=False)
print(rmse_forest)

r2_forest = r2_score(y_test, y_pred_forest)
print("R2: {:.2f}".format(r2_forest))

{'max_features': 9, 'min_samples_split': 5, 'n_estimators': 200}
30623.927043752195
R2: 0.34


