In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import ExtraTreesRegressor

In [6]:
df9 = pd.read_csv("df_9.csv")
test_ids = df9.ID.iloc[7000:].values # save IDs for later output
df9.drop(["Unnamed: 0", "ID", 'price', 'host_has_profile_pic_t','host_identity_verified_t'], axis=1, inplace=True)
y_train = df9['log_price'].iloc[:7000].values
#y_test = np.zeros(3000)
X_train = df9.drop(['log_price'], axis=1).iloc[:7000].values
X_test = df9.drop(['log_price'], axis=1).iloc[7000:].values

In [7]:
pipe_rf = make_pipeline(StandardScaler(), ExtraTreesRegressor(random_state=42))

pipe_rf.fit(X_train, y_train)

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = -cross_val_score(pipe_rf, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

pipe_rf.fit(X_train, y_train)


y_pred = pipe_rf.predict(X_test)
y_pred_dollar = np.exp(y_pred)

CV Root Mean Squared Error: 0.40287094718079813


In [13]:
# save predictions to csv
out = pd.DataFrame({"ID":test_ids, "price":y_pred_dollar})
out.to_csv("ExtraTreesPredictions.csv", index=False, header=True)

In [8]:
xtra = ExtraTreesRegressor(random_state = 42)
from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(xtra.get_params())

Parameters currently in use:

{'bootstrap': False,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [11]:
from sklearn.model_selection import RandomizedSearchCV
# first let's do randomized grid search to narrow down parameter ranges
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1500, num = 100)]
# Number of features to consider at every split
max_features = [1.0, 'sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(20, 90, num = 10)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [int(x) for x in np.linspace(2, 20, num = 10)]
# Minimum number of samples required at each leaf node
min_samples_leaf = [int(x) for x in np.linspace(1, 20, num = 10)]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# sample size per bootstrap sample
max_samples = [x for x in np.linspace(0.1, 1, num = 10)]
# Create the random grid
random_grid = {'extratreesregressor__n_estimators': n_estimators,
               'extratreesregressor__max_features': max_features,
               'extratreesregressor__max_depth': max_depth,
               'extratreesregressor__min_samples_split': min_samples_split,
               'extratreesregressor__min_samples_leaf': min_samples_leaf,
               'extratreesregressor__bootstrap': bootstrap,
               'extratreesregressor__max_samples': max_samples}
pprint(random_grid)

{'extratreesregressor__bootstrap': [True, False],
 'extratreesregressor__max_depth': [20,
                                    27,
                                    35,
                                    43,
                                    51,
                                    58,
                                    66,
                                    74,
                                    82,
                                    90,
                                    None],
 'extratreesregressor__max_features': [1.0, 'sqrt', 'log2'],
 'extratreesregressor__max_samples': [0.1,
                                      0.2,
                                      0.30000000000000004,
                                      0.4,
                                      0.5,
                                      0.6,
                                      0.7000000000000001,
                                      0.8,
                                      0.9,
                            

In [12]:
pipe = make_pipeline(StandardScaler(), ExtraTreesRegressor(random_state=42))

gs_random = RandomizedSearchCV(estimator = pipe, 
                               param_distributions = random_grid, 
                               n_iter = 50, 
                               cv = 10,  
                               n_jobs = -1,
                               scoring = 'neg_mean_squared_error')
# Fit the random search model
gs_random.fit(X_train, y_train)
print(gs_random.best_estimator_)
print(gs_random.best_params_)
print(gs_random.best_score_)

260 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
260 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/ensemble/_forest.py", line 397, in fit
    raise ValueError(
ValueError: `max_sample` cannot be set if `bootstrap=False`. Either switch to `bootstrap=True` or set `max_sample=None`.

         nan -0.23253353     

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('extratreesregressor',
                 ExtraTreesRegressor(bootstrap=True, max_depth=35,
                                     max_samples=1.0, min_samples_split=10,
                                     n_estimators=1019, random_state=42))])
{'extratreesregressor__n_estimators': 1019, 'extratreesregressor__min_samples_split': 10, 'extratreesregressor__min_samples_leaf': 1, 'extratreesregressor__max_samples': 1.0, 'extratreesregressor__max_features': 1.0, 'extratreesregressor__max_depth': 35, 'extratreesregressor__bootstrap': True}
-0.17057789555326675


In [14]:
# narrow down optimal parameter values
from sklearn.model_selection import GridSearchCV
pipe = make_pipeline(StandardScaler(), ExtraTreesRegressor(random_state=42))
# Create the parameter grid based on the results of random search 
param_grid = {
    'extratreesregressor__bootstrap': [True],
    'extratreesregressor__max_depth': [30, 35, 40],
    'extratreesregressor__max_features': [1.0],
    'extratreesregressor__max_samples': [0.90, 1.0],
    'extratreesregressor__min_samples_leaf': [1, 2],
    'extratreesregressor__min_samples_split': [8, 10, 12],
    'extratreesregressor__n_estimators': [950, 1000, 1050]
}

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = pipe, 
                           param_grid = param_grid, 
                           cv = 10, 
                           n_jobs = -1,
                           scoring = 'neg_mean_squared_error')

# Fit the random search model
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print(best_model)
print(best_params)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('extratreesregressor',
                 ExtraTreesRegressor(bootstrap=True, max_depth=30,
                                     max_samples=1.0, min_samples_leaf=2,
                                     min_samples_split=8, n_estimators=1050,
                                     random_state=42))])
{'extratreesregressor__bootstrap': True, 'extratreesregressor__max_depth': 30, 'extratreesregressor__max_features': 1.0, 'extratreesregressor__max_samples': 1.0, 'extratreesregressor__min_samples_leaf': 2, 'extratreesregressor__min_samples_split': 8, 'extratreesregressor__n_estimators': 1050}


In [15]:
# final model after hyperparameter tuning
pipe = make_pipeline(StandardScaler(), 
                     ExtraTreesRegressor(random_state=42,
                                         bootstrap=True,
                                         max_depth=30,
                                         max_features=1.0,
                                         max_samples=1.0,
                                         min_samples_leaf=2,
                                         min_samples_split=8,
                                         n_estimators=1050))

pipe.fit(X_train, y_train)

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = -cross_val_score(pipe, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

pipe.fit(X_train, y_train)


y_pred = pipe.predict(X_test)
y_pred_dollar = np.exp(y_pred)

CV Root Mean Squared Error: 0.40271852340085335


In [16]:
# save predictions to csv
out = pd.DataFrame({"ID":test_ids, "price":y_pred_dollar})
out.to_csv("ExtraTreesPredictions.csv", index=False, header=True)