# ScikitLearn Grid Search and Hyperopt

## Data Preprocessing

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('data/AB_NYC_2019.csv')

In [18]:
df.head(5)

Unnamed: 0,neighbourhood,room_type,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,price
0,108,1,1,9,0.21,6,365,149
1,127,0,1,45,0.38,2,355,225
2,94,1,3,0,0.0,1,365,150
3,41,0,1,270,4.64,1,194,89
4,61,0,10,9,0.1,1,0,80


In [6]:
selected_cols = ['neighbourhood', 'room_type', 'minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365', 'price']

In [7]:
df = df[selected_cols]

In [13]:
df['reviews_per_month'] = df['reviews_per_month'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['reviews_per_month'] = df['reviews_per_month'].fillna(0)


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 8 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   neighbourhood                   48895 non-null  int32  
 1   room_type                       48895 non-null  int32  
 2   minimum_nights                  48895 non-null  int64  
 3   number_of_reviews               48895 non-null  int64  
 4   reviews_per_month               48895 non-null  float64
 5   calculated_host_listings_count  48895 non-null  int64  
 6   availability_365                48895 non-null  int64  
 7   price                           48895 non-null  int64  
dtypes: float64(1), int32(2), int64(5)
memory usage: 2.6 MB


In [15]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [17]:
for i in ['neighbourhood','room_type']:
    df[i] = le.fit_transform(df[i])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[i] = le.fit_transform(df[i])


In [23]:
y = df['price'].values
X = df.drop(columns=['price']).values

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## ScikitLearn Grid Search

In [60]:
from hyperopt import hp

search_space = {
    "max_depth": hp.quniform("max_depth", 2, 5, 1),
    "n_estimators": hp.quniform("num_trees", 10, 100, 1)
}

In [None]:
search_spece = {'n_estimators':[10,20], 'max_depth':[2, 5]}

In [81]:
estimators_array = (np.linspace(uniform.ppf(0.1), uniform.ppf(0.2), 2) * 10).astype(int)
depth_array = (np.linspace(uniform.ppf(0.2), uniform.ppf(0.5), 4) * 10).astype(int)
search_space = {'n_estimators':estimators_array, 'max_depth':depth_array}

In [82]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

rf = RandomForestRegressor()

cv = GridSearchCV(rf, search_space,cv=3)

cv.fit(X_train, y_train)


In [83]:
cv.best_params_

{'max_depth': 4, 'n_estimators': 2}

In [84]:
cv.predict(X_test)

array([210.21359561,  93.53439862,  90.11877726, ...,  90.11877726,
       234.20844299,  92.57755045])

### What kind of uniform not working??

In [None]:
np.random.uniform

In [33]:
from scipy.stats import uniform
uniform.pdf([1, 2])

array([1., 0.])

In [41]:
import random
random.uniform(4, 9)

## HyperOpt with Sklearn

In [88]:
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score

from hyperopt import fmin, tpe, hp, SparkTrials, STATUS_OK, Trialsma
import mlflow

In [89]:
def objective(params):
    reg = RandomForestRegressor(**params)
    r2_score = cross_val_score(reg, X_train, y_train, cv=3).mean()
    return -r2_score

In [100]:
from hyperopt import hp

# search_space = {
#     "max_depth": hp.quniform("max_depth", 2, 5, 1),
#     "n_estimators": hp.quniform("n_estimators", 10, 100, 1)
# }

search_space = {
                 "max_depth": hp.choice('max_depth', np.arange(2, 5, dtype=int)),
                 "n_estimators":hp.choice('n_estimators', np.arange(10, 100, dtype=int))
                }



In [101]:
algo = tpe.suggest
trials = Trials()

In [102]:
with mlflow.start_run():
    best_result = fmin(
                        fn=objective, 
                        space=search_space,
                        algo=algo,
                        max_evals=32,
                        trials=trials)

100%|██| 32/32 [01:46<00:00,  3.34s/trial, best loss: -0.1222781322342924]


In [108]:
import hyperopt
print(hyperopt.space_eval(search_space, best_result))

{'max_depth': 4, 'n_estimators': 84}


In [109]:
best_result

{'max_depth': 2, 'n_estimators': 74}

In [112]:
from sklearn.metrics import r2_score

with mlflow.start_run():
    best_max_depth = best_result["max_depth"]
    best_n_estimators = best_result["n_estimators"]
    estimator = RandomForestRegressor(max_depth=best_max_depth,n_estimators=best_n_estimators )
    estimator.fit(X_train,y_train)
    y_pred = estimator.predict(X_test)
    r2 = r2_score(y_test, y_pred)

    # Log param and metrics for the final model
    mlflow.log_param("maxDepth", best_max_depth)
    mlflow.log_param("numTrees", best_n_estimators)
    mlflow.log_metric("r2", 2)
    mlflow.sklearn.log_model(estimator, "sk_rf")



In [113]:
r2

0.09979542973038291