## 1. Data Loader

In [1]:
from sklearn.datasets import fetch_california_housing
# List the fields exposed by the object the loader returns.
fetch_california_housing().keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [2]:
import pandas as pd

# Fetch the dataset once and reuse the returned object; every attribute
# needed below (data, target, feature_names, target_names) lives on it.
housing = fetch_california_housing()

X = pd.DataFrame(housing.data, columns=housing.feature_names)
y = pd.DataFrame(housing.target, columns=housing.target_names)

# Both frames are built without an explicit index, so they share the same
# default row index and an index-on-index merge pairs each target value
# with its own feature row.
df = pd.merge(X, y, left_index=True, right_index=True)
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


## 2. Transformer

In [3]:
# Replace each raw coordinate with one of 5 equal-width interval bins.
# NOTE: this overwrites the numeric columns, so the cell is meant to be run
# once per freshly built df.
for coord in ('Longitude', 'Latitude'):
    df[coord] = pd.cut(df[coord], bins=5)

In [4]:
# Check dtypes after binning: Latitude and Longitude should now be categorical.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   MedInc       20640 non-null  float64 
 1   HouseAge     20640 non-null  float64 
 2   AveRooms     20640 non-null  float64 
 3   AveBedrms    20640 non-null  float64 
 4   Population   20640 non-null  float64 
 5   AveOccup     20640 non-null  float64 
 6   Latitude     20640 non-null  category
 7   Longitude    20640 non-null  category
 8   MedHouseVal  20640 non-null  float64 
dtypes: category(2), float64(7)
memory usage: 1.1 MB


In [5]:
# One-hot encode the binned Latitude/Longitude categories into indicator
# columns; the float columns are passed through unchanged.
df = pd.get_dummies(df)

## 3. Hyperparameter tuning

In [6]:
from scipy.stats import randint
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, train_test_split

# Split the encoded frame into predictors and the regression target.
X = df.drop(columns=['MedHouseVal'])
y = df['MedHouseVal']

# Hold out a test split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Search space sampled by the randomized search. scipy's randint excludes the
# upper bound, so n_estimators is drawn from 1-4 and max_depth from 5-9.
param_distributions = dict(
    n_estimators=randint(1, 5),
    max_depth=randint(5, 10),
)

# Try 10 sampled candidates, each evaluated with cross-validation on the
# training split only.
search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=0),
    param_distributions=param_distributions,
    n_iter=10,
    random_state=0,
)

search.fit(X_train, y_train)

In [7]:
# Raw per-candidate results (timings, sampled parameters, scores) recorded by the search.
search.cv_results_

{'mean_fit_time': array([0.29362173, 0.20505176, 0.57494936, 0.34941449, 0.14895058,
        0.12458014, 0.62974052, 0.30517097, 0.2623404 , 0.21892495]),
 'std_fit_time': array([0.02115524, 0.00997581, 0.13227964, 0.09970736, 0.05370225,
        0.02246851, 0.08566571, 0.10370019, 0.06963952, 0.06942945]),
 'mean_score_time': array([0.00758095, 0.00609336, 0.01570578, 0.01259327, 0.01095309,
        0.0156343 , 0.01477146, 0.01365786, 0.00700235, 0.00664291]),
 'std_score_time': array([0.00124596, 0.0001633 , 0.0077699 , 0.01183911, 0.00767954,
        0.00526064, 0.00543062, 0.01446993, 0.00136772, 0.00246022]),
 'param_max_depth': masked_array(data=[9, 5, 8, 6, 7, 5, 9, 6, 5, 6],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[4, 4, 4, 4, 1, 1, 3, 3, 2, 2],
              mask=[False, False, False, False, False, False, False, Fal

In [8]:
# List the result fields available once cv_results_ is viewed as a DataFrame.
pd.DataFrame(search.cv_results_).columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_max_depth', 'param_n_estimators', 'params', 'split0_test_score',
       'split1_test_score', 'split2_test_score', 'split3_test_score',
       'split4_test_score', 'mean_test_score', 'std_test_score',
       'rank_test_score'],
      dtype='object')

In [9]:
# Keep only the sampled hyperparameters and their mean CV score,
# ordered so the best-ranked candidate comes first.
(
    pd.DataFrame(search.cv_results_)
    .loc[:, ['param_max_depth', 'param_n_estimators', 'params',
             'mean_test_score', 'rank_test_score']]
    .sort_values('rank_test_score')
)

Unnamed: 0,param_max_depth,param_n_estimators,params,mean_test_score,rank_test_score
0,9,4,"{'max_depth': 9, 'n_estimators': 4}",0.694124,1
2,8,4,"{'max_depth': 8, 'n_estimators': 4}",0.693023,2
6,9,3,"{'max_depth': 9, 'n_estimators': 3}",0.687564,3
3,6,4,"{'max_depth': 6, 'n_estimators': 4}",0.669034,4
7,6,3,"{'max_depth': 6, 'n_estimators': 3}",0.662331,5
9,6,2,"{'max_depth': 6, 'n_estimators': 2}",0.652634,6
1,5,4,"{'max_depth': 5, 'n_estimators': 4}",0.644482,7
4,7,1,"{'max_depth': 7, 'n_estimators': 1}",0.633653,8
8,5,2,"{'max_depth': 5, 'n_estimators': 2}",0.627094,9
5,5,1,"{'max_depth': 5, 'n_estimators': 1}",0.608806,10


In [10]:
# Hyperparameter combination of the top-ranked candidate from the search.
search.best_params_

{'max_depth': 9, 'n_estimators': 4}

In [11]:
# After fitting, the search object acts like a normal random forest estimator
# configured with the best parameters found (max_depth=9, n_estimators=4).
# No custom scoring was passed to the search, so score() reports the
# regressor's default metric (R^2), here on the held-out test split.
search.score(X_test, y_test)

0.6917465952796706

In [12]:
# Display the fitted RandomizedSearchCV object.
search

In [13]:
# Show the frame after binning and one-hot encoding of Latitude/Longitude.
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,MedHouseVal,"Latitude_(32.531, 34.422]","Latitude_(34.422, 36.304]","Latitude_(36.304, 38.186]","Latitude_(38.186, 40.068]","Latitude_(40.068, 41.95]","Longitude_(-124.36, -122.342]","Longitude_(-122.342, -120.334]","Longitude_(-120.334, -118.326]","Longitude_(-118.326, -116.318]","Longitude_(-116.318, -114.31]"
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,4.526,0,0,1,0,0,0,1,0,0,0
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,3.585,0,0,1,0,0,0,1,0,0,0
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,3.521,0,0,1,0,0,0,1,0,0,0
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,3.413,0,0,1,0,0,0,1,0,0,0
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,3.422,0,0,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,0.781,0,0,0,1,0,0,1,0,0,0
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,0.771,0,0,0,1,0,0,1,0,0,0
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,0.923,0,0,0,1,0,0,1,0,0,0
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,0.847,0,0,0,1,0,0,1,0,0,0


## 4. Save metrics

In [15]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict on the held-out split with the fitted search object.
y_pred = search.predict(X_test)

# Compute both regression metrics first, then report them to 3 decimals.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'mean_squared_error : {mse:.3f}')
print(f'r2_score : {r2:.3f}')

mean_squared_error : 0.407
r2_score : 0.692


In [17]:
# Sanity check: there should be one prediction per test row.
y_pred.shape

(5160,)

In [18]:
# Test split dimensions: (rows, encoded feature columns).
X_test.shape

(5160, 16)

In [28]:
# NOTE(review): the error saved as this cell's output (SyntaxError: unmatched ')')
# cannot be produced by this bare expression; it appears to be left over from an
# earlier version of the cell. Re-run the cell to refresh the output.
y_test

SyntaxError: unmatched ')' (<ipython-input-28-26e892cbd225>, line 1)

In [25]:
# Peek at the raw predictions for the test split.
y_pred

array([1.72517988, 2.60123091, 1.82300125, ..., 2.26374954, 1.99309676,
       2.34620205])

In [32]:
# Put test features, true target and model prediction side by side.
# The merge is index-on-index; y_pred is a plain array, so it is attached
# positionally to the merged rows.
# NOTE(review): 'Hosing_pred' looks like a typo for 'Housing_pred'; the name
# is kept as-is because later cells may reference it.
tf = (
    X_test
    .merge(y_test, left_index=True, right_index=True)
    .assign(Hosing_pred=y_pred)
)
tf

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,"Latitude_(32.531, 34.422]","Latitude_(34.422, 36.304]","Latitude_(36.304, 38.186]","Latitude_(38.186, 40.068]","Latitude_(40.068, 41.95]","Longitude_(-124.36, -122.342]","Longitude_(-122.342, -120.334]","Longitude_(-120.334, -118.326]","Longitude_(-118.326, -116.318]","Longitude_(-116.318, -114.31]",MedHouseVal,Hosing_pred
14740,4.1518,22.0,5.663073,1.075472,1551.0,4.180593,1,0,0,0,0,0,0,0,1,0,1.369,1.725180
10101,5.7796,32.0,6.107226,0.927739,1296.0,3.020979,1,0,0,0,0,0,0,0,1,0,2.413,2.601231
20566,4.3487,29.0,5.930712,1.026217,1554.0,2.910112,0,0,0,1,0,0,1,0,0,0,2.007,1.823001
2670,2.4511,37.0,4.992958,1.316901,390.0,2.746479,1,0,0,0,0,0,0,0,0,1,0.725,1.548546
15709,5.0049,25.0,4.319261,1.039578,649.0,1.712401,0,0,1,0,0,1,0,0,0,0,4.600,4.283613
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13132,4.0516,8.0,6.201299,1.048996,6002.0,3.543093,0,0,0,1,0,0,1,0,0,0,1.212,1.565497
8228,1.2321,35.0,3.062257,1.173152,820.0,1.595331,1,0,0,0,0,0,0,0,1,0,1.375,1.582815
3948,3.6296,16.0,3.616842,0.983158,896.0,1.886316,1,0,0,0,0,0,0,1,0,0,1.609,2.263750
8522,5.5133,37.0,4.593220,0.889831,355.0,3.008475,1,0,0,0,0,0,0,1,0,0,2.273,1.993097
