In [1]:
# Needed packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

housing = pd.read_csv('cal_housing_dataset.csv')

display(housing)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [2]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, random_state=0)
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15480 entries, 19226 to 2732
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           15480 non-null  float64
 1   latitude            15480 non-null  float64
 2   housing_median_age  15480 non-null  float64
 3   total_rooms         15480 non-null  float64
 4   total_bedrooms      15331 non-null  float64
 5   population          15480 non-null  float64
 6   households          15480 non-null  float64
 7   median_income       15480 non-null  float64
 8   median_house_value  15480 non-null  float64
 9   ocean_proximity     15480 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.3+ MB


In [3]:
X_train = train_set.drop(columns=["median_house_value"])
y_train = train_set["median_house_value"]
display(X_train)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
19226,-122.68,38.46,19.0,4976.0,711.0,1926.0,625.0,7.3003,<1H OCEAN
14549,-117.24,32.95,18.0,1591.0,268.0,547.0,243.0,5.9547,NEAR OCEAN
9093,-118.27,34.68,19.0,552.0,129.0,314.0,106.0,3.2125,INLAND
12213,-117.18,33.51,13.0,270.0,42.0,120.0,42.0,6.9930,<1H OCEAN
12765,-121.41,38.62,21.0,3260.0,763.0,1735.0,736.0,2.5162,INLAND
...,...,...,...,...,...,...,...,...,...
13123,-121.26,38.27,20.0,1314.0,229.0,712.0,219.0,4.4125,INLAND
19648,-120.89,37.48,27.0,1118.0,195.0,647.0,209.0,2.9135,INLAND
9845,-121.90,36.58,31.0,1431.0,,704.0,393.0,3.1977,NEAR OCEAN
10799,-117.93,33.62,34.0,2125.0,498.0,1052.0,468.0,5.6315,<1H OCEAN


In [4]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge

num_pipeline = Pipeline([('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])
# synthaxis is Transformer( [list of Transformer each inside (name of transformer, instantiation )])
housing_num = X_train.drop(columns=["ocean_proximity"])
num_feat = housing_num.columns
cat_feat = ["ocean_proximity"]

full_pipeline = ColumnTransformer([('num', num_pipeline, num_feat), ('cat', OneHotEncoder(), cat_feat)])

predicting_pipeline_ridge = Pipeline([('prepare', full_pipeline), ('ridge', Ridge())])

In [5]:
from sklearn.model_selection import GridSearchCV

param_grid_ridge = {
    'ridge__alpha': [0.01, 0.1, 0.5,1,2,5,10]
             }

grid_search_ridge = GridSearchCV(predicting_pipeline_ridge, param_grid_ridge, cv =5)

grid_search_ridge.fit(X_train,y_train)
print(grid_search_ridge.best_estimator_)

Pipeline(steps=[('prepare',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income'],
      dtype='object')),
                                                 ('cat', OneHotEncoder(),
                                                  ['ocean_proximity'])])),
                ('ridge', Ridge(alpha=0.1))])


In [6]:
best_model_ridge = grid_search_ridge.best_estimator_
best_model_ridge.fit(X_train,y_train)

In [7]:
print(best_model_ridge.score(X_train, y_train))

0.6475442020896096


In [8]:
X_test = test_set.drop(columns=["median_house_value"])
y_test = test_set["median_house_value"]
print(best_model_ridge.score(X_test, y_test))

0.6384718253770056


In [9]:
predicting_pipeline_tree = Pipeline([('prepare', full_pipeline), ('tree', DecisionTreeRegressor(random_state=42))])

param_grid_tree = {'tree__max_depth': np.arange(2, 10), 'tree__min_samples_leaf': np.arange(2, 10)}

grid_search_tree = GridSearchCV(predicting_pipeline_tree, param_grid_tree, cv =5)

grid_search_tree.fit(X_train,y_train)
print(grid_search_tree.best_estimator_)

best_model_tree = grid_search_tree.best_estimator_
best_model_tree.fit(X_train,y_train)

print(best_model_tree.score(X_train, y_train))
print(best_model_tree.score(X_test, y_test))

Pipeline(steps=[('prepare',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income'],
      dtype='object')),
                                                 ('cat', OneHotEncoder(),
                                                  ['ocean_proximity'])])),
                ('tree',
                 DecisionTreeRegressor(max_depth=9, min_samples_leaf=8,
                                       random_state=42))])
0.7820176520831984
0.7237627917272901


In [11]:
from sklearn.ensemble import RandomForestRegressor

predicting_pipeline_forest = Pipeline([('prepare', full_pipeline), ('forest',RandomForestRegressor(random_state=42))])

param_grid_forest = {'forest__max_depth': [3, 4, 5, 6],
              'forest__n_estimators' : [200],
             'forest__max_features': [8,10,20, None],
              'forest__max_samples': [0.7, 0.8],
             'forest__n_jobs': [-1]}

grid_search_forest = GridSearchCV(predicting_pipeline_forest, param_grid_forest, cv =3)

grid_search_forest.fit(X_train,y_train)
print(grid_search_forest.best_estimator_)

best_model_forest = grid_search_forest.best_estimator_
best_model_forest.fit(X_train,y_train)

print(best_model_forest.score(X_train, y_train))
print(best_model_forest.score(X_test, y_test))

Pipeline(steps=[('prepare',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income'],
      dtype='object')),
                                                 ('cat', OneHotEncoder(),
                                                  ['ocean_proximity'])])),
                ('forest',
                 RandomForestRegressor(max_depth=6, max_features=8,
                                       max_samples=0.7, n_estimators=200,
                                      

In [14]:
from sklearn.ensemble import GradientBoostingRegressor

predicting_pipeline_gbdt = Pipeline([('prepare', full_pipeline), ('gbdt',GradientBoostingRegressor())])


param_grid_gbdt = {'gbdt__max_depth': [3, 4, 5,6],
              'gbdt__n_estimators' : [50, 100, 150],
             'gbdt__learning_rate': [0.05, 0.1, 0.2, 1.0],
              }

grid_search_gbdt = GridSearchCV(predicting_pipeline_gbdt, param_grid_gbdt, cv =3)

grid_search_gbdt.fit(X_train,y_train)
print(grid_search_gbdt.best_estimator_)

best_model_gbdt = grid_search_gbdt.best_estimator_
best_model_gbdt.fit(X_train,y_train)

print(best_model_gbdt.score(X_train, y_train))
print(best_model_gbdt.score(X_test, y_test))

Pipeline(steps=[('prepare',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income'],
      dtype='object')),
                                                 ('cat', OneHotEncoder(),
                                                  ['ocean_proximity'])])),
                ('gbdt',
                 GradientBoostingRegressor(max_depth=6, n_estimators=150))])
0.9153017418130605
0.8301822898505704


In [20]:
from sklearn.base import BaseEstimator, RegressorMixin
class BlendingRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, regressors):
        self.regressors = regressors

    def fit(self, X, y):
        for regressor in self.regressors:
            regressor.fit(X, y)
        return self

    def predict(self, X):
        predictions = [regressor.predict(X) for regressor in self.regressors]
        return np.mean(predictions, axis=0)


predicting_pipeline_blending = Pipeline([
    ('prepare', full_pipeline),
    ('blending', BlendingRegressor([r]))
])
grid_search_blending = GridSearchCV(predicting_pipeline_blending, param_grid={}, cv =3)

grid_search_blending.fit(X_train,y_train)
print(grid_search_blending.best_estimator_)

best_model_blending = grid_search_blending.best_estimator_
best_model_blending.fit(X_train,y_train)

print(best_model_blending.score(X_train, y_train))
print(best_model_blending.score(X_test, y_test))

Pipeline(steps=[('prepare',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income'],
      dtype='object')),
                                                 ('cat', OneHotEncoder(),
                                                  ['ocean_proximity'])])),
                ('blending',
                 BlendingRegressor(regressors=[Ridge(), RandomForestRegressor(),
                                               GradientBoostingRegressor()]))])
0.854706364662338