# Install Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sb
import requests
import os
import csv
import tarfile
import tensorflow as tf
from tensorflow import keras
import joblib
%matplotlib inline

In [2]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, QuantileTransformer, PolynomialFeatures
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Get the Data

In [4]:
def load_housing_data(housing_path = HOUSING_PATH):
    csv_path = os.path.join(housing_path, 'housing.csv')
    return pd.read_csv(csv_path)

In [5]:
housing = load_housing_data()

# Create a Test Set

In [6]:
housing['income_cat'] = pd.cut(housing['median_income'], bins = [0, 1.5, 3, 4.5, 6, np.inf], labels = [1,2,3,4,5])

In [7]:
sss= StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)
for train_index, test_index in sss.split(housing, housing['income_cat']):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

In [8]:
(strat_train_set['income_cat'].value_counts() / len(strat_train_set)).sort_index()

1    0.039850
2    0.318859
3    0.350594
4    0.176296
5    0.114402
Name: income_cat, dtype: float64

In [9]:
(strat_test_set['income_cat'].value_counts() / len(strat_test_set)).sort_index()

1    0.039729
2    0.318798
3    0.350533
4    0.176357
5    0.114583
Name: income_cat, dtype: float64

In [10]:
for set_ in (strat_train_set, strat_test_set):
    set_.drop('income_cat', axis =1, inplace = True)

In [11]:
#Separate the predictors and labels
housing = strat_train_set.drop('median_house_value', axis = 1)
housing_labels = strat_train_set['median_house_value'].copy()

In [12]:
housing_num = housing.drop('ocean_proximity', axis = 1)
housing_cat = housing[['ocean_proximity']]

# Preparing the Data for Machine Learning

In [13]:
col_names = ['total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']
rooms_ix, bedrooms_ix, population_ix, households_ix, income_ix = [housing.columns.get_loc(c) for c in col_names]

In [14]:
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    
    def fit(self, X, y = None):
        return self
    
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [15]:
def display_scores(scores):
    scores = [float('{:0.2f}'.format(v)) for v in scores]
    print('Scores: ', scores)
    print('Mean: ', np.mean(scores))
    print('Standard Deviation: ', np.std(scores))

# Model 1

In [16]:
housing_m1 = housing.copy()

In [17]:
housing_m1['inland'] = (housing_m1['ocean_proximity'] == 'INLAND').astype(int)

In [18]:
housing_m1.drop('ocean_proximity', axis = 1, inplace = True)

In [26]:
num_pipeline_m1 = Pipeline([
    ('imputer', SimpleImputer(strategy = 'median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

In [27]:
housing_prepared_m1 = num_pipeline_m1.fit_transform(housing_m1)

In [28]:
forest_reg_m1 = RandomForestRegressor(n_estimators = 100, random_state = 42)
forest_reg_m1.fit(housing_prepared_m1, housing_labels)

RandomForestRegressor(random_state=42)

In [29]:
housing_predictions_m1 = forest_reg_m1.predict(housing_prepared_m1)
forest_rmse_m1 = mean_squared_error(housing_labels, housing_predictions_m1, squared = False)
forest_rmse_m1

18663.227306974266

In [30]:
#cross validation
forest_scores_m1 = cross_val_score(forest_reg_m1, housing_prepared_m1, housing_labels, scoring = 'neg_mean_squared_error', 
                            cv = 10)
forest_rmse_scores_m1 = np.sqrt(-forest_scores_m1)
forest_rmse_scores_m1

array([49769.30165401, 47716.69111426, 49933.3830373 , 52227.98155561,
       49417.9300074 , 53681.54181692, 48537.09302853, 47891.85350849,
       53452.40606155, 50233.53875565])

In [31]:
display_scores(forest_rmse_scores_m1)

Scores:  [49769.3, 47716.69, 49933.38, 52227.98, 49417.93, 53681.54, 48537.09, 47891.85, 53452.41, 50233.54]
Mean:  50286.171
Standard Deviation:  2046.1364036957568


In [44]:
param_grid = [
    {'n_estimators':[50, 70, 100], 'max_features':[8, 10,11]},
    {'bootstrap':[False], 'n_estimators':[30,40], 'max_features':[8,10,11]}
]

forest_reg = RandomForestRegressor(random_state = 42)

grid_search = GridSearchCV(estimator = forest_reg_m1, param_grid = param_grid, cv = 5, scoring = 'neg_mean_squared_error',
                           return_train_score = True)

grid_search.fit(housing_prepared_m1, housing_labels)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
             param_grid=[{'max_features': [8, 10, 11],
                          'n_estimators': [50, 70, 100]},
                         {'bootstrap': [False], 'max_features': [8, 10, 11],
                          'n_estimators': [30, 40]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [45]:
grid_search.best_estimator_

RandomForestRegressor(max_features=8, random_state=42)

In [46]:
grid_search.best_params_

{'max_features': 8, 'n_estimators': 100}

In [47]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(np.sqrt(-mean_score), params)

49972.28155920334 {'max_features': 8, 'n_estimators': 50}
49865.75564632418 {'max_features': 8, 'n_estimators': 70}
49774.159915810116 {'max_features': 8, 'n_estimators': 100}
50285.44791388631 {'max_features': 10, 'n_estimators': 50}
50078.836185158965 {'max_features': 10, 'n_estimators': 70}
49957.08308440036 {'max_features': 10, 'n_estimators': 100}
50625.090510511014 {'max_features': 11, 'n_estimators': 50}
50435.89005860777 {'max_features': 11, 'n_estimators': 70}
50361.82383676503 {'max_features': 11, 'n_estimators': 100}
50262.883998496836 {'bootstrap': False, 'max_features': 8, 'n_estimators': 30}
50242.23110838541 {'bootstrap': False, 'max_features': 8, 'n_estimators': 40}
52515.56760061423 {'bootstrap': False, 'max_features': 10, 'n_estimators': 30}
52400.78056783633 {'bootstrap': False, 'max_features': 10, 'n_estimators': 40}
57643.66590811575 {'bootstrap': False, 'max_features': 11, 'n_estimators': 30}
56708.21180500156 {'bootstrap': False, 'max_features': 11, 'n_estimators

In [48]:
m1 = grid_search.best_estimator_

In [86]:
m1_scores = cross_val_score(m1, housing_prepared_m1, housing_labels, scoring = 'neg_mean_squared_error', 
                            cv = 10)
m1_rmse_scores = np.sqrt(-m1_scores)
m1_rmse_scores

array([49027.80904186, 46759.51741062, 49398.12481633, 51503.93007782,
       48594.10227291, 52887.14890295, 48398.49552373, 47065.87724938,
       52522.27008632, 49089.00717728])

In [87]:
display_scores(m1_rmse_scores)

Scores:  [49027.81, 46759.52, 49398.12, 51503.93, 48594.1, 52887.15, 48398.5, 47065.88, 52522.27, 49089.01]
Mean:  49524.629
Standard Deviation:  2012.161877665165


In [49]:
joblib.dump(m1, "model_1.pkl")

['model_1.pkl']

# Model 2

- Model 2 will use the same dataset (housing_m1) as Model 1.

In [51]:
class LogTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y = None):
        return self
    def transform(self, X):
        result = np.c_[X, np.log(X[:,[rooms_ix, bedrooms_ix, population_ix, households_ix, income_ix]])]
        return result[:, [0, 1, 2, 8, 9, 10, 11, 12, 13, 14, 15]]

In [52]:
num_pipeline_m2 = Pipeline([
    ('imputer', SimpleImputer(strategy = 'median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('log_transform', LogTransformer()),
])

In [53]:
housing_prepared_m2 = num_pipeline_m2.fit_transform(housing_m1)

In [54]:
forest_reg_m2 = RandomForestRegressor(n_estimators = 100, random_state = 42)
forest_reg_m2.fit(housing_prepared_m2, housing_labels)

RandomForestRegressor(random_state=42)

In [55]:
housing_predictions_m2 = forest_reg_m2.predict(housing_prepared_m2)
forest_rmse_m2 = mean_squared_error(housing_labels, housing_predictions_m2, squared = False)
forest_rmse_m2

18464.357419920947

In [56]:
#cross validation
forest_scores_m2 = cross_val_score(forest_reg_m2, housing_prepared_m2, housing_labels, scoring = 'neg_mean_squared_error', 
                            cv = 10)
forest_rmse_scores_m2 = np.sqrt(-forest_scores_m2)
forest_rmse_scores_m2

array([48159.07314616, 49870.23763387, 51543.90038814, 50685.01535649,
       48964.37134583, 52194.43343416, 47120.54479468, 48368.87280846,
       52726.84487212, 49329.48272654])

In [57]:
display_scores(forest_rmse_scores_m2)

Scores:  [48159.07, 49870.24, 51543.9, 50685.02, 48964.37, 52194.43, 47120.54, 48368.87, 52726.84, 49329.48]
Mean:  49896.276
Standard Deviation:  1757.166321024847


In [58]:
param_grid = [
    {'n_estimators':[50, 70, 100], 'max_features':[8, 10,11]},
    {'bootstrap':[False], 'n_estimators':[30,40], 'max_features':[8,10,11]}
]

forest_reg = RandomForestRegressor(random_state = 42)

grid_search = GridSearchCV(estimator = forest_reg_m2, param_grid = param_grid, cv = 5, scoring = 'neg_mean_squared_error',
                           return_train_score = True)

grid_search.fit(housing_prepared_m2, housing_labels)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
             param_grid=[{'max_features': [8, 10, 11],
                          'n_estimators': [50, 70, 100]},
                         {'bootstrap': [False], 'max_features': [8, 10, 11],
                          'n_estimators': [30, 40]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [59]:
grid_search.best_estimator_

RandomForestRegressor(max_features=10, random_state=42)

In [62]:
grid_search.best_params_

{'max_features': 10, 'n_estimators': 100}

In [60]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(np.sqrt(-mean_score), params)

50730.14222313894 {'max_features': 8, 'n_estimators': 50}
50677.767814910956 {'max_features': 8, 'n_estimators': 70}
50498.48216750671 {'max_features': 8, 'n_estimators': 100}
50548.53454391526 {'max_features': 10, 'n_estimators': 50}
50407.08564981479 {'max_features': 10, 'n_estimators': 70}
50350.17613803004 {'max_features': 10, 'n_estimators': 100}
50707.595690584836 {'max_features': 11, 'n_estimators': 50}
50445.66365978271 {'max_features': 11, 'n_estimators': 70}
50403.49501139747 {'max_features': 11, 'n_estimators': 100}
51546.366692260606 {'bootstrap': False, 'max_features': 8, 'n_estimators': 30}
51392.16655257548 {'bootstrap': False, 'max_features': 8, 'n_estimators': 40}
56724.198000299235 {'bootstrap': False, 'max_features': 10, 'n_estimators': 30}
56589.17563436104 {'bootstrap': False, 'max_features': 10, 'n_estimators': 40}
68290.29860166984 {'bootstrap': False, 'max_features': 11, 'n_estimators': 30}
68246.38734385578 {'bootstrap': False, 'max_features': 11, 'n_estimators

In [67]:
from scipy.stats import randint

param_distribs = {
    'n_estimators': randint(low = 1, high = 200),
    'max_features': randint(low = 1, high = 11)
}


rnd_search = RandomizedSearchCV(estimator = forest_reg_m2,
                                param_distributions = param_distribs,
                                n_iter = 10,
                                cv = 10,
                                scoring = 'neg_mean_squared_error',
                                random_state = 42)

rnd_search.fit(housing_prepared_m2, housing_labels)

RandomizedSearchCV(cv=10, estimator=RandomForestRegressor(random_state=42),
                   param_distributions={'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000017D80FC9250>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000017D80E815B0>},
                   random_state=42, scoring='neg_mean_squared_error')

In [70]:
rnd_search.best_params_

{'max_features': 8, 'n_estimators': 189}

In [71]:
rnd_cvres = rnd_search.cv_results_
for mean_score, params in zip(rnd_cvres['mean_test_score'], rnd_cvres['params']):
    print(np.sqrt(-mean_score), params)

49754.12035720878 {'max_features': 7, 'n_estimators': 180}
49684.33036520984 {'max_features': 8, 'n_estimators': 189}
50704.889102897476 {'max_features': 5, 'n_estimators': 103}
49934.53871079605 {'max_features': 10, 'n_estimators': 75}
49797.49750233286 {'max_features': 8, 'n_estimators': 117}
51309.95317199467 {'max_features': 4, 'n_estimators': 104}
49782.978951284356 {'max_features': 8, 'n_estimators': 131}
50353.6840252857 {'max_features': 6, 'n_estimators': 53}
55127.010426953246 {'max_features': 2, 'n_estimators': 88}
50127.68208116778 {'max_features': 6, 'n_estimators': 130}


In [76]:
m2 = rnd_search.best_estimator_
m2

RandomForestRegressor(max_features=8, n_estimators=189, random_state=42)

In [77]:
m2_scores = cross_val_score(m2, housing_prepared_m2, housing_labels, scoring = 'neg_mean_squared_error', 
                            cv = 10)
m2_rmse_scores = np.sqrt(-m2_scores)
m2_rmse_scores

array([48348.80349514, 49545.22528717, 50543.66916096, 50170.1745234 ,
       48591.07087516, 52165.29469632, 47440.56710356, 47898.17779489,
       52794.04625188, 49060.34532991])

In [78]:
display_scores(m2_rmse_scores)

Scores:  [48348.8, 49545.23, 50543.67, 50170.17, 48591.07, 52165.29, 47440.57, 47898.18, 52794.05, 49060.35]
Mean:  49655.738
Standard Deviation:  1685.354566124292


In [90]:
joblib.dump(m2, "model_2.pkl")

['model_2.pkl']

# Model 3

- Model 2 will use the same dataset (housing_m1) as Model 1.

In [113]:
num_pipeline_m3 = Pipeline([
    ('imputer', SimpleImputer(strategy = 'median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
    ('polyfeatures', PolynomialFeatures())
])

In [114]:
housing_prepared_m3 = num_pipeline_m3.fit_transform(housing_m1)

In [115]:
forest_reg_m3 = RandomForestRegressor(n_estimators = 100, random_state = 42)
forest_reg_m3.fit(housing_prepared_m3, housing_labels)

RandomForestRegressor(random_state=42)

In [116]:
housing_predictions_m3 = forest_reg_m3.predict(housing_prepared_m3)
forest_rmse_m3 = mean_squared_error(housing_labels, housing_predictions_m3, squared = False)
forest_rmse_m3

18577.050499344437

In [117]:
param_grid = [
    {'n_estimators':[50, 70, 100], 'max_features':[8, 10,11]},
    {'bootstrap':[False], 'n_estimators':[30,40], 'max_features':[8,10,11]}
]

forest_reg = RandomForestRegressor(random_state = 42)

grid_search = GridSearchCV(estimator = forest_reg_m3, param_grid = param_grid, cv = 5, scoring = 'neg_mean_squared_error',
                           return_train_score = True)

grid_search.fit(housing_prepared_m3, housing_labels)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
             param_grid=[{'max_features': [8, 10, 11],
                          'n_estimators': [50, 70, 100]},
                         {'bootstrap': [False], 'max_features': [8, 10, 11],
                          'n_estimators': [30, 40]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [118]:
grid_search.best_estimator_

RandomForestRegressor(bootstrap=False, max_features=11, n_estimators=40,
                      random_state=42)

In [119]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(np.sqrt(-mean_score), params)

51062.18395029619 {'max_features': 8, 'n_estimators': 50}
50965.54540983591 {'max_features': 8, 'n_estimators': 70}
50990.61270130175 {'max_features': 8, 'n_estimators': 100}
51136.23962260938 {'max_features': 10, 'n_estimators': 50}
51070.807999221885 {'max_features': 10, 'n_estimators': 70}
50748.982616812944 {'max_features': 10, 'n_estimators': 100}
50908.29753226171 {'max_features': 11, 'n_estimators': 50}
50841.0857028081 {'max_features': 11, 'n_estimators': 70}
50611.586376825704 {'max_features': 11, 'n_estimators': 100}
50696.97170780656 {'bootstrap': False, 'max_features': 8, 'n_estimators': 30}
50398.36560215212 {'bootstrap': False, 'max_features': 8, 'n_estimators': 40}
50477.75162816296 {'bootstrap': False, 'max_features': 10, 'n_estimators': 30}
50159.72929064109 {'bootstrap': False, 'max_features': 10, 'n_estimators': 40}
50261.68361535132 {'bootstrap': False, 'max_features': 11, 'n_estimators': 30}
49956.16057335806 {'bootstrap': False, 'max_features': 11, 'n_estimators':