In [12]:
import os
import tarfile
from six.moves import urllib
import pandas as pd

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
  if not os.path.isdir(housing_path):
     os.makedirs(housing_path)
  tgz_path = os.path.join(housing_path, "housing.tgz")
  urllib.request.urlretrieve(housing_url, tgz_path)
  housing_tgz = tarfile.open(tgz_path)
  housing_tgz.extractall(path=housing_path)
  housing_tgz.close()

def load_housing_data(housing_path=HOUSING_PATH):
  csv_path = os.path.join(housing_path, "housing.csv")
  return pd.read_csv(csv_path)

fetch_housing_data()
housing = load_housing_data()


In [13]:
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])


split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, test_index in split.split(housing, housing["income_cat"]):
  strat_train_set = housing.loc[train_index]
  strat_test_set = housing.loc[test_index]

for set_ in (strat_train_set, strat_test_set):
  set_.drop("income_cat", axis=1, inplace=True)
  

In [14]:
def seperating_labels(dataFrame):
  input_features = dataFrame.drop("median_house_value", axis=1)
  labels = dataFrame["median_house_value"].copy()
  return input_features, labels

train_input_features, train_labels = seperating_labels(strat_train_set)
test_input_features, test_labels = seperating_labels(strat_test_set)


train_num_features =  train_input_features.drop("ocean_proximity", axis=1)
test_num_features =  test_input_features.drop("ocean_proximity", axis=1)


In [15]:
rooms_index, bedrooms_index, population_index, households_index = 3, 4, 5, 6

class CombinedAttributesAdder():

  def fit(self, X, y=None):
    return self # nothing else to do
  
  def transform(self, X, y=None):
    rooms_per_household = X[:, rooms_index] / X[:, households_index]
    population_per_household = X[:, population_index] / X[:, households_index]
    bedrooms_per_room = X[:, bedrooms_index] / X[:, rooms_index]
    return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]

In [27]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
             ('imputer', SimpleImputer(strategy="median")),
            #('attribs_adder', CombinedAttributesAdder()), #as we see from 1 question, it is better to not add the 3 features.
             ('std_scaler', StandardScaler()),
             ])

#housing_num_tr = num_pipeline.fit_transform(train_num_features)

num_attribs = list(train_num_features)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
             ("num", num_pipeline, num_attribs),
             ("cat", OneHotEncoder(), cat_attribs),],
             )

housing_prepared = full_pipeline.fit_transform(train_input_features)
housing_prepared_test = full_pipeline.fit_transform(test_input_features)
housing_prepared_test


array([[ 0.57471497, -0.69620912,  0.03285951, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.83839441, -0.86074221,  0.82471029, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.24635944, -0.65390061, -0.12551065, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.52496413, -0.66800344,  0.19122966, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.10704893, -0.75732141, -0.12551065, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.31601062,  0.55894333, -0.75899128, ...,  0.        ,
         0.        ,  0.        ]])

In [20]:
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, train_labels)
housing_predictions = forest_reg.predict(housing_prepared)
forest_rmse = mean_squared_error(train_labels, housing_predictions, squared=False)
print('Train Set forest_rmse :', forest_rmse)
scores = cross_val_score(forest_reg, housing_prepared, train_labels,scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-scores)
display_scores(forest_rmse_scores)

print('***********************')

forest_reg.fit(housing_prepared_test, test_labels)
housing_predictions = forest_reg.predict(housing_prepared_test)
forest_rmse = mean_squared_error(test_labels, housing_predictions, squared=False)
print('Test Set forest_rmse :', forest_rmse)
scores = cross_val_score(forest_reg, housing_prepared_test, test_labels ,scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-scores)
display_scores(forest_rmse_scores)



Train Set forest_rmse : 18285.423904002368
Scores: [47908.45458623 46107.53761248 48922.83447929 50312.91982653
 49281.16898481 53404.05860454 48618.1518469  50404.71830684
 51597.78914316 49731.98862513]
Mean: 49628.96220159079
Standard deviation: 1902.8855466142086
***********************
Test Set forest_rmse : 20195.189515856553
Scores: [53987.34427416 57681.6824339  55480.12499877 52783.5912555
 54381.83146151 51830.87924683 55468.24261498 49931.9534748
 54287.78671753 49763.4028622 ]
Mean: 53559.68393401742
Standard deviation: 2384.822406275257
