In [None]:
import os
import tarfile
from six.moves import urllib
import pandas as pd

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
  if not os.path.isdir(housing_path):
     os.makedirs(housing_path)
  tgz_path = os.path.join(housing_path, "housing.tgz")
  urllib.request.urlretrieve(housing_url, tgz_path)
  housing_tgz = tarfile.open(tgz_path)
  housing_tgz.extractall(path=housing_path)
  housing_tgz.close()

def load_housing_data(housing_path=HOUSING_PATH):
  csv_path = os.path.join(housing_path, "housing.csv")
  return pd.read_csv(csv_path)

fetch_housing_data()
housing = load_housing_data()


In [None]:
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])


split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
  strat_train_set = housing.loc[train_index]
  strat_test_set = housing.loc[test_index]

for set_ in (strat_train_set, strat_test_set):
  set_.drop("income_cat", axis=1, inplace=True)


In [None]:
def adding_attributes(dataFrame):
  dataFrame["rooms_per_household"] = dataFrame["total_rooms"]/dataFrame["households"]
  dataFrame["bedrooms_per_room"] = dataFrame["total_bedrooms"]/dataFrame["total_rooms"]
  dataFrame["population_per_household"]=dataFrame["population"]/dataFrame["households"]
    
adding_attributes(strat_train_set)
adding_attributes(strat_test_set)



In [None]:
def seperating_labels(dataFrame):
  input_features = dataFrame.drop("median_house_value", axis=1)
  labels = dataFrame["median_house_value"].copy()
  return input_features, labels

train_input_features, train_labels =seperating_labels(strat_train_set)
test_input_features, test_labels = seperating_labels(strat_test_set)



In [None]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")

train_num_features =  train_input_features.drop("ocean_proximity", axis=1)
test_num_features =  test_input_features.drop("ocean_proximity", axis=1)

imputer.fit(train_num_features)

train_nparray = imputer.transform(train_num_features)
test_nparray = imputer.transform(test_num_features)

train_num = pd.DataFrame(train_nparray, columns=train_num_features.columns)
test_num =  pd.DataFrame(test_nparray, columns=test_num_features.columns)

train_num.isnull().sum()


In [None]:
test_num.isnull().sum()

In [None]:
from sklearn.preprocessing import OneHotEncoder

train_cat = train_input_features[["ocean_proximity"]]
test_cat = test_input_features[["ocean_proximity"]]

cat_encoder = OneHotEncoder()

train_cat_1hot = cat_encoder.fit_transform(train_cat)
test_cat_1hot = cat_encoder.transform(test_cat)

train_cat_1hot = pd.DataFrame.sparse.from_spmatrix(train_cat_1hot)
test_cat_1hot = pd.DataFrame.sparse.from_spmatrix(test_cat_1hot)

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

train_scaling = scaler.fit_transform(train_num)
test_scaling = scaler.transform(test_num)

train_num_normalized = pd.DataFrame(train_scaling, columns=train_num.columns)
test_num_normalized =  pd.DataFrame(test_scaling, columns=test_num.columns)


In [None]:
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

train_concat = pd.concat([train_num,train_cat_1hot],axis = 1)
test_concat = pd.concat([test_num,test_cat_1hot],axis = 1)

train_concat_normalized = pd.concat([train_num_normalized,train_cat_1hot],axis = 1)
test_concat_normalized = pd.concat([test_num_normalized,test_cat_1hot],axis = 1)


def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

forest_reg = RandomForestRegressor()

normalization = 0 
addition_features = 0
min_rmse = np.finfo('d').max
for normalization in range(2):
    for addition_features in range(2):

        if normalization == 0 and addition_features == 0:
            train_last = train_concat.drop(["rooms_per_household","bedrooms_per_room","population_per_household"], axis = 1)
            test_last = test_concat.drop(["rooms_per_household","bedrooms_per_room","population_per_household"], axis = 1)
            print( "Not Normalized, Not Added Features " )

           
        if normalization == 0 and addition_features == 1:
            train_last = train_concat
            test_last = test_concat     
            print( "Not Normalized, Added Features " )

        if normalization == 1 and addition_features == 0:
            train_last = train_concat_normalized.drop(["rooms_per_household","bedrooms_per_room","population_per_household"], axis = 1)
            test_last = test_concat_normalized.drop(["rooms_per_household","bedrooms_per_room","population_per_household"], axis = 1)
            print( "Normalized, Not Added Features " )
        
        if normalization == 1 and addition_features == 1:
            train_last = train_concat_normalized
            test_last = test_concat_normalized
            print( "Normalized, Added Features " )


        
        forest_reg.fit(train_last, train_labels)
        housing_predictions = forest_reg.predict(train_last)
        forest_rmse = mean_squared_error(train_labels, housing_predictions, squared=False)
        print('Train Set forest_rmse :', forest_rmse)
        scores = cross_val_score(forest_reg, train_last, train_labels, scoring="neg_mean_squared_error", cv=10)
        forest_rmse_scores = np.sqrt(-scores)
        display_scores(forest_rmse_scores)

        if forest_rmse < min_rmse:
           best_combination = f"Normalization : {normalization} ,  Addition Features : {addition_features}"
           min_rmse = forest_rmse
        print('***********************')

        forest_reg.fit(test_last, test_labels)
        housing_predictions = forest_reg.predict(test_last)
        forest_rmse = mean_squared_error(test_labels, housing_predictions, squared=False)
        print('Test Set forest_rmse :', forest_rmse)
        scores = cross_val_score(forest_reg, test_last, test_labels ,scoring="neg_mean_squared_error", cv=10)
        forest_rmse_scores = np.sqrt(-scores)
        display_scores(forest_rmse_scores)

        print("\n")

print("Best Combination : ", best_combination)

Not Normalized, Not Added Features 
Train Set forest_rmse : 18282.409781377282
Scores: [47858.27747953 46234.39065588 49200.9646124  50053.19432666
 49280.92970309 53169.34918717 48565.30365874 50107.16442955
 51558.55987124 49253.71286522]
Mean: 49528.18467894792
Standard deviation: 1811.0689338327225
***********************
Test Set forest_rmse : 20035.551227570628
Scores: [53801.4828659  58009.37197627 55531.14483723 52834.62157425
 54611.97574775 51799.2808702  55051.44873498 50020.21607206
 54264.2006595  50534.61948502]
Mean: 53645.83628231684
Standard deviation: 2297.89539267382


Not Normalized, Added Features 
Train Set forest_rmse : 18664.37598190893
Scores: [49326.53162497 47424.49072092 49752.80345785 52209.85725218
 49996.20390506 53252.08061655 49191.05549853 47564.35101971
 52622.06238828 50209.45139296]
Mean: 50154.888787699216
Standard deviation: 1894.0268873806688
***********************
Test Set forest_rmse : 20122.566368349566
Scores: [53873.84539224 56595.03902319 