In [None]:
import os
import tarfile
import urllib.request
import pandas as pd

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL,housing_path=HOUSING_PATH):
    os.makedirs(housing_path,exist_ok=True)
    tgz_path = os.path.join(housing_path,"housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()


def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path,"housing.csv")
    return pd.read_csv(csv_path)

In [None]:
#running the functions to fetch and load data
fetch_housing_data()
housing = load_housing_data()

#getting the dataframe information:

print(housing.head())
print(housing.info())
print(housing.describe())
print(housing["ocean_proximity"].value_counts())

In [None]:
#data viusualization:
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
#spliting the data into training and testing sets:
from sklearn.model_selection import train_test_split
train_set,test_set=train_test_split(housing,test_size=0.25,random_state=42)

In [None]:
#defining the income categories:
import numpy as np
housing["income_cat"]= pd.cut(housing["median_income"],bins=[0.,1.5,3,4.5,6.,np.inf],labels=[1,2,3,4,5])
housing["income_cat"].hist()
plt.show()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing,housing["income_cat"]):
    strat_train_set=housing.loc[train_index]
    strat_test_set=housing.loc[test_index]

#droping the income cat columns of the set:
for set_ in (strat_train_set,strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

In [None]:
#exploring the data through visualization:
housing=strat_train_set.copy()
housing.plot(kind="scatter",x="longitude",y="latitude",alpha=0.1)

In [None]:
#exploring the data through visualization:
housing=strat_train_set.copy()
housing.plot(kind="scatter",x="longitude",y="latitude",alpha=0.4,s=housing["population"]/100,label="population",figsize=(10,7),c="median_house_value",cmap=plt.get_cmap("jet"))
plt.legend()
plt.show()

In [None]:
#explorign the data through corelation matrix:
corr_matrix = housing.corr()

housing.head()

print(list(enumerate(housing.keys())))

print(corr_matrix["median_house_value"].sort_values(ascending=False))

plt.matshow(housing.corr())
plt.show()

In [None]:
#explorign the data through correlation plots:
from pandas.plotting import scatter_matrix
attrbutes=['median_house_value','median_income','total_rooms','housing_median_age']
scatter_matrix(housing[attrbutes],figsize=(12,8))


In [None]:
print(list(enumerate(housing.keys())))

In [None]:
#defining new attributes for the dataset:
housing["rooms_per_household"]=housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"]=housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]

corr_matrix = housing.corr()
print(corr_matrix["median_house_value"].sort_values(ascending=False))

plt.matshow(housing.corr())
plt.show()

In [None]:
#preparing the data for machine learning implementation:
#clean start:
housing = strat_train_set.drop("median_house_value",axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

#replaceing the NaN values with the calculated mean.
'''
median = housing["total_bedrooms"].median()
housing["total_bedrooms"].fillna(median,inplace=True)
'''
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="mean")

#only numerical data:
housing_num = housing.drop("ocean_proximity",axis=1)
imputer.fit(housing_num)
#creating final traingin data:
X = imputer.transform(housing_num)
housing_tr=pd.DataFrame(X,columns=housing_num.columns,index=housing_num.index)

print(X)
print(housing_tr)

In [None]:
#transfering categorical variables into numeric description: - machine learning algoritms dont work on text they work on vectors/tensors
from sklearn.preprocessing import OneHotEncoder
cat_econder = OneHotEncoder()
housing_cat=housing[["ocean_proximity"]]
housing_cat_1hot = cat_econder.fit_transform(housing_cat)
#print(housing_cat_1hot)
print(housing_cat_1hot.toarray())
print(cat_econder.categories_)

In [None]:
#reminder on what is housing data:
print(type(housing)) #a DataFrame
print(housing.head())
print(housing.keys())
print(housing.values)

In [None]:
#creating custom transformers:
from sklearn.base import BaseEstimator, TransformerMixin
#defining indexes of attibutes in the dataset:
rooms_ix, bedrooms_ix, population_ix,households_ix= 3,4,5,6
#defining the class that inherits from sklearn transformers to enable adding parameters to the dataset:
class CombinedAttibutesAdder(BaseEstimator,TransformerMixin):
    def __init__(self,add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self,X,y=None):
        return self
    def transform(self,X):
        rooms_per_household=X[:,rooms_ix]/X[:,households_ix]
        population_per_household=X[:,population_ix]/X[:,households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room=X[:,bedrooms_ix]/X[:,rooms_ix]
            return np.c_[X,rooms_per_household,population_per_household,bedrooms_per_room]
        else:
            return np.c_[X,rooms_per_household,population_per_household]
#read up on what np.c_ does! - arrays concatination along axis.
'''
np.c_[np.array([1,2,3]), np.array([4,5,6])]
array([[1, 4],
       [2, 5],
       [3, 6]])
'''
attr_adder=CombinedAttibutesAdder(add_bedrooms_per_room=False)
housing_extra_attirbs = attr_adder.transform(housing.values)

In [None]:
#feature scaling for optimal learning experience :)
#STANDARDIZATION & MIN-MAX SCALING as two basica algorythms:

#sklearn PIPELINES: - execution of data transfromations in a tipical order:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
    #imputer allows to clean the data form NANs and replace them with median values:
    ("imputer",SimpleImputer(strategy="median")),
    #attribs_adder has been defined in a previous block as a class that allows to extend the attributes:
    ("attribs_adder",CombinedAttibutesAdder()),
    #std_scaler standardizes the data:
    ("std_scaler",StandardScaler()),
])

#learning ready data:
housing_num_tr = num_pipeline.fit_transform(housing_num)


In [None]:
#Column transformer - allows to perform transformations of a pipeline column by column and distinguish numeriacal and cathegorical data:
from sklearn.compose import ColumnTransformer

"""
print(housing_num)
print(list(housing_num))
"""
num_attribs=list(housing_num)
cat_attribs=["ocean_proximity"]

#combined pipeline that applies different transformations based on the attibutes of the datafarme:
full_pipeline = ColumnTransformer([
    ("num",num_pipeline,num_attribs),
    ("cat",OneHotEncoder(),cat_attribs),
])

housing_prepared=full_pipeline.fit_transform(housing)
housing_labels = strat_train_set["median_house_value"].copy()

In [None]:
print(housing)

In [None]:
#choosing the model and training:
#linear regression model:

from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared,housing_labels)
#testing the model:
some_data =  housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = housing_prepared[:5]

print("Predictions: ",lin_reg.predict(some_data_prepared))
print("Actual Values: ",list(some_labels))
print(lin_reg.score(some_data_prepared,some_labels))

#model underfitting

In [None]:
#choosing the model and training:
#decision tree model:

from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared,housing_labels)
#testing the model:
some_data =  housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = housing_prepared[:5]

print("Predictions: ",tree_reg.predict(some_data_prepared))
print("Actual Values: ",list(some_labels))
print(tree_reg.score(some_data_prepared,some_labels))

#model overfitting

In [None]:
#model cross validation for more precise understanding of performance:

from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg,housing_prepared,housing_labels,scoring="neg_mean_squared_error",cv=10)
tree_rmse_scores = np.sqrt(-scores)

In [None]:
def display_scores(scores):
    print("Scores: ",scores)
    print("Mean :",scores.mean())
    print("Standard Deviation :",scores.std())

display_scores(tree_rmse_scores)

In [None]:
lin_scores = cross_val_score(lin_reg,housing_prepared,housing_labels,scoring="neg_mean_squared_error",cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)

In [None]:
def display_scores(scores):
    print("Scores: ",scores)
    print("Mean :",scores.mean())
    print("Standard Deviation :",scores.std())

display_scores(lin_rmse_scores)

In [None]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared,housing_labels)
#testing the model:
some_data =  housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = housing_prepared[:5]

print("Predictions: ",forest_reg.predict(some_data_prepared))
print("Actual Values: ",list(some_labels))
print(tree_reg.score(some_data_prepared,some_labels))

forest_scores = cross_val_score(forest_reg,housing_prepared,housing_labels,scoring="neg_mean_squared_error",cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)

def display_scores(scores):
    print("Scores: ",scores)
    print("Mean :",scores.mean())
    print("Standard Deviation :",scores.std())

display_scores(forest_rmse_scores)

In [None]:
#Finetuning the models:

#01.Grid search
#02.Randomize search
#03.Ensemble methods

from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)


In [None]:
grid_search.best_params_
grid_search.best_estimator_

In [None]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
    
pd.DataFrame(grid_search.cv_results_)

In [None]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
#cat_encoder = cat_pipeline.named_steps["cat_encoder"] # old solution
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

In [59]:
from sklearn.metrics import mean_squared_error

final_model = grid_search.best_estimator_
housing = strat_train_set.drop("median_house_value",axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

housing_prepared=full_pipeline.fit_transform(housing)

housing_test = strat_test_set.drop("median_house_value",axis=1)
housing_test_lables = strat_test_set["median_house_value"].copy()

housing_test_prepared=full_pipeline.fit_transform(housing)

final_predictions = final_model.predict(housing_test_prepared)

final_score = final_model.score(housing_test_prepared,housing_labels)

print(final_score)

0.9724473581719917
