![](logo1.jpg)

# **shAI Training 2023 | Level 1**

## Task #8 (End-to-End ML Project {part_2})

## Welcome to the exercises for reviewing the second part of the end-to-end ML project.
**Make sure that you read and understand Chapter 2 of the Hands-On ML book (page 72 to the end of the chapter) before starting this notebook.**

**If you get stuck on anything, reread that part of the book, and feel free to ask about it in the Messenger group as you go along.**

 ## Good Luck : )

## First, run the following cells, which reproduce the first part of the project, so that you can continue your work

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [None]:
import os
import tarfile
import urllib
# Where the archive is published, and the local directory it is unpacked into.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_URL = f"{DOWNLOAD_ROOT}datasets/housing/housing.tgz"
HOUSING_PATH = os.path.join("datasets", "housing")

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into *housing_path*.

    Args:
        housing_url: URL of the ``housing.tgz`` archive to download.
        housing_path: Directory that receives the archive and its contents;
            created if it does not exist.

    Raises:
        urllib.error.URLError: If the download fails.
        tarfile.TarError: If the downloaded file is not a readable archive.
    """
    # A bare `import urllib` does not guarantee that the `request` submodule
    # is loaded, so import it explicitly where it is used.
    import urllib.request

    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    # NOTE(review): the archive comes from the network; extractall() trusts
    # the member paths it contains.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)
    
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` found in *housing_path* and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))
   
# Download and unpack the archive, then load the CSV it contains.
fetch_housing_data()
housing = load_housing_data()

# Positional indices of the columns that CombinedAttributesAdder combines.
rooms_ix, bedrooms_ix, population_ix, household_ix = map(
    list(housing.columns).index,
    ("total_rooms", "total_bedrooms", "population", "households"),
)

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio columns computed from the rooms/bedrooms/population/household columns.

    Columns are located through the module-level ``rooms_ix``, ``bedrooms_ix``,
    ``population_ix`` and ``household_ix`` indices.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        When true, the bedrooms-to-rooms ratio is appended as a third column.
    """

    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return *X* with the ratio columns appended on the right."""
        rooms = X[:, rooms_ix]
        households = X[:, household_ix]
        # Order matters: rooms/household, population/household, then the
        # optional bedrooms/room column.
        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        return np.c_[(X, *extra_columns)]
        
# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
housing_labels = train_set["median_house_value"].copy()
housing = train_set.drop(columns="median_house_value")

# Everything except the categorical column goes through the numeric pipeline.
cat_attribs = ["ocean_proximity"]
housing_num = housing.drop(columns=cat_attribs)
num_attribs = housing_num.columns.tolist()

# Numeric branch: fill gaps with the median, add ratio features, standardize.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

# Numeric columns first, one-hot encoded category columns after them.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)

# 1- Select and Train a Model

# Let’s first train a LinearRegression model 

In [None]:
# CODE HERE
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

# Chain the preprocessing ColumnTransformer with the regressor so the fitted
# object accepts the raw `housing` DataFrame directly.
lin_reg = make_pipeline(full_pipeline, LinearRegression())

lin_reg.fit(housing, housing_labels)

# First try it out on a few instances from the training set:


In [None]:
# First five training rows and their labels, for a quick sanity check.
some_data = housing.head(5)
some_labels = housing_labels.head(5)
some_labels

14196    103000.0
8267     382100.0
17445    172600.0
14265     93400.0
2271      96500.0
Name: median_house_value, dtype: float64

In [None]:
# Predict for the same five rows so the values can be compared with `some_labels`.
lin_reg.predict(some_data)

array([181746.54359616, 290558.74973505, 244957.50017771, 146498.51061398,
       163230.42393939])

In [None]:
# Predictions for the whole training set, used to compute the training RMSE.
housing_predictions = lin_reg.predict(housing)

# measure this regression model’s RMSE on the whole training set 
* using Scikit-Learn’s mean_squared_error() function:

In [None]:
# CODE HERE

from sklearn.metrics import mean_squared_error

# Take the root explicitly: the `squared=False` argument was deprecated in
# scikit-learn 1.4 and removed in 1.6, while this form works on every version.
lin_rmse = np.sqrt(mean_squared_error(housing_labels, housing_predictions))

In [None]:
# Show the linear model's RMSE on the training set.
lin_rmse

67593.20745775253

# judge on the RMSE result for this model 
Write down your answer:

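The RMSE (about 67,600) is large compared with the house values being predicted (the sample labels above range from roughly 93,000 to 382,000), so the model is underfitting the training data.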

# Let’s train a Decision Tree Regressor model 
## more powerful model

In [None]:
from sklearn.tree import DecisionTreeRegressor 

In [None]:
# Same preprocessing as before, with a decision tree as the final estimator;
# random_state=42 pins the tree's randomness so results are repeatable.

tree_reg = make_pipeline(full_pipeline, DecisionTreeRegressor(random_state=42))

tree_reg.fit(housing, housing_labels)

# Now evaluate the model on the training set 
* using Scikit-Learn’s mean_squared_error() function:

In [None]:
# Score the tree on the same data it was fitted on (training error only).

housing_predictions = tree_reg.predict(housing)

# Take the root explicitly: the `squared=False` argument was deprecated in
# scikit-learn 1.4 and removed in 1.6, while this form works on every version.
tree_rmse = np.sqrt(mean_squared_error(housing_labels, housing_predictions))

tree_rmse

0.0

# Explain this result 
Write down your answer:

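An RMSE of 0 on the training set means the decision tree has memorized the training data (it is overfitting); it does not mean the model is perfect, as the cross-validation scores below show.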

# Evaluation Using Cross-Validation

1-split the training set into 10 distinct subsets then train and evaluate the Decision Tree model

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 10-fold cross-validation. The scorer yields negated RMSE values, hence the
# leading minus sign to get positive errors.

tree_rmses = -cross_val_score(
    tree_reg,
    housing,
    housing_labels,
    scoring="neg_root_mean_squared_error",
    cv=10,
)

2- display the resultant scores and calculate its Mean and Standard deviation

In [None]:
# Summary statistics (count, mean, std, quartiles) of the ten fold scores.

pd.Series(tree_rmses).describe()

count       10.000000
mean     68893.687971
std       2652.947532
min      65312.211113
25%      67219.522970
50%      68574.758465
75%      70415.073385
max      73925.150554
dtype: float64

3- Repeat the same steps to compute the same scores for the Linear Regression model

*notice the difference between the results of the two models*

In [None]:
# Same 10-fold evaluation for the linear model; the sign is flipped because
# the scorer reports negated RMSE.
lin_rmses = -cross_val_score(
    lin_reg,
    housing,
    housing_labels,
    scoring="neg_root_mean_squared_error",
    cv=10,
)

In [None]:
# Summary statistics of the linear model's ten fold scores.
pd.Series(lin_rmses).describe()

count       10.000000
mean     67828.386774
std       2601.596761
min      65000.673826
25%      65472.168399
50%      67762.593108
75%      68849.373294
max      72739.875560
dtype: float64

## Let’s train one last model the RandomForestRegressor.

In [None]:
# CODE HERE

from sklearn.ensemble import RandomForestRegressor

# Preprocessing followed by a random forest, evaluated with 10-fold
# cross-validation (negated scorer, so the sign is flipped).
forest_reg = make_pipeline(
    full_pipeline,
    RandomForestRegressor(random_state=42),
)

forest_rmses = -cross_val_score(
    forest_reg,
    housing,
    housing_labels,
    scoring="neg_root_mean_squared_error",
    cv=10,
)

# Repeat the same steps to compute the scores, their mean, and their standard deviation for the Random Forest model

In [None]:
# Summary statistics of the random forest's ten fold scores.
pd.Series(forest_rmses).describe()

count       10.000000
mean     49682.765146
std       2030.717612
min      47014.394040
25%      47755.304303
50%      49753.603087
75%      51381.992160
max      52777.365636
dtype: float64

# Save every model you experiment with 
*using the joblib library*

In [None]:
# CODE HERE
import joblib

# Persist the two fitted pipelines (preprocessing + model) to disk.
# NOTE(review): `forest_reg` is not saved here; it was only passed to
# cross_val_score and never fitted directly — confirm whether it should be
# fitted and dumped as well.
joblib.dump(lin_reg, "lin_housing_model.pkl")

joblib.dump(tree_reg, "tree_housing_model.pkl")

['tree_housing_model.pkl']

## now you have a shortlist of promising models. You now need to
## fine-tune them!
# Fine-Tune Your Model

## 1- Grid Search
## evaluate all the possible combinations of hyperparameter values for the RandomForestRegressor 
*It may take a long time*

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Wrap preprocessing and the forest in a single estimator so the grid search
# refits the whole chain on every fold.
from sklearn.model_selection import GridSearchCV

# A separate name is used on purpose: rebinding `full_pipeline` would make this
# cell fail when run a second time (the "preprocessing" step would then itself
# end in a regressor) and would leave no transformer-only pipeline available.
forest_pipeline = Pipeline([
    ("preprocessing", full_pipeline),
    ("random_forest", RandomForestRegressor(random_state=42)),
])

# NOTE(review): 6 and 8 appear in both sub-grids, so those two settings are
# evaluated twice; a single list [4, 6, 8, 10] would cover the same values.
param_grid = [
    {"random_forest__max_features": [4, 6, 8]},
    {"random_forest__max_features": [6, 8, 10]},
]

grid_search = GridSearchCV(
    forest_pipeline,
    param_grid,
    cv=3,
    scoring="neg_root_mean_squared_error",
)

grid_search.fit(housing, housing_labels)

Display the best combination of hyperparameters, along with the evaluation scores

In [None]:
# Hyperparameter combination with the best mean cross-validation score.
grid_search.best_params_

{'random_forest__max_features': 8}

In [None]:
# Raw per-candidate timings and per-split scores collected by the grid search.
grid_search.cv_results_

{'mean_fit_time': array([ 3.81051461,  5.13343366,  6.63828158,  5.63230189,  6.45716786,
        10.16372554]),
 'std_fit_time': array([0.41899651, 0.27977162, 0.38083016, 0.09886919, 0.42915952,
        2.02239157]),
 'mean_score_time': array([0.17188422, 0.17770712, 0.17636228, 0.16940864, 0.16638637,
        0.2082382 ]),
 'std_score_time': array([0.00311688, 0.02107007, 0.01562258, 0.01047088, 0.00178518,
        0.0601224 ]),
 'param_random_forest__max_features': masked_array(data=[4, 6, 8, 6, 8, 10],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'random_forest__max_features': 4},
  {'random_forest__max_features': 6},
  {'random_forest__max_features': 8},
  {'random_forest__max_features': 6},
  {'random_forest__max_features': 8},
  {'random_forest__max_features': 10}],
 'split0_test_score': array([-49591.40923836, -49596.55656636, -49652.53132367, -49596.55656636,
        -49652.53132367, -50174.776

In [None]:
# Tabulate the search results, best candidate first (scores are negated RMSE,
# so the highest value is the best).
cv_res = pd.DataFrame(grid_search.cv_results_).sort_values(
    by="mean_test_score", ascending=False
)

# Analyze the Best Models and Their Errors
1-indicate the relative importance of each attribute

In [None]:
# Best pipeline found by the grid search; look up its fitted forest by step
# name to read the per-feature importances.
final_model = grid_search.best_estimator_
feature_importances = final_model.named_steps["random_forest"].feature_importances_

2-display these importance scores next to their corresponding attribute names:

In [None]:
# Importances rounded to two decimals for easier reading.
np.round(feature_importances, 2)

array([0.07, 0.06, 0.04, 0.01, 0.01, 0.01, 0.01, 0.38, 0.05, 0.11, 0.06,
       0.01, 0.16, 0.  , 0.  , 0.  ])

In [None]:
# The forest is trained on the *transformed* matrix, so the names must follow
# its column order: numeric inputs, the ratio columns added by
# CombinedAttributesAdder, then one column per ocean_proximity category.
# Zipping against `housing.columns` (9 names for 16 importances) would
# truncate the list and attach values to the wrong names.
extra_attribs = ["rooms_per_household", "population_per_household", "bedrooms_per_room"]
cat_encoder = final_model["preprocessing"].named_transformers_["cat"]
attributes = num_attribs + extra_attribs + list(cat_encoder.categories_[0])
sorted(zip(feature_importances, attributes), reverse=True)

[(0.37779368093177956, 'median_income'),
 (0.06776419637846413, 'longitude'),
 (0.0639286434402403, 'latitude'),
 (0.050419717570566576, 'ocean_proximity'),
 (0.04256529997899305, 'housing_median_age'),
 (0.014739282663824007, 'total_rooms'),
 (0.014431838912060527, 'total_bedrooms'),
 (0.014326224034151203, 'population'),
 (0.013123577144554508, 'households')]

## Now is the time to evaluate the final model on the test set.
# Evaluate Your System on the Test Set

1-get the predictors and the labels from your test set

In [None]:
# NOTE(review): `housing` is the training predictors (train_set without the
# label column), so neither name below refers to held-out test data.

housing_predictors = housing

# Self-assignment: leaves `housing_labels` (the training labels) unchanged.
housing_labels = housing_labels

In [None]:
# CODE HERE
from sklearn.model_selection import train_test_split

# Evaluate on the held-out `test_set` from the original split. Re-splitting the
# training data would score the model on rows it was already fitted on and
# give an over-optimistic error.
X_train, y_train = housing, housing_labels
X_test = test_set.drop("median_house_value", axis=1)
y_test = test_set["median_house_value"].copy()

2- Run the test data through your full pipeline (the fitted final model already includes the preprocessing step) to get predictions

In [None]:
# `final_model` is the best pipeline from the grid search (preprocessing +
# random forest), so the raw predictors are passed in untransformed.
final_predictions = final_model.predict(X_test)

3-evaluate the final model on the test set

In [None]:
# Take the root explicitly: the `squared=False` argument was deprecated in
# scikit-learn 1.4 and removed in 1.6, while this form works on every version.
final_rmse = np.sqrt(mean_squared_error(y_test, final_predictions))
print(final_rmse)

18181.424578495164


# compute a 95% confidence interval for the generalization error 
*using scipy.stats.t.interval():*

In [None]:
from scipy import stats

In [None]:
# t-interval around the mean squared error; the square root converts the
# bounds back to RMSE units.
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2

mse_bounds = stats.t.interval(
    confidence,
    len(squared_errors) - 1,
    loc=squared_errors.mean(),
    scale=stats.sem(squared_errors),
)
np.sqrt(mse_bounds)

array([17334.34626837, 18990.75666884])

# Great Job!
# #shAI_Club