![](logo1.jpg)

# **shAI Training 2023 | Level 1**

## Task #8 (End-to-End ML Project {part_2})

## Welcome to the exercises for reviewing the second part of the end-to-end ML project.
**Make sure that you read and understand ch. 2 of the Hands-On ML book (page 72 to the end of the chapter) before starting this notebook.**

**If you get stuck on anything, reread that part of the book, and feel free to ask about anything in the messenger group as you go along.**

 ## Good Luck : )

## First, run the following cells from the first part of the project to continue your work

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [None]:
import os
import tarfile
import urllib
# Base URL of the handson-ml2 repository that hosts the dataset archive.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
# Full URL of the compressed housing archive.
HOUSING_URL = f"{DOWNLOAD_ROOT}datasets/housing/housing.tgz"
# Local directory where the archive is stored and extracted.
HOUSING_PATH = os.path.join("datasets", "housing")

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the ``housing.tgz`` archive to download.
        housing_path: Directory that receives the archive and its extracted
            contents; it is created if it does not exist.

    Returns:
        None. The archive is saved as ``housing.tgz`` inside ``housing_path``
        and extracted into the same directory.
    """
    # `import urllib` alone does not guarantee that the `request` submodule is
    # loaded, so import it explicitly before using it.
    import urllib.request

    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only use it on archives from a trusted source.
        housing_tgz.extractall(path=housing_path)
    
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))
   
# Download and unpack the archive, then load the CSV it contains.
fetch_housing_data()
housing = load_housing_data()

# Positional indices of the columns that CombinedAttributesAdder divides to
# build its ratio features.
rooms_ix, bedrooms_ix, population_ix, household_ix = map(
    list(housing.columns).index,
    ("total_rooms", "total_bedrooms", "population", "households"))

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio columns to a 2-D array.

    The added columns are rooms per household, population per household and,
    when ``add_bedrooms_per_room`` is true, bedrooms per room. Column positions
    are read from the module-level ``rooms_ix``, ``bedrooms_ix``,
    ``population_ix`` and ``household_ix``.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Stateless transformer: fitting learns nothing and returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right."""
        rooms = X[:, rooms_ix]
        households = X[:, household_ix]
        derived = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            derived.append(X[:, bedrooms_ix] / rooms)
        # np.c_ stacks the 1-D ratio arrays as extra columns after X.
        return np.c_[(X, *derived)]
        
# Hold out 20% of the rows as a test set; the fixed seed keeps the split stable.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# Separate the target from the predictors in the training split.
housing_labels = train_set["median_house_value"].copy()
housing = train_set.drop(columns="median_house_value")

# Column groups: ocean_proximity is one-hot encoded, every other predictor
# goes through the numeric pipeline.
cat_attribs = ["ocean_proximity"]
housing_num = housing.drop(columns=cat_attribs)
num_attribs = housing_num.columns.tolist()

# Numeric columns: impute with the median, add ratio features, standardise.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

# Route numeric and categorical columns through their own transformers.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

# Fit the whole pipeline on the training predictors and transform them.
housing_prepared = full_pipeline.fit_transform(housing)

In [None]:
# Split the held-out test set into predictors and labels.
X_test = test_set.drop("median_house_value", axis=1)
y_test = test_set["median_house_value"].copy()

X_test_housing_num = X_test.drop("ocean_proximity", axis=1)

# Reuse the pipeline that was already fitted on the training data. Building a
# new pipeline and calling fit_transform() here would learn the imputer
# medians, scaling statistics and one-hot categories from the test set, and
# would also replace the train-fitted `full_pipeline` that later cells use to
# transform the test data.
X_test_prepared = full_pipeline.transform(X_test)

In [None]:
# Inspect the training split (the target column is still included here).
train_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,103000.0,NEAR OCEAN
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,382100.0,NEAR OCEAN
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,172600.0,NEAR OCEAN
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,93400.0,NEAR OCEAN
2271,-119.80,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,96500.0,INLAND
...,...,...,...,...,...,...,...,...,...,...
11284,-117.96,33.78,35.0,1330.0,201.0,658.0,217.0,6.3700,229200.0,<1H OCEAN
11964,-117.43,34.02,33.0,3084.0,570.0,1753.0,449.0,3.0500,97800.0,INLAND
5390,-118.38,34.03,36.0,2101.0,569.0,1756.0,527.0,2.9344,222100.0,<1H OCEAN
860,-121.96,37.58,15.0,3575.0,597.0,1777.0,559.0,5.7192,283500.0,<1H OCEAN


# 1- Select and Train a Model

# Let’s first train a LinearRegression model 

In [None]:
# Fit an ordinary linear regression on the prepared training data.
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

In [None]:
# R^2 of the linear model on the same data it was trained on.
lin_train_score = lin_reg.score(housing_prepared, housing_labels)
print('Linear Regression Train Score is : ' , lin_train_score)

Linear Regression Train Score is :  0.6582199160539229


# First try it out on a few instances from the training set:


In [None]:
# First five prepared rows and their labels, printed as a quick sanity sample.
some_data = housing_prepared[:5]
some_labels = housing_labels.iloc[:5]
print(
    f'X_train data is  :-{some_data}',
    '################################################################################',
    f'y_train data is  :-{some_labels}',
    sep='\n',
)

X_train data is  :-[[ 1.27258656 -1.3728112   0.34849025  0.22256942  0.21122752  0.76827628
   0.32290591 -0.326196   -0.17491646  0.05137609 -0.2117846   0.
   0.          0.          0.          1.        ]
 [ 0.70916212 -0.87669601  1.61811813  0.34029326  0.59309419 -0.09890135
   0.6720272  -0.03584338 -0.40283542 -0.11736222  0.34218528  0.
   0.          0.          0.          1.        ]
 [-0.44760309 -0.46014647 -1.95271028 -0.34259695 -0.49522582 -0.44981806
  -0.43046109  0.14470145  0.08821601 -0.03227969 -0.66165785  0.
   0.          0.          0.          1.        ]
 [ 1.23269811 -1.38217186  0.58654547 -0.56148971 -0.40930582 -0.00743434
  -0.38058662 -1.01786438 -0.60001532  0.07750687  0.78303162  0.
   0.          0.          0.          1.        ]
 [-0.10855122  0.5320839   1.14200767 -0.11956547 -0.25655915 -0.48587717
  -0.31496232 -0.17148831  0.3490073  -0.06883176 -0.55036364  0.
   1.          0.          0.          0.        ]]
#########################

In [None]:
# Predict on the whole training set (reused for the RMSE below) and compare
# the first five predictions with the corresponding true labels.
housing_predictions = lin_reg.predict(housing_prepared)
print(np.round(housing_predictions[:5], 2))
print(housing_labels.iloc[:5])

[181746.54 290558.75 244957.5  146498.51 163230.42]
14196    103000.0
8267     382100.0
17445    172600.0
14265     93400.0
2271      96500.0
Name: median_house_value, dtype: float64


# measure this regression model’s RMSE on the whole training set 
* using Scikit-Learn’s mean_squared_error() function:

In [None]:
# Training-set error of the linear model: MSE first, then its square root so
# the error is in the same units as the labels.
from sklearn.metrics import mean_squared_error
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)


In [None]:
# Report the training-set RMSE of the linear regression model.
print(f'Root Mean Squared Error is : {lin_rmse}')

Root Mean Squared Error is : 67593.20745775253


# judge on the RMSE result for this model 
write down your answer

# A root mean squared error (RMSE) of 67593.20745775253 for a linear regression model suggests that the model is not performing well on the data so it is underfitting the data 

your answer goes here

# Let’s train a Decision Tree Regressor model 
## more powerful model

In [None]:
from sklearn.tree import DecisionTreeRegressor 

In [None]:
# Decision tree regressor with a fixed seed so repeated runs build the same tree.
tree_reg = DecisionTreeRegressor(random_state=42)
# Fit the model to the prepared training data.
tree_reg.fit(housing_prepared, housing_labels)

In [None]:
# R^2 of the decision tree on the same data it was trained on.
tree_train_score = tree_reg.score(housing_prepared, housing_labels)
print('Decision Tree Regressor  Train Score is : ' , tree_train_score)

Decision Tree Regressor  Train Score is :  1.0


In [None]:
# Predict on the training set with the decision tree.
# Note: this reassigns `housing_predictions`, which previously held the linear
# model's predictions.
housing_predictions = tree_reg.predict(housing_prepared)


# Now evaluate the model on the training set 
* using Scikit-Learn’s mean_squared_error() function:

In [None]:
# Training-set RMSE of the decision tree.
# `housing_predictions` holds the tree's predictions at this point (it was
# reassigned in the previous cell).
from sklearn.metrics import mean_squared_error
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
print(f'Root Mean Squared Error is : {tree_rmse}')

Root Mean Squared Error is : 0.0


# Explain this result 
write down your answer

# Model has badly overfit the data.

# RMSE on the training set is 0, this could indicate that the model is overfitting the training data



your answer goes here

# Evaluation Using Cross-Validation

1-split the training set into 10 distinct subsets then train and evaluate the Decision Tree model

In [None]:
 from sklearn.model_selection import cross_val_score

In [None]:
# 10-fold cross-validation of the decision tree. The scorer returns negated
# MSE values, so they are sign-flipped before taking the square root later.
scores = cross_val_score(
    estimator=tree_reg,
    X=housing_prepared,
    y=housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)

2- display the resultant scores and calculate its Mean and Standard deviation

In [None]:
# Convert the negated MSE of each fold into an RMSE, then summarise.
tree_rmse_scores = np.sqrt(-scores)

for label, value in (
    ("Scores:", tree_rmse_scores),
    ("Mean:", tree_rmse_scores.mean()),
    ("Standard deviation:", tree_rmse_scores.std()),
):
    print(label, value)


Scores: [65312.86044031 70581.69865676 67849.75809965 71460.33789358
 74035.29744574 65562.42978503 67964.10942543 69102.89388457
 66876.66473025 69735.84760006]
Mean: 68848.18979613911
Standard deviation: 2579.6785558576307


3-repeat the same steps to compute the same scores for the Linear Regression model

*notice the difference between the results of the two models*

In [None]:
# 10-fold cross-validation of the linear model.
# Note: this reassigns `scores`, which previously held the decision tree's
# fold scores.
scores = cross_val_score(
    estimator=lin_reg,
    X=housing_prepared,
    y=housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)


In [None]:
# Per-fold RMSE of the linear model and its summary statistics.
lin_rmse_scores = np.sqrt(-scores)
lin_rmse_mean = lin_rmse_scores.mean()
lin_rmse_std = lin_rmse_scores.std()
print("Scores: ", lin_rmse_scores)
print("Mean: ", lin_rmse_mean)
print("Standard Deviation: ", lin_rmse_std)

Scores:  [65000.67382615 70960.56056304 67122.63935124 66089.63153865
 68402.54686442 65266.34735288 65218.78174481 68525.46981754
 72739.87555996 68957.34111906]
Mean:  67828.38677377408
Standard Deviation:  2468.0913950652275


## Let’s train one last model the RandomForestRegressor.

In [None]:
# Random forest: estimate out-of-sample RMSE with 10-fold cross-validation.
from sklearn.ensemble import RandomForestRegressor

# Fixed seed so the scores are reproducible across runs (same seed as the
# other seeded estimators in this notebook).
forest_reg = RandomForestRegressor(random_state=42)
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)

# cross_val_score() fits clones and leaves `forest_reg` itself unfitted, so
# train it on the full training set; otherwise the model saved later in the
# notebook would be an untrained estimator. The trailing semicolon suppresses
# the estimator repr in the cell output.
forest_reg.fit(housing_prepared, housing_labels);

# repeat the same steps to compute the same scores its Mean and Standard deviation for the Random Forest model

In [None]:
# Summarise the random forest's per-fold RMSE.
for label, value in (
    ("Scores: ", forest_rmse_scores),
    ("Mean: ", forest_rmse_scores.mean()),
    ("Standard Deviation: ", forest_rmse_scores.std()),
):
    print(label, value)

Scores:  [47082.10689477 51976.25221174 49862.50714409 51771.34607965
 52540.9666248  47414.37393998 47496.79311489 51010.90889166
 49390.23130508 49939.38191518]
Mean:  49848.48681218382
Standard Deviation:  1901.3713250543096


# Save every model you experiment with 
*using the joblib library*

In [None]:
# Persist each model object to a .joblib file in the current working directory.

from joblib import dump

# save the Linear Regression model
dump(lin_reg, "lin_reg_model.joblib")

# save the Decision Tree Regressor model
dump(tree_reg, "tree_reg_model.joblib")

# save the Random Forest Regressor model
# NOTE(review): cross_val_score() fits clones of the estimator it is given, so
# `forest_reg` is only a trained model here if fit() was called on it directly
# before this cell; confirm, otherwise an unfitted estimator is saved.
dump(forest_reg, "forest_reg_model.joblib")

['forest_reg_model.joblib']

## now you have a shortlist of promising models. You now need to
## fine-tune them!
# Fine-Tune Your Model

## 1- Grid Search
## evaluate all the possible combinations of hyperparameter values for the RandomForestRegressor 
*It may take a long time*

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Two search spaces: 3 x 4 combinations with the default bootstrap setting,
# then 2 x 3 combinations with bootstrap disabled.
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'max_features': [2, 3, 4], 'n_estimators': [3, 10]},
]

forest_reg = RandomForestRegressor(random_state=42)

# 5-fold cross-validation per combination, scored by negated MSE; training
# scores are kept in cv_results_ as well.
grid_search = GridSearchCV(
    estimator=forest_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    return_train_score=True,
)

grid_search.fit(housing_prepared, housing_labels)


with the evaluation scores

In [None]:
# Best mean cross-validated score; negative because the search was scored
# with 'neg_mean_squared_error'.
grid_search.best_score_

-2467358190.4802885

# Analyze the Best Models and Their Errors

---


1-indicate the relative importance of each attribute

In [None]:
# Hyperparameter combination that achieved the best cross-validated score.
grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}

In [None]:
# Estimator configured with the best hyperparameters found by the search.
grid_search.best_estimator_

In [None]:
# Re-evaluate the tuned random forest with 10-fold cross-validation.
from sklearn.ensemble import RandomForestRegressor

# Take the hyperparameters straight from the grid search instead of copying
# them by hand, so this cell stays correct if the grid or the data changes.
forest_reg = RandomForestRegressor(random_state=42, **grid_search.best_params_)
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                                scoring="neg_mean_squared_error", cv=10)
# The scorer returns negated MSE per fold; flip the sign and take the root.
forest_rmse_scores = np.sqrt(-forest_scores)

In [None]:
# Summarise the tuned random forest's per-fold RMSE.
forest_rmse_mean = forest_rmse_scores.mean()
forest_rmse_std = forest_rmse_scores.std()
print("Scores: ", forest_rmse_scores)
print("Mean: ", forest_rmse_mean)
print("Standard Deviation: ", forest_rmse_std)

Scores:  [47434.65944602 50515.51213132 48237.53031018 50531.06763401
 52039.68041997 46904.59875957 47460.06174483 50140.20719581
 48817.09594064 50114.21545463]
Mean:  49219.46290369915
Standard Deviation:  1605.2330735523904


2-display these importance scores next to their corresponding attribute names:

In [None]:
# Print the cross-validated RMSE of every hyperparameter combination tried.
# NOTE(review): the prompt above asks for feature importances next to the
# attribute names; this cell lists the grid-search results instead.
cvres = grid_search.cv_results_

# mean_test_score holds negated MSE values; convert them all to RMSE at once.
rmse_per_candidate = np.sqrt(-cvres["mean_test_score"])
for rmse, params in zip(rmse_per_candidate, cvres["params"]):
    print(rmse, params)

64878.27480854276 {'max_features': 2, 'n_estimators': 3}
55391.003575336406 {'max_features': 2, 'n_estimators': 10}
52721.66494842234 {'max_features': 2, 'n_estimators': 30}
58541.12715494087 {'max_features': 4, 'n_estimators': 3}
51623.59366665994 {'max_features': 4, 'n_estimators': 10}
49787.65951361993 {'max_features': 4, 'n_estimators': 30}
58620.88234614251 {'max_features': 6, 'n_estimators': 3}
51645.862673140065 {'max_features': 6, 'n_estimators': 10}
49917.66994061786 {'max_features': 6, 'n_estimators': 30}
58640.96129790229 {'max_features': 8, 'n_estimators': 3}
51650.365581628095 {'max_features': 8, 'n_estimators': 10}
49672.50940389753 {'max_features': 8, 'n_estimators': 30}
61580.24110015614 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
53889.80996032937 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
58667.89389226964 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
52764.2630869393 {'bootstrap': False, 'max_features': 3, 'n_estimators': 

## Now is the time to evaluate the final model on the test set.
# Evaluate Your System on the Test Set

1-get the predictors and the labels from your test set

In [None]:
# The refitted best estimator from the grid search is the final model.
final_model = grid_search.best_estimator_

# Split the held-out test set into labels and raw (untransformed) predictors.
y_test = test_set["median_house_value"].copy()
X_test = test_set.drop(columns="median_house_value")

2-run your full_pipeline to transform the data

In [None]:
# Apply the already-fitted pipeline to the test predictors; transform() (not
# fit_transform()) so nothing is re-learned from the test set in this cell.
X_test_prepared = full_pipeline.transform(X_test)


3-evaluate the final model on the test set

In [None]:
# Predict on the prepared test set with the final model.
final_predictions = final_model.predict(X_test_prepared)

# Test-set MSE and its square root (RMSE, in the units of the labels).
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

print("Final Root Mean Squared Error:", final_rmse)

Final Root Mean Squared Error: 68502.71773483063


# compute a 95% confidence interval for the generalization error 
*using scipy.stats.t.interval():*

In [None]:
from scipy import stats

In [None]:
# 95% confidence interval for the generalization RMSE.
#
# The interval is built on the per-sample *squared* errors (whose mean is the
# MSE) and then square-rooted, so it is expressed in the same units as the
# RMSE. An interval on the signed errors would describe the mean bias of the
# predictions, not the generalization error.
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2

# Standard error of the mean squared error (uses the sample std, ddof=1).
std_error = stats.sem(squared_errors)

# t-interval around the MSE, converted to RMSE bounds as plain floats.
mse_interval = stats.t.interval(confidence, len(squared_errors) - 1,
                                loc=squared_errors.mean(), scale=std_error)
conf_interval = tuple(float(bound) for bound in np.sqrt(mse_interval))

print(f"95% Confidence Interval: {conf_interval}")


95% Confidence Interval: (20795.052240549714, 24738.06137379137)


# Great Job!
# #shAI_Club