# Exercises

In [1]:
# 1. Import core libraries
import numpy as np
import pandas as pd
import os
import tarfile
import urllib.request
import matplotlib.pyplot as plt

In [None]:
# 2. Download and extract the dataset
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_URL = DOWNLOAD_ROOT + "../data/housing.tgz"
HOUSING_PATH = os.path.join("..", "data", "housing")

In [3]:
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

fetch_housing_data()

In [4]:
# 3. Load dataset into a DataFrame
def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

housing = load_housing_data()

In [5]:
# 4. Create income category for stratified sampling
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
    labels=[1, 2, 3, 4, 5]
)

In [6]:
# 5. Stratified train/test split
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_idx, test_idx in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_idx]
    strat_test_set = housing.loc[test_idx]

# Drop the income_cat column
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

In [7]:
# 6. Separate features and target
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

In [8]:
# 7. Data preprocessing pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["population_per_household"] = housing["population"] / housing["households"]

# Define numerical and categorical columns
num_attribs = list(housing.drop("ocean_proximity", axis=1))
cat_attribs = ["ocean_proximity"]

# Numerical pipeline
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

# Full pipeline
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

In [9]:
# 8. Apply pipeline to features
housing_prepared = full_pipeline.fit_transform(housing)

## 1. Try a Support Vector Machine Regressor (SVR)

In [10]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

In [11]:
# Define hyperparameter grid for SVR with different kernels
param_grid = [
    {"kernel": ["linear"], "C": [1, 10, 100]},
    {"kernel": ["rbf"], "C": [1, 10, 100], "gamma": ["scale", 0.1, 0.01]}
]

In [12]:
svr = SVR()

In [13]:
# Perform grid search with cross-validation
grid_search_svr = GridSearchCV(
    svr, param_grid,
    cv=5,
    scoring="neg_mean_squared_error",
    verbose=2, n_jobs=-1
)

In [14]:
# Train the models with all hyperparameter combinations
grid_search_svr.fit(housing_prepared, housing_labels)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


0,1,2
,estimator,SVR()
,param_grid,"[{'C': [1, 10, ...], 'kernel': ['linear']}, {'C': [1, 10, ...], 'gamma': ['scale', 0.1, ...], 'kernel': ['rbf']}]"
,scoring,'neg_mean_squared_error'
,n_jobs,-1
,refit,True
,cv,5
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,tol,0.001
,C,100
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


In [15]:
# Show best combination of hyperparameters and RMSE score
print("Best Parameters (SVR):", grid_search_svr.best_params_)
svr_rmse = np.sqrt(-grid_search_svr.best_score_)
print("Best RMSE (SVR):", svr_rmse)

Best Parameters (SVR): {'C': 100, 'kernel': 'linear'}
Best RMSE (SVR): 71129.63004739223


## 2. Use RandomizedSearchCV instead of GridSearchCV

In [16]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal

In [17]:
# Define distribution ranges for hyperparameters
param_distributions = {
    "kernel": ["rbf"],
    "C": reciprocal(1, 100),
    "gamma": reciprocal(0.001, 0.1)
}

In [18]:
# Perform randomized search (20 random combinations)
rnd_search_svr = RandomizedSearchCV(
    SVR(), param_distributions,
    n_iter=20, cv=5,
    scoring="neg_mean_squared_error",
    random_state=42, n_jobs=-1
)

In [19]:
rnd_search_svr.fit(housing_prepared, housing_labels)

0,1,2
,estimator,SVR()
,param_distributions,"{'C': <scipy.stats....001BC270E1590>, 'gamma': <scipy.stats....001BC270E0810>, 'kernel': ['rbf']}"
,n_iter,20
,scoring,'neg_mean_squared_error'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,kernel,'rbf'
,degree,3
,gamma,np.float64(0....0401125610165)
,coef0,0.0
,tol,0.001
,C,np.float64(85.3618986286683)
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


In [20]:
# Show best result
print("Best Parameters (Randomized SVR):", rnd_search_svr.best_params_)
print("Best RMSE (Randomized SVR):", np.sqrt(-rnd_search_svr.best_score_))

Best Parameters (Randomized SVR): {'C': np.float64(85.3618986286683), 'gamma': np.float64(0.041380401125610165), 'kernel': 'rbf'}
Best RMSE (Randomized SVR): 101128.7381157618


## 3. Add a transformer to select only the most important attributes

In [21]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

In [22]:
# Train a temporary model only to get feature importances
temp_forest = RandomForestRegressor(random_state=42)
temp_forest.fit(housing_prepared, housing_labels)
feature_importances = temp_forest.feature_importances_

In [23]:
# Custom transformer to select top-k important features by index
class TopFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, feature_importances, k):
        self.feature_importances = feature_importances
        self.k = k

    def fit(self, X, y=None):
        # Find indices of top k features
        self.feature_indices_ = np.argsort(self.feature_importances)[-self.k:]
        return self

    def transform(self, X):
        # Return only the selected features
        return X[:, self.feature_indices_]

In [24]:
top_k = 5
selector = TopFeatureSelector(feature_importances, k=top_k)
housing_top_k = selector.fit_transform(housing_prepared)


In [25]:
# Get extra attributes added manually (e.g. engineered features)
extra_attribs = ["rooms_per_household", "population_per_household", "bedrooms_per_room"]

# Get category names from the encoder (OneHotEncoder)
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.get_feature_names_out())

# Combine all feature names in the same order as housing_prepared
all_feature_names = num_attribs + extra_attribs + cat_one_hot_attribs

# Get the indices of the top k features (from your selector)
top_feature_indices = selector.feature_indices_

# Get the corresponding names
top_feature_names = [all_feature_names[i] for i in top_feature_indices]

# Print them with their importances
for i in reversed(np.argsort(feature_importances)[-top_k:]):
    print(f"{all_feature_names[i]}: {feature_importances[i]:.5f}")

median_income: 0.47406
population_per_household: 0.13938
population_per_household: 0.12301
longitude: 0.05872
latitude: 0.05586


## 4. Create a full pipeline including preparation and prediction

In [26]:
from sklearn.pipeline import Pipeline

In [27]:
# Atribuições adicionais
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["population_per_household"] = housing["population"] / housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]

In [28]:
# Listas de atributos
num_attribs = list(housing.drop("ocean_proximity", axis=1))
cat_attribs = ["ocean_proximity"]

In [29]:
# Pipeline numérico
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

In [30]:
# Pipeline completo de preparação
full_preparation = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs)
])

In [31]:
# Treinar modelo para extrair importâncias
temp_model = RandomForestRegressor(random_state=42)
housing_prepared = full_preparation.fit_transform(housing)
temp_model.fit(housing_prepared, housing_labels)
feature_importances = temp_model.feature_importances_

In [32]:
# Criar pipeline final: preparação + seleção + modelo
full_pipeline_with_predictor = Pipeline([
    ("preparation", full_preparation),
    ("feature_selection", TopFeatureSelector(feature_importances, k=5)),
    ("regressor", RandomForestRegressor(n_estimators=30, max_features=6, random_state=42))
])

In [33]:
# Treinar pipeline com dados brutos
full_pipeline_with_predictor.fit(housing, housing_labels)

0,1,2
,steps,"[('preparation', ...), ('feature_selection', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,feature_importances,array([5.8721...26671990e-03])
,k,5

0,1,2
,n_estimators,30
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,6
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


## 5. Use GridSearchCV to explore preprocessing options (e.g., add or not a feature)

In [34]:
# Custom transformer with hyperparameter for including/excluding a derived feature
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        rooms_per_household = X[:, 3] / X[:, 6]
        population_per_household = X[:, 5] / X[:, 6]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, 4] / X[:, 3]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [35]:
# Updated numerical pipeline with attribute adder as a tunable step
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),  # configurable
    ('std_scaler', StandardScaler())
])

In [36]:
# Full pipeline
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs)
])

In [37]:
# Pipeline that includes both preparation and model
pipeline = Pipeline([
    ("preparation", full_pipeline),
    ("regressor", RandomForestRegressor())
])

In [38]:
# Grid search over preprocessing + model hyperparameters
param_grid = {
    "preparation__num__attribs_adder__add_bedrooms_per_room": [True, False],
    "regressor__n_estimators": [10, 30],
    "regressor__max_features": [4, 6, 8],
}

In [39]:
# Run grid search
grid_search = GridSearchCV(
    pipeline, param_grid, cv=3,
    scoring="neg_mean_squared_error", n_jobs=-1
)

In [40]:
grid_search.fit(housing, housing_labels)



0,1,2
,estimator,Pipeline(step...Regressor())])
,param_grid,"{'preparation__num__attr...__add_bedrooms_per_room': [True, False], 'regressor__max_features': [4, 6, ...], 'regressor__n_estimators': [10, 30]}"
,scoring,'neg_mean_squared_error'
,n_jobs,-1
,refit,True
,cv,3
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,add_bedrooms_per_room,True

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,4
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [41]:
print("Best parameters from combined grid search:", grid_search.best_params_)
print("Best RMSE:", np.sqrt(-grid_search.best_score_))

Best parameters from combined grid search: {'preparation__num__attribs_adder__add_bedrooms_per_room': True, 'regressor__max_features': 4, 'regressor__n_estimators': 10}
Best RMSE: nan
