# Ejercicios del capítulo 1.

### Carga de los datos de los ejercicios.

In [63]:
# Función para recuperar los datos de las casas del repositorio de Github del libro.

import os
import tarfile
import urllib

DOWNLOAD_ROOT = "https://github.com/ageron/handson-ml2/blob/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz?raw=true"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

In [64]:
# Ya lanzada, descomentar para volver a recuperar los datos.

#fetch_housing_data()

In [65]:
#Función para cargar los datos de las casas en un DataFrame de Pandas.

import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

In [66]:
# Cargamos los datos y los guardamos en la variable housing.

housing = load_housing_data()

### Preparación de los datos.

In [67]:
# Categorizamos los datos según el nivel de ingresos.
import numpy as np

housing["income_cat"] = pd.cut(housing["median_income"], bins=[0., 1.5, 3.0, 4.5, 6., np.inf], labels=[1, 2, 3, 4, 5])

In [68]:
# Función para realizar el sampleo estratificado de los datos basándonos en la categoría de ingresos.

from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

In [69]:
# Después podemos eliminar la categoría de ingresos para que los datos regresen a su estado original.

for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

In [70]:
# Procedemos a preparar los datos para los algoritmos de ML, primero, revertiremos los datos a su estado limpio.

housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()
housing_num = housing.drop("ocean_proximity", axis=1)

In [71]:
# Scikit-learn provee una clase para lidiar con valores faltantes: SimpleImputer.

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")

In [72]:
# Como la mediana sólo se puede calcular de atributos numéricos, 
# creamos una copia de los datos sin el atributo ocean_proximity.

housing_num = housing.drop("ocean_proximity", axis=1)

In [73]:
# Ajustamos la instancia de SimpleImputer a los datos usando el método fit().

imputer.fit(housing_num)

SimpleImputer(strategy='median')

In [74]:
# Podemos crear nuestros propios transformadores y para que funcionen correctamente en pipelines de Scikit-learn, tendremos
# que crear una clase que implemente los métodos fit() (devolviendo self), transform(), y fit_transform(). 

from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        return self # nothing else to do

    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [75]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

In [76]:
# Para ejecutar pipelines de datos categóricos junto con datos numéricos, tenemos la clase ColumnTransformer.
# Realizamos one-hot encoding con la clase OneHotEncoder de Scikit-learn.

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])
housing_prepared = full_pipeline.fit_transform(housing)

### Ejercicio 1: Entrenar un modelo de Regresión por Máquinas de vectores de soporte (SVM).

In [77]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

param_grid = [
        {'kernel': ['linear'], 'C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.0]},
        {'kernel': ['rbf'], 'C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],
         'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]},
    ]

svm_reg = SVR()
grid_search = GridSearchCV(svm_reg, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2)
grid_search.fit(housing_prepared, housing_labels)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END ..............................C=10.0, kernel=linear; total time=   3.7s
[CV] END ..............................C=10.0, kernel=linear; total time=   3.6s
[CV] END ..............................C=10.0, kernel=linear; total time=   3.6s
[CV] END ..............................C=10.0, kernel=linear; total time=   3.6s
[CV] END ..............................C=10.0, kernel=linear; total time=   3.5s
[CV] END ..............................C=30.0, kernel=linear; total time=   3.5s
[CV] END ..............................C=30.0, kernel=linear; total time=   3.5s
[CV] END ..............................C=30.0, kernel=linear; total time=   3.6s
[CV] END ..............................C=30.0, kernel=linear; total time=   3.6s
[CV] END ..............................C=30.0, kernel=linear; total time=   3.5s
[CV] END .............................C=100.0, kernel=linear; total time=   3.5s
[CV] END .............................C=100.0, 

GridSearchCV(cv=5, estimator=SVR(),
             param_grid=[{'C': [10.0, 30.0, 100.0, 300.0, 1000.0, 3000.0,
                                10000.0, 30000.0],
                          'kernel': ['linear']},
                         {'C': [1.0, 3.0, 10.0, 30.0, 100.0, 300.0, 1000.0],
                          'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0],
                          'kernel': ['rbf']}],
             scoring='neg_mean_squared_error', verbose=2)

In [78]:
negative_mse = grid_search.best_score_
rmse = np.sqrt(-negative_mse)
rmse

70286.61836530612

In [79]:
# Esto tarda bastante ya que probará muchas combinaciones del Bosque aleatorio.
# Una vez se ha completado podemos ver la mejor combinación con la variable best_params_ de la instancia de RandomizedSearchCV.

grid_search.best_params_

{'C': 30000.0, 'kernel': 'linear'}

### Ejercicio 2: Realizar la búsqueda de hiperparámetros mediante búsqueda aleatoria.

In [80]:
# Podemos probar múltiples combinaciones de hiperparámetros con la clase RandomizedSearchCV de Scikit-learn.
# Le decimos los rangos de valores con los que queremos experimentar y 
# aplicará validación cruzada entre posibles combinaciones de hiperparámetros.

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, expon

param_distributions = {
    'kernel': ['linear', 'rbf'],
    'C': reciprocal(20, 200000),
    'gamma': expon(scale=1.0),
}
svm_reg = SVR()
rnd_search_cv = RandomizedSearchCV(svm_reg, param_distributions=param_distributions, n_iter=50, cv=5, scoring='neg_mean_squared_error', verbose=2, random_state=42)
rnd_search_cv.fit(housing_prepared, housing_labels)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END C=629.782329591372, gamma=3.010121430917521, kernel=linear; total time=   3.8s
[CV] END C=629.782329591372, gamma=3.010121430917521, kernel=linear; total time=   3.7s
[CV] END C=629.782329591372, gamma=3.010121430917521, kernel=linear; total time=   3.7s
[CV] END C=629.782329591372, gamma=3.010121430917521, kernel=linear; total time=   3.6s
[CV] END C=629.782329591372, gamma=3.010121430917521, kernel=linear; total time=   3.7s
[CV] END C=26290.206464300216, gamma=0.9084469696321253, kernel=rbf; total time=   6.5s
[CV] END C=26290.206464300216, gamma=0.9084469696321253, kernel=rbf; total time=   6.5s
[CV] END C=26290.206464300216, gamma=0.9084469696321253, kernel=rbf; total time=   6.5s
[CV] END C=26290.206464300216, gamma=0.9084469696321253, kernel=rbf; total time=   6.6s
[CV] END C=26290.206464300216, gamma=0.9084469696321253, kernel=rbf; total time=   6.6s
[CV] END C=84.14107900575871, gamma=0.059838768608680676, 

RandomizedSearchCV(cv=5, estimator=SVR(), n_iter=50,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x0000020314FA03A0>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x0000020305FD1DC0>,
                                        'kernel': ['linear', 'rbf']},
                   random_state=42, scoring='neg_mean_squared_error',
                   verbose=2)

In [81]:
negative_mse = rnd_search_cv.best_score_
rmse = np.sqrt(-negative_mse)
rmse

54751.69009256622

In [82]:
rnd_search_cv.best_params_

{'C': 157055.10989448498, 'gamma': 0.26497040005002437, 'kernel': 'rbf'}

### Ejercicio 3: Creación de un pipeline de preparación para seleccionar los atributos más importantes.

In [83]:
# Podemos probar múltiples combinaciones de hiperparámetros con la clase GridSearchCV de Scikit-learn.
# Le decimos los valores con los que queremos experimentar y aplicará validación cruzada entre todas las combinaciones posibles.

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [84]:
# Algunos modelos especifican la importancia de sus atributos, podemos ver la de los del bosque aleatorio 
# con la variable feature_importances_ del modelo.

feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([7.11471272e-02, 6.30611968e-02, 4.42343681e-02, 1.73112291e-02,
       1.67061940e-02, 1.79409977e-02, 1.75487432e-02, 3.30103443e-01,
       7.19083056e-02, 1.08634945e-01, 7.09369672e-02, 1.33317197e-02,
       1.49729115e-01, 4.89925091e-05, 2.17505786e-03, 5.18159767e-03])

In [85]:
# Vamos a mostrar las importancias junto con el nombre del atributo.

extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

[(0.3301034430057969, 'median_income'),
 (0.1497291151750564, 'INLAND'),
 (0.1086349451454764, 'pop_per_hhold'),
 (0.07190830563531257, 'rooms_per_hhold'),
 (0.07114712717526209, 'longitude'),
 (0.07093696723202743, 'bedrooms_per_room'),
 (0.06306119676153553, 'latitude'),
 (0.04423436810461348, 'housing_median_age'),
 (0.017940997749779235, 'population'),
 (0.01754874315983213, 'households'),
 (0.017311229103130314, 'total_rooms'),
 (0.016706193973157892, 'total_bedrooms'),
 (0.013331719744811241, '<1H OCEAN'),
 (0.005181597670005312, 'NEAR OCEAN'),
 (0.002175057855094517, 'NEAR BAY'),
 (4.899250910857068e-05, 'ISLAND')]

In [86]:
# Función para seleccionar los atributos más importantes de los datos.

from sklearn.base import BaseEstimator, TransformerMixin

def indices_of_top_k(arr, k):
    return np.sort(np.argpartition(np.array(arr), -k)[-k:])

class TopFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, feature_importances, k):
        self.feature_importances = feature_importances
        self.k = k
    def fit(self, X, y=None):
        self.feature_indices_ = indices_of_top_k(self.feature_importances, self.k)
        return self
    def transform(self, X):
        return X[:, self.feature_indices_]

In [87]:
# Nos quedamos con las 5 características más importantes.

k = 5
top_k_feature_indices = indices_of_top_k(feature_importances, k)
top_k_feature_indices

array([ 0,  7,  8,  9, 12], dtype=int64)

In [88]:
# Veamos de qué características se trata.

sorted(zip(feature_importances, attributes), reverse=True)[:k]

[(0.3301034430057969, 'median_income'),
 (0.1497291151750564, 'INLAND'),
 (0.1086349451454764, 'pop_per_hhold'),
 (0.07190830563531257, 'rooms_per_hhold'),
 (0.07114712717526209, 'longitude')]

In [89]:
# Ahora podemos crear un pipeline que incluya la selección de características.

preparation_and_feature_selection_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k))
])

In [90]:
# Veamos las características que se quedan.

housing_prepared_top_k_features = preparation_and_feature_selection_pipeline.fit_transform(housing)
housing_prepared[0:3, top_k_feature_indices]

array([[-0.94135046, -0.8936472 ,  0.01739526,  0.00622264,  1.        ],
       [ 1.17178212,  1.292168  ,  0.56925554, -0.04081077,  0.        ],
       [ 0.26758118, -0.52543365, -0.01802432, -0.07537122,  1.        ]])

### Ejercicio 4: Crear un pipeline que realice la preparación de los datos, el ajuste y una predicción.

In [91]:
# Ahora podemos entrenar un modelo con las características seleccionadas.

prepare_select_and_predict_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k)),
    ('svm_reg', SVR(**rnd_search_cv.best_params_))
])
prepare_select_and_predict_pipeline.fit(housing, housing_labels)

Pipeline(steps=[('preparation',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('attribs_adder',
                                                                   CombinedAttributesAdder()),
                                                                  ('std_scaler',
                                                                   StandardScaler())]),
                                                  ['longitude', 'latitude',
                                                   'housing_median_age',
                                                   'total_rooms',
                                                   'total_bedrooms',
                                                   'population', 'households',
                    

In [92]:
some_data = housing.iloc[:4]
some_labels = housing_labels.iloc[:4]

print("Predicciones:\t", prepare_select_and_predict_pipeline.predict(some_data))
print("Etiquetas:\t\t", list(some_labels))

Predicciones:	 [ 85179.52149426 295946.15585365  96459.22300706 149949.64345676]
Etiquetas:		 [72100.0, 279600.0, 82700.0, 112500.0]


### Ejercicio 5: Probar algunas opciones de preparación usando GridSearchCV.

In [100]:
# Para eliminar los warnings causados por la categoría ISLAND que no aparece en el conjunto de los atributos seleccionados.
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

full_pipeline.named_transformers_["cat"].handle_unknown = 'ignore'

param_grid = [{
    'preparation__num__imputer__strategy': ['mean', 'median', 'most_frequent'],
    'feature_selection__k': list(range(1, len(feature_importances) + 1))
}]

grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline, param_grid, cv=5,
                                scoring='neg_mean_squared_error', verbose=2)
grid_search_prep.fit(housing, housing_labels)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END feature_selection__k=1, preparation__num__imputer__strategy=mean; total time=   4.1s
[CV] END feature_selection__k=1, preparation__num__imputer__strategy=mean; total time=   5.8s
[CV] END feature_selection__k=1, preparation__num__imputer__strategy=mean; total time=   5.9s
[CV] END feature_selection__k=1, preparation__num__imputer__strategy=mean; total time=   6.0s
[CV] END feature_selection__k=1, preparation__num__imputer__strategy=mean; total time=   5.9s
[CV] END feature_selection__k=1, preparation__num__imputer__strategy=median; total time=   4.2s
[CV] END feature_selection__k=1, preparation__num__imputer__strategy=median; total time=   6.0s
[CV] END feature_selection__k=1, preparation__num__imputer__strategy=median; total time=   5.9s
[CV] END feature_selection__k=1, preparation__num__imputer__strategy=median; total time=   6.0s
[CV] END feature_selection__k=1, preparation__num__imputer__strategy=median; total t

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preparation',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='median')),
                                                                                         ('attribs_adder',
                                                                                          CombinedAttributesAdder()),
                                                                                         ('std_scaler',
                                                                                          StandardScaler())]),
                                                                         ['longitude',
                                                                          'latitude',
       

In [94]:
grid_search_prep.best_params_

{'feature_selection__k': 1, 'preparation__num__imputer__strategy': 'mean'}