# Optimización de hiperparámetros

## Pasos previos

### Definición del *pipeline*

In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Categorical branch: impute first, then one-hot encode.
cat_pipeline = make_pipeline(
    # Fill missing values with the most frequent value of each column.
    SimpleImputer(strategy="most_frequent"),
    # One-hot encode; handle_unknown="ignore" avoids errors on unseen categories.
    OneHotEncoder(handle_unknown="ignore"),
)

In [3]:
class ClusterSimilarity(BaseEstimator, TransformerMixin):
    """Transformer that outputs the RBF similarity of each sample to k-means centroids.

    Parameters
    ----------
    n_clusters : int, default=10
        Number of clusters fitted by the internal ``KMeans``.
    gamma : float, default=1.0
        ``gamma`` value passed to ``rbf_kernel`` (RBF kernel bandwidth).
    random_state : int or None, default=None
        Seed forwarded to ``KMeans``.
    """

    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Fit a ``KMeans`` model on ``X`` (optionally weighted); ``y`` is not used."""
        self.kmeans_ = KMeans(
            n_clusters=self.n_clusters,
            n_init=10,
            random_state=self.random_state,
        )
        self.kmeans_.fit(X, sample_weight=sample_weight)
        # Returning self lets the estimator be chained inside pipelines.
        return self

    def transform(self, X):
        """Return the RBF kernel between ``X`` and the fitted cluster centers."""
        centers = self.kmeans_.cluster_centers_
        return rbf_kernel(X, centers, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        """Return one output name per cluster; ``names`` is not used."""
        feature_names = []
        for i in range(self.n_clusters):
            feature_names.append(f"Cluster {i} similarity")
        return feature_names

# Instance used for the geographic columns of the preprocessing step.
cluster_simil = ClusterSimilarity(
    n_clusters=10,
    gamma=1.0,
    random_state=42,
)

In [4]:
def column_ratio(X):
    """Divide column 0 of ``X`` by column 1, element-wise.

    List-style column indexing is used so the result keeps a column
    dimension instead of collapsing to 1-D.
    """
    numerator = X[:, [0]]
    denominator = X[:, [1]]
    return numerator / denominator

def ratio_name(function_transformer, feature_names_in):
    """Feature-name callback: always reports a single output column named "ratio".

    Both arguments are accepted but not used.
    """
    output_names = ["ratio"]
    return output_names

def ratio_pipeline():
    """Build a new pipeline: median imputation -> column ratio -> standardization.

    A fresh pipeline is created on every call.
    """
    imputer = SimpleImputer(strategy="median")
    ratio = FunctionTransformer(column_ratio, feature_names_out=ratio_name)
    scaler = StandardScaler()
    return make_pipeline(imputer, ratio, scaler)

# Log branch: median imputation, natural logarithm, then standardization.
log_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    # "one-to-one" keeps the input feature names as output names.
    FunctionTransformer(func=np.log, feature_names_out="one-to-one"),
    StandardScaler(),
)

# Default numeric branch: median imputation followed by standardization.
default_num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

In [5]:
preprocessing = ColumnTransformer(
    transformers=[
        # New feature: total_bedrooms / total_rooms.
        ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
        # New feature: total_rooms / households.
        ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
        # New feature: population / households.
        ("people_per_house", ratio_pipeline(), ["population", "households"]),
        # Logarithm of the selected columns (to turn skewed distributions
        # into more normal-looking ones).
        (
            "log",
            log_pipeline,
            ["total_bedrooms", "total_rooms", "population", "households", "median_income"],
        ),
        # Similarity to the geographic clusters.
        ("geo", cluster_simil, ["latitude", "longitude"]),
        # Categorical pipeline applied to every object-dtype column.
        ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
    ],
    # Columns not listed above go through the default numeric pipeline
    # (one column remaining: housing_median_age).
    remainder=default_num_pipeline,
)

In [6]:
# End-to-end estimator: preprocessing followed by a random forest regressor.
full_pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessing),
        # The forest itself runs single-threaded (n_jobs=1).
        ("random_forest", RandomForestRegressor(n_jobs=1, random_state=42)),
    ]
)

### Importación y preparación de datos

In [7]:
# Load the housing dataset from the local data directory.
housing = pd.read_csv(filepath_or_buffer="./data/housing.csv")

In [8]:
# Full feature matrix and target. "income_cat" is also dropped (if present) so
# that re-running this cell, after the helper column below has been added to
# `housing`, yields the same X as the first run.
X = housing.drop(columns=["median_house_value", "income_cat"], errors="ignore")
y = housing["median_house_value"]

# Helper column: median_income bucketed into 5 categories, used only to
# stratify the train/test split.
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

strat_train_set, strat_test_set = train_test_split(
    housing, test_size=0.2, stratify=housing["income_cat"], random_state=42)

# Remove the helper column by reassignment rather than with inplace=True:
# the split outputs are derived from `housing`, and mutating them in place
# is the pattern pandas warns about (SettingWithCopyWarning).
strat_train_set = strat_train_set.drop(columns="income_cat")
strat_test_set = strat_test_set.drop(columns="income_cat")

# Separate features and target for each split.
X_train = strat_train_set.drop(columns="median_house_value")
y_train = strat_train_set["median_house_value"].copy()

X_test = strat_test_set.drop(columns="median_house_value")
y_test = strat_test_set["median_house_value"].copy()

## Hiperparámetros relevantes

Para el pipeline de preprocesamiento:

| Hiperparámetro      | Descripción                                                 |
|---------------------|-------------------------------------------------------------|
| `n_clusters`        | Número de clusters correspondientes a zonas geográficas.    |
| `gamma`             | Velocidad de caída de la similitud con el centroide.        |
| `strategy`          | Estrategia de imputación de valores no disponibles (por defecto, la media).        |


Para RandomForestRegressor:

| Hiperparámetro      | Descripción |
|---------------------|-------------|
| `n_estimators`     | Número de árboles en el bosque. Más árboles pueden mejorar la precisión pero aumentan el tiempo de cómputo. |
| `max_depth`        | Profundidad máxima de cada árbol. Un valor bajo puede llevar a *underfitting*, mientras que un valor alto puede llevar a *overfitting*. |
| `max_features`     | Número de *features* consideradas en cada división. Puede ser un número entero, un porcentaje, `"sqrt"` o `"log2"`. Un menor número de *features* puede reducir la varianza (y con ello el *overfitting*). |
| `min_samples_split` | Número mínimo de muestras necesarias para dividir un nodo. Valores más altos reducen el *overfitting*. |
| `min_samples_leaf`  | Número mínimo de muestras en una hoja. Valores más altos suavizan la predicción. |
| `max_samples`      | Número (si es entero) o fracción (si es decimal) de muestras utilizadas para entrenar cada árbol. Útil para reducir *overfitting*. |

## Ajuste de hiperparámetros

### 1ª Iteración

Vamos a empezar por una búsqueda aleatoria preliminar con un amplio espectro de valores.

In [9]:
%%time

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint, uniform

param_dist = {
    'preprocessing__geo__n_clusters': randint(low=3, high=200),
    'random_forest__n_estimators': randint(100, 500),  # Cualquier entero entre 100 y 499
    'random_forest__max_depth': randint(10, 110),      # Cualquier entero entre 10 y 109
    'random_forest__min_samples_split': randint(2, 20),
    'random_forest__min_samples_leaf': randint(1, 20),
    'random_forest__max_features': ['sqrt', 'log2', None]
}

rnd_search = RandomizedSearchCV(
    estimator = full_pipeline, 
    param_distributions=param_dist, 
    n_iter=40, 
    cv=5,
    scoring='neg_root_mean_squared_error',
    random_state=42,
    n_jobs=-1   # Usar todos los núcleos del CPU en paralelo
    )

_ = rnd_search.fit(X_train, y_train)

  _data = np.array(data, dtype=dtype, copy=copy,


CPU times: user 38.6 s, sys: 4.54 s, total: 43.1 s
Wall time: 59min


`%%time` es un [comando mágico de Jupyter](https://ipython.readthedocs.io/en/stable/interactive/magics.html#magic-time) que mide el tiempo que tarda en ejecutarse la celda.

Podemos ver los resultados de los mejores modelos encontrados:

In [10]:
# Build the results table in a single chain (no in-place sort and no column
# assignment on a column-subset frame, which can raise SettingWithCopyWarning).
cv_res = (
    pd.DataFrame(rnd_search.cv_results_)
    .sort_values(by="mean_test_score", ascending=False)  # best candidates first
    .rename(columns={
        "param_preprocessing__geo__n_clusters": "n_clusters",
        "param_random_forest__n_estimators": "n_estimators",
        "param_random_forest__max_depth": "max_depth",
        "param_random_forest__min_samples_split": "min_samples_split",
        "param_random_forest__min_samples_leaf": "min_samples_leaf",
        "param_random_forest__max_features": "max_features",
    })
    .loc[:, ["n_clusters", "n_estimators", "max_depth", "min_samples_split",
             "min_samples_leaf", "max_features", "mean_test_score"]]
    # The search scores with neg_root_mean_squared_error, so the sign is
    # flipped to show a positive, rounded integer value.
    .assign(mean_test_score=lambda df: -df["mean_test_score"].round().astype(np.int64))
)
cv_res.head()

Unnamed: 0,n_clusters,n_estimators,max_depth,min_samples_split,min_samples_leaf,max_features,mean_test_score
34,56,225,96,2,1,sqrt,41574
3,132,413,73,13,1,sqrt,42508
25,145,259,52,14,4,sqrt,43156
17,165,330,44,11,5,sqrt,43172
21,103,104,56,2,3,log2,43476


Ahora podemos ir haciendo sucesivas pasadas, fijando aquellos hiperparámetros en los que todos los mejores resultados han convergido a un mismo valor y acotando el diccionario de valores de prueba en torno a los mejores resultados.

### 2ª Iteración


In [11]:
%%time

full_pipeline.set_params(random_forest__max_features="sqrt") # Fijamos el valor de max_features, que ha convergido a "sqrt"

param_dist = {
    'preprocessing__geo__n_clusters': randint(low=55, high=150),
    'random_forest__n_estimators': randint(200, 300),
    'random_forest__max_depth': randint(44, 97),
    'random_forest__min_samples_split': randint(2, 14),
    'random_forest__min_samples_leaf': randint(1, 5),
}

rnd_search = RandomizedSearchCV(
    estimator = full_pipeline, 
    param_distributions=param_dist, 
    n_iter=40, 
    cv=5,
    scoring='neg_root_mean_squared_error',
    random_state=42,
    n_jobs=-1   # Usar todos los núcleos del CPU en paralelo
    )

_ = rnd_search.fit(X_train, y_train)



CPU times: user 34 s, sys: 793 ms, total: 34.8 s
Wall time: 10min


In [12]:
# Build the results table in a single chain (no in-place sort and no column
# assignment on a column-subset frame, which can raise SettingWithCopyWarning).
cv_res = (
    pd.DataFrame(rnd_search.cv_results_)
    .sort_values(by="mean_test_score", ascending=False)  # best candidates first
    .rename(columns={
        "param_preprocessing__geo__n_clusters": "n_clusters",
        "param_random_forest__n_estimators": "n_estimators",
        "param_random_forest__max_depth": "max_depth",
        "param_random_forest__min_samples_split": "min_samples_split",
        "param_random_forest__min_samples_leaf": "min_samples_leaf",
    })
    .loc[:, ["n_clusters", "n_estimators", "max_depth", "min_samples_split",
             "min_samples_leaf", "mean_test_score"]]
    # The search scores with neg_root_mean_squared_error, so the sign is
    # flipped to show a positive, rounded integer value.
    .assign(mean_test_score=lambda df: -df["mean_test_score"].round().astype(np.int64))
)
cv_res.head()

Unnamed: 0,n_clusters,n_estimators,max_depth,min_samples_split,min_samples_leaf,mean_test_score
6,76,290,87,2,1,41604
14,89,249,57,5,1,41678
27,116,243,67,4,1,41747
9,118,206,46,4,1,41846
8,134,254,58,4,2,41848
