# Imputación de valores no disponibles utilizando KNN

## *Imports* y pasos previos

In [29]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder


In [30]:
housing = pd.read_csv("./data/housing.csv")

# Bucket median income into five categories; the split is stratified on these
# buckets so train and test keep the same income distribution.
income_strata = pd.cut(
    housing["median_income"],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
    labels=[1, 2, 3, 4, 5],
)

# Features are every column except the target "median_house_value".
X_train, X_test, y_train, y_test = train_test_split(
    housing.drop(columns="median_house_value"),
    housing["median_house_value"],
    test_size=0.2,
    stratify=income_strata,
    random_state=42,
)


# Boolean mask (aligned with X_train's index) flagging rows with at least one missing value.
null_rows_idx = X_train.isna().any(axis="columns")
X_train[null_rows_idx].head()  # preview the first rows that contain missing values

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
1606,-122.08,37.88,26.0,2947.0,,825.0,626.0,2.933,NEAR BAY
10915,-117.87,33.73,45.0,2264.0,,1970.0,499.0,3.4193,<1H OCEAN
19150,-122.7,38.35,14.0,2313.0,,954.0,397.0,3.7813,<1H OCEAN
4186,-118.23,34.13,48.0,1308.0,,835.0,294.0,4.2891,<1H OCEAN
16885,-122.4,37.58,26.0,3281.0,,1145.0,480.0,6.358,NEAR OCEAN


## Solución paso a paso

Aunque hay métodos para calcular el valor óptimo de k comparando distintos valores, en este caso vamos a seguir el criterio de que k sea igual a la raíz cuadrada del número de muestras, que es el [criterio que se suele seguir en la práctica](https://towardsdatascience.com/how-to-find-the-optimal-value-of-k-in-knn-35d936e554eb#:~:text=The%20optimal%20K%20value%20usually,be%20aware%20of%20the%20outliers.). Tratándose solo de una imputación y no de un modelo, no es necesario profundizar más en la optimización de k.

In [31]:
# Rule of thumb: k is the square root of the number of training rows, truncated to an integer.
n_train_samples = len(X_train)
k_value = np.sqrt(n_train_samples).astype(int)
k_value

128

Decidimos utilizar el algoritmo KNN para la imputación de valores no disponibles en la columna 'total_bedrooms'. Para ello, utilizamos la clase KNNImputer de scikit-learn.

Como KNN se basa en medidas de distancia, es importante normalizar los datos antes de aplicar el algoritmo, ya que si no, las características con órdenes de magnitud más grandes dominarán las distancias.

Para ello, utilizamos la clase StandardScaler de scikit-learn.

In [32]:
# Keep only the numeric columns; the categorical column is encoded separately.
X_train_num = X_train.select_dtypes(include="number")

scaler = StandardScaler()
scaler.set_output(transform="pandas")  # return a DataFrame instead of an ndarray

X_train_num_scaled = scaler.fit(X_train_num).transform(X_train_num)
X_train_num_scaled

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
12655,-0.941350,1.347438,0.027564,0.584777,0.635123,0.732602,0.556286,-0.893647
15502,1.171782,-1.192440,-1.722018,1.261467,0.775677,0.533612,0.721318,1.292168
2908,0.267581,-0.125972,1.220460,-0.469773,-0.545045,-0.674675,-0.524407,-0.525434
14053,1.221738,-1.351474,-0.370069,-0.348652,-0.038567,-0.467617,-0.037297,-0.865929
20496,0.437431,-0.635818,-0.131489,0.427179,0.269198,0.374060,0.220898,0.325752
...,...,...,...,...,...,...,...,...
15174,1.251711,-1.220505,-1.165333,1.890456,1.686854,0.543471,1.341519,0.637374
12661,-0.921368,1.342761,-1.085806,2.468471,2.149712,3.002174,2.451492,-0.557509
19263,-1.570794,1.310018,1.538566,-0.895802,-0.894007,-0.862013,-0.865118,-0.365475
19140,-1.560803,1.249211,-1.165333,0.249005,0.109257,-0.189747,0.010616,0.168261


El [método `set_output`](https://scikit-learn.org/stable/auto_examples/miscellaneous/plot_set_output.html) permite establecer que los métodos transformadores de scikit-learn devuelvan DataFrames en lugar de ndarrays.

Aplicamos también OneHotEncoder para las variables categóricas.

In [33]:
# Dense one-hot encoder configured to return a DataFrame.
cat_encoder = OneHotEncoder(sparse_output=False)
cat_encoder.set_output(transform="pandas")

# Double brackets keep the selection two-dimensional (a one-column DataFrame).
ocean_proximity = X_train[["ocean_proximity"]]
housing_cat_ohe = cat_encoder.fit_transform(ocean_proximity)
housing_cat_ohe

Unnamed: 0,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
12655,0.0,1.0,0.0,0.0,0.0
15502,0.0,0.0,0.0,0.0,1.0
2908,0.0,1.0,0.0,0.0,0.0
14053,0.0,0.0,0.0,0.0,1.0
20496,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...
15174,1.0,0.0,0.0,0.0,0.0
12661,0.0,1.0,0.0,0.0,0.0
19263,1.0,0.0,0.0,0.0,0.0
19140,1.0,0.0,0.0,0.0,0.0


Concatenamos los DataFrames:

In [34]:
# Join the scaled numeric columns and the one-hot columns side by side (aligned on the row index).
transformed_parts = [X_train_num_scaled, housing_cat_ohe]
X_train_tr1 = pd.concat(transformed_parts, axis="columns")
X_train_tr1

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
12655,-0.941350,1.347438,0.027564,0.584777,0.635123,0.732602,0.556286,-0.893647,0.0,1.0,0.0,0.0,0.0
15502,1.171782,-1.192440,-1.722018,1.261467,0.775677,0.533612,0.721318,1.292168,0.0,0.0,0.0,0.0,1.0
2908,0.267581,-0.125972,1.220460,-0.469773,-0.545045,-0.674675,-0.524407,-0.525434,0.0,1.0,0.0,0.0,0.0
14053,1.221738,-1.351474,-0.370069,-0.348652,-0.038567,-0.467617,-0.037297,-0.865929,0.0,0.0,0.0,0.0,1.0
20496,0.437431,-0.635818,-0.131489,0.427179,0.269198,0.374060,0.220898,0.325752,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15174,1.251711,-1.220505,-1.165333,1.890456,1.686854,0.543471,1.341519,0.637374,1.0,0.0,0.0,0.0,0.0
12661,-0.921368,1.342761,-1.085806,2.468471,2.149712,3.002174,2.451492,-0.557509,0.0,1.0,0.0,0.0,0.0
19263,-1.570794,1.310018,1.538566,-0.895802,-0.894007,-0.862013,-0.865118,-0.365475,1.0,0.0,0.0,0.0,0.0
19140,-1.560803,1.249211,-1.165333,0.249005,0.109257,-0.189747,0.010616,0.168261,1.0,0.0,0.0,0.0,0.0


Y con todo el resto de variables escaladas, aplicamos KNNImputer para imputar los valores no disponibles en la columna 'total_bedrooms'.

In [35]:
# KNN imputer using k_value neighbours, configured to return a DataFrame.
knn_imputer = KNNImputer(n_neighbors=k_value)
knn_imputer.set_output(transform="pandas")
X_train_imputed_a = knn_imputer.fit_transform(X_train_tr1)

# Sanity check: prints False when no missing values remain anywhere in the frame.
print(X_train_imputed_a.isna().to_numpy().any())
X_train_imputed_a

False


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
12655,-0.941350,1.347438,0.027564,0.584777,0.635123,0.732602,0.556286,-0.893647,0.0,1.0,0.0,0.0,0.0
15502,1.171782,-1.192440,-1.722018,1.261467,0.775677,0.533612,0.721318,1.292168,0.0,0.0,0.0,0.0,1.0
2908,0.267581,-0.125972,1.220460,-0.469773,-0.545045,-0.674675,-0.524407,-0.525434,0.0,1.0,0.0,0.0,0.0
14053,1.221738,-1.351474,-0.370069,-0.348652,-0.038567,-0.467617,-0.037297,-0.865929,0.0,0.0,0.0,0.0,1.0
20496,0.437431,-0.635818,-0.131489,0.427179,0.269198,0.374060,0.220898,0.325752,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15174,1.251711,-1.220505,-1.165333,1.890456,1.686854,0.543471,1.341519,0.637374,1.0,0.0,0.0,0.0,0.0
12661,-0.921368,1.342761,-1.085806,2.468471,2.149712,3.002174,2.451492,-0.557509,0.0,1.0,0.0,0.0,0.0
19263,-1.570794,1.310018,1.538566,-0.895802,-0.894007,-0.862013,-0.865118,-0.365475,1.0,0.0,0.0,0.0,0.0
19140,-1.560803,1.249211,-1.165333,0.249005,0.109257,-0.189747,0.010616,0.168261,1.0,0.0,0.0,0.0,0.0


In [36]:
X_train_imputed_a[null_rows_idx].head()  # the rows flagged by null_rows_idx, after imputation

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
1606,-1.251077,1.048079,-0.211016,0.151734,0.080575,-0.533051,0.343342,-0.494985,0.0,0.0,0.0,1.0,0.0
10915,0.852065,-0.89308,1.299986,-0.167671,-0.046215,0.493276,0.005292,-0.239693,1.0,0.0,0.0,0.0,0.0
19150,-1.560803,1.267921,-1.165333,-0.144756,-0.238985,-0.417421,-0.266212,-0.049654,1.0,0.0,0.0,0.0,0.0
4186,0.672224,-0.705981,1.538566,-0.614744,-0.577098,-0.524088,-0.540378,0.216926,1.0,0.0,0.0,0.0,0.0
16885,-1.410935,0.907754,-0.211016,0.307929,-0.168272,-0.246217,-0.045282,1.303035,0.0,0.0,0.0,0.0,1.0


## Creando un pipeline

In [37]:
# Same preprocessing as the step-by-step section, expressed as a single ColumnTransformer:
# numeric columns are standardized and object-dtype columns are one-hot encoded.
transformer1 = make_column_transformer(
    (StandardScaler(), make_column_selector(dtype_include=np.number)),
    # Dense output is requested explicitly (matching the step-by-step encoder) so the
    # imputer always receives a dense array, instead of depending on ColumnTransformer's
    # sparse_threshold density heuristic to densify the stacked result.
    (OneHotEncoder(sparse_output=False), make_column_selector(dtype_include=object)),
)

# Chain the preprocessing with the KNN imputation.
pipeline = make_pipeline(transformer1, KNNImputer(n_neighbors=k_value))

# Fit the whole pipeline on the training features and transform them in one call.
X_train_imputed_b = pipeline.fit_transform(X_train)

## Comprobación de la solución

Podemos también convertir de nuevo a DataFrame y comparar los resultados procesados por los dos métodos.

In [38]:
# Wrap the pipeline output in a DataFrame, labelled with the transformer's
# output feature names and the original row index.
X_train_imputed_b = pd.DataFrame(
    X_train_imputed_b,
    columns=transformer1.get_feature_names_out(),
    index=X_train.index,
)

# Check that no missing values remain.
print("¿Hay valores nulos?", X_train_imputed_b.isna().any().any())

# Compare the step-by-step result with the pipeline result. The values are floats,
# so use a tolerance-based comparison rather than exact element-wise equality.
print(
    "¿Son los resultados iguales paso a paso y con el pipeline?",
    np.allclose(X_train_imputed_a.to_numpy(), X_train_imputed_b.to_numpy()),
)

¿Hay valores nulos? False
¿Son los resultados iguales paso a paso y con el pipeline? True


## Entrenando el modelo y midiendo su rendimiento

In [39]:
# Preprocessing + imputation followed by a linear regression.
# The preprocessing pipeline is cloned so this model owns an independent, unfitted copy;
# fitting it does not refit the `pipeline` object used elsewhere in the notebook.
lin_reg = make_pipeline(clone(pipeline), LinearRegression())
lin_reg

In [40]:
# Fit on the training split, then report R2 on the training data and RMSE on the test data.
lin_reg.fit(X_train, y_train)
r2_train = lin_reg.score(X_train, y_train)
rmse_test = root_mean_squared_error(y_test, lin_reg.predict(X_test))
print("R2:", r2_train)
print("RMSE:", rmse_test)


R2: 0.644820381999992
RMSE: 67271.05185113841


In [41]:
# Preprocessing + imputation followed by a random forest (seeded for reproducibility).
# The preprocessing pipeline is cloned so this model owns an independent, unfitted copy
# instead of sharing (and refitting) the `pipeline` object used elsewhere in the notebook.
forest_reg = make_pipeline(clone(pipeline), RandomForestRegressor(random_state=42))
forest_reg.fit(X_train, y_train)

# R2 is measured on the training data; RMSE is measured on the held-out test data.
print("R2:", forest_reg.score(X_train, y_train))
print("RMSE:", root_mean_squared_error(y_test, forest_reg.predict(X_test)))

R2: 0.9748534272756013
RMSE: 47249.540167992134


In [42]:
# Preprocessing + imputation followed by a k-nearest-neighbours regressor (default settings).
# The preprocessing pipeline is cloned so this model owns an independent, unfitted copy
# instead of sharing (and refitting) the `pipeline` object used elsewhere in the notebook.
knn_reg = make_pipeline(clone(pipeline), KNeighborsRegressor())
knn_reg.fit(X_train, y_train)

# R2 is measured on the training data; RMSE is measured on the held-out test data.
print("R2:", knn_reg.score(X_train, y_train))
print("RMSE:", root_mean_squared_error(y_test, knn_reg.predict(X_test)))

R2: 0.8163098333472136
RMSE: 60129.65673454397
