# Imputación de valores no disponibles utilizando KNN

## *Imports* y pasos previos

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [2]:
housing = pd.read_csv("./data/housing.csv")

X_train, X_test, y_train, y_test = train_test_split(
    housing.drop(columns="median_house_value"), # features
    housing["median_house_value"], # target
    stratify=pd.cut(housing["median_income"], bins=[0., 1.5, 3.0, 4.5, 6., np.inf], labels=[1, 2, 3, 4, 5]),
    test_size=0.2, random_state=42
    )


null_rows_idx = X_train.isnull().any(axis=1) # índices de las filas con valores nulos
X_train.loc[null_rows_idx].head().T # visualizamos las primeras filas con valores nulos

Unnamed: 0,1606,10915,19150,4186,16885
longitude,-122.08,-117.87,-122.7,-118.23,-122.4
latitude,37.88,33.73,38.35,34.13,37.58
housing_median_age,26.0,45.0,14.0,48.0,26.0
total_rooms,2947.0,2264.0,2313.0,1308.0,3281.0
total_bedrooms,,,,,
population,825.0,1970.0,954.0,835.0,1145.0
households,626.0,499.0,397.0,294.0,480.0
median_income,2.933,3.4193,3.7813,4.2891,6.358
ocean_proximity,NEAR BAY,<1H OCEAN,<1H OCEAN,<1H OCEAN,NEAR OCEAN


## Solución paso a paso

### Planteamiento

Decidimos utilizar el algoritmo KNN para la imputación de valores no disponibles en la columna 'total_bedrooms'. Para ello, utilizamos la clase `KNNImputer` de scikit-learn.

Como KNN se basa en medidas de distancia, es importante normalizar los datos antes de aplicar el algoritmo, ya que si no, las características con órdenes de magnitud más grandes dominarán las distancias.

Para ello, utilizamos la clase `StandardScaler` de scikit-learn.

[Primero trataremos las variables categóricas con el OneHotEncoder para después escarlar todas juntas](https://datascience.stackexchange.com/questions/31652/should-one-hot-vectors-be-scaled-with-numerical-attributes).

### Selección del número de vecinos k

Aunque hay métodos para calcular el valor optimo de k comparando distintos valores, en este caso vamos a seguir el criterio de que k sea igual a la raíz cuadrada del número de muestras, que es el [criterio que se suele seguir en la práctica](https://towardsdatascience.com/how-to-find-the-optimal-value-of-k-in-knn-35d936e554eb#:~:text=The%20optimal%20K%20value%20usually,be%20aware%20of%20the%20outliers.). Tratándose solo de una imputación y no de de un modelo, no es necesario profundizar más en la optimización de k.

In [3]:
k_value = np.sqrt(housing.shape[0]).astype(int)
k_value

143

### Transformación de variables categóricas

In [4]:
cat_encoder = OneHotEncoder(sparse_output=False).set_output(transform="pandas")  # Para que el resultado sea un DataFrame
X_train_cat_ohe = cat_encoder.fit_transform(X_train[["ocean_proximity"]])
X_train_cat_ohe.head().T

Unnamed: 0,12655,15502,2908,14053,20496
ocean_proximity_<1H OCEAN,0.0,0.0,0.0,0.0,1.0
ocean_proximity_INLAND,1.0,0.0,1.0,0.0,0.0
ocean_proximity_ISLAND,0.0,0.0,0.0,0.0,0.0
ocean_proximity_NEAR BAY,0.0,0.0,0.0,0.0,0.0
ocean_proximity_NEAR OCEAN,0.0,1.0,0.0,1.0,0.0


el [método `set_output`](https://scikit-learn.org/stable/auto_examples/miscellaneous/plot_set_output.html) permite establecer que los métodos transformadores de scikit-learn devuelvan DataFrames en luegar de ndarrays.

sustituimos la columna categórica por las columnas que genera el OneHotEncoder.

In [5]:
X_train_ohe = pd.concat([X_train.drop(columns="ocean_proximity"), X_train_cat_ohe], axis=1)
X_train_ohe.head().T

Unnamed: 0,12655,15502,2908,14053,20496
longitude,-121.46,-117.23,-119.04,-117.13,-118.7
latitude,38.52,33.09,35.37,32.75,34.28
housing_median_age,29.0,7.0,44.0,24.0,27.0
total_rooms,3873.0,5320.0,1618.0,1877.0,3536.0
total_bedrooms,797.0,855.0,310.0,519.0,646.0
population,2237.0,2015.0,667.0,898.0,1837.0
households,706.0,768.0,300.0,483.0,580.0
median_income,2.1736,6.3373,2.875,2.2264,4.4964
ocean_proximity_<1H OCEAN,0.0,0.0,0.0,0.0,1.0
ocean_proximity_INLAND,1.0,0.0,1.0,0.0,0.0


### Normalización de los datos
<!-- TODO: normalización vs escalado -->

Ahora que todas las columnas son numéricas, podemos normalizarlas.

In [6]:
scaler = StandardScaler().set_output(transform="pandas")
X_train_tr1 = scaler.fit_transform(X_train_ohe)
X_train_tr1.head().T

Unnamed: 0,12655,15502,2908,14053,20496
longitude,-0.94135,1.171782,0.267581,1.221738,0.437431
latitude,1.347438,-1.19244,-0.125972,-1.351474,-0.635818
housing_median_age,0.027564,-1.722018,1.22046,-0.370069,-0.131489
total_rooms,0.584777,1.261467,-0.469773,-0.348652,0.427179
total_bedrooms,0.635123,0.775677,-0.545045,-0.038567,0.269198
population,0.732602,0.533612,-0.674675,-0.467617,0.37406
households,0.556286,0.721318,-0.524407,-0.037297,0.220898
median_income,-0.893647,1.292168,-0.525434,-0.865929,0.325752
ocean_proximity_<1H OCEAN,-0.887683,-0.887683,-0.887683,-0.887683,1.126529
ocean_proximity_INLAND,1.46218,-0.68391,1.46218,-0.68391,-0.68391


Y con todas las variables escaladas, aplicamos `KNNImputer` para imputar los valores no disponibles en la columna 'total_bedrooms'.

In [7]:
X_train_imputed_a = KNNImputer(n_neighbors=k_value).set_output(transform="pandas").fit_transform(X_train_tr1)
print(X_train_imputed_a.isna().any().any()) # Verificamos que no hay valores nulos
X_train_imputed_a.loc[null_rows_idx].head().T # visualizamos las filas que tenían valores nulos

False


Unnamed: 0,1606,10915,19150,4186,16885
longitude,-1.251077,0.852065,-1.560803,0.672224,-1.410935
latitude,1.048079,-0.89308,1.267921,-0.705981,0.907754
housing_median_age,-0.211016,1.299986,-1.165333,1.538566,-0.211016
total_rooms,0.151734,-0.167671,-0.144756,-0.614744,0.307929
total_bedrooms,0.074263,-0.055293,-0.233332,-0.579345,-0.189661
population,-0.533051,0.493276,-0.417421,-0.524088,-0.246217
households,0.343342,0.005292,-0.266212,-0.540378,-0.045282
median_income,-0.494985,-0.239693,-0.049654,0.216926,1.303035
ocean_proximity_<1H OCEAN,-0.887683,1.126529,1.126529,1.126529,-0.887683
ocean_proximity_INLAND,-0.68391,-0.68391,-0.68391,-0.68391,-0.68391


## Usando un *pipeline*

### Definición del pipeline

In [8]:
pipeline = make_pipeline(
    make_column_transformer(
        (OneHotEncoder(sparse_output=False), ["ocean_proximity"]), # OneHotEncoder a la columna "ocean_proximity"
        remainder='passthrough' # las columnas restantes pasan sin cambios
    ),
    StandardScaler(),
    KNNImputer(n_neighbors=k_value)
    )

pipeline

### Transformación

In [9]:
X_train_imputed_b = pipeline.fit_transform(X_train) # Aplicamos el pipeline a los datos de entrenamiento

In [10]:
np.any(np.isnan(X_train_imputed_b)) # Verificamos que no hay valores nulos

False

In [11]:
pd.DataFrame( # podemos convertir el resultado a DataFrame (aunque no sería necesario)
    X_train_imputed_b,
    columns=pipeline.get_feature_names_out(),
    index=X_train.index
    ).T

Unnamed: 0,12655,15502,2908,14053,20496,1481,18125,5830,17989,4861,...,12396,16476,2271,6980,5206,15174,12661,19263,19140,19773
onehotencoder__ocean_proximity_<1H OCEAN,-0.887683,-0.887683,-0.887683,-0.887683,1.126529,-0.887683,1.126529,1.126529,1.126529,1.126529,...,-0.887683,-0.887683,-0.887683,1.126529,1.126529,1.126529,-0.887683,1.126529,1.126529,-0.887683
onehotencoder__ocean_proximity_INLAND,1.46218,-0.68391,1.46218,-0.68391,-0.68391,-0.68391,-0.68391,-0.68391,-0.68391,-0.68391,...,1.46218,1.46218,1.46218,-0.68391,-0.68391,-0.68391,1.46218,-0.68391,-0.68391,1.46218
onehotencoder__ocean_proximity_ISLAND,-0.011006,-0.011006,-0.011006,-0.011006,-0.011006,-0.011006,-0.011006,-0.011006,-0.011006,-0.011006,...,-0.011006,-0.011006,-0.011006,-0.011006,-0.011006,-0.011006,-0.011006,-0.011006,-0.011006,-0.011006
onehotencoder__ocean_proximity_NEAR BAY,-0.354889,-0.354889,-0.354889,-0.354889,-0.354889,2.817783,-0.354889,-0.354889,-0.354889,-0.354889,...,-0.354889,-0.354889,-0.354889,-0.354889,-0.354889,-0.354889,-0.354889,-0.354889,-0.354889,-0.354889
onehotencoder__ocean_proximity_NEAR OCEAN,-0.384217,2.602693,-0.384217,2.602693,-0.384217,-0.384217,-0.384217,-0.384217,-0.384217,-0.384217,...,-0.384217,-0.384217,-0.384217,-0.384217,-0.384217,-0.384217,-0.384217,-0.384217,-0.384217,-0.384217
remainder__longitude,-0.94135,1.171782,0.267581,1.221738,0.437431,-1.231094,-1.226099,0.632259,-1.186134,0.647246,...,1.641367,-0.846434,-0.112083,0.782126,0.647246,1.251711,-0.921368,-1.570794,-1.560803,-1.28105
remainder__latitude,1.347438,-1.19244,-0.125972,-1.351474,-0.635818,1.085499,0.790817,-0.673238,0.762752,-0.757433,...,-0.921145,1.165016,0.533555,-0.78082,-0.79953,-1.220505,1.342761,1.310018,1.249211,2.025674
remainder__housing_median_age,0.027564,-1.722018,1.22046,-0.370069,-0.131489,-0.051963,-0.449595,0.584248,-0.926753,0.027564,...,-1.324386,0.504722,1.140933,0.584248,0.981881,-1.165333,-1.085806,1.538566,-1.165333,-0.131489
remainder__total_rooms,0.584777,1.261467,-0.469773,-0.348652,0.427179,-0.661977,0.74752,-0.435167,-0.604456,-0.985591,...,1.134266,-0.007267,-0.112488,-0.54787,-0.78871,1.890456,2.468471,-0.895802,0.249005,-0.721836
remainder__total_bedrooms,0.635123,0.775677,-0.545045,-0.038567,0.269198,-0.685599,0.329782,-0.658943,-0.639556,-0.741336,...,0.744173,0.363708,-0.251821,-0.753453,-0.673483,1.686854,2.149712,-0.894007,0.109257,-0.7583


## Comparación de las soluciones

Observamos que el `ColumnTransformer` ha movido al principio las columnas que ha transformado. El orden de las *features* no es relevante para un entrenamiento, y de serlo para ulteriores preprocesados, podríamos utilizar los índices explícitos de las columnas en el dataframe. Sin embargo, hemos de tener esto en cuenta a la hora de comparar si los dos resultados son iguales.

In [12]:
# Comprobamos que X_train_imputed_a y X_train_imputed_b son iguales
X_train_imputed_a = np.roll(X_train_imputed_a, 5, axis=1) # Desplazamos 5 columnas a la derecha (las correspondientes al OneHotEncoding)
X_train_imputed_a

print("¿Son los resultados iguales paso a paso y con el pipeline?",(X_train_imputed_a==X_train_imputed_b).all())

¿Son los resultados iguales paso a paso y con el pipeline? False


El redondeado en los dos procesos no ha sido idéntico, pero podemos ir haciendo pruebas y ver que sí lo es para 10 cifras significativas en el peor de los casos, lo que es suficiente para considerar que los resultados son iguales. Las diferencias se deben a la forma en la que se almacenan los números reales (coma flotante) y a pequeñas variaciones en el orden de las operaciones.

In [13]:
print(X_train_imputed_a[0,0])
print(X_train_imputed_b[0,0])
print(np.round(X_train_imputed_a[0,0], 12))
print(np.round(X_train_imputed_b[0,0], 12))

print("¿Son los resultados iguales paso a paso y con el pipeline?", 
      (np.round(X_train_imputed_a,9) == np.round(X_train_imputed_b,9))
      .all())

-0.8876826622917703
-0.887682662291862
-0.887682662292
-0.887682662292
¿Son los resultados iguales paso a paso y con el pipeline? True


El método `allclose` de numpy nos permite comparar dos arrays de forma que se consideren iguales si la diferencia entre sus elementos es menor que una tolerancia dada.

In [14]:
print(np.allclose(X_train_imputed_a, X_train_imputed_b))


# TODO: sin embargo, por más que bajo la tolerancia absoluta me sigue dando True
print(f"{X_train_imputed_a[0,0]-X_train_imputed_b[0,0]:.14f}")
print(f"{1e-14:.14f}")
print(np.allclose(X_train_imputed_a, X_train_imputed_b, atol=1e-14))

True
0.00000000000009
0.00000000000001
True


## Entrenando el modelo y midiendo su rendimiento

In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

lin_reg = make_pipeline(pipeline, LinearRegression())
lin_reg

In [None]:
print("R2:",lin_reg.fit(X_train, y_train).score(X_train, y_train))
print("RMSE:", root_mean_squared_error(y_test, lin_reg.predict(X_test)))


In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

forest_reg = make_pipeline(pipeline, RandomForestRegressor(random_state=42))
print("R2:",forest_reg.fit(X_train, y_train).score(X_train, y_train))
print("RMSE:", root_mean_squared_error(y_test, forest_reg.predict(X_test)))

R2: 0.9748081391179184
RMSE: 47205.89749783198


In [18]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import root_mean_squared_error

knn_reg = make_pipeline(pipeline, KNeighborsRegressor())
print("R2:",knn_reg.fit(X_train, y_train).score(X_train, y_train))
print("RMSE:", root_mean_squared_error(y_test, knn_reg.predict(X_test)))

R2: 0.8162626745863486
RMSE: 60095.38546207905


Queda la solución anterior para comaprar (sin escalar las columnas OneHotEncoded) en [e2e05_knnimputer_old.ipynb](e2e05_knnimputer_old.ipynb). No se aprecia una diferencia significativa entre hacer o no el escalado de las columnas OneHotEncoded. Los resultados son prácticamente idénticos.