# Diplomatura en Ciencias de Datos, Aprendizaje Automático y sus Aplicaciones

Autores: Matías Oria, Antonela Sambuceti, Pamela Pairo, Benjamín Ocampo

In [None]:
from typing import List, Tuple
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
from sklearn import preprocessing, impute, neighbors, feature_extraction, pipeline
from sklearn.experimental import enable_iterative_imputer


def plot_imputation_graph(imputations: List[Tuple[str, pd.DataFrame]],
                          missing_cols: List[str]) -> None:
    """Overlay the density of each imputation method, one subplot per column.

    For every name in ``missing_cols`` a subplot row is created; inside it,
    each ``(method, dataframe)`` pair of ``imputations`` contributes one KDE
    curve, coloured by its method label.

    Args:
        imputations: ``(method label, dataframe)`` pairs. Each dataframe is
            indexed with every name in ``missing_cols``.
        missing_cols: Names of the columns whose distributions are compared.
    """
    # squeeze=False always returns a 2-D array of axes; with the default, a
    # single column yields a bare Axes object that cannot be zipped over.
    _, axs = plt.subplots(len(missing_cols), figsize=(10, 10), squeeze=False)
    for ax, col_name in zip(axs.ravel(), missing_cols):
        data = pd.concat(
            [
                imputation_df[[col_name]].assign(method=method)
                for method, imputation_df in imputations
            ],
            # The stacked frames reuse the same row labels; renumbering keeps
            # the index unique for the plotting library.
            ignore_index=True,
        )
        seaborn.kdeplot(data=data, x=col_name, hue="method", ax=ax)


def impute_by(values, missing_col_names, estimator):
    """Fill missing entries with ``IterativeImputer`` and return the fixed columns.

    Args:
        values: 2-D array-like (DataFrame, ndarray or np.matrix) in which
            NaN marks a missing entry.
        missing_col_names: Names given to the columns of ``values`` that
            contain missing entries, in column order.
        estimator: Regressor passed to ``IterativeImputer`` as its estimator.

    Returns:
        A DataFrame holding only the columns of ``values`` that had missing
        entries, after imputation, labelled with ``missing_col_names``.

    Raises:
        ValueError: If the number of columns with missing entries differs
            from ``len(missing_col_names)``.
    """
    # Plain ndarray: np.matrix input is rejected by recent scikit-learn.
    values = np.asarray(values)

    # Only the fitted attribute is needed, so fit() suffices; features_ holds
    # the indices of the columns that contain missing entries.
    indicator = impute.MissingIndicator()
    indicator.fit(values)
    missing_idx = indicator.features_
    if len(missing_idx) != len(missing_col_names):
        raise ValueError(
            f"Found {len(missing_idx)} column(s) with missing values but "
            f"{len(missing_col_names)} column name(s) were given.")

    imputer = impute.IterativeImputer(
        random_state=0, estimator=estimator)
    imputed_values = imputer.fit_transform(values)
    return pd.DataFrame(imputed_values[:, missing_idx],
                        columns=missing_col_names)

In [None]:
# Remote CSV files with the filtered housing and suburb tables.
URL_MELB_HOUSING_FILTERED = (
    "https://www.famaf.unc.edu.ar/~nocampo043/melb_housing_filtered_df.csv"
)
URL_MELB_SUBURB_FILTERED = (
    "https://www.famaf.unc.edu.ar/~nocampo043/melb_suburb_filtered_df.csv"
)

# Download both tables, housing first.
melb_housing_df, melb_suburb_df = (
    pd.read_csv(url)
    for url in (URL_MELB_HOUSING_FILTERED, URL_MELB_SUBURB_FILTERED)
)

# DataFrame.join with on= matches the housing "suburb_id" column against the
# index of the suburb table.
# NOTE(review): assumes the suburb table's row index is the suburb id - confirm.
melb_combined_df = melb_housing_df.join(other=melb_suburb_df, on="suburb_id")
melb_combined_df

## Encoding

In [None]:
# Columns treated as categorical (presumably string-valued, which
# DictVectorizer one-hot encodes - confirm against the dataset).
categorical_cols = [
    "housing_room_segment", "housing_bathroom_segment", "housing_type",
    "suburb_region_segment"
]
# Columns treated as numeric.
numerical_cols = [
    "housing_price", "housing_land_size", "suburb_rental_dailyprice"
]
feature_cols = categorical_cols + numerical_cols
# One {column: value} dict per row. orient="records" builds them directly,
# avoiding a transpose of the whole frame (which upcasts every value to
# object) and a dict keyed by index label (which would silently collapse
# rows sharing a label).
features = melb_combined_df[feature_cols].to_dict(orient="records")

vectorizer = feature_extraction.DictVectorizer()
feature_matrix = vectorizer.fit_transform(features)
feature_matrix

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
(vectorizer.get_feature_names_out()
 if hasattr(vectorizer, "get_feature_names_out")
 else vectorizer.get_feature_names())

## Imputación por KNN

### Sin escalado

In [None]:
# Columns of the combined dataset whose missing values get imputed.
missing_cols = [
    "housing_year_built",
    "housing_building_area",
]
# Regressor handed to the iterative imputer: k-nearest neighbours with k=2.
estimator = neighbors.KNeighborsRegressor(
    n_neighbors=2,
)

In [None]:
# Columns to impute, NaNs included.
missing_df = melb_combined_df[missing_cols]
# Rows where every column to impute is observed; used as the reference
# distribution.
original_df = missing_df.dropna()
# Columns to impute followed by the encoded features. toarray() yields a
# plain ndarray, whereas todense() returns np.matrix, which recent
# scikit-learn releases refuse as estimator input.
all_df = np.hstack([missing_df.to_numpy(), feature_matrix.toarray()])

# Impute using only the incomplete columns, then using every feature.
knn_missing_cols = impute_by(missing_df, missing_cols, estimator)
knn_all_cols = impute_by(all_df, missing_cols, estimator)

In [None]:
# Pair each method label with its dataframe, then compare the densities.
imputations = list(zip(
    ("original", "knn - missing cols", "knn - all cols"),
    (original_df, knn_missing_cols, knn_all_cols),
))
plot_imputation_graph(imputations=imputations, missing_cols=missing_cols)

### Con escalado

In [None]:
# The same scaler object is refit on each input below, so every dataset is
# standardised with its own statistics.
scaler = preprocessing.StandardScaler()
original_scaled_df = pd.DataFrame(scaler.fit_transform(original_df),
                                  columns=missing_cols)
knn_scaled_missing_cols = impute_by(scaler.fit_transform(missing_df),
                                    missing_cols, estimator)
# np.asarray: all_df may be an np.matrix (np.hstack over a todense() result),
# which recent scikit-learn releases refuse as input; this is a no-op for a
# plain ndarray.
knn_scaled_all_cols = impute_by(scaler.fit_transform(np.asarray(all_df)),
                                missing_cols, estimator)

In [None]:
# Pair each method label with its scaled dataframe, then compare the densities.
imputations = list(zip(
    ("scaled original", "knn - scaled missing cols", "knn - scaled all cols"),
    (original_scaled_df, knn_scaled_missing_cols, knn_scaled_all_cols),
))
plot_imputation_graph(imputations=imputations, missing_cols=missing_cols)

## Reducción de dimensionalidad

## Composición del resultado