<center>
    <img src="http://sct.inf.utfsm.cl/wp-content/uploads/2020/04/logo_di.png" style="width:60%">
    <h1> INF396 - Introducción a la Ciencia de Datos</h1>
    <h3> Material Complementario T1</h3>
    <h3> Camilo Núñez-Fernández - camilo.nunezf@usm.cl</h3>
</center>
<hr style="height:2px;border:none"/>

**Temas**  

* III.I $\rhd$ Data Cleaning

<hr style="height:2px;border:none"/>

In [None]:
import warnings
# Silence every warning category for the rest of the session, presumably to
# keep the cell outputs free of library notices.
# NOTE(review): a blanket 'ignore' also hides warnings that may point at real
# problems (e.g. pandas FutureWarnings); consider restricting the filter to a
# specific category or module.
warnings.filterwarnings('ignore')

In [None]:
import numpy as np
import pandas as pd

import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

In [None]:
# Seed NumPy's global random generator so the Gaussian noise drawn with
# np.random.normal in the Noisy Data example is identical on every full run.
np.random.seed(42)

# III.I $\rhd$ Data Cleaning - Noisy Data

In [None]:
# Synthetic noisy signal: a sine wave sampled at 100 evenly spaced points on
# [0, 10], perturbed with Gaussian noise (mean 0, standard deviation 0.3).
x = np.linspace(start=0, stop=10, num=100)
y = np.sin(x) + np.random.normal(loc=0, scale=0.3, size=100)

df = pd.DataFrame(dict(x=x, y_original=y))

In [None]:
def apply_binning(data, n_bins=10, method='mean'):
    """Smooth a 1-D numeric sample with equal-width binning.

    The value range of ``data`` is split into ``n_bins`` equal-width
    intervals with ``pd.cut`` and every observation is replaced by a
    representative of the interval it falls in.

    Parameters
    ----------
    data : pandas.Series or array-like
        Values to smooth. Anything that is not a Series is wrapped in one.
    n_bins : int, default 10
        Number of equal-width bins handed to ``pd.cut``.
    method : {'mean', 'median', 'boundary'}, default 'mean'
        ``'mean'`` / ``'median'`` replace each value by the mean / median of
        the values sharing its bin. ``'boundary'`` replaces each value by the
        closer of the two edges of its bin (the right edge wins ties).

    Returns
    -------
    pandas.Series
        Smoothed values with the same index and name as ``data``. Missing
        input values stay NaN.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    # Validate first so a bad method name fails before any binning work.
    if method not in ('mean', 'median', 'boundary'):
        raise ValueError("Método no válido. Usar 'mean', 'median' o 'boundary'")

    if not isinstance(data, pd.Series):
        data = pd.Series(data)

    # A single pd.cut call yields both the per-value bin labels and the edges.
    # Note: pd.cut widens the range slightly so the minimum is included, which
    # puts the lowest edge a little below min(data).
    bins, bin_edges = pd.cut(data, bins=n_bins, retbins=True)

    if method in ('mean', 'median'):
        # observed=True is explicit so the call does not rely on the
        # deprecated implicit default for categorical group keys; empty bins
        # contribute nothing to a transform either way.
        return data.groupby(bins, observed=True).transform(method)

    # 'boundary': look up each value's own bin edges through its bin code
    # instead of re-scanning the data once per bin.
    codes = bins.cat.codes.to_numpy().astype(np.intp)
    in_bin = codes >= 0  # code -1 marks values pd.cut could not bin (NaN)

    smoothed = data.to_numpy(dtype=float, copy=True)
    values = smoothed[in_bin]
    left_edge = bin_edges[codes[in_bin]]
    right_edge = bin_edges[codes[in_bin] + 1]
    smoothed[in_bin] = np.where(
        np.abs(values - left_edge) < np.abs(values - right_edge),
        left_edge,
        right_edge,
    )
    return pd.Series(smoothed, index=data.index, name=data.name)

In [None]:
# Smooth the noisy signal once per binning strategy; each result is stored in
# a column named y_<method>.
for binning_method in ('mean', 'median', 'boundary'):
    df[f'y_{binning_method}'] = apply_binning(
        df['y_original'], n_bins=10, method=binning_method
    )

In [None]:
# 2x2 comparison grid: the raw data alone in the first panel, then one panel
# per binning strategy with the smoothed curve plus the faded raw points.
panel_titles = (
    "Datos Originales",
    "Binning por Media",
    "Binning por Mediana",
    "Binning por Fronteras",
)
fig = make_subplots(rows=2, cols=2, subplot_titles=panel_titles)

fig.add_trace(
    go.Scatter(x=df['x'], y=df['y_original'], mode='markers', name='Original'),
    row=1, col=1
)

# (smoothed column, trace name, line colour, grid row, grid column)
smoothed_panels = [
    ('y_mean', 'Media', 'red', 1, 2),
    ('y_median', 'Mediana', 'green', 2, 1),
    ('y_boundary', 'Fronteras', 'purple', 2, 2),
]
for column, label, colour, grid_row, grid_col in smoothed_panels:
    # Smoothed curve first, then the raw points, matching the original trace order.
    fig.add_trace(
        go.Scatter(x=df['x'], y=df[column], mode='lines', name=label,
                   line=dict(color=colour, width=2)),
        row=grid_row, col=grid_col
    )
    fig.add_trace(
        go.Scatter(x=df['x'], y=df['y_original'], mode='markers', name='Original',
                   marker=dict(opacity=0.3)),
        row=grid_row, col=grid_col
    )

layout_options = dict(
    title_text="Comparación de Técnicas de Binning para Datos Ruidosos",
    height=800,
    width=1000,
    showlegend=False,
)
fig.update_layout(**layout_options)

fig.show()

# III.I $\rhd$ Data Cleaning - Missing Values

## Ejemplo 5 - `sklearn.impute.SimpleImputer` - Mean - Axis 0

In [None]:
# 4x3 toy matrix in which every column has exactly one missing entry (np.nan).
data_mean = np.array([
    [1, 2, np.nan],
    [4, np.nan, 6],
    [7, 8, 9],
    [np.nan, 11, 12],
])

In [None]:
# Column-wise mean imputation: every np.nan is replaced by the mean of the
# observed values in its column.
imputer_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

In [None]:
# Learn the per-column means from data_mean and fill its missing entries in one step.
imputed_data_mean = imputer_mean.fit_transform(data_mean)

In [None]:
# Show the input, the imputed matrix and the per-column fill values.
print("Datos originales:", data_mean, sep="\n")
print("\nDatos imputados con la media:", imputed_data_mean, sep="\n")
print("\nValores usados para la imputación (medias):", imputer_mean.statistics_, sep="\n")

## Ejemplo 6 - `sklearn.impute.SimpleImputer` - Median - Axis 0

In [None]:
# 4x3 toy matrix in which every column has exactly one missing entry (np.nan).
data_median = np.array([
    [10, 20, np.nan],
    [40, np.nan, 60],
    [70, 80, 90],
    [np.nan, 110, 120],
])

In [None]:
# Column-wise median imputation: every np.nan is replaced by the median of the
# observed values in its column.
imputer_median = SimpleImputer(missing_values=np.nan, strategy='median')

In [None]:
# Learn the per-column medians from data_median and fill its missing entries in one step.
imputed_data_median = imputer_median.fit_transform(data_median)

In [None]:
# Show the input, the imputed matrix and the per-column fill values.
print("Datos originales:", data_median, sep="\n")
print("\nDatos imputados con la mediana:", imputed_data_median, sep="\n")
print("\nValores usados para la imputación (medianas):", imputer_median.statistics_, sep="\n")

## Ejemplo 7 - `sklearn.impute.KNNImputer` - k=2 - Axis 0

In [None]:
# Toy (X, Y) sample: every complete row satisfies Y = 2 * X, and three rows
# have exactly one coordinate missing.
data = np.array([
    [1, 2],
    [2, np.nan],    # Y is missing
    [3, 6],
    [4, 8],
    [np.nan, 4],    # X is missing
    [6, 12],
    [7, np.nan],    # Y is missing
    [8, 16]
])

In [None]:
# Fill each missing coordinate from the 2 nearest rows: KNNImputer averages
# the neighbours' values for the missing feature.
imputer = KNNImputer(n_neighbors=2)
imputed_data = imputer.fit_transform(data)

In [None]:
# Visualise the KNN imputation: the fully observed points, every imputed point,
# and dotted links from each imputed point to the neighbours it is drawn from.

# Read k from the imputer so the links and the title stay in sync with the
# KNNImputer configuration instead of repeating a hard-coded 2.
n_neighbors = imputer.n_neighbors

fig = go.Figure()

# Rows with at least one NaN vs. fully observed rows (mask computed once).
missing_mask = np.isnan(data).any(axis=1)
complete_mask = ~missing_mask
complete_indices = np.where(complete_mask)[0]
missing_indices = np.where(missing_mask)[0]

fig.add_trace(go.Scatter(
    x=data[complete_mask, 0],
    y=data[complete_mask, 1],
    mode='markers',
    marker=dict(size=12, color='blue', line=dict(width=2, color='DarkSlateGrey')),
    name='Datos conocidos',
    text=[f"Punto {i+1}" for i in complete_indices],
    hoverinfo='text+x+y'
))

for idx in missing_indices:
    # A missing coordinate has no position of its own, so the "missing" cross
    # is placed using the imputed value for that coordinate.
    x_orig = data[idx, 0] if not np.isnan(data[idx, 0]) else imputed_data[idx, 0]
    y_orig = data[idx, 1] if not np.isnan(data[idx, 1]) else imputed_data[idx, 1]

    fig.add_trace(go.Scatter(
        x=[x_orig],
        y=[y_orig],
        mode='markers',
        marker=dict(size=12, color='red', symbol='x', line=dict(width=2)),
        name=f'Faltante (fila {idx+1})',
        hoverinfo='text',
        hovertext=f"Original: [{data[idx, 0] if not np.isnan(data[idx, 0]) else 'NaN'}, "
                 f"{data[idx, 1] if not np.isnan(data[idx, 1]) else 'NaN'}]"
    ))

    fig.add_trace(go.Scatter(
        x=[imputed_data[idx, 0]],
        y=[imputed_data[idx, 1]],
        mode='markers',
        marker=dict(size=12, color='green', line=dict(width=2)),
        name=f'Imputado (fila {idx+1})',
        hoverinfo='text',
        hovertext=f"Imputado: [{imputed_data[idx, 0]:.2f}, {imputed_data[idx, 1]:.2f}]"
    ))

    # Re-derive the neighbours for display: rank the complete rows by their
    # distance to this row along the coordinate that IS observed.
    # NOTE(review): this is a plotting-side reconstruction, not read back from
    # KNNImputer; it assumes exactly one coordinate is missing per row.
    observed_col = 1 if np.isnan(data[idx, 0]) else 0
    valid_points = data[complete_mask, observed_col]
    distances = np.abs(valid_points - data[idx, observed_col])

    nearest_positions = np.argsort(distances)[:n_neighbors]
    nearest_indices = complete_indices[nearest_positions]

    for neighbor_idx in nearest_indices:
        fig.add_trace(go.Scatter(
            x=[imputed_data[idx, 0], data[neighbor_idx, 0]],
            y=[imputed_data[idx, 1], data[neighbor_idx, 1]],
            mode='lines',
            line=dict(color='gray', width=1, dash='dot'),
            showlegend=False,
            hoverinfo='none'
        ))

fig.update_layout(
    title=(
        f'<b>Visualización de KNNImputer (n_neighbors={n_neighbors})</b><br>'
        '<sup>Círculos verdes: valores imputados | Cruces rojas: valores faltantes originales</sup>'
    ),
    xaxis_title='Variable X',
    yaxis_title='Variable Y',
    hovermode='closest',
    legend=dict(orientation="h", yanchor="bottom", y=1.1, xanchor="center", x=0.5),
    margin=dict(l=40, r=40, t=100, b=40),
    width=1000,
    height=900,
)

fig.show()

In [None]:
# Print the matrix before and after KNN imputation for a side-by-side check.
print("Datos originales con valores faltantes:", data, sep="\n")
print("\nDatos después de la imputación KNN:", imputed_data, sep="\n")