# Semana 04 - Análisis Exploratorio de Datos (EDA)

Este notebook guía un flujo completo de EDA: calidad de datos, distribuciones, relaciones entre variables, outliers, transformaciones y un modelo baseline rápido.

> Nota: El dataset generado es sintético (placeholder). Sustituir por datos reales cuando estén disponibles.

## 1. Configuración del Entorno y Carga de Librerías

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.impute import KNNImputer
from sklearn.preprocessing import PowerTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import textwrap, math, hashlib, random
# Seed both the stdlib and NumPy global RNGs so the notebook is reproducible.
random.seed(42)
np.random.seed(42)

# Shared plotting defaults; the figure size is set after the seaborn theme,
# in the same order as before.
sns.set_theme(style='whitegrid')
plt.rcParams.update({'figure.figsize': (8, 4)})

## 2. Carga de Datos Crudos (Train/Test)
Creamos un dataset sintético para demostración. Reemplaza esta celda por la lectura real (CSV/Parquet).

In [None]:
# Build a synthetic demo dataset of N rows from NumPy's global RNG.
# The order of the random draws below determines the generated values.
N=1000
dates = pd.date_range('2024-01-01', periods=N, freq='D')
age = np.random.normal(40, 12, N).clip(18,80)
income = np.random.lognormal(mean=10, sigma=0.5, size=N) / 1000  # scaled to thousands
score = np.random.beta(2,5, N)*100
segment = np.random.choice(['A','B','C','D','E'], p=[0.35,0.25,0.2,0.15,0.05], size=N)
city = np.random.choice(['Santiago','Valparaiso','Concepcion','Serena','Otra'], size=N, p=[0.45,0.2,0.15,0.1,0.1])
# synthetic text: each comment is 4 to 10 words drawn at random from a small vocabulary
words = ['data','model','quality','python','big','value','cloud','batch','stream','etl']
comment = [' '.join(np.random.choice(words, size=np.random.randint(4,11))) for _ in range(N)]
# binary target whose log-odds depend on income (above/below median), score and age
logit = -5 + 0.4*(income>np.median(income)) + 0.03*score + 0.02*(age-40)
prob = 1/(1+np.exp(-logit))
target = (np.random.rand(N) < prob).astype(int)
df = pd.DataFrame({ 'date':dates,'age':age,'income':income,'score':score,'segment':segment,'city':city,'comment':comment,'target':target })
# Inject missing values: each row of a column becomes NaN independently with probability pct
for col,pct in [('age',0.02),('income',0.03),('score',0.01)]:
    mask = np.random.rand(N) < pct
    df.loc[mask,col] = np.nan
# extreme income outliers: 5 row labels are drawn with replacement, so a repeated
# label or a row already set to NaN above can leave fewer than 5 visible outliers
df.loc[np.random.choice(df.index,5), 'income'] *= 8
# Shuffle the rows and rebuild a clean 0..N-1 index
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()

In [None]:
# Structural summary plus a short content fingerprint: per-row hashes
# (index included) are fed to SHA-256 and the first 16 hex digits are shown.
row_hashes = pd.util.hash_pandas_object(df, index=True).values
dataset_digest = hashlib.sha256(row_hashes).hexdigest()

print('Shape:', df.shape)
print('Columnas:', df.columns.tolist())
print('Hash SHA256 dataset actual:', dataset_digest[:16])

## 3. Inspección Estructural Inicial (shape, dtypes, memoria)

In [None]:
def memory_usage_report(df: pd.DataFrame):
    """Return deep memory usage per column (and the index) in kB.

    The result is a DataFrame sorted from largest to smallest consumer with
    two columns: ``kB`` (size in kibibytes) and ``kB_pct`` (share of the
    total, in percent).
    """
    kb_per_column = df.memory_usage(deep=True) / 1024
    kb_per_column = kb_per_column.sort_values(ascending=False)
    share_pct = kb_per_column / kb_per_column.sum() * 100
    return pd.DataFrame({'kB': kb_per_column, 'kB_pct': share_pct})
# Show the per-column memory report for the raw dataset.
memory_usage_report(df)

## 4. Diccionario de Variables (Automático + Ajustes Manuales)

In [None]:
def variable_dictionary(df):
    """Build a one-row-per-column data dictionary for ``df``.

    Each row holds the column name, its dtype name, the percentage of missing
    values, the number of distinct non-null values, the first non-null value
    as an example (``None`` if the column is entirely null) and a placeholder
    description meant to be edited by hand.
    """
    def _describe(name):
        column = df[name]
        non_null = column.dropna()
        example = non_null.iloc[0] if non_null.size else None
        return {
            'variable': name,
            'tipo': column.dtype.name,
            '%nulos': column.isna().mean() * 100,
            'cardinalidad': column.nunique(dropna=True),
            'ejemplo': example,
            'descripcion': '<editar>',
        }

    return pd.DataFrame([_describe(name) for name in df.columns])
# Build the automatic data dictionary; the 'descripcion' column is meant to be filled in by hand.
var_dict = variable_dictionary(df)
var_dict

## 5. Conversión y Optimización de Tipos

In [None]:
def optimize_types(df, date_cols=('date',), category_cols=('segment', 'city')):
    """Return a memory-optimized copy of ``df`` plus before/after sizes.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    date_cols : iterable of str, default ``('date',)``
        Columns converted with ``pd.to_datetime``.
    category_cols : iterable of str, default ``('segment', 'city')``
        Columns converted to the ``category`` dtype.

    Returns
    -------
    tuple
        ``(df_opt, before_kb, after_kb)`` where the sizes are deep memory
        usage in kB before and after the conversion.

    Raises
    ------
    KeyError
        If a name in ``date_cols`` or ``category_cols`` is not a column.
    """
    df_opt = df.copy()
    before = df_opt.memory_usage(deep=True).sum() / 1024

    # dates
    for c in date_cols:
        df_opt[c] = pd.to_datetime(df_opt[c])

    # categories
    for c in category_cols:
        df_opt[c] = df_opt[c].astype('category')

    # Downcast each numeric kind within its own family. Integer columns
    # (e.g. a 0/1 target) must use downcast='integer': sending them through
    # downcast='float' never yields a compact integer dtype.
    for c in df_opt.select_dtypes(include=['float64']).columns:
        df_opt[c] = pd.to_numeric(df_opt[c], downcast='float')
    for c in df_opt.select_dtypes(include=['int64']).columns:
        df_opt[c] = pd.to_numeric(df_opt[c], downcast='integer')

    after = df_opt.memory_usage(deep=True).sum() / 1024
    return df_opt, before, after
# Optimize dtypes on a copy and report the memory footprint before/after and the relative saving.
df_opt, mem_before, mem_after = optimize_types(df)
print(f'Memoria antes: {mem_before:.1f} kB / después: {mem_after:.1f} kB / ahorro {(1-mem_after/mem_before)*100:.2f}%')

## 6. Análisis de Valores Faltantes (Matriz, Porcentajes, Patrones)

In [None]:
# Percentage of missing values per column, most affected columns first.
missing_share = df.isna().mean()
missing_pct = missing_share.sort_values(ascending=False) * 100
missing_pct

## 7. Imputaciones Rápidas de Prueba (No Definitivas)

In [None]:
impute_cols = ['age','income','score']

# Option 1: fill every gap with the mean of its own column.
df_mean_imp = df.copy()
column_means = df_mean_imp[impute_cols].mean()
df_mean_imp[impute_cols] = df_mean_imp[impute_cols].fillna(column_means)

# Option 2: KNN imputation using the three numeric columns as the feature space.
knn_imp = KNNImputer(n_neighbors=5)
df_knn = df.copy()
knn_filled = knn_imp.fit_transform(df_knn[impute_cols])
df_knn[impute_cols] = knn_filled
df_knn[impute_cols].head()

## 8. Detección de Duplicados y Registros Potencialmente Erróneos

In [None]:
# Count fully duplicated rows, and rows that repeat the (segment, city, date) key.
key_cols = ['segment','city','date']
dup_total = df.duplicated().sum()
dup_subset = df.duplicated(subset=key_cols).sum()
dict(duplicados_totales=dup_total, duplicados_segment_city_date=dup_subset)

## 9. Análisis Univariado Variables Numéricas

In [None]:
# Summary statistics and a histogram (with KDE) for every numeric column.
num_cols = df.select_dtypes(include=['float64','float32','int64','int32']).columns.tolist()

univar_metrics = []
for col_name in num_cols:
    values = df[col_name]
    univar_metrics.append(dict(
        var=col_name,
        mean=values.mean(),
        median=values.median(),
        std=values.std(),
        skew=values.skew(),
        kurt=values.kurt(),
    ))
    sns.histplot(values, kde=True)
    plt.title(f'Distribución {col_name}')
    plt.show()

pd.DataFrame(univar_metrics)

## 10. Análisis Univariado Variables Categóricas

In [None]:
# Frequency table per categorical column: absolute count, percentage and
# cumulative percentage (missing values are counted as their own level).
cat_cols = df.select_dtypes(include=['object','category']).columns.tolist()
freq_tables={}
for c in cat_cols:
    vc = df[c].value_counts(dropna=False)
    freq = (vc/len(df)*100).rename('pct')
    # 'pct' was computed and named but never added to the table; include it
    # next to its cumulative version.
    ft = pd.concat([vc, freq, freq.cumsum().rename('pct_acum')], axis=1)
    freq_tables[c]=ft
    display(ft.head())

## 11. Métricas de Dispersión y Rangos (IQR, MAD, z-score)

In [None]:
# Spread measures per numeric column, computed on non-null values only:
# interquartile range, normal-consistent MAD and sample standard deviation.
disp_rows = []
for c in num_cols:
    s = df[c].dropna()
    q1, q3 = s.quantile([0.25, 0.75])
    iqr = q3 - q1
    if s.size > 0:
        mad = stats.median_abs_deviation(s, scale='normal')
    else:
        mad = np.nan
    mean = s.mean()  # NOTE(review): not used in the table below
    std = s.std()
    disp_rows.append(dict(var=c, iqr=iqr, mad=mad, std=std))

disp_df = pd.DataFrame(disp_rows)
disp_df

## 12. Detección de Outliers (IQR, Z-Score, Robust Z)

In [None]:
def detect_outliers(series):
    """Count outliers in a numeric series with three different rules.

    Missing values are dropped first. The rules are:

    * ``iqr_outliers``: values outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``.
    * ``z_outliers``: ``|z| > 3`` using the mean and population std (ddof=0).
    * ``robust_outliers``: ``|robust z| > 3.5`` where the robust z-score is
      ``(x - median) / MAD`` and MAD is the normal-consistent median
      absolute deviation.

    Returns a dict with the three counts; all zeros for an empty series.
    """
    s = series.dropna()
    if s.empty:
        return {'iqr_outliers': 0, 'z_outliers': 0, 'robust_outliers': 0}

    # Tukey fences
    q1, q3 = s.quantile([0.25, 0.75])
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    iqr_o = ((s < lower) | (s > upper)).sum()

    # Classic z-score; a constant series has no spread and therefore no outliers
    std = s.std(ddof=0)
    z = (s - s.mean()) / std if std > 0 else np.zeros(len(s))
    z_o = (np.abs(z) > 3).sum()

    # scale='normal' already multiplies the raw MAD by ~1.4826, so the scaled
    # value is used directly; applying 1.4826 again would shrink every score
    # and under-count outliers.
    mad = stats.median_abs_deviation(s, scale='normal')
    robust_z = (s - s.median()) / mad if mad > 0 else np.zeros(len(s))
    rz_o = (np.abs(robust_z) > 3.5).sum()

    return {'iqr_outliers': iqr_o, 'z_outliers': z_o, 'robust_outliers': rz_o}
# One summary row per numeric column: the three outlier counts plus the column name.
out_summary = [{**detect_outliers(df[c]), 'var': c} for c in num_cols]
pd.DataFrame(out_summary)

## 13. Transformaciones de Escala y Distribución (Log, Box-Cox, Yeo-Johnson)

In [None]:
# Compare skewness of the raw values against log1p and Yeo-Johnson transforms.
trans_cols = ['income']
skew_report = []
for col_name in trans_cols:
    orig = df[col_name].dropna()
    log_tr = np.log1p(orig)
    # Yeo-Johnson also accepts zeros and negative values
    pt = PowerTransformer(method='yeo-johnson')
    yj = pt.fit_transform(orig.to_numpy().reshape(-1, 1)).ravel()

    stage_skews = [
        ('original', orig.skew()),
        ('log', log_tr.skew()),
        ('yeo-johnson', pd.Series(yj).skew()),
    ]
    skew_report.extend(
        {'var': col_name, 'stage': stage, 'skew': skew_value}
        for stage, skew_value in stage_skews
    )

pd.DataFrame(skew_report)

## 14. Análisis Bivariado Numérico vs Numérico

In [None]:
# Pairwise scatter/density plots on a fixed 250-row sample, coloured by segment.
pair_sample = df.sample(250, random_state=42)
sns.pairplot(pair_sample, vars=['age','income','score'], hue='segment')
plt.show()

## 15. Análisis Categórico vs Numérico (Boxplots, Violin, ANOVA)

In [None]:
# Boxplot of every numeric variable per category level, plus a one-way ANOVA
# p-value whenever each level has at least two non-null observations.
anova_rows = []
for cat in ['segment','city']:
    for num in ['age','income','score']:
        sns.boxplot(data=df, x=cat, y=num)
        plt.title(f'{num} por {cat}')
        plt.xticks(rotation=45)
        plt.show()

        # quick ANOVA (NaN values are dropped within each group)
        samples = [grp.dropna().values for _, grp in df.groupby(cat)[num]]
        if any(len(sample) <= 1 for sample in samples):
            continue
        _, p_value = stats.f_oneway(*samples)
        anova_rows.append({'cat': cat, 'num': num, 'p_value': p_value})

pd.DataFrame(anova_rows).sort_values('p_value').head()

## 16. Matrices de Correlaciones (Pearson, Spearman, Kendall)

In [None]:
# Correlation matrices of the numeric features and the target under three methods.
num_df = df[['age','income','score','target']].copy()
corr_pearson, corr_spearman, corr_kendall = (
    num_df.corr(method=method) for method in ('pearson', 'spearman', 'kendall')
)
corr_pearson

## 17. Mapa de Calor de Correlaciones Filtrado (|r| > 0.3)

In [None]:
# Keep only the cells with |r| > 0.3; the rest are blanked out as NaN.
mask = corr_pearson.abs().gt(0.3)
strong_corr = corr_pearson.where(mask)
sns.heatmap(strong_corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.show()

## 18. Detección de Colinealidad (VIF)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
# Variance inflation factor per predictor, computed on complete rows only.
# A constant is prepended to the design matrix, so predictor i sits at
# position i+1 in it.
X = num_df.drop(columns=['target']).dropna()
X_const = sm.add_constant(X)
design = X_const.values
vif_df = pd.DataFrame({
    'variable': X.columns,
    'VIF': [
        variance_inflation_factor(design, position)
        for position, _ in enumerate(X.columns, start=1)
    ],
})
vif_df

## 19. Interacciones Iniciales (Feature Crossing Simple)

In [None]:
# Two simple crossed features built on a copy of the data:
# income x score, and score relative to (age + 1).
df_fx = df.assign(
    income_score=df['income'] * df['score'],
    score_per_age=df['score'] / (df['age'] + 1),
)
df_fx[['income_score','score_per_age']].head()

## 20. Relación con Variable Objetivo

In [None]:
# Income distribution per target class.
sns.boxplot(data=df, x='target', y='income')
plt.title('Income vs target')
plt.show()

# Score density per target class, each class normalised independently.
sns.histplot(
    data=df,
    x='score',
    hue='target',
    element='step',
    stat='density',
    common_norm=False,
)
plt.show()

## 21. Análisis Temporal (si existe fecha)

In [None]:
# Monthly mean income (month-end bins). A MonthEnd offset object replaces the
# 'M' string alias, which is deprecated in favour of 'ME' since pandas 2.2;
# the offset object gives the same bins on old and new pandas versions.
ts = df.set_index('date').resample(pd.offsets.MonthEnd())['income'].mean()
ts.plot(marker='o'); plt.title('Income promedio mensual'); plt.show()

## 22. Análisis de Texto (si existen campos textuales)

In [None]:
# Basic text statistics: mean comment length in characters and the number of
# distinct whitespace-separated tokens across all comments.
text_len = df['comment'].str.len()
tokens = df['comment'].str.split()
vocab_size = len({word for row_tokens in tokens for word in row_tokens})
{'longitud_media': text_len.mean(), 'vocab_size': vocab_size}

## 23. Engineering Exploratorio de Variables Derivadas

In [None]:
# Exploratory flags: income above its 99th percentile, and membership in the
# low-frequency segments D/E. The final line shows the share of flagged rows.
income_p99 = df['income'].quantile(0.99)
df_eng = df.copy()
df_eng['is_income_outlier'] = df_eng['income'].gt(income_p99).astype(int)
df_eng['rare_segment'] = df_eng['segment'].isin(['D','E']).astype(int)
df_eng[['is_income_outlier','rare_segment']].mean()

## 24. Chequeo de Fugas de Información (Leakage Heurístico)

In [None]:
# Heurística: correlaciones casi perfectas con target o campos derivados posteriores a un evento
leak_suspects = []
for c in num_df.columns:
    if c!='target':
        corr = num_df[['target',c]].corr().iloc[0,1]
        if abs(corr) > 0.95: leak_suspects.append({'variable':c,'corr_target':corr})
leak_suspects

## 25. Baseline Rápido (Modelo Simple)

In [None]:
# Quick baseline: random forest on the numeric features plus one-hot encoded
# categoricals, evaluated on a stratified 25% hold-out.
num_features = ['age','income','score']
model_df = df[num_features + ['segment','city','target']].copy()
model_df = pd.get_dummies(model_df, columns=['segment','city'], drop_first=True)
X = model_df.drop(columns=['target'])
y = model_df['target']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=42,stratify=y)

# Simple median imputation for the baseline. The medians come from the
# training fold only and are then applied to both folds; computing them on
# the full dataset before splitting would leak test-set information.
X_train, X_test = X_train.copy(), X_test.copy()
train_medians = X_train[num_features].median()
X_train[num_features] = X_train[num_features].fillna(train_medians)
X_test[num_features] = X_test[num_features].fillna(train_medians)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))
feat_imp = pd.Series(clf.feature_importances_, index=X.columns).sort_values(ascending=False).head(15)
feat_imp

## 26. Reporte Resumen (Hallazgos + Sugerencias)
Construimos una tabla conceptual de issues detectados (placeholder editable).

In [None]:
# Assemble a small, editable table of findings from the earlier analyses.
issues = []

# Column with the highest share of missing values (only if anything is missing)
if missing_pct.max() > 0:
    issues.append({
        'variable': missing_pct.idxmax(),
        'issue': 'Valores faltantes',
        'recomendacion': 'Evaluar imputación apropiada o eliminar si no aporta',
    })

# Column with the most IQR outliers (first one wins on ties)
top_out = max(out_summary, key=lambda row: row['iqr_outliers'])
issues.append({
    'variable': top_out['var'],
    'issue': 'Outliers detectados',
    'recomendacion': 'Aplicar winsorización o transformación robusta',
})

# Fixed entry for income skewness
issues.append({
    'variable': 'income',
    'issue': 'Skew alto',
    'recomendacion': 'Considerar log/yeo-johnson',
})

if issues:
    summary_df = pd.DataFrame(issues)
else:
    summary_df = pd.DataFrame(columns=['variable','issue','recomendacion'])
summary_df

### Exportar reporte resumen a Markdown

In [None]:
# Render the findings table as Markdown and write it to disk.
try:
    summary_md = summary_df.to_markdown(index=False) if not summary_df.empty else '# Sin issues'
except ImportError:
    # DataFrame.to_markdown needs the optional 'tabulate' package; fall back
    # to a plain-text table instead of failing the export.
    summary_md = summary_df.to_string(index=False)

# The header literal must stay on one line with escaped newlines: a raw line
# break inside a single-quoted string is a syntax error.
with open('reporte_hallazgos.md', 'w', encoding='utf-8') as f:
    f.write('# Hallazgos EDA\n\n' + summary_md)
print('Archivo generado: reporte_hallazgos.md')