Entrega : 01

Grupo : 07

In [None]:
import os
from pathlib import Path 
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer

In [97]:
BASE_DIR = Path.cwd() # directorio del notebook
print(BASE_DIR)

# Rutas para datos
DATA_DIR = BASE_DIR / "data"
RAW_DATA_PATH = DATA_DIR / "raw"
INTER_DATA_PATH = DATA_DIR / "inter"
FINAL_DATA_PATH = DATA_DIR / "final"
# Rutas para gráficos, scripts y documentación
GRAPHS_PATH = BASE_DIR / "graphs"
SCRIPTS_PATH = BASE_DIR / "scripts"
DOCS_PATH = BASE_DIR / "docs"


c:\Users\leydi\Documents\Machine-Learning-para-Finanzas_2026_0\Grupo07


In [98]:
# cargando datos
df_income_raw = pd.read_excel(RAW_DATA_PATH / "income_dataset.xlsx")
df_lending_raw = pd.read_excel(RAW_DATA_PATH / "lending_dataset.xlsx")

# EDA 

Análisis inicial

In [99]:
print(df_income_raw.shape)
print(df_lending_raw.shape)

(10000, 14)
(19908, 20)


In [100]:
df_income_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   Age                             10000 non-null  int64 
 1   Education_Level                 10000 non-null  object
 2   Occupation                      10000 non-null  object
 3   Number_of_Dependents            10000 non-null  int64 
 4   Location                        10000 non-null  object
 5   Work_Experience                 10000 non-null  int64 
 6   Marital_Status                  10000 non-null  object
 7   Employment_Status               10000 non-null  object
 8   Household_Size                  10000 non-null  int64 
 9   Homeownership_Status            10000 non-null  object
 10  Type_of_Housing                 10000 non-null  object
 11  Gender                          10000 non-null  object
 12  Primary_Mode_of_Transportation  10000 non-null 

In [101]:
df_lending_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19908 entries, 0 to 19907
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 19908 non-null  int64  
 1   loan_amnt             19908 non-null  int64  
 2   term                  19908 non-null  int64  
 3   int_rate              19908 non-null  float64
 4   installment           19908 non-null  float64
 5   grade                 19908 non-null  object 
 6   emp_title             18723 non-null  object 
 7   emp_length            19406 non-null  object 
 8   home_ownership        19905 non-null  object 
 9   annual_inc            19905 non-null  float64
 10  verification_status   19905 non-null  object 
 11  loan_status           19905 non-null  object 
 12  pymnt_plan            19908 non-null  object 
 13  purpose               19905 non-null  object 
 14  addr_state            19899 non-null  object 
 15  dti                

## 1. Análisis estadístico inicial 

In [102]:
TARGET_REG = "Income"  # variable objetivo para regresión
TARGET_CLAS = "loan_status"  # variable objetivo para clasificación

# selecciona las columnas numéricas y categóricas excepto la variable objetivo

df_reg = df_income_raw.copy()
num_reg= df_reg.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_reg = df_reg.select_dtypes(include=['object', "category", "bool"]).columns.tolist()
num_reg.remove(TARGET_REG)  

df_clas = df_lending_raw.copy()
num_clas = df_clas.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_clas = df_clas.select_dtypes(include=['object', "category", "bool"]).columns.tolist()
cat_clas.remove(TARGET_CLAS)  

print("=== Variables Identificadas ===")
print("Target de regresión:", TARGET_REG)
print("Variables numéricas para regresión:", num_reg)
print("Variables categóricas para regresión:", cat_reg)
print("--------------------------------")
print("Target de clasificación:", TARGET_CLAS)
print("Variables numéricas para clasificación:", num_clas)
print("Variables categóricas para clasificación:", cat_clas)

=== Variables Identificadas ===
Target de regresión: Income
Variables numéricas para regresión: ['Age', 'Number_of_Dependents', 'Work_Experience', 'Household_Size']
Variables categóricas para regresión: ['Education_Level', 'Occupation', 'Location', 'Marital_Status', 'Employment_Status', 'Homeownership_Status', 'Type_of_Housing', 'Gender', 'Primary_Mode_of_Transportation']
--------------------------------
Target de clasificación: loan_status
Variables numéricas para clasificación: ['index', 'loan_amnt', 'term', 'int_rate', 'installment', 'annual_inc', 'dti', 'delinq_2yrs', 'revol_util', 'pub_rec_bankruptcies']
Variables categóricas para clasificación: ['grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'pymnt_plan', 'purpose', 'addr_state', 'application_type']


### 1.1 Numéricas - descriptivas 

In [119]:
def eda_numericas_descriptivas(df, num_cols, top_n=5):
    """
    Realiza EDA descriptivo para variables numéricas.
    
    Args:
        df: DataFrame a analizar
        num_cols: Lista de columnas numéricas
        top_n: Número de valores más frecuentes a mostrar
    """
    print("=" * 50)
    print("ESTADÍSTICAS DESCRIPTIVAS")
    print("=" * 50)
    display(df[num_cols].describe().T.round(2))
    
    print("\n" + "=" * 50)
    print("QUANTILES (1%, 25%, 50%, 75%, 99%)")
    print("=" * 50)
    display(df[num_cols].quantile([0.01, 0.25, 0.5, 0.75, 0.99]).T.round(2))
    
    print("\n" + "=" * 50)
    print("VALORES NULOS")
    print("=" * 50)
    print(df[num_cols].isna().sum())
    
    print("\n" + "=" * 50)
    print(f"TOP {top_n} VALORES MÁS FRECUENTES POR VARIABLE")
    print("=" * 50)
    for col in num_cols:
        print(f"\n--- {col} ---")
        print(df[col].value_counts().head(top_n).round(2))

In [104]:
# regresión
eda_numericas_descriptivas(df_reg, num_reg)

ESTADÍSTICAS DESCRIPTIVAS


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,10000.0,44.0217,15.203998,18.0,31.0,44.0,57.0,70.0
Number_of_Dependents,10000.0,2.527,1.713991,0.0,1.0,3.0,4.0,5.0
Work_Experience,10000.0,24.8588,14.652622,0.0,12.0,25.0,37.0,50.0
Household_Size,10000.0,3.9896,2.010496,1.0,2.0,4.0,6.0,7.0



QUANTILES (1%, 25%, 50%, 75%, 99%)


Unnamed: 0,0.01,0.25,0.50,0.75,0.99
Age,18.0,31.0,44.0,57.0,70.0
Number_of_Dependents,0.0,1.0,3.0,4.0,5.0
Work_Experience,0.0,12.0,25.0,37.0,50.0
Household_Size,1.0,2.0,4.0,6.0,7.0



VALORES NULOS
Age                     0
Number_of_Dependents    0
Work_Experience         0
Household_Size          0
dtype: int64

TOP 5 VALORES MÁS FRECUENTES POR VARIABLE

--- Age ---
Age
43    223
66    216
62    215
40    212
64    209
Name: count, dtype: int64

--- Number_of_Dependents ---
Number_of_Dependents
5    1745
3    1712
1    1651
0    1642
4    1629
Name: count, dtype: int64

--- Work_Experience ---
Work_Experience
2     224
44    221
48    219
24    218
33    216
Name: count, dtype: int64

--- Household_Size ---
Household_Size
1    1477
3    1445
6    1437
7    1433
5    1411
Name: count, dtype: int64


In [120]:
# clasificación
eda_numericas_descriptivas(df_clas, num_clas)

ESTADÍSTICAS DESCRIPTIVAS


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
index,19908.0,9954.5,5747.09,1.0,4977.75,9954.5,14931.25,19908.0
loan_amnt,19908.0,11353.85,7463.7,500.0,5750.0,10000.0,15000.0,35000.0
term,19908.0,42.1,10.45,36.0,36.0,36.0,60.0,60.0
int_rate,19908.0,12.09,3.7,5.42,9.45,11.86,14.61,24.4
installment,19908.0,330.61,210.56,15.69,171.29,285.78,440.82,1302.69
annual_inc,19905.0,71074.71,69810.72,4000.0,42000.0,60000.0,85000.0,6000000.0
dti,19897.0,13.01,6.66,0.0,7.84,13.08,18.2,29.99
delinq_2yrs,19899.0,0.14,0.49,0.0,0.0,0.0,0.0,11.0
revol_util,19880.0,49.74,49.82,0.0,26.5,50.3,72.8,5829.0
pub_rec_bankruptcies,19587.0,0.04,0.19,0.0,0.0,0.0,0.0,2.0



QUANTILES (1%, 25%, 50%, 75%, 99%)


Unnamed: 0,0.01,0.25,0.50,0.75,0.99
index,200.07,4977.75,9954.5,14931.25,19708.93
loan_amnt,1200.0,5750.0,10000.0,15000.0,35000.0
term,36.0,36.0,36.0,60.0,60.0
int_rate,5.42,9.45,11.86,14.61,20.99
installment,39.14,171.29,285.78,440.82,933.13
annual_inc,14400.0,42000.0,60000.0,85000.0,248003.84
dti,0.35,7.84,13.08,18.2,26.26
delinq_2yrs,0.0,0.0,0.0,0.0,2.0
revol_util,0.0,26.5,50.3,72.8,98.5
pub_rec_bankruptcies,0.0,0.0,0.0,0.0,1.0



VALORES NULOS
index                     0
loan_amnt                 0
term                      0
int_rate                  0
installment               0
annual_inc                3
dti                      11
delinq_2yrs               9
revol_util               28
pub_rec_bankruptcies    321
dtype: int64

TOP 5 VALORES MÁS FRECUENTES POR VARIABLE

--- index ---
index
19908    1
1        1
2        1
3        1
19892    1
Name: count, dtype: int64

--- loan_amnt ---
loan_amnt
10000    1466
12000    1208
5000     1026
6000      994
15000     984
Name: count, dtype: int64

--- term ---
term
36    14852
60     5056
Name: count, dtype: int64

--- int_rate ---
int_rate
10.99    492
11.49    407
13.49    406
7.51     384
7.88     381
Name: count, dtype: int64

--- installment ---
installment
311.11    37
180.96    37
368.45    33
311.02    30
325.74    28
Name: count, dtype: int64

--- annual_inc ---
annual_inc
60000.0    794
50000.0    554
40000.0    445
45000.0    431
75000.0    425
Name:

### 1.2 Categóricas - frecuencias

In [106]:
def eda_categoricas_descriptivas(df, cat_cols, top_n=10):
    """
    Realiza EDA descriptivo para variables categóricas.
    
    Args:
        df: DataFrame a analizar
        cat_cols: Lista de columnas categóricas
        top_n: Número de categorías más frecuentes a mostrar
    """
    print("=" * 50)
    print("VALORES ÚNICOS POR VARIABLE")
    print("=" * 50)
    display(df[cat_cols].nunique().to_frame("n_unique"))
    
    print("\n" + "=" * 50)
    print("VALORES NULOS")
    print("=" * 50)
    print(df[cat_cols].isna().sum())
    
    print("\n" + "=" * 50)
    print(f"TOP {top_n} FRECUENCIAS POR VARIABLE")
    print("=" * 50)
    for col in cat_cols:
        print(f"\n--- {col} ---")
        print(df[col].value_counts(dropna=False).head(top_n))

In [107]:
# regresión
eda_categoricas_descriptivas(df_reg, cat_reg)

VALORES ÚNICOS POR VARIABLE


Unnamed: 0,n_unique
Education_Level,4
Occupation,5
Location,3
Marital_Status,3
Employment_Status,3
Homeownership_Status,2
Type_of_Housing,3
Gender,2
Primary_Mode_of_Transportation,4



VALORES NULOS
Education_Level                   0
Occupation                        0
Location                          0
Marital_Status                    0
Employment_Status                 0
Homeownership_Status              0
Type_of_Housing                   0
Gender                            0
Primary_Mode_of_Transportation    0
dtype: int64

TOP 10 FRECUENCIAS POR VARIABLE

--- Education_Level ---
Education_Level
Bachelor's     4058
High School    2959
Master's       2482
Doctorate       501
Name: count, dtype: int64

--- Occupation ---
Occupation
Healthcare    3035
Technology    2407
Finance       1525
Others        1521
Education     1512
Name: count, dtype: int64

--- Location ---
Location
Urban       7037
Suburban    1951
Rural       1012
Name: count, dtype: int64

--- Marital_Status ---
Marital_Status
Married     5136
Single      3900
Divorced     964
Name: count, dtype: int64

--- Employment_Status ---
Employment_Status
Full-time        5004
Part-time        3016
Self-em

In [108]:
# clasificación
eda_categoricas_descriptivas(df_clas, cat_clas)

VALORES ÚNICOS POR VARIABLE


Unnamed: 0,n_unique
grade,7
emp_title,15047
emp_length,11
home_ownership,5
verification_status,3
pymnt_plan,1
purpose,14
addr_state,8
application_type,2



VALORES NULOS
grade                     0
emp_title              1185
emp_length              502
home_ownership            3
verification_status       3
pymnt_plan                0
purpose                   3
addr_state                9
application_type          0
dtype: int64

TOP 10 FRECUENCIAS POR VARIABLE

--- grade ---
grade
B    6065
A    4875
C    4087
D    2748
E    1434
F     540
G     159
Name: count, dtype: int64

--- emp_title ---
emp_title
NaN                    1185
Bank of America          53
Kaiser Permanente        50
State of California      42
US Army                  40
AT&T                     36
JP Morgan Chase          27
IBM                      25
Verizon                  24
UPS                      23
Name: count, dtype: int64

--- emp_length ---
emp_length
10+ years    4249
< 1 year     2304
2 years      2218
3 years      2108
4 years      1871
5 years      1649
1 year       1636
6 years      1151
7 years       867
8 years       718
Name: count, dtype: int6

## 2. Análisis de calidad (diagnóstico)

### 2.1) Numéricas

In [109]:
def eda_calidad_numericas(df, num_cols, nombre_dataset="Dataset"):
    """
    Realiza análisis de calidad (diagnóstico) para variables numéricas.
    
    Args:
        df: DataFrame a analizar
        num_cols: Lista de columnas numéricas
        nombre_dataset: Nombre del dataset para identificación
    """
    print("=" * 50)
    print(f"ANÁLISIS DE CALIDAD - {nombre_dataset}")
    print("=" * 50)
    
    print("\nVariables numéricas analizadas:")
    print(num_cols)
    
    # 1) Información general
    print("\n" + "=" * 50)
    print("INFORMACIÓN GENERAL (dtypes, nulos)")
    print("=" * 50)
    df[num_cols].info()
    
    # 2) Diagnóstico de dtypes incorrectos
    print("\n" + "=" * 50)
    print("DIAGNÓSTICO DE DTYPES INCORRECTOS")
    print("=" * 50)
    
    dtype_rows = []
    for col in num_cols:
        serie_original = df[col]
        serie_coerce = pd.to_numeric(serie_original, errors="coerce")
        
        dtype_rows.append({
            "variable": col,
            "dtype_original": serie_original.dtype,
            "NaN_antes": serie_original.isna().sum(),
            "NaN_despues_coerce": serie_coerce.isna().sum(),
            "valores_no_numericos": serie_coerce.isna().sum() - serie_original.isna().sum()
        })
    
    dtype_diag = pd.DataFrame(dtype_rows)
    display(dtype_diag)
    
    # 3) Análisis de missings redondeado a 2 decimales  

    print("\n" + "=" * 50)
    print("ANÁLISIS DE MISSINGS")
    print("=" * 50)
    
    missing_num = (
        df[num_cols]
        .isna()
        .mean()
        .round(2)
        .sort_values(ascending=False)
        .to_frame("pct_missing")
    )
    display(missing_num) 
    
    # 4) Análisis de outliers
    print("\n" + "=" * 50)
    print("ANÁLISIS DE OUTLIERS")
    print("=" * 50)
    
    quantiles_num = df[num_cols].quantile([0.01, 0.50, 0.99]).T
    quantiles_num.columns = ["p1", "p50", "p99"]
    
    min_max_num = df[num_cols].agg(["min", "max"]).T
    
    outliers_num = pd.concat([min_max_num, quantiles_num], axis=1)
    outliers_num = outliers_num[["min", "p1", "p50", "p99", "max"]]
    display(outliers_num)
    
    # 5) Análisis de duplicados
    print("\n" + "=" * 50)
    print("ANÁLISIS DE DUPLICADOS")
    print("=" * 50)
    
    n_duplicados = df.duplicated().sum()
    print(f"Número de filas duplicadas: {n_duplicados}")

In [110]:
eda_calidad_numericas(df_reg,num_reg, "Dataset de Regresión")

ANÁLISIS DE CALIDAD - Dataset de Regresión

Variables numéricas analizadas:
['Age', 'Number_of_Dependents', 'Work_Experience', 'Household_Size']

INFORMACIÓN GENERAL (dtypes, nulos)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   Age                   10000 non-null  int64
 1   Number_of_Dependents  10000 non-null  int64
 2   Work_Experience       10000 non-null  int64
 3   Household_Size        10000 non-null  int64
dtypes: int64(4)
memory usage: 312.6 KB

DIAGNÓSTICO DE DTYPES INCORRECTOS


Unnamed: 0,variable,dtype_original,NaN_antes,NaN_despues_coerce,valores_no_numericos
0,Age,int64,0,0,0
1,Number_of_Dependents,int64,0,0,0
2,Work_Experience,int64,0,0,0
3,Household_Size,int64,0,0,0



ANÁLISIS DE MISSINGS


Unnamed: 0,pct_missing
Age,0.0
Number_of_Dependents,0.0
Work_Experience,0.0
Household_Size,0.0



ANÁLISIS DE OUTLIERS


Unnamed: 0,min,p1,p50,p99,max
Age,18,18.0,44.0,70.0,70
Number_of_Dependents,0,0.0,3.0,5.0,5
Work_Experience,0,0.0,25.0,50.0,50
Household_Size,1,1.0,4.0,7.0,7



ANÁLISIS DE DUPLICADOS
Número de filas duplicadas: 0


In [111]:
eda_calidad_numericas(df_clas, num_clas, "Dataset dde Clasificación")

ANÁLISIS DE CALIDAD - Dataset dde Clasificación

Variables numéricas analizadas:
['index', 'loan_amnt', 'term', 'int_rate', 'installment', 'annual_inc', 'dti', 'delinq_2yrs', 'revol_util', 'pub_rec_bankruptcies']

INFORMACIÓN GENERAL (dtypes, nulos)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19908 entries, 0 to 19907
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 19908 non-null  int64  
 1   loan_amnt             19908 non-null  int64  
 2   term                  19908 non-null  int64  
 3   int_rate              19908 non-null  float64
 4   installment           19908 non-null  float64
 5   annual_inc            19905 non-null  float64
 6   dti                   19897 non-null  float64
 7   delinq_2yrs           19899 non-null  float64
 8   revol_util            19880 non-null  float64
 9   pub_rec_bankruptcies  19587 non-null  float64
dtypes: float64(7), int64(3)


Unnamed: 0,variable,dtype_original,NaN_antes,NaN_despues_coerce,valores_no_numericos
0,index,int64,0,0,0
1,loan_amnt,int64,0,0,0
2,term,int64,0,0,0
3,int_rate,float64,0,0,0
4,installment,float64,0,0,0
5,annual_inc,float64,3,3,0
6,dti,float64,11,11,0
7,delinq_2yrs,float64,9,9,0
8,revol_util,float64,28,28,0
9,pub_rec_bankruptcies,float64,321,321,0



ANÁLISIS DE MISSINGS


Unnamed: 0,pct_missing
pub_rec_bankruptcies,0.02
index,0.0
loan_amnt,0.0
term,0.0
installment,0.0
int_rate,0.0
annual_inc,0.0
dti,0.0
delinq_2yrs,0.0
revol_util,0.0



ANÁLISIS DE OUTLIERS


Unnamed: 0,min,p1,p50,p99,max
index,1.0,200.07,9954.5,19708.93,19908.0
loan_amnt,500.0,1200.0,10000.0,35000.0,35000.0
term,36.0,36.0,36.0,60.0,60.0
int_rate,5.42,5.42,11.86,20.99,24.4
installment,15.69,39.1449,285.78,933.1337,1302.69
annual_inc,4000.0,14400.0,60000.0,248003.84,6000000.0
dti,0.0,0.35,13.08,26.26,29.99
delinq_2yrs,0.0,0.0,0.0,2.0,11.0
revol_util,0.0,0.0,50.3,98.5,5829.0
pub_rec_bankruptcies,0.0,0.0,0.0,1.0,2.0



ANÁLISIS DE DUPLICADOS
Número de filas duplicadas: 0


### 2.2) Categóricas

In [112]:
def eda_calidad_categoricas(df, cat_cols, nombre_dataset="Dataset", umbral_low_freq=0.01):
    """
    Realiza análisis de calidad (diagnóstico) para variables categóricas.
    
    Args:
        df: DataFrame a analizar
        cat_cols: Lista de columnas categóricas
        nombre_dataset: Nombre del dataset para identificación
        umbral_low_freq: Umbral para detectar categorías con baja frecuencia (default 1%)
    """
    print("=" * 50)
    print(f"ANÁLISIS DE CALIDAD CATEGÓRICAS - {nombre_dataset}")
    print("=" * 50)
    
    print("\nVariables categóricas analizadas:")
    print(cat_cols)
    
    # 1) Análisis de missings
    print("\n" + "=" * 50)
    print("ANÁLISIS DE MISSINGS")
    print("=" * 50)
    
    missing_cat = (
        df[cat_cols]
        .isna()
        .mean()
        .round(2)
        .sort_values(ascending=False)
        .to_frame("pct_missing")
    )
    display(missing_cat)
    
    # 2) Análisis de missings codificados
    print("\n" + "=" * 50)
    print("MISSINGS CODIFICADOS (vacíos, '?')")
    print("=" * 50)
    
    rows_missing_cod = []
    for col in cat_cols:
        conteo_vacio = (df[col].astype(str).str.strip() == "").sum()
        conteo_preg = (df[col].astype(str).str.strip() == "?").sum()
        
        rows_missing_cod.append({
            "variable": col,
            "empty_string": conteo_vacio,
            "question_mark": conteo_preg
        })
    
    missing_cod = pd.DataFrame(rows_missing_cod)
    display(missing_cod)
    
    # 3) Categorías con baja frecuencia
    print("\n" + "=" * 50)
    print(f"CATEGORÍAS CON FRECUENCIA < {umbral_low_freq*100:.0f}%")
    print("=" * 50)
    
    rows_low_freq = []
    for col in cat_cols:
        freq = df[col].value_counts(normalize=True)
        low_freq = freq[freq < umbral_low_freq]
        
        for categoria, propor in low_freq.items():
            rows_low_freq.append({
                "variable": col,
                "categoria": categoria,
                "frecuencia_relativa": round(propor, 4)
            })
    
    low_freq_df = pd.DataFrame(rows_low_freq)
    
    if low_freq_df.empty:
        print("No se identificaron categorías con baja frecuencia.")
    else:
        display(low_freq_df)

In [113]:
# regresión
eda_calidad_categoricas(df_reg, cat_reg, "Dataset de Regresión")

ANÁLISIS DE CALIDAD CATEGÓRICAS - Dataset de Regresión

Variables categóricas analizadas:
['Education_Level', 'Occupation', 'Location', 'Marital_Status', 'Employment_Status', 'Homeownership_Status', 'Type_of_Housing', 'Gender', 'Primary_Mode_of_Transportation']

ANÁLISIS DE MISSINGS


Unnamed: 0,pct_missing
Education_Level,0.0
Occupation,0.0
Location,0.0
Marital_Status,0.0
Employment_Status,0.0
Homeownership_Status,0.0
Type_of_Housing,0.0
Gender,0.0
Primary_Mode_of_Transportation,0.0



MISSINGS CODIFICADOS (vacíos, '?')


Unnamed: 0,variable,empty_string,question_mark
0,Education_Level,0,0
1,Occupation,0,0
2,Location,0,0
3,Marital_Status,0,0
4,Employment_Status,0,0
5,Homeownership_Status,0,0
6,Type_of_Housing,0,0
7,Gender,0,0
8,Primary_Mode_of_Transportation,0,0



CATEGORÍAS CON FRECUENCIA < 1%
No se identificaron categorías con baja frecuencia.


In [114]:
# clasificación
eda_calidad_categoricas(df_clas, cat_clas, "Dataset de Clasificación")

ANÁLISIS DE CALIDAD CATEGÓRICAS - Dataset de Clasificación

Variables categóricas analizadas:
['grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'pymnt_plan', 'purpose', 'addr_state', 'application_type']

ANÁLISIS DE MISSINGS


Unnamed: 0,pct_missing
emp_title,0.06
emp_length,0.03
grade,0.0
home_ownership,0.0
verification_status,0.0
pymnt_plan,0.0
purpose,0.0
addr_state,0.0
application_type,0.0



MISSINGS CODIFICADOS (vacíos, '?')


Unnamed: 0,variable,empty_string,question_mark
0,grade,0,0
1,emp_title,0,0
2,emp_length,0,0
3,home_ownership,0,0
4,verification_status,0,0
5,pymnt_plan,0,0
6,purpose,0,0
7,addr_state,0,0
8,application_type,0,0



CATEGORÍAS CON FRECUENCIA < 1%


Unnamed: 0,variable,categoria,frecuencia_relativa
0,grade,G,0.0080
1,emp_title,Bank of America,0.0028
2,emp_title,Kaiser Permanente,0.0027
3,emp_title,State of California,0.0022
4,emp_title,US Army,0.0021
...,...,...,...
15052,purpose,educational,0.0081
15053,purpose,renewable_energy,0.0026
15054,addr_state,951xx,0.0001
15055,addr_state,106xx,0.0001


## 3) Preprocesamiento

#### Numéricas

In [115]:
df_reg_inter = df_reg.copy()
df_clas_inter = df_clas.copy()

In [116]:
def imputar_missings_numericas(df, num_cols, estrategia="median"):
    """
    Imputa missings en variables numéricas.
    
    Args:
        df: DataFrame a procesar
        num_cols: Lista de columnas numéricas
        estrategia: Estrategia de imputación ("mean", "median", "zero")
    
    Returns:
        DataFrame con resumen de imputación
    """
    resumen = []
    
    for col in num_cols:
        n_missing = df[col].isna().sum()
        pct_missing = df[col].isna().mean() * 100
        
        if estrategia == "mean":
            valor_imputar = df[col].mean()
        elif estrategia == "median":
            valor_imputar = df[col].median()
        elif estrategia == "zero":
            valor_imputar = 0
        else:
            raise ValueError("Estrategia no reconocida. Use 'mean', 'median' o 'zero'.")
        
        df[col].fillna(valor_imputar, inplace=True)
        
        resumen.append({
            "variable": col,
            "n_missings_imputados": n_missing,
            "pct_missings_imputados": round(pct_missing, 2),
            "valor_imputado": round(valor_imputar, 4)
        })
    
    resumen_df = pd.DataFrame(resumen)
    
    print(f"Estrategia usada: {estrategia}")
    
    return resumen_df

In [117]:
imputar_missings_numericas(df_reg_inter, num_reg, estrategia="median")

Estrategia usada: median


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(valor_imputar, inplace=True)


Unnamed: 0,variable,n_missings_imputados,pct_missings_imputados,valor_imputado
0,Age,0,0.0,44.0
1,Number_of_Dependents,0,0.0,3.0
2,Work_Experience,0,0.0,25.0
3,Household_Size,0,0.0,4.0


In [118]:
imputar_missings_numericas(df_clas_inter, num_clas, estrategia="median")

Estrategia usada: median


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(valor_imputar, inplace=True)


Unnamed: 0,variable,n_missings_imputados,pct_missings_imputados,valor_imputado
0,index,0,0.0,9954.5
1,loan_amnt,0,0.0,10000.0
2,term,0,0.0,36.0
3,int_rate,0,0.0,11.86
4,installment,0,0.0,285.78
5,annual_inc,3,0.02,60000.0
6,dti,11,0.06,13.08
7,delinq_2yrs,9,0.05,0.0
8,revol_util,28,0.14,50.3
9,pub_rec_bankruptcies,321,1.61,0.0


In [None]:
# regresión
transformar_log_numericas(df_reg_inter, num_reg, umbral_skew=1.0)

In [None]:
# clasificación
transformar_log_numericas(df_clas_inter, num_clas, umbral_skew=1.0)