# In [None]:
# ================================================
# PIPELINE MODULAR FINAL PARA TRAIN Y TEST
# ================================================
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder
import joblib

def _adjuntar_id_y_target(df_features, id_col_name, id_values, y_col, y_values):
    """Return a copy of ``df_features`` with the ID first and the target (if any) last."""
    salida = df_features.copy()
    salida.insert(0, id_col_name, id_values)
    if y_values is not None:
        salida[y_col] = y_values
    return salida


def preprocesar_pipeline(df, y_col=None, id_col_name="ID",
                         generar_csv_individual=True, train=True, prefijo_csv="train"):
    """
    Preprocess a DataFrame and write the per-column and complete CSV files.

    Numeric columns (int64/float64) are median-imputed and robust-scaled.
    Categorical columns (object) are imputed with the most frequent value and
    then one-hot encoded (at most 10 distinct values, first level dropped) or
    frequency encoded (more than 10 distinct values).

    With ``train=True`` every transformer is fitted on ``df`` and saved in the
    current working directory (``imp_num.pkl``, ``scaler.pkl``, ``imp_cat.pkl``,
    ``ohe_dict.pkl``, ``freq_dict.pkl``). With ``train=False`` those files are
    loaded and only applied, so the test split gets exactly the same columns,
    scaling and encoding as the train split. The artifacts must have been
    produced by a ``train=True`` run of this same version of the function.

    Args:
        df: input DataFrame. It is not modified.
        y_col: name of the target column (optional, e.g. for test data).
        id_col_name: name of the ID column. It is never used as a feature and
            is written back, untouched, as the first output column.
        generar_csv_individual: if True, write one CSV per categorical column
            named ``{prefijo_csv}_preprocesado_{col}.csv``.
        train: if True, fit and save the transformers; if False, load and
            reuse the ones saved by a previous training run.
        prefijo_csv: prefix of the generated CSV files.

    Returns:
        df_completo: the complete preprocessed DataFrame (ID, numeric features,
        encoded categorical features and, when given, the target).

    Raises:
        KeyError: if ``id_col_name`` or ``y_col`` is not in ``df``, or if
            ``train=False`` and a column seen during training is missing.
        FileNotFoundError: if ``train=False`` and the saved artifacts are absent.
    """
    # ----------------------------
    # Split off ID and target
    # ----------------------------
    id_values = df[id_col_name].reset_index(drop=True)
    y_values = df[y_col].reset_index(drop=True) if y_col is not None else None

    # ----------------------------
    # Drop unnecessary columns
    # ----------------------------
    cols_a_eliminar = [
        "E_VALORMATRICULAUNIVERSIDAD",
        "F_TIENELAVADORA",
        "INDICADOR_1",
        "INDICADOR_2",
        "INDICADOR_3",
        "INDICADOR_4",
        "PERIODO_ACADEMICO"
    ]
    # The ID and the target travel separately; removing them here keeps a
    # numeric ID from being imputed/scaled as if it were a feature.
    excluidas = set(cols_a_eliminar) | {id_col_name}
    if y_col is not None:
        excluidas.add(y_col)
    features = df.drop(columns=[c for c in df.columns if c in excluidas])
    features = features.reset_index(drop=True)

    # ----------------------------
    # Transformers and column roles
    # ----------------------------
    max_unique = 10
    if train:
        num_cols = features.select_dtypes(include=["int64", "float64"]).columns.tolist()
        cat_cols = features.select_dtypes(include=["object"]).columns.tolist()
        imp_num = SimpleImputer(strategy="median")
        scaler = RobustScaler()
        imp_cat = SimpleImputer(strategy="most_frequent")
        ohe_dict = {}
        freq_dict = {}
    else:
        # NOTE: joblib files are pickles; only load artifacts you produced yourself.
        imp_num = joblib.load("imp_num.pkl")
        scaler = joblib.load("scaler.pkl")
        imp_cat = joblib.load("imp_cat.pkl")
        ohe_dict = joblib.load("ohe_dict.pkl")
        freq_dict = joblib.load("freq_dict.pkl")
        # Column roles come from the fitted imputers rather than from the test
        # dtypes, so a column whose dtype differs in test is still handled the
        # same way as in train. ``feature_names_in_`` is absent when an imputer
        # was never fitted (no such columns in train); scikit-learn only sets
        # it when the training column names were all strings.
        num_cols = list(getattr(imp_num, "feature_names_in_", []))
        cat_cols = list(getattr(imp_cat, "feature_names_in_", []))
        faltantes = [c for c in num_cols + cat_cols if c not in features.columns]
        if faltantes:
            raise KeyError(
                f"Columns seen during training are missing from df: {faltantes}"
            )

    # ----------------------------
    # Numeric processing
    # ----------------------------
    if num_cols:
        if train:
            imputado = pd.DataFrame(imp_num.fit_transform(features[num_cols]),
                                    columns=num_cols)
            X_num = pd.DataFrame(scaler.fit_transform(imputado), columns=num_cols)
        else:
            imputado = pd.DataFrame(imp_num.transform(features[num_cols]),
                                    columns=num_cols)
            X_num = pd.DataFrame(scaler.transform(imputado), columns=num_cols)
    else:
        # scikit-learn rejects 0-feature input; keep an empty frame with the
        # right number of rows so the concatenations below still work.
        X_num = pd.DataFrame(index=features.index)

    if train:
        joblib.dump(imp_num, "imp_num.pkl")
        joblib.dump(scaler, "scaler.pkl")

    # ----------------------------
    # Categorical processing
    # ----------------------------
    # The imputer is fitted once on all categorical columns so that the saved
    # object knows every column (fitting it inside the loop would leave it
    # fitted on the last column only).
    if cat_cols:
        if train:
            imputadas = imp_cat.fit_transform(features[cat_cols])
        else:
            imputadas = imp_cat.transform(features[cat_cols])
        X_cat = pd.DataFrame(imputadas, columns=cat_cols)
    else:
        X_cat = pd.DataFrame(index=features.index)

    all_cat_processed = []
    for col in cat_cols:
        col_series = X_cat[col]

        if train:
            # Choose and fit the encoding from the training data only.
            if col_series.nunique() <= max_unique:
                # handle_unknown="ignore": categories first seen at test time
                # are encoded as all zeros instead of raising.
                ohe = OneHotEncoder(drop='first', sparse_output=False,
                                    handle_unknown='ignore')
                ohe.fit(col_series.to_numpy().reshape(-1, 1))
                ohe_dict[col] = ohe
            else:
                freq_dict[col] = col_series.value_counts(normalize=True)

        # Apply whichever encoding was chosen for this column during training.
        if col in ohe_dict:
            ohe = ohe_dict[col]
            col_ohe = ohe.transform(col_series.to_numpy().reshape(-1, 1))
            df_col = pd.DataFrame(col_ohe, columns=ohe.get_feature_names_out([col]))
        else:
            # Values never seen in training have no frequency: use 0.0.
            df_col = col_series.map(freq_dict[col]).fillna(0.0).to_frame(name=f"{col}_freq")

        all_cat_processed.append(df_col)

        # Individual CSV: numeric features + this column's encoding + raw value
        if generar_csv_individual:
            df_individual = pd.concat([X_num, df_col], axis=1)
            df_individual[col + "_original"] = col_series
            df_individual = _adjuntar_id_y_target(
                df_individual, id_col_name, id_values, y_col, y_values
            )
            # Same "<prefix>_preprocesado_<suffix>.csv" pattern as the complete CSV.
            df_individual.to_csv(f"{prefijo_csv}_preprocesado_{col}.csv", index=False)

    if train:
        joblib.dump(imp_cat, "imp_cat.pkl")
        joblib.dump(ohe_dict, "ohe_dict.pkl")
        joblib.dump(freq_dict, "freq_dict.pkl")

    # ----------------------------
    # Complete CSV
    # ----------------------------
    df_completo = pd.concat([X_num] + all_cat_processed, axis=1)
    df_completo = _adjuntar_id_y_target(
        df_completo, id_col_name, id_values, y_col, y_values
    )
    df_completo.to_csv(f"{prefijo_csv}_preprocesado_completo.csv", index=False)

    return df_completo

# ------------------------------------------------
# Example usage for TRAIN (target column present)
# ------------------------------------------------
# Reads train.csv from the working directory and runs the pipeline in
# training mode, keeping the processed frame in df_train_proc.
df_train = pd.read_csv("train.csv")
df_train_proc = preprocesar_pipeline(
    df_train,
    y_col="RENDIMIENTO_GLOBAL",
    id_col_name="ID",
    generar_csv_individual=True,
    train=True,
    prefijo_csv="train",
)

# ------------------------------------------------
# Ejemplo de uso para TEST (sin target)
# ------------------------------------------------
# df_test = pd.read_csv("test.csv")
# df_test_proc = preprocesar_pipeline(df_test, y_col=None, id_col_name="ID",
#                                     generar_csv_individual=False, train=False, prefijo_csv="test")