# PROCEDIMIENTO
1) Se cargaron los conjuntos train/validation/test en formato CSV provistos.
2) Se definió una búsqueda exhaustiva (grid) sobre 3 hiperparámetros de RIPPER (27 combinaciones):
   - k            ∈ {1, 2, 3}
   - prune_size   ∈ {0.2, 0.33, 0.5}
   - max_rules    ∈ {None, 10, 20}
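3) Para cada combinación (producto cartesiano de las tres listas) se entrenó un modelo con TRAIN y se evaluó
   en TRAIN y VALIDATION, calculando tres métricas: Accuracy, F1 (macro) y LogLoss.
   El LogLoss se calcula a partir de las etiquetas predichas, convertidas en probabilidades 'duras' (one-hot con eps).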
   En la tabla comparativa se incluye:
   - 'hiperparámetros': combinación usada
   - 'métrica'        : nombre de la métrica
   - 'score_train'    y 'error_train' (1 - score para accuracy/f1; logloss como error)
   - 'score_val'      y 'error_val'   (ídem, en VALIDATION)
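4) Se seleccionó el mejor modelo por el mayor F1 (macro) en VALIDATION y se reentrenó con todo TRAIN
   (con FAST=True el grid usa submuestras de TRAIN y VALIDATION; el reentrenamiento usa TRAIN completo).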
5) Se calculó el rendimiento en TEST (accuracy, F1 y logloss).
6) Se generó un ejemplo 'inventado' usando medias/modas de TRAIN y se predijo su clase.
7) Se imprimieron las reglas aprendidas para interpretabilidad.

In [None]:
# ==========================================================
# RIPPER (JRip) - Grid de Hiperparámetros + Evaluación RÁPIDA
# Cumple: a) b) c) d) e) f)
# Archivos esperados:
#   TrainX_eng_cls.csv, TrainY_eng_cls.csv
#   ValidationX_eng_cls.csv, ValidationY_eng_cls.csv
#   TestX_eng_cls.csv, TestY_eng_cls.csv
# ==========================================================

# 0) Instalación e imports
try:
    import wittgenstein as lw
except ModuleNotFoundError:
    !pip install -q wittgenstein
    import wittgenstein as lw

import os, numpy as np, pandas as pd, time
from itertools import product
from IPython.display import display
from sklearn.metrics import accuracy_score, f1_score, log_loss
from sklearn.multiclass import OneVsRestClassifier

# Do not truncate long cell contents when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

# -----------------------
# 1) Quick configuration
# -----------------------
FAST = True            # True = speed up the grid search by subsampling
GRID_MAX_TRAIN = 6000  # max number of TRAIN rows used by the grid (only if FAST)
GRID_MAX_VAL   = 3000  # max number of VALIDATION rows used by the grid (only if FAST)
SHOW_RULES_MAX_CLASSES = 3  # max number of classes whose rules are printed (keeps output short)

# -----------------------
# 2) Data loading
# -----------------------
base_path = "./"  # change this if the CSV files live in another directory

def _p(name):
    """Return the path of the data file `name` located under `base_path`."""
    full_path = os.path.join(base_path, name)
    return full_path

# Every CSV the notebook needs; all of them must exist before loading starts.
files = [
    "TrainX_eng_cls.csv", "TrainY_eng_cls.csv",
    "ValidationX_eng_cls.csv", "ValidationY_eng_cls.csv",
    "TestX_eng_cls.csv", "TestY_eng_cls.csv",
]
missing = [f for f in files if not os.path.exists(_p(f))]
# Raise explicitly instead of using `assert`, which is removed under `python -O`.
if missing:
    raise FileNotFoundError(f"No se encontraron estos archivos: {missing}")

# squeeze("columns") only collapses the column axis, so a one-row target file
# still yields a Series instead of a bare scalar.
X_tr = pd.read_csv(_p("TrainX_eng_cls.csv"))
y_tr = pd.read_csv(_p("TrainY_eng_cls.csv")).squeeze("columns")

X_va = pd.read_csv(_p("ValidationX_eng_cls.csv"))
y_va = pd.read_csv(_p("ValidationY_eng_cls.csv")).squeeze("columns")

X_te = pd.read_csv(_p("TestX_eng_cls.csv"))
y_te = pd.read_csv(_p("TestY_eng_cls.csv")).squeeze("columns")

# Ensure 1-D targets: if a target file has several columns, keep the first one.
if isinstance(y_tr, pd.DataFrame): y_tr = y_tr.iloc[:, 0]
if isinstance(y_va, pd.DataFrame): y_va = y_va.iloc[:, 0]
if isinstance(y_te, pd.DataFrame): y_te = y_te.iloc[:, 0]

# Subsampling is used ONLY for the grid search (the best model is refit on all rows)
def maybe_sample(X, y, nmax, seed=42):
    """Return at most `nmax` rows of `X` and `y`, drawn without replacement.

    When `nmax` is None or `X` already has `nmax` rows or fewer, the inputs are
    returned untouched. Otherwise the same row positions are taken from both
    objects, kept in their original relative order, and both results get a
    fresh 0..n-1 index. `seed` makes the draw reproducible.
    """
    needs_sampling = nmax is not None and len(X) > nmax
    if not needs_sampling:
        return X, y

    positions = pd.Series(range(len(X)))
    chosen = positions.sample(n=nmax, random_state=seed, replace=False)
    # Values equal their index labels here, so sorting the values restores row order.
    keep = chosen.sort_values().to_numpy()

    X_sub = X.iloc[keep].reset_index(drop=True)
    y_sub = y.iloc[keep].reset_index(drop=True)
    return X_sub, y_sub

# Grid-search subsets: capped when FAST is on; a cap of None makes
# maybe_sample hand back the full TRAIN / VALIDATION objects unchanged.
_train_cap = GRID_MAX_TRAIN if FAST else None
_val_cap = GRID_MAX_VAL if FAST else None
X_tr_g, y_tr_g = maybe_sample(X_tr, y_tr, _train_cap)
X_va_g, y_va_g = maybe_sample(X_va, y_va, _val_cap)

# -----------------------
# 3) Metric helpers
# -----------------------
def evaluate_set(y_true, labels_or_proba, metric="accuracy"):
    """Return ``(score, error)`` for one metric computed from predicted labels.

    Parameters:
        y_true: true class labels.
        labels_or_proba: predicted class labels (for speed, predict_proba is
            not used, so these are hard labels, not probabilities).
        metric: "accuracy", "f1" (macro-averaged) or "log_loss".

    Returns:
        A ``(score, error)`` tuple. For accuracy and f1 the error is
        ``1 - score``. For log_loss the error is the loss itself and the
        score is its negative, so that "higher score is better" holds for
        every metric.

    Raises:
        ValueError: if `metric` is not one of the supported names.
    """
    if metric == "accuracy":
        score = accuracy_score(y_true, labels_or_proba)
        return score, 1 - score
    if metric == "f1":
        score = f1_score(y_true, labels_or_proba, average="macro")
        return score, 1 - score
    if metric == "log_loss":
        predicted = np.asarray(labels_or_proba)
        # Use the union of true and predicted labels: a class that was
        # predicted but never appears in y_true (e.g. on a small subsample)
        # must still get a probability column instead of raising KeyError.
        classes = np.unique(np.concatenate([np.asarray(y_true), predicted]))
        n_classes = len(classes)
        eps = 1e-9
        # "Hard" probabilities: 1 - eps on the predicted class and eps spread
        # evenly over the others, so each row sums to 1 and log(0) never occurs.
        off_value = eps / (n_classes - 1) if n_classes > 1 else 0.0
        hard = np.full((len(predicted), n_classes), off_value)
        # `classes` is sorted and contains every predicted label, so
        # searchsorted returns the exact column of each prediction.
        columns = np.searchsorted(classes, predicted)
        hard[np.arange(len(predicted)), columns] = 1.0 - eps
        loss = log_loss(y_true, hard, labels=classes)
        return -loss, loss
    raise ValueError("Métrica no reconocida.")

metrics = ["accuracy", "f1", "log_loss"]

# ------------------------------------------
# 4) Binary vs. multiclass detection
# ------------------------------------------
classes = np.unique(y_tr)
is_binary = len(classes) == 2

# Positive class for the binary case: 1 when the labels are exactly {0, 1},
# otherwise the largest label in sorted order. Stays None for multiclass.
if not is_binary:
    pos_class_for_binary = None
elif set(classes) == {0, 1}:
    pos_class_for_binary = 1
else:
    pos_class_for_binary = sorted(classes)[-1]

def make_estimator(k, prune, maxr, pos_class=None):
    """Build an unfitted RIPPER estimator for the given hyperparameters.

    `k`, `prune` and `maxr` are forwarded as RIPPER's `k`, `prune_size` and
    `max_rules`. `pos_class` is forwarded only when it is not None. For a
    binary target the bare RIPPER model is returned; otherwise it is wrapped
    in a One-vs-Rest classifier (one rule set per class).
    """
    ripper_kwargs = {
        "k": k,
        "prune_size": prune,
        "max_rules": maxr,
        "n_discretize_bins": 10,
        "random_state": 42,
        "verbosity": 0,
    }
    if pos_class is not None:
        ripper_kwargs["pos_class"] = pos_class

    ripper = lw.RIPPER(**ripper_kwargs)
    return ripper if is_binary else OneVsRestClassifier(ripper)

# -----------------------
# 5) Hyperparameter grid (a, b)
# -----------------------
param_grid = {
    "k": [1, 2, 3],
    "prune_size": [0.2, 0.33, 0.5],
    "max_rules": [None, 10, 20],
}

# -----------------------
# 6) Search + comparison table (c)
# -----------------------
results = []
t0 = time.time()
combos = list(product(param_grid["k"], param_grid["prune_size"], param_grid["max_rules"]))
total_combos = len(combos)
combo_idx = 0

for combo_idx, (k, prune, maxr) in enumerate(combos, start=1):
    print(f"[{combo_idx:02d}/{total_combos}] Entrenando k={k}, prune={prune}, max_rules={maxr} ...", flush=True)

    # Fit on the grid subset (a subsample of TRAIN when FAST is on).
    model = make_estimator(k, prune, maxr, pos_class=pos_class_for_binary)
    model.fit(X_tr_g, y_tr_g)

    # Predict labels once per split and reuse them for every metric.
    tr_pred = model.predict(X_tr_g)
    va_pred = model.predict(X_va_g)

    combo_label = f"k={k} | prune_size={prune} | max_rules={maxr}"
    for m in metrics:
        tr_score, tr_error = evaluate_set(y_tr_g, tr_pred, m)
        va_score, va_error = evaluate_set(y_va_g, va_pred, m)
        row = {
            "hiperparámetros": combo_label,
            "métrica": m,
            "score_train": tr_score,
            "error_train": tr_error,
            "score_val": va_score,
            "error_val": va_error,
        }
        results.append(row)

elapsed = time.time() - t0
print(f"\nGrid terminado en {elapsed:.1f} s")

# One row per (combination, metric), grouped by metric with the best
# validation score first inside each group.
df_results = pd.DataFrame(results)
df_results = df_results.sort_values(by=["métrica", "score_val"], ascending=[True, False])
df_results = df_results.reset_index(drop=True)

print("\nTabla comparativa de hiperparámetros, métricas y errores (train/val):")
display(df_results)

# -----------------------
# 7) Pick the best combination by validation F1
# -----------------------
f1_rows = df_results[df_results["métrica"] == "f1"]
best_row = f1_rows.sort_values(by="score_val", ascending=False).iloc[0]

print("\nMejor combinación por F1 (validation):")
best_summary = best_row[["hiperparámetros", "score_val", "error_val"]]
print(best_summary.to_string(index=False))

def parse_params(s):
    """Parse a label such as ``"k=1 | prune_size=0.2 | max_rules=None"``.

    The string is split on ``|`` and each piece on its first ``=``. Values
    are converted as follows: the text ``None`` becomes ``None``; otherwise
    ``int`` is tried, then ``float`` (so floats rendered in scientific
    notation, e.g. ``1e-05``, are recovered as numbers); anything else is
    kept as a string.

    Parameters:
        s: the ``name=value`` pairs separated by ``|``.

    Returns:
        A dict mapping each stripped name to its converted value.

    Raises:
        ValueError: if a piece does not contain ``=``.
    """
    vals = {}
    for part in s.split("|"):
        # Split on the first "=" only, so a value may itself contain "=".
        key, sep, raw = part.partition("=")
        if not sep:
            raise ValueError(f"Parámetro sin '=': {part.strip()!r}")
        key = key.strip()
        raw = raw.strip()

        if raw == "None":
            vals[key] = None
            continue

        for cast in (int, float):
            try:
                vals[key] = cast(raw)
                break
            except ValueError:
                continue
        else:
            # Not numeric: keep the raw text.
            vals[key] = raw
    return vals

# Rebuild the winning configuration from its label and refit it on the full TRAIN set.
best_params = parse_params(best_row["hiperparámetros"])
best_k, best_prune, best_maxr = (
    best_params[name] for name in ("k", "prune_size", "max_rules")
)
best_model = make_estimator(best_k, best_prune, best_maxr, pos_class=pos_class_for_binary)

print("\nReentrenando el mejor modelo con TODO el TRAIN ...")
best_model.fit(X_tr, y_tr)

# -----------------------
# 8) Error on TEST (d)
# -----------------------
te_pred = best_model.predict(X_te)
test_acc_score, test_acc_error = evaluate_set(y_te, te_pred, metric="accuracy")
test_f1_score, test_f1_error = evaluate_set(y_te, te_pred, metric="f1")
# Log loss is computed from hard (one-hot) probabilities; only the loss is reported.
_, test_ll_error = evaluate_set(y_te, te_pred, metric="log_loss")

report_lines = [
    "\n=== Rendimiento en TEST (mejor por F1-VAL) ===",
    f"F1 (macro): {test_f1_score:.4f}  |  Error F1: {test_f1_error:.4f}",
    f"Accuracy   : {test_acc_score:.4f}  |  Error Acc: {test_acc_error:.4f}",
    f"LogLoss    : {test_ll_error:.4f}   (menor es mejor)",
]
print("\n".join(report_lines))

# -----------------------
# 9) Prediction for a new (made-up) sample (e)
# -----------------------
# Numeric columns take the TRAIN mean (NaNs ignored); other columns take the
# TRAIN mode, or the first non-null value across all splits if there is no mode.
new_sample = {}
for col in X_tr.columns:
    train_col = X_tr[col]
    if pd.api.types.is_numeric_dtype(train_col):
        new_sample[col] = float(np.nanmean(train_col.values))
        continue

    modes = train_col.mode(dropna=True)
    if modes.empty:
        pooled = pd.concat([train_col, X_va[col], X_te[col]]).dropna()
        new_sample[col] = pooled.iloc[0]
    else:
        new_sample[col] = modes.iloc[0]

new_df = pd.DataFrame([new_sample], columns=X_tr.columns)
new_pred = best_model.predict(new_df)[0]

print("\nPredicción para un dato NUEVO (inventado):")
display(new_df.head(1))
print(f"Clase predicha: {new_pred}")

# -----------------------
# 10) Learned rules (interpretability)
# -----------------------
print("\n==== Reglas del modelo RIPPER seleccionado ====")
try:
    # Binary case: the model itself exposes the rule set.
    best_model.ruleset_.out_pretty()
except AttributeError:
    # Multiclass One-vs-Rest: print the rules of the first few per-class
    # estimators only, to keep the output short.
    if hasattr(best_model, "estimators_"):
        per_class = best_model.estimators_
        n_show = min(len(per_class), SHOW_RULES_MAX_CLASSES)
        for number, est in enumerate(per_class[:n_show], start=1):
            print(f"\n[Reglas para una de las clases #{number}]")
            try:
                est.ruleset_.out_pretty()
            except AttributeError:
                print("(No se pudieron imprimir reglas para este estimador)")
        omitted = len(per_class) - n_show
        if omitted > 0:
            print(f"\n(Se omitieron reglas de {omitted} clases para evitar demoras.)")


[01/27] Entrenando k=1, prune=0.2, max_rules=None ...
[02/27] Entrenando k=1, prune=0.2, max_rules=10 ...
[03/27] Entrenando k=1, prune=0.2, max_rules=20 ...
[04/27] Entrenando k=1, prune=0.33, max_rules=None ...
[05/27] Entrenando k=1, prune=0.33, max_rules=10 ...
[06/27] Entrenando k=1, prune=0.33, max_rules=20 ...
[07/27] Entrenando k=1, prune=0.5, max_rules=None ...
[08/27] Entrenando k=1, prune=0.5, max_rules=10 ...
[09/27] Entrenando k=1, prune=0.5, max_rules=20 ...
[10/27] Entrenando k=2, prune=0.2, max_rules=None ...
[11/27] Entrenando k=2, prune=0.2, max_rules=10 ...
[12/27] Entrenando k=2, prune=0.2, max_rules=20 ...
[13/27] Entrenando k=2, prune=0.33, max_rules=None ...
[14/27] Entrenando k=2, prune=0.33, max_rules=10 ...
[15/27] Entrenando k=2, prune=0.33, max_rules=20 ...
[16/27] Entrenando k=2, prune=0.5, max_rules=None ...
[17/27] Entrenando k=2, prune=0.5, max_rules=10 ...
[18/27] Entrenando k=2, prune=0.5, max_rules=20 ...
[19/27] Entrenando k=3, prune=0.2, max_rules=N

Unnamed: 0,hiperparámetros,métrica,score_train,error_train,score_val,error_val
0,k=1 | prune_size=0.5 | max_rules=None,accuracy,0.718333,0.281667,0.655667,0.344333
1,k=2 | prune_size=0.5 | max_rules=None,accuracy,0.718333,0.281667,0.655667,0.344333
2,k=3 | prune_size=0.5 | max_rules=None,accuracy,0.718333,0.281667,0.655667,0.344333
3,k=1 | prune_size=0.2 | max_rules=None,accuracy,0.724167,0.275833,0.649000,0.351000
4,k=2 | prune_size=0.2 | max_rules=None,accuracy,0.724167,0.275833,0.649000,0.351000
...,...,...,...,...,...,...
76,k=1 | prune_size=0.5 | max_rules=10,log_loss,-8.837840,8.837840,-8.894950,8.894950
77,k=1 | prune_size=0.33 | max_rules=10,log_loss,-8.852117,8.852117,-8.916367,8.916367
78,k=2 | prune_size=0.2 | max_rules=10,log_loss,-8.898520,8.898520,-9.023449,9.023449
79,k=3 | prune_size=0.2 | max_rules=10,log_loss,-8.898520,8.898520,-9.023449,9.023449



Mejor combinación por F1 (validation):
k=1 | prune_size=0.2 | max_rules=None
                             0.616484
                             0.383516

Reentrenando el mejor modelo con TODO el TRAIN ...

=== Rendimiento en TEST (mejor por F1-VAL) ===
F1 (macro): 0.6981  |  Error F1: 0.3019
Accuracy   : 0.7145  |  Error Acc: 0.2855
LogLoss    : 6.1144   (menor es mejor)

Predicción para un dato NUEVO (inventado):


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.006931,0.004414,-0.010935,-0.008443,-0.003074,-0.002669,-0.010067,-0.004021,0.000807,-0.000577,-0.000516,0.004027,0.00335


Clase predicha: 2

==== Reglas del modelo RIPPER seleccionado ====

[Reglas para una de las clases #1]
[[10=<-1.34 ^ 7=<-1.32 ^ 5=<-1.36] V
[10=<-1.34 ^ 7=<-1.32 ^ 5=-0.79-0.18] V
[10=<-1.34 ^ 11=>1.31 ^ 9=0.0017-0.28] V
[10=<-1.34 ^ 11=>1.31 ^ 0=<-1.21 ^ 6=-0.43--0.33] V
[10=<-1.34 ^ 11=0.87-1.31] V
[10=<-1.34 ^ 7=<-1.32] V
[11=>1.31 ^ 10=<-1.34 ^ 3=-0.14--0.012] V
[11=>1.31 ^ 10=-1.34--0.94 ^ 8=0.87-1.33] V
[11=>1.31 ^ 7=-1.32--0.88 ^ 3=-0.14--0.012] V
[10=<-1.34 ^ 12=<-1.28 ^ 2=<-1.24] V
[11=>1.31 ^ 10=<-1.34 ^ 7=-0.55--0.28] V
[7=<-1.32 ^ 10=-1.34--0.94 ^ 11=0.011-0.27] V
[11=>1.31 ^ 10=-0.94--0.61 ^ 8=0.87-1.33 ^ 9=0.28-0.57] V
[7=<-1.32 ^ 11=0.87-1.31 ^ 1=>1.32] V
[11=>1.31 ^ 7=-1.32--0.88 ^ 5=<-1.36] V
[11=>1.31 ^ 10=-0.94--0.61 ^ 8=>1.33 ^ 3=<-1.39] V
[10=<-1.34 ^ 7=-1.32--0.88 ^ 12=-0.85--0.53] V
[11=>1.31 ^ 10=-1.34--0.94 ^ 4=-1.19--0.99] V
[10=<-1.34 ^ 7=-1.32--0.88] V
[7=<-1.32 ^ 10=-1.34--0.94 ^ 1=-1.5--1.39] V
[11=>1.31 ^ 7=-0.88--0.55 ^ 10=-0.61--0.31] V
[7=<-1.32 ^ 11=0

In [None]:
# Show the first rows of the results DataFrame
print("Primeras filas del DataFrame de resultados:")
display(df_results.head())

# Print general information about the DataFrame (columns, dtypes, non-null counts)
print("\nInformación del DataFrame:")
df_results.info()

# Descriptive statistics for the numeric columns.
# NOTE(review): these aggregate over all three metrics at once, so accuracy/f1
# values are mixed with log-loss values in the same column.
print("\nEstadísticas descriptivas:")
display(df_results.describe())

# Count how many rows each metric has
print("\nConteo de métricas:")
display(df_results['métrica'].value_counts())

Primeras filas del DataFrame de resultados:


Unnamed: 0,hiperparámetros,métrica,score_train,error_train,score_val,error_val
0,k=1 | prune_size=0.5 | max_rules=None,accuracy,0.718333,0.281667,0.655667,0.344333
1,k=2 | prune_size=0.5 | max_rules=None,accuracy,0.718333,0.281667,0.655667,0.344333
2,k=3 | prune_size=0.5 | max_rules=None,accuracy,0.718333,0.281667,0.655667,0.344333
3,k=1 | prune_size=0.2 | max_rules=None,accuracy,0.724167,0.275833,0.649,0.351
4,k=2 | prune_size=0.2 | max_rules=None,accuracy,0.724167,0.275833,0.649,0.351



Información del DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81 entries, 0 to 80
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   hiperparámetros  81 non-null     object 
 1   métrica          81 non-null     object 
 2   score_train      81 non-null     float64
 3   error_train      81 non-null     float64
 4   score_val        81 non-null     float64
 5   error_val        81 non-null     float64
dtypes: float64(4), object(2)
memory usage: 3.9+ KB

Estadísticas descriptivas:


Unnamed: 0,score_train,error_train,score_val,error_val
count,81.0,81.0,81.0,81.0
mean,-2.082361,2.749028,-2.333908,3.000574
std,3.923969,3.458313,4.181032,3.708449
min,-8.980616,0.271167,-9.066282,0.344333
25%,-6.03229,0.359667,-7.609965,0.390667
50%,0.5845,0.4155,0.570066,0.429934
75%,0.640333,6.03229,0.609333,7.609965
max,0.728833,8.980616,0.655667,9.066282



Conteo de métricas:


Unnamed: 0_level_0,count
métrica,Unnamed: 1_level_1
accuracy,27
f1,27
log_loss,27
