
# TP2 — Redes Neuronales con Keras 

> **Alumnas:** Brigitte Blau y Micaela Bodner
>
> **Materia:** Seminario avanzado IA
:)





In [None]:

# ==== Setup ====
import os
import sys
import math
import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Sklearn
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# TensorFlow / Keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers




2.20.0


In [2]:
# Global seed for every source of randomness used by the notebook.
SEED = 42

# keras.utils.set_random_seed seeds Python's `random`, NumPy and the TensorFlow
# backend in one call. Seeding only NumPy and TensorFlow is not enough with
# Keras 3: default seeds for weight initializers and Dropout are drawn from
# Python's `random`, so results would differ between "Restart & Run All" runs.
keras.utils.set_random_seed(SEED)


# Load the raw dataset (path is relative to the notebook's working directory)
# and preview the first rows.
df = pd.read_csv('datos/adult.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [None]:


# Make sure the target column is called 'income': if it is missing, accept
# exactly one case-insensitive alias and rename it.
if 'income' not in df.columns:
    cand = [c for c in df.columns if c.lower() in ('income','class','target','label')]
    assert len(cand) == 1, f"No se encontró target unívoco, columnas: {df.columns}"
    df = df.rename(columns={cand[0]: 'income'})

# Drop rows containing NaN and renumber the index.
# NOTE(review): the data preview shows missing values written as '?', which
# pandas does not parse as NaN by default, so this call probably removes
# nothing and '?' ends up as its own one-hot category. If those rows should be
# discarded, load the file with na_values='?' — TODO confirm intended handling.
df = df.dropna().reset_index(drop=True)

# Binary target: strip surrounding whitespace from the label before comparing.
df['income'] = df['income'].astype(str).str.strip()
y = (df['income'] == '>50K').astype(int).values  # 1 if >50K, 0 if <=50K
X = df.drop(columns=['income'])

# Split the features into numeric vs. everything else. Selecting by the generic
# 'number' kind (instead of the literal 'int64'/'float64'/'object' names) makes
# the two lists cover every column, so none is left out of the ColumnTransformer,
# which drops unlisted columns by default.
numeric_cols = X.select_dtypes(include='number').columns.tolist()
categorical_cols = X.select_dtypes(exclude='number').columns.tolist()

print("Numéricas:", numeric_cols)
print("Categóricas:", categorical_cols)


Numéricas: ['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week']
Categóricas: ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']



# train / validación / test

- Primero separamos **train (60%)** y **temp (40%)**  
- Luego dividimos **temp** en **validación (20%)** y **test (20%)** (del total)


In [4]:

# Stage 1: keep 60% for training and hold out 40%, preserving the class ratio.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.40,
    random_state=SEED,
)

# Stage 2: cut the held-out 40% in half -> 20% validation and 20% test of the
# full dataset, again stratified on the labels.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    stratify=y_temp,
    test_size=0.50,
    random_state=SEED,
)

print("Train:", X_train.shape, "Val:", X_val.shape, "Test:", X_test.shape)


Train: (19536, 14) Val: (6512, 14) Test: (6513, 14)


In [5]:

# Column-wise preprocessing: standardize the numeric features and one-hot
# encode the categorical ones into a dense array. Categories not seen during
# fit are ignored (encoded as all zeros) rather than raising an error.
preprocess = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_cols),
    ]
)

# Fit on the training split only, then reuse the fitted transformer on the
# validation and test splits so no statistics leak from them.
X_train_pp = preprocess.fit_transform(X_train)
X_val_pp, X_test_pp = (preprocess.transform(split) for split in (X_val, X_test))

# Number of features after encoding = input width of the network.
input_dim = X_train_pp.shape[1]
input_dim


107



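Creamos una función `build_model(input_dim, hidden_layers, dropout, lr, l2_reg)` y tres diccionarios de hiperparámetros (cada uno incluye además `epochs` y `batch_size`, que se usan al entrenar):
- **Set A (chico):** 1 capa oculta (64), `dropout=0.0`, `lr=1e-3`, sin regularización L2, 20 épocas, `batch_size=256`
- **Set B (medio):** 2 capas (128, 64), `dropout=0.2`, `lr=5e-4`, `l2_reg=1e-5`, 25 épocas, `batch_size=256`
- **Set C (grande):** 3 capas (256, 128, 64), `dropout=0.3`, `lr=3e-4`, `l2_reg=1e-5`, 30 épocas, `batch_size=512`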


In [6]:

def build_model(input_dim, hidden_layers=(64,), dropout=0.0, lr=1e-3, l2_reg=0.0):
    """Build and compile a fully connected binary classifier.

    Args:
        input_dim: Number of input features.
        hidden_layers: Iterable with the number of units of each hidden
            (ReLU) layer, in order.
        dropout: Dropout rate added after every hidden layer; no Dropout
            layer is added when it is not greater than 0.
        lr: Learning rate for the Adam optimizer.
        l2_reg: L2 factor applied to the hidden layers' kernels; no
            regularizer is attached when it is not greater than 0.

    Returns:
        A compiled ``keras.Sequential`` model with a single sigmoid output,
        binary cross-entropy loss and accuracy as metric.
    """
    use_dropout = dropout > 0

    net = keras.Sequential()
    net.add(layers.Input(shape=(input_dim,)))

    for width in hidden_layers:
        # A fresh regularizer per layer (or none at all when disabled).
        weight_penalty = keras.regularizers.l2(l2_reg) if l2_reg > 0 else None
        net.add(layers.Dense(width, activation='relu', kernel_regularizer=weight_penalty))
        if use_dropout:
            net.add(layers.Dropout(dropout))

    # Single sigmoid unit: output is the predicted probability of class 1.
    net.add(layers.Dense(1, activation='sigmoid'))

    optimizer = keras.optimizers.Adam(learning_rate=lr)
    net.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return net

# Candidate configurations, from smallest to largest network. Besides the
# build_model arguments, each entry carries the display name and the training
# settings (epochs, batch_size).
hyperparams_list = [
    dict(
        name="Set A (chico)",
        hidden_layers=(64,),
        dropout=0.0,
        lr=1e-3,
        l2_reg=0.0,
        epochs=20,
        batch_size=256,
    ),
    dict(
        name="Set B (medio)",
        hidden_layers=(128, 64),
        dropout=0.2,
        lr=5e-4,
        l2_reg=1e-5,
        epochs=25,
        batch_size=256,
    ),
    dict(
        name="Set C (grande)",
        hidden_layers=(256, 128, 64),
        dropout=0.3,
        lr=3e-4,
        l2_reg=1e-5,
        epochs=30,
        batch_size=512,
    ),
]
hyperparams_list


[{'name': 'Set A (chico)',
  'hidden_layers': (64,),
  'dropout': 0.0,
  'lr': 0.001,
  'l2_reg': 0.0,
  'epochs': 20,
  'batch_size': 256},
 {'name': 'Set B (medio)',
  'hidden_layers': (128, 64),
  'dropout': 0.2,
  'lr': 0.0005,
  'l2_reg': 1e-05,
  'epochs': 25,
  'batch_size': 256},
 {'name': 'Set C (grande)',
  'hidden_layers': (256, 128, 64),
  'dropout': 0.3,
  'lr': 0.0003,
  'l2_reg': 1e-05,
  'epochs': 30,
  'batch_size': 512}]



Entrenamos cada set con `EarlyStopping` monitorizando `val_loss` (paciencia de 3 épocas y restaurando los mejores pesos).  
Guardamos el accuracy en validación de cada set y seleccionamos el de mayor accuracy.


In [None]:

# Per-set training curves and (name, validation accuracy, model) results.
histories = {}
val_scores = []

for hp in hyperparams_list:
    set_name = hp["name"]
    print("\n==== Entrenando:", set_name, "====")

    # Only the architecture/optimizer keys are forwarded to build_model.
    model = build_model(
        input_dim=input_dim,
        **{key: hp[key] for key in ("hidden_layers", "dropout", "lr", "l2_reg")},
    )

    # Stop once val_loss has not improved for 3 epochs and roll back to the
    # weights of the best epoch.
    early_stop = keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=3, restore_best_weights=True
    )
    history = model.fit(
        X_train_pp,
        y_train,
        validation_data=(X_val_pp, y_val),
        epochs=hp["epochs"],
        batch_size=hp["batch_size"],
        verbose=1,
        callbacks=[early_stop],
    )
    histories[set_name] = history.history

    # Validation accuracy with a 0.5 threshold on the predicted probability.
    val_proba = model.predict(X_val_pp)
    val_pred = (val_proba >= 0.5).astype(int).ravel()
    val_acc = accuracy_score(y_val, val_pred)
    val_scores.append((set_name, val_acc, model))
    print(f"Accuracy de validación ({set_name}): {val_acc:.4f}")

# Rank by validation accuracy (highest first) and keep the winner.
val_scores.sort(key=lambda entry: entry[1], reverse=True)
best_name, best_val_acc, best_model = val_scores[0]
print("\nMejor set por validación:", best_name, "— acc:", round(best_val_acc, 4))



==== Entrenando: Set A (chico) ====
Epoch 1/20



### Curvas de entrenamiento (loss y accuracy)


In [None]:

# One figure per metric, using matplotlib's default colors. Each figure
# overlays the train and validation curves of every hyperparameter set.
for train_key, val_key, y_label, fig_title in (
    ('loss', 'val_loss', "Loss", "Pérdida (train vs val)"),
    ('accuracy', 'val_accuracy', "Accuracy", "Accuracy (train vs val)"),
):
    plt.figure()
    for name, hist in histories.items():
        plt.plot(hist[train_key], label=f"{name} - train")
        plt.plot(hist[val_key], label=f"{name} - val")
    plt.xlabel("Epoch")
    plt.ylabel(y_label)
    plt.title(fig_title)
    plt.legend()
    plt.show()


In [None]:

# Evaluate the model selected on validation against the test split, using the
# same 0.5 probability threshold as during model selection.
test_proba = best_model.predict(X_test_pp)
test_pred = (test_proba >= 0.5).astype(int).ravel()
test_acc = accuracy_score(y_test, test_pred)
print(f"Accuracy en test con '{best_name}': {test_acc:.4f}")

# Confusion matrix and per-class report (optional extras).
cm = confusion_matrix(y_test, test_pred)
print("\nMatriz de confusión:\n", cm)
report = classification_report(y_test, test_pred, digits=4)
print("\nReporte de clasificación:\n", report)



- Regresión logística: _accuracy_ = 0.8125703756781656
- Árbol de decisión: _accuracy_ = 0.8117514586958747

**Resumen comparativo:** _Escribir análisis: ¿mejoró Keras? ¿sobreajuste? ¿tiempos? ¿sensibilidad a hiperparámetros?_
