In [1]:
import os
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# ----------------------------
# 1. Configuración y semillas
# ----------------------------
DATA_PATH    = 'train.csv'
TEST_PATH    = 'test.csv'
MODEL_DIR    = 'models'
os.makedirs(MODEL_DIR, exist_ok=True)

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_STATE)

DEVICE       = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE   = 256
MAX_EPOCHS   = 100
PATIENCE     = 10
LEARNING_RATE= 1e-3
TEST_SIZE    = 0.1
SUBMIT_FILE  = os.path.join(MODEL_DIR, 'submission_pytorch.csv')

# ----------------------------
# 2. Carga y preprocessado train
# ----------------------------
df = pd.read_csv(DATA_PATH)
y  = df['prezo_euros'].values
X  = df.drop(columns=['prezo_euros']).copy()

# 2.1 Columnas numéricas y categóricas
num_cols = X.select_dtypes(include=['float64','int64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

# 2.2 Imputación
for col in num_cols:
    med = X[col].median()
    X[col].fillna(med, inplace=True)
for col in cat_cols:
    X[col] = X[col].fillna('Missing').astype(str)

# 2.3 One-hot encoding
X_enc      = pd.get_dummies(X, columns=cat_cols, drop_first=True)
FEATURES   = X_enc.columns.tolist()

# 2.4 Escalado numérico
scaler     = StandardScaler()
X_enc[num_cols] = scaler.fit_transform(X_enc[num_cols])

# 2.5 Train/val split
X_train, X_val, y_train, y_val = train_test_split(
    X_enc.values, y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE
)

In [3]:

# ----------------------------
# 3. Dataset & DataLoader
# ----------------------------
class HousePriceDataset(Dataset):
    def __init__(self, features, targets=None):
        self.X = torch.from_numpy(features).float()
        self.y = torch.from_numpy(targets).float() if targets is not None else None

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if self.y is None:
            return self.X[idx]
        return self.X[idx], self.y[idx]

train_ds    = HousePriceDataset(X_train, y_train)
val_ds      = HousePriceDataset(X_val,   y_val)
train_loader= DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,  num_workers=2, pin_memory=True)
val_loader  = DataLoader(val_ds,   batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)


In [4]:
# ----------------------------
# 4. Definición del modelo (arquitectura aumentada)
# ----------------------------
class RegressionNNBig(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.ReLU(),
            nn.Dropout(0.4),

            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.4),

            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),

            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.2),

            nn.Linear(32, 1)
        )

    def forward(self, x):
        return self.model(x).squeeze(1)

# Instanciación
model = RegressionNNBig(X_train.shape[1]).to(DEVICE)


In [6]:
# ----------------------------
# 5. Optimizer y loss
# ----------------------------
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.MSELoss()


In [7]:

# ----------------------------
# 6. Entrenamiento con EarlyStopping
# ----------------------------
best_val_rmse     = np.inf
epochs_no_improve = 0

for epoch in range(1, MAX_EPOCHS + 1):
    # Training
    model.train()
    for xb, yb in train_loader:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        preds  = model(xb)
        loss   = criterion(preds, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    val_losses = []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            preds  = model(xb)
            val_losses.append(((preds - yb)**2).mean().item())
    val_rmse = np.sqrt(np.mean(val_losses))
    print(f"Epoch {epoch:03d}  Val RMSE: {val_rmse:.2f}")

    # EarlyStopping
    if val_rmse + 1e-4 < best_val_rmse:
        best_val_rmse     = val_rmse
        epochs_no_improve = 0
        torch.save(model.state_dict(), os.path.join(MODEL_DIR, 'best_model.pt'))
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= PATIENCE:
            print(f"Deteniendo tras {epoch} epochs sin mejora.")
            break

# Carga mejor modelo
model.load_state_dict(torch.load(os.path.join(MODEL_DIR, 'best_model.pt')))
print(f"Mejor RMSE en validación: {best_val_rmse:.2f} euros")


Epoch 001  Val RMSE: 106086.15
Epoch 002  Val RMSE: 69603.23
Epoch 003  Val RMSE: 65893.44
Epoch 004  Val RMSE: 62002.23
Epoch 005  Val RMSE: 60281.22
Epoch 006  Val RMSE: 60006.60
Epoch 007  Val RMSE: 58323.16
Epoch 008  Val RMSE: 57724.31
Epoch 009  Val RMSE: 57214.03
Epoch 010  Val RMSE: 56697.99
Epoch 011  Val RMSE: 55698.69
Epoch 012  Val RMSE: 55844.26
Epoch 013  Val RMSE: 56447.93
Epoch 014  Val RMSE: 53819.20
Epoch 015  Val RMSE: 53284.78
Epoch 016  Val RMSE: 53488.03
Epoch 017  Val RMSE: 53206.54
Epoch 018  Val RMSE: 52285.48
Epoch 019  Val RMSE: 51799.98
Epoch 020  Val RMSE: 52050.05
Epoch 021  Val RMSE: 52598.46
Epoch 022  Val RMSE: 51302.59
Epoch 023  Val RMSE: 51032.68
Epoch 024  Val RMSE: 51035.25
Epoch 025  Val RMSE: 54397.64
Epoch 026  Val RMSE: 49817.60
Epoch 027  Val RMSE: 49836.71
Epoch 028  Val RMSE: 49642.41
Epoch 029  Val RMSE: 49628.43
Epoch 030  Val RMSE: 49161.15
Epoch 031  Val RMSE: 49024.49
Epoch 032  Val RMSE: 48893.61
Epoch 0

In [8]:

# ----------------------------
# 7. Generar submission final
# ----------------------------
if os.path.exists(TEST_PATH):
    # Leer test.csv evitando columna Unnamed
    df_test = pd.read_csv(TEST_PATH, index_col=0)
    X_test  = df_test.reindex(columns=FEATURES).copy()

    # Imputación y encoding idénticos al train
    for col in num_cols:
        X_test[col].fillna(df[col].median(), inplace=True)
    for col in cat_cols:
        # no es necesario porque las dummies ya están alineadas
        pass

    # Escalado
    X_test[num_cols] = scaler.transform(X_test[num_cols])

    # DataLoader test
    test_ds    = HousePriceDataset(X_test.values)
    test_loader= DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False,
                            num_workers=2, pin_memory=True)

    # Predicción
    model.eval()
    preds = []
    with torch.no_grad():
        for xb in test_loader:
            xb = xb.to(DEVICE)
            p  = model(xb).cpu().numpy()
            preds.append(p)
    preds = np.concatenate(preds, axis=0)

    submission = pd.DataFrame({
        'id':          df_test['id'],
        'prezo_euros': preds
    })
    submission.to_csv(SUBMIT_FILE, index=False)
    print(f"Submission guardada en {SUBMIT_FILE}")

Submission guardada en models/submission_pytorch.csv
