In [22]:
import os
import random
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder, FunctionTransformer, RobustScaler, 
    PolynomialFeatures
)
from sklearn.impute import SimpleImputer
from scipy.stats.mstats import winsorize

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# ----------------------------
# 1. Configuración y semillas
# ----------------------------
DATA_PATH     = 'train.csv'
TEST_PATH     = 'test.csv'
MODEL_DIR     = 'models'
os.makedirs(MODEL_DIR, exist_ok=True)

RANDOM_STATE  = 42
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_STATE)

DEVICE        = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE    = 256
MAX_EPOCHS    = 100
PATIENCE      = 10
LEARNING_RATE = 1e-3
TEST_SIZE     = 0.2
SUBMIT_FILE   = os.path.join(MODEL_DIR, 'submission_pytorch.csv')

# ----------------------------
# 2. Definición de feature engineering pipeline
# ----------------------------
class FeatureCreator(BaseEstimator, TransformerMixin):
    def __init__(self, current_year=2025):
        self.current_year = current_year
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = X.copy()
        # Nuevas variables
        X['superficie_total_m2'] = X['superficie_interior_m2'].fillna(0) + X['superficie_exterior_m2'].fillna(0)
        X['ratio_ext_int']       = X['superficie_exterior_m2'].fillna(0) / (X['superficie_interior_m2'].fillna(0) + 1)
        X['antiguedad']          = self.current_year - X['ano_construccion']
        X['hab_por_bano']        = X['numero_habitacions'] / (X['numero_banos'] + 1)
        X['log_superficie_int']  = np.log1p(X['superficie_interior_m2'].fillna(0))
        X['log_superficie_ext']  = np.log1p(X['superficie_exterior_m2'].fillna(0))
        orient_map = {'Norte':0, 'Este':90, 'Sur':180, 'Oeste':270}
        ang = X['orientacion'].map(orient_map).fillna(0) * np.pi/180
        X['orient_sin']          = np.sin(ang)
        X['orient_cos']          = np.cos(ang)
        return X

NUM_COLS = [
    'superficie_interior_m2', 'superficie_exterior_m2', 'numero_habitacions',
    'numero_banos', 'ano_construccion', 'temperatura_media_mes_construccion',
    'distancia_centro_km', 'distancia_escola_km', 'indice_criminalidade',
    'numero_arboles_xardin'
]
CAT_COLS = [
    'tipo_edificacion', 'calidade_materiais', 'cor_favorita_propietario',
    'acceso_transporte_publico', 'orientacion', 'eficiencia_enerxetica'
]
NEW_NUMERIC = [
    'superficie_total_m2', 'ratio_ext_int', 'antiguedad',
    'hab_por_bano', 'log_superficie_int', 'log_superficie_ext',
    'orient_sin', 'orient_cos'
]

numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('winsor', FunctionTransformer(
        lambda arr: np.apply_along_axis(
            lambda col: winsorize(col, limits=[0.01, 0.01]), 0, arr)
    )),
    ('scaler', RobustScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False))
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, NUM_COLS + NEW_NUMERIC),
    ('cat', categorical_pipeline, CAT_COLS)
])

full_pipeline = Pipeline([
    ('feature_creator', FeatureCreator(current_year=2025)),
    ('preprocessor', preprocessor)
])

# ----------------------------
# 3. Carga datos y transformación
# ----------------------------
df = pd.read_csv(DATA_PATH)
y  = df['prezo_euros'].values
X  = df.drop(columns=['prezo_euros'])

# Aplicar pipeline correctamente
X_transformed = full_pipeline.fit_transform(X, y)

# Split train/val
X_train, X_val, y_train, y_val = train_test_split(
    X_transformed, y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE
)

# ----------------------------
# 4. Dataset y DataLoader
# ----------------------------
class HousePriceDataset(Dataset):
    def __init__(self, features, targets=None):
        self.X = torch.from_numpy(features).float()
        self.y = torch.from_numpy(targets).float() if targets is not None else None
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return (self.X[idx], self.y[idx]) if self.y is not None else self.X[idx]

train_ds    = HousePriceDataset(X_train, y_train)
val_ds      = HousePriceDataset(X_val,   y_val)
train_loader= DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,  num_workers=2, pin_memory=True)
val_loader  = DataLoader(val_ds,   batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

# ----------------------------
# 5. Definición del modelo
# ----------------------------
class RegressionNNBig(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.ReLU(),
            nn.Dropout(0.4),

            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.4),

            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),

            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.2),

            nn.Linear(32, 1)
        )

    def forward(self, x):
        return self.model(x).squeeze(1)

model     = RegressionNN(X_train.shape[1]).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.MSELoss()

In [23]:

# ----------------------------
# 6. Entrenamiento con early stopping
# ----------------------------
best_val_rmse     = np.inf
epochs_no_improve = 0

for epoch in range(1, MAX_EPOCHS+1):
    model.train()
    for xb, yb in train_loader:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        preds  = model(xb)
        loss   = criterion(preds, yb)
        optimizer.zero_grad(); loss.backward(); optimizer.step()
    model.eval()
    val_losses = []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            val_losses.append(((model(xb)-yb)**2).mean().item())
    val_rmse = np.sqrt(np.mean(val_losses))
    print(f"Epoch {epoch:03d}  Val RMSE: {val_rmse:.2f}")
    if val_rmse + 1e-4 < best_val_rmse:
        best_val_rmse, epochs_no_improve = val_rmse, 0
        torch.save(model.state_dict(), os.path.join(MODEL_DIR,'best_model.pt'))
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= PATIENCE:
            print(f"Deteniendo tras {epoch} epochs sin mejora.")
            break

model.load_state_dict(torch.load(os.path.join(MODEL_DIR,'best_model.pt')))
print(f"Mejor RMSE en validación: {best_val_rmse:.2f} euros")


Epoch 001  Val RMSE: 271709.14
Epoch 002  Val RMSE: 161588.71
Epoch 003  Val RMSE: 116762.70
Epoch 004  Val RMSE: 96216.29
Epoch 005  Val RMSE: 84401.66
Epoch 006  Val RMSE: 80016.68
Epoch 007  Val RMSE: 76185.49
Epoch 008  Val RMSE: 73287.27
Epoch 009  Val RMSE: 71245.36
Epoch 010  Val RMSE: 69056.22
Epoch 011  Val RMSE: 68365.83
Epoch 012  Val RMSE: 66095.86
Epoch 013  Val RMSE: 65893.99
Epoch 014  Val RMSE: 62798.64
Epoch 015  Val RMSE: 62967.87
Epoch 016  Val RMSE: 61812.50
Epoch 017  Val RMSE: 60022.25
Epoch 018  Val RMSE: 59329.98
Epoch 019  Val RMSE: 59742.94
Epoch 020  Val RMSE: 57531.81
Epoch 021  Val RMSE: 57219.95
Epoch 022  Val RMSE: 58115.51
Epoch 023  Val RMSE: 56873.47
Epoch 024  Val RMSE: 56489.10
Epoch 025  Val RMSE: 56016.74
Epoch 026  Val RMSE: 56717.35
Epoch 027  Val RMSE: 55485.42
Epoch 028  Val RMSE: 55947.97
Epoch 029  Val RMSE: 54469.59
Epoch 030  Val RMSE: 54564.42
Epoch 031  Val RMSE: 54584.24
Epoch 032  Val RMSE: 55371.73
Epoch 033  Val RMSE: 54373.89
Epoch 0

In [24]:
# ----------------------------
# 7. Inferencia y submission
# ----------------------------
if os.path.exists(TEST_PATH):
    df_test = pd.read_csv(TEST_PATH)
    X_test_trans = full_pipeline.transform(df_test)
    test_ds     = HousePriceDataset(X_test_trans)
    test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)
    preds=[]