In [8]:
import os
import random
import numpy as np
import pandas as pd
import joblib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

In [9]:

# ----------------------------
# 0. Extendida: Feature Engineer + Winsorizer + Preprocessor
# ----------------------------
class ExtendedFeatureEngineer(BaseEstimator, TransformerMixin):
    _orientation_map = {
        "Norte": 0, "Nordeste": 45, "Este": 90, "Sudeste": 135,
        "Sur": 180, "Sudoeste": 225, "Oeste": 270, "Noroeste": 315,
    }
    def __init__(self, current_year=2025, geo_clusters=10):
        self.current_year = current_year
        self.geo_clusters = geo_clusters
        self.km_model_ = None
        self.agg_stats_ = {}
    def fit(self, X, y=None):
        if {"latitud","longitud"}.issubset(X.columns):
            coords = X[["latitud","longitud"]].fillna(0)
            self.km_model_ = KMeans(n_clusters=self.geo_clusters, random_state=42)
            self.km_model_.fit(coords)
        if y is not None and 'tipo_edificacion' in X.columns:
            grp = pd.DataFrame({'precio':y,'type':X['tipo_edificacion']})
            agg = grp.groupby('type').precio.agg(['mean','std'])
            self.agg_stats_ = agg.to_dict()
        return self
    def transform(self, X):
        df = X.copy()
        # Edad
        df['antiguedad']  = self.current_year - df['ano_construccion']
        df['antiguedad2'] = df['antiguedad']**2
        df['decada']      = (df['ano_construccion']//10)*10
        # Superficies
        df['superficie_total']      = df['superficie_interior_m2'].fillna(0) + df['superficie_exterior_m2'].fillna(0)
        df['log_superficie_total']  = np.log1p(df['superficie_total'])
        df['habitacion_area']       = df['superficie_interior_m2']/df['numero_habitacions'].replace(0,np.nan)
        df['banos_area']            = df['superficie_interior_m2']/df['numero_banos'].replace(0,np.nan)
        # Distancias
        for col in ['distancia_centro_km','distancia_escola_km']:
            if col in df:
                df[f'log_{col}'] = np.log1p(df[col])
                df[f'inv_{col}'] = 1/(df[col]+0.1)
        # Temperatura
        if 'temperatura_media_mes_construccion' in df:
            t = df['temperatura_media_mes_construccion']
            df['temp_norm'] = (t - t.mean())/t.std()
            df['temp_sq']   = t**2
        # Índice criminalidad
        if 'indice_criminalidade' in df:
            df['crime_q'] = pd.qcut(df['indice_criminalidade'],5,labels=False,duplicates='drop')
        # Orientación
        deg = df.get('orientacion',pd.Series()).map(self._orientation_map).fillna(0)
        rad = np.deg2rad(deg)
        df['orient_sin'] = np.sin(rad)
        df['orient_cos'] = np.cos(rad)
        # Clusters geográficos
        if self.km_model_:
            coords = df[['latitud','longitud']].fillna(0)
            df['geo_cluster'] = self.km_model_.predict(coords)
        else:
            df['geo_cluster'] = 0
        # Stats por tipo de edificación
        if 'tipo_edificacion' in df and self.agg_stats_:
            df['type_price_mean'] = df['tipo_edificacion'].map(self.agg_stats_['mean'])
            df['type_price_std']  = df['tipo_edificacion'].map(self.agg_stats_['std'])
        else:
            df['type_price_mean'] = 0
            df['type_price_std']  = 0
        # One-hot color favorito
        if 'cor_favorita_propietario' in df:
            cols = pd.get_dummies(df['cor_favorita_propietario'],prefix='color')
            df = pd.concat([df,cols],axis=1)
        # Eliminamos originales
        drops = [
            'ano_construccion','superficie_interior_m2','superficie_exterior_m2',
            'numero_habitacions','numero_banos','temperatura_media_mes_construccion',
            'distancia_centro_km','distancia_escola_km','indice_criminalidade',
            'orientacion','tipo_edificacion','cor_favorita_propietario'
        ]
        df.drop(columns=[c for c in drops if c in df], inplace=True)
        return df

class WinsorizerSelective(BaseEstimator, TransformerMixin):
    def __init__(self, lower=0.005, upper=0.995, active=True):
        self.lower=lower; self.upper=upper; self.active=active; self.bounds_={}
    def fit(self, X, y=None):
        if not self.active: return self
        df = pd.DataFrame(X)
        for c in df:
            lo,hi = df[c].quantile(self.lower), df[c].quantile(self.upper)
            self.bounds_[c] = (lo,hi)
        return self
    def transform(self, X):
        if not self.active: return X
        df = pd.DataFrame(X).copy()
        for c,(lo,hi) in self.bounds_.items():
            df[c] = df[c].clip(lo,hi)
        return df.values

def build_preprocessor(df_sample, is_train=True):
    target = 'prezo_euros'
    # numéricas
    num_cols = df_sample.select_dtypes(include='number').columns.tolist()
    if target in num_cols: num_cols.remove(target)
    # placeholders generadas
    engineered = [
        'antiguedad','antiguedad2','decada','superficie_total','log_superficie_total',
        'habitacion_area','banos_area','log_distancia_centro_km','inv_distancia_centro_km',
        'log_distancia_escola_km','inv_distancia_escola_km','temp_norm','temp_sq','crime_q',
        'orient_sin','orient_cos','geo_cluster','type_price_mean','type_price_std'
    ]
    num_cols += [f for f in engineered if f in df_sample.columns]
    # categóricas
    cat_cols = df_sample.select_dtypes(include='object').columns.tolist()
    ordinal_maps = {
        'calidade_materiais': ['Baixa','Media','Alta'],
        'acceso_transporte_publico': ['Malo','Regular','Bo','Moi bo'],
        'eficiencia_enerxetica': ['G','F','E','D','C','B','A'],
    }
    ord_feats = [c for c in ordinal_maps if c in cat_cols]
    ord_cats  = [ordinal_maps[c] for c in ord_feats]
    ohe_feats = [c for c in cat_cols if c not in ord_feats]

    num_pipe = Pipeline([
        ('impute', SimpleImputer(strategy='median')),
        ('winsor', WinsorizerSelective(active=is_train)),
        ('scale', RobustScaler())
    ])
    ord_pipe = Pipeline([
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('encode', OrdinalEncoder(categories=ord_cats))
    ])
    ohe_pipe= Pipeline([
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    preproc = ColumnTransformer([
        ('num', num_pipe, num_cols),
        ('ord', ord_pipe, ord_feats),
        ('ohe', ohe_pipe, ohe_feats)
    ], remainder='drop', n_jobs=-1)

    full = Pipeline([
        ('feat', ExtendedFeatureEngineer()),
        ('prep', preproc)
    ])
    return full, target

In [10]:

# ----------------------------
# 1. Configuración y semillas
# ----------------------------
DATA_PATH    = 'train.csv'
TEST_PATH    = 'test.csv'
MODEL_DIR    = 'models'
os.makedirs(MODEL_DIR, exist_ok=True)

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(RANDOM_STATE)

DEVICE        = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE    = 256
MAX_EPOCHS    = 100
PATIENCE      = 10
LEARNING_RATE = 1e-3
TEST_SIZE     = 0.1
SUBMIT_FILE   = os.path.join(MODEL_DIR,'submission_pytorch.csv')


In [11]:
# ----------------------------
# 2. Carga y preprocesado
# ----------------------------
df        = pd.read_csv(DATA_PATH)
df_test   = pd.read_csv(TEST_PATH)

pipe, tgt = build_preprocessor(df, is_train=True)
X_all     = pipe.fit_transform(df, df[tgt])
y_all     = df[tgt].values

# split train/val
X_train, X_val, y_train, y_val = train_test_split(
    X_all, y_all, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# dataset & loader
class HousePriceDataset(Dataset):
    def __init__(self,X,y=None):
        self.X = torch.from_numpy(X).float()
        self.y = torch.from_numpy(y).float() if y is not None else None
    def __len__(self): return len(self.X)
    def __getitem__(self,i):
        return (self.X[i], self.y[i]) if self.y is not None else self.X[i]

train_dl = DataLoader(HousePriceDataset(X_train,y_train),
                      batch_size=BATCH_SIZE,shuffle=True,pin_memory=True,num_workers=2)
val_dl   = DataLoader(HousePriceDataset(X_val,y_val),
                      batch_size=BATCH_SIZE,shuffle=False,pin_memory=True,num_workers=2)




ValueError: A given column is not a column of the dataframe

In [12]:

# ----------------------------
# 3. Modelo
# ----------------------------
class RegressionNN(nn.Module):
    def __init__(self,input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim,256), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(256,128), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(128,64),  nn.ReLU(),
            nn.Linear(64,1)
        )
    def forward(self,x): return self.net(x).squeeze(1)

model = RegressionNN(X_train.shape[1]).to(DEVICE)

optim   = torch.optim.Adam(model.parameters(),lr=LEARNING_RATE)
criterion = nn.MSELoss()


NameError: name 'X_train' is not defined

In [None]:
# ----------------------------
# 4. Training + EarlyStopping
# ----------------------------
best_rmse = np.inf
wait      = 0

for epoch in range(1,MAX_EPOCHS+1):
    model.train()
    for xb,yb in train_dl:
        xb,yb = xb.to(DEVICE), yb.to(DEVICE)
        loss = criterion(model(xb), yb)
        optim.zero_grad(); loss.backward(); optim.step()

    model.eval()
    losses = []
    with torch.no_grad():
        for xb,yb in val_dl:
            xb,yb = xb.to(DEVICE), yb.to(DEVICE)
            preds = model(xb)
            losses.append(((preds-yb)**2).mean().item())
    rmse = np.sqrt(np.mean(losses))
    print(f"Epoch {epoch:03d}  Val RMSE: {rmse:.2f}")
    if rmse + 1e-4 < best_rmse:
        best_rmse = rmse; wait = 0
        torch.save(model.state_dict(), os.path.join(MODEL_DIR,'best.pt'))
    else:
        wait+=1
        if wait>=PATIENCE:
            print("EarlyStopping!") 
            break

model.load_state_dict(torch.load(os.path.join(MODEL_DIR,'best.pt')))


In [None]:

# ----------------------------
# 5. Inference y submission
# ----------------------------
# Desactiva winsor en test
pipe.set_params(prep__num__winsor__active=False)

X_test = pipe.transform(df_test)
test_dl = DataLoader(HousePriceDataset(X_test),batch_size=BATCH_SIZE,
                     shuffle=False,pin_memory=True,num_workers=2)

preds=[]
model.eval()
with torch.no_grad():
    for xb in test_dl:
        xb = xb.to(DEVICE)
        preds.append(model(xb).cpu().numpy())
preds = np.concatenate(preds,axis=0)

submission = pd.DataFrame({'id': df_test['id'], 'prezo_euros': preds})
submission.to_csv(SUBMIT_FILE,index=False)
print(f"Submission guardada en {SUBMIT_FILE}")

In [15]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import random
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader


# -----------------------------------------------------------------------------
# 1. Extended Feature Engineer (usa el nombre exacto de columnas de tu CSV)
# -----------------------------------------------------------------------------
class ExtendedFeatureEngineer(BaseEstimator, TransformerMixin):
    _orientation_map = {
      "Norte": 0, "Nordeste": 45, "Este": 90, "Sudeste": 135,
      "Sur": 180, "Sudoeste": 225, "Oeste": 270, "Noroeste": 315,
    }
    def __init__(self, current_year=2025, geo_clusters=10):
        self.current_year = current_year
        self.geo_clusters = geo_clusters
        self.km_model_ = None
        self.agg_stats_ = {}

    def fit(self, X: pd.DataFrame, y=None):
        # Ajusta KMeans con 'latitude' y 'lonxitude'
        if {"latitude","lonxitude"}.issubset(X.columns):
            coords = X[["latitude","lonxitude"]].fillna(0)
            self.km_model_ = KMeans(n_clusters=self.geo_clusters, random_state=42)
            self.km_model_.fit(coords)
        # Estadísticas por tipo_edificacion
        if y is not None and "tipo_edificacion" in X.columns:
            grp = pd.DataFrame({"precio": y, "type": X["tipo_edificacion"]})
            agg = grp.groupby("type").precio.agg(["mean","std"])
            self.agg_stats_ = agg.to_dict()
        return self

    def transform(self, X: pd.DataFrame):
        df = X.copy()
        # Edad
        df["antiguedad"]   = self.current_year - df["ano_construccion"]
        df["antiguedad2"]  = df["antiguedad"]**2
        df["decada"]       = (df["ano_construccion"]//10)*10
        # Superficies
        df["superficie_total"]     = df["superficie_interior_m2"].fillna(0) + df["superficie_exterior_m2"].fillna(0)
        df["log_superficie_total"] = np.log1p(df["superficie_total"])
        df["habitacion_area"]      = df["superficie_interior_m2"]/df["numero_habitacions"].replace(0,np.nan)
        df["banos_area"]           = df["superficie_interior_m2"]/df["numero_banos"].replace(0,np.nan)
        # Distancias
        for c in ["distancia_centro_km","distancia_escola_km"]:
            if c in df:
                df[f"log_{c}"] = np.log1p(df[c])
                df[f"inv_{c}"] = 1/(df[c]+0.1)
        # Temperatura
        if "temperatura_media_mes_construccion" in df:
            t = df["temperatura_media_mes_construccion"]
            df["temp_norm"] = (t - t.mean())/t.std()
            df["temp_sq"]   = t**2
        # Índice criminalidad
        if "indice_criminalidade" in df:
            df["crime_q"] = pd.qcut(df["indice_criminalidade"],5,labels=False,duplicates="drop")
        # Orientación
        deg = df.get("orientacion", pd.Series()).map(self._orientation_map).fillna(0)
        rad = np.deg2rad(deg)
        df["orient_sin"] = np.sin(rad)
        df["orient_cos"] = np.cos(rad)
        # Geo-clusters
        if self.km_model_ is not None:
            coords = df[["latitude","lonxitude"]].fillna(0)
            df["geo_cluster"] = self.km_model_.predict(coords)
        else:
            df["geo_cluster"] = 0
        # Stats por tipo_edificacion
        if "tipo_edificacion" in df and self.agg_stats_:
            df["type_price_mean"] = df["tipo_edificacion"].map(self.agg_stats_["mean"])
            df["type_price_std"]  = df["tipo_edificacion"].map(self.agg_stats_["std"])
        else:
            df["type_price_mean"] = 0
            df["type_price_std"]  = 0
        # One-hot color favorito
        if "cor_favorita_propietario" in df:
            cols = pd.get_dummies(df["cor_favorita_propietario"], prefix="color")
            df = pd.concat([df, cols], axis=1)
        # Eliminar las originales
        drops = [
          "ano_construccion","superficie_interior_m2","superficie_exterior_m2",
          "numero_habitacions","numero_banos","temperatura_media_mes_construccion",
          "distancia_centro_km","distancia_escola_km","indice_criminalidade",
          "orientacion","tipo_edificacion","cor_favorita_propietario","fecha"
        ]
        for c in drops:
            if c in df: df.drop(columns=c, inplace=True)
        return df


# -----------------------------------------------------------------------------
# 2. Winsorizer Selectivo
# -----------------------------------------------------------------------------
class WinsorizerSelective(BaseEstimator, TransformerMixin):
    def __init__(self, lower=0.005, upper=0.995, active=True):
        self.lower, self.upper, self.active = lower, upper, active
        self.bounds_ = {}
    def fit(self, X, y=None):
        if not self.active: return self
        df = pd.DataFrame(X)
        for c in df.columns:
            lo = df[c].quantile(self.lower)
            hi = df[c].quantile(self.upper)
            self.bounds_[c] = (lo,hi)
        return self
    def transform(self, X):
        if not self.active: return X
        df = pd.DataFrame(X).copy()
        for c,(lo,hi) in self.bounds_.items():
            df[c] = df[c].clip(lo,hi)
        return df.values





In [16]:
# -- Configuración --
DATA_PATH  = "train.csv"
TEST_PATH  = "test.csv"
MODEL_DIR  = "models"
os.makedirs(MODEL_DIR, exist_ok=True)
SEED       = 42
np.random.seed(SEED); random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(SEED)
DEVICE    = torch.device("cuda" if torch.cuda.is_available() else "cpu")

BATCH_SIZE  = 256
TEST_SIZE   = 0.1
MAX_EPOCHS  = 100
PATIENCE    = 10
LR          = 1e-3
SUBMIT_FILE = os.path.join(MODEL_DIR,"submission_pytorch.csv")

# -- 1) Leo datos --
df       = pd.read_csv(DATA_PATH)
df_test  = pd.read_csv(TEST_PATH)

# -- 2) Feature Engineering --
feat_eng = ExtendedFeatureEngineer(current_year=2025, geo_clusters=10)
feat_eng.fit(df, df["prezo_euros"].values)
df_fe       = feat_eng.transform(df)
df_test_fe  = feat_eng.transform(df_test)

# -- 3) ColumnTransformer sobre df_fe --
# Detecto numéricas y ordinales
num_cols = df_fe.select_dtypes(include=np.number).columns.tolist()
# (no hay target en df_fe, así no hay que removerlo)
ord_feats = [f for f in ["calidade_materiais",
                            "acceso_transporte_publico",
                            "eficiencia_enerxetica"]
                if f in df_fe.columns]

num_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("winsor", WinsorizerSelective(active=True)),
    ("scale", RobustScaler())
])
ord_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("encode", OrdinalEncoder(categories=[
        ["Baixa","Media","Alta"],
        ["Malo","Regular","Bo","Moi bo"],
        ["G","F","E","D","C","B","A"]
    ]))
])

preproc = ColumnTransformer([
    ("num", num_pipe, num_cols),
    ("ord", ord_pipe, ord_feats),
], remainder="drop", n_jobs=-1)

X_all = preproc.fit_transform(df_fe)
y_all = df["prezo_euros"].values

# -- 4) Split train/val --
X_tr, X_val, y_tr, y_val = train_test_split(
    X_all, y_all, test_size=TEST_SIZE,
    random_state=SEED
)

# -- 5) DataLoaders --
class HPDataset(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.from_numpy(X).float()
        self.y = torch.from_numpy(y).float() if y is not None else None
    def __len__(self): return len(self.X)
    def __getitem__(self,i):
        return (self.X[i], self.y[i]) if self.y is not None else self.X[i]

tr_dl  = DataLoader(HPDataset(X_tr,y_tr), batch_size=BATCH_SIZE,
                    shuffle=True,  num_workers=2, pin_memory=True)
val_dl = DataLoader(HPDataset(X_val,y_val), batch_size=BATCH_SIZE,
                    shuffle=False, num_workers=2, pin_memory=True)

# -- 6) Modelo PyTorch (mismo que antes) --
class RegressionNN(nn.Module):
    def __init__(self, d): 
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d,256), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(256,128), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(128,64),  nn.ReLU(),
            nn.Linear(64,1)
        )
    def forward(self,x): return self.net(x).squeeze(1)

model   = RegressionNN(X_tr.shape[1]).to(DEVICE)
optim    = torch.optim.Adam(model.parameters(), lr=LR)
loss_fn  = nn.MSELoss()

# -- 7) Entrenamiento + EarlyStopping --
best_rmse = np.inf
wait      = 0

for ep in range(1, MAX_EPOCHS+1):
    model.train()
    for xb,yb in tr_dl:
        xb,yb = xb.to(DEVICE), yb.to(DEVICE)
        loss = loss_fn(model(xb), yb)
        optim.zero_grad(); loss.backward(); optim.step()

    model.eval()
    vals = []
    with torch.no_grad():
        for xb,yb in val_dl:
            xb,yb = xb.to(DEVICE), yb.to(DEVICE)
            p = model(xb)
            vals.append(((p-yb)**2).mean().item())
    rmse = np.sqrt(np.mean(vals))
    print(f"Epoch {ep:03d}  Val RMSE: {rmse:.2f}")

    if rmse + 1e-4 < best_rmse:
        best_rmse = rmse; wait = 0
        torch.save(model.state_dict(), os.path.join(MODEL_DIR,"best.pt"))
    else:
        wait += 1
        if wait >= PATIENCE:
            print("EarlyStopping!") 
            break

# cargo el mejor
model.load_state_dict(torch.load(os.path.join(MODEL_DIR,"best.pt")))

# -- 8) Inference en test --
# Desactivo winsor
preproc.named_transformers_["num"].named_steps["winsor"].active = False

X_test = preproc.transform(df_test_fe)
test_dl = DataLoader(HPDataset(X_test), batch_size=BATCH_SIZE,
                        shuffle=False, num_workers=2, pin_memory=True)

preds = []
model.eval()
with torch.no_grad():
    for xb in test_dl:
        xb = xb.to(DEVICE)
        preds.append(model(xb).cpu().numpy())
preds = np.concatenate(preds, axis=0)

# -- 9) Submission --
submission = pd.DataFrame({
    "id": df_test["id"],
    "prezo_euros": preds
})
submission.to_csv(SUBMIT_FILE, index=False)
print("Submission guardada en", SUBMIT_FILE)



Epoch 001  Val RMSE: 277646.80
Epoch 002  Val RMSE: 237224.98
Epoch 003  Val RMSE: 143107.71
Epoch 004  Val RMSE: 80829.19
Epoch 005  Val RMSE: 72275.65
Epoch 006  Val RMSE: 67498.47
Epoch 007  Val RMSE: 63859.09
Epoch 008  Val RMSE: 61025.03
Epoch 009  Val RMSE: 58807.05
Epoch 010  Val RMSE: 56705.42
Epoch 011  Val RMSE: 55020.72
Epoch 012  Val RMSE: 53601.27
Epoch 013  Val RMSE: 52245.53
Epoch 014  Val RMSE: 50839.45
Epoch 015  Val RMSE: 49565.80
Epoch 016  Val RMSE: 48435.39
Epoch 017  Val RMSE: 46615.49
Epoch 018  Val RMSE: 45111.30
Epoch 019  Val RMSE: 43437.27
Epoch 020  Val RMSE: 41655.45
Epoch 021  Val RMSE: 39612.38
Epoch 022  Val RMSE: 37504.06
Epoch 023  Val RMSE: 35097.13
Epoch 024  Val RMSE: 32482.15
Epoch 025  Val RMSE: 29492.38
Epoch 026  Val RMSE: 26490.58
Epoch 027  Val RMSE: 23415.04
Epoch 028  Val RMSE: 20167.78
Epoch 029  Val RMSE: 17104.09
Epoch 030  Val RMSE: 14050.47
Epoch 031  Val RMSE: 11982.98
Epoch 032  Val RMSE: 10072.03
Epoch

KeyError: "['prezo_euros'] not in index"

In [18]:
DATA_PATH  = "train.csv"
TEST_PATH  = "test.csv"
MODEL_DIR  = "models"
os.makedirs(MODEL_DIR, exist_ok=True)

SEED       = 42
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

DEVICE     = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 256
TEST_SIZE  = 0.1
MAX_EPOCHS = 100
PATIENCE   = 10
LR         = 1e-3
SUBMIT_FILE= os.path.join(MODEL_DIR,"submission_pytorch.csv")

### 1) Leer datos ###
df       = pd.read_csv(DATA_PATH)
df_test  = pd.read_csv(TEST_PATH)

# Separamos id y target
y = df["prezo_euros"].values
X = df.drop(columns=["id","prezo_euros"])

### 2) Feature Engineering ###
feat_eng = ExtendedFeatureEngineer(current_year=2025, geo_clusters=10)
feat_eng.fit(X, y)
df_fe      = feat_eng.transform(X)
df_test_fe = feat_eng.transform(df_test.drop(columns=["id"]))

### 3) Construir ColumnTransformer sobre df_fe ###
# Columnas numéricas dinámicas
num_cols = df_fe.select_dtypes(include=[np.number]).columns.tolist()
# Columnas ordinales específicas
ord_feats = [c for c in [
    "calidade_materiais",
    "acceso_transporte_publico",
    "eficiencia_enerxetica"
] if c in df_fe.columns]

num_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("winsor", WinsorizerSelective(active=True)),
    ("scale", RobustScaler())
])
ord_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("encode", OrdinalEncoder(categories=[
        ["Baixa","Media","Alta"],
        ["Malo","Regular","Bo","Moi bo"],
        ["G","F","E","D","C","B","A"]
    ]))
])

preproc = ColumnTransformer([
    ("num", num_pipe, num_cols),
    ("ord", ord_pipe, ord_feats)
], remainder="drop", n_jobs=-1)

# Preprocesado final
X_all = preproc.fit_transform(df_fe, y)
y_all = y

### 4) Split train/val ###
X_tr, X_val, y_tr, y_val = train_test_split(
    X_all, y_all,
    test_size=TEST_SIZE,
    random_state=SEED
)

### 5) DataLoaders ###
class HPDataset(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.from_numpy(X).float()
        self.y = torch.from_numpy(y).float() if y is not None else None
    def __len__(self): return len(self.X)
    def __getitem__(self, i):
        return (self.X[i], self.y[i]) if self.y is not None else self.X[i]

train_dl = DataLoader(HPDataset(X_tr, y_tr),
                        batch_size=BATCH_SIZE, shuffle=True,
                        num_workers=2, pin_memory=True)
val_dl   = DataLoader(HPDataset(X_val, y_val),
                        batch_size=BATCH_SIZE, shuffle=False,
                        num_workers=2, pin_memory=True)

### 6) Modelo PyTorch ###
class RegressionNN(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, 256), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(256,128), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(128,64),  nn.ReLU(),
            nn.Linear(64,1)
        )
    def forward(self, x): return self.net(x).squeeze(1)

model   = RegressionNN(X_tr.shape[1]).to(DEVICE)
optim    = torch.optim.Adam(model.parameters(), lr=LR)
loss_fn  = nn.MSELoss()

### 7) Entrenamiento + EarlyStopping ###
best_rmse, wait = np.inf, 0
for ep in range(1, MAX_EPOCHS+1):
    model.train()
    for xb,yb in train_dl:
        xb,yb = xb.to(DEVICE), yb.to(DEVICE)
        loss = loss_fn(model(xb), yb)
        optim.zero_grad(); loss.backward(); optim.step()

    model.eval()
    vals = []
    with torch.no_grad():
        for xb,yb in val_dl:
            xb,yb = xb.to(DEVICE), yb.to(DEVICE)
            p     = model(xb)
            vals.append(((p-yb)**2).mean().item())
    rmse = np.sqrt(np.mean(vals))
    print(f"Epoch {ep:03d}  Val RMSE: {rmse:.2f}")

    if rmse + 1e-4 < best_rmse:
        best_rmse, wait = rmse, 0
        torch.save(model.state_dict(), os.path.join(MODEL_DIR,"best.pt"))
    else:
        wait += 1
        if wait >= PATIENCE:
            print("EarlyStopping!") 
            break

model.load_state_dict(torch.load(os.path.join(MODEL_DIR,"best.pt")))

### 8) Inference en test ###
# Desactivo winsor para no recortar en test
preproc.named_transformers_["num"].named_steps["winsor"].active = False

X_test = preproc.transform(df_test_fe)
test_dl = DataLoader(HPDataset(X_test),
                        batch_size=BATCH_SIZE,
                        shuffle=False,
                        num_workers=2,
                        pin_memory=True)

preds = []
model.eval()
with torch.no_grad():
    for xb in test_dl:
        xb = xb.to(DEVICE)
        preds.append(model(xb).cpu().numpy())
preds = np.concatenate(preds, axis=0)

### 9) Submission ###
submission = pd.DataFrame({
    "id": df_test["id"],
    "prezo_euros": preds
})
submission.to_csv(SUBMIT_FILE, index=False)
print("Submission guardada en", SUBMIT_FILE)



Epoch 001  Val RMSE: 276841.25
Epoch 002  Val RMSE: 225659.90
Epoch 003  Val RMSE: 125480.16
Epoch 004  Val RMSE: 86156.02
Epoch 005  Val RMSE: 79636.00
Epoch 006  Val RMSE: 75523.72
Epoch 007  Val RMSE: 72652.40
Epoch 008  Val RMSE: 70622.13
Epoch 009  Val RMSE: 69402.00
Epoch 010  Val RMSE: 68188.64
Epoch 011  Val RMSE: 67500.02
Epoch 012  Val RMSE: 67000.54
Epoch 013  Val RMSE: 66664.15
Epoch 014  Val RMSE: 66401.92
Epoch 015  Val RMSE: 66403.15
Epoch 016  Val RMSE: 66085.30
Epoch 017  Val RMSE: 65960.38
Epoch 018  Val RMSE: 65953.91
Epoch 019  Val RMSE: 65850.73
Epoch 020  Val RMSE: 65808.87
Epoch 021  Val RMSE: 65987.14
Epoch 022  Val RMSE: 65612.94
Epoch 023  Val RMSE: 65677.18
Epoch 024  Val RMSE: 65779.35
Epoch 025  Val RMSE: 65500.46
Epoch 026  Val RMSE: 65525.02
Epoch 027  Val RMSE: 65492.09
Epoch 028  Val RMSE: 65435.15
Epoch 029  Val RMSE: 65484.67
Epoch 030  Val RMSE: 65493.20
Epoch 031  Val RMSE: 65634.56
Epoch 032  Val RMSE: 65437.89
Epoch