In [4]:
import os
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import joblib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import ElasticNet


In [12]:

# ----------------------------
# 1. Configuración y semillas
# ----------------------------
DATA_PATH    = 'train.csv'
TEST_PATH    = 'test.csv'
MODEL_DIR    = 'models'
os.makedirs(MODEL_DIR, exist_ok=True)

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_STATE)

DEVICE       = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE   = 256
MAX_EPOCHS   = 100
PATIENCE     = 10
LEARNING_RATE= 1e-3
TEST_SIZE    = 0.1
SUBMIT_FILE  = os.path.join(MODEL_DIR, 'submission_pytorch.csv')





In [13]:
###############################################################################
# 1. Extended Feature Engineering
###############################################################################
class ExtendedFeatureEngineer(BaseEstimator, TransformerMixin):
    _orientation_map = {
        "Norte": 0, "Nordeste": 45, "Este": 90, "Sudeste": 135,
        "Sur": 180, "Sudoeste": 225, "Oeste": 270, "Noroeste": 315,
    }

    def __init__(self, current_year: int = 2025, geo_clusters: int = 10):
        self.current_year = current_year
        self.geo_clusters = geo_clusters
        self.km_model_ = None
        self.agg_stats_ = {}

    def fit(self, X: pd.DataFrame, y=None):
        # Fit KMeans on latitude & longitude
        if {"latitud", "longitud"}.issubset(X.columns):
            coords = X[["latitud", "longitud"]].fillna(0)
            self.km_model_ = KMeans(n_clusters=self.geo_clusters, random_state=42)
            self.km_model_.fit(coords)
        # Compute aggregate stats by building type if target y provided
        if y is not None and 'tipo_edificacion' in X.columns:
            grouped = pd.DataFrame({'precio': y, 'type': X['tipo_edificacion']})
            agg = grouped.groupby('type').precio.agg(['mean', 'std'])
            self.agg_stats_ = agg.to_dict()
        return self

    def transform(self, X: pd.DataFrame):
        df = X.copy()
        # Age features
        df['antiguedad'] = self.current_year - df['ano_construccion']
        df['antiguedad2'] = df['antiguedad'] ** 2
        df['decada'] = (df['ano_construccion'] // 10) * 10

        # Surface features
        df['superficie_total'] = df['superficie_interior_m2'].fillna(0) + df['superficie_exterior_m2'].fillna(0)
        df['log_superficie_total'] = np.log1p(df['superficie_total'])
        df['habitacion_area'] = df['superficie_interior_m2'] / df['numero_habitacions'].replace(0, np.nan)
        df['banos_area'] = df['superficie_interior_m2'] / df['numero_banos'].replace(0, np.nan)

        # Distance transformations
        for col in ['distancia_centro_km', 'distancia_escola_km']:
            if col in df.columns:
                df[f'log_{col}'] = np.log1p(df[col])
                df[f'inv_{col}'] = 1 / (df[col] + 0.1)

        # Temperature features
        if 'temperatura_media_mes_construccion' in df.columns:
            temp = df['temperatura_media_mes_construccion']
            df['temp_norm'] = (temp - temp.mean()) / temp.std()
            df['temp_sq'] = temp ** 2

        # Crime index buckets
        if 'indice_criminalidade' in df.columns:
            df['crime_q'] = pd.qcut(df['indice_criminalidade'], 5, labels=False, duplicates='drop')

        # Orientation encoding
        deg = df.get('orientacion', pd.Series()).map(self._orientation_map).fillna(0)
        rad = np.deg2rad(deg)
        df['orient_sin'] = np.sin(rad)
        df['orient_cos'] = np.cos(rad)

        # Geo clusters
        if self.km_model_ is not None:
            coords = df[['latitud', 'longitud']].fillna(0)
            df['geo_cluster'] = self.km_model_.predict(coords)
        else:
            df['geo_cluster'] = 0

        # Aggregated stats by building type
        if 'tipo_edificacion' in df.columns and self.agg_stats_:
            df['type_price_mean'] = df['tipo_edificacion'].map(self.agg_stats_['mean'])
            df['type_price_std'] = df['tipo_edificacion'].map(self.agg_stats_['std'])
        else:
            df['type_price_mean'] = 0
            df['type_price_std'] = 0

        # One-hot favorite color if exists
        if 'cor_favorita_propietario' in df.columns:
            colors = pd.get_dummies(df['cor_favorita_propietario'], prefix='color')
            df = pd.concat([df, colors], axis=1)

        # Date features
        if 'fecha' in df.columns:
            dt = pd.to_datetime(df['fecha'], errors='coerce')
            df['mes'] = dt.dt.month
            df['dia'] = dt.dt.day
            df['dia_semana'] = dt.dt.weekday
            df['is_fin_de_semana'] = dt.dt.weekday.isin([5, 6]).astype(int)

        # Drop original columns
        drops = [
            'ano_construccion', 'superficie_interior_m2', 'superficie_exterior_m2',
            'numero_habitacions', 'numero_banos', 'temperatura_media_mes_construccion',
            'distancia_centro_km', 'distancia_escola_km', 'indice_criminalidade',
            'orientacion', 'tipo_edificacion', 'cor_favorita_propietario', 'fecha'
        ]
        df.drop(columns=[c for c in drops if c in df.columns], inplace=True)
        return df

###############################################################################
# 2. Winsorizer Selectivo
###############################################################################
class WinsorizerSelective(BaseEstimator, TransformerMixin):
    def __init__(self, lower=0.005, upper=0.995, active=True):
        self.lower = lower
        self.upper = upper
        self.active = active
        self.bounds_ = {}

    def fit(self, X, y=None):
        if not self.active:
            return self
        df = pd.DataFrame(X)
        for col in df.columns:
            lo = df[col].quantile(self.lower)
            hi = df[col].quantile(self.upper)
            self.bounds_[col] = (lo, hi)
        return self

    def transform(self, X):
        if not self.active:
            return X
        df = pd.DataFrame(X).copy()
        for col, (lo, hi) in self.bounds_.items():
            df[col] = df[col].clip(lo, hi)
        return df.values

###############################################################################
# 3. Build Preprocessor
###############################################################################
def build_preprocessor(df_sample: pd.DataFrame, is_train=True):
    target = 'prezo_euros'

    # Identify numeric and categorical cols
    numeric_cols = df_sample.select_dtypes(include='number').columns.tolist()
    if target in numeric_cols:
        numeric_cols.remove(target)

    # Engineered features placeholders
    engineered = [
        'antiguedad', 'antiguedad2', 'decada',
        'superficie_total', 'log_superficie_total', 'habitacion_area', 'banos_area',
        'log_distancia_centro_km', 'inv_distancia_centro_km',
        'log_distancia_escola_km', 'inv_distancia_escola_km',
        'temp_norm', 'temp_sq', 'crime_q',
        'orient_sin', 'orient_cos', 'geo_cluster',
        'type_price_mean', 'type_price_std',
        'mes', 'dia', 'dia_semana', 'is_fin_de_semana',
        # plus any one-hot color_* columns
    ]
    numeric_cols += [f for f in engineered if f in df_sample.columns]

    categorical_cols = [c for c in df_sample.select_dtypes(include='object').columns]

    # Ordinal mappings
    ordinal_maps = {
        'calidade_materiais': ['Baixa', 'Media', 'Alta'],
        'acceso_transporte_publico': ['Malo', 'Regular', 'Bo', 'Moi bo'],
        'eficiencia_enerxetica': ['G', 'F', 'E', 'D', 'C', 'B', 'A'],
    }
    ord_feats = [c for c in ordinal_maps if c in categorical_cols]
    ord_cats = [ordinal_maps[c] for c in ord_feats]
    onehot_feats = [c for c in categorical_cols if c not in ord_feats]

    # Pipelines
    num_pipe = Pipeline([
        ('impute', SimpleImputer(strategy='median')),
        ('winsor', WinsorizerSelective(active=is_train)),
        ('scale', RobustScaler())
    ])
    ord_pipe = Pipeline([
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('encode', OrdinalEncoder(categories=ord_cats))
    ])
    ohe_pipe = Pipeline([
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer([
        ('num', num_pipe, numeric_cols),
        ('ord', ord_pipe, ord_feats),
        ('ohe', ohe_pipe, onehot_feats)
    ], remainder='drop', n_jobs=-1)

    # Full pipeline
    full_pipeline = Pipeline([
        ('features', ExtendedFeatureEngineer()),
        ('preproc', preprocessor)
    ])

    return full_pipeline, target

In [14]:
###############################################################################
# 4. PyTorch Training Script
###############################################################################

# Load raw data
df_train = pd.read_csv(DATA_PATH)
df_test  = pd.read_csv(TEST_PATH)

# Build and apply preprocessing
pipeline, target = build_preprocessor(df_train, is_train=True)
X_all  = pipeline.fit_transform(df_train, df_train[target])
y_all  = df_train[target].values

# Train/validation split
X_train, X_val, y_train, y_val = train_test_split(
    X_all, y_all, test_size=0.1, random_state=RANDOM_STATE
)

# Dataset & DataLoader
class HousePriceDataset(Dataset):
    def __init__(self, features, targets=None):
        self.X = torch.from_numpy(features).float()
        self.y = torch.from_numpy(targets).float() if targets is not None else None
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        if self.y is None:
            return self.X[idx]
        return self.X[idx], self.y[idx]

train_ds    = HousePriceDataset(X_train, y_train)
val_ds      = HousePriceDataset(X_val,   y_val)
train_loader= DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,  num_workers=4, pin_memory=True)
val_loader  = DataLoader(val_ds,   batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)

# Model definition (very large)
class RegressionNNVeryBig(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 4096), nn.BatchNorm1d(4096), nn.ReLU(), nn.Dropout(0.5),
            nn.Linear(4096, 2048), nn.BatchNorm1d(2048), nn.ReLU(), nn.Dropout(0.5),
            nn.Linear(2048, 1024), nn.BatchNorm1d(1024), nn.ReLU(), nn.Dropout(0.4),
            nn.Linear(1024, 512),  nn.BatchNorm1d(512),  nn.ReLU(), nn.Dropout(0.4),
            nn.Linear(512, 256),   nn.BatchNorm1d(256),  nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(256, 128),   nn.BatchNorm1d(128),  nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(128, 64),    nn.BatchNorm1d(64),   nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(64, 32),     nn.BatchNorm1d(32),   nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(32, 1)
        )
    def forward(self, x):
        return self.model(x).squeeze(1)

model = RegressionNNVeryBig(X_train.shape[1]).to(DEVICE)

# Optimizer, loss, early stopping
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.MSELoss()
best_val = np.inf
wait = 0

for epoch in range(1, MAX_EPOCHS + 1):
    model.train()
    for xb, yb in train_loader:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        preds  = model(xb)
        loss   = criterion(preds, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model.eval()
    val_losses = []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            preds  = model(xb)
            val_losses.append(((preds - yb)**2).mean().item())
    val_rmse = np.sqrt(np.mean(val_losses))
    print(f"Epoch {epoch:03d}  Val RMSE: {val_rmse:.2f}")

    if val_rmse + 1e-4 < best_val:
        best_val = val_rmse
        wait = 0
        torch.save(model.state_dict(), os.path.join(MODEL_DIR, 'best_preproc.pt'))
    else:
        wait += 1
        if wait >= PATIENCE:
            print(f"Stopping after {epoch} epochs without improvement.")
            break

# Load best model
model.load_state_dict(torch.load(os.path.join(MODEL_DIR, 'best_preproc.pt')))
print(f"Best Val RMSE: {best_val:.2f} euros")

# Generate submission
pipeline.set_params(preproc__num__winsor__active=False)
X_test = pipeline.transform(df_test)
test_ds     = HousePriceDataset(X_test)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)
preds = []
model.eval()
with torch.no_grad():
    for xb in test_loader:
        xb = xb.to(DEVICE)
        preds.append(model(xb).cpu().numpy())
preds = np.concatenate(preds, axis=0)
submission = pd.DataFrame({'id': df_test['id'], 'prezo_euros': preds})
submission.to_csv(SUBMIT_FILE, index=False)
print(f"Submission saved to {SUBMIT_FILE}")



ValueError: A given column is not a column of the dataframe