In [37]:
# Modelado para Competición Kaggle - Precios de Viviendas en Galicia
# =============================================================================

# Importación de bibliotecas necesarias
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import os
import joblib
import time
import warnings
warnings.filterwarnings('ignore')

# Configuración para reproducibilidad
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Configuración de visualización
#plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("viridis")
pd.set_option('display.max_columns', None)




In [38]:
# =============================================================================
# 1. CARGA DE DATOS PROCESADOS
# =============================================================================
print("Cargando datos procesados...")
if not os.path.exists('train_processed.csv') or not os.path.exists('test_processed.csv'):
    raise FileNotFoundError("No se encontraron los archivos procesados. Ejecute primero el script de preprocesamiento.")

train_data = pd.read_csv('train.csv')
test_data  = pd.read_csv('test.csv')
#train_data = pd.read_csv('train_processed.csv')
#test_data  = pd.read_csv('test_processed.csv')
print(f"Dimensiones del conjunto de entrenamiento: {train_data.shape}")
print(f"Dimensiones del conjunto de prueba: {test_data.shape}")


Cargando datos procesados...
Dimensiones del conjunto de entrenamiento: (20000, 20)
Dimensiones del conjunto de prueba: (10000, 20)


In [39]:

# =============================================================================
# 2. PREPARACIÓN PARA EL MODELADO
# =============================================================================
print("\nPreparando datos para el modelado...")
if 'log_prezo' in train_data.columns:
    print("Usando transformación logarítmica de precios para el modelado...")
    y = train_data['log_prezo']
    use_log = True
else:
    print("Usando precios originales para el modelado...")
    y = train_data['prezo_euros']
    use_log = False


# Excluir ID, target, indicador outlier y variables categóricas originales
exclude = ['id', 'prezo_euros', 'log_prezo', 'is_outlier',
           'tipo_edificacion', 'calidade_materiais', 'cor_favorita_propietario',
           'acceso_transporte_publico', 'orientacion', 'eficiencia_enerxetica']
feature_cols = [c for c in train_data.columns if c not in exclude]
X = train_data[feature_cols]

# Imputar faltantes restantes
if X.isnull().sum().sum() > 0:
    print("ADVERTENCIA: Hay valores faltantes en las características. Imputando con la mediana...")
    X = X.fillna(X.median())

# Dividir en train/validation
#time = 0
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)
print(f"Tamaño del train: {X_train.shape}, validación: {X_val.shape}")




Preparando datos para el modelado...
Usando precios originales para el modelado...
ADVERTENCIA: Hay valores faltantes en las características. Imputando con la mediana...
Tamaño del train: (16000, 12), validación: (4000, 12)


In [40]:
# =============================================================================
# 3. FUNCIONES AUXILIARES PARA EVALUACIÓN
# =============================================================================
def evaluate_model(model, X_val, y_val, use_log=False):
    y_pred = model.predict(X_val)
    if use_log:
        y_pred = np.expm1(y_pred)
        y_true = np.expm1(y_val)
    else:
        y_true = y_val
    return {
        'MAE': mean_absolute_error(y_true, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
        'R2': r2_score(y_true, y_pred)
    }

def cross_validate_model(model, X, y, cv=5, use_log=False):
    kf = KFold(n_splits=cv, shuffle=True, random_state=RANDOM_SEED)
    maes, r2s = [], []
    for train_idx, val_idx in kf.split(X):
        X_tr, X_va = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_va = y.iloc[train_idx], y.iloc[val_idx]
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_va)
        if use_log:
            y_pred = np.expm1(y_pred)
            y_va = np.expm1(y_va)
        maes.append(mean_absolute_error(y_va, y_pred))
        r2s.append(r2_score(y_va, y_pred))
    return {
        'MAE_CV': np.mean(maes),
        'MAE_STD': np.std(maes),
        'R2_CV': np.mean(r2s),
        'R2_STD': np.std(r2s)
    }

def make_submission(model, test_data, feature_cols, filename='submission.csv', use_log=False):
    X_test = test_data[feature_cols]
    if X_test.isnull().sum().sum() > 0:
        X_test = X_test.fillna(X_test.median())
    preds = model.predict(X_test)
    if use_log:
        preds = np.expm1(preds)
    submission = pd.DataFrame({'id': test_data['id'], 'prezo_euros': preds})
    submission.to_csv(filename, index=False)
    print(f"Submission guardado como {filename}")
    return submission


In [41]:
# =============================================================================
# 4. ENTRENAMIENTO DE MODELOS BÁSICOS
# =============================================================================
print("\nEntrenando modelos básicos...")
models={}
def train_and_record(name,model):
    t0=time.time(); model.fit(X_train,y_train)
    t=time.time()-t0
    ev=evaluate_model(model,X_val,y_val,use_log)
    cv=cross_validate_model(model,X,y,cv=5,use_log=use_log)
    models[name]={**ev,**cv,'train_time':t,'model':model}
    print(f"{name}: MAE={ev['MAE']:.4f}, RMSE={ev['RMSE']:.4f}, R2={ev['R2']:.4f}, time={t:.1f}s")
algorithms=[
    ('LinearRegression',LinearRegression()),
    ('Ridge',Ridge(random_state=RANDOM_SEED)),
    ('Lasso',Lasso(max_iter=10000,random_state=RANDOM_SEED)),
    ('ElasticNet',ElasticNet(max_iter=10000,random_state=RANDOM_SEED)),
    ('RandomForest',RandomForestRegressor(n_jobs=-1,random_state=RANDOM_SEED)),
    ('ExtraTrees',ExtraTreesRegressor(n_jobs=-1,random_state=RANDOM_SEED)),
    ('GradientBoosting',GradientBoostingRegressor(random_state=RANDOM_SEED)),
    ('XGBoost',xgb.XGBRegressor(n_jobs=-1,random_state=RANDOM_SEED)),
    ('LightGBM',lgb.LGBMRegressor(n_jobs=-1,random_state=RANDOM_SEED)),
    ('CatBoost',cb.CatBoostRegressor(verbose=0,random_state=RANDOM_SEED))
]
for n,m in algorithms: train_and_record(n,m)
print("\nResumen inicial:")
init_results=pd.DataFrame(models).T[['MAE','RMSE','R2','MAE_CV','R2_CV']].sort_values('MAE')
print(init_results)


Entrenando modelos básicos...
LinearRegression: MAE=43835.5804, RMSE=65771.1327, R2=0.8450, time=0.0s
Ridge: MAE=43835.1839, RMSE=65771.0286, R2=0.8450, time=0.0s
Lasso: MAE=43835.2397, RMSE=65771.0229, R2=0.8450, time=0.0s
ElasticNet: MAE=43916.3550, RMSE=66378.9186, R2=0.8421, time=0.0s
RandomForest: MAE=39774.6542, RMSE=55032.7295, R2=0.8915, time=0.4s
ExtraTrees: MAE=40389.6128, RMSE=56102.0198, R2=0.8872, time=0.3s
GradientBoosting: MAE=39641.2068, RMSE=55219.2893, R2=0.8907, time=3.4s
XGBoost: MAE=40540.0259, RMSE=56376.5742, R2=0.8861, time=1.8s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.040174 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1948
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 12
[LightGBM] [Info] Start training from score 223357.201938
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.00066

In [None]:
# =============================================================================
# 5. BÚSQUEDA DE HIPERPARÁMETROS PARA TOP 4 MODELOS
# =============================================================================
top4=init_results.index[:4].tolist()
param_grids = {
    'Ridge': {'alpha':[0.1,1,10]},
    'RandomForest': {'n_estimators':[100,200],'max_depth':[None,10,20]},
    'XGBoost': {'n_estimators':[100,200],'learning_rate':[0.1,0.01],'max_depth':[3,6]},
    'LightGBM': {'n_estimators':[100,200],'learning_rate':[0.1,0.01],'num_leaves':[31,50]}
}
tuned_models={}
for name in top4:
    print(f"\nTuning {name}...")
    base=models[name]['model']
    grid=GridSearchCV(base.__class__(**{k:v for k,v in base.get_params().items() if k in ['random_state','n_jobs']}, ),
                      param_grids.get(name,{}),cv=3,scoring='neg_mean_absolute_error',verbose=1)
    grid.fit(X_train,y_train)
    best=grid.best_estimator_
    ev=evaluate_model(best,X_val,y_val,use_log)
    cv=cross_validate_model(best,X,y,cv=5,use_log=use_log)
    tuned_models[name+'_tuned']={**ev,**cv,'model':best}
    print(f"{name} best params: {grid.best_params_}, MAE: {ev['MAE']:.4f}")


Tuning CatBoost...
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Learning rate set to 0.058682
0:	learn: 0.6655428	total: 2.44ms	remaining: 2.44s
1:	learn: 0.6347277	total: 4.36ms	remaining: 2.18s
2:	learn: 0.6065422	total: 8.92ms	remaining: 2.96s
3:	learn: 0.5798605	total: 10.6ms	remaining: 2.64s
4:	learn: 0.5555618	total: 12.3ms	remaining: 2.45s
5:	learn: 0.5323836	total: 14.1ms	remaining: 2.33s
6:	learn: 0.5106808	total: 15.7ms	remaining: 2.23s
7:	learn: 0.4907553	total: 17.5ms	remaining: 2.16s
8:	learn: 0.4722655	total: 19.1ms	remaining: 2.11s
9:	learn: 0.4554251	total: 20.9ms	remaining: 2.07s
10:	learn: 0.4387480	total: 22.6ms	remaining: 2.03s
11:	learn: 0.4236959	total: 24.2ms	remaining: 1.99s
12:	learn: 0.4092708	total: 25.9ms	remaining: 1.97s
13:	learn: 0.3961747	total: 27.6ms	remaining: 1.95s
14:	learn: 0.3844215	total: 29.3ms	remaining: 1.92s
15:	learn: 0.3731720	total: 31.1ms	remaining: 1.91s
16:	learn: 0.3629777	total: 32.8ms	remaining: 1.9s
17:	learn: 0.35381

In [None]:
# =============================================================================
# 6. RESUMEN FINAL Y SELECCIÓN DE MEJOR MODELO
# =============================================================================
print("\nResumen modelos tuneds:")
tuned_results=pd.DataFrame(tuned_models).T[['MAE','RMSE','R2','MAE_CV','R2_CV']].sort_values('MAE')
print(tuned_results)
best_name=tuned_results.index[0]
best_model=tuned_models[best_name]['model']
print(f"\nMejor modelo final: {best_name} con MAE {tuned_results.loc[best_name,'MAE']:.4f}")
joblib.dump(best_model,f"best_model_{best_name}.pkl")

# =============================================================================
# 7. GENERAR SUBMISSION
# =============================================================================
submission=make_submission(best_model,test_data,feature_cols,filename='submission.csv',use_log=use_log)
print(submission.head())



Resumen modelos tuneds:
                                 MAE          RMSE        R2        MAE_CV  \
CatBoost_tuned          30884.773224  43914.417158  0.927614  30991.575988   
LightGBM_tuned          31093.025209  44645.364924  0.925184  31328.578358   
GradientBoosting_tuned  32263.652675  46279.607366  0.919606  32316.475614   
ExtraTrees_tuned        33410.470595  47479.954201  0.915382  33303.284948   

                           R2_CV  
CatBoost_tuned          0.928534  
LightGBM_tuned           0.92622  
GradientBoosting_tuned  0.921404  
ExtraTrees_tuned        0.917691  

Mejor modelo final: CatBoost_tuned con MAE 30884.7732
Submission guardado como submission.csv
      id    prezo_euros
0   2309  449298.999251
1  22405  138571.774657
2  23398  373287.090040
3  25059  260324.057253
4   2665  582071.699270


Resumen modelos tuneds:
                                 MAE          RMSE        R2        MAE_CV  \
CatBoost_tuned          29712.308361  42271.989821  0.935963   30122.30243   
LightGBM_tuned          30295.700301  42778.390617  0.934419  30660.343119   
GradientBoosting_tuned  31371.522632  44632.757482  0.928611  31545.077512   
ExtraTrees_tuned        32079.344865  45068.092427  0.927211  32306.690182   

                           R2_CV  
CatBoost_tuned          0.932856  
LightGBM_tuned          0.930329  
GradientBoosting_tuned  0.925548  
ExtraTrees_tuned        0.923447  

Mejor modelo final: CatBoost_tuned con MAE 29712.3084
Submission guardado como submission.csv
      id    prezo_euros
0   2309  451844.676306
1  22405  140804.355573
2  23398  377153.396202
3  25059  265152.086969
4   2665  586952.930288