<a href="https://colab.research.google.com/github/Teodosiodg2002/practica-acuity/blob/main/HousePricesKaggle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Instalación (Colab) e imports

In [None]:
!pip install -q tensorflow_decision_forests
from pathlib import Path
from google.colab import drive

drive.mount('/content/drive')
# ------------- CONFIG -------------
RANDOM_STATE = 42
DATA_DIR = Path('/content/drive/MyDrive/casas')
LABEL = "PrecioVenta"
# ----------------------------------

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Train: (1460, 81) Test: (1459, 80)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [None]:
# ============================================================
# 1. IMPORTS
# ============================================================
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_decision_forests as tfdf
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from pathlib import Path

# ============================================================
# 2. CONFIGURATION
# ============================================================
DATA_DIR = Path("/content/drive/MyDrive/casas")
TRAIN_CSV = DATA_DIR.joinpath("train.csv")
TEST_CSV = DATA_DIR.joinpath("test.csv")
SUBMISSION_CSV = Path("submission.csv")

LABEL = "SalePrice"
TARGET = "log_SalePrice"  # the model is trained on log1p(SalePrice)
RANDOM_STATE = 42

# Model selector: "gbt" (GradientBoostedTrees) or "rf" (RandomForest)
MODEL_CHOICE = "gbt"

# Hyperparameters, keyed by the MODEL_CHOICE value they apply to
HYPERPARAMS = dict(
    gbt=dict(num_trees=300, max_depth=6),
    rf=dict(num_trees=500, max_depth=16),
)

# ============================================================
# 3. PREPROCESSING FUNCTION (SHARED BY TRAIN AND TEST)
# ============================================================
def preprocess(df: pd.DataFrame, *, medians: "pd.Series | None" = None) -> pd.DataFrame:
    """
    Fill missing values and return the result as a new DataFrame.

      - Numeric columns: NaNs are replaced by per-column medians.
      - Object columns: nulls become 'MISSING' and values are cast to str.
      - No one-hot encoding or scaling is performed.

    Args:
        df: Input frame. It is copied; the caller's frame is not modified.
        medians: Optional fill values indexed by column name (for example
            the medians of the training frame, so that test data is imputed
            with training statistics). When None, the medians of ``df``
            itself are used, which is the previous behaviour.

    Returns:
        A copy of ``df`` with the fills applied. Numeric columns that have
        no entry in ``medians`` (or whose median is NaN) keep their NaNs.
    """
    out = df.copy()
    num_cols = out.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = out.select_dtypes(include=["object"]).columns.tolist()

    # Numeric imputation: fillna with a Series aligns on column names.
    if num_cols:
        fill_values = out[num_cols].median() if medians is None else medians
        out[num_cols] = out[num_cols].fillna(fill_values)

    # Categorical imputation: explicit 'MISSING' category, values forced to str.
    for c in cat_cols:
        out[c] = out[c].fillna("MISSING").astype(str)

    return out

# ============================================================
# 4. LOAD DATA
# ============================================================
print("Cargando datos...")
df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

# The test Ids are written to the submission file, so abort early without them.
if "Id" not in test_df.columns:
    raise ValueError("No se encontró columna 'Id' en test.csv")
test_ids = test_df["Id"].copy()
test_df = test_df.drop(columns=["Id"])

# Drop Id from the training frame as well when it is present.
df = df.drop(columns=["Id"], errors="ignore")

print("Train shape:", df.shape, "Test shape:", test_df.shape)

# ============================================================
# 5. PREPROCESSING
# ============================================================
# NOTE(review): each frame is imputed with its own medians (train and test
# statistics are computed separately) — confirm this is intended.
df, test_pp = preprocess(df), preprocess(test_df)

# ============================================================
# 6. TARGET TRANSFORM (log1p)
# ============================================================
df[TARGET] = np.log1p(df[LABEL])

# ============================================================
# 7. TRAIN / VALID SPLIT (hold-out used to monitor overfitting)
# ============================================================
train_df, valid_df = train_test_split(
    df,
    test_size=0.20,
    random_state=RANDOM_STATE,
)

print("Train/Valid sizes:", len(train_df), len(valid_df))

# ============================================================
# 8. CONVERT TO TF DATASETS (TF-DF)
# ============================================================
label_col = TARGET

# The raw price column is removed so that only TARGET is used as the label.
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    train_df.drop(LABEL, axis="columns"),
    label=label_col,
    task=tfdf.keras.Task.REGRESSION,
)
valid_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    valid_df.drop(LABEL, axis="columns"),
    label=label_col,
    task=tfdf.keras.Task.REGRESSION,
)
# No label is passed for the test frame.
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    test_pp,
    task=tfdf.keras.Task.REGRESSION,
)

# ============================================================
# 9. BUILD THE MODEL SELECTED BY MODEL_CHOICE
# ============================================================
print(f"Creando modelo: {MODEL_CHOICE}")

# Reject unknown selectors before touching the hyperparameter table.
if MODEL_CHOICE not in ("gbt", "rf"):
    raise ValueError("MODEL_CHOICE debe ser 'gbt' o 'rf'")

p = HYPERPARAMS[MODEL_CHOICE]
model_cls = (
    tfdf.keras.GradientBoostedTreesModel
    if MODEL_CHOICE == "gbt"
    else tfdf.keras.RandomForestModel
)
model = model_cls(
    task=tfdf.keras.Task.REGRESSION,
    num_trees=p["num_trees"],
    max_depth=p["max_depth"],
    random_seed=RANDOM_STATE,
)

# ============================================================
# 10. TRAINING (on train_ds)
# ============================================================
print("Entrenando...")
model.fit(train_ds)
print("Entrenamiento finalizado.")
# summary() prints the report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

# ============================================================
# 11. EVALUATION ON THE VALIDATION SPLIT
# ============================================================
y_true_log = valid_df[TARGET].to_numpy()
preds_log_valid = model.predict(valid_ds)[:, 0]

# Error measured in the log1p space the model was trained in.
rmse_log = np.sqrt(mean_squared_error(y_true_log, preds_log_valid))
print(f"RMSE en valid (log1p scale): {rmse_log:.6f}")

# Undo the log1p transform to report the error in price units.
y_true_price, preds_price_valid = np.expm1(y_true_log), np.expm1(preds_log_valid)
rmse_price = np.sqrt(mean_squared_error(y_true_price, preds_price_valid))
print(f"RMSE en precio (valid): {rmse_price:.2f} USD aprox.")

# ============================================================
# 12. VARIABLE IMPORTANCES (NUM_AS_ROOT)
# ============================================================
inspector = model.make_inspector()
all_importances = inspector.variable_importances()
if "NUM_AS_ROOT" not in all_importances:
    print("No hay variable importance 'NUM_AS_ROOT' disponible.")
else:
    vi = all_importances["NUM_AS_ROOT"]
    print("\nTop features (NUM_AS_ROOT):")
    # Only the first 15 entries of the list are shown.
    for feat, score in vi[:15]:
        print(f"  - {feat.name}: {score}")

# ============================================================
# 13. RETRAIN ON THE FULL TRAINING DATA
# ============================================================
RETRAIN_ON_ALL = True
if not RETRAIN_ON_ALL:
    # Keep the model that was fitted on the train split only.
    model_for_submission = model
else:
    print("\nReentrenando modelo final con todo el train (train+valid)...")

    full_train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
        df.drop(columns=[LABEL]),
        label=TARGET,
        task=tfdf.keras.Task.REGRESSION,
    )

    # Any selector other than "gbt" falls through to the random forest here.
    use_gbt = MODEL_CHOICE == "gbt"
    p = HYPERPARAMS["gbt" if use_gbt else "rf"]
    final_cls = (
        tfdf.keras.GradientBoostedTreesModel
        if use_gbt
        else tfdf.keras.RandomForestModel
    )
    final_model = final_cls(
        task=tfdf.keras.Task.REGRESSION,
        num_trees=p["num_trees"],
        max_depth=p["max_depth"],
        random_seed=RANDOM_STATE,
    )

    final_model.fit(full_train_ds)
    model_for_submission = final_model

# ============================================================
# 14. PREDICT ON TEST AND WRITE THE SUBMISSION
# ============================================================
preds_log_test = model_for_submission.predict(test_ds)[:, 0]
# Predictions are in log1p space; expm1 maps them back to prices.
preds_test_price = np.expm1(preds_log_test)

submission = pd.DataFrame({"Id": test_ids}).assign(SalePrice=preds_test_price)
submission.to_csv(SUBMISSION_CSV, index=False)
print(submission.head())


Cargando datos...
Train shape: (1460, 80) Test shape: (1459, 79)
Train/Valid sizes: 1168 292
Creando modelo: gbt
Use /tmp/tmpo0kabp6n as temporary training directory
Entrenando...
Reading training dataset...
Training dataset read in 0:00:00.845611. Found 1168 examples.
Training model...
Model trained in 0:00:02.202002
Compiling model...
Model compiled.
Entrenamiento finalizado.
Model: "gradient_boosted_trees_model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1 (1.00 Byte)
Trainable params: 0 (0.00 Byte)
Non-trainable params: 1 (1.00 Byte)
_________________________________________________________________
Type: "GRADIENT_BOOSTED_TREES"
Task: REGRESSION
Label: "__LABEL"

Input Features (79):
	1stFlrSF
	2ndFlrSF
	3SsnPorch
	Alley
	BedroomAbvGr
	BldgType
	BsmtCond
	BsmtExposure
	BsmtFinSF1
	BsmtFinSF2
	BsmtFinType1
	BsmtFinType2
	BsmtFullBath
	BsmtHalfBath
	BsmtQual
	BsmtUnfSF
	CentralAir