In [None]:
import os
import json
import random
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm

from sklearn.metrics import root_mean_squared_error,  r2_score
from xgboost import XGBRegressor
import joblib

from src.dataset import PropertyDataset
from src.models import (
    LateFusionModel,
    WeightedLateFusionModel,
    CNNResidualRegressor
)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

OUTPUT_DIR = "outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

METRICS_PATH = os.path.join(OUTPUT_DIR, "metrics.json")


In [93]:
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)


In [94]:
def evaluate(y_true, y_pred):
    return {
        "rmse": float(root_mean_squared_error(y_true, y_pred)),
        "r2": float(r2_score(y_true, y_pred))
    }

def update_metrics(model_name, values, path=METRICS_PATH):
    if os.path.exists(path):
        with open(path, "r") as f:
            data = json.load(f)
    else:
        data = {}

    data[model_name] = values

    with open(path, "w") as f:
        json.dump(data, f, indent=4)


In [95]:
train_df = pd.read_csv("data/processed/train_processed.csv")
val_df   = pd.read_csv("data/processed/val_processed.csv")

TAB_COLS = [
    c for c in train_df.columns
    if c not in ["id", "log_price", "lat", "long"]
]

y_train = train_df["log_price"].values
y_val   = val_df["log_price"].values


In [96]:
xgb = XGBRegressor(
    n_estimators=400,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

xgb.fit(train_df[TAB_COLS], y_train)

val_preds = xgb.predict(val_df[TAB_COLS])
metrics_xgb = evaluate(y_val, val_preds)

update_metrics(
    "xgboost",
    {
        "rmse": metrics_xgb["rmse"],
        "r2": metrics_xgb["r2"]
    }
)

print("XGBoost:", metrics_xgb)

joblib.dump(xgb, os.path.join(OUTPUT_DIR, "xgboost_model.pkl"))


XGBoost: {'rmse': 0.27515337403342255, 'r2': 0.7222425901862053}


['outputs\\xgboost_model.pkl']

In [97]:
BATCH_SIZE = 32
EPOCHS = 8
LR = 1e-3

In [98]:
train_ds = PropertyDataset(
    "data/processed/train_processed.csv",
    "data/images",
    split="train",
    mode="fusion",
    zooms=("zoom16",)
)

val_ds = PropertyDataset(
    "data/processed/val_processed.csv",
    "data/images",
    split="train",
    mode="fusion",
    zooms=("zoom16",)
)

train_loader = DataLoader(train_ds, BATCH_SIZE, shuffle=True, num_workers=4)
val_loader   = DataLoader(val_ds, BATCH_SIZE, shuffle=False, num_workers=4)


In [99]:
model_naive = LateFusionModel(len(TAB_COLS)).to(DEVICE)
optimizer = torch.optim.Adam(model_naive.parameters(), lr=LR)
criterion = nn.MSELoss()

for epoch in range(1, EPOCHS + 1):
    model_naive.train()
    pbar = tqdm(train_loader, desc=f"Naive Fusion Epoch {epoch}/{EPOCHS}")
    for img, tab, y in pbar:
        img, tab, y = img.to(DEVICE), tab.to(DEVICE), y.to(DEVICE)
        optimizer.zero_grad()
        preds = model_naive(img, tab)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()
        pbar.set_postfix(loss=loss.item())


Naive Fusion Epoch 1/8:   0%|          | 0/403 [00:00<?, ?it/s]

Naive Fusion Epoch 2/8:   0%|          | 0/403 [00:00<?, ?it/s]

Naive Fusion Epoch 3/8:   0%|          | 0/403 [00:00<?, ?it/s]

Naive Fusion Epoch 4/8:   0%|          | 0/403 [00:00<?, ?it/s]

Naive Fusion Epoch 5/8:   0%|          | 0/403 [00:00<?, ?it/s]

Naive Fusion Epoch 6/8:   0%|          | 0/403 [00:00<?, ?it/s]

Naive Fusion Epoch 7/8:   0%|          | 0/403 [00:00<?, ?it/s]

Naive Fusion Epoch 8/8:   0%|          | 0/403 [00:00<?, ?it/s]

In [109]:
model_naive.eval()
preds = []

with torch.no_grad():
    for img, tab, _ in val_loader:
        preds.append(model_naive(img.to(DEVICE), tab.to(DEVICE)).cpu().numpy())

preds = np.concatenate(preds)
metrics_naive = evaluate(y_val, preds)

update_metrics(
    "naive_fusion",
    {
        "epochs": EPOCHS,
        "batch_size": BATCH_SIZE,
        "lr": LR,
        "rmse": metrics_naive["rmse"],
        "r2": metrics_naive["r2"]
    }
)

torch.save(
    model_naive.state_dict(),
    "outputs/naive_fusion.pth"
)

print("Saved: outputs/naive_fusion.pth")


Saved: outputs/naive_fusion.pth


In [101]:
EPOCHS = 10
LR = 1e-3


In [102]:
model_weighted = WeightedLateFusionModel(len(TAB_COLS)).to(DEVICE)
optimizer = torch.optim.Adam(model_weighted.parameters(), lr=LR)

for epoch in range(1, EPOCHS + 1):
    model_weighted.train()
    pbar = tqdm(train_loader, desc=f"Weighted Fusion Epoch {epoch}/{EPOCHS}")
    for img, tab, y in pbar:
        img, tab, y = img.to(DEVICE), tab.to(DEVICE), y.to(DEVICE)
        optimizer.zero_grad()
        preds = model_weighted(img, tab)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()
        pbar.set_postfix(loss=loss.item(), alpha=model_weighted.alpha.item())


Weighted Fusion Epoch 1/10:   0%|          | 0/403 [00:00<?, ?it/s]

Weighted Fusion Epoch 2/10:   0%|          | 0/403 [00:00<?, ?it/s]

Weighted Fusion Epoch 3/10:   0%|          | 0/403 [00:00<?, ?it/s]

Weighted Fusion Epoch 4/10:   0%|          | 0/403 [00:00<?, ?it/s]

Weighted Fusion Epoch 5/10:   0%|          | 0/403 [00:00<?, ?it/s]

Weighted Fusion Epoch 6/10:   0%|          | 0/403 [00:00<?, ?it/s]

Weighted Fusion Epoch 7/10:   0%|          | 0/403 [00:00<?, ?it/s]

Weighted Fusion Epoch 8/10:   0%|          | 0/403 [00:00<?, ?it/s]

Weighted Fusion Epoch 9/10:   0%|          | 0/403 [00:00<?, ?it/s]

Weighted Fusion Epoch 10/10:   0%|          | 0/403 [00:00<?, ?it/s]

In [None]:
model_weighted.eval()
preds = []

with torch.no_grad():
    for img, tab, _ in val_loader:
        preds.append(model_weighted(img.to(DEVICE), tab.to(DEVICE)).cpu().numpy())

preds = np.concatenate(preds)
metrics_weighted = evaluate(y_val, preds)

update_metrics(
    "weighted_fusion",
    {
        "epochs": EPOCHS,
        "batch_size": BATCH_SIZE,
        "lr": LR,
        "alpha": float(model_weighted.alpha.item()),
        "rmse": metrics_weighted["rmse"],
        "r2": metrics_weighted["r2"]
    }
)
torch.save(
    model_weighted.state_dict(),
    "outputs/intermediate_fusion.pth"
)

print("Saved: outputs/intermediate_fusion.pth")


Weighted Fusion: {'rmse': 0.2981878482161397, 'r2': 0.6737910665095028}


In [104]:
EPOCHS = 15
LR = 1e-4


In [105]:
train_df["xgb_pred"] = xgb.predict(train_df[TAB_COLS])
val_df["xgb_pred"]   = xgb.predict(val_df[TAB_COLS])

train_df.to_csv("data/processed/train_with_xgb.csv", index=False)
val_df.to_csv("data/processed/val_with_xgb.csv", index=False)


In [106]:
train_res_ds = PropertyDataset(
    "data/processed/train_with_xgb.csv",
    "data/images",
    split="train",
    mode="residual",
    zooms=("zoom16",),
    xgb_pred_col="xgb_pred"
)

val_res_ds = PropertyDataset(
    "data/processed/val_with_xgb.csv",
    "data/images",
    split="train",
    mode="residual",
    zooms=("zoom16",),
    xgb_pred_col="xgb_pred"
)

train_res_loader = DataLoader(train_res_ds, BATCH_SIZE, shuffle=True, num_workers=4)
val_res_loader   = DataLoader(val_res_ds, BATCH_SIZE, shuffle=False, num_workers=4)


In [107]:
model_residual = CNNResidualRegressor().to(DEVICE)
optimizer = torch.optim.Adam(model_residual.parameters(), lr=LR)

for epoch in range(1, EPOCHS + 1):
    model_residual.train()
    pbar = tqdm(train_res_loader, desc=f"Residual Fusion Epoch {epoch}/{EPOCHS}")
    for img, res in pbar:
        img, res = img.to(DEVICE), res.to(DEVICE)
        optimizer.zero_grad()
        preds = model_residual(img)
        loss = criterion(preds, res)
        loss.backward()
        optimizer.step()
        pbar.set_postfix(loss=loss.item())


Residual Fusion Epoch 1/15:   0%|          | 0/403 [00:00<?, ?it/s]

Residual Fusion Epoch 2/15:   0%|          | 0/403 [00:00<?, ?it/s]

Residual Fusion Epoch 3/15:   0%|          | 0/403 [00:00<?, ?it/s]

Residual Fusion Epoch 4/15:   0%|          | 0/403 [00:00<?, ?it/s]

Residual Fusion Epoch 5/15:   0%|          | 0/403 [00:00<?, ?it/s]

Residual Fusion Epoch 6/15:   0%|          | 0/403 [00:00<?, ?it/s]

Residual Fusion Epoch 7/15:   0%|          | 0/403 [00:00<?, ?it/s]

Residual Fusion Epoch 8/15:   0%|          | 0/403 [00:00<?, ?it/s]

Residual Fusion Epoch 9/15:   0%|          | 0/403 [00:00<?, ?it/s]

Residual Fusion Epoch 10/15:   0%|          | 0/403 [00:00<?, ?it/s]

Residual Fusion Epoch 11/15:   0%|          | 0/403 [00:00<?, ?it/s]

Residual Fusion Epoch 12/15:   0%|          | 0/403 [00:00<?, ?it/s]

Residual Fusion Epoch 13/15:   0%|          | 0/403 [00:00<?, ?it/s]

Residual Fusion Epoch 14/15:   0%|          | 0/403 [00:00<?, ?it/s]

Residual Fusion Epoch 15/15:   0%|          | 0/403 [00:00<?, ?it/s]

In [108]:
model_residual.eval()
cnn_residuals = []

with torch.no_grad():
    for img, _ in val_res_loader:
        cnn_residuals.append(model_residual(img.to(DEVICE)).cpu().numpy())

cnn_residuals = np.concatenate(cnn_residuals)
final_preds = val_df["xgb_pred"].values + cnn_residuals

metrics_residual = evaluate(y_val, final_preds)

update_metrics(
    "residual_fusion",
    {
        "epochs": EPOCHS,
        "batch_size": BATCH_SIZE,
        "lr": LR,
        "rmse": metrics_residual["rmse"],
        "r2": metrics_residual["r2"]
    }
)

print("Residual Fusion:", metrics_residual)

torch.save(
    model_residual.state_dict(),
    os.path.join(OUTPUT_DIR, "adaptive_fusion_final.pth")
)


Residual Fusion: {'rmse': 0.2571100193866544, 'r2': 0.7574764199837505}
