In [2]:
import os
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import torchvision.transforms as T
import torchvision.models as models
from PIL import Image

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)


Device: cpu


In [6]:
train = pd.read_csv("data/train_processed.csv")
test  = pd.read_csv("data/test_processed.csv")

FEATURES = ['bedrooms', 'bathrooms', 'sqft_living', 'lat', 'long']



# VALIDATION METRICS

In [7]:
train_df, val_df = train_test_split(
    train,
    test_size=0.2,
    random_state=42
)

transform = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor()
])


# Multimodal Dataset

In [None]:
class PropertyDataset(Dataset):
    def __init__(self, df, img_dir, is_train=True):
        self.df = df.reset_index(drop=True)
        self.img_dir = img_dir
        self.is_train = is_train

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        
        img_path = os.path.join(self.img_dir, f"{row.id}.png")
        img = Image.open(img_path).convert("RGB")
        img = transform(img)

        
        tab = torch.from_numpy(
            row[FEATURES].to_numpy(dtype=np.float32)
        )

        if self.is_train:
            y = torch.tensor(row.price_log, dtype=torch.float32)
            return img, tab, y
        else:
            return img, tab


In [9]:
train_ds = PropertyDataset(train_df, "images/train", is_train=True)
val_ds   = PropertyDataset(val_df,   "images/train", is_train=True)
test_ds  = PropertyDataset(test,     "images/test",  is_train=False)

train_dl = DataLoader(train_ds, batch_size=8, shuffle=True, num_workers=0)
val_dl   = DataLoader(val_ds,   batch_size=8, shuffle=False, num_workers=0)
test_dl  = DataLoader(test_ds,  batch_size=8, shuffle=False, num_workers=0)


# Multimodal Model

In [10]:
class ImageEncoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.cnn = models.resnet18(weights="IMAGENET1K_V1")
        self.cnn.fc = nn.Identity()

    def forward(self, x):
        return self.cnn(x)


class TabularEncoder(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU()
        )

    def forward(self, x):
        return self.net(x)


class MultimodalModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.img_enc = ImageEncoder()
        self.tab_enc = TabularEncoder(len(FEATURES))

        self.regressor = nn.Sequential(
            nn.Linear(512 + 64, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1)
        )

    def forward(self, img, tab):
        img_feat = F.normalize(self.img_enc(img), dim=1)
        tab_feat = self.tab_enc(tab)
        x = torch.cat([img_feat, tab_feat], dim=1)
        return self.regressor(x).squeeze(1)


In [11]:
model = MultimodalModel().to(device)

# Freeze CNN backbone
for p in model.img_enc.cnn.parameters():
    p.requires_grad = False

# Unfreeze last block
for p in model.img_enc.cnn.layer4.parameters():
    p.requires_grad = True

criterion = nn.MSELoss()

optimizer = optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=1e-4
)



# Training Loop

In [12]:
EPOCHS = 5

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for img, tab, y in train_dl:
        img, tab, y = img.to(device), tab.to(device), y.to(device)

        optimizer.zero_grad()
        preds = model(img, tab)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1} | Loss: {total_loss / len(train_dl):.4f}")



Epoch 1 | Loss: 24.1782
Epoch 2 | Loss: 1.5469
Epoch 3 | Loss: 1.2807
Epoch 4 | Loss: 1.2124
Epoch 5 | Loss: 1.2027


In [13]:
model.eval()
y_true, y_pred = [], []

with torch.no_grad():
    for img, tab, y in val_dl:
        img, tab = img.to(device), tab.to(device)
        preds = model(img, tab)

        y_true.extend(y.numpy())
        y_pred.extend(preds.cpu().numpy())

rmse = np.sqrt(mean_squared_error(y_true, y_pred))
r2   = r2_score(y_true, y_pred)

print("Validation RMSE:", rmse)
print("Validation R²:", r2)



Validation RMSE: 0.3281394783745918
Validation R²: 0.6098058663029915


# PREDICTION ON TEST SET

In [14]:
model.eval()
preds = []

with torch.no_grad():
    for img, tab in test_dl:
        img, tab = img.to(device), tab.to(device)
        out = model(img, tab)
        preds.extend(out.cpu().numpy())

test['predicted_price'] = np.expm1(preds)

os.makedirs("outputs", exist_ok=True)
test[['id', 'predicted_price']].to_csv(
    "outputs/23124005_final.csv",
    index=False
)

print("Saved outputs/23124005_final.csv")


Saved outputs/23124005_final.csv


# Baseline Model 

In [None]:

y_true = np.expm1(val_df["price_log"].values)

mean_price = y_true.mean()
y_pred_baseline = np.full_like(y_true, mean_price)

baseline_rmse = np.sqrt(mean_squared_error(y_true, y_pred_baseline))
baseline_r2 = r2_score(y_true, y_pred_baseline)

print("Baseline RMSE:", baseline_rmse)
print("Baseline R²:", baseline_r2)


Baseline RMSE: 354243.71596497693
Baseline R²: 0.0


In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


# Tabular Model

In [19]:
X_train_tab = train_df[FEATURES].values
y_train_tab = np.expm1(train_df["price_log"].values)

X_val_tab = val_df[FEATURES].values
y_val_tab = np.expm1(val_df["price_log"].values)


In [20]:
tab_model = LinearRegression()
tab_model.fit(X_train_tab, y_train_tab)


0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


In [21]:
y_val_pred_tab = tab_model.predict(X_val_tab)

tabular_rmse = np.sqrt(mean_squared_error(y_val_tab, y_val_pred_tab))
tabular_r2 = r2_score(y_val_tab, y_val_pred_tab)

print("Tabular RMSE:", tabular_rmse)
print("Tabular R²:", tabular_r2)


Tabular RMSE: 231212.9942529547
Tabular R²: 0.5739896347299879


# High-Performance Tabular Model (XGBoost)

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

X_train = train[FEATURES]
y_train = train['price_log']

X_val = val_df[FEATURES]
y_val = val_df['price_log']


xgb_tabular = XGBRegressor(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:squarederror',
    random_state=42
)

xgb_tabular.fit(X_train, y_train)

y_pred_log = xgb_tabular.predict(X_val)

y_pred = np.expm1(y_pred_log)
y_true = np.expm1(y_val)

xgb_tabular_rmse = np.sqrt(mean_squared_error(y_true, y_pred))
xgb_tabular_r2   = r2_score(y_true, y_pred)

print("Improved Tabular (XGBoost) RMSE:", xgb_tabular_rmse)
print("Improved Tabular (XGBoost) R²:", xgb_tabular_r2)


Improved Tabular (XGBoost) RMSE: 112530.581871931
Improved Tabular (XGBoost) R²: 0.8990893928453424


# Multimodal Metrics

In [None]:

model.eval()
y_true_mm, y_pred_mm = [], []

with torch.no_grad():
    for img, tab, y in val_dl:
        img, tab = img.to(device), tab.to(device)
        preds = model(img, tab)

        y_true_mm.extend(np.expm1(y.numpy()))
        y_pred_mm.extend(np.expm1(preds.cpu().numpy()))

mm_rmse = np.sqrt(mean_squared_error(y_true_mm, y_pred_mm))
mm_r2 = r2_score(y_true_mm, y_pred_mm)

print("Multimodal RMSE:", mm_rmse)
print("Multimodal R²:", mm_r2)


Multimodal RMSE: 243248.8504979536
Multimodal R²: 0.5284831246689105


# Realistic Multimodal Strategy 

Use CNN only as a feature extractor → feed into XGBoost

In [24]:
with torch.no_grad():
    img_embeddings = []
    tab_features = []
    targets = []

    for img, tab, y in train_dl:
        emb = model.img_enc(img.to(device))
        img_embeddings.append(emb.cpu().numpy())
        tab_features.append(tab.cpu().numpy())
        targets.append(y.cpu().numpy())

X_img = np.vstack(img_embeddings)
X_tab = np.vstack(tab_features)
y = np.hstack(targets)


In [28]:
img_feats = []
tab_feats = []
targets = []

with torch.no_grad():
    for img, tab, y in train_dl:
        img = img.to(device)

        emb = model.img_enc(img)     # (batch, 512)
        img_feats.append(emb.cpu().numpy())
        tab_feats.append(tab.cpu().numpy())
        targets.append(y.cpu().numpy())

X_img = np.vstack(img_feats)
X_tab = np.vstack(tab_feats)
y_train = np.hstack(targets)


In [29]:
img_feats_val = []
tab_feats_val = []
targets_val = []

with torch.no_grad():
    for img, tab, y in val_dl:
        img = img.to(device)

        emb = model.img_enc(img)
        img_feats_val.append(emb.cpu().numpy())
        tab_feats_val.append(tab.cpu().numpy())
        targets_val.append(y.cpu().numpy())

X_img_val = np.vstack(img_feats_val)
X_tab_val = np.vstack(tab_feats_val)
y_val = np.hstack(targets_val)


In [30]:
X_train_combined = np.hstack([X_tab, X_img])
X_val_combined   = np.hstack([X_tab_val, X_img_val])


In [31]:
xgb_multi = XGBRegressor(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42
)

xgb_multi.fit(X_train_combined, y_train)


0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'reg:squarederror'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,0.8
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,False


In [None]:

y_pred_log = xgb_multi.predict(X_val_combined)


y_pred = np.expm1(y_pred_log)
y_true = np.expm1(y_val)

xgb_mm_rmse = np.sqrt(mean_squared_error(y_true, y_pred))
xgb_mm_r2   = r2_score(y_true, y_pred)

print("Improved Multimodal RMSE:", xgb_mm_rmse)
print("Improved Multimodal R²:", xgb_mm_r2)


Improved Multimodal RMSE: 160822.6212943938
Improved Multimodal R²: 0.793894350528717


# Metrics Comparison

In [42]:
import pandas as pd

results = pd.DataFrame({
    "Model": [
        "Mean Baseline",
        "Tabular (Neural MLP)",
        "Improved Tabular (XGBoost)",
        "Multimodal (CNN + MLP)",
        "Improved Multimodal (CNN Embeddings + XGBoost)"
    ],
    "RMSE": [
        baseline_rmse,
        tabular_rmse,
        xgb_tabular_rmse,
        mm_rmse,
        xgb_mm_rmse
    ],
    "R²": [
        baseline_r2,
        tabular_r2,
        xgb_tabular_r2,
        mm_r2,
        xgb_mm_r2
    ]
})

results


Unnamed: 0,Model,RMSE,R²
0,Mean Baseline,354243.715965,0.0
1,Tabular (Neural MLP),231212.994253,0.57399
2,Improved Tabular (XGBoost),112530.581872,0.899089
3,Multimodal (CNN + MLP),243248.850498,0.528483
4,Improved Multimodal (CNN Embeddings + XGBoost),160822.621294,0.793894


In [43]:
torch.save(model.state_dict(), "outputs/multimodal_model.pth")
