In [3]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Baseline Model

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor

In [5]:
PROJECT_DIR = "/content/drive/MyDrive/price_predictor"
dataset=f"{PROJECT_DIR}/data/cleaned_tabular.csv"
df=pd.read_csv(dataset)

In [6]:
df

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,12.501142,0.677402,0.178963,-0.290276,-0.144952,0.922943,0,-0.306964,-0.626000,-0.557611,-0.345663,-0.207964,12.581985,-0.900034,0.192759,-0.473911,-0.129791
1,12.409018,-0.394132,0.505667,-0.521813,-0.311135,0.922943,0,-0.306964,0.908842,-0.557611,0.709771,-0.207964,12.591252,-1.137139,0.192759,-0.385919,-0.339019
2,12.206078,0.677402,0.505667,-0.389506,-0.160457,0.922943,0,-0.306964,-0.626000,0.296350,0.777864,-0.207964,12.531108,-2.098571,-0.706669,-0.165941,-0.196068
3,12.772806,-1.465666,0.178963,-0.918734,-0.364787,0.922943,0,-0.306964,-0.626000,-0.557611,1.288558,-0.207964,13.257362,-0.206791,1.006527,-1.089851,-0.445025
4,12.354497,-0.394132,-0.147741,-0.874632,-0.038936,-0.918626,0,-0.306964,-0.626000,-0.557611,0.777864,-0.207964,12.605452,-1.367738,0.999388,-0.576568,-0.173196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16204,12.842652,-0.394132,-0.801150,-1.183348,-0.204862,-0.918626,0,-0.306964,-0.626000,-0.557611,-0.822311,-0.207964,12.987940,1.111021,-0.749499,-1.441817,-0.214930
16205,12.899097,-0.394132,0.505667,1.117691,-0.254109,0.922943,0,-0.306964,-0.626000,0.296350,1.458789,-0.207964,12.523093,-1.903393,-0.963648,1.384176,-0.280428
16206,13.262127,-0.394132,0.505667,0.051517,-0.259827,0.922943,0,-0.306964,-0.626000,-0.557611,1.118326,-0.207964,13.330295,0.869579,1.299198,-0.429915,-0.374478
16207,12.409018,-2.537201,-1.781262,-1.866934,0.003408,-0.918626,0,-0.306964,-0.626000,-2.265534,-0.277571,-0.207964,12.356950,-0.576183,-0.778052,-1.192508,0.084078


In [7]:
X=df.drop(columns=["price"])
y=df["price"]
X.shape,y.shape

((16209, 16), (16209,))

In [8]:
X_train,X_val,y_train,y_val=train_test_split(X,y,test_size=0.2,random_state=42)

In [9]:
ridge=Ridge(alpha=1.0)
ridge.fit(X_train,y_train)

In [10]:
pred=ridge.predict(X_val)
rsme=np.sqrt(mean_squared_error(np.expm1(y_val),np.expm1(pred)))
r2=r2_score(np.expm1(y_val),np.expm1(pred))
print(rsme)
print(r2)

140547.19015190305
0.8425872068208815


In [11]:
xgb=XGBRegressor(
    n_estimators=500,max_depth=6,learning_rate=0.05,subsample=0.8,colsample_bytree=0.8,random_state=42,n_jobs=-1
)
xgb.fit(X_train,y_train)

In [12]:
pred=xgb.predict(X_val)
rsme=np.sqrt(mean_squared_error(np.expm1(y_val),np.expm1(pred)))
r2=r2_score(np.expm1(y_val),np.expm1(pred))
print(rsme,"by xgb")
print(r2,"by xgb")

111091.64144913829 by xgb
0.9016536021054534 by xgb


we can see a maximum R2 score with XGB Regressor

# Multimodal

In [13]:
from operator import index
importances=pd.Series(xgb.feature_importances_,index=X.columns).sort_values()
importances

Unnamed: 0,0
bedrooms,0.004132
floors,0.005242
sqft_lot15,0.007642
yr_renovated,0.008605
sqft_lot,0.009059
yr_built,0.011418
condition,0.012119
bathrooms,0.016094
sqft_living15,0.01846
long,0.020267


Now lets us build a multi-model by useing tabular data and images

In [14]:
# image embeddings extraction
import os
from PIL import Image
from tqdm import tqdm
import torch
import torch.nn as nn
from torchvision import models,transforms

In [15]:
image_transform=transforms.Compose([
    transforms.Resize((224,224)),transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.485,0.406],std=[0.229,0.224,0.225])
])

In [16]:
DEVICE="cuda" if torch.cuda.is_available() else "cpu"
DEVICE

'cpu'

In [17]:
resnet=models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
resnet.fc=nn.Identity()
for param in resnet.parameters():
  param.requires_grad=False
resnet=resnet.to(DEVICE)
resnet.eval()

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:00<00:00, 148MB/s]


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [None]:
PROJECT_DIR = "/content/drive/MyDrive/price_predictor"
IMAGE_DIR = f"{PROJECT_DIR}/data/arcgis_images"
image_files=sorted(os.listdir(IMAGE_DIR))
len(image_files)

16209

In [22]:
BASE_PATH = "/content/drive/MyDrive/price_predictor/data"
X_tab = pd.read_csv(f"{BASE_PATH}/cleaned_tabular.csv")
embeddings = []
EMB_DIR = f"{BASE_PATH}/image_embeddings"
for i in range(len(X_tab)):
    emb_path = os.path.join(EMB_DIR, f"{i}.npy")

    if not os.path.exists(emb_path):
        raise FileNotFoundError(f"Missing embedding: {emb_path}")

    emb = np.load(emb_path)
    embeddings.append(emb)

X_img = np.vstack(embeddings)
print(X_img.shape)


(16209, 512)


In [23]:
print("Tabular rows:", X_tab.shape[0])
print("Image rows:", X_img.shape[0])


Tabular rows: 16209
Image rows: 16209


In [46]:
from sklearn.decomposition import PCA

pca = PCA(
    n_components=64,
    random_state=42,
    svd_solver="randomized"
)

X_img_pca = pca.fit_transform(X_img)

X_img_pca.shape


(16209, 64)

In [None]:
y=X_tab["price"]
X_tab=X_tab.drop(columns=["price"])

In [47]:
X_tab.shape,y.shape

((16209, 16), (16209,))

In [48]:
assert X_tab.shape[0] == X_img_pca.shape[0], "Row mismatch!"


In [49]:
import numpy as np

X_fused = np.hstack([
    X_tab.values,
    X_img_pca
])

X_fused.shape


(16209, 80)

In [50]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X_fused,
    y,
    test_size=0.2,
    random_state=42
)


In [51]:
from xgboost import XGBRegressor

mm_model = XGBRegressor(
    n_estimators=700,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

mm_model.fit(X_train, y_train)


In [52]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

preds = mm_model.predict(X_val)

rmse = np.sqrt(mean_squared_error(
    np.expm1(y_val),
    np.expm1(preds)
))

r2 = r2_score(
    np.expm1(y_val),
    np.expm1(preds)
)

print("===== MULTIMODAL RESULTS =====")
print(f"RMSE: {rmse:,.0f}")
print(f"R²  : {r2:.4f}")


===== MULTIMODAL RESULTS =====
RMSE: 116,450
R²  : 0.8919
