# Baseline Model

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor

In [2]:
dataset=f"cleaned_tabular1.csv"
df=pd.read_csv(dataset)

In [3]:
df

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,268643,0.677402,0.178963,-0.290276,-0.144952,0.922943,0,-0.306964,-0.626000,-0.557611,-0.345663,-0.207964,9,-0.900034,0.192759,-0.473911,-0.129791
1,245000,-0.394132,0.505667,-0.521813,-0.311135,0.922943,0,-0.306964,0.908842,-0.557611,0.709771,-0.207964,8,-1.137139,0.192759,-0.385919,-0.339019
2,200000,0.677402,0.505667,-0.389506,-0.160457,0.922943,0,-0.306964,-0.626000,0.296350,0.777864,-0.207964,5,-2.098571,-0.706669,-0.165941,-0.196068
3,352499,-1.465666,0.178963,-0.918734,-0.364787,0.922943,0,-0.306964,-0.626000,-0.557611,1.288558,-0.207964,47,-0.206791,1.006527,-1.089851,-0.445025
4,232000,-0.394132,-0.147741,-0.874632,-0.038936,-0.918626,0,-0.306964,-0.626000,-0.557611,0.777864,-0.207964,13,-1.367738,0.999388,-0.576568,-0.173196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16204,378000,-0.394132,-0.801150,-1.183348,-0.204862,-0.918626,0,-0.306964,-0.626000,-0.557611,-0.822311,-0.207964,33,1.111021,-0.749499,-1.441817,-0.214930
16205,399950,-0.394132,0.505667,1.117691,-0.254109,0.922943,0,-0.306964,-0.626000,0.296350,1.458789,-0.207964,6,-1.903393,-0.963648,1.384176,-0.280428
16206,575000,-0.394132,0.505667,0.051517,-0.259827,0.922943,0,-0.306964,-0.626000,-0.557611,1.118326,-0.207964,53,0.869579,1.299198,-0.429915,-0.374478
16207,245000,-2.537201,-1.781262,-1.866934,0.003408,-0.918626,0,-0.306964,-0.626000,-2.265534,-0.277571,-0.207964,1,-0.576183,-0.778052,-1.192508,0.084078


In [4]:
X=df.drop(columns=["price","zipcode"])
y=np.log1p(df["price"])
X.shape,y.shape

((16209, 15), (16209,))

In [5]:
X_train,X_val,y_train,y_val=train_test_split(X,y,test_size=0.2,random_state=42)

In [6]:
ridge=Ridge(alpha=1.0)
ridge.fit(X_train,y_train)

In [7]:
pred=ridge.predict(X_val)
rsme=np.sqrt(mean_squared_error(np.expm1(y_val),np.expm1(pred)))
r2=r2_score(np.expm1(y_val),np.expm1(pred))
print(rsme)
print(r2)

176424.08434776927
0.7519658759199653


In [8]:
xgb=XGBRegressor(
    n_estimators=500,max_depth=6,learning_rate=0.05,subsample=0.8,colsample_bytree=0.8,random_state=42,n_jobs=-1
)
xgb.fit(X_train,y_train)

In [9]:
pred=xgb.predict(X_val)
rsme=np.sqrt(mean_squared_error(np.expm1(y_val),np.expm1(pred)))
r2=r2_score(np.expm1(y_val),np.expm1(pred))
print(rsme,"by xgb")
print(r2,"by xgb")


116132.62314532339 by xgb
0.8925258146831104 by xgb


we can see a maximum R2 score with XGB Regressor

# Multimodal

In [10]:
from operator import index
importances=pd.Series(xgb.feature_importances_,index=X.columns).sort_values()
importances

bedrooms         0.003797
floors           0.005430
sqft_lot15       0.009249
yr_renovated     0.009700
sqft_lot         0.010469
bathrooms        0.011841
condition        0.014696
yr_built         0.017326
long             0.022535
sqft_living15    0.027979
view             0.048265
waterfront       0.051956
sqft_living      0.107535
lat              0.144978
grade            0.514243
dtype: float32

Now lets us build a multi-model by useing tabular data and images

In [11]:
# image embeddings extraction
import os
from PIL import Image
from tqdm import tqdm
import torch
import torch.nn as nn
from torchvision import models,transforms

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
image_transform=transforms.Compose([
    transforms.Resize((224,224)),transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.485,0.406],std=[0.229,0.224,0.225])
])

In [13]:
DEVICE="cuda" if torch.cuda.is_available() else "cpu"
DEVICE

'cpu'

In [14]:
resnet=models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
resnet.fc=nn.Identity()
for param in resnet.parameters():
  param.requires_grad=False
resnet=resnet.to(DEVICE)
resnet.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [15]:
IMAGE_DIR = "data/arcgis_images"
image_files=sorted(os.listdir(IMAGE_DIR))
len(image_files)

16209

In [16]:
TEST_DIR="data/test_arcgis_images/"
test_images=sorted(os.listdir(TEST_DIR))
len(test_images)

5404

In [17]:
import torch
from torch.utils.data import Dataset,DataLoader
from PIL import Image

class ImageDataset(Dataset):
    def __init__(self,image_dir,transform):
        self.image_dir=image_dir
        self.files=sorted(os.listdir(image_dir))
        self.transform=transform

    def __len__(self):
        return len(self.files)

    def __getitem__(self,idx):
        img_path=os.path.join(self.image_dir,self.files[idx])
        img=Image.open(img_path).convert("RGB")
        return self.transform(img),self.files[idx]


In [18]:
# converting test images into embeddings............................................
from tqdm import tqdm
IMAGE_DIR="data/arcgis_images"
dataset=ImageDataset(IMAGE_DIR, image_transform)
loader=DataLoader(dataset, batch_size=32, shuffle=False)
EMBED_DIR=f"data/image_embeddings"
with torch.no_grad():
    for imgs,names in tqdm(loader):
        imgs=imgs.to(DEVICE)
        feats=resnet(imgs).squeeze(-1).squeeze(-1)
        feats=feats.cpu().numpy()

        for f,n in zip(feats, names):
            np.save(
                os.path.join(EMBED_DIR, n.replace(".jpg", ".npy")),
                f
            )


  0%|          | 0/507 [00:00<?, ?it/s]

100%|██████████| 507/507 [06:43<00:00,  1.26it/s]


In [19]:
# converting test images into embeddings............................................
from tqdm import tqdm
TEST_DIR="data/test_arcgis_images/"
dataset=ImageDataset(TEST_DIR, image_transform)
loader=DataLoader(dataset, batch_size=32, shuffle=False)
EMBED_DIR=f"data/test_image_embeddings"
with torch.no_grad():
    for imgs,names in tqdm(loader):
        imgs=imgs.to(DEVICE)
        feats=resnet(imgs).squeeze(-1).squeeze(-1)
        feats=feats.cpu().numpy()

        for f,n in zip(feats, names):
            np.save(
                os.path.join(EMBED_DIR, n.replace(".jpg", ".npy")),
                f
            )


100%|██████████| 169/169 [02:34<00:00,  1.09it/s]


In [20]:
# training data
BASE_PATH="data"
X_tab=pd.read_csv(f"cleaned_tabular1.csv")
embeddings=[]
EMB_DIR = f"{BASE_PATH}/image_embeddings"
for i in range(len(X_tab)):
    emb_path = os.path.join(EMB_DIR, f"{i}.npy")

    if not os.path.exists(emb_path):
        raise FileNotFoundError(f"Missing embedding:{emb_path}")

    emb=np.load(emb_path)
    embeddings.append(emb)

X_img=np.vstack(embeddings)
print(X_img.shape)


(16209, 512)


In [21]:
# testing data
BASE_PATH = "data"
test_X_tab = pd.read_csv(f"test_cleaned_tabular.csv")
embeddings = []
EMB_DIR = f"{BASE_PATH}/test_image_embeddings"
for i in range(len(test_X_tab)):
    emb_path = os.path.join(EMB_DIR,f"{i}.npy")

    if not os.path.exists(emb_path):
        raise FileNotFoundError(f"Missing embedding:{emb_path}")

    emb=np.load(emb_path)
    embeddings.append(emb)

X_test_img=np.vstack(embeddings)
print(X_test_img.shape)


(5404, 512)


In [22]:
print("Tabular rows:", test_X_tab.shape[0]) 
print("Image rows:", X_test_img.shape[0])


Tabular rows: 5404
Image rows: 5404


In [23]:
from sklearn.decomposition import PCA

pca = PCA(
    n_components=64,
    random_state=42,
    svd_solver="randomized"
)

X_test_img_pca = pca.fit_transform(X_test_img)

X_test_img_pca.shape


(5404, 64)

In [24]:
from sklearn.decomposition import PCA

pca = PCA(
    n_components=64,
    random_state=42,
    svd_solver="randomized"
)

X_img_pca = pca.fit_transform(X_img)

X_img_pca.shape


(16209, 64)

In [25]:
X_tab

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,268643,0.677402,0.178963,-0.290276,-0.144952,0.922943,0,-0.306964,-0.626000,-0.557611,-0.345663,-0.207964,9,-0.900034,0.192759,-0.473911,-0.129791
1,245000,-0.394132,0.505667,-0.521813,-0.311135,0.922943,0,-0.306964,0.908842,-0.557611,0.709771,-0.207964,8,-1.137139,0.192759,-0.385919,-0.339019
2,200000,0.677402,0.505667,-0.389506,-0.160457,0.922943,0,-0.306964,-0.626000,0.296350,0.777864,-0.207964,5,-2.098571,-0.706669,-0.165941,-0.196068
3,352499,-1.465666,0.178963,-0.918734,-0.364787,0.922943,0,-0.306964,-0.626000,-0.557611,1.288558,-0.207964,47,-0.206791,1.006527,-1.089851,-0.445025
4,232000,-0.394132,-0.147741,-0.874632,-0.038936,-0.918626,0,-0.306964,-0.626000,-0.557611,0.777864,-0.207964,13,-1.367738,0.999388,-0.576568,-0.173196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16204,378000,-0.394132,-0.801150,-1.183348,-0.204862,-0.918626,0,-0.306964,-0.626000,-0.557611,-0.822311,-0.207964,33,1.111021,-0.749499,-1.441817,-0.214930
16205,399950,-0.394132,0.505667,1.117691,-0.254109,0.922943,0,-0.306964,-0.626000,0.296350,1.458789,-0.207964,6,-1.903393,-0.963648,1.384176,-0.280428
16206,575000,-0.394132,0.505667,0.051517,-0.259827,0.922943,0,-0.306964,-0.626000,-0.557611,1.118326,-0.207964,53,0.869579,1.299198,-0.429915,-0.374478
16207,245000,-2.537201,-1.781262,-1.866934,0.003408,-0.918626,0,-0.306964,-0.626000,-2.265534,-0.277571,-0.207964,1,-0.576183,-0.778052,-1.192508,0.084078


In [26]:
test_X_tab=test_X_tab.drop(columns=["zipcode"])
test_X_tab

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15
0,0.673788,0.165837,-0.031285,-0.143311,0.978736,0,-0.302237,0.903711,0.278855,0.525606,-0.216513,-0.856881,0.360644,0.565303,-0.182019
1,1.760375,1.121597,0.840906,-0.188031,-0.906185,0,-0.302237,2.446241,0.278855,0.219183,-0.216513,0.864195,-0.499439,0.536552,-0.231935
2,0.673788,0.484424,1.755129,-0.101962,0.978736,0,2.275960,-0.638820,2.803028,0.900124,-0.216513,0.046756,0.591398,2.462828,-0.112094
3,-0.412798,1.758770,2.585286,-0.025155,0.978736,0,2.275960,-0.638820,3.644420,0.661795,-0.216513,1.015759,-0.100864,2.951585,0.047870
4,-0.412798,0.484424,0.473114,-0.216024,0.978736,0,-0.302237,-0.638820,1.120246,1.138454,-0.216513,0.765786,0.913054,0.364050,-0.310596
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5399,0.673788,0.484424,0.756839,-0.017051,0.978736,0,-0.302237,0.903711,1.120246,0.491559,-0.216513,1.202520,0.787189,0.637179,0.075135
5400,-0.412798,-0.152749,-0.672292,-0.197955,0.036275,0,-0.302237,0.903711,-0.562536,-1.993877,-0.216513,0.798829,-0.835081,-0.440961,-0.285937
5401,-0.412798,0.484424,-0.031285,-0.122492,-0.906185,0,-0.302237,-0.638820,0.278855,0.287277,-0.216513,-1.862518,-1.100797,0.105296,-0.162644
5402,0.673788,0.165837,2.795453,-0.161030,1.921197,0,-0.302237,2.446241,1.120246,-1.857689,-0.216513,0.599138,-0.499439,1.370314,-0.124846


In [27]:
y= np.log1p(X_tab["price"])
X_tab=X_tab.drop(columns=["price","zipcode"])

In [28]:
X_tab.shape,y.shape

((16209, 15), (16209,))

In [29]:
assert X_tab.shape[0] == X_img_pca.shape[0], "Row mismatch!"


In [30]:
import numpy as np

X_fused = np.hstack([
    X_tab.values,
    X_img
])

X_fused.shape


(16209, 527)

In [31]:
import numpy as np

X_test_fused = np.hstack([
    test_X_tab.values,
    X_test_img
])

X_test_fused.shape

(5404, 527)

In [32]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X_fused,
    y,
    test_size=0.2,
    random_state=42
)


In [33]:
from xgboost import XGBRegressor

mm_model = XGBRegressor(
    n_estimators=700,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

mm_model.fit(X_train, y_train)


In [34]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

preds = mm_model.predict(X_val)

rmse = np.sqrt(mean_squared_error(
    np.expm1(y_val),
    np.expm1(preds)
))

r2 = r2_score(
    np.expm1(y_val),
    np.expm1(preds)
)

print("===== MULTIMODAL RESULTS =====")
print(f"RMSE: {rmse:,.0f}")
print(f"R²  : {r2:.4f}")


===== MULTIMODAL RESULTS =====
RMSE: 123,918
R²  : 0.8776


In [35]:
preds = mm_model.predict(X_test_fused)
preds=np.expm1(preds)
test_df=pd.read_excel("data/test2.xlsx")
submission=pd.DataFrame({

    "id":test_df["id"],
    "price":preds
})
submission

Unnamed: 0,id,price
0,2591820310,3.333752e+05
1,7974200820,7.740588e+05
2,7701450110,7.587079e+05
3,9522300010,1.540316e+06
4,9510861140,5.936961e+05
...,...,...
5399,7732500270,5.157613e+05
5400,3856903515,5.300386e+05
5401,2557000400,2.458006e+05
5402,4386700135,1.222283e+06


In [36]:
submission

Unnamed: 0,id,price
0,2591820310,3.333752e+05
1,7974200820,7.740588e+05
2,7701450110,7.587079e+05
3,9522300010,1.540316e+06
4,9510861140,5.936961e+05
...,...,...
5399,7732500270,5.157613e+05
5400,3856903515,5.300386e+05
5401,2557000400,2.458006e+05
5402,4386700135,1.222283e+06


In [37]:
submission.to_csv("24115136_final.csv",index=False)