In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import models
from torchvision.models import Inception_V3_Weights
from torchvision import transforms
from tqdm import tqdm
import os
from PIL import Image
import matplotlib.pyplot as plt

In [None]:
# Device selection: use the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
torch.cuda.empty_cache()  # release unused cached GPU memory
device

In [None]:
# from google.colab import drive
# drive.mount("/content/drive")
# useful if all data is on a drive or working on google colab

In [None]:
# Seed NumPy and PyTorch (CPU generator plus every CUDA device) with the same
# value so that splits, weight initialisation and shuffling are repeatable.
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

In [None]:
# Location of the preprocessed training table on the original author's machine.
TRAIN_CSV = 'C:\\Users\\Aashit\\OneDrive\\Desktop\\Coding_stuff\\PROJECTS\\real-estate-multimodal\\train_preprocessed.csv'
if not os.path.exists(TRAIN_CSV):
    # Portability fallback: the final-training section of this notebook reads the
    # same file name relative to the working directory, so use that location when
    # the absolute path does not exist (presumably the same file -- confirm).
    TRAIN_CSV = 'train_preprocessed.csv'
df = pd.read_csv(TRAIN_CSV)
df.head()

In [None]:
# List the available columns before deciding which ones to drop or model.
df.columns

## FIRST MODEL (Based on tabular only data)
- comparing various ML algorithms performance on the dataset to see which fits best and gives more accurate results.
- The target variable is log-transformed house price (price_log) to reduce skewness and improve regression stability. Continuous numerical features are standardized, while ordinal and binary variables are left unscaled to preserve their semantic meaning.
- Multiple regression algorithms (Linear, Ridge, Lasso, Random Forest, and XGBoost) are evaluated using R² score and RMSE on a validation set. The best-performing tabular model serves both as a performance baseline and as a fallback model for cases where satellite imagery is unavailable during inference.

In [None]:
# Columns kept out of the feature matrix (considered redundant / not useful for
# now). Raw "price" is listed here; the regression target is "price_log".
DROP_COLS = ["id", "date", "price", "lat", "long", "yr_built", "yr_renovated"]
target = "price_log"

# errors="ignore" skips any listed column that is absent from df, so only the
# columns that are actually present get dropped.
df_model = df.drop(columns=DROP_COLS, errors="ignore")

In [None]:
# Features are every remaining column except the target; 15% of the rows are
# held out for validation with a fixed random_state.
y = df_model[target]
X = df_model.drop(columns=target)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

In [None]:
# Standardise every feature column, ordinal and binary ones included.
# (An earlier variant scaled only the continuous columns: sqft_living, sqft_lot,
#  sqft_basement, sqft_living15, sqft_lot15 and house_age.)
# The scaling statistics are learned from the training split only and then
# applied to both splits. Both outputs are NumPy arrays, not DataFrames.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

### Trying out linear models first
- Regularization parameters for linear models were initially set to standard values for baseline comparison and later tuned using cross-validation when necessary.

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso

# NOTE: this dict must not be called `models`. That name is bound to
# `torchvision.models` by the import cell and is used later by ImageEncoder
# (`models.inception_v3`); rebinding it to a dict here breaks that class on a
# top-to-bottom run.
linear_models = {
    "Linear": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.001)
}

# Fit each linear baseline on the scaled training split and score it on the
# validation split (R2 and RMSE, both in log-price units).
results = []
for name, model in linear_models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_val)

    r2 = r2_score(y_val, preds)
    rmse = np.sqrt(mean_squared_error(y_val, preds))

    results.append({"Model": name, "R2": r2, "RMSE": rmse})

# Best model (highest R2) first
results_df = pd.DataFrame(results).sort_values("R2", ascending=False)
results_df

- The strong performance of the linear models (R² ≈ 0.82) indicates that the preprocessing and feature engineering steps were effective. Log-transforming the target reduced skewness, while engineered features such as house age, renovation status, and neighborhood-level variables improved signal quality. Proper handling of continuous versus ordinal features further contributed to a well-conditioned feature space, resulting in stable and consistent linear model performance.

### Next Tree-Based Models

After establishing strong linear baselines, we now evaluate tree-based models that can capture non-linear interactions between housing features. These models help assess whether additional structural complexity improves performance over linear assumptions.

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest baseline; the depth / split / leaf limits constrain each tree.
rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=20,
    min_samples_split=10,
    min_samples_leaf=5,
    max_features="sqrt",
    random_state=42,
    n_jobs=-1,
)
rf.fit(X_train, y_train)

# Validation-split metrics
y_pred = rf.predict(X_val)
r2 = r2_score(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

# Training-split metrics, printed alongside to gauge overfitting
train_preds_rf = rf.predict(X_train)
train_r2_rf = r2_score(y_train, train_preds_rf)
train_rmse_rf = np.sqrt(mean_squared_error(y_train, train_preds_rf))

print(f"R2: {r2}, RMSE: {rmse}")
print(f"Random forest Train R2: {train_r2_rf} , RMSE: {train_rmse_rf}")

- The Random Forest model outperforms all linear baselines, achieving an R² of 0.847 and a lower RMSE. This small improvement indicates the presence of some meaningful non-linear interactions among housing features, while still relying solely on tabular data.

In [None]:
import xgboost as xgb

# Gradient-boosted trees with row and column subsampling and L2 regularisation.
xgb_params = dict(
    n_estimators=600,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.0,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1,
)
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)

# Validation-split metrics
preds = xgb_model.predict(X_val)
r2_xgb = r2_score(y_val, preds)
rmse_xgb = np.sqrt(mean_squared_error(y_val, preds))

# Training-split metrics for comparison
train_preds_xgb = xgb_model.predict(X_train)
train_r2_xgb = r2_score(y_train, train_preds_xgb)
train_rmse_xgb = np.sqrt(mean_squared_error(y_train, train_preds_xgb))

print(f"XGBoost Train R2: {train_r2_xgb} , RMSE: {train_rmse_xgb}")
print(f"R2: {r2_xgb}, RMSE: {rmse_xgb}")

In [None]:

# 5-fold cross-validation of the XGBoost configuration on the unscaled feature
# frame X, reporting the mean and spread of R2 / RMSE across folds.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
r2_scores, rmse_scores = [], []

for fold, (train_idx, val_idx) in enumerate(kfold.split(X), start=1):
    X_tr, y_tr = X.iloc[train_idx], y.iloc[train_idx]
    X_vl, y_vl = X.iloc[val_idx], y.iloc[val_idx]

    # Fit the scaler on this fold's training part only, then apply it to both parts
    scaler = StandardScaler().fit(X_tr)
    X_tr_scaled = scaler.transform(X_tr)
    X_vl_scaled = scaler.transform(X_vl)

    model = xgb.XGBRegressor(
        n_estimators=600,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.0,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1,
    )
    model.fit(X_tr_scaled, y_tr)
    preds = model.predict(X_vl_scaled)

    r2 = r2_score(y_vl, preds)
    rmse = np.sqrt(mean_squared_error(y_vl, preds))
    r2_scores.append(r2)
    rmse_scores.append(rmse)
    print(f"Fold {fold}: R2={r2:.4f}, RMSE={rmse:.2f}")

print(f"\nMean R2: {np.mean(r2_scores):.4f} +/- {np.std(r2_scores):.4f}")
print(f"Mean RMSE: {np.mean(rmse_scores):.2f} +/- {np.std(rmse_scores):.2f}")

- XGBoost achieves the best performance among tabular-only models with an R² of 0.86, marginally improving over Random Forest. The modest gain suggests diminishing returns from increased model complexity, establishing a strong and stable benchmark for evaluating multimodal approaches.


## Second Model: Tabular Neural Network
- In this stage, we train a neural network using tabular features only to evaluate whether a learned, non-linear representation can improve performance over traditional machine learning models. This experiment serves as a transitional step between classical tabular models and the final multimodal architecture, helping isolate the effect of neural networks on structured data before introducing satellite imagery.

In [None]:
# X_train / X_val are already NumPy arrays (the StandardScaler output), while
# y_train / y_val are still pandas Series, so only the targets need .to_numpy().
# Everything is stored as float32 tensors.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)

y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.to_numpy(), dtype=torch.float32)


In [None]:
# Shape of the training feature tensor.
X_train_tensor.shape

In [None]:
# Shape of the training target tensor.
y_train_tensor.shape

In [None]:
class CustomDataset(Dataset):
    """Pairs a feature container with a label container, index by index.

    Both containers only need to support ``len()`` / ``[]``; item ``i`` of the
    dataset is ``(features[i], labels[i])``.
    """

    def __init__(self, features, labels):
        self.features, self.labels = features, labels

    def __len__(self):
        # The dataset size is taken from the features container.
        return len(self.features)

    def __getitem__(self, idx):
        sample = self.features[idx]
        target = self.labels[idx]
        return sample, target

In [None]:
# Wrap the training and validation tensors as (features, label) datasets.
train_dataset = CustomDataset(features=X_train_tensor, labels=y_train_tensor)
val_dataset = CustomDataset(features=X_val_tensor, labels=y_val_tensor)

In [None]:
# Mini-batches of 64 from pinned memory; only the training loader shuffles.
loader_options = dict(batch_size=64, pin_memory=True)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)

In [None]:
#number of batches
# Number of mini-batches per epoch, followed by the number of training samples.
n_batches, n_samples = len(train_loader), len(train_dataset)
print(n_batches, n_samples)

In [None]:
class TabularOnlyNN(nn.Module):
    """Small MLP regressor for tabular features.

    Layout: input_dim -> 128 (BatchNorm, ReLU, Dropout 0.3) -> 64 (ReLU) -> 1.
    ``forward`` returns a tensor of shape (batch, 1).
    """

    def __init__(self, input_dim):
        super().__init__()
        # Layer order is kept fixed so the state_dict keys (model.0, model.1, ...)
        # stay the same.
        layers = [
            nn.Linear(input_dim, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)


In [None]:
# Build the tabular network (one input per feature column) and move it to the device.
tabularmodel = TabularOnlyNN(input_dim=X_train.shape[1])
tabularmodel = tabularmodel.to(device)

# Mean-squared-error loss on the log-price target
criterion = nn.MSELoss()

# Adam with a small weight decay
optimizer_one = optim.Adam(
    tabularmodel.parameters(),
    lr=1e-3,
    betas=(0.9, 0.999),
    weight_decay=1e-5,
)

In [None]:
# Train the tabular network; prints the mean mini-batch loss after every epoch.
epochs = 70
for epoch in range(epochs):
    tabularmodel.train()
    total_epoch_loss = 0
    for batch_features, batch_labels in train_loader:
        # Move the batch to the device; labels become a column so they match
        # the (batch, 1) output of the network.
        batch_features = batch_features.to(device)
        batch_labels = batch_labels.to(device).view(-1, 1)

        optimizer_one.zero_grad()
        outputs = tabularmodel(batch_features)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer_one.step()
        total_epoch_loss += loss.item()
    avg_loss = total_epoch_loss / len(train_loader)
    print(f'Epoch: {epoch + 1} , Loss: {avg_loss}')


In [None]:
# Switch to evaluation mode (Dropout off, BatchNorm uses running statistics);
# the metric cells below rely on this having been run.
tabularmodel.eval()

In [None]:
# Collect validation predictions and labels batch by batch (no gradients needed).
nn_predictions, nn_labels = [], []
with torch.no_grad():
    for batch_features, batch_labels in val_loader:
        batch_features = batch_features.to(device)
        batch_labels = batch_labels.to(device).view(-1, 1)
        outputs = tabularmodel(batch_features)
        nn_predictions.extend(outputs.cpu().numpy().reshape(-1))
        nn_labels.extend(batch_labels.cpu().numpy().reshape(-1))

nn_predictions = np.asarray(nn_predictions)
nn_labels = np.asarray(nn_labels)
print(nn_predictions.shape, nn_labels.shape)


In [None]:
# Validation R2 / RMSE of the tabular network (log-price units)
rmse = np.sqrt(mean_squared_error(nn_labels, nn_predictions))
r2 = r2_score(nn_labels, nn_predictions)

print(f"R2: {r2}, RMSE: {rmse}")

In [None]:
# Same evaluation on the training split, to compare against the validation
# metrics. The loader shuffles, but predictions and labels are gathered from the
# same batches, so they stay paired.
nn_predictions, nn_labels = [], []
with torch.no_grad():
    for batch_features, batch_labels in train_loader:
        batch_features = batch_features.to(device)
        batch_labels = batch_labels.to(device).view(-1, 1)
        outputs = tabularmodel(batch_features)
        nn_predictions.extend(outputs.cpu().numpy().reshape(-1))
        nn_labels.extend(batch_labels.cpu().numpy().reshape(-1))

nn_predictions = np.asarray(nn_predictions)
nn_labels = np.asarray(nn_labels)
rmse = np.sqrt(mean_squared_error(nn_labels, nn_predictions))
r2 = r2_score(nn_labels, nn_predictions)

print(f"train R2: {r2}, RMSE: {rmse}")

The tabular neural network demonstrated the ability to learn non-linear relationships from structured housing features but did not outperform tree-based models such as XGBoost on tabular data alone. Performance variations across runs highlighted the sensitivity of neural networks to training dynamics and initialization, reinforcing the importance of careful execution and evaluation.

Incorporating Batch Normalization stabilized neural network training and produced consistent performance (R² ≈ 0.84), confirming the tabular neural network’s role as a representation learner rather than a standalone competitor to tree-based models.

Despite this, the tabular neural network provides a learned feature representation that is well-suited for integration with visual features. In the next stage, this representation is combined with satellite imagery through a multimodal architecture to evaluate whether environmental context can further improve house price prediction.

Also, as the train R2 and RMSE show, it does not overfit the data, or at least not as much as XGBoost does.

## Third Model: Multimodal Neural Network (Tabular + Satellite Imagery)

- We now move to the combined (multimodal) model. While the tabular-only model—particularly XGBoost—already achieves strong performance, this stage evaluates whether incorporating satellite imagery provides additional predictive value.

- To achieve this, satellite images are first processed using a pretrained Inception v3 model acting as a fixed feature extractor, generating 2048-dimensional image embeddings. These visual features are then fused with scaled structured housing attributes through a custom neural network architecture. The fusion model consists of separate branch networks for image and tabular data, which are concatenated and passed through fully connected layers to predict house prices. This multimodal deep learning approach leverages both visual context from satellite imagery and traditional housing attributes for price prediction.

### Multimodal Data Preparation

- Before training the combined model, we first need to bring everything together. For each house, we pair its satellite image with the corresponding tabular features and target price. This allows the model to learn from both visual surroundings and traditional housing attributes at the same time.

- For properties without available satellite imagery, an all-zero image tensor is used as a placeholder and the feature vector extracted for that sample is set to zero, allowing the model to fall back on tabular features without discarding samples.

In [None]:
# Only used once, to turn the images into feature vectors
class ImageEncoder(nn.Module):
    """Frozen, pretrained Inception v3 with its classifier replaced by an identity.

    All parameters have ``requires_grad=False`` and the backbone is put in eval
    mode at construction time. ``forward`` returns whatever the backbone emits
    before the removed ``fc`` layer (2048 values per image, per the original note).
    """

    def __init__(self):
        super().__init__()
        backbone = models.inception_v3(weights=Inception_V3_Weights.DEFAULT)
        backbone.aux_logits = False
        backbone.fc = nn.Identity()  # drop the classification head
        backbone.requires_grad_(False)  # freeze every parameter
        backbone.eval()  # evaluation mode
        self.feature_extractor = backbone

    def forward(self, x):
        return self.feature_extractor(x)

In [None]:
# Folder holding the training satellite images, looked up as "<id>.png".
# NOTE(review): absolute path on the author's machine -- adjust when running elsewhere.
IMAGE_DIR = "C:\\Users\\Aashit\\Downloads\\satellite_images\\train"

In [None]:
class MultimodalDataset(Dataset):
    """Yields ``(image, image_found, tabular, label)`` for each row of ``df``.

    Args:
        df: table with an ``"id"`` column, the tabular feature columns and the label column.
        image_dir: folder searched for ``"<id>.png"``.
        tabular_cols: names of the feature columns returned as a float32 tensor.
        label_col: name of the target column.
        transform: optional callable applied to the loaded RGB PIL image. Without
            it the PIL image itself is returned.

    When the image file is missing, a zero tensor of shape (3, 299, 299) is
    returned and ``image_found`` is ``False``.
    """

    def __init__(self, df, image_dir, tabular_cols, label_col, transform=None):
        self.df = df.reset_index(drop=True)
        self.image_dir = image_dir
        self.tabular_cols = tabular_cols
        self.label_col = label_col
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def _image_path(self, idx):
        """Return the expected PNG path for the row at position ``idx``."""
        # Read the id from its own column. `self.df.iloc[idx]` builds a row
        # Series with one common dtype, so an all-numeric frame with float
        # columns upcasts an integer id to float and the file name would
        # become "<id>.0.png" (never found, every image silently zeroed).
        image_id = self.df["id"].iat[idx]
        return os.path.join(self.image_dir, f"{image_id}.png")

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        # tabular features
        tabular = torch.from_numpy(row[self.tabular_cols].to_numpy().astype("float32"))

        # label
        label = torch.tensor(row[self.label_col], dtype=torch.float32)

        # image
        image_path = self._image_path(idx)
        image_found = True

        if os.path.exists(image_path):
            image = Image.open(image_path).convert("RGB")
            if self.transform:
                image = self.transform(image)
        else:
            # Missing image: all-zero placeholder, flagged through image_found
            image_found = False
            image = torch.zeros(3, 299, 299)

        return image, image_found, tabular, label



In [None]:
from torchvision import transforms

# Channel-wise ImageNet mean and standard deviation used for normalisation
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

image_transform = transforms.Compose([
    transforms.Resize((299, 299)),  # match Inception
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

### Must run this every time

In [None]:

# Feature columns fed to the tabular branch, and the regression target column.
tabular_cols = [
    "bedrooms",
    "bathrooms",
    "sqft_living",
    "sqft_lot",
    "floors",
    "waterfront",
    "view",
    "condition",
    "grade",
    "sqft_basement",
    "house_age",
    "was_renovated",
    "sqft_living15",
    "sqft_lot15",
    "zip_tier",
]
label_col = "price_log"


### STARTING THE EXTRACTOR MODEL

In [None]:
# Quick check that everything the extractor needs is defined and reachable.
sanity_checks = {
    "DataFrame shape": df.shape,
    "IMAGE_DIR exists": os.path.exists(IMAGE_DIR),
    "ImageEncoder": ImageEncoder,
    "MultimodalDataset": MultimodalDataset,
    "image_transform": image_transform,
}
for check_name, check_value in sanity_checks.items():
    print(f"{check_name}: {check_value}")

In [None]:
# Dataset over the whole training table, then a look at the first sample.
full_dataset = MultimodalDataset(df, IMAGE_DIR, tabular_cols, label_col, transform=image_transform)
sample_image, sample_found, sample_tabular, sample_label = full_dataset[0]

print("Image shape:", sample_image.shape)
print("Image min/max:", sample_image.min().item(), sample_image.max().item())
print("Is all zeros?", torch.all(sample_image == 0).item())

# Count all-zero (placeholder) images among the first ten samples
zero_count = 0
for i in range(10):
    img, found, tab, label = full_dataset[i]
    zero_count += int(torch.all(img == 0))

print(f"\nZero images in first 10: {zero_count}/10")

In [None]:
print(device)
# Build the frozen encoder, then move it to the selected device.
image_encoder = ImageEncoder()
image_encoder = image_encoder.to(device)

In [None]:
# ONLY NEEDS TO RUN ONCE: push every image through the frozen encoder and cache
# the resulting feature matrix on disk.
# shuffle=False keeps the feature rows in the same order as the rows of df.
loader = DataLoader(full_dataset, batch_size=64, shuffle=False)
image_features_list = []

image_encoder.eval()
with torch.no_grad():
    for images, found, _, _ in tqdm(loader):
        images = images.to(device)

        # 1. Encode the batch with Inception
        features = image_encoder(images)

        # 2. Zero the feature rows of samples whose image file was missing:
        #    `found` becomes a (batch, 1) float mask of ones and zeros.
        mask = found.to(device).float().unsqueeze(1)
        features = features * mask

        image_features_list.append(features.cpu().numpy())

# Stack the per-batch arrays and save
image_features = np.concatenate(image_features_list, axis=0)
np.save('image_features.npy', image_features)
        

### After the image features have been extracted, you only need to run from here

In [None]:

# Load the cached Inception features produced by the extraction cell.
image_features = np.load('image_features.npy')
# The features are later indexed by row position together with df, so a stale
# or truncated cache would silently pair images with the wrong houses.
assert image_features.shape[0] == len(df), (
    f"image_features.npy has {image_features.shape[0]} rows but df has {len(df)}; "
    "re-run the feature-extraction cell"
)
print(f"Loaded image features: {image_features.shape}")

In [None]:
# Row count here should match the number of loaded image-feature rows.
df.shape

In [None]:
# Plain NumPy views of the tabular features and the target, in df row order.
labels = df[label_col].to_numpy()
tabular_data = df[tabular_cols].to_numpy()

In [None]:
class FusionModel(nn.Module):
    """Two-branch regressor that fuses image features with tabular features.

    * image branch:   img_dim -> 512 -> 128 (BatchNorm on input and hidden layer, Dropout 0.5)
    * tabular branch: tabular_dim -> 128 -> 64 (BatchNorm, Dropout 0.3)
    * fusion head:    concat(128 + 64) -> 128 -> 64 -> 1

    ``forward`` returns a tensor of shape (batch, 1).
    """

    def __init__(self, img_dim, tabular_dim):
        super().__init__()
        img_out, tab_out = 128, 64  # widths of the two branch embeddings

        # Branches are built in the order image -> tabular -> head, with the
        # layer order inside each kept fixed, so state_dict keys are stable.
        image_layers = [
            nn.BatchNorm1d(img_dim),
            nn.Linear(img_dim, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.5),  # best at 0.5
            nn.Linear(512, img_out),
            nn.ReLU(),
        ]
        self.image_branch = nn.Sequential(*image_layers)

        tabular_layers = [
            nn.Linear(tabular_dim, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, tab_out),
            nn.ReLU(),
        ]
        self.tabular_branch = nn.Sequential(*tabular_layers)

        # Fusion layers
        head_layers = [
            nn.Linear(tab_out + img_out, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),  # single value output
        ]
        self.fusionlayers = nn.Sequential(*head_layers)

    def forward(self, img_features, tab_features):
        # Encode each modality, then concatenate along the feature axis
        branch_outputs = (
            self.image_branch(img_features),
            self.tabular_branch(tab_features),
        )
        return self.fusionlayers(torch.cat(branch_outputs, dim=1))

In [None]:
# Summary statistics of the cached image features, to judge whether they need scaling
feature_stats = {
    "Min": image_features.min(),
    "Max": image_features.max(),
    "Mean": image_features.mean(),
    "Std": image_features.std(),
}
for stat_name, stat_value in feature_stats.items():
    print(f"{stat_name}: {stat_value}")

- bookmark, everything fine till here

In [None]:
# Split row positions rather than the arrays themselves, so image features,
# tabular features and labels are all sliced with the same indices.
# test_size and random_state match the tabular-only split.
indices = np.arange(len(labels))
train_idx, val_idx = train_test_split(indices, test_size=0.15, random_state=42)

X_train_img, X_val_img = image_features[train_idx], image_features[val_idx]
X_train_tab_raw, X_val_tab_raw = tabular_data[train_idx], tabular_data[val_idx]
y_train, y_val = labels[train_idx], labels[val_idx]

In [None]:
# Standardise all tabular features; statistics come from the training rows only.
scaler = StandardScaler().fit(X_train_tab_raw)
X_train_tab = scaler.transform(X_train_tab_raw)
X_val_tab = scaler.transform(X_val_tab_raw)

In [None]:
class FusionDataset(Dataset):
    """In-memory dataset of (image features, tabular features, label) triples.

    All three inputs are converted to float32 tensors up front; labels are
    reshaped to a column of shape (N, 1).
    """

    def __init__(self, img_features, tab_features, labels):
        to_float = torch.FloatTensor
        self.img_features = to_float(img_features)
        self.tab_features = to_float(tab_features)
        self.labels = to_float(labels).view(-1, 1)

    def __len__(self):
        # One sample per label row
        return self.labels.shape[0]

    def __getitem__(self, idx):
        parts = (self.img_features, self.tab_features, self.labels)
        return tuple(part[idx] for part in parts)

In [None]:
# Datasets and DataLoaders for the fusion model (batches of 64; only the
# training loader shuffles).
train_ds = FusionDataset(img_features=X_train_img, tab_features=X_train_tab, labels=y_train)
val_ds = FusionDataset(img_features=X_val_img, tab_features=X_val_tab, labels=y_val)

loader_options = dict(batch_size=64, pin_memory=True)
train_loader = DataLoader(train_ds, shuffle=True, **loader_options)
val_loader = DataLoader(val_ds, shuffle=False, **loader_options)

## FusionModel training (only needed to check performance; this is not the main training run)

In [None]:
# Build the fusion model sized from the training arrays and move it to the device.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
fusion_model = FusionModel(img_dim=X_train_img.shape[1], tabular_dim=X_train_tab.shape[1])
fusion_model = fusion_model.to(device)
criterion = nn.MSELoss()
# best at lr=1e-3 , wd = 1e-4
optimizer_fusion = optim.Adam(
    fusion_model.parameters(),
    lr=1e-3,
    weight_decay=1e-4,
    betas=(0.9, 0.999),
)

In [None]:
# Train the fusion model on the 85% split; prints the mean batch loss per epoch.
epochs = 50
for epoch in range(epochs):
    fusion_model.train()
    total_epoch_loss = 0

    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")
    for batch_img, batch_tab, batch_labels in progress:
        batch_img = batch_img.to(device)
        batch_tab = batch_tab.to(device)
        batch_labels = batch_labels.to(device)

        optimizer_fusion.zero_grad()
        outputs = fusion_model(batch_img, batch_tab)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer_fusion.step()
        total_epoch_loss += loss.item()

    avg_train_loss = total_epoch_loss / len(train_loader)
    print(f'Epoch {epoch+1}: Train Loss = {avg_train_loss:.4f}')

print("\nTraining complete! Now evaluation..")

In [None]:
# Switch to evaluation mode (Dropout off, BatchNorm uses running statistics);
# the two metric cells below rely on this having been run.
fusion_model.eval()

In [None]:
# Training-split metrics for the fusion model.
fusion_predictions, fusion_labels = [], []

with torch.no_grad():
    for batch_img, batch_tab, batch_labels in train_loader:
        batch_img = batch_img.to(device)
        batch_tab = batch_tab.to(device)

        outputs = fusion_model(batch_img, batch_tab)
        fusion_predictions.extend(outputs.cpu().numpy().reshape(-1))
        fusion_labels.extend(batch_labels.numpy())

fusion_predictions = np.asarray(fusion_predictions)
fusion_labels = np.asarray(fusion_labels)

# Metrics
rmse = np.sqrt(mean_squared_error(fusion_labels, fusion_predictions))
r2 = r2_score(fusion_labels, fusion_predictions)

print("Train R2:")
print(f"R² Score: {r2}")
print(f"RMSE: {rmse}")

In [None]:
# Held-out metrics for the fusion model, computed on val_loader (the 15% split).
fusion_predictions, fusion_labels = [], []

with torch.no_grad():
    for batch_img, batch_tab, batch_labels in val_loader:
        batch_img = batch_img.to(device)
        batch_tab = batch_tab.to(device)

        outputs = fusion_model(batch_img, batch_tab)
        fusion_predictions.extend(outputs.cpu().numpy().reshape(-1))
        fusion_labels.extend(batch_labels.numpy())

fusion_predictions = np.asarray(fusion_predictions)
fusion_labels = np.asarray(fusion_labels)

# Metrics
rmse = np.sqrt(mean_squared_error(fusion_labels, fusion_predictions))
r2 = r2_score(fusion_labels, fusion_predictions)

print("TEST RESULTS:")
print(f"R² Score: {r2}")
print(f"RMSE: {rmse}")

## Key Findings:

1.  Best model: XGBoost (Test R2: 0.8560)
2. Multimodal achieved Test R2: 0.8516 (competitive but didn't beat XGBoost)
3. Gap between train/test is smallest for Multimodal (0.038), showing good generalization

# Final Model Training on Full Dataset

Training the best performing Multimodal model on the complete training dataset (no train/val split) to maximize available data for test predictions.

**Model:** Multimodal Fusion (Tabular + Satellite Images)
**Architecture:** 
- Image Branch: Inception features (2048) → 512 → 128
- Tabular Branch: Features → 128 → 64
- Fusion Layer: Combined features → 128 → 64 → 1

**Configuration:**
- Optimizer: Adam (lr=0.001, weight_decay=1e-4)
- Batch Size: 64
- Epochs: 60 (50 epochs were used when training on 85% of the data, so 60 are used for training on the complete dataset)
- Loss: MSE

In [None]:
# Redefinition of the FusionModel declared earlier in this notebook, repeated so
# that the final-training section can be run on its own.
class FusionModel(nn.Module):
    """Two-branch regressor that fuses image features with tabular features.

    * image branch:   img_dim -> 512 -> 128 (BatchNorm on input and hidden layer, Dropout 0.5)
    * tabular branch: tabular_dim -> 128 -> 64 (BatchNorm, Dropout 0.3)
    * fusion head:    concat(128 + 64) -> 128 -> 64 -> 1

    ``forward`` returns a tensor of shape (batch, 1).
    """

    def __init__(self, img_dim, tabular_dim):
        super().__init__()
        img_out, tab_out = 128, 64  # widths of the two branch embeddings

        # Branches are built in the order image -> tabular -> head, with the
        # layer order inside each kept fixed, so state_dict keys are stable.
        image_layers = [
            nn.BatchNorm1d(img_dim),
            nn.Linear(img_dim, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.5),  # best at 0.5
            nn.Linear(512, img_out),
            nn.ReLU(),
        ]
        self.image_branch = nn.Sequential(*image_layers)

        tabular_layers = [
            nn.Linear(tabular_dim, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, tab_out),
            nn.ReLU(),
        ]
        self.tabular_branch = nn.Sequential(*tabular_layers)

        # Fusion layers
        head_layers = [
            nn.Linear(tab_out + img_out, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),  # single value output
        ]
        self.fusionlayers = nn.Sequential(*head_layers)

    def forward(self, img_features, tab_features):
        # Encode each modality, then concatenate along the feature axis
        branch_outputs = (
            self.image_branch(img_features),
            self.tabular_branch(tab_features),
        )
        return self.fusionlayers(torch.cat(branch_outputs, dim=1))

In [None]:
# Redefinition of the FusionDataset declared earlier in this notebook, repeated
# so that the final-training section can be run on its own.
class FusionDataset(Dataset):
    """In-memory dataset of (image features, tabular features, label) triples.

    All three inputs are converted to float32 tensors up front; labels are
    reshaped to a column of shape (N, 1).
    """

    def __init__(self, img_features, tab_features, labels):
        to_float = torch.FloatTensor
        self.img_features = to_float(img_features)
        self.tab_features = to_float(tab_features)
        self.labels = to_float(labels).view(-1, 1)

    def __len__(self):
        # One sample per label row
        return self.labels.shape[0]

    def __getitem__(self, idx):
        parts = (self.img_features, self.tab_features, self.labels)
        return tuple(part[idx] for part in parts)

In [None]:
# Reload the preprocessed training table for the full-data training run.
# NOTE(review): this path is relative to the working directory, whereas the first
# load of this file in the notebook uses an absolute path -- confirm both point
# at the same file.
df_final = pd.read_csv('train_preprocessed.csv')
df_final.head()

In [None]:
# Number of training rows; should match the number of cached image-feature rows.
len(df_final)

In [None]:
# Load the cached Inception features for the full training set.
image_features_final = np.load('image_features.npy')
# Features, tabular rows and labels are paired purely by row position, so a
# stale or truncated cache would silently pair images with the wrong houses.
assert image_features_final.shape[0] == len(df_final), (
    f"image_features.npy has {image_features_final.shape[0]} rows but df_final has "
    f"{len(df_final)}; re-run the feature-extraction cell"
)
print(f"Loaded image features: {image_features_final.shape}")

In [None]:
# Feature columns and target (same lists as in the validation experiment).
tabular_cols = [
    "bedrooms", "bathrooms", "sqft_living", "sqft_lot", "floors",
    "waterfront", "view", "condition", "grade", "sqft_basement",
    "house_age", "was_renovated", "sqft_living15", "sqft_lot15", "zip_tier",
]
label_col = "price_log"
labels_final = df_final[label_col].to_numpy()

# Standardise the tabular features; the scaler is fitted on the full training
# table because there is no validation split in this run.
scaler = StandardScaler()
tabular_data_final = scaler.fit_transform(df_final[tabular_cols].to_numpy())



In [None]:
# Persist the fitted scaler (only needed once) so the test features can later
# be scaled with the training statistics.
import pickle

with open('tabular_scaler.pkl', mode='wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
# Dataset and shuffling DataLoader over the complete training set.
finaltraining_dataset = FusionDataset(image_features_final, tabular_data_final, labels_final)
finaltraining_loader = DataLoader(
    dataset=finaltraining_dataset,
    batch_size=64,
    shuffle=True,
    pin_memory=True,
)

In [None]:
# Checking dataset and dataloader sizes
# The loader above keeps the final partial batch (drop_last is left at its
# default), so it yields ceil(N / batch_size) batches. Ceiling division makes
# "Expected batches" directly comparable with "Number of batches"; plain
# division printed a fraction that could never match.
expected_batches = -(-len(finaltraining_dataset) // finaltraining_loader.batch_size)
print(f"Full dataset size: {len(finaltraining_dataset)}")
print(f"Number of batches: {len(finaltraining_loader)}")
print(f"Expected batches: {expected_batches}")
print(f"X shape: {tabular_data_final.shape}")
print(f"y shape: {labels_final.shape}")
print(f"Image features shape: {image_features_final.shape}")

In [None]:
# Fresh fusion model for the full-data run, with the same optimiser settings as
# the validation experiment.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
final_model = FusionModel(
    img_dim=image_features_final.shape[1],
    tabular_dim=tabular_data_final.shape[1],
)
final_model = final_model.to(device)
criterion = nn.MSELoss()
optimizer_fusion = optim.Adam(
    final_model.parameters(),
    lr=1e-3,
    weight_decay=1e-4,
    betas=(0.9, 0.999),
)

## MAIN TRAINING LOOP 

In [None]:
# Training configuration
# NOTE: train_losses is created here, not in the training cell, so re-running
# only the training cell appends to the existing list instead of starting fresh.
epochs = 60
train_losses = []

In [None]:
# Training loop: one pass over the full training set per epoch; the mean batch
# loss of every epoch is appended to train_losses.
for epoch in range(epochs):
    final_model.train()
    epoch_loss = 0

    progress = tqdm(finaltraining_loader, desc=f"Epoch {epoch+1}/{epochs}")
    for img_batch, tab_batch, target_batch in progress:
        img_batch = img_batch.to(device)
        tab_batch = tab_batch.to(device)
        target_batch = target_batch.to(device)

        optimizer_fusion.zero_grad()
        outputs = final_model(img_batch, tab_batch)
        loss = criterion(outputs, target_batch)
        loss.backward()
        optimizer_fusion.step()

        epoch_loss = epoch_loss + loss.item()

    avg_loss = epoch_loss / len(finaltraining_loader)
    train_losses.append(avg_loss)
    print(f"Avg Loss: {avg_loss:.4f}")

print("Training complete!")

In [None]:
# Plot the training loss per epoch, leaving out the first epochs
# (presumably so the large initial losses do not dominate the scale).
SKIP_EPOCHS = 10
plotted_losses = train_losses[SKIP_EPOCHS:]
# The x-axis is sized from train_losses itself rather than from `epochs`: the
# list is created in a separate cell, so re-running the training cell makes it
# longer than `epochs` and a range based on `epochs` would no longer match the
# number of y-values.
plotted_epochs = range(SKIP_EPOCHS + 1, SKIP_EPOCHS + 1 + len(plotted_losses))

plt.figure(figsize=(10, 6))
plt.plot(plotted_epochs, plotted_losses, 'b-', linewidth=1)
plt.xlabel('Epoch')
plt.ylabel('Training Loss')
plt.title('Training Loss')
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# Evaluation loader: same dataset, original row order (no shuffling).
eval_loader = DataLoader(dataset=finaltraining_dataset, batch_size=64, shuffle=False)
final_model.eval()  # evaluation mode for the metric cell below

In [None]:
# Evaluate the final model once on the data it was trained on.
all_preds, all_labels = [], []

with torch.no_grad():
    for img_batch, tab_batch, target_batch in eval_loader:
        img_batch = img_batch.to(device)
        tab_batch = tab_batch.to(device)

        outputs = final_model(img_batch, tab_batch)
        all_preds.append(outputs.cpu().numpy())
        all_labels.append(target_batch.cpu().numpy())

# Stack the per-batch arrays along the sample axis
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

train_rmse = np.sqrt(mean_squared_error(all_labels, all_preds))
train_r2 = r2_score(all_labels, all_preds)

print(f"Final Training R2: {train_r2}")
print(f"Final Training RMSE: {train_rmse}")

In [None]:
# Save model: only the learned weights (state_dict) are written, not the module object.
final_weights = final_model.state_dict()
torch.save(final_weights, 'final_fusion_model.pt')
print("Model saved!")

# Test Set Predictions

Generating predictions on the test dataset using the final trained multimodal fusion model.

**Steps:**
1. Load preprocessed test tabular data
2. Extract image features from test satellite images using pretrained Inception v3
3. Scale test features using saved scaler
4. Generate predictions using trained model
5. Create submission CSV

### 1. Loading preprocessed test tabular data 

In [None]:
# Load the preprocessed test table.
# NOTE(review): absolute path on the author's machine -- adjust when running elsewhere.
test_df = pd.read_csv("C:\\Users\\Aashit\\OneDrive\\Desktop\\Coding_stuff\\PROJECTS\\real-estate-multimodal\\test_preprocessed.csv")
test_df.head()

In [None]:
# Rows and columns of the test table.
test_df.shape

In [None]:

# Same feature columns as used for training, taken from the test table (unscaled here).
tabular_cols = [
    "bedrooms", "bathrooms", "sqft_living", "sqft_lot", "floors",
    "waterfront", "view", "condition", "grade", "sqft_basement",
    "house_age", "was_renovated", "sqft_living15", "sqft_lot15", "zip_tier",
]
test_tabular = test_df[tabular_cols].to_numpy()
print(f"test_tabular : {test_tabular.shape}")

### 2.1 EXTRACT IMAGE FEATURES

In [None]:
# Only used once, to turn the images into feature vectors.
# Redefinition of the ImageEncoder declared in the training section, repeated so
# that the test-prediction section can be run on its own.
class ImageEncoder(nn.Module):
    """Frozen, pretrained Inception v3 with its classifier replaced by an identity.

    All parameters have ``requires_grad=False`` and the backbone is put in eval
    mode at construction time. ``forward`` returns whatever the backbone emits
    before the removed ``fc`` layer (2048 values per image, per the original note).
    """

    def __init__(self):
        super().__init__()
        backbone = models.inception_v3(weights=Inception_V3_Weights.DEFAULT)
        backbone.aux_logits = False
        backbone.fc = nn.Identity()  # drop the classification head
        backbone.requires_grad_(False)  # freeze every parameter
        backbone.eval()  # evaluation mode
        self.feature_extractor = backbone

    def forward(self, x):
        return self.feature_extractor(x)

In [None]:
# Folder holding the test satellite images, looked up as "<id>.png".
# NOTE(review): absolute path on the author's machine -- adjust when running elsewhere.
TEST_IMAGE_DIR = "C:\\Users\\Aashit\\Downloads\\satellite_images\\test"

In [None]:
class TestMultimodalDataset(Dataset):
    """Image-only view of the test table, used for one-off feature extraction.

    Each item is the image stored as ``<image_dir>/<id>.png`` for the
    corresponding row of ``df``, plus a flag telling whether that file existed.

    Args:
        df: Table with at least an ``id`` column; its index is reset so that
            positions 0..len-1 address rows.
        image_dir: Directory containing the ``<id>.png`` files.
        tabular_cols: Names of the tabular feature columns. Stored for parity
            with the training dataset signature; not used when fetching items.
        transform: Optional callable applied to the loaded RGB PIL image.
    """

    def __init__(self, df, image_dir, tabular_cols, transform=None):
        self.df = df.reset_index(drop=True)
        self.image_dir = image_dir
        self.tabular_cols = tabular_cols
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, image_found)`` for the row at position ``idx``.

        ``image`` is the (optionally transformed) RGB image when the file
        exists, otherwise an all-zero ``(3, 299, 299)`` tensor placeholder.
        """
        # Read the id from its own column instead of from a whole-row Series:
        # `df.iloc[idx]` upcasts an all-numeric row containing floats to
        # float64, which would turn id 123 into 123.0 and make the lookup
        # search for "123.0.png" (never found).
        image_id = self.df["id"].iloc[idx]
        image_path = os.path.join(self.image_dir, f"{image_id}.png")

        if os.path.exists(image_path):
            image_found = True
            # Context manager closes the underlying file handle; convert()
            # returns a fully loaded copy that remains valid afterwards.
            with Image.open(image_path) as raw_image:
                image = raw_image.convert("RGB")
            if self.transform:
                image = self.transform(image)
        else:
            image_found = False
            image = torch.zeros(3, 299, 299)  # zeros (since only few are missing)

        return image, image_found  # No tabular or label needed for feature extraction



In [None]:

# Preprocessing applied to every image before it is passed to the encoder:
# resize, convert to a tensor, then normalize each channel.
image_transform = transforms.Compose([
    transforms.Resize((299, 299)),      # match Inception
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225])
])
# The ImageNet channel mean & std are used for normalization.

In [None]:
# Sanity check: confirm the table, image folder, dataset/encoder classes and
# transform are all defined before starting feature extraction.
print(f"DataFrame shape: {test_df.shape}")
print(f"TEST_IMAGE_DIR exists: {os.path.exists(TEST_IMAGE_DIR)}")
print(f"TestMultimodalDataset: {TestMultimodalDataset}")
print(f"ImageEncoder: {ImageEncoder}")
print(f"image_transform: {image_transform}")
# to check if everything is available or not

In [None]:
# Build the image dataset over the test table and spot-check a few items.
test_dataset = TestMultimodalDataset(
    df=test_df,
    image_dir=TEST_IMAGE_DIR,
    tabular_cols=tabular_cols,
    transform=image_transform
)
sample_image, sample_found = test_dataset[0]

print("Image shape:", sample_image.shape)
print("Image min/max:", sample_image.min().item(), sample_image.max().item())
print("Is all zeros?", torch.all(sample_image == 0).item())

# Checking a few more: an all-zero tensor is the placeholder the dataset
# returns for a missing image file. Cap the probe at the dataset size so a
# test set with fewer than 10 rows does not raise IndexError.
n_check = min(10, len(test_dataset))
zero_count = 0
for i in range(n_check):
    img, found = test_dataset[i]
    if torch.all(img == 0):
        zero_count += 1

print(f"\n Zero images in first {n_check}: {zero_count}/{n_check}")

In [None]:
# Show which device is in use, then build the encoder and move it there.
print(device)
image_encoder = ImageEncoder().to(device)

In [None]:
# ONLY NEED TO RUN THIS ONCE: the result is written to
# 'test_image_features.npy' and re-loaded from disk afterwards.
# A DataLoader batches the images for speed; shuffle=False keeps the
# feature rows in the same order as the rows of the test table.
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
image_features_list = []

image_encoder.eval()
with torch.no_grad():
    for batch_images, batch_found in tqdm(test_loader):
        # 1. Encode the batch; the encoder is expected to give (batch, 2048).
        embeddings = image_encoder(batch_images.to(device))

        # 2. Turn the per-sample 'found' flags into a (batch, 1) 0/1 column
        #    on the same device and zero the rows of missing images.
        keep = batch_found.to(device).view(-1, 1).float()
        embeddings = embeddings * keep

        image_features_list.append(embeddings.cpu().numpy())

# Stack all batches row-wise and cache the matrix on disk.
test_image_features = np.concatenate(image_features_list, axis=0)
np.save('test_image_features.npy', test_image_features)
print(f"Saved: {test_image_features.shape}")
        

### 2.2 Loading test image features 

In [None]:
# Reload the cached image features written by the extraction step.
# NOTE(review): nothing here checks that the cache was produced from the
# current test table — confirm the row count matches test_df.
image_features_test = np.load('test_image_features.npy')
print(f"Loaded image features: {image_features_test.shape}")

### 2.3 VERIFICATION

In [None]:

# Verify that cached feature rows line up with the rows of test_df: a row's
# features should be all zeros exactly when its image file is missing.
# Both sides are compared as positional boolean masks, so the check does not
# depend on test_df's index labels and also fails when the row counts differ.

# Rows in the feature array where the features are all zeros
zero_feature_mask = ~image_features_test.any(axis=1)
zero_feature_indices = np.flatnonzero(zero_feature_mask)

# Rows in the dataframe (by position) whose image file is missing on disk
missing_image_mask = np.array(
    [
        not os.path.exists(os.path.join(TEST_IMAGE_DIR, f"{image_id}.png"))
        for image_id in test_df['id']
    ],
    dtype=bool,
)
missing_image_indices = np.flatnonzero(missing_image_mask).tolist()

# np.array_equal is False for different shapes, covering a stale cache whose
# row count no longer matches the table.
if np.array_equal(zero_feature_mask, missing_image_mask):
    print("Alignment Verified: Features match CSV rows perfectly.")
else:
    print("Alignment Error: Features and CSV rows are out of sync!")

### 3. SCALE TEST FEATURES USING TABULAR_SCALER.PKL

In [None]:
# Load the saved scaler and apply it to the test tabular features.
# NOTE: pickle.load can execute arbitrary code — only load a scaler file that
# this project produced itself.
import pickle
with open('tabular_scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

# NOTE(review): assumes the scaler was fitted on these same columns in this
# same order — confirm against the cell that saved tabular_scaler.pkl.
test_tabular_scaled = scaler.transform(test_tabular)
print(f"Scaled shape: {test_tabular_scaled.shape}")

### 4. Generate PREDICTIONS

In [None]:
# Create test dataset (using numpy arrays directly)
class TestPredictionDataset(Dataset):
    """Pairs precomputed image features with tabular features for inference.

    Args:
        img_features: Array-like of shape (n_samples, img_dim).
        tab_features: Array-like of shape (n_samples, tabular_dim).

    Raises:
        ValueError: If the two inputs do not have the same number of rows.
            Without this check a stale feature cache would silently drop or
            misalign samples.
    """

    def __init__(self, img_features, tab_features):
        # as_tensor with an explicit dtype replaces the legacy FloatTensor
        # constructor; both yield float32 tensors.
        self.img_features = torch.as_tensor(img_features, dtype=torch.float32)
        self.tab_features = torch.as_tensor(tab_features, dtype=torch.float32)

        if len(self.img_features) != len(self.tab_features):
            raise ValueError(
                "img_features and tab_features must have the same number of rows, "
                f"got {len(self.img_features)} and {len(self.tab_features)}"
            )

    def __len__(self):
        return len(self.img_features)

    def __getitem__(self, idx):
        """Return ``(image_feature_row, tabular_feature_row)`` at ``idx``."""
        return self.img_features[idx], self.tab_features[idx]

# Wrap the cached image features and scaled tabular features for batched
# inference; shuffle=False keeps predictions in the same order as the rows.
test_pred_dataset = TestPredictionDataset(image_features_test, test_tabular_scaled)
test_pred_loader = DataLoader(test_pred_dataset, batch_size=64, shuffle=False)

print(f"Test dataset size: {len(test_pred_dataset)}")
print(f"Number of batches: {len(test_pred_loader)}")

In [None]:
class FusionModel(nn.Module):
    """Two-branch regressor that fuses image features with tabular features.

    Each input is encoded by its own MLP branch (image -> 128 values,
    tabular -> 64 values); the two encodings are concatenated and passed
    through a fusion MLP that outputs one value per sample.

    Attribute names and layer positions are kept exactly as in the saved
    checkpoint so that `load_state_dict` keys continue to match.

    Args:
        img_dim: Width of the image feature vector.
        tabular_dim: Number of tabular features.
    """

    def __init__(self, img_dim, tabular_dim):
        super().__init__()

        image_layers = [
            nn.BatchNorm1d(img_dim),
            nn.Linear(img_dim, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.5),  # best at 0.5
            nn.Linear(512, 128),
            nn.ReLU(),
        ]
        self.image_branch = nn.Sequential(*image_layers)

        tabular_layers = [
            nn.Linear(tabular_dim, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
        ]
        self.tabular_branch = nn.Sequential(*tabular_layers)

        # Fusion head: input width is image encoding (128) + tabular encoding (64).
        fusion_layers = [
            nn.Linear(64 + 128, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),  # single value output
        ]
        self.fusionlayers = nn.Sequential(*fusion_layers)

    def forward(self, img_features, tab_features):
        """Return a (batch, 1) tensor of predictions for the paired inputs."""
        encodings = (
            self.image_branch(img_features),
            self.tabular_branch(tab_features),
        )
        fused = torch.cat(encodings, dim=1)
        return self.fusionlayers(fused)

In [None]:
# Confirm which device the model and batches will be placed on.
print(device)

In [None]:
# Loading trained model: rebuild the architecture with the feature widths of
# the test arrays, then restore the saved parameters.
final_model = FusionModel(img_dim=image_features_test.shape[1], 
                          tabular_dim= test_tabular_scaled.shape[1]).to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU machine still loads on a CPU-only one.
final_model.load_state_dict(torch.load('final_fusion_model.pt', map_location=device))
final_model.eval()  # inference mode: disables dropout, uses BatchNorm running stats
print("Model loaded and ready for prediction. ")

In [None]:
# Run the fusion model over the test loader batch by batch, without gradient
# tracking, and gather the outputs as one flat vector of predictions.
final_model.eval()
all_predictions = []

with torch.no_grad():
    for image_feats, tabular_feats in tqdm(test_pred_loader, desc="Predicting"):
        outputs = final_model(image_feats.to(device), tabular_feats.to(device))
        all_predictions.append(outputs.cpu().numpy())

# Join the per-batch outputs and collapse them to 1-D.
test_predictions = np.concatenate(all_predictions).flatten()
print(f"Predictions shape: {test_predictions.shape}")
print(f"Sample predictions (log scale): {test_predictions[:5]}")

### 5. Creating Submission File

In [None]:
# First convert the log-scale predictions back to prices.
# NOTE(review): np.exp is the exact inverse only if the target was built with
# a plain natural log; if preprocessing used log1p, expm1 is needed — confirm.
actual_prices = np.exp(test_predictions)

print(f"Sample prices : {actual_prices[:5]}")
print(f"Actual price range : {actual_prices.min():.2f} to {actual_prices.max():.2f}")

In [None]:
# Creating submission CSV: one row per test id with the rounded price.
# Use an explicit 64-bit integer dtype: on Windows with NumPy < 2 the builtin
# `int` maps to a 32-bit integer, which cannot hold ids above 2,147,483,647.
submission = pd.DataFrame({
    'id': test_df['id'].astype(np.int64),
    'predicted_price': actual_prices.round(0).astype(np.int64)
})

submission.to_csv('24322002_final.csv', index=False)
print(f"Total predictions: {len(submission)}")
submission.head(10)