In [3]:
import sys
!{sys.executable} -m pip install joblib xgboost scikit-learn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
%pip install torchvision


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import os
import joblib
import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image

# 1. Load Data
# fillna(0) is applied to every column of the frame (non-inplace, so re-running
# this cell always starts again from the CSV on disk).
# NOTE(review): this also turns a missing 'image_path' or target value into 0 -
# confirm that is the intended treatment for those columns.
df = pd.read_csv('data/dataset.csv').fillna(0)

# 2. Split Data - 80/20 into train_full/test, then 75/25 of train_full into
# train/val, i.e. 60% / 20% / 20% of the rows overall.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=1)

# CRITICAL: reset the index of each split so that row i of the tabular matrix
# and item i of the image dataset share the same 0..n-1 label as well as the
# same position; the shuffled original index is dropped.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

def prepare_tabular(data_frame, vectorizer=None, fit=False, features=None):
    """Build the tabular feature matrix and target for one data split.

    Args:
        data_frame: DataFrame holding the feature columns and a
            'median_house_value' column.
        vectorizer: Object exposing ``fit_transform`` and ``transform`` over a
            list of record dicts (the notebook passes a ``DictVectorizer``).
            Required, even though the parameter has a ``None`` default.
        fit: When True the vectorizer is fitted on this split before
            transforming; otherwise it is only applied.
        features: Optional iterable of column names to encode. Defaults to the
            ten housing columns listed below.

    Returns:
        Tuple ``(X, y)`` where ``X`` is whatever the vectorizer returns for the
        selected columns and ``y`` is the 'median_house_value' column.

    Raises:
        ValueError: If ``vectorizer`` is None.
    """
    if vectorizer is None:
        # Fail early with a clear message instead of an AttributeError on None.
        raise ValueError("prepare_tabular requires a vectorizer (got None)")
    if features is None:
        features = [
            'longitude', 'latitude', 'housing_median_age', 'total_rooms',
            'total_bedrooms', 'population', 'households', 'median_income',
            'bedrooms', 'sq_ft',
        ]
    dicts = data_frame[list(features)].to_dict('records')
    encode = vectorizer.fit_transform if fit else vectorizer.transform
    return encode(dicts), data_frame['median_house_value']

# Dense output (sparse=False) keeps the later column-wise concat simple.
dv = DictVectorizer(sparse=False)

# Fit the vectorizer on the training split only, then reuse it unchanged
# for the validation and test splits.
X_train, y_train = prepare_tabular(data_frame=df_train, vectorizer=dv, fit=True)
X_val, y_val = prepare_tabular(data_frame=df_val, vectorizer=dv)
X_test, y_test = prepare_tabular(data_frame=df_test, vectorizer=dv)

In [6]:
# 3. Dataset & CNN Setup
class HouseDataset(Dataset):
    """Image/target dataset backed by a dataframe.

    Image file paths are read from the 'image_path' column and regression
    targets from the 'median_house_value' column. Both are stored as plain
    arrays, so items are addressed by position, not by dataframe index.

    Args:
        dataframe: DataFrame with 'image_path' and 'median_house_value' columns.
        transform: Optional callable applied to the loaded PIL image.
    """

    # Size of the black placeholder returned when an image cannot be loaded.
    FALLBACK_SIZE = (200, 200)

    def __init__(self, dataframe, transform=None):
        self.paths = dataframe['image_path'].values
        self.labels = dataframe['median_house_value'].values
        self.transform = transform

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        """Return ``(image, target)`` for position ``idx``.

        The stored path is opened as-is. Any ordinary failure to load it
        (missing file, unreadable image, non-path value) falls back to a black
        placeholder so iteration can continue; KeyboardInterrupt and
        SystemExit are deliberately NOT swallowed.
        """
        try:
            # The context manager closes the underlying file promptly;
            # convert() hands back an independent in-memory RGB image.
            with Image.open(self.paths[idx]) as handle:
                img = handle.convert('RGB')
        except Exception:
            img = Image.new('RGB', self.FALLBACK_SIZE, color='black')  # Fallback for missing files
        if self.transform:
            img = self.transform(img)
        return img, torch.tensor(self.labels[idx], dtype=torch.float)

# Preprocessing applied to every image, in order: resize, convert to tensor,
# then per-channel normalisation.
# NOTE(review): the mean/std values look like the commonly used ImageNet
# channel statistics - confirm that is the intent.
transform = transforms.Compose([
    transforms.Resize((128, 128)),  # Reduced size for faster training on CPU
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# shuffle=False on both loaders: batches must come out in dataframe order so
# that extracted image features line up row-for-row with X_train / X_val.
train_loader = DataLoader(
    dataset=HouseDataset(df_train, transform),
    batch_size=32,
    shuffle=False,
)
val_loader = DataLoader(
    dataset=HouseDataset(df_val, transform),
    batch_size=32,
    shuffle=False,
)

class CNN(nn.Module):
    """Small two-block convolutional regressor that doubles as a feature extractor.

    Two Conv2d(3x3, padding=1) + ReLU + MaxPool2d(2) blocks are followed by a
    linear embedding layer (``fc``) and a single-output linear head (``out``).

    Args:
        input_size: Spatial size of the input images, either one int (square
            images) or a ``(height, width)`` pair. Defaults to 128, matching
            the 128x128 resize used elsewhere in this notebook.
        feature_dim: Width of the embedding produced by ``extract``.
            Defaults to 64.

    Raises:
        ValueError: If the input is too small to survive the two 2x poolings.
    """

    def __init__(self, input_size=128, feature_dim=64):
        super().__init__()
        try:
            height, width = input_size
        except TypeError:
            height = width = input_size
        # Padding=1 convolutions keep the spatial size; each MaxPool2d(2)
        # halves it (rounding down), so two of them divide each side by 4.
        pooled_h, pooled_w = height // 4, width // 4
        if pooled_h < 1 or pooled_w < 1:
            raise ValueError(f"input_size {input_size!r} is too small for two 2x poolings")
        self.conv_stack = nn.Sequential(
            nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2),  # e.g. 128 -> 64
            nn.Conv2d(16, 32, 3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2),  # e.g. 64 -> 32
            nn.Flatten()
        )
        # 32 output channels times the pooled spatial area.
        self.fc = nn.Linear(32 * pooled_h * pooled_w, feature_dim)
        self.out = nn.Linear(feature_dim, 1)

    def extract(self, x):
        """Return the ``fc`` embedding for a batch (no activation applied)."""
        return self.fc(self.conv_stack(x))

    def forward(self, x):
        """Return one regression output per image: ``out(relu(extract(x)))``."""
        return self.out(torch.relu(self.extract(x)))

# Seed PyTorch before the network is built: its randomly initialised weights
# determine the image features extracted later, so without a seed the fused
# results change on every kernel restart. 1 matches the random_state used for
# the data splits.
torch.manual_seed(1)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
cnn = CNN().to(device)

# NOTE(review): no training loop that uses `optimizer` / `criterion` is visible
# in this part of the notebook, so the features extracted below come from an
# untrained network - confirm whether a training step is missing.
optimizer = torch.optim.Adam(cnn.parameters(), lr=0.001)
criterion = nn.MSELoss()

In [7]:
# 4. Fusion Feature Extraction
def get_cnn_features(model, loader):
    """Run ``model.extract`` over every batch of ``loader`` and stack the results.

    The model is switched to eval mode (and left there) and gradients are
    disabled. Batches are moved to the device that holds the model's
    parameters, so the function does not depend on any global device variable.

    Args:
        model: Module exposing ``extract(batch)``.
        loader: Iterable yielding ``(images, labels)`` pairs; labels are ignored.

    Returns:
        2-D numpy array with one row per image, in loader order.

    Raises:
        ValueError: If the loader yields no batches.
    """
    model.eval()
    first_param = next(model.parameters(), None)
    # A parameter-free model has no device of its own; fall back to CPU.
    target_device = first_param.device if first_param is not None else torch.device('cpu')
    feats = []
    with torch.no_grad():
        for imgs, _ in loader:
            feats.append(model.extract(imgs.to(target_device)).cpu().numpy())
    if not feats:
        raise ValueError("loader produced no batches; cannot extract CNN features")
    return np.vstack(feats)

print("Extracting CNN features...")
# Train loader first, then validation loader - same order as the two
# separate calls this replaces.
train_img_feats, val_img_feats = (
    get_cnn_features(cnn, split_loader) for split_loader in (train_loader, val_loader)
)

# 5. Combined XGBoost (Fusion)
# Tabular columns first, CNN embedding columns appended on the right.
X_train_fused = np.concatenate((X_train, train_img_feats), axis=1)
X_val_fused = np.concatenate((X_val, val_img_feats), axis=1)

dtrain = xgb.DMatrix(data=X_train_fused, label=y_train)
dval = xgb.DMatrix(data=X_val_fused, label=y_val)

params = dict(objective='reg:squarederror', max_depth=6, eta=0.1)
fused_model = xgb.train(params=params, dtrain=dtrain, num_boost_round=50)

# Score the fused model on the validation split.
y_pred = fused_model.predict(dval)
fused_rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f'Fused RMSE: {fused_rmse:.2f}')

Extracting CNN features...




Fused RMSE: 50927.77
