In [6]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
from google.colab import drive
drive.flush_and_unmount()


Drive not mounted, so nothing to flush and unmount.


In [5]:
!rm -rf /content/drive


In [15]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from xgboost import XGBRegressor


In [16]:
TRAIN_PATH = "/content/drive/MyDrive/IITR/Satellite-project/Satellite-train.xlsx"
IMG_FEAT_DIR = "/content/drive/MyDrive/IITR/Satellite-project/image_features/"
MODEL_DIR = "/content/drive/MyDrive/IITR/Satellite-project/models/"


In [17]:
df = pd.read_excel(TRAIN_PATH)
df['house_age'] = 2025 - df['yr_built']
df['is_renovated'] = (df['yr_renovated'] > 0).astype(int)
df['total_sqft'] = df['sqft_above'] + df['sqft_basement']
df['room_density'] = df['sqft_living'] / (df['bedrooms'] + 1)


In [18]:
df = df.drop(columns=['id', 'date'], errors='ignore')



In [19]:
TABULAR_FEATURES = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
    'floors', 'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement',
    'lat', 'long',
    'sqft_living15', 'sqft_lot15',
    'house_age', 'is_renovated', 'total_sqft', 'room_density'
]

In [20]:
X_tab = df[TABULAR_FEATURES].copy()
y = df['price'].copy()

print(X_tab.shape, y.shape)


(16209, 19) (16209,)


In [21]:
train_idx, val_idx = train_test_split(
    df.index, test_size=0.2, random_state=42
)


In [22]:
tab_model = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

tab_model.fit(X_tab.loc[train_idx], y.loc[train_idx])


In [23]:
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

tab_val_pred = tab_model.predict(X_tab.loc[val_idx])

r2 = r2_score(y.loc[val_idx], tab_val_pred)
rmse = np.sqrt(mean_squared_error(y.loc[val_idx], tab_val_pred))

print("Tabular Validation R²:", r2)
print("Tabular Validation RMSE:", rmse)


Tabular Validation R²: 0.8900825381278992
Tabular Validation RMSE: 117445.25328850035


In [24]:
tab_train_pred = tab_model.predict(X_tab.loc[train_idx])
residual_train = y.loc[train_idx] - tab_train_pred


In [25]:
X_pca = np.load(IMG_FEAT_DIR + "resnet50_pca64.npy")          # (16209, 64)
X_interp = np.load(IMG_FEAT_DIR + "interpretable_features.npy")  # (16209, 4)

X_image = np.hstack([X_pca, X_interp])  # (16209, 68)


In [26]:
img_model = XGBRegressor(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

img_model.fit(X_image[train_idx], residual_train)


In [27]:
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

residual_val_pred = img_model.predict(X_image[val_idx])
final_val_pred = tab_val_pred + residual_val_pred

r2_final = r2_score(y.loc[val_idx], final_val_pred)
rmse_final = np.sqrt(mean_squared_error(y.loc[val_idx], final_val_pred))

print("FINAL Multimodal Validation R²:", r2_final)
print("FINAL Multimodal Validation RMSE:", rmse_final)


FINAL Multimodal Validation R²: 0.8912675380706787
FINAL Multimodal Validation RMSE: 116810.4499434875


In [None]:
joblib.dump(tab_model, MODEL_DIR + "tabular_xgb_fe.pkl")
joblib.dump(img_model, MODEL_DIR + "image_residual_xgb_fe.pkl")


# Prediction for test

CNN


In [None]:
import os
import numpy as np
import torch
from PIL import Image
from torchvision import transforms
from tqdm import tqdm
import torchvision.models as models

# paths
IMAGES_PATH = "/content/drive/MyDrive/IITR/Satellite-project/images_test"
SAVE_PATH = "/content/drive/MyDrive/IITR/Satellite-project/image_features/resnet50_features_test.npy"

os.makedirs(os.path.dirname(SAVE_PATH), exist_ok=True)

# device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# transform (SAME AS TRAIN)
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

# load pretrained ResNet50 (feature extractor)
resnet = models.resnet50(pretrained=True)
resnet = torch.nn.Sequential(*list(resnet.children())[:-1])
resnet = resnet.to(device)
resnet.eval()

# number of test images
N_IMAGES = len(os.listdir(IMAGES_PATH))

# storage
features_all = np.zeros((N_IMAGES, 2048), dtype=np.float32)

# extraction loop
with torch.no_grad():
    for i in tqdm(range(N_IMAGES)):
        img_path = os.path.join(IMAGES_PATH, f"{i}.png")
        img = Image.open(img_path).convert("RGB")
        img_tensor = image_transform(img).unsqueeze(0).to(device)
        feats = resnet(img_tensor).view(-1).cpu().numpy()
        features_all[i] = feats

# save
np.save(SAVE_PATH, features_all)
print("Saved:", SAVE_PATH, features_all.shape)


In [1]:
import numpy as np
import joblib
import os

# paths
CNN_TEST_PATH = "/content/drive/MyDrive/IITR/Satellite-project/image_features/resnet50_features_test.npy"
PCA_MODEL_PATH = "/content/drive/MyDrive/IITR/Satellite-project/models/image_pca.joblib"
SAVE_PATH = "/content/drive/MyDrive/IITR/Satellite-project/image_features/resnet50_pca64_test.npy"

# load
X_cnn_test = np.load(CNN_TEST_PATH)          # (5404, 2048)
pca = joblib.load(PCA_MODEL_PATH)            # fitted on TRAIN

# transform ONLY (NO FIT)
X_cnn_pca_test = pca.transform(X_cnn_test)   # (5404, 64)

# save
np.save(SAVE_PATH, X_cnn_pca_test)
print("Saved:", SAVE_PATH, X_cnn_pca_test.shape)


Saved: /content/drive/MyDrive/IITR/Satellite-project/image_features/resnet50_pca64_test.npy (5404, 64)


In [9]:
import os
import cv2
import numpy as np
from tqdm import tqdm

IMAGES_PATH = "/content/drive/MyDrive/IITR/Satellite-project/images_test"
SAVE_INTERP_PATH = "/content/drive/MyDrive/IITR/Satellite-project/image_features/interp_features_test.npy"

N_IMAGES = len(os.listdir(IMAGES_PATH))
interp_feats = np.zeros((N_IMAGES, 4), dtype=np.float32)

for i in tqdm(range(N_IMAGES)):
    img = cv2.imread(os.path.join(IMAGES_PATH, f"{i}.png"))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    green_ratio = img[:, :, 1].mean() / 255.0
    blue_ratio  = img[:, :, 2].mean() / 255.0
    edges = cv2.Canny(img, 100, 200)
    edge_density = edges.mean() / 255.0
    brightness = img.mean() / 255.0

    interp_feats[i] = [green_ratio, blue_ratio, edge_density, brightness]

np.save(SAVE_INTERP_PATH, interp_feats)
print("Saved:", SAVE_INTERP_PATH, interp_feats.shape)


100%|██████████| 5404/5404 [00:57<00:00, 94.41it/s] 

Saved: /content/drive/MyDrive/IITR/Satellite-project/image_features/interp_features_test.npy (5404, 4)





In [10]:
PCA_TEST_PATH = "/content/drive/MyDrive/IITR/Satellite-project/image_features/resnet50_pca64_test.npy"
FINAL_IMG_PATH = "/content/drive/MyDrive/IITR/Satellite-project/image_features/X_image_test.npy"

X_pca_test = np.load(PCA_TEST_PATH)        # (5404, 64)
X_interp_test = np.load(SAVE_INTERP_PATH)  # (5404, 4)

X_image_test = np.hstack([X_pca_test, X_interp_test])  # (5404, 68)

np.save(FINAL_IMG_PATH, X_image_test)
print("Saved:", FINAL_IMG_PATH, X_image_test.shape)


Saved: /content/drive/MyDrive/IITR/Satellite-project/image_features/X_image_test.npy (5404, 68)


In [11]:
import numpy as np
import pandas as pd
import joblib

MODEL_DIR = "/content/drive/MyDrive/IITR/Satellite-project/models/"
IMG_FEAT_PATH = "/content/drive/MyDrive/IITR/Satellite-project/image_features/X_image_test.npy"
TEST_PATH = "/content/drive/MyDrive/IITR/Satellite-project/Satellite-test.xlsx"

tab_model = joblib.load(MODEL_DIR + "tabular_xgb_fe.pkl")
img_model = joblib.load(MODEL_DIR + "image_residual_xgb_fe.pkl")

X_image_test = np.load(IMG_FEAT_PATH)   # (5404, 68)


In [12]:
df_test = pd.read_excel(TEST_PATH)

test_ids = df_test["id"].copy()

# feature engineering (same as train)
df_test['house_age'] = 2025 - df_test['yr_built']
df_test['is_renovated'] = (df_test['yr_renovated'] > 0).astype(int)
df_test['total_sqft'] = df_test['sqft_above'] + df_test['sqft_basement']
df_test['room_density'] = df_test['sqft_living'] / (df_test['bedrooms'] + 1)

# drop unused
df_test = df_test.drop(columns=['id', 'date'], errors='ignore')

TABULAR_FEATURES = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
    'floors', 'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement',
    'lat', 'long',
    'sqft_living15', 'sqft_lot15',
    'house_age', 'is_renovated', 'total_sqft', 'room_density'
]

X_test_tab = df_test[TABULAR_FEATURES].copy()


In [13]:
base_pred = tab_model.predict(X_test_tab)
residual_pred = img_model.predict(X_image_test)

final_price = base_pred + residual_pred


In [14]:
submission = pd.DataFrame({
    "id": test_ids,
    "predicted_price": final_price
})

OUT_PATH = "/content/drive/MyDrive/IITR/Satellite-project/submission.csv"
submission.to_csv(OUT_PATH, index=False)

print("Saved:", OUT_PATH)
submission.head()


Saved: /content/drive/MyDrive/IITR/Satellite-project/submission.csv


Unnamed: 0,id,predicted_price
0,2591820310,398744.4
1,7974200820,870735.4
2,7701450110,1094724.0
3,9522300010,2199066.0
4,9510861140,772580.6
