## Task 3: Multimodal ML – Housing Price Prediction Using Images + Tabular Data

### Problem Statement
Predict house prices using both **tabular data** (like bedrooms, area, etc.) and **house images**, because images contain important visual information that affects price.

### Objective
Build a **multimodal model** that combines:
- **CNN features from images**
- **Dense features from tabular data**

to predict housing prices and evaluate using **MAE and RMSE**.


In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

IMG_SIZE = (224, 224)
BATCH_SIZE = 8
IMAGE_DIR = "images"

# -----------------------------
# LOAD DATA
# -----------------------------
data = pd.read_csv("housing_data.csv")

# -----------------------------
# GET ALL IMAGES
# -----------------------------
all_images = sorted([f for f in os.listdir(IMAGE_DIR) if f.endswith(".jpg")])

# -----------------------------
# MAP IMAGES TO ROWS
# -----------------------------
data = data.iloc[:len(all_images)].copy()
data["image_path"] = [os.path.join(IMAGE_DIR, f) for f in all_images]

# -----------------------------
# SPLIT
# -----------------------------
y = data["price"]
X = data.drop(["price"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -----------------------------
# SCALE TABULAR
# -----------------------------
tabular_cols = X_train.drop(["id", "date", "image_path"], axis=1).columns
scaler = StandardScaler()

X_train_tab = scaler.fit_transform(X_train[tabular_cols])
X_test_tab = scaler.transform(X_test[tabular_cols])

# -----------------------------
# IMAGE LOADER
# -----------------------------
def load_image(path):
    img = tf.io.read_file(path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, IMG_SIZE)
    img = tf.cast(img, tf.float32) / 255.0
    return img

# -----------------------------
# BUILD DATASETS
# -----------------------------
train_img_ds = tf.data.Dataset.from_tensor_slices(X_train["image_path"].values).map(load_image)
train_tab_ds = tf.data.Dataset.from_tensor_slices(X_train_tab.astype(np.float32))
train_lbl_ds = tf.data.Dataset.from_tensor_slices(y_train.values.astype(np.float32))

train_ds = tf.data.Dataset.zip(((train_img_ds, train_tab_ds), train_lbl_ds))
train_ds = train_ds.shuffle(1024).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

test_img_ds = tf.data.Dataset.from_tensor_slices(X_test["image_path"].values).map(load_image)
test_tab_ds = tf.data.Dataset.from_tensor_slices(X_test_tab.astype(np.float32))
test_lbl_ds = tf.data.Dataset.from_tensor_slices(y_test.values.astype(np.float32))

test_ds = tf.data.Dataset.zip(((test_img_ds, test_tab_ds), test_lbl_ds))
test_ds = test_ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# -----------------------------
# MODEL
# -----------------------------
image_input = tf.keras.Input(shape=(224, 224, 3))
tab_input = tf.keras.Input(shape=(X_train_tab.shape[1],))

x = tf.keras.layers.Conv2D(32, 3, activation="relu")(image_input)
x = tf.keras.layers.MaxPooling2D()(x)
x = tf.keras.layers.Conv2D(64, 3, activation="relu")(x)
x = tf.keras.layers.GlobalAveragePooling2D()(x)
img_feat = tf.keras.layers.Dense(64, activation="relu")(x)

t = tf.keras.layers.Dense(64, activation="relu")(tab_input)
t = tf.keras.layers.Dense(32, activation="relu")(t)

combined = tf.keras.layers.Concatenate()([img_feat, t])
z = tf.keras.layers.Dense(64, activation="relu")(combined)
z = tf.keras.layers.Dense(32, activation="relu")(z)
output = tf.keras.layers.Dense(1)(z)

model = tf.keras.Model(inputs=[image_input, tab_input], outputs=output)
model.compile(optimizer="adam", loss="mse", metrics=["mae"])

# -----------------------------
# TRAIN
# -----------------------------
model.fit(train_ds, validation_data=test_ds, epochs=10)



Epoch 1/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 195ms/step - loss: 212870987776.0000 - mae: 302942.4375 - val_loss: 112258023424.0000 - val_mae: 221719.4531
Epoch 2/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m658s[0m 1s/step - loss: 94771388416.0000 - mae: 180198.0625 - val_loss: 51051065344.0000 - val_mae: 154167.2969
Epoch 3/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m168s[0m 320ms/step - loss: 55939272704.0000 - mae: 149067.7969 - val_loss: 45555609600.0000 - val_mae: 143354.4531
Epoch 4/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m190s[0m 360ms/step - loss: 50297229312.0000 - mae: 141764.7031 - val_loss: 43311833088.0000 - val_mae: 137715.2656
Epoch 5/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 364ms/step - loss: 47102455808.0000 - mae: 135816.5938 - val_loss: 42117713920.0000 - val_mae: 134580.6094
Epoch 6/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1

<keras.src.callbacks.history.History at 0x1c93cdc8410>

In [3]:
for inp in model.inputs:
    print(inp.name)


keras_tensor_13
keras_tensor_14


In [13]:
# -----------------------------
# PREDICTION - SIMPLIFIED
# -----------------------------

# Create image array by loading all test images
test_images = []
for img_path in X_test["image_path"].values:
    img = load_image(img_path).numpy()
    test_images.append(img)
test_images = np.array(test_images)

# Predict using list of inputs
preds = model.predict([test_images, X_test_tab]).ravel()

# -----------------------------
# EVALUATE
# -----------------------------
mae = mean_absolute_error(y_test, preds)
rmse = np.sqrt(mean_squared_error(y_test, preds))

print("MAE:", mae)
print("RMSE:", rmse)


[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 286ms/step
MAE: 116987.83773809524
RMSE: 190907.2034805175


### Summary in One Sentence

This task teaches you how to build a multimodal model that predicts house prices by combining structured data and images using CNN + tabular fusion.