In [3]:
import pandas as pd

df = pd.read_csv("socal2.csv")
print(df.head())


   image_id                 street             citi  n_citi  bed  bath  sqft  \
0         0  1317 Van Buren Avenue  Salton City, CA     317    3   2.0  1560   
1         1         124 C Street W      Brawley, CA      48    3   2.0   713   
2         2        2304 Clark Road     Imperial, CA     152    3   1.0   800   
3         3     755 Brawley Avenue      Brawley, CA      48    3   1.0  1082   
4         4  2207 R Carrillo Court     Calexico, CA      55    4   3.0  2547   

    price  
0  201900  
1  228500  
2  273950  
3  350000  
4  385100  


In [14]:
!pip install tensorflow




In [26]:
"""
Multimodal ML – Housing Price Prediction (Images + Tabular)
Works with SoCal dataset or demo synthetic dataset.
"""


import os
import random
import shutil
from pathlib import Path
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

# =====================
# Config
# =====================
USE_DEMO = False   # <<< set False to use SoCal dataset
CSV_PATH = "socal2.csv"  # path to your SoCal csv
IMG_FOLDER = "images"    # folder containing 1.jpg, 2.jpg, etc.
IMG_SIZE = (224, 224)
BATCH_SIZE = 16
EPOCHS = 20
VAL_SPLIT = 0.2
RANDOM_SEED = 42
BACKBONE = "efficientnetb0"  # or "mobilenetv2"

# Features for SoCal dataset
NUMERIC_FEATURES = ["sqft", "bed", "bath", "n_citi"]  # adjust to your CSV
CATEGORICAL_FEATURES = ["citi"]                       # categorical
TARGET_COL = "price"
IMAGE_COL = "image_path"  # we will create this below
# Pick one backbone model for images
BACKBONE = "mobilenet"   # you can also try "efficientnet"

# =====================
# If using real SoCal dataset, fix image paths
# =====================
if not USE_DEMO:
    df = pd.read_csv(CSV_PATH)
    # create full path to image file (assuming images/1.jpg, 2.jpg, ...)
    df[IMAGE_COL] = df["image_id"].apply(lambda x: str(Path(IMG_FOLDER) / f"{x}.jpg"))
else:
    # If demo, generate synthetic csv + fake images
    from pathlib import Path
    import numpy as np
    import random
    import shutil
    def _make_demo_dataset(n=300, root="demo_data"):
        root = Path(root)
        if root.exists():
            shutil.rmtree(root)
        (root / "images").mkdir(parents=True, exist_ok=True)
        rng = np.random.default_rng(RANDOM_SEED)
        cities = ["Karachi", "Lahore", "Islamabad", "Quetta"]
        rows = []
        for i in range(n):
            img = (rng.random((*IMG_SIZE, 3)) * 255).astype(np.uint8)
            img_path = root / "images" / f"house_{i}.jpg"
            tf.keras.utils.save_img(str(img_path), img)
            sqft = rng.integers(500, 5000)
            beds = rng.integers(1, 7)
            baths = rng.integers(1, 5)
            n_citi = rng.integers(10, 500)
            city = random.choice(cities)
            price = sqft*200 + beds*30000 + baths*20000 + rng.normal(0, 50000)
            rows.append({IMAGE_COL: str(img_path), TARGET_COL: price,
                         "sqft": sqft, "bed": beds, "bath": baths,
                         "n_citi": n_citi, "citi": city})
        df = pd.DataFrame(rows)
        csv_path = root / "demo_housing.csv"
        df.to_csv(csv_path, index=False)
        return df
    df = _make_demo_dataset(400)

# =====================
# Split train/val/test
# =====================
train_df, test_df = train_test_split(df, test_size=0.15, random_state=RANDOM_SEED)
train_df, val_df = train_test_split(train_df, test_size=VAL_SPLIT, random_state=RANDOM_SEED)

print("Train/Val/Test sizes:", len(train_df), len(val_df), len(test_df))

# =====================
# Tabular preprocessing
# =====================
def build_tabular_preprocessor(train_df):
    inputs, encoded_feats = {}, []
    for name in NUMERIC_FEATURES:
        inp = keras.Input(shape=(1,), name=name)
        norm = layers.Normalization(name=f"norm_{name}")
        norm.adapt(train_df[name].astype(float).values.reshape(-1,1))
        x = norm(inp)
        inputs[name] = inp
        encoded_feats.append(x)
    for name in CATEGORICAL_FEATURES:
        inp = keras.Input(shape=(1,), dtype=tf.string, name=name)
        lookup = layers.StringLookup(output_mode="int")
        lookup.adapt(train_df[name].astype(str).values)
        encoder = layers.CategoryEncoding(num_tokens=lookup.vocabulary_size(), output_mode="one_hot")
        x = lookup(inp)
        x = encoder(x)
        inputs[name] = inp
        encoded_feats.append(x)
    tab_out = layers.Concatenate()(encoded_feats) if len(encoded_feats)>1 else encoded_feats[0]
    tab_out = layers.Dense(128, activation="relu")(tab_out)
    tab_out = layers.Dropout(0.2)(tab_out)
    tab_out = layers.Dense(64, activation="relu")(tab_out)
    return inputs, tab_out

# =====================
# Image encoder
# =====================
def build_image_encoder(backbone="mobilenet"):
    img_input = keras.Input(shape=(*IMG_SIZE, 3))   # force 3 channels (RGB)

    if backbone == "mobilenet":
        base = keras.applications.MobileNetV2(
            include_top=False,
            input_shape=(*IMG_SIZE, 3),
            weights="imagenet"
        )
        preprocess = keras.applications.mobilenet_v2.preprocess_input
    elif backbone == "efficientnet":
        base = keras.applications.EfficientNetB0(
            include_top=False,
            input_shape=(*IMG_SIZE, 3),
            weights="imagenet"
        )
        preprocess = keras.applications.efficientnet.preprocess_input
    else:
        raise ValueError("Backbone must be either 'mobilenet' or 'efficientnet'")

    x = layers.Lambda(preprocess)(img_input)
    x = base(x, training=False)
    x = layers.GlobalAveragePooling2D()(x)

    return img_input, x

# =====================
# Build multimodal model
# =====================
tab_inputs, tab_feats = build_tabular_preprocessor(train_df)
img_input, img_feats = build_image_encoder(BACKBONE)
combined = layers.Concatenate()([img_feats, tab_feats])
combined = layers.Dense(256, activation="relu")(combined)
combined = layers.Dropout(0.3)(combined)
combined = layers.Dense(128, activation="relu")(combined)
output = layers.Dense(1)(combined)

model = keras.Model(inputs={**tab_inputs, "image": img_input}, outputs=output)
model.compile(optimizer=keras.optimizers.Adam(1e-3),
              loss="mse",
              metrics=[keras.metrics.MeanAbsoluteError(name="mae"),
                       keras.metrics.RootMeanSquaredError(name="rmse")])
model.summary()

# =====================
# tf.data pipeline
# =====================
def make_dataset(frame, training=True):
    def _load_row(row):
        img = tf.io.read_file(row[IMAGE_COL])
        img = tf.image.decode_jpeg(img, channels=3)  # <<< force RGB
        img = tf.image.resize(img, IMG_SIZE)
        feats = {}
        for n in NUMERIC_FEATURES:
            feats[n] = tf.cast(tf.reshape(row[n],(1,)), tf.float32)
        for n in CATEGORICAL_FEATURES:
            feats[n] = tf.reshape(tf.strings.as_string(row[n]), (1,))
        feats["image"] = img
        label = tf.cast(row[TARGET_COL], tf.float32)
        return feats, label
    ds = tf.data.Dataset.from_tensor_slices({c: frame[c].values for c in [IMAGE_COL, TARGET_COL]+NUMERIC_FEATURES+CATEGORICAL_FEATURES})
    ds = ds.map(_load_row, num_parallel_calls=tf.data.AUTOTUNE)
    if training: ds = ds.shuffle(1024, seed=RANDOM_SEED)
    return ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

train_ds, val_ds, test_ds = map(lambda d: make_dataset(d, training=False), [train_df, val_df, test_df])

# =====================
# Train
# =====================
history = model.fit(make_dataset(train_df, True),
                    validation_data=val_ds,
                    epochs=EPOCHS,
                    callbacks=[keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True, monitor="val_rmse")])

# =====================
# Evaluate
# =====================
print(model.evaluate(test_ds, return_dict=True))

# Save model
model.save("multimodal_house_price_model.keras")


Train/Val/Test sizes: 10521 2631 2322
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1us/step





Epoch 1/20


NotFoundError: Graph execution error:

Detected at node ReadFile defined at (most recent call last):
<stack traces unavailable>
Error in user-defined function passed to ParallelMapDatasetV2:13 transformation with iterator: Iterator::Root::Prefetch::BatchV2::Shuffle::ParallelMapV2: NewRandomAccessFile failed to Create/Open: images\15285.jpg : The system cannot find the path specified.
; No such process
	 [[{{node ReadFile}}]]
	 [[IteratorGetNext]] [Op:__inference_multi_step_on_iterator_41754]