In [2]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import joblib

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

from tensorflow.keras.layers import (
    Input, Dense, Dropout, BatchNormalization,
    GlobalAveragePooling2D, Concatenate,
    RandomFlip, RandomRotation, RandomZoom, RandomContrast
)
from tensorflow.keras.models import Model
from tensorflow.keras.losses import Huber
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.applications.efficientnet import preprocess_input

from tensorflow.keras.models import load_model


# ==============================
# CONFIG
# ==============================
DATA_DIR = r"C:\Users\bhuvi\OneDrive\cdc_project"
TRAIN_CSV = os.path.join(DATA_DIR, "train_processed.csv")
TEST_CSV  = os.path.join(DATA_DIR, "test_processed.csv")

IMG_SIZE = 224
BATCH_SIZE = 32
SEED = 42
EPOCHS_PHASE1 = 15
EPOCHS_PHASE2 = 8

tf.random.set_seed(SEED)
np.random.seed(SEED)

# ==============================
# 1. LOAD DATA
# ==============================
print("Loading data...")
train_df = pd.read_csv(TRAIN_CSV)
test_df  = pd.read_csv(TEST_CSV)

# ------------------------------
# TARGET (DEFINE FIRST)
# ------------------------------
y = train_df['price_log'].values.astype(np.float32)
y = np.nan_to_num(y, nan=np.nanmean(y))
y_mean = y.mean()
y_std = y.std()

# ------------------------------
# SPATIAL FEATURES
# ------------------------------
center_lat = train_df['lat'].mean()
center_long = train_df['long'].mean()

os.makedirs("models", exist_ok=True)
joblib.dump(
    {
        "center_lat": center_lat,
        "center_long": center_long
    },
    "models/spatial_center_new.pkl"
)
print("✅ Spatial center saved (center_lat, center_long)")

def add_spatial_features(df):
    df = df.copy()
    df['dist_from_center'] = np.sqrt(
        (df['lat'] - center_lat)**2 + (df['long'] - center_long)**2
    )
    return df

train_df = add_spatial_features(train_df)
test_df  = add_spatial_features(test_df)

feature_cols = [
    c for c in train_df.columns
    if c not in ['price', 'price_log', 'image_path', 'id']
]
joblib.dump(feature_cols, "models/feature_schema_new.pkl")

# ------------------------------
# SINGLE STRATIFIED SPLIT
# ------------------------------
train_idx, val_idx = train_test_split(
    np.arange(len(train_df)),
    test_size=0.2,
    random_state=SEED,
    stratify=pd.qcut(y, q=10, duplicates='drop')
)

y_train, y_val = y[train_idx], y[val_idx]

# ------------------------------
# TABULAR FEATURES (NO LEAKAGE)
# ------------------------------
scaler = StandardScaler()
X_tab_train = scaler.fit_transform(
    train_df.iloc[train_idx][feature_cols].values
).astype(np.float32)
X_tab_val = scaler.transform(
    train_df.iloc[val_idx][feature_cols].values
).astype(np.float32)

X_tab_train = np.clip(X_tab_train, -5, 5)
X_tab_val   = np.clip(X_tab_val, -5, 5)
X_tab_train = np.nan_to_num(X_tab_train, nan=0.0)
X_tab_val   = np.nan_to_num(X_tab_val, nan=0.0)

# ------------------------------
# IMAGE PATHS
# ------------------------------
train_paths = train_df.iloc[train_idx]['image_path'].values
val_paths   = train_df.iloc[val_idx]['image_path'].values

# ==============================
# 2. TF.DATA PIPELINE
# ==============================
augmentation = tf.keras.Sequential([
    RandomFlip("horizontal"),
    RandomRotation(0.1),
    RandomZoom(0.15),
    RandomContrast(0.1),
], name="augmentation")

def process_image(path, augment=False):
    img = tf.io.read_file(path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, [IMG_SIZE, IMG_SIZE])
    if augment:
        img = augmentation(img)
    img = tf.cast(img, tf.float32)
    img = preprocess_input(img)
    return img

def create_dataset(paths, tabular, labels=None, shuffle=False, augment=False):
    if labels is not None:
        ds = tf.data.Dataset.from_tensor_slices(((paths, tabular), labels))
    else:
        ds = tf.data.Dataset.from_tensor_slices((paths, tabular))

    if shuffle:
        ds = ds.shuffle(len(paths), seed=SEED, reshuffle_each_iteration=True)

    def map_with_label(inputs, label):
        path, tab = inputs
        img = process_image(path, augment)
        return (img, tab), label

    def map_no_label(path, tab):
        img = process_image(path, augment)
        return (img, tab)

    if labels is not None:
        ds = ds.map(map_with_label, num_parallel_calls=tf.data.AUTOTUNE)
    else:
        ds = ds.map(map_no_label, num_parallel_calls=tf.data.AUTOTUNE)

    return ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

train_ds = create_dataset(train_paths, X_tab_train, y_train, shuffle=True, augment=True)
val_ds   = create_dataset(val_paths, X_tab_val, y_val, shuffle=False, augment=False)

# ==============================
# 3. MODEL — FIXED & OPTIMIZED
# ==============================
def build_model(tab_dim, y_mean):
    img_input = Input(shape=(IMG_SIZE, IMG_SIZE, 3), name="image_input")
    tab_input = Input(shape=(tab_dim,), name="tabular_input")

    # ✅ EfficientNetB0 backbone
    base_model = EfficientNetB0(
        weights="imagenet",
        include_top=False,
        input_tensor=img_input
    )
    base_model.trainable = False  # Phase 1 freeze

    # Image branch
    x_img = base_model.output
    x_img = GlobalAveragePooling2D()(x_img)
    x_img = Dense(512, activation="relu")(x_img)
    x_img = BatchNormalization()(x_img)
    x_img = Dropout(0.5)(x_img)
    x_img = Dense(256, activation="relu")(x_img)
    x_img = Dropout(0.4)(x_img)

    # Tabular branch (unchanged)
    x_tab = Dense(256, activation="relu")(tab_input)
    x_tab = BatchNormalization()(x_tab)
    x_tab = Dropout(0.4)(x_tab)
    x_tab = Dense(128, activation="relu")(x_tab)
    x_tab = BatchNormalization()(x_tab)
    x_tab = Dropout(0.3)(x_tab)
    
    # fusion branch
    x = Concatenate()([x_img, x_tab])
    x = Dense(256, activation="relu")(x)
    x = BatchNormalization()(x)
    x = Dropout(0.4)(x)
    x = Dense(128, activation="relu")(x)
    x = Dropout(0.3)(x)
    x = Dense(64, activation="relu")(x)


    output = Dense(
        1,
        activation="linear",
        bias_initializer=tf.keras.initializers.Constant(y_mean)
    )(x)

    model = Model(inputs=[img_input, tab_input], outputs=output)
    return model, base_model

model, base_model = build_model(len(feature_cols), y_mean)
model.summary()

# ==============================
# 4. PHASE 1: TRAIN HEAD
# ==============================
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
    loss=Huber(delta=2.0),
    metrics=['mae']
)

callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=1),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-7, verbose=1)
]

print("\n=== PHASE 1: Training Head Layers ===")
model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS_PHASE1, callbacks=callbacks, verbose=1)

# ==============================
# 5. PHASE 2: FINE-TUNING (NOW WORKS!)
# ==============================
print("\n=== PHASE 2: Fine-tuning EfficientNetB0 Top Layers ===")
base_model.trainable = True

# Keep BatchNorm layers frozen
for layer in base_model.layers:
    if isinstance(layer, BatchNormalization):
        layer.trainable = False

# Unfreeze top 50 layers
num_finetune = 50
for layer in base_model.layers[:-num_finetune]:
    layer.trainable = False

print(f"Fine-tuning the top {num_finetune} layers of EfficientNetB0.")

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss=Huber(delta=2.0),
    metrics=['mae']
)

model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS_PHASE2, callbacks=callbacks, verbose=1)

# ==============================
# 6. FINAL EVALUATION
# ==============================
print("\n=== Final Validation Evaluation ===")
log_preds = model.predict(val_ds, verbose=0).ravel()
log_preds = np.clip(log_preds, y_mean - 6*y_std, y_mean + 6*y_std)

rmse_log = np.sqrt(mean_squared_error(y_val, log_preds))
r2_log = r2_score(y_val, log_preds)

price_true = np.expm1(y_val)
price_pred = np.expm1(log_preds)

rmse_price = np.sqrt(mean_squared_error(price_true, price_pred))
r2_price = r2_score(price_true, price_pred)
mape = np.mean(np.abs((price_true - price_pred) / (price_true + 1e-6))) * 100

print("="*60)
print("FINAL VALIDATION METRICS")
print("="*60)
print(f"Log RMSE     : {rmse_log:.4f}")
print(f"Log R²       : {r2_log:.4f}")
print(f"Price RMSE   : ${rmse_price:,.0f}")
print(f"Price R²     : {r2_price:.4f}")
print(f"Price MAPE   : {mape:.2f}%")
print("="*60)

os.makedirs("models", exist_ok=True)

MODEL_PATH = "models/multimodal_final_new.keras"
SCALER_PATH = "models/tabular_scaler_new.pkl"

model.save(MODEL_PATH)
joblib.dump(scaler, SCALER_PATH)

print(f"\n✅ Model saved to: {MODEL_PATH}")
print(f"✅ Scaler saved to: {SCALER_PATH}")

Loading data...
✅ Spatial center saved (center_lat, center_long)



=== PHASE 1: Training Head Layers ===
Epoch 1/15
[1m406/406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m799s[0m 2s/step - loss: 0.2103 - mae: 0.4971 - val_loss: 0.0493 - val_mae: 0.2443 - learning_rate: 3.0000e-04
Epoch 2/15
[1m406/406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m791s[0m 2s/step - loss: 0.0967 - mae: 0.3464 - val_loss: 0.0370 - val_mae: 0.2086 - learning_rate: 3.0000e-04
Epoch 3/15
[1m406/406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m449s[0m 1s/step - loss: 0.0728 - mae: 0.2991 - val_loss: 0.0315 - val_mae: 0.1917 - learning_rate: 3.0000e-04
Epoch 4/15
[1m406/406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m408s[0m 1s/step - loss: 0.0588 - mae: 0.2690 - val_loss: 0.0291 - val_mae: 0.1834 - learning_rate: 3.0000e-04
Epoch 5/15
[1m406/406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m401s[0m 987ms/step - loss: 0.0522 - mae: 0.2523 - val_loss: 0.0280 - val_mae: 0.1795 - learning_rate: 3.0000e-04
Epoch 6/15
[1m406/406[0m [32m━━━━━━━━━━━━━━━━━━━━

In [6]:
test_df.columns

Index(['id', 'bedrooms', 'bathrooms', 'sqft_living', 'floors', 'waterfront',
       'view', 'condition', 'grade', 'lat', 'long', 'sqft_living15',
       'sqft_lot15', 'image_path', 'age', 'renovated', 'has_basement',
       'sqft_lot_log', 'dist_from_center'],
      dtype='object')

In [7]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import joblib

from tensorflow.keras.models import load_model
from tensorflow.keras.applications.efficientnet import preprocess_input

# ==============================
# CONFIG
# ==============================
DATA_DIR = r"C:\Users\bhuvi\OneDrive\cdc_project"
TEST_CSV = os.path.join(DATA_DIR, "test_processed.csv")

MODEL_PATH   = "models/multimodal_final.keras"
SCALER_PATH  = "models/tabular_scaler.pkl"
SPATIAL_PATH = "models/spatial_center.pkl"

IMG_SIZE = 224
BATCH_SIZE = 32

# ==============================
# 1. LOAD MODEL & ARTIFACTS
# ==============================
print("🔄 Loading model & artifacts...")

model  = load_model(MODEL_PATH, compile=False)
scaler = joblib.load(SCALER_PATH)

spatial = joblib.load(SPATIAL_PATH)
center_lat  = spatial["center_lat"]
center_long = spatial["center_long"]

print("✅ Model, scaler, spatial center loaded")

# ==============================
# 2. LOAD TEST DATA
# ==============================
test_df = pd.read_csv(TEST_CSV)
print(f"📂 Test samples: {len(test_df)}")

# ==============================
# 3. FEATURE ENGINEERING (SAME AS TRAINING)
# ==============================
test_df["dist_from_center"] = np.sqrt(
    (test_df["lat"] - center_lat) ** 2 +
    (test_df["long"] - center_long) ** 2
)

# # 🔒 EXACT FEATURE ORDER USED IN TRAINING
# FEATURE_COLS = [
#     "bedrooms",
#     "bathrooms",
#     "sqft_living",
#     "floors",
#     "view",
#     "condition",
#     "lat",
#     "long",
#     "sqft_living15",
#     "has_basement",
#     "relative_size",
#     "house_age",
#     "dist_from_center"
# ]

# # Safety: ensure all columns exist
# for col in FEATURE_COLS:
#     if col not in test_df.columns:
#         test_df[col] = 0.0

# # ==============================
# # TABULAR FEATURES (FINAL FIX)
# # ==============================
# X_tab_test_raw = test_df[FEATURE_COLS].values.astype(np.float32)

# expected = scaler.n_features_in_   # 17
# current  = X_tab_test_raw.shape[1] # 13

# if current < expected:
#     pad = expected - current
#     print(f"⚠️ Padding {pad} missing features with zeros")
#     pad_values = np.tile(
#     scaler.mean_[current:], 
#     (X_tab_test_raw.shape[0], 1))
#     X_tab_test_raw = np.hstack([X_tab_test_raw, pad_values])
# ==============================
# TABULAR FEATURES (CORRECT)
# ==============================

# Load feature schema used during training
FEATURE_COLS = joblib.load("models/feature_schema_new.pkl")

# Build tabular matrix in correct order
X_tab_test = test_df[FEATURE_COLS].values.astype(np.float32)

# Scale
X_tab_test = scaler.transform(X_tab_test)
X_tab_test = np.clip(X_tab_test, -5, 5)
X_tab_test = np.nan_to_num(X_tab_test, nan=0.0)

print("✅ Final tabular shape passed to model:", X_tab_test.shape)


# ==============================
# 5. IMAGE PIPELINE (INFERENCE)
# ==============================
def process_image(path):
    img = tf.io.read_file(path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, [IMG_SIZE, IMG_SIZE])
    img = preprocess_input(img)
    return img

def create_test_dataset(paths, tabular):
    ds = tf.data.Dataset.from_tensor_slices((paths, tabular))

    def map_fn(path, tab):
        img = process_image(path)
        return {
            "image_input": img,
            "tabular_input": tab
        }

    ds = ds.map(map_fn, num_parallel_calls=tf.data.AUTOTUNE)
    return ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

test_paths = test_df["image_path"].values
test_ds = create_test_dataset(test_paths, X_tab_test)

# ==============================
# 6. PREDICT
# ==============================
print("🔮 Generating predictions...")
log_preds = model.predict(test_ds, verbose=1).ravel()

# Convert log-price → price
price_preds = np.expm1(log_preds)

print(f"📈 Mean predicted price: ${price_preds.mean():,.0f}")
print(f"📉 Min / Max price: ${price_preds.min():,.0f} / ${price_preds.max():,.0f}")

# ==============================
# 7. SAVE SUBMISSION
# ==============================
submission_new_new = pd.DataFrame({
    "id": test_df["id"],
    "price": price_preds
})

submission_new_new.to_csv("submission_new_new.csv", index=False)

print("\n✅ submission_new_new.csv saved")
print("\n🔍 First 5 predictions:")
print(submission_new_new.head())

🔄 Loading model & artifacts...
✅ Model, scaler, spatial center loaded
📂 Test samples: 5404
✅ Final tabular shape passed to model: (5404, 17)
🔮 Generating predictions...
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 785ms/step
📈 Mean predicted price: $510,060
📉 Min / Max price: $161,105 / $2,903,780

✅ submission_new_new.csv saved

🔍 First 5 predictions:
           id         price
0  2591820310  3.663122e+05
1  7974200820  7.302819e+05
2  7701450110  1.094297e+06
3  9522300010  1.536391e+06
4  9510861140  6.912469e+05
