In [1]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras import layers, models, Input, Model
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

#loading the pre processed data
train_df = pd.read_csv('train_preprocessed.csv')

#scaling the price_log to be between 0 and 1 to prevent high losses
target_scaler = MinMaxScaler()
train_df['target_scaled'] = target_scaler.fit_transform(train_df[['price_log']])

#splitting the data
train_split, val_split = train_test_split(train_df, test_size=0.2, random_state=42)

In [2]:
def multi_modal_generator(df, image_folder, batch_size=32, target_size=(224, 224), shuffle=True):
    while True:
        if shuffle: df = df.sample(frac=1).reset_index(drop=True)
        for i in range(0, len(df), batch_size):
            batch_df = df.iloc[i:i+batch_size]
            images, tabular, labels = [], [], []
            for _, row in batch_df.iterrows():
                img_path = os.path.join(image_folder, f"{int(row['id'])}.jpg")
                if os.path.exists(img_path):
                    img = img_to_array(load_img(img_path, target_size=target_size)) / 255.0
                    tab_feat = row.drop(['id', 'price', 'price_log', 'target_scaled']).values.astype(np.float32)
                    images.append(img)
                    tabular.append(tab_feat)
                    labels.append(row['target_scaled']) 
            if images:
                yield ((np.array(images), np.array(tabular)), np.array(labels))

output_sig = (
    (tf.TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32), 
     tf.TensorSpec(shape=(None, 15), dtype=tf.float32)),
    tf.TensorSpec(shape=(None,), dtype=tf.float32)
)

In [3]:
#creating the brnaches of the CNN
img_in = Input(shape=(224, 224, 3))
x = layers.Conv2D(16, (3,3), activation='relu')(img_in)
x = layers.MaxPooling2D((2,2))(x)
x = layers.GlobalAveragePooling2D()(x)

tab_in = Input(shape=(15,))
y = layers.Dense(64, activation='relu')(tab_in)
y = layers.Dense(32, activation='relu')(y)

#fusing the branches
merged = layers.concatenate([x, y])
z = layers.Dense(16, activation='relu')(merged)

#using the sigmoid function to ensure the output is between 0 and 1
output = layers.Dense(1, activation='sigmoid')(z) 

model = Model(inputs=[img_in, tab_in], outputs=output)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mse')

model.summary()

In [4]:
train_ds = tf.data.Dataset.from_generator(
    lambda: multi_modal_generator(train_split, 'images/train'), output_signature=output_sig)
val_ds = tf.data.Dataset.from_generator(
    lambda: multi_modal_generator(val_split, 'images/train', shuffle=False), output_signature=output_sig)

history = model.fit(train_ds, steps_per_epoch=len(train_split)//32, 
                    validation_data=val_ds, validation_steps=len(val_split)//32, 
                    epochs=20)

Epoch 1/20
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 233ms/step - loss: 0.0103 - val_loss: 0.0050
Epoch 2/20
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 192ms/step - loss: 0.0050 - val_loss: 0.0047
Epoch 3/20
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 194ms/step - loss: 0.0044 - val_loss: 0.0042
Epoch 4/20
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 194ms/step - loss: 0.0042 - val_loss: 0.0043
Epoch 5/20
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 188ms/step - loss: 0.0040 - val_loss: 0.0039
Epoch 6/20
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 190ms/step - loss: 0.0039 - val_loss: 0.0040
Epoch 7/20
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 181ms/step - loss: 0.0039 - val_loss: 0.0042
Epoch 8/20
[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 172ms/step - loss: 0.0038 - val_loss: 0.0039
Epoch 9/20
[1m4

In [13]:
from sklearn.metrics import r2_score, mean_squared_error
import math

#creating batches of 64 for faster and efficient calculations
BATCH_SIZE = 64
target_size = (224, 224)

def get_model_predictions(df, image_folder):
    all_preds_scaled = []
    num_batches = math.ceil(len(df) / BATCH_SIZE)
        
    for i in range(num_batches):
        start = i * BATCH_SIZE
        end = min((i + 1) * BATCH_SIZE, len(df))
        batch_df = df.iloc[start:end]
        
        batch_images = []
        batch_tabular = []
        
        for _, row in batch_df.iterrows():
            img_path = os.path.join(image_folder, f"{int(row['id'])}.jpg")
            img = img_to_array(load_img(img_path, target_size=target_size)) / 255.0
            to_drop = ['id', 'price', 'price_log', 'target_scaled']
            tab_feat = row.drop(labels=to_drop, errors='ignore').values.astype(np.float32)
            
            batch_images.append(img)
            batch_tabular.append(tab_feat)
        
        preds = model.predict([np.array(batch_images), np.array(batch_tabular)], verbose=0)
        all_preds_scaled.extend(preds)
            
    return np.array(all_preds_scaled)

#transforming data to original and calculating metrics
y_val_pred_scaled = get_model_predictions(val_split, 'images/train')

y_val_pred_log = target_scaler.inverse_transform(y_val_pred_scaled)
y_val_true_log = val_split['price_log'].values.reshape(-1, 1)

y_val_true_dollars = np.expm1(y_val_true_log).flatten()
y_val_pred_dollars = np.expm1(y_val_pred_log).flatten()

r2 = r2_score(y_val_true_log, y_val_pred_log)
rmse = np.sqrt(mean_squared_error(y_val_true_dollars, y_val_pred_dollars))


print(f"Validation R^2 Score: {r2:.4f}")
print(f"Validation RMSE:      ${rmse:,.2f}")


#running same logiic with actual test data
test_df = pd.read_csv('test_preprocessed.csv')
y_test_pred_scaled = get_model_predictions(test_df, 'images/test')

# Inverse transform back to real dollar prices
y_test_pred_log = target_scaler.inverse_transform(y_test_pred_scaled)
y_test_pred_dollars = np.expm1(y_test_pred_log).flatten()

# Create and Save the Final CSV
submission_df = pd.DataFrame({
    'id': test_df['id'].astype(int),
    'predicted_price': y_test_pred_dollars
})

submission_df.to_csv('predictions.csv', index=False)
print("\n 'predictions.csv' has been created.")
display(submission_df.head())

Validation R^2 Score: 0.7106
Validation RMSE:      $177,490.47

 'predictions.csv' has been created.


Unnamed: 0,id,predicted_price
0,-1703146986,424370.1
1,-615733772,784222.0
2,-888484482,1116835.0
3,932365418,1652684.0
4,920926548,602853.4
