In [19]:
import pandas as pd
import numpy as np

# to use with image url
from PIL import Image, ImageOps
import requests
from io import BytesIO

import tensorflow as tf
import os


from tensorflow_text.python.ops.fast_wordpiece_tokenizer import FastWordpieceTokenizer
from keras_nlp.layers import StartEndPacker
from tensorflow_text import normalize_utf8
import keras_nlp

from tensorflow.keras import regularizers
from tensorflow.keras.utils import register_keras_serializable 

PROJECT_DIR = "./"

file_path = "./datasets/train_cleaned.csv"
df = pd.read_csv(file_path)

# Train tokenizer and construct vocab (same as V1)

In [20]:
class CustomTokenizer():

    def __init__(self, vocab_path, max_length):
        self.packer = StartEndPacker(sequence_length=max_length,pad_value=0)
        self.unk_token = '[UNK]'
        self.vocabulary = self._get_vocab_list(vocab_path)
        self.tokenizer = FastWordpieceTokenizer(
            vocab=self.vocabulary,
            suffix_indicator='##',
            unknown_token=self.unk_token,
            support_detokenization=True
        )

    @staticmethod
    def _preprocess(text):
        text_normalized = normalize_utf8(text, "NFD")
        text_stripped_accents = tf.strings.regex_replace(text_normalized, r"\p{Mn}", "")
        lowercase = tf.strings.lower(text_stripped_accents)
        return lowercase

    def _get_vocab_list(self, vocab_path):
        vclist = []

        with open(vocab_path, "r") as f:
            vclist.extend(f.read().splitlines())
            seen = set()
            vclist = [x for x in vclist if not (x in seen or seen.add(x))]

        if self.unk_token not in vclist:
            vclist = [vclist[0]] +  [self.unk_token]  + vclist[1:]
        return vclist

    def tokenize(self, text):
        text = self._preprocess(text)
        tokens = self.tokenizer.tokenize(text)
        return self.packer(tokens)

    def detokenize(self, tokens):
        return self.tokenizer.detokenize(tokens)

    def __call__(self, text):
        return self.tokenize(text)

In [28]:
def train_word_piece(ds, vocab_size, query_or_desc, vocab_file=None):
    if query_or_desc == "title":
        word_piece_ds = ds.map(lambda x: x["title"])
    elif query_or_desc == 'desc':
        word_piece_ds = ds.map(lambda x: x["description"])

    vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
        word_piece_ds.batch(10000).prefetch(1000),
        vocabulary_output_file=vocab_file,
        vocabulary_size=vocab_size,
        lowercase=True,
        strip_accents=True,
        reserved_tokens=["[PAD]", "[UNK]"],
    )
    return vocab

def get_tokenizer(train, vocab_size, csv_size, vocab_type, max_tokens, force_train=False):
    vocab_filename = f"vocab/{csv_size}_{vocab_type}"
    vocab_path = os.path.join(PROJECT_DIR, vocab_filename)
    print(vocab_path)
    try:
        if force_train:
            raise FileNotFoundError()

        tokenizer = CustomTokenizer(vocab_path, max_tokens)

    except FileNotFoundError:
        print("Training tokenizer...")
        train_word_piece(train, vocab_size, vocab_type, vocab_path)

        tokenizer = CustomTokenizer(vocab_path, max_tokens)

    final_vocab_size = len(tokenizer.vocabulary)
    print(f"Loaded tokenizer from '{vocab_filename}' with final vocab size: {final_vocab_size:,}")
    print("Sample tokens: ", np.random.choice(tokenizer.vocabulary, 20))
    return tokenizer, final_vocab_size

# pre process image and other input cols (same as V1)

In [23]:
def resize_and_pad_image(url, target_size=(320, 320)):
    response = requests.get(url)
    img = Image.open(BytesIO(response.content)).convert("RGB")
    img.thumbnail(target_size, Image.Resampling.LANCZOS)
    
    delta_w = target_size[0] - img.width
    delta_h = target_size[1] - img.height
    padding = (
        delta_w // 2, 
        delta_h // 2, 
        delta_w - (delta_w // 2), 
        delta_h - (delta_h // 2)
        )
    
    padded_img = ImageOps.expand(img, padding, fill=(0, 0, 0))
    
    return padded_img

def normalize_image(pil_image):
    img_array = np.array(pil_image).astype(np.float32) / 255.0

    img_tensor = tf.convert_to_tensor(img_array)
    img_tensor = tf.expand_dims(img_tensor, axis=0)  
    return img_tensor

def process_image(url):
    def _load_and_process(url_str):
        url_decoded = url_str.numpy().decode()
        img = resize_and_pad_image(url_decoded)
        img_tensor = normalize_image(img)
        return img_tensor[0]  # remove batch dim

    img = tf.py_function(func=_load_and_process, inp=[url], Tout=tf.float32)
    img.set_shape([320, 320, 3])
    return img

In [24]:
condition_cols = [col for col in df.columns if col.startswith("condition_")]
df[condition_cols] = df[condition_cols].astype(np.int32)

df["year"] = df["year"].astype(np.int32)

sin_cos_cols = ["month_sin", "month_cos", "day_of_week_sin", "day_of_week_cos"]
df[sin_cos_cols] = df[sin_cos_cols].astype(np.float32)

df["title"] = df["title"].astype(str)
df["description"] = df["description"].astype(str)
df["image_url"] = df["image_url"].astype(str)

df.fillna("", inplace=True)  # for strings

ds = tf.data.Dataset.from_tensor_slices({
    "title": df["title"].values,
    "description": df["description"].values,
    "image_url": df["image_url"].values,
    "condition_BRAND NEW": df["condition_BRAND NEW"].values,
    "condition_HEAVILY USED": df["condition_HEAVILY USED"].values,
    "condition_LIGHTLY USED": df["condition_LIGHTLY USED"].values,
    "condition_LIKE NEW": df["condition_LIKE NEW"].values,
    "condition_WELL USED": df["condition_WELL USED"].values,
    "year": df["year"].values,
    "month_sin": df["month_sin"].values,
    "month_cos": df["month_cos"].values,
    "day_of_week_sin": df["day_of_week_sin"].values,
    "day_of_week_cos": df["day_of_week_cos"].values,
    "price_log": df["price_log"].values, # target
})


In [29]:
shuffled = ds.shuffle(len(ds), seed = 42, reshuffle_each_iteration=False)
train_ds_raw = shuffled.take(int(len(ds) * 0.9))
test_ds_raw = shuffled.skip(int(len(ds) * 0.9))

def cyclical_encode(value, max_value):
    sin_val = round(np.sin(2 * np.pi * value / max_value), 2)
    cos_val = round(np.cos(2 * np.pi * value / max_value), 2)
    return sin_val, cos_val

def tokenize(title, description, title_tokenizer, desc_tokenizer):
    x = title_tokenizer(title)
    y = desc_tokenizer(description)
    return {
        "title_tokens": x,
        "desc_tokens": y,
    }

In [26]:
def preprocess(inputs, title_tokenizer, desc_tokenizer):
    title = inputs["title"]
    desc = inputs["description"]
    img_url = inputs["image_url"]
    tokenized = tokenize(title, desc, title_tokenizer, desc_tokenizer)
    img_tensor = process_image(img_url)

    price = inputs["price_log"]

    structured = {
        key: tf.cast(inputs[key], tf.float32)
        for key in [
            "condition_BRAND NEW", "condition_HEAVILY USED", "condition_LIGHTLY USED",
            "condition_LIKE NEW", "condition_WELL USED",
            "year", "month_sin", "month_cos", "day_of_week_sin", "day_of_week_cos"
        ]
    }

    model_inputs = {
        "image_url": img_tensor,
        "title": tokenized["title_tokens"],
        "description": tokenized["desc_tokens"],
        **structured
    }

    return model_inputs, tf.cast(price, tf.float32)  # model input, target

# Smaller tokens (factor of 3)

In [30]:
csv_size = "train_cleaned.csv"
MAX_TITLE_VOCAB_SIZE = 500
MAX_DESC_VOCAB_SIZE = 3000

vocab_type = "title"
title_max_tokens = 8
title_tokenizer, TITLE_VOCAB_SIZE = get_tokenizer(
    train_ds_raw, MAX_TITLE_VOCAB_SIZE, csv_size, vocab_type, title_max_tokens, force_train=False)

vocab_type = "desc"
desc_max_tokens = 16
desc_tokenizer, DESC_VOCAB_SIZE = get_tokenizer(
    train_ds_raw, MAX_DESC_VOCAB_SIZE, csv_size, vocab_type, desc_max_tokens, force_train=False)

./vocab/train_cleaned.csv_title
Loaded tokenizer from 'vocab/train_cleaned.csv_title' with final vocab size: 434
Sample tokens:  ['"' '解' '##!' 'consoles' '种' 'n' '##ore' '6' 'dual' '##版' '##x' 'brand'
 '##m' '❄' 'adapter' '##👾' 'selling' 'nintendo' 'all' 'selling']
./vocab/train_cleaned.csv_desc
Loaded tokenizer from 'vocab/train_cleaned.csv_desc' with final vocab size: 2,321
Sample tokens:  ['##\U0001fa77' '##us' 'call' '##💽' '\U0001fa77' 'together' 'warfare'
 '##ス' '##😫' 'fix' 'thanks' '##❗' '##🏦' '##!' 'xbox' 'block' '##view'
 'was' 'dead' '{']


In [31]:
print(title_tokenizer(["xbox for sale"]))
print(desc_tokenizer(["This is a sample description for the dataset."]))

tf.Tensor([[105 143 162   0   0   0   0   0]], shape=(1, 8), dtype=int64)
tf.Tensor(
[[ 413  366   40   58  811 1591 1248  360  358   43 1575  892  464   15
     0    0]], shape=(1, 16), dtype=int64)


In [11]:
def print_sample(text, tokenizer, type_):
    print(f'>> {type_}')
    tokens = tokenizer(text)
    print("Text:\t\t", text)
    print("Tokens:\t\t", tokens)
    print("Recovered:\t", tokenizer.detokenize(tokens).numpy().decode())
    print('\n')


sample = df.sample(1)
print_sample(sample['title'].values[0], title_tokenizer, 'Title')
print_sample(sample['description'].values[0], desc_tokenizer, 'Description')

>> Title
Text:		 PlayStation 3 Console
Tokens:		 tf.Tensor([111  20 115   0   0   0   0   0], shape=(8,), dtype=int64)
Recovered:	 playstation 3 console [PAD] [PAD] [PAD] [PAD] [PAD]


>> Description
Text:		 nan
Tokens:		 tf.Tensor([453   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0], shape=(16,), dtype=int64)
Recovered:	 nan [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]




In [32]:
# check tokenizer outputs, sanity check

print(title_tokenizer(['gaming can be cheap']))
print(desc_tokenizer(['buy this amazing tool super cheap']))

# tf.Tensor([[262 310  33 137  34 243 137 317]], shape=(1, 8), dtype=int64)
# tf.Tensor(
# [[ 537  413  746  486 1924  599  607  514  672    0    0    0    0    0
#      0    0]], shape=(1, 16), dtype=int64)

tf.Tensor([[262 310  33 137  34 243 137 317]], shape=(1, 8), dtype=int64)
tf.Tensor(
[[ 537  413  746  486 1924  599  607  514  672    0    0    0    0    0
     0    0]], shape=(1, 16), dtype=int64)


In [33]:
train_ds = train_ds_raw.map(
    lambda x: preprocess(x, title_tokenizer, desc_tokenizer), num_parallel_calls=tf.data.AUTOTUNE)
train_ds = train_ds.batch(32).prefetch(tf.data.AUTOTUNE)

test_ds = test_ds_raw.map(
    lambda x: preprocess(x, title_tokenizer, desc_tokenizer), num_parallel_calls=tf.data.AUTOTUNE)
test_ds = test_ds.batch(32).prefetch(tf.data.AUTOTUNE)

# Baseline set

In [None]:
total = 0.0
count = 0

for _, price_log_batch in train_ds:
    total += tf.reduce_sum(price_log_batch).numpy()
    count += price_log_batch.shape[0]

mean_price_log = total / count
print("Mean of price_log:", mean_price_log)


Mean of price_log: 4.96927619525621


In [48]:
mae_total = 0.0
mse_total = 0.0
n = 0

for _, price_log_batch in test_ds:
    batch_size = price_log_batch.shape[0]
    baseline_preds = tf.ones_like(price_log_batch) * mean_price_log
    
    mae_total += tf.reduce_sum(tf.abs(price_log_batch - baseline_preds)).numpy()
    mse_total += tf.reduce_sum(tf.square(price_log_batch - baseline_preds)).numpy()
    n += batch_size

mae = mae_total / n
mse = mse_total / n
rmse = np.sqrt(mse)

print(f"Baseline MAE: {mae}")
print(f"Baseline MSE: {mse}")
print(f"Baseline RMSE: {rmse}")

Baseline MAE: 0.7929205028543173
Baseline MSE: 1.2709249491841683
Baseline RMSE: 1.1273530721048168


# Model Architechture

In [8]:
''' 
V3 tries to use far less params by the following ways

1. Under image encoder
1.1 use Gloval Avg Pool instead of Flatten
1.2 L2 regularization 

2. Under Price Predictor class
2.1 higher dropout rate
2.2 embedding dims for title and desc reduced to 64 (from 128 earlier)
2.3 L2 regularization
2.4 other features passed through a dense layer, before concat 

Simplified final layer : 32 > 1
For V1 it was : 256 > 128 > 1
'''

' \nV3 tries to use far less params by the following ways\n\n1. Under image encoder\n1.1 use Gloval Avg Pool instead of Flatten\n1.2 L2 regularization \n\n2. Under Price Predictor class\n2.1 higher dropout rate\n2.2 embedding dims for title and desc reduced to 64 (from 128 earlier)\n2.3 L2 regularization\n2.4 other features passed through a dense layer, before concat \n\nSimplified final layer : 32 > 1\nFor V1 it was : 256 > 128 > 1\n'

In [34]:
@register_keras_serializable()
class CNNImageEncoder(tf.keras.Model):
    def __init__(self, activation='relu', kernel_size=(3, 3), pool_size=(2, 2), **kwargs):
        super(CNNImageEncoder, self).__init__(**kwargs)

        self.cnn_layers = tf.keras.Sequential([
            tf.keras.layers.Conv2D(32, kernel_size, activation=activation,
                                   input_shape=(320, 320, 3),
                                   kernel_regularizer=regularizers.l2(1e-4)),
            tf.keras.layers.MaxPooling2D(pool_size, strides=(2, 2)),

            tf.keras.layers.Conv2D(64, kernel_size, activation=activation,
                                   kernel_regularizer=regularizers.l2(1e-4)),
            tf.keras.layers.MaxPooling2D(pool_size, strides=(2, 2)),

            tf.keras.layers.Conv2D(128, kernel_size, activation=activation,
                                   kernel_regularizer=regularizers.l2(1e-4)),
            tf.keras.layers.MaxPooling2D(pool_size, strides=(2, 2)),

            # Replace flatten with global pooling to reduce params, compared to V1
            tf.keras.layers.GlobalAveragePooling2D(),
        ])

    def call(self, image_inputs):
        return self.cnn_layers(image_inputs)
    
    def build(self, input_shape):
        self.cnn_layers.build(input_shape)
        super().build(input_shape)

    def get_config(self):
        return {
            "activation": self.activation,
            "kernel_size": self.kernel_size,
            "pool_size": self.pool_size,
        }

    @classmethod
    def from_config(cls, config):
        return cls(**config)


DROPOUT = 0.3 # higher dropout as compared with V1

@register_keras_serializable()
class PricePredictor(tf.keras.Model):
    def __init__(self, title_vocab_size=434, desc_vocab_size=2321,**kwargs):
        super().__init__(**kwargs)

        self.title_vocab_size = title_vocab_size or 434
        self.desc_vocab_size = desc_vocab_size or 2321
        
        self.image_encoder = CNNImageEncoder()

        self.title_embedding_layer = tf.keras.layers.Embedding(title_vocab_size, 64, mask_zero=True)
        self.desc_embedding_layer = tf.keras.layers.Embedding(desc_vocab_size, 64, mask_zero=True)

        self.title_dense_layers = tf.keras.Sequential([
            tf.keras.layers.GlobalAveragePooling1D(),
            tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(1e-4)),
            tf.keras.layers.Dropout(DROPOUT)
        ])

        self.desc_dense_layers = tf.keras.Sequential([
            tf.keras.layers.GlobalAveragePooling1D(),
            tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(1e-4)),
            tf.keras.layers.Dropout(DROPOUT)
        ])

        self.other_features_processing = tf.keras.Sequential([
            tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=regularizers.l2(1e-4)),
            tf.keras.layers.Dropout(DROPOUT)
        ])

        # Final head with reduced complexity
        self.final_layers = tf.keras.Sequential([
            tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=regularizers.l2(1e-4)),
            tf.keras.layers.Dropout(DROPOUT),
            tf.keras.layers.Dense(1)
        ])

    def call(self, inputs):
        image_input = inputs["image_url"]
        title_tokens = inputs["title"]
        desc_tokens = inputs["description"]

        image_features = self.image_encoder(image_input)

        title_embeddings = self.title_embedding_layer(title_tokens)
        title_features = self.title_dense_layers(title_embeddings)

        desc_embeddings = self.desc_embedding_layer(desc_tokens)
        desc_features = self.desc_dense_layers(desc_embeddings)

        other_raw_features = tf.keras.layers.concatenate([
            tf.expand_dims(inputs["condition_BRAND NEW"], axis=1),
            tf.expand_dims(inputs["condition_HEAVILY USED"], axis=1),
            tf.expand_dims(inputs["condition_LIGHTLY USED"], axis=1),
            tf.expand_dims(inputs["condition_LIKE NEW"], axis=1),
            tf.expand_dims(inputs["condition_WELL USED"], axis=1),
            tf.expand_dims(inputs["year"], axis=1),
            tf.expand_dims(inputs["month_sin"], axis=1),
            tf.expand_dims(inputs["month_cos"], axis=1),
            tf.expand_dims(inputs["day_of_week_sin"], axis=1),
            tf.expand_dims(inputs["day_of_week_cos"], axis=1),
        ])

        other_features = self.other_features_processing(other_raw_features)

        concatenated_features = tf.keras.layers.concatenate([
            image_features,
            title_features,
            desc_features,
            other_features
        ])

        return self.final_layers(concatenated_features)
    
    def get_config(self):
        return {
            "title_vocab_size": self.title_vocab_size,
            "desc_vocab_size": self.desc_vocab_size,
        }

    @classmethod
    def from_config(cls, config):
        return cls(**config)

## introduce the validation split

In [16]:
valid_df = pd.read_csv("./datasets/validation.csv")

valid_df = pd.get_dummies(valid_df, columns=['condition'], dtype=int)
valid_df['date_sold'] = pd.to_datetime(valid_df['date_sold'])
valid_df['year'] = valid_df['date_sold'].dt.year - 2024 # as we normalize with train set
valid_df['month_sin'], valid_df['month_cos'] = zip(
    *valid_df['date_sold'].dt.month.apply(lambda x: cyclical_encode(x, 12))
    )
valid_df['day_of_week_sin'], valid_df['day_of_week_cos'] = zip(
    *valid_df['date_sold'].dt.dayofweek.apply(lambda x: cyclical_encode(x, 7))
    )
valid_df.drop('date_sold', axis=1, inplace=True)

# convert target to log price
valid_df['price_log'] = np.log1p(valid_df['price'])
valid_df.drop('price', axis=1, inplace=True)

valid_df["title"] = valid_df["title"].astype(str)
valid_df["description"] = valid_df["description"].astype(str)
valid_df["image_url"] = valid_df["image_url"].astype(str)

valid_df.fillna("", inplace=True)

In [17]:
valid_ds_raw = tf.data.Dataset.from_tensor_slices({
    "title": valid_df["title"].values,
    "description": valid_df["description"].values,
    "image_url": valid_df["image_url"].values,
    "condition_BRAND NEW": valid_df["condition_BRAND NEW"].values,
    "condition_HEAVILY USED": valid_df["condition_HEAVILY USED"].values,
    "condition_LIGHTLY USED": valid_df["condition_LIGHTLY USED"].values,
    "condition_LIKE NEW": valid_df["condition_LIKE NEW"].values,
    "condition_WELL USED": valid_df["condition_WELL USED"].values,
    "year": valid_df["year"].values,
    "month_sin": valid_df["month_sin"].values,
    "month_cos": valid_df["month_cos"].values,
    "day_of_week_sin": valid_df["day_of_week_sin"].values,
    "day_of_week_cos": valid_df["day_of_week_cos"].values,
    "price_log": valid_df["price_log"].values, 
})

BATCH_SIZE = 32

valid_ds = valid_ds_raw.map(
    lambda x: preprocess(x, title_tokenizer, desc_tokenizer), num_parallel_calls=tf.data.AUTOTUNE)
valid_ds = valid_ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)


In [18]:
TITLE_VOCAB_SIZE = len(title_tokenizer.vocabulary)
DESC_VOCAB_SIZE = len(desc_tokenizer.vocabulary)

model = PricePredictor(
    title_vocab_size=TITLE_VOCAB_SIZE,
    desc_vocab_size=DESC_VOCAB_SIZE
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [19]:
print(model.title_vocab_size)
print(model.desc_vocab_size)


434
2321


In [20]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.MeanAbsoluteError()]
)


In [21]:
model.summary(expand_nested=True)

## to see what we feed into the CNN part of the model


In [22]:

# import matplotlib.pyplot as plt
# import numpy as np

# # Show 10 sample images
# count = 0
# for batch in train_ds.take(5):  # Adjust if batch_size is large/small
#     images = batch[0]["image_url"]
#     for i in range(images.shape[0]):
#         if count >= 10:
#             break
#         img = images[i].numpy()

#         # Fix: Rescale if float
#         if img.dtype == np.float32 and np.max(img) <= 1.0:
#             img_vis = (img * 255).astype("uint8")
#         elif img.dtype == np.float32:
#             img_vis = np.clip(img, 0, 255).astype("uint8")
#         else:
#             img_vis = img.astype("uint8")

#         plt.figure()
#         plt.imshow(img_vis)  # or just img if dtype is float32
#         plt.title(f"Sample {count}")
#         plt.axis("off")
#         plt.show()

#         # Also print basic stats
#         print(f"Sample {count}: shape={img.shape}, min={np.min(img)}, max={np.max(img)}, mean={np.mean(img)}")

#         count += 1
#     if count >= 10:
#         break


# train block below

In [23]:
from keras.callbacks import CSVLogger
# change model name
model_name = "V3"

log_file_path = os.path.join(PROJECT_DIR, f"training_log_{model_name}.csv")
csv_logger = CSVLogger(log_file_path)

print(model_name, log_file_path)

V3 ./training_log_V3.csv


In [26]:
# EPOCHS = 10
# BATCH_SIZE = 32

# model.fit(
#     train_ds, 
#     epochs=EPOCHS,
#     validation_data=valid_ds,
#     callbacks=[csv_logger],
# )

import time

EPOCHS = 10
BATCH_SIZE = 32

while True:
    try:
        model.fit(
            train_ds,
            epochs=EPOCHS,
            validation_data=valid_ds,
            callbacks=[csv_logger],
        )
        print("Training completed successfully.")
        break  # exit loop if training is successful
    except Exception as e:
        print(f"Training failed with error: {e}")
        print("Retrying in 10 seconds...")
        time.sleep(10)


Epoch 1/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m248s[0m 2s/step - loss: 0.3631 - mean_absolute_error: 0.3859 - val_loss: 0.3852 - val_mean_absolute_error: 0.4746
Epoch 2/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m246s[0m 2s/step - loss: 0.3556 - mean_absolute_error: 0.3880 - val_loss: 0.4171 - val_mean_absolute_error: 0.4974
Epoch 3/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m248s[0m 2s/step - loss: 0.3384 - mean_absolute_error: 0.3858 - val_loss: 0.4387 - val_mean_absolute_error: 0.5099
Epoch 4/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m248s[0m 2s/step - loss: 0.3163 - mean_absolute_error: 0.3751 - val_loss: 0.4532 - val_mean_absolute_error: 0.5130
Epoch 5/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m241s[0m 2s/step - loss: 0.2905 - mean_absolute_error: 0.3593 - val_loss: 0.4642 - val_mean_absolute_error: 0.5102
Epoch 6/10
[1m  8/117[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m3:54

2025-07-17 22:10:57.692550: W tensorflow/core/framework/op_kernel.cc:1828] UNKNOWN: UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x3504ce1b0>
Traceback (most recent call last):

  File "/Users/aryan/Downloads/ds_assignment_2025/pricing-venv/lib/python3.11/site-packages/tensorflow/python/ops/script_ops.py", line 268, in __call__
    return func(device, token, args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^

  File "/Users/aryan/Downloads/ds_assignment_2025/pricing-venv/lib/python3.11/site-packages/tensorflow/python/ops/script_ops.py", line 146, in __call__
    outputs = self._call(device, args)
              ^^^^^^^^^^^^^^^^^^^^^^^^

  File "/Users/aryan/Downloads/ds_assignment_2025/pricing-venv/lib/python3.11/site-packages/tensorflow/python/ops/script_ops.py", line 153, in _call
    ret = self._func(*args)
          ^^^^^^^^^^^^^^^^^

  File "/Users/aryan/Downloads/ds_assignment_2025/pricing-venv/lib/python3.11/site-packages/tensorflow/python/autograph/impl/api.

[1m 10/117[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m3:46[0m 2s/step - loss: 0.2345 - mean_absolute_error: 0.3268Training failed with error: Graph execution error:

Detected at node EagerPyFunc defined at (most recent call last):
<stack traces unavailable>
UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x3504ce1b0>
Traceback (most recent call last):

  File "/Users/aryan/Downloads/ds_assignment_2025/pricing-venv/lib/python3.11/site-packages/tensorflow/python/ops/script_ops.py", line 268, in __call__
    return func(device, token, args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^

  File "/Users/aryan/Downloads/ds_assignment_2025/pricing-venv/lib/python3.11/site-packages/tensorflow/python/ops/script_ops.py", line 146, in __call__
    outputs = self._call(device, args)
              ^^^^^^^^^^^^^^^^^^^^^^^^

  File "/Users/aryan/Downloads/ds_assignment_2025/pricing-venv/lib/python3.11/site-packages/tensorflow/python/ops/script_ops.py", line 153, in _call
    re

2025-07-17 22:11:01.610358: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: UNKNOWN: UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x3504ce1b0>
Traceback (most recent call last):

  File "/Users/aryan/Downloads/ds_assignment_2025/pricing-venv/lib/python3.11/site-packages/tensorflow/python/ops/script_ops.py", line 268, in __call__
    return func(device, token, args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^

  File "/Users/aryan/Downloads/ds_assignment_2025/pricing-venv/lib/python3.11/site-packages/tensorflow/python/ops/script_ops.py", line 146, in __call__
    outputs = self._call(device, args)
              ^^^^^^^^^^^^^^^^^^^^^^^^

  File "/Users/aryan/Downloads/ds_assignment_2025/pricing-venv/lib/python3.11/site-packages/tensorflow/python/ops/script_ops.py", line 153, in _call
    ret = self._func(*args)
          ^^^^^^^^^^^^^^^^^

  File "/Users/aryan/Downloads/ds_assignment_2025/pricing-venv/lib/python3.11/sit

Epoch 1/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m242s[0m 2s/step - loss: 0.2523 - mean_absolute_error: 0.3277 - val_loss: 0.4826 - val_mean_absolute_error: 0.5063
Epoch 2/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m242s[0m 2s/step - loss: 0.2294 - mean_absolute_error: 0.3119 - val_loss: 0.5178 - val_mean_absolute_error: 0.5113
Epoch 3/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m240s[0m 2s/step - loss: 0.2073 - mean_absolute_error: 0.2955 - val_loss: 0.5648 - val_mean_absolute_error: 0.5219
Epoch 4/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m240s[0m 2s/step - loss: 0.1875 - mean_absolute_error: 0.2809 - val_loss: 0.6057 - val_mean_absolute_error: 0.5327
Epoch 5/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m241s[0m 2s/step - loss: 0.1702 - mean_absolute_error: 0.2666 - val_loss: 0.6515 - val_mean_absolute_error: 0.5468
Epoch 6/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m424s

# save model

In [27]:
import os
print(os.getcwd())

model.save("price_predictor_V3.keras")
print("Model saved!")


/Users/aryan/Downloads/ds_assignment_2025
Model saved!


## Inference

In [10]:
def get_condition_features(condition_str):
    all_conditions = ["BRAND NEW", "HEAVILY USED", "LIGHTLY USED", "LIKE NEW", "WELL USED"]
    cond_dict = {f"condition_{cond}": 0 for cond in all_conditions}
    if condition_str.upper() in [c.upper() for c in all_conditions]:
        cond_dict[f"condition_{condition_str.upper()}"] = 1
    return cond_dict

def prepare_inference_inputs(title, description, image_url, date_sold_str, condition_str, title_tokenizer, desc_tokenizer):
    dt = pd.to_datetime(date_sold_str)
    
    # Cyclical features
    structured_features = {}
    structured_features["year"] = dt.year - 2024
    structured_features["month_sin"], structured_features["month_cos"] = cyclical_encode(dt.month, 12)
    structured_features["day_of_week_sin"], structured_features["day_of_week_cos"] = cyclical_encode(dt.dayofweek, 7)
    
    # Condition features
    cond_features = get_condition_features(condition_str)
    structured_features.update(cond_features)
    
    tokenized = {
        "title": tf.convert_to_tensor(title_tokenizer(tf.constant(title))),
        "description": tf.convert_to_tensor(desc_tokenizer(tf.constant(description)))
    }
    
    img_tensor = process_image(tf.constant(image_url))

    structured_tensors = {
        key: tf.expand_dims(tf.convert_to_tensor(value, dtype=tf.float32), axis=0)
        for key, value in structured_features.items()
    }
    
    model_inputs = {
        "image_url": tf.expand_dims(img_tensor, axis=0),
        "title": tf.expand_dims(tokenized["title"], axis=0),
        "description": tf.expand_dims(tokenized["description"], axis=0),
        **structured_tensors
    }
    
    return model_inputs

In [11]:
# Sample data
# sample_title = "Xbox Series X"
# sample_desc = "Used for 1 day only then decided to sell because pc has more games"
# sample_condition = "BRAND NEW"
# sample_image_url = "https://media.karousell.com/media/photos/products/2025/5/13/xbox_series_x_1747134184_1cd82594_thumbnail.jpg"
# sample_date_sold = "2025-05-14"
# 385 dollar real price, predicted is 372.19

sample_title = "Nintendo New 3DS"
sample_desc = "Selling off this New 3DS (Non-XL). Comes with:- Console- Stylus- 32GB MicroSD- Charger Please Note: Device is modded, so can download any and all games from online. It also has a non-functioning volume slider, so the volume slider can't control the volume. Tried replacing the speakers and the cables but no luck in fixing this. However, because the console is modded the volume can still be adjusted anytime from the mod menu. Please purchase after careful consideration. Any questions feel free to ask, happy to answer any questions."
sample_condition ="HEAVILY USED"
sample_image_url = "https://media.karousell.com/media/photos/products/2025/6/4/nintendo_new_3ds_1749051938_84b4b4d4_thumbnail"
sample_date_sold = "2025-07-01"
# 200 dollar real price, predicted is 204.48

# Prepare inputs
inference_inputs = prepare_inference_inputs(
    title=sample_title,
    description=sample_desc,
    image_url=sample_image_url,
    date_sold_str=sample_date_sold,
    condition_str=sample_condition,
    title_tokenizer=title_tokenizer,
    desc_tokenizer=desc_tokenizer
)

# Get prediction
prediction_log_price = model.predict(inference_inputs)
predicted_price = np.expm1(prediction_log_price[0][0])

print(f"Predicted price for the Xbox Series X: ${predicted_price:.2f}")

NameError: name 'title_tokenizer' is not defined

# Load model from disk

In [35]:
from tensorflow.keras.models import load_model

inf_model = load_model(
    "price_predictor_v3.keras",
    custom_objects={
        "PricePredictor": PricePredictor,
        "CNNImageEncoder": CNNImageEncoder,
    }
)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [31]:
check_prediction_log_price = inf_model.predict(inference_inputs)
check_predicted_price = np.expm1(prediction_log_price[0][0])

print(f"Predicted price for the Xbox Series X: ${check_predicted_price:.2f}")
assert check_predicted_price == predicted_price, "Predictions between save and mmodel in memory do not match!"

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 169ms/step
Predicted price for the Xbox Series X: $372.19


### Check output swings, depending on condition

In [None]:
sample_title = "Nintendo New 3DS"
sample_desc = "Selling off this New 3DS (Non-XL). Comes with:- Console- Stylus- 32GB MicroSD- Charger Please Note: Device is modded, so can download any and all games from online. It also has a non-functioning volume slider, so the volume slider can't control the volume. Tried replacing the speakers and the cables but no luck in fixing this. However, because the console is modded the volume can still be adjusted anytime from the mod menu. Please purchase after careful consideration. Any questions feel free to ask, happy to answer any questions."
sample_condition ="WELL USED"
sample_image_url = "https://media.karousell.com/media/photos/products/2025/6/4/nintendo_new_3ds_1749051938_84b4b4d4_thumbnail"
sample_date_sold = "2025-07-01"

'''
BRAND NEW 426.60
LIKE NEW 384.40
LIGHTLY USED 303.40
WELL USED 244.18
HEAVILY USED 204.48
'''

inference_inputs = prepare_inference_inputs(
    title=sample_title,description=sample_desc,image_url=sample_image_url,date_sold_str=sample_date_sold,condition_str=sample_condition,title_tokenizer=title_tokenizer,desc_tokenizer=desc_tokenizer
)

prediction_log_price = inf_model.predict(inference_inputs)
predicted_price = np.expm1(prediction_log_price[0][0])

print(f"Predicted price for {sample_title}, with condition {sample_condition}: ${predicted_price:.2f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step
Predicted price for Nintendo New 3DS, with condition WELL USED: $244.18


# Evaluate on test_ds

In [41]:
inf_model.compile(
    loss='mean_absolute_error',
    metrics=[
        'mean_absolute_error',
        tf.keras.metrics.RootMeanSquaredError(),
        tf.keras.metrics.MeanSquaredError(),
    ]
)

test_ds_results = inf_model.evaluate(test_ds)

print("Test Loss:", test_ds_results[0])
print("Test Mean Absolute Error:", test_ds_results[1])
print("Test RMSE:", test_ds_results[2])
print("Test MSE:", test_ds_results[3])
print(test_ds_results)

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 677ms/step - loss: 0.7100 - mean_absolute_error: 0.6843 - mean_squared_error: 1.1262 - root_mean_squared_error: 1.0571
Test Loss: 0.713158130645752
Test Mean Absolute Error: 0.6875342726707458
Test RMSE: 1.0649105310440063
Test MSE: 1.1340343952178955
[0.713158130645752, 0.6875342726707458, 1.0649105310440063, 1.1340343952178955]


#### Why is test loss different than MAE
loss is computed per batch, then averaged over all batches

mae and other metrics are calculated globally, so no averaging

## Get predictions for test.csv

In [42]:
def predict_prices_from_csv(file_path, model, title_tokenizer, desc_tokenizer):
    df = pd.read_csv(file_path)
    df.fillna("")
    for col in ['title', 'description', 'condition', 'image_url', 'date_sold']:
        if col in df.columns:
            df[col] = df[col].astype(str).fillna('')

    predicted_prices = []

    for index, row in df.iterrows():
        sample_title = row['title']
        sample_desc = row['description']
        sample_condition = row['condition']
        sample_image_url = row['image_url']
        sample_date_sold = row['date_sold']

        inference_inputs = prepare_inference_inputs(
            title=sample_title,
            description=sample_desc,
            image_url=sample_image_url,
            date_sold_str=sample_date_sold,
            condition_str=sample_condition,
            title_tokenizer=title_tokenizer,
            desc_tokenizer=desc_tokenizer
        )

        prediction_log_price = model.predict(inference_inputs)
        predicted_price = np.expm1(prediction_log_price[0][0])
        predicted_prices.append(predicted_price)

    df['price'] = predicted_prices
    return df

In [44]:
result_df = predict_prices_from_csv('datasets/test.csv', inf_model, title_tokenizer, desc_tokenizer)

result_df = result_df[['product_id', 'price']]
print(len(result_df))
result_df.to_csv('predicted_prices_v3.csv', index=False)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 180ms/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68