## Image Compression Head

In [2]:
# import libraries
import pandas as pd
import numpy as np

# to use with image url
from PIL import Image, ImageOps
import requests
from io import BytesIO

import tensorflow as tf
import os

from tensorflow_text.python.ops.fast_wordpiece_tokenizer import FastWordpieceTokenizer
from keras_nlp.layers import StartEndPacker
from tensorflow_text import normalize_utf8
import keras_nlp

from tensorflow.keras.models import load_model
from tensorflow.keras.utils import register_keras_serializable

# Base directory used below for generated artifacts (vocab files, training log).
PROJECT_DIR = "./"

# Training data: loaded once here and reused by the cells that follow.
file_path = "./datasets/train_cleaned.csv"
df = pd.read_csv(file_path)

  from .autonotebook import tqdm as notebook_tqdm


# Train tokenizer and construct vocab

In [3]:
class CustomTokenizer():
    """WordPiece tokenizer that normalizes text and packs ids to a fixed length.

    Text is NFD-normalized, stripped of combining marks and lower-cased before
    being tokenized with ``FastWordpieceTokenizer``. The resulting ids are then
    packed by ``StartEndPacker`` to ``max_length`` using ``0`` as the pad value.
    """

    def __init__(self, vocab_path, max_length):
        """Build the tokenizer from a vocab file.

        Args:
            vocab_path: Path to a text file with one vocabulary entry per line.
            max_length: Sequence length every tokenized example is packed to.

        Raises:
            FileNotFoundError: If ``vocab_path`` does not exist.
        """
        self.packer = StartEndPacker(sequence_length=max_length,pad_value=0)
        self.unk_token = '[UNK]'
        self.vocabulary = self._get_vocab_list(vocab_path)
        self.tokenizer = FastWordpieceTokenizer(
            vocab=self.vocabulary,
            suffix_indicator='##',
            unknown_token=self.unk_token,
            support_detokenization=True
        )

    @staticmethod
    def _preprocess(text):
        """Strip accents and lower-case the text."""
        text_normalized = normalize_utf8(text, "NFD")
        # After NFD decomposition the accents are separate \p{Mn} code points.
        text_stripped_accents = tf.strings.regex_replace(text_normalized, r"\p{Mn}", "")
        lowercase = tf.strings.lower(text_stripped_accents)
        return lowercase

    def _get_vocab_list(self, vocab_path):
        """Read the vocab file into a de-duplicated list that contains ``unk_token``.

        Args:
            vocab_path: Path to a text file with one vocabulary entry per line.

        Returns:
            Vocabulary entries in file order, first occurrence kept. If the
            unknown token is missing it is inserted at index 1.
        """
        # The vocab contains non-ASCII tokens (emoji, CJK), so the encoding must
        # not depend on the platform's locale default.
        with open(vocab_path, "r", encoding="utf-8") as f:
            entries = f.read().splitlines()

        # dict preserves insertion order, so this keeps the first occurrence.
        vclist = list(dict.fromkeys(entries))

        if self.unk_token not in vclist:
            # Index 0 is left to the first entry (presumably the pad token, as
            # the packer pads with id 0); slicing also tolerates an empty file.
            vclist = vclist[:1] + [self.unk_token] + vclist[1:]

        assert len(set(vclist)) == len(vclist), "Duplicate vocab entries"
        return vclist

    def tokenize(self, text):
        """Normalize, tokenize and pack ``text`` into fixed-length token ids."""
        text = self._preprocess(text)
        tokens = self.tokenizer.tokenize(text)
        return self.packer(tokens)

    def detokenize(self, tokens):
        """Convert token ids back to text using the wrapped tokenizer."""
        return self.tokenizer.detokenize(tokens)

    def __call__(self, text):
        """Alias for :meth:`tokenize`."""
        return self.tokenize(text)

In [4]:
def train_word_piece(ds, vocab_size, query_or_desc, vocab_file=None):
    """Train a WordPiece vocabulary on one text column of ``ds``.

    Args:
        ds: Dataset whose elements are dicts with "title" and "description" entries.
        vocab_size: Maximum vocabulary size passed to the trainer.
        query_or_desc: "title" to train on titles, "desc" to train on descriptions.
        vocab_file: Optional path the vocabulary is written to.

    Returns:
        Whatever ``compute_word_piece_vocabulary`` returns for these arguments.

    Raises:
        ValueError: If ``query_or_desc`` is neither "title" nor "desc".
    """
    column_by_kind = {"title": "title", "desc": "description"}
    if query_or_desc not in column_by_kind:
        # Previously an unknown kind fell through to an UnboundLocalError.
        raise ValueError(
            f"query_or_desc must be one of {sorted(column_by_kind)}, got {query_or_desc!r}"
        )

    column = column_by_kind[query_or_desc]
    word_piece_ds = ds.map(lambda x: x[column])

    vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
        word_piece_ds.batch(10000).prefetch(1000),
        vocabulary_output_file=vocab_file,
        vocabulary_size=vocab_size,
        lowercase=True,
        strip_accents=True,
        reserved_tokens=["[PAD]", "[UNK]"],
    )
    return vocab

def get_tokenizer(train, vocab_size, csv_size, vocab_type, max_tokens, force_train=False):
    """Load the tokenizer for ``vocab_type``, training its vocab first if needed.

    The vocab file lives at ``PROJECT_DIR/vocab_{csv_size}_{vocab_type}``. It is
    (re)trained when ``force_train`` is true or when the file cannot be found.

    Returns:
        A ``(tokenizer, vocab_size)`` tuple, where the size is the length of the
        loaded vocabulary.
    """
    vocab_filename = f"vocab_{csv_size}_{vocab_type}"
    vocab_path = os.path.join(PROJECT_DIR, vocab_filename)
    print(vocab_path)

    tokenizer = None
    if not force_train:
        try:
            tokenizer = CustomTokenizer(vocab_path, max_tokens)
        except FileNotFoundError:
            # No vocab on disk yet: fall through to training.
            tokenizer = None

    if tokenizer is None:
        print("Training tokenizer...")
        train_word_piece(train, vocab_size, vocab_type, vocab_path)
        tokenizer = CustomTokenizer(vocab_path, max_tokens)

    vocabulary = tokenizer.vocabulary
    final_vocab_size = len(vocabulary)
    print(f"Loaded tokenizer from '{vocab_filename}' with final vocab size: {final_vocab_size:,}")
    print("Sample tokens: ", np.random.choice(vocabulary, 20))
    return tokenizer, final_vocab_size

# Preprocess image and other input columns

In [13]:
def resize_and_pad_image(url, target_size=(320, 320), timeout=10):
    """Download an image and letterbox it onto a black canvas of ``target_size``.

    Args:
        url: Address of the image to download.
        target_size: ``(width, height)`` of the returned image.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        An RGB PIL image of exactly ``target_size``: the downloaded image shrunk
        to fit (aspect ratio kept) and centred with black padding.

    Raises:
        requests.RequestException: On timeout, connection failure or an HTTP
            error status.
    """
    # Without a timeout a stalled server would block the input pipeline forever.
    response = requests.get(url, timeout=timeout)
    # Fail with the HTTP error instead of trying to decode an error page as an image.
    response.raise_for_status()

    img = Image.open(BytesIO(response.content)).convert("RGB")
    # thumbnail() resizes in place, only ever shrinking, and keeps the aspect ratio.
    img.thumbnail(target_size, Image.Resampling.LANCZOS)

    delta_w = target_size[0] - img.width
    delta_h = target_size[1] - img.height
    left, top = delta_w // 2, delta_h // 2
    # Any odd leftover pixel goes to the right/bottom so the total size is exact.
    padding = (left, top, delta_w - left, delta_h - top)

    return ImageOps.expand(img, padding, fill=(0, 0, 0))

def normalize_image(pil_image):
    """Convert a PIL image to a float32 tensor scaled by 1/255 with a leading batch axis."""
    pixels = np.array(pil_image).astype(np.float32)
    scaled = pixels / 255.0
    # A new axis 0 is added so the result is a batch of one image.
    return tf.expand_dims(tf.convert_to_tensor(scaled), axis=0)

def process_image(url):
    """Turn an image URL tensor into a normalized ``[320, 320, 3]`` float32 tensor.

    The download and PIL processing run as Python code via ``tf.py_function``.
    """
    def _fetch_as_tensor(url_tensor):
        decoded_url = url_tensor.numpy().decode()
        batched = normalize_image(resize_and_pad_image(decoded_url))
        return batched[0]  # drop the batch axis added by normalize_image

    image = tf.py_function(func=_fetch_as_tensor, inp=[url], Tout=tf.float32)
    # py_function outputs carry no static shape, so it is declared explicitly.
    image.set_shape([320, 320, 3])
    return image


In [14]:
# specify type for cols
# Give every feature column an explicit dtype before building the dataset.
condition_cols = [col for col in df.columns if col.startswith("condition_")]
df[condition_cols] = df[condition_cols].astype(np.int32)
df["year"] = df["year"].astype(np.int32)

sin_cos_cols = ["month_sin", "month_cos", "day_of_week_sin", "day_of_week_cos"]
df[sin_cos_cols] = df[sin_cos_cols].astype(np.float32)

for text_col in ("title", "description", "image_url"):
    df[text_col] = df[text_col].astype(str)

# Some descriptions are empty.
# NOTE(review): this runs after astype(str); depending on the pandas version
# missing text may already have been stringified by then - confirm which value
# the model actually sees for empty descriptions.
df.fillna("", inplace=True)

# Columns exposed to the tf.data pipeline; "price_log" is the target.
dataset_columns = [
    "title",
    "description",
    "image_url",
    "condition_BRAND NEW",
    "condition_HEAVILY USED",
    "condition_LIGHTLY USED",
    "condition_LIKE NEW",
    "condition_WELL USED",
    "year",
    "month_sin",
    "month_cos",
    "day_of_week_sin",
    "day_of_week_cos",
    "price_log",
]
ds = tf.data.Dataset.from_tensor_slices(
    {name: df[name].values for name in dataset_columns}
)


In [15]:
# 90/10 train/test split. The shuffle is seeded (42) and not repeated per
# iteration, so take() and skip() see the same order and results are replicable.
n_examples = len(ds)
n_train = int(n_examples * 0.9)
shuffled = ds.shuffle(n_examples, seed=42, reshuffle_each_iteration=False)
train_ds_raw = shuffled.take(n_train)
test_ds_raw = shuffled.skip(n_train)


In [6]:
def cyclical_encode(value, max_value):
    """Return ``(sin, cos)`` of ``value`` on a cycle of length ``max_value``, rounded to 2 decimals."""
    angle = 2 * np.pi * value / max_value
    return round(np.sin(angle), 2), round(np.cos(angle), 2)

def tokenize(title, description, title_tokenizer, desc_tokenizer):
    """Apply each tokenizer to its text and return both results in one dict."""
    title_tokens = title_tokenizer(title)
    desc_tokens = desc_tokenizer(description)
    return dict(title_tokens=title_tokens, desc_tokens=desc_tokens)

def preprocess(inputs, title_tokenizer, desc_tokenizer):
    """Map one raw dataset element to ``(model_inputs, target)``.

    Text is tokenized, the image URL is replaced by the image tensor, and the
    structured features and the ``price_log`` target are cast to float32.
    """
    tokens = tokenize(
        inputs["title"], inputs["description"], title_tokenizer, desc_tokenizer
    )
    image = process_image(inputs["image_url"])

    structured_keys = (
        "condition_BRAND NEW", "condition_HEAVILY USED", "condition_LIGHTLY USED",
        "condition_LIKE NEW", "condition_WELL USED",
        "year", "month_sin", "month_cos", "day_of_week_sin", "day_of_week_cos",
    )

    # The "image_url" key now carries the decoded image tensor, not the URL.
    model_inputs = {
        "image_url": image,
        "title": tokens["title_tokens"],
        "description": tokens["desc_tokens"],
    }
    for key in structured_keys:
        model_inputs[key] = tf.cast(inputs[key], tf.float32)

    target = tf.cast(inputs["price_log"], tf.float32)
    return model_inputs, target

In [20]:
# Tokenizer configuration. csv_size is only used to name the vocab files.
csv_size = "train_cleaned.csv"
MAX_TITLE_VOCAB_SIZE = 500
MAX_DESC_VOCAB_SIZE = 3000

# Title tokenizer: titles are packed to 24 tokens.
vocab_type = "title"
title_max_tokens = 24
title_tokenizer, TITLE_VOCAB_SIZE = get_tokenizer(
    train=train_ds_raw,
    vocab_size=MAX_TITLE_VOCAB_SIZE,
    csv_size=csv_size,
    vocab_type=vocab_type,
    max_tokens=title_max_tokens,
    force_train=False,
)

# Description tokenizer: descriptions are packed to 48 tokens.
vocab_type = "desc"
desc_max_tokens = 48
desc_tokenizer, DESC_VOCAB_SIZE = get_tokenizer(
    train=train_ds_raw,
    vocab_size=MAX_DESC_VOCAB_SIZE,
    csv_size=csv_size,
    vocab_type=vocab_type,
    max_tokens=desc_max_tokens,
    force_train=False,
)

./vocab_train_cleaned.csv_title
Loaded tokenizer from 'vocab_train_cleaned.csv_title' with final vocab size: 434
Sample tokens:  ['gray' '##🎧' 'm' '##ay' '游' '##ance' '##🐍' '(' '🛑' '##st' '##th' '##et'
 'zelda' 'bnib' '👀' 'card' '##ts' '##es' '##et' ')']
./vocab_train_cleaned.csv_desc
Loaded tokenizer from 'vocab_train_cleaned.csv_desc' with final vocab size: 2,321
Sample tokens:  ['##个' '💀' 'portal' ')' 'while' 'controls' '##游' 'world' '##ro' '##fully'
 'planet' '##ay' '🏽' '##ions' '##k' '##8' 'dont' '##ration' 'tested' '🔹']


In [11]:
# Sanity check: print the packed token ids each tokenizer produces for a sample.
for _tokenizer, _sample_text in (
    (title_tokenizer, 'gaming can be cheap'),
    (desc_tokenizer, 'buy this amazing tool super cheap'),
):
    print(_tokenizer([_sample_text]))

tf.Tensor(
[[262 310  33 137  34 243 137 317   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]], shape=(1, 24), dtype=int64)
tf.Tensor(
[[ 537  413  746  486 1924  599  607  514  672    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0]], shape=(1, 48), dtype=int64)


In [21]:
def _preprocess_example(example):
    """Run preprocess() on one raw element with the tokenizers built above."""
    return preprocess(example, title_tokenizer, desc_tokenizer)


# Same pipeline for both splits: preprocess in parallel, batch by 32, prefetch.
train_ds = (
    train_ds_raw
    .map(_preprocess_example, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

test_ds = (
    test_ds_raw
    .map(_preprocess_example, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

# Model Architecture

In [None]:
@register_keras_serializable()
class CNNImageEncoder(tf.keras.Model):
    """Four Conv2D + MaxPooling2D stages followed by Flatten.

    Args:
        activation: Activation used by every convolution.
        kernel_size: Kernel size used by every convolution.
        pool_size: Pool size used by every max-pooling layer.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, activation='relu', kernel_size=(3, 3), pool_size=(2, 2), **kwargs):
        super().__init__(**kwargs)

        # Keep the constructor arguments: get_config() must be able to return
        # them. A config loaded from disk hands sequences back as lists, so they
        # are normalized to tuples here.
        self.activation = activation
        self.kernel_size = tuple(kernel_size) if isinstance(kernel_size, (list, tuple)) else kernel_size
        self.pool_size = tuple(pool_size) if isinstance(pool_size, (list, tuple)) else pool_size

        self.cnn_layers = tf.keras.Sequential([
            tf.keras.layers.Conv2D(32, self.kernel_size, activation=activation, input_shape=(320, 320, 3)),
            tf.keras.layers.MaxPooling2D(self.pool_size, strides=(2, 2)),
            tf.keras.layers.Conv2D(64, self.kernel_size, activation=activation),
            tf.keras.layers.MaxPooling2D(self.pool_size, strides=(2, 2)),
            tf.keras.layers.Conv2D(128, self.kernel_size, activation=activation),
            tf.keras.layers.MaxPooling2D(self.pool_size, strides=(2, 2)),
            tf.keras.layers.Conv2D(256, self.kernel_size, activation=activation),
            tf.keras.layers.MaxPooling2D(self.pool_size, strides=(2, 2)),
            tf.keras.layers.Flatten(),
        ])

    def call(self, image_inputs):
        """Run the image batch through the convolutional stack."""
        return self.cnn_layers(image_inputs)

    def get_config(self):
        """Return the constructor arguments merged into the base config."""
        config = super().get_config()
        config.update({
            "activation": self.activation,
            "kernel_size": self.kernel_size,
            "pool_size": self.pool_size,
        })
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild the encoder from the dict produced by ``get_config``."""
        return cls(**config)
    
DROPOUT = 0.1 # weak dropout

@register_keras_serializable()
class PricePredictor(tf.keras.Model):
    """Regression model combining image, title, description and structured features.

    Args:
        title_vocab_size: Input dimension of the title embedding.
        desc_vocab_size: Input dimension of the description embedding.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, title_vocab_size=434, desc_vocab_size=2321,**kwargs):
        super().__init__(**kwargs)

        # Keep the constructor arguments: get_config() must be able to return
        # them so a saved model can be rebuilt.
        self.title_vocab_size = title_vocab_size
        self.desc_vocab_size = desc_vocab_size

        self.image_encoder = CNNImageEncoder()

        # mask_zero=True: token id 0 is treated as padding by the layers below.
        self.title_embedding_layer = tf.keras.layers.Embedding(title_vocab_size, 128, mask_zero=True)
        self.desc_embedding_layer = tf.keras.layers.Embedding(desc_vocab_size, 128, mask_zero=True)

        self.title_dense_layers = tf.keras.Sequential([
            tf.keras.layers.GlobalAveragePooling1D(),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dropout(DROPOUT)
        ])

        self.desc_dense_layers = tf.keras.Sequential([
            tf.keras.layers.GlobalAveragePooling1D(),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dropout(DROPOUT)
        ])

        self.final_layers = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation='relu'),
            tf.keras.layers.Dropout(DROPOUT),
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dropout(DROPOUT),
            tf.keras.layers.Dense(1)
        ])

    def call(self, inputs):
        """Predict one value per example from the dict of input tensors.

        ``inputs["image_url"]`` is passed to the image encoder, "title" and
        "description" to the embeddings; each remaining scalar feature gets a
        trailing axis and is concatenated with the other branches.
        """
        image_input = inputs["image_url"]
        title_tokens = inputs["title"]
        desc_tokens = inputs["description"]

        image_features = self.image_encoder(image_input)

        title_embeddings = self.title_embedding_layer(title_tokens)
        title_features = self.title_dense_layers(title_embeddings)

        desc_embeddings = self.desc_embedding_layer(desc_tokens)
        desc_features = self.desc_dense_layers(desc_embeddings)

        other_features = tf.keras.layers.concatenate([
            tf.expand_dims(inputs["condition_BRAND NEW"], axis=1),
            tf.expand_dims(inputs["condition_HEAVILY USED"], axis=1),
            tf.expand_dims(inputs["condition_LIGHTLY USED"], axis=1),
            tf.expand_dims(inputs["condition_LIKE NEW"], axis=1),
            tf.expand_dims(inputs["condition_WELL USED"], axis=1),
            tf.expand_dims(inputs["year"], axis=1),
            tf.expand_dims(inputs["month_sin"], axis=1),
            tf.expand_dims(inputs["month_cos"], axis=1),
            tf.expand_dims(inputs["day_of_week_sin"], axis=1),
            tf.expand_dims(inputs["day_of_week_cos"], axis=1),
        ])

        concatenated_features = tf.keras.layers.concatenate([
            image_features,
            title_features,
            desc_features,
            other_features
        ])

        return self.final_layers(concatenated_features)

    def get_config(self):
        """Return the constructor arguments merged into the base config."""
        config = super().get_config()
        config.update({
            "title_vocab_size": self.title_vocab_size,
            "desc_vocab_size": self.desc_vocab_size,
        })
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild the model from the dict produced by ``get_config``."""
        return cls(**config)


## introduce the validation split

In [15]:
valid_df = pd.read_csv("./datasets/validation.csv")

valid_df = pd.get_dummies(valid_df, columns=['condition'], dtype=int)
# get_dummies only creates columns for conditions present in this file; add
# any missing one as all zeros so the dataset below always finds all five.
for condition_col in [
    "condition_BRAND NEW", "condition_HEAVILY USED", "condition_LIGHTLY USED",
    "condition_LIKE NEW", "condition_WELL USED",
]:
    if condition_col not in valid_df.columns:
        valid_df[condition_col] = 0

valid_df['date_sold'] = pd.to_datetime(valid_df['date_sold'])
valid_df['year'] = valid_df['date_sold'].dt.year - 2024 # scale year as with train_ds and test_ds
# Month and weekday become (sin, cos) pairs via cyclical_encode.
valid_df['month_sin'], valid_df['month_cos'] = zip(
    *valid_df['date_sold'].dt.month.apply(lambda x: cyclical_encode(x, 12))
    )
valid_df['day_of_week_sin'], valid_df['day_of_week_cos'] = zip(
    *valid_df['date_sold'].dt.dayofweek.apply(lambda x: cyclical_encode(x, 7))
    )
valid_df.drop('date_sold', axis=1, inplace=True)

# convert target to log price
valid_df['price_log'] = np.log1p(valid_df['price'])
valid_df.drop('price', axis=1, inplace=True)

valid_df["title"] = valid_df["title"].astype(str)
valid_df["description"] = valid_df["description"].astype(str)
valid_df["image_url"] = valid_df["image_url"].astype(str)

# NOTE(review): this runs after astype(str), the same order as the training
# frame; confirm which value the model sees for missing text before changing it.
valid_df.fillna("", inplace=True)

In [16]:
# Columns exposed to the validation pipeline; "price_log" is the target.
valid_columns = [
    "title",
    "description",
    "image_url",
    "condition_BRAND NEW",
    "condition_HEAVILY USED",
    "condition_LIGHTLY USED",
    "condition_LIKE NEW",
    "condition_WELL USED",
    "year",
    "month_sin",
    "month_cos",
    "day_of_week_sin",
    "day_of_week_cos",
    "price_log",
]
valid_ds_raw = tf.data.Dataset.from_tensor_slices(
    {name: valid_df[name].values for name in valid_columns}
)

BATCH_SIZE = 32

# Preprocess in parallel, then batch and prefetch.
valid_ds = (
    valid_ds_raw
    .map(
        lambda x: preprocess(x, title_tokenizer, desc_tokenizer),
        num_parallel_calls=tf.data.AUTOTUNE,
    )
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)


In [None]:
# Size each embedding from the vocabulary the tokenizer actually loaded.
TITLE_VOCAB_SIZE, DESC_VOCAB_SIZE = (
    len(tok.vocabulary) for tok in (title_tokenizer, desc_tokenizer)
)

model = PricePredictor(
    desc_vocab_size=DESC_VOCAB_SIZE,
    title_vocab_size=TITLE_VOCAB_SIZE,
)

In [18]:
# Train on MSE of the log price; MAE is tracked as an additional metric.
model.compile(
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.MeanAbsoluteError()],
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
)


In [19]:
# Print the model summary with nested sub-models expanded.
model.summary(expand_nested=True)

# train block below

In [20]:
from keras.callbacks import CSVLogger

# Per-epoch metrics are written to PROJECT_DIR/training_log_<model_name>.csv.
model_name = "V1"
log_file_path = os.path.join(PROJECT_DIR, f"training_log_{model_name}.csv")
csv_logger = CSVLogger(filename=log_file_path)

In [None]:
# Training run: validation metrics are computed on valid_ds and every epoch is
# logged through csv_logger.
EPOCHS = 10
BATCH_SIZE = 32

model.fit(
    x=train_ds,
    validation_data=valid_ds,
    epochs=EPOCHS,
    callbacks=[csv_logger],
)


# save model

In [None]:
import os
# Show the working directory, since the model is saved to a relative path.
current_dir = os.getcwd()
print(current_dir)

model.save(filepath="price_predictor_V1.keras")
print("Model saved!")


Model saved!


## Inference

In [25]:
def get_condition_features(condition_str):
    """One-hot encode an item condition into ``condition_<NAME>`` features.

    Matching ignores case and surrounding or repeated whitespace.

    Args:
        condition_str: Condition label such as "Brand New". Any value is
            accepted; it is converted with ``str()`` before matching.

    Returns:
        Dict with one ``condition_<NAME>`` key per known condition. The matching
        key is 1 and the rest 0; an unrecognized value gives all zeros.
    """
    all_conditions = ["BRAND NEW", "HEAVILY USED", "LIGHTLY USED", "LIKE NEW", "WELL USED"]
    cond_dict = {f"condition_{cond}": 0 for cond in all_conditions}

    # str() keeps None/NaN from crashing; split/join trims and collapses spaces.
    normalized = " ".join(str(condition_str).upper().split())
    if normalized in all_conditions:
        cond_dict[f"condition_{normalized}"] = 1
    return cond_dict

def prepare_inference_inputs(title, description, image_url, date_sold_str, condition_str, title_tokenizer, desc_tokenizer):
    """Build a batch-of-one input dict for the model from raw listing fields.

    The date is turned into a year offset from 2024 plus cyclical month and
    weekday features, the condition is one-hot encoded, the texts are tokenized
    and the image is fetched from ``image_url``. Every entry gets a leading
    batch axis of size 1.
    """
    sold_on = pd.to_datetime(date_sold_str)
    month_sin, month_cos = cyclical_encode(sold_on.month, 12)
    dow_sin, dow_cos = cyclical_encode(sold_on.dayofweek, 7)

    scalar_features = {
        "year": sold_on.year - 2024,
        "month_sin": month_sin,
        "month_cos": month_cos,
        "day_of_week_sin": dow_sin,
        "day_of_week_cos": dow_cos,
    }
    scalar_features.update(get_condition_features(condition_str))

    title_ids = tf.convert_to_tensor(title_tokenizer(tf.constant(title)))
    desc_ids = tf.convert_to_tensor(desc_tokenizer(tf.constant(description)))
    image = process_image(tf.constant(image_url))

    # The "image_url" key carries the image tensor itself.
    model_inputs = {
        "image_url": tf.expand_dims(image, axis=0),
        "title": tf.expand_dims(title_ids, axis=0),
        "description": tf.expand_dims(desc_ids, axis=0),
    }
    for name, value in scalar_features.items():
        as_tensor = tf.convert_to_tensor(value, dtype=tf.float32)
        model_inputs[name] = tf.expand_dims(as_tensor, axis=0)

    return model_inputs

In [47]:
# Sample data
# sample_title = "Xbox Series X"
# sample_desc = "Used for 1 day only then decided to sell because pc has more games"
# sample_condition = "BRAND NEW"
# sample_image_url = "https://media.karousell.com/media/photos/products/2025/5/13/xbox_series_x_1747134184_1cd82594_thumbnail.jpg"
# sample_date_sold = "2025-05-14"
# 385 dollar real price, predicted is 192.25

sample_title = "Nintendo New 3DS"
sample_desc = "Selling off this New 3DS (Non-XL). Comes with:- Console- Stylus- 32GB MicroSD- Charger Please Note: Device is modded, so can download any and all games from online. It also has a non-functioning volume slider, so the volume slider can't control the volume. Tried replacing the speakers and the cables but no luck in fixing this. However, because the console is modded the volume can still be adjusted anytime from the mod menu. Please purchase after careful consideration. Any questions feel free to ask, happy to answer any questions."
sample_condition ="HEAVILY USED"
sample_image_url = "https://media.karousell.com/media/photos/products/2025/6/4/nintendo_new_3ds_1749051938_84b4b4d4_thumbnail"
sample_date_sold = "2025-07-01"
# 200 dollar real price, predicted is 128.64

inference_inputs = prepare_inference_inputs(
    title=sample_title,
    description=sample_desc,
    image_url=sample_image_url,
    date_sold_str=sample_date_sold,
    condition_str=sample_condition,
    title_tokenizer=title_tokenizer,
    desc_tokenizer=desc_tokenizer
)

# The model predicts log1p(price); expm1 converts back to a price.
prediction_log_price = model.predict(inference_inputs)
predicted_price = np.expm1(prediction_log_price[0][0])

# Report the item actually being priced instead of a hard-coded product name.
print(f"Predicted price for the {sample_title}: ${predicted_price:.2f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
Predicted price for the Xbox Series X: $128.64


# Load model from disk

In [None]:
from tensorflow.keras.models import load_model

# The file name must match the one used when saving ("price_predictor_V1.keras");
# a different letter case fails on case-sensitive filesystems.
inf_model = load_model(
    "price_predictor_V1.keras",
    custom_objects={
        "PricePredictor": PricePredictor,
        "CNNImageEncoder": CNNImageEncoder,
    }
)

In [None]:
# Optional sanity check: the model loaded from disk (inf_model) should give the
# same prediction as the model still in memory from the training loop.

check_prediction_log_price = inf_model.predict(inference_inputs)
# Use the reloaded model's own output here; reusing the in-memory prediction
# would make the comparison below always pass.
check_predicted_price = np.expm1(check_prediction_log_price[0][0])

print(f"Predicted price for the {sample_title}: ${check_predicted_price:.2f}")
# Compare with a tolerance rather than exact float equality.
assert np.isclose(check_predicted_price, predicted_price), "Predictions between saved model and model in memory do not match"

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
Predicted price for the Xbox Series X: $128.64


### Check output swings, depending on condition

In [36]:
sample_title = "Nintendo New 3DS"
sample_desc = "Selling off this New 3DS (Non-XL). Comes with:- Console- Stylus- 32GB MicroSD- Charger Please Note: Device is modded, so can download any and all games from online. It also has a non-functioning volume slider, so the volume slider can't control the volume. Tried replacing the speakers and the cables but no luck in fixing this. However, because the console is modded the volume can still be adjusted anytime from the mod menu. Please purchase after careful consideration. Any questions feel free to ask, happy to answer any questions."
sample_condition ="BRAND NEW"
sample_image_url = "https://media.karousell.com/media/photos/products/2025/6/4/nintendo_new_3ds_1749051938_84b4b4d4_thumbnail"
sample_date_sold = "2025-07-01"

# Recorded predictions for this listing, per condition:
#   BRAND NEW     205.01
#   LIKE NEW      202.91
#   LIGHTLY USED  184.92
#   WELL USED     152.22
#   HEAVILY USED  128.64

inference_inputs = prepare_inference_inputs(
    title=sample_title,
    description=sample_desc,
    image_url=sample_image_url,
    date_sold_str=sample_date_sold,
    condition_str=sample_condition,
    title_tokenizer=title_tokenizer,
    desc_tokenizer=desc_tokenizer,
)

# The model predicts log1p(price); expm1 converts back to a price.
prediction_log_price = inf_model.predict(inference_inputs)
predicted_price = np.expm1(prediction_log_price[0][0])

print(f"Predicted price for {sample_title}, with condition {sample_condition}: ${predicted_price:.2f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
Predicted price for Nintendo New 3DS, with condition BRAND NEW: $205.01


# Eval on test_ds

In [23]:
# Re-compile the loaded model for evaluation: MAE as loss, plus MAE, RMSE and
# MSE as metrics. evaluate() returns the loss followed by the metrics in order.
eval_metrics = [
    'mean_absolute_error',
    tf.keras.metrics.RootMeanSquaredError(),
    tf.keras.metrics.MeanSquaredError(),
]
inf_model.compile(loss='mean_absolute_error', metrics=eval_metrics)

test_ds_results = inf_model.evaluate(test_ds)

result_labels = ("Test Loss:", "Test Mean Absolute Error:", "Test RMSE:", "Test MSE:")
for position, label in enumerate(result_labels):
    print(label, test_ds_results[position])
print(test_ds_results)

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 973ms/step - loss: 0.5453 - mean_absolute_error: 0.5453 - mean_absolute_percentage_error: 27420934.0000 - mean_squared_error: 0.7655 - root_mean_squared_error: 0.8711
Test Loss: 0.5649797916412354
Test Mean Absolute Error: 0.5649797916412354
Test RMSE: 0.8787887096405029
Test MSE: 0.7722695469856262
[0.5649797916412354, 0.5649797916412354, 0.8787887096405029, 0.7722695469856262, 49858516.0]


## Get predictions on test.csv

In [60]:
def predict_prices_from_csv(file_path, model, title_tokenizer, desc_tokenizer):
    """Predict a price for every row of a CSV file.

    Args:
        file_path: CSV with 'title', 'description', 'condition', 'image_url'
            and 'date_sold' columns.
        model: Model whose ``predict`` returns the log1p-scaled price.
        title_tokenizer: Tokenizer applied to the title.
        desc_tokenizer: Tokenizer applied to the description.

    Returns:
        The loaded DataFrame with a 'price' column holding the predicted price
        (``expm1`` of the model output) for each row.
    """
    df = pd.read_csv(file_path)
    # The previous bare `df.fillna("")` discarded its result and has been removed;
    # the per-column normalization below is unchanged.
    # NOTE(review): fillna runs after astype(str), the same order the training
    # frame uses; confirm how missing text should reach the model before
    # changing it.
    for col in ['title', 'description', 'condition', 'image_url', 'date_sold']:
        if col in df.columns:
            df[col] = df[col].astype(str).fillna('')

    predicted_prices = []

    for _, row in df.iterrows():
        inference_inputs = prepare_inference_inputs(
            title=row['title'],
            description=row['description'],
            image_url=row['image_url'],
            date_sold_str=row['date_sold'],
            condition_str=row['condition'],
            title_tokenizer=title_tokenizer,
            desc_tokenizer=desc_tokenizer
        )

        # verbose=0: otherwise predict() prints one progress bar per row.
        prediction_log_price = model.predict(inference_inputs, verbose=0)
        predicted_prices.append(np.expm1(prediction_log_price[0][0]))

    df['price'] = predicted_prices
    return df

In [None]:
# Predict on the held-out test file and keep only the submission columns.
result_df = predict_prices_from_csv(
    file_path='datasets/test.csv',
    model=model,
    title_tokenizer=title_tokenizer,
    desc_tokenizer=desc_tokenizer,
)

submission_columns = ['product_id', 'price']
result_df = result_df[submission_columns]
print(len(result_df))
result_df.to_csv('predicted_prices_v1.csv', index=False)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 98ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52