In [2]:
!mkdir -p embeddings

In [3]:
import os
import random
import math
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image
from io import BytesIO

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import CLIPProcessor, CLIPModel
from sentence_transformers import SentenceTransformer

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

In [5]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
CLIP_MODEL = "openai/clip-vit-base-patch32"
SENT_MODEL = "all-MiniLM-L6-v2"

In [6]:
sent_model = SentenceTransformer(SENT_MODEL, device=DEVICE)
clip_model = CLIPModel.from_pretrained(CLIP_MODEL).to(DEVICE)
clip_processor = CLIPProcessor.from_pretrained(CLIP_MODEL)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [7]:
def process_batch(batch_path, batch_id):
    df = pd.read_csv(batch_path)
    print(f"\nProcessing {batch_path} ... ({len(df)} samples)")

    # --- Text embeddings ---
    texts = df['catalog_content'].astype(str).tolist()
    text_embs = sent_model.encode(texts, batch_size=128, convert_to_numpy=True, show_progress_bar=True)
    np.save(f"embeddings/text_batch{batch_id}.npy", text_embs)

In [8]:
!mkdir -p dataset

In [10]:
import numpy as np
import pandas as pd

all_text, all_img = [], []
for i in range(1, 14):
    all_text.append(np.load(f"embeddings/text_batch{i}.npy"))
    all_img.append(np.load(f"embeddings/img_batch{i}.npy"))

text_embs = np.vstack(all_text)
img_embs = np.vstack(all_img)

train_df = pd.concat([pd.read_csv(f"dataset/batch_{i}.csv") for i in range(1, 14)], ignore_index=True)
print("✅ Combined:", train_df.shape, text_embs.shape, img_embs.shape)

✅ Combined: (75000, 4) (75000, 384) (75000, 512)


In [12]:
import os, re, unicodedata
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb
import joblib

In [13]:
def smape(y_true, y_pred, eps=1e-8):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    denom = np.where(denom == 0, eps, denom)
    return np.mean(np.abs(y_pred - y_true) / denom) * 100

def clean_text(x):
    if not isinstance(x, str): return ""
    x = unicodedata.normalize("NFKC", x).lower()
    x = re.sub(r"\s+", " ", x).strip()
    return x

In [15]:
num_batches = 13
all_df, all_text, all_img = [], [], []

for i in range(1, num_batches + 1):
    df_path = f"dataset/batch_{i}.csv"
    text_path = f"embeddings/text_batch{i}.npy"
    img_path  = f"embeddings/img_batch{i}.npy"
    print(f"Loading {df_path} ...")

    df = pd.read_csv(df_path)
    df["batch_id"] = i
    all_df.append(df)

    all_text.append(np.load(text_path))
    all_img.append(np.load(img_path))

train_df = pd.concat(all_df, ignore_index=True)
text_embs = np.vstack(all_text)
img_embs  = np.vstack(all_img)

print(f"✅ Combined data: {train_df.shape}, text_embs: {text_embs.shape}, img_embs: {img_embs.shape}")

Loading dataset/batch_1.csv ...
Loading dataset/batch_2.csv ...
Loading dataset/batch_3.csv ...
Loading dataset/batch_4.csv ...
Loading dataset/batch_5.csv ...
Loading dataset/batch_6.csv ...
Loading dataset/batch_7.csv ...
Loading dataset/batch_8.csv ...
Loading dataset/batch_9.csv ...
Loading dataset/batch_10.csv ...
Loading dataset/batch_11.csv ...
Loading dataset/batch_12.csv ...
Loading dataset/batch_13.csv ...
✅ Combined data: (75000, 5), text_embs: (75000, 384), img_embs: (75000, 512)


In [16]:
train_df["catalog_content_clean"] = train_df["catalog_content"].apply(clean_text)
train_df["ipq"] = train_df["catalog_content_clean"].str.extract(r"(\d+)\s*(?:pack|pcs|pieces|count|ct|bottle|x)", expand=False)
train_df["ipq"] = pd.to_numeric(train_df["ipq"], errors="coerce").fillna(1)
train_df["title"] = train_df["catalog_content"].str.split(r"[:.]").str[0].fillna("")
train_df["len_title"] = train_df["title"].apply(lambda x: len(str(x)))
train_df["num_digits"] = train_df["catalog_content"].str.count(r"\d").fillna(0)
train_df["num_tokens"] = train_df["catalog_content_clean"].apply(lambda x: len(x.split()))
train_df["has_brand_keyword"] = train_df["catalog_content"].str.contains(
    r"brand|inc|co\.|limited|™|®", case=False, na=False
).astype(int)

num_cols = ["ipq", "len_title", "num_digits", "num_tokens", "has_brand_keyword"]
numeric_feats = train_df[num_cols].values.astype(np.float32)

In [17]:
train_df["price"] = pd.to_numeric(train_df["price"], errors="coerce").fillna(train_df["price"].median())
train_df["price"] = train_df["price"].clip(lower=0.01)
train_df["log_price"] = np.log1p(train_df["price"])
y = train_df["log_price"].values

scaler = StandardScaler()
num_scaled = scaler.fit_transform(numeric_feats)


In [18]:
X = np.concatenate([text_embs, img_embs, num_scaled], axis=1)
print(f"Final feature matrix: {X.shape}")

# Save final merged embeddings
os.makedirs("final_embeddings", exist_ok=True)
np.save("final_embeddings/X_features.npy", X)
np.save("final_embeddings/y_logprice.npy", y)
joblib.dump(scaler, "final_embeddings/numeric_scaler.pkl")
print("✅ Saved combined embeddings & scaler.")

Final feature matrix: (75000, 901)
✅ Saved combined embeddings & scaler.


In [19]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_val = lgb.Dataset(X_val, label=y_val)

In [20]:
params = {
    "objective": "regression",
    "metric": "mae",
    "boosting_type": "gbdt",
    "device": "gpu",                  # 🚀 Use GPU
    "gpu_platform_id": 0,
    "gpu_device_id": 0,
    "learning_rate": 0.05,
    "num_leaves": 128,                # higher leaf count works well on GPU
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "min_data_in_leaf": 50,
    "lambda_l1": 0.1,
    "lambda_l2": 0.1,
    "verbosity": -1
}

print("\n🚀 Training LightGBM model on GPU ...")
model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_val],
    num_boost_round=4000,
    callbacks=[
        lgb.early_stopping(200),
        lgb.log_evaluation(100)
    ]
)


🚀 Training LightGBM model on GPU ...
Training until validation scores don't improve for 200 rounds
[100]	training's l1: 0.497008	valid_1's l1: 0.599344
[200]	training's l1: 0.408017	valid_1's l1: 0.578195
[300]	training's l1: 0.343823	valid_1's l1: 0.568713
[400]	training's l1: 0.294411	valid_1's l1: 0.563438
[500]	training's l1: 0.254335	valid_1's l1: 0.559952
[600]	training's l1: 0.221062	valid_1's l1: 0.557245
[700]	training's l1: 0.192766	valid_1's l1: 0.555765
[800]	training's l1: 0.16875	valid_1's l1: 0.554533
[900]	training's l1: 0.148173	valid_1's l1: 0.553883
[1000]	training's l1: 0.130332	valid_1's l1: 0.553075
[1100]	training's l1: 0.114937	valid_1's l1: 0.55234
[1200]	training's l1: 0.101498	valid_1's l1: 0.551826
[1300]	training's l1: 0.0899844	valid_1's l1: 0.551391
[1400]	training's l1: 0.079865	valid_1's l1: 0.551291
[1500]	training's l1: 0.0709808	valid_1's l1: 0.550837
[1600]	training's l1: 0.0633117	valid_1's l1: 0.550648
[1700]	training's l1: 0.056592	valid_1's l1:

In [21]:
y_pred_val = np.expm1(model.predict(X_val, num_iteration=model.best_iteration))
y_true_val = np.expm1(y_val)
mae = mean_absolute_error(y_true_val, y_pred_val)
smape_val = smape(y_true_val, y_pred_val)
print(f"\n✅ Validation MAE: {mae:.4f} | SMAPE: {smape_val:.2f}%")


✅ Validation MAE: 12.5764 | SMAPE: 54.88%
