In [36]:
!pip install lightgbm tqdm ftfy regex --quiet

import os
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import joblib
import torch
import clip
from PIL import Image

In [37]:
SEED = 42
N_SPLITS = 5
MAX_FEATURES = 30000
SVD_DIM = 256
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

np.random.seed(SEED)
print("✅ Running on:", DEVICE)

✅ Running on: cuda


In [38]:
train_df = pd.read_csv("/kaggle/input/dataset/dataset/train.csv")
test_df = pd.read_csv("/kaggle/input/dataset/dataset/test.csv")

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print(train_df.head(3))


Train shape: (75000, 4)
Test shape: (75000, 3)
   sample_id                                    catalog_content  \
0      33127  Item Name: La Victoria Green Taco Sauce Mild, ...   
1     198967  Item Name: Salerno Cookies, The Original Butte...   
2     261251  Item Name: Bear Creek Hearty Soup Bowl, Creamy...   

                                          image_link  price  
0  https://m.media-amazon.com/images/I/51mo8htwTH...   4.89  
1  https://m.media-amazon.com/images/I/71YtriIHAA...  13.12  
2  https://m.media-amazon.com/images/I/51+PFEe-w-...   1.97  


In [39]:
train_df["log_price"] = np.log1p(train_df["price"])
y_train_log = train_df["log_price"].values


In [40]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r"item name:|pack of \d+|value:.*?unit:.*?|\n", " ", text)
    text = re.sub(r"[^a-z0-9 ]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

train_df['clean_text'] = train_df['catalog_content'].fillna("").apply(clean_text)
test_df['clean_text']  = test_df['catalog_content'].fillna("").apply(clean_text)


In [41]:
tfidf = TfidfVectorizer(max_features=30000, ngram_range=(1,2))
X_train_text = tfidf.fit_transform(train_df["catalog_content"].astype(str))
X_test_text = tfidf.transform(test_df["catalog_content"].astype(str))

svd = TruncatedSVD(n_components=256, random_state=42)
X_train_text_svd = svd.fit_transform(X_train_text)
X_test_text_svd = svd.transform(X_test_text)

print("Text features:", X_train_text_svd.shape)


Text features: (75000, 256)


In [42]:
import os

os.makedirs("/kaggle/working/images", exist_ok=True)
os.makedirs("/kaggle/working/embeddings", exist_ok=True)
os.makedirs("/kaggle/working/outputs", exist_ok=True)

In [43]:
!pip install git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-56c4kcz8
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-56c4kcz8
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [44]:
import torch
import clip
import os
import numpy as np
from PIL import Image
from tqdm import tqdm
from pathlib import Path

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-L/14", device=device)
model.eval()

image_dir = Path("/kaggle/working/images")
emb_dir = Path("/kaggle/working/embeddings")
emb_dir.mkdir(exist_ok=True)

def load_image_batch(paths):
    imgs = []
    valid_ids = []
    for sid, path in paths:
        if not path.exists():
            continue
        try:
            imgs.append(preprocess(Image.open(path)))
            valid_ids.append(sid)
        except:
            continue
    if not imgs:
        return None, []
    imgs = torch.stack(imgs).to(device)
    return imgs, valid_ids

batch_size = 64
all_samples = [(row['sample_id'], image_dir / f"{row['sample_id']}.jpg") for _, row in train_df.iterrows()]

for i in tqdm(range(0, len(all_samples), batch_size)):
    batch = all_samples[i:i+batch_size]
    imgs, valid_ids = load_image_batch(batch)
    if imgs is None: 
        continue
    with torch.no_grad():
        embs = model.encode_image(imgs).cpu().numpy()
    for sid, emb in zip(valid_ids, embs):
        np.save(emb_dir / f"{sid}.npy", emb)

100%|██████████| 1172/1172 [00:00<00:00, 3184.78it/s]


In [45]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
from tqdm import tqdm


In [46]:
import numpy as np
import os
from tqdm import tqdm

train_emb_dir = "/kaggle/working/embeddings_train"
test_emb_dir = "/kaggle/working/embeddings_test"

def load_embeddings(df, emb_dir):
    embs = []
    for sid in tqdm(df["sample_id"], desc=f"Loading embeddings from {emb_dir}"):
        path = os.path.join(emb_dir, f"{sid}.npy")
        if os.path.exists(path):
            embs.append(np.load(path))
        else:
            embs.append(np.zeros(512))  # fallback for missing image
    return np.vstack(embs)

X_train_img = load_embeddings(train_df, train_emb_dir)
X_test_img = load_embeddings(test_df, test_emb_dir)

print("✅ Image features loaded:")
print("Train:", X_train_img.shape)
print("Test :", X_test_img.shape)

Loading embeddings from /kaggle/working/embeddings_train: 100%|██████████| 75000/75000 [00:00<00:00, 174935.85it/s]
Loading embeddings from /kaggle/working/embeddings_test: 100%|██████████| 75000/75000 [00:00<00:00, 171048.17it/s]


✅ Image features loaded:
Train: (75000, 512)
Test : (75000, 512)


In [35]:
X_train_multi = np.hstack([X_train_txt, X_train_img])
X_test_multi  = np.hstack([X_test_txt, X_test_img])

y_train = train_df['price'].values
y_train_log = np.log1p(y_train)  # for stable regression


Generating offline TF-IDF text embeddings...
✅ TF-IDF text features created successfully!
Train shape: (75000, 12000)
Test shape: (75000, 12000)


In [47]:
bins = pd.qcut(y_train, q=5, labels=False)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [49]:
preds_test_stack = np.zeros(len(X_test_multi))
oof_train = np.zeros(len(X_train_multi))

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 64,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 4,
    'seed': 42,
    'verbose': -1
}

for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train_multi, bins)):
    print(f"\n--- Fold {fold+1} ---")
    
    X_tr, X_val = X_train_multi[tr_idx], X_train_multi[val_idx]
    y_tr, y_val = y_train_log[tr_idx], y_train_log[val_idx]

    train_data = lgb.Dataset(X_tr, y_tr)
    val_data   = lgb.Dataset(X_val, y_val)

    # --- Callbacks for early stopping & logging ---
    callbacks = [
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=200)
    ]

    model = lgb.train(
        params,
        train_data,
        valid_sets=[val_data],
        num_boost_round=2000,
        callbacks=callbacks
    )

    oof_train[val_idx] = model.predict(X_val)
    preds_test_stack += model.predict(X_test_multi) / skf.n_splits



--- Fold 1 ---
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.748936
[400]	valid_0's rmse: 0.737533
[600]	valid_0's rmse: 0.731985
[800]	valid_0's rmse: 0.72928
[1000]	valid_0's rmse: 0.72738
[1200]	valid_0's rmse: 0.726111
[1400]	valid_0's rmse: 0.725063
[1600]	valid_0's rmse: 0.72441
Early stopping, best iteration is:
[1660]	valid_0's rmse: 0.724205

--- Fold 2 ---
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.750566
[400]	valid_0's rmse: 0.738779
[600]	valid_0's rmse: 0.732945
[800]	valid_0's rmse: 0.729684
[1000]	valid_0's rmse: 0.72734
[1200]	valid_0's rmse: 0.726051
[1400]	valid_0's rmse: 0.725289
[1600]	valid_0's rmse: 0.724571
[1800]	valid_0's rmse: 0.723848
[2000]	valid_0's rmse: 0.723326
Did not meet early stopping. Best iteration is:
[2000]	valid_0's rmse: 0.723326

--- Fold 3 ---
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.747811
[400]	valid_0's rmse: 0.

In [50]:
def smape(y_true, y_pred):
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    diff = np.abs(y_true - y_pred) / denom
    diff[denom == 0] = 0.0
    return 100 * np.mean(diff)

In [51]:
y_pred_train = np.expm1(oof_train)
y_pred_test  = np.expm1(preds_test_stack)

final_smape = smape(y_train, y_pred_train)
print("✅ Best CV SMAPE:", final_smape)


✅ Best CV SMAPE: 55.73228278334334
