Features:

Text: DistilBERT embeddings (768 dims) on catalog_content.
Numerics: the V16 feature set (quantity, feat_gb/oz/inch/mp/lbs/mah/watts/kg, premium_keyword_count, condition_flag, title_length, content_word_count).
Image: ViT-base (HuggingFace) embeddings (768 dims) computed from images downloaded from the image_link URLs; when an image cannot be fetched, an all-zero placeholder image is used instead.


Model: Custom PyTorch model:

DistilBERT for text (freeze lower layers).
ViT for images (freeze).
Cross-attention: Text embeddings attend to image embeddings to weigh relevance.
Concat with scaled numerics.
Linear layers → Tweedie output (log-link for price).


Loss: Tweedie (variance_power=1.5, tune 1.2-1.8) to handle skew natively.
Training:

Tail weights: 3x for samples with price > 100, 2x for samples with price < 10.
AdamW, linear scheduler with warmup.
Early stopping on validation SMAPE.

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
from transformers import DistilBertTokenizer, DistilBertModel, ViTImageProcessor, ViTModel
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import re
import os
from datetime import datetime
import requests
from PIL import Image
from io import BytesIO

# SMAPE
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each sample contributes ``2 * |pred - true| / (|true| + |pred|)``; the
    result is 100 times the mean of those terms, so it lies in [0, 200].

    Args:
        y_true: Array-like of actual values.
        y_pred: Array-like of predicted values, same shape as ``y_true``.

    Returns:
        The SMAPE as a float. A sample where both the actual and the
        predicted value are 0 counts as a perfect prediction (error 0)
        instead of producing 0/0 = NaN and poisoning the whole mean.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    numerator = 2.0 * np.abs(y_pred - y_true)
    denominator = np.abs(y_true) + np.abs(y_pred)
    # Only divide where the denominator is non-zero; the remaining entries
    # keep the 0.0 they were initialised with.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return 100 * np.mean(ratio)

# Feature extraction
def extract_quantity(text):
    """Return the item count from a "<container> of <N>" phrase in *text*.

    The container word is one of pack, box, set, bundle, case, dozen or
    carton, matched case-insensitively. When no such phrase is present the
    quantity defaults to 1.
    """
    found = re.search(
        r'(?:pack|box|set|bundle|case|dozen|carton) of (\d+)',
        text,
        flags=re.IGNORECASE,
    )
    if found is None:
        return 1
    return int(found.group(1))

def extract_numeric(text, pattern):
    """Return the first number captured by *pattern* in *text*, else 0.

    *pattern* is a regular-expression string whose first capture group is
    the numeric value; it is searched case-insensitively.
    """
    hit = re.search(pattern, text, flags=re.IGNORECASE)
    if hit:
        return float(hit.group(1))
    return 0

# Regex per measurement unit; the first capture group is the numeric value.
# Every unit token ends on a word boundary so that ordinary words starting
# with the same letters are not mistaken for a measurement (e.g. "3 way" is
# not watts, "2 inside" is not inches, "60 mph" is not megapixels).
# The keys define the feat_<unit> column names, so they must stay stable.
units = {
    'gb': r'(\d+\.?\d*)\s*gb\b',
    'oz': r'(\d+\.?\d*)\s*oz\b',
    # A bare "in" directly followed by another number is a phrase such as
    # "2 in 1", not a length, so it is rejected by the lookahead.
    'inch': r'(\d+\.?\d*)\s*(?:inch(?:es)?|in(?!\s*\d))\b',
    'mp': r'(\d+\.?\d*)\s*mp\b',
    'lbs': r'(\d+\.?\d*)\s*lbs?\b',
    'mah': r'(\d+\.?\d*)\s*mah\b',
    'watts': r'(\d+\.?\d*)\s*w(?:atts?)?\b',
    'kg': r'(\d+\.?\d*)\s*kgs?\b',
}

def extract_features(row):
    """Derive the hand-crafted numeric features for one catalog row.

    Args:
        row: A mapping/Series with a 'catalog_content' entry. A missing
            description (anything that is not a string, e.g. NaN) is
            treated as empty text instead of raising.

    Returns:
        pd.Series with: 'quantity', one 'feat_<unit>' entry per key of
        ``units``, 'premium_keyword_count', 'condition_flag' (1 = new,
        0 = used, 0.5 = unknown), 'title_length' and 'content_word_count'.
    """
    raw = row['catalog_content']
    text = raw.lower() if isinstance(raw, str) else ''

    feats = {'quantity': extract_quantity(text)}
    for unit, pattern in units.items():
        feats[f'feat_{unit}'] = extract_numeric(text, pattern)

    # Count whole words only: a plain substring count would find "pro" inside
    # "product"/"provides" and would count "professional" twice.
    premium_pattern = r'\b(?:premium|luxury|high-end|pro|ultra|elite|deluxe|professional)\b'
    feats['premium_keyword_count'] = len(re.findall(premium_pattern, text))

    if re.search(r'\bnew\b|\bmint\b|\bbrand new\b', text):
        feats['condition_flag'] = 1
    elif re.search(r'\bused\b|\brefurbished\b|\bpre-owned\b', text):
        feats['condition_flag'] = 0
    else:
        feats['condition_flag'] = 0.5

    # The "title" is whatever precedes the first ". " or ": " separator.
    title = re.split(r'[.:]\s', text)[0]
    feats['title_length'] = len(title)
    feats['content_word_count'] = len(text.split())
    return pd.Series(feats)

# Augmentation
def augment_data(df):
    """Return *df* with extra copies of its price-tail rows appended.

    Rows priced above 100 are duplicated with ' luxury edition' appended to
    their 'catalog_content'; rows priced below 10 are duplicated with their
    'quantity' doubled. The input frame is left untouched and the result
    gets a fresh 0..n-1 index.
    """
    expensive = df[df['price'] > 100].assign(
        catalog_content=lambda part: part['catalog_content'] + ' luxury edition'
    )
    cheap = df[df['price'] < 10].assign(
        quantity=lambda part: part['quantity'] * 2
    )
    frames = [df, expensive, cheap]
    return pd.concat(frames, ignore_index=True)

# Load and preprocess
train = pd.read_csv('input/train.csv')
train = pd.concat([train, train.apply(extract_features, axis=1)], axis=1)

num_cols = ['quantity'] + [f'feat_{u}' for u in units] + ['premium_keyword_count', 'condition_flag', 'title_length', 'content_word_count']

# Split
# Split BEFORE augmenting: augment_data() appends near-duplicates of existing
# rows, so augmenting first would place copies of the same product on both
# sides of the split and make the validation score optimistic.
train_part, valid_part = train_test_split(train, test_size=0.2, random_state=42)
train_part = augment_data(train_part)            # comes back with a fresh 0..n-1 index
valid_part = valid_part.reset_index(drop=True)

# Fit the imputer/scaler on training rows only, then apply them to the
# validation rows, so no validation statistics leak into preprocessing.
imputer = SimpleImputer(strategy='constant', fill_value=0)
scaler = StandardScaler()
train_part[num_cols] = scaler.fit_transform(imputer.fit_transform(train_part[num_cols]))
valid_part[num_cols] = scaler.transform(imputer.transform(valid_part[num_cols]))

# X_* and y_* are cut from the same frame and therefore share one index, so
# a later index-aligned operation such as X_train.assign(price=y_train)
# pairs every row with its own price.
X_train = train_part.drop('price', axis=1)
y_train = train_part['price']
X_valid = valid_part.drop('price', axis=1)
y_valid = valid_part['price']

# Dataset
class PriceDataset(Dataset):
    """Per-row text, image and numeric inputs (plus price when available).

    Args:
        df: Frame with 'catalog_content', 'image_link' and the ``num_cols``
            columns; an optional 'price' column supplies the targets.
        tokenizer: Callable used to tokenize the text (called with
            ``return_tensors='pt'``).
        image_processor: Callable that turns a PIL image into a mapping
            containing 'pixel_values'.
        max_length: Token length every text is padded/truncated to.
    """

    def __init__(self, df, tokenizer, image_processor, max_length=128):
        self.texts = df['catalog_content'].values
        # Force a float dtype up front so torch.tensor() never receives an
        # object array.
        self.numerics = df[num_cols].to_numpy(dtype='float32')
        self.tokenizer = tokenizer
        self.image_processor = image_processor
        self.max_length = max_length
        self.image_links = df['image_link'].values
        self.prices = df['price'].values if 'price' in df.columns else None

    def __len__(self):
        return len(self.texts)

    def _load_pixel_values(self, link):
        """Download and preprocess one image; fall back to an all-zero image.

        The download is best-effort: any ordinary failure (bad URL, HTTP
        error, timeout, undecodable image) yields the placeholder.
        ``except Exception`` is used instead of a bare ``except`` so that
        KeyboardInterrupt / SystemExit still stop the run.
        """
        try:
            response = requests.get(link, timeout=5)
            response.raise_for_status()
            image = Image.open(BytesIO(response.content)).convert('RGB')
            return self.image_processor(image, return_tensors='pt')['pixel_values']
        except Exception:
            return torch.zeros(1, 3, 224, 224)

    def __getitem__(self, idx):
        text = self.texts[idx]
        if not isinstance(text, str):
            # Missing descriptions (NaN) are tokenized as empty text.
            text = ''
        encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        pixel_values = self._load_pixel_values(self.image_links[idx])
        item = {
            # squeeze(0) removes only the leading batch-of-one dimension.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'pixel_values': pixel_values.squeeze(0),
            'numerics': torch.tensor(self.numerics[idx], dtype=torch.float),
        }
        if self.prices is not None:
            item['price'] = torch.tensor(self.prices[idx], dtype=torch.float)
        return item

# Model
class TweedieLoss(nn.Module):
    """Tweedie negative log-likelihood (up to a constant) for 1 < p < 2.

    For a predicted mean ``mu`` and target ``y`` the per-sample loss is
    ``-y * mu**(1-p) / (1-p) + mu**(2-p) / (2-p)``.

    Args:
        p: Tweedie variance power. Must lie strictly between 1 and 2,
            because the formula divides by both ``1 - p`` and ``2 - p``.
        reduction: 'mean' (default, the original behaviour), 'sum', or
            'none' to get the unreduced per-sample losses.

    Raises:
        ValueError: If ``p`` is outside (1, 2) or ``reduction`` is unknown.
    """

    def __init__(self, p=1.5, reduction='mean'):
        super().__init__()
        if not 1 < p < 2:
            raise ValueError(f'Tweedie variance power must satisfy 1 < p < 2, got {p}')
        if reduction not in ('mean', 'sum', 'none'):
            raise ValueError(f"reduction must be 'mean', 'sum' or 'none', got {reduction!r}")
        self.p = p
        self.reduction = reduction

    def forward(self, pred, target):
        # The mean must be strictly positive for the fractional powers below.
        pred = torch.clamp(pred, min=1e-6)
        term1 = -target * torch.pow(pred, 1 - self.p) / (1 - self.p)
        term2 = torch.pow(pred, 2 - self.p) / (2 - self.p)
        loss = term1 + term2
        if self.reduction == 'mean':
            return torch.mean(loss)
        if self.reduction == 'sum':
            return torch.sum(loss)
        return loss

class PriceRegressor(nn.Module):
    """Text + image + numeric price model with a positive (log-link) output.

    The DistilBERT [CLS] embedding attends over the ViT token sequence, the
    attended image summary is added to the text embedding, the scaled
    numeric features are concatenated, and an MLP predicts the log of the
    mean price, which is exponentiated before being returned.

    Args:
        freeze_image: If True (default), the ViT weights are not trained.
        frozen_text_layers: Number of lowest DistilBERT transformer blocks
            to freeze along with the embedding layer; 0 trains the whole
            text encoder.
    """

    def __init__(self, freeze_image=True, frozen_text_layers=3):
        super().__init__()
        self.text_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.image_model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
        self.cross_attn = nn.MultiheadAttention(embed_dim=768, num_heads=8)
        self.fc1 = nn.Linear(768 + len(num_cols), 256)
        self.fc2 = nn.Linear(256, 64)
        self.fc3 = nn.Linear(64, 1)
        self.relu = nn.ReLU()

        if freeze_image:
            for param in self.image_model.parameters():
                param.requires_grad = False
        if frozen_text_layers > 0:
            frozen_modules = [self.text_model.embeddings]
            frozen_modules.extend(self.text_model.transformer.layer[:frozen_text_layers])
            for module in frozen_modules:
                for param in module.parameters():
                    param.requires_grad = False

    def forward(self, input_ids, attention_mask, pixel_values, numerics):
        """Return the predicted mean price, shape (batch, 1), always > 0."""
        text_emb = self.text_model(input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
        # Keep ALL image tokens as keys/values. With a single key the
        # attention softmax is identically 1, so attending to just the image
        # [CLS] vector cannot weigh anything.
        image_tokens = self.image_model(pixel_values).last_hidden_state
        # nn.MultiheadAttention is constructed with its default
        # batch_first=False, so inputs are laid out (sequence, batch, embed).
        query = text_emb.unsqueeze(0)
        memory = image_tokens.transpose(0, 1)
        attn_output, _ = self.cross_attn(query, memory, memory)
        fused = attn_output.squeeze(0) + text_emb
        combined = torch.cat([fused, numerics], dim=1)
        x = self.relu(self.fc1(combined))
        x = self.relu(self.fc2(x))
        log_mean = self.fc3(x)
        # Log-link: a Tweedie mean must be positive. The clamp bounds the
        # exponent so a stray large activation cannot overflow to inf.
        return torch.exp(torch.clamp(log_mean, max=20.0))

# Training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
image_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')
# Attach prices positionally (.to_numpy()): DataFrame.assign aligns a Series
# on index labels, and X_* / y_* are not guaranteed to carry the same labels,
# which would silently pair rows with the wrong (or a NaN) price.
train_ds = PriceDataset(X_train.assign(price=y_train.to_numpy()), tokenizer, image_processor)
train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)
valid_ds = PriceDataset(X_valid.assign(price=y_valid.to_numpy()), tokenizer, image_processor)
valid_loader = DataLoader(valid_ds, batch_size=8)

model = PriceRegressor().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=0.1, total_iters=100)
loss_fn = TweedieLoss(p=1.5)
# Tail weights, one per row of train_ds in dataset order: 3x above 100,
# 2x below 10. The masks are built as torch tensors rather than indexing a
# tensor with a pandas boolean Series.
train_prices = torch.as_tensor(y_train.to_numpy(), dtype=torch.float, device=device)
weights = torch.ones(len(y_train), device=device)
weights[train_prices > 100] = 3
weights[train_prices < 10] = 2

# Train
epochs = 3
for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    for batch in train_loader:
        optimizer.zero_grad()
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'price'}
        target = batch['price'].to(device)
        # squeeze(-1) drops only the output dimension, so a final batch of
        # one sample still yields a 1-D tensor.
        pred = model(**inputs).squeeze(-1)
        # loss_fn averages over whatever it is given, so it is evaluated one
        # sample at a time to obtain losses that can be weighted individually.
        per_sample = torch.stack([
            loss_fn(p.unsqueeze(0), t.unsqueeze(0)) for p, t in zip(pred, target)
        ])
        # Derive the tail weights from this batch's own targets. The loader
        # shuffles, so slicing a dataset-ordered weight vector by batch
        # position would attach weights to the wrong samples.
        batch_weights = torch.ones_like(target)
        batch_weights[target > 100] = 3
        batch_weights[target < 10] = 2
        loss = (per_sample * batch_weights).mean()
        loss.backward()
        optimizer.step()
        # Step the warm-up once per optimizer step: with total_iters=100 a
        # per-epoch step would never leave the warm-up phase in 3 epochs.
        scheduler.step()
        train_loss += loss.item()
    print(f'Epoch {epoch+1}, Loss: {train_loss/len(train_loader)}')

# Eval
model.eval()
preds, actuals = [], []
with torch.no_grad():
    for batch in valid_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'price'}
        # reshape(-1) keeps a 1-D result even for a final batch of one
        # sample, so extend() always receives an iterable.
        pred = model(**inputs).reshape(-1)
        # Same floor the training loss applies to its predictions.
        pred = torch.clamp(pred, min=1e-6)
        # The targets are raw prices (no log transform is applied anywhere
        # upstream), so predictions and actuals are compared as they are;
        # expm1 here would wrongly exponentiate values already in price space.
        preds.extend(pred.cpu().numpy())
        actuals.extend(batch['price'].cpu().numpy())
smape_score = smape(np.array(actuals), np.array(preds))
print(f'T3 Validation SMAPE: {smape_score}')

# Error Analysis
pred_arr = np.asarray(preds, dtype=float)
actual_arr = np.asarray(actuals, dtype=float)
abs_diff = np.abs(pred_arr - actual_arr)
denom = np.abs(actual_arr) + np.abs(pred_arr)
errors = pd.DataFrame({
    'actual': actual_arr,
    'pred': pred_arr,
    'diff': pred_arr - actual_arr,
    # Per-sample SMAPE term in percent; 0 where actual and pred are both 0
    # (instead of 0/0 = NaN).
    'ape': 200 * np.divide(abs_diff, denom, out=np.zeros_like(abs_diff), where=denom != 0)
})
bins = [0, 10, 50, 100, 500, np.inf]
labels = ['0-10 (Low/Bulk)', '10-50', '50-100', '100-500 (High)', '500+ (Extreme)']
# include_lowest=True keeps a price of exactly 0 in the first bin.
errors['price_bin'] = pd.cut(errors['actual'], bins=bins, labels=labels, include_lowest=True)
# observed=False keeps every bin in the result (even empty ones), so the
# positional .iloc lookups below always refer to the intended bin.
by_bin = errors.groupby('price_bin', observed=False)
# 'ape' is already on the SMAPE scale (it carries the factor 2), so its
# per-bin mean is directly comparable to the overall validation SMAPE.
bin_smape = by_bin['ape'].mean()
print('SMAPE per bin:\n', bin_smape)
bin_bias = by_bin['diff'].mean()
print('Bias (pred - actual) per bin:\n', bin_bias)
bin_var = by_bin['diff'].var()
print('Error variance per bin:\n', bin_var)
top_errors = errors.sort_values('ape', ascending=False).head(10)
print('Top 10 worst predictions:\n', top_errors)
# iloc[-2] is the '100-500 (High)' bin, iloc[0] the '0-10 (Low/Bulk)' bin.
if bin_bias.iloc[-2] < 0:
    print('Insight: Underpredicting highs—try higher ViT weight or more high-price augmentation.')
if bin_smape.iloc[0] > bin_smape.mean():
    print('Insight: Bulk errors high—refine quantity regex or augment more low-price samples.')

# Save
# Everything from this run goes into its own timestamped directory:
# model weights, validation predictions and the text error report.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
out_dir = f'output/T3_{timestamp}'
os.makedirs(out_dir, exist_ok=True)

model_path = os.path.join(out_dir, 'model.pt')
torch.save(model.state_dict(), model_path)

valid_preds = pd.DataFrame({'actual': actuals, 'pred': preds})
valid_preds.to_csv(os.path.join(out_dir, 'preds_valid.csv'), index=False)

report_sections = [
    f'Validation SMAPE: {smape_score}\n',
    'SMAPE per bin:\n' + str(bin_smape) + '\n',
    'Bias per bin:\n' + str(bin_bias) + '\n',
    'Error variance per bin:\n' + str(bin_var) + '\n',
    'Top 10 worst:\n' + str(top_errors) + '\n',
]
with open(os.path.join(out_dir, 'error_analysis.txt'), 'w') as f:
    for section in report_sections:
        f.write(section)

# Test inference
test = pd.read_csv('input/test.csv')
test = pd.concat([test, test.apply(extract_features, axis=1)], axis=1)
# Reuse the imputer/scaler fitted on the training data (transform only).
test[num_cols] = imputer.transform(test[num_cols])
test[num_cols] = scaler.transform(test[num_cols])
test_ds = PriceDataset(test, tokenizer, image_processor)
test_loader = DataLoader(test_ds, batch_size=8)
model.eval()
pred_test = []
with torch.no_grad():
    for batch in test_loader:
        inputs = {k: v.to(device) for k, v in batch.items()}
        # reshape(-1) keeps a 1-D result even for a final batch of one
        # sample, so extend() always receives an iterable.
        pred = model(**inputs).reshape(-1)
        # Same floor the training loss applies to its predictions.
        pred = torch.clamp(pred, min=1e-6)
        # The model is trained against raw prices, so its output is already
        # in price space; applying expm1 would exponentiate it a second time.
        pred_test.extend(pred.cpu().numpy())
submission = pd.DataFrame({'sample_id': test['sample_id'], 'price': pred_test})
submission.to_csv(os.path.join(out_dir, 'submission.csv'), index=False)
print(f'Outputs saved to {out_dir}')

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]