In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
from transformers import DistilBertTokenizer, DistilBertModel, T5Tokenizer, T5ForConditionalGeneration
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import LinearLR
import re
import os
from datetime import datetime
import pickle
import gc
from torch.cuda.amp import autocast, GradScaler  # Mixed precision

# SMAPE
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error on a 0-200 scale.

    Computes ``100 * mean(2 * |y_pred - y_true| / (|y_true| + |y_pred|))``.

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predictions, same shape as ``y_true``.

    Returns:
        The SMAPE as a float. Pairs where both values are zero contribute
        an error of 0 instead of turning the whole mean into NaN.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = np.abs(y_true) + np.abs(y_pred)
    numer = 2 * np.abs(y_pred - y_true)
    # A zero denominator means prediction and target are both exactly 0,
    # i.e. a perfect prediction, so the term is defined as 0 (not 0/0).
    ratio = np.divide(numer, denom, out=np.zeros_like(denom), where=denom != 0)
    return 100 * np.mean(ratio)

# Feature extraction
def extract_quantity(text):
    """Return the pack size named by a phrase such as "pack of 12".

    Looks (case-insensitively) for the first container word followed by
    "of <digits>" and returns those digits as an int; returns 1 when no
    such phrase is present.
    """
    found = re.search(
        r'(?:pack|box|set|bundle|case|dozen|carton|bulk) of (\d+)',
        text,
        flags=re.I,
    )
    if found is None:
        return 1
    return int(found.group(1))

def extract_numeric(text, pattern):
    """Return the first capture group of ``pattern`` in ``text`` as a float.

    The search is case-insensitive. When the pattern does not match,
    0 is returned.
    """
    hit = re.search(pattern, text, re.I)
    if not hit:
        return 0
    return float(hit.group(1))

# Regex per measurement unit; group 1 captures the numeric value that
# immediately precedes the unit token. Each pattern ends in a word boundary
# so a unit is not matched as the prefix of an unrelated word
# (e.g. "3 white" is not 3 watts, "5 mph" is not 5 mp, "2 ingredients" is
# not 2 inches). Plural spellings that previously matched only by prefix
# ("inches", "kgs") are listed explicitly.
units = {
    'gb': r'(\d+\.?\d*)\s*gb\b',
    'oz': r'(\d+\.?\d*)\s*oz\b',
    'inch': r'(\d+\.?\d*)\s*(?:inch(?:es)?|in)\b',
    'mp': r'(\d+\.?\d*)\s*mp\b',
    'lbs': r'(\d+\.?\d*)\s*lbs?\b',
    'mah': r'(\d+\.?\d*)\s*mah\b',
    'watts': r'(\d+\.?\d*)\s*w(?:atts?)?\b',
    'kg': r'(\d+\.?\d*)\s*kgs?\b',
    'ml': r'(\d+\.?\d*)\s*ml\b',
}

def extract_features(row):
    """Derive hand-crafted numeric features from a row's ``catalog_content``.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with a
            ``'catalog_content'`` entry. A missing / non-string value is
            treated as empty text.

    Returns:
        pd.Series with ``quantity``, one ``feat_<unit>`` entry per key of
        ``units``, ``premium_keyword_count``, ``title_length`` and
        ``content_word_count``.
    """
    raw = row['catalog_content']
    # NaN (float) shows up for missing CSV cells; .lower() would raise on it.
    text = raw.lower() if isinstance(raw, str) else ''
    feats = {'quantity': extract_quantity(text)}
    for unit, pattern in units.items():
        feats[f'feat_{unit}'] = extract_numeric(text, pattern)
    premiums = ['premium', 'luxury', 'high-end', 'pro', 'ultra', 'elite', 'deluxe', 'professional']
    # Match whole words only: plain substring counting would count "pro"
    # inside "product"/"protein" and count "professional" twice.
    premium_pattern = r'\b(?:' + '|'.join(re.escape(word) for word in premiums) + r')\b'
    feats['premium_keyword_count'] = len(re.findall(premium_pattern, text))
    # The "title" is whatever precedes the first ". " or ": " separator.
    title = re.split(r'[.:]\s', text)[0]
    feats['title_length'] = len(title)
    feats['content_word_count'] = len(text.split())
    return pd.Series(feats)

# Batched T5 paraphrasing
# The 't5-small' tokenizer and seq2seq model are loaded once at module level
# and reused by every paraphrase_batch call. The model is left on the device
# from_pretrained puts it on; nothing here moves it.
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')
t5_model = T5ForConditionalGeneration.from_pretrained('t5-small')
def paraphrase_batch(batch_texts):
    """Generate one T5 output string per input text.

    Each text is prefixed with "paraphrase: ", tokenized (padded to the
    longest item, truncated to 64 tokens) and decoded from a 4-beam search
    capped at 64 output tokens.

    NOTE(review): "paraphrase:" does not look like one of the task prefixes
    the stock T5 checkpoints were trained with -- confirm the generated text
    is actually a paraphrase.

    Args:
        batch_texts: Iterable of input texts.

    Returns:
        List of decoded strings with special tokens removed, one per input.
    """
    prompts = [f"paraphrase: {text}" for text in batch_texts]
    encoded = t5_tokenizer(
        prompts,
        return_tensors='pt',
        max_length=64,
        truncation=True,
        padding=True,
    )
    generated = t5_model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=64,
        num_beams=4,
        early_stopping=True,
    )
    decoded = []
    for token_ids in generated:
        decoded.append(t5_tokenizer.decode(token_ids, skip_special_tokens=True))
    return decoded

# Augmentation (train only, reduced frac and replication for RAM)
def augment_data(df):
    """Append synthetic copies of expensive and cheap rows to ``df``.

    Rows with price > 100 get their ``catalog_content`` replaced by a
    paraphrase_batch output plus " luxury edition"; rows with price < 10 get
    " bulk carton" appended and their ``quantity`` doubled. Each modified
    group is repeated three times and concatenated after the original rows.

    Args:
        df: DataFrame with ``price``, ``catalog_content`` and ``quantity``
            columns.

    Returns:
        New DataFrame (fresh 0..n-1 index) holding the original rows followed
        by the replicated expensive rows and then the replicated cheap rows.
    """
    expensive = df[df['price'] > 100].copy()
    cheap = df[df['price'] < 10].copy()

    # Paraphrase the expensive rows a few at a time to bound memory use.
    source_texts = expensive['catalog_content'].tolist()
    chunk = 4
    rewritten = []
    for start in range(0, len(source_texts), chunk):
        rewritten.extend(paraphrase_batch(source_texts[start:start + chunk]))
        gc.collect()

    expensive['catalog_content'] = [sentence + ' luxury edition' for sentence in rewritten]
    cheap['catalog_content'] = cheap['catalog_content'] + ' bulk carton'
    cheap['quantity'] = cheap['quantity'] * 2

    expensive = pd.concat([expensive, expensive, expensive], ignore_index=True)
    cheap = pd.concat([cheap, cheap, cheap], ignore_index=True)
    combined = pd.concat([df, expensive, cheap], ignore_index=True)
    gc.collect()
    return combined

# Load and preprocess
# Reads the train/test CSVs and a pickled embeddings object, joins the
# embeddings onto both frames by 'image_link', then appends the hand-crafted
# text features produced by extract_features.
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
# SECURITY: pickle.load executes arbitrary code embedded in the file; only
# load embeddings files from a trusted source.
with open('input/final_embeddings.pkl', 'rb') as f:
    embeddings = pickle.load(f)
# NOTE(review): this expects the DataFrame built from the pickle to have the
# image link as its row index and one embedding dimension per column. If the
# pickle is a plain dict of {image_link: vector}, pd.DataFrame(dict) puts the
# links on the columns instead -- confirm the stored layout.
embeddings = pd.DataFrame(embeddings).reset_index().rename(columns={'index': 'image_link'})
# Every column except the first (the link) is renamed emb_0, emb_1, ...
embed_cols = [f'emb_{i}' for i in range(embeddings.shape[1] - 1)]
embeddings.columns = ['image_link'] + embed_cols
# Cast the join key to str on all three frames so the merge compares like with like.
train['image_link'] = train['image_link'].astype(str)
test['image_link'] = test['image_link'].astype(str)
embeddings['image_link'] = embeddings['image_link'].astype(str)
# Left joins keep every train/test row; rows without a matching embedding get
# NaN in the emb_* columns.
train = train.merge(embeddings, on='image_link', how='left')
test = test.merge(embeddings, on='image_link', how='left')
gc.collect()
# Row-wise feature extraction; the resulting feature columns are appended
# side by side to the existing columns.
train = pd.concat([train, train.apply(extract_features, axis=1)], axis=1)
test = pd.concat([test, test.apply(extract_features, axis=1)], axis=1)
gc.collect()

# Split (val has no synthetic data)
# Hold out 20% of the rows for validation, then augment only the training part.
X = train.drop('price', axis=1)
y = train['price']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
# DataFrame.assign aligns the assigned Series on index labels. The split
# pieces keep their shuffled original labels, so features and targets must be
# re-indexed together; resetting only the targets would pair each row with
# another row's price (or NaN where the label does not exist).
X_train = X_train.reset_index(drop=True)
X_valid = X_valid.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_valid = y_valid.reset_index(drop=True)
train = X_train.assign(price=y_train)
train = augment_data(train)
valid = X_valid.assign(price=y_valid)
gc.collect()

# Numeric model inputs: hand-crafted text features followed by the image
# embedding columns.
num_cols = (
    ['quantity']
    + [f'feat_{u}' for u in units]
    + ['premium_keyword_count', 'title_length', 'content_word_count']
    + embed_cols
)
imputer = SimpleImputer(strategy='constant', fill_value=0)
scaler = StandardScaler()

# Cast every split to float32 before imputing/scaling.
for _frame in (train, valid, test):
    _frame[num_cols] = _frame[num_cols].astype(np.float32)

# Imputer and scaler statistics come from the training split only ...
train[num_cols] = imputer.fit_transform(train[num_cols])
train[num_cols] = scaler.fit_transform(train[num_cols])

# ... and are then applied unchanged to validation and test.
for _frame in (valid, test):
    _frame[num_cols] = imputer.transform(_frame[num_cols])
    _frame[num_cols] = scaler.transform(_frame[num_cols])
del _frame
gc.collect()

# Dataset
class PriceDataset(Dataset):
    """Torch dataset yielding tokenized text, numeric features and log-price.

    Each item is a dict with ``input_ids`` and ``attention_mask`` (1-D, length
    ``max_length``), ``numerics`` (float32 vector of the ``num_cols`` values)
    and, when the frame has a ``price`` column, ``price`` = log1p(price).
    """

    def __init__(self, df, tokenizer, max_length=64):
        """Snapshot the columns needed for item construction.

        Args:
            df: DataFrame with ``catalog_content`` and every column in the
                module-level ``num_cols``; ``price`` is optional.
            tokenizer: Callable tokenizer returning ``input_ids`` and
                ``attention_mask`` tensors with a leading batch dim of 1.
            max_length: Fixed token length each text is padded/truncated to.
        """
        # Missing descriptions become empty strings so the tokenizer never
        # receives a NaN float.
        self.texts = df['catalog_content'].fillna('').astype(str).values
        self.numerics = df[num_cols].values.astype(np.float32)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.prices = df['price'].values if 'price' in df.columns else None

    def __len__(self):
        """Return the number of rows."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Build the feature dict for row ``idx``."""
        text = self.texts[idx]
        encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        item = {
            # squeeze(0) removes only the tokenizer's batch dim; a bare
            # squeeze() would also drop the sequence dim when max_length == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'numerics': torch.tensor(self.numerics[idx], dtype=torch.float32)
        }
        if self.prices is not None:
            # The regression target lives in log1p space; predictions are
            # mapped back with expm1 by the caller.
            item['price'] = torch.tensor(np.log1p(self.prices[idx]), dtype=torch.float32)
        return item

# Model
class BertPriceRegressor(nn.Module):
    """DistilBERT text encoder fused with numeric features for regression.

    The first-token hidden state of DistilBERT is concatenated with the
    numeric feature vector and passed through Linear -> ReLU -> Linear to
    produce one value per sample.
    """

    def __init__(self):
        """Load the pretrained encoder and create the two-layer head."""
        super().__init__()
        self.text_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
        # Head input = 768-wide text vector + one slot per numeric column.
        fused_width = 768 + len(num_cols)
        self.fc1 = nn.Linear(fused_width, 256)
        self.fc3 = nn.Linear(256, 1)
        self.relu = nn.ReLU()

    def forward(self, input_ids, attention_mask, numerics):
        """Return a (batch, 1) tensor of predictions.

        Args:
            input_ids: Token id tensor fed to the text encoder.
            attention_mask: Matching attention mask.
            numerics: Per-sample numeric features, concatenated to the text
                vector along dim 1.
        """
        encoder_out = self.text_model(input_ids, attention_mask=attention_mask)
        first_token = encoder_out.last_hidden_state[:, 0, :]
        fused = torch.cat((first_token, numerics), dim=1)
        hidden = self.fc1(fused)
        hidden = self.relu(hidden)
        return self.fc3(hidden)

# Training
# Fine-tunes BertPriceRegressor on log1p(price) with gradient accumulation,
# evaluates SMAPE on the validation split after every epoch, and keeps the
# best-scoring weights (on disk as 'model.pt' and, after the loop, in `model`).
device = torch.device('cpu')
# Mixed precision is only enabled on CUDA. Parameters always stay in float32:
# calling model.half() breaks GradScaler (it refuses fp16 gradients) and makes
# the float32 numeric features clash with fp16 weights in the first Linear.
use_amp = device.type == 'cuda'
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_ds = PriceDataset(train, tokenizer)
valid_ds = PriceDataset(valid, tokenizer)
train_loader = DataLoader(train_ds, batch_size=2, shuffle=True)
valid_loader = DataLoader(valid_ds, batch_size=2)
model = BertPriceRegressor().to(device)
scaler_amp = GradScaler(enabled=use_amp)
optimizer = AdamW(model.parameters(), lr=2e-5)
# Linear warm-up from 0.1x to 1x of the learning rate over the first 100
# optimizer steps (the scheduler is stepped once per optimizer step below).
scheduler = LinearLR(optimizer, start_factor=0.1, total_iters=100)
loss_fn = nn.MSELoss()
best_val_smape = float('inf')
best_preds, best_actuals = [], []
accumulation_steps = 4
num_train_batches = len(train_loader)
for epoch in range(2):
    model.train()
    optimizer.zero_grad()
    for i, batch in enumerate(train_loader):
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'price'}
        target = batch['price'].to(device)
        with torch.autocast(device_type=device.type, enabled=use_amp):
            # squeeze(-1) drops only the output feature dim, so a final batch
            # of size 1 still yields a 1-D tensor matching `target`.
            pred = model(**inputs).squeeze(-1)
            loss = loss_fn(pred.float(), target)
            loss = loss / accumulation_steps
        scaler_amp.scale(loss).backward()
        # Step every `accumulation_steps` batches, and also on the last batch
        # so gradients from a trailing partial group are not thrown away.
        if (i + 1) % accumulation_steps == 0 or (i + 1) == num_train_batches:
            scaler_amp.step(optimizer)
            scaler_amp.update()
            optimizer.zero_grad()
            scheduler.step()
    model.eval()
    preds, actuals = [], []
    with torch.no_grad():
        for batch in valid_loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'price'}
            target = batch['price'].to(device)
            with torch.autocast(device_type=device.type, enabled=use_amp):
                pred = model(**inputs).squeeze(-1)
            # Back from log1p space to prices before scoring.
            preds.extend(np.expm1(pred.float().cpu().numpy()))
            actuals.extend(np.expm1(target.float().cpu().numpy()))
    val_smape = smape(np.array(actuals), np.array(preds))
    if val_smape < best_val_smape:
        best_val_smape = val_smape
        best_preds, best_actuals = preds, actuals
        torch.save(model.state_dict(), 'model.pt')
    gc.collect()
# Make the in-memory model and the validation predictions correspond to the
# epoch whose SMAPE is reported, not merely to the last epoch.
if best_preds:
    model.load_state_dict(torch.load('model.pt', map_location=device))
    preds, actuals = best_preds, best_actuals
print(f'T6 Validation SMAPE: {best_val_smape}')

# Error Analysis
# Breaks the validation error down by actual-price bin: SMAPE, mean signed
# error (bias) and error variance per bin, plus the ten worst predictions.
actual_arr = np.asarray(actuals, dtype=float)
pred_arr = np.asarray(preds, dtype=float)
abs_err = np.abs(pred_arr - actual_arr)
err_denom = np.abs(actual_arr) + np.abs(pred_arr)
errors = pd.DataFrame({
    'actual': actual_arr,
    'pred': pred_arr,
    'diff': pred_arr - actual_arr,
    # Per-sample symmetric percentage error on the same 0-200 scale as
    # smape(); a 0/0 pair (both values zero) counts as zero error.
    'ape': 200 * np.divide(abs_err, err_denom, out=np.zeros_like(err_denom), where=err_denom != 0),
})
bins = [0, 10, 50, 100, 500, np.inf]
labels = ['0-10 (Low/Bulk)', '10-50', '50-100', '100-500 (High)', '500+ (Extreme)']
# include_lowest keeps rows whose actual price is exactly 0 in the first bin
# instead of leaving them unbinned.
errors['price_bin'] = pd.cut(errors['actual'], bins=bins, labels=labels, include_lowest=True)
# The per-bin mean of 'ape' is already a SMAPE on the scale smape() uses, so
# it is reported as-is to stay comparable with the overall validation SMAPE.
bin_smape = errors.groupby('price_bin', observed=False)['ape'].mean()
print('SMAPE per bin:\n', bin_smape)
bin_bias = errors.groupby('price_bin', observed=False)['diff'].mean()
print('Bias per bin:\n', bin_bias)
bin_var = errors.groupby('price_bin', observed=False)['diff'].var()
print('Error variance per bin:\n', bin_var)
top_errors = errors.sort_values('ape', ascending=False).head(10)
print('Top 10 worst predictions:\n', top_errors)

# Save
# Writes the model weights, the validation predictions and a plain-text error
# report into a fresh timestamped directory under output/.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
out_dir = f'output/T6_{timestamp}'
os.makedirs(out_dir, exist_ok=True)

weights_path = os.path.join(out_dir, 'model.pt')
torch.save(model.state_dict(), weights_path)

valid_preds_path = os.path.join(out_dir, 'preds_valid.csv')
pd.DataFrame({'actual': actuals, 'pred': preds}).to_csv(valid_preds_path, index=False)

# One entry per report section, written in this order.
report_sections = [
    f'Validation SMAPE: {best_val_smape}\n',
    'SMAPE per bin:\n' + str(bin_smape) + '\n',
    'Bias per bin:\n' + str(bin_bias) + '\n',
    'Error variance per bin:\n' + str(bin_var) + '\n',
    'Top 10 worst:\n' + str(top_errors) + '\n',
]
with open(os.path.join(out_dir, 'error_analysis.txt'), 'w') as f:
    for section in report_sections:
        f.write(section)

# Test inference
# Predicts a price for every test row and writes sample_id/price pairs to
# submission.csv inside out_dir.
test_ds = PriceDataset(test, tokenizer)
test_loader = DataLoader(test_ds, batch_size=2)
model.eval()
pred_test = []
with torch.no_grad():
    for batch in test_loader:
        # Skip a 'price' entry if the test frame happens to carry one; the
        # model's forward only accepts the three feature tensors.
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'price'}
        # Autocast is only switched on for CUDA devices.
        with torch.autocast(device_type=device.type, enabled=device.type == 'cuda'):
            # squeeze(-1) keeps a 1-D result even when the final batch holds
            # a single row, so extend() below always receives an iterable.
            pred = model(**inputs).squeeze(-1)
        # Predictions are in log1p space; expm1 maps them back to prices.
        pred_test.extend(np.expm1(pred.float().cpu().numpy()))
submission = pd.DataFrame({'sample_id': test['sample_id'], 'price': pred_test})
submission.to_csv(os.path.join(out_dir, 'submission.csv'), index=False)
print(f'Outputs saved to {out_dir}')

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


: 