T4 ensembles separate low-, mid-, and high-price models, uses pre-computed embeddings (gated to the high-price model only), and augments both tails of the price distribution, targeting a SMAPE of roughly 38-40.
Synthetic Data: 5x replication for highs (>100, T5 paraphrasing) and lows (<10, bulk terms + quantity scaling). Validation set has no synthetic data.
Embeddings: Load final_embeddings.pkl, map image_link to sample_id, assume 384 dims (confirm if different).
Models: Low (LightGBM, Tweedie p=1.2), mid (V16), high (DistilBERT + embeddings, Tweedie p=1.8 + focal), DistilBERT classifier, Ridge meta-learner.

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.linear_model import Ridge
import lightgbm as lgb
from transformers import DistilBertTokenizer, DistilBertModel, T5Tokenizer, T5ForConditionalGeneration
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import re
import os
from datetime import datetime
import pickle
import gc

# SMAPE
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Computes ``100 * mean(2 * |y_pred - y_true| / (|y_true| + |y_pred|))``,
    so the result lies between 0 and 200.

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predictions, broadcastable against ``y_true``.
            Values are paired by position.

    Returns:
        The SMAPE as a float. A pair where both values are 0 contributes 0
        (a perfect prediction) rather than the NaN that 0/0 would produce.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    numer = 2.0 * np.abs(y_pred - y_true)
    denom = np.abs(y_true) + np.abs(y_pred)
    # denom == 0 only when both values are 0; leave those terms at 0.
    ratio = np.divide(numer, denom, out=np.zeros_like(numer), where=denom != 0)
    return 100 * np.mean(ratio)

# Feature extraction
def extract_quantity(text):
    """Parse a pack size such as "pack of 6" out of ``text``.

    Looks (case-insensitively) for the first "<container> of <digits>" phrase,
    where the container is one of pack/box/set/bundle/case/dozen/carton/bulk.

    Returns:
        The digits as an int, or 1 when no such phrase is present.
    """
    found = re.search(
        r'(?:pack|box|set|bundle|case|dozen|carton|bulk) of (\d+)',
        text,
        flags=re.I,
    )
    if found is None:
        return 1
    return int(found.group(1))

def extract_numeric(text, pattern):
    """Return the number captured by group 1 of ``pattern`` in ``text``.

    The search is case-insensitive and only the first match is used.

    Returns:
        The captured text converted to float, or the integer 0 when the
        pattern does not match.
    """
    hit = re.search(pattern, text, flags=re.I)
    if not hit:
        return 0
    return float(hit.group(1))

# Unit name -> regex whose group 1 captures the number written directly before
# that unit. Every pattern ends in \b so the unit has to be a whole token:
# without it "10 ingredients" was read as 10 inches, "5 white" as 5 watts and
# "60 mph" as 60 megapixels. "inches" is listed explicitly because the
# boundary would otherwise reject it.
units = {
    'gb': r'(\d+\.?\d*)\s*gb\b',
    'oz': r'(\d+\.?\d*)\s*oz\b',
    'inch': r'(\d+\.?\d*)\s*(?:inch(?:es)?|in)\b',
    'mp': r'(\d+\.?\d*)\s*mp\b',
    'lbs': r'(\d+\.?\d*)\s*lbs?\b',
    'mah': r'(\d+\.?\d*)\s*mah\b',
    'watts': r'(\d+\.?\d*)\s*w(?:atts?)?\b',
    'kg': r'(\d+\.?\d*)\s*kg\b',
    'ml': r'(\d+\.?\d*)\s*ml\b'
}

def extract_features(row):
    """Derive hand-crafted numeric features from one row's catalog text.

    Args:
        row: Mapping or Series with a 'catalog_content' entry. A missing
            (NaN/None) or otherwise non-string entry is treated as empty text
            instead of raising on ``.lower()``.

    Returns:
        pd.Series holding 'quantity', one 'feat_<unit>' per entry in ``units``,
        'premium_keyword_count', 'condition_flag' (1 = new, 0 = used,
        0.5 = not stated), 'title_length' and 'content_word_count'.
    """
    raw = row['catalog_content']
    text = raw.lower() if isinstance(raw, str) else ''
    feats = {'quantity': extract_quantity(text)}
    for unit, pattern in units.items():
        feats[f'feat_{unit}'] = extract_numeric(text, pattern)
    premiums = ['premium', 'luxury', 'high-end', 'pro', 'ultra', 'elite', 'deluxe', 'professional']
    # Count whole words only: a plain substring count let 'pro' fire on
    # "product", "protein", "provides" and double-count "professional".
    feats['premium_keyword_count'] = sum(
        len(re.findall(rf'\b{re.escape(word)}\b', text)) for word in premiums
    )
    # "brand new" is already covered by the whole-word "new" alternative.
    if re.search(r'\b(?:new|mint)\b', text):
        feats['condition_flag'] = 1
    elif re.search(r'\b(?:used|refurbished|pre-owned)\b', text):
        feats['condition_flag'] = 0
    else:
        feats['condition_flag'] = 0.5
    # The title is taken to be everything before the first ". " or ": ".
    title = re.split(r'[.:]\s', text)[0]
    feats['title_length'] = len(title)
    feats['content_word_count'] = len(text.split())
    return pd.Series(feats)

# T5 paraphrasing (batched version)
# The 't5-small' tokenizer and seq2seq model are loaded once, at import time,
# as module-level globals; paraphrase_batch reads both of them.
# NOTE(review): from_pretrained presumably needs the checkpoint in the local
# cache or network access on first use — confirm for offline runs.
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')
t5_model = T5ForConditionalGeneration.from_pretrained('t5-small')
def paraphrase_batch(batch_texts):
    """Generate one rewritten string per input with the module-level T5 model.

    Each text is prefixed with "paraphrase: ", truncated to 128 tokens and
    decoded from a 4-beam search of at most 128 tokens.

    Args:
        batch_texts: Iterable of input strings.

    Returns:
        List of generated strings, one per input, in input order. An empty
        input returns an empty list without touching the tokenizer or model.

    NOTE(review): the stock 't5-small' checkpoint is not known to ship a
    "paraphrase:" task — spot-check that the outputs are genuine paraphrases.
    """
    batch_texts = list(batch_texts)
    if not batch_texts:
        # Nothing to generate; skip tokenisation and generation entirely.
        return []
    input_texts = [f"paraphrase: {text}" for text in batch_texts]
    inputs = t5_tokenizer(input_texts, return_tensors='pt', max_length=128, truncation=True, padding=True)
    with torch.no_grad():  # inference only, no autograd bookkeeping
        outputs = t5_model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=128,
            num_beams=4,
            early_stopping=True,
        )
    return [t5_tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

# Augmentation (with batched paraphrasing)
def augment_data(df, high_threshold=100, low_threshold=10, replicate=5,
                 paraphrase_frac=0.5, batch_size=4):
    """Oversample both price tails of ``df`` with lightly perturbed copies.

    High-price rows (``price > high_threshold``): a random
    ``paraphrase_frac`` of them (fixed seed 42) get their 'catalog_content'
    replaced by ``paraphrase_batch`` output plus " luxury edition"; the rest
    are kept verbatim. Low-price rows (``price < low_threshold``) get
    " bulk carton" appended and their 'quantity' doubled. Each tail is then
    appended ``replicate`` times after the untouched original rows.

    Args:
        df: Frame with 'price', 'catalog_content' and 'quantity' columns and
            a unique index (the paraphrased texts are written back by label).
        high_threshold: Rows priced strictly above this form the high tail.
        low_threshold: Rows priced strictly below this form the low tail.
        replicate: Number of copies of each tail to append; 0 appends nothing.
        paraphrase_frac: Fraction of the high tail sent through the paraphraser.
        batch_size: Texts per ``paraphrase_batch`` call; lower it if memory is tight.

    Returns:
        New frame with a fresh 0..n-1 index: the original rows, then the high
        copies, then the low copies. ``df`` itself is not modified.
    """
    high = df[df['price'] > high_threshold].copy()
    low = df[df['price'] < low_threshold].copy()

    # An empty high tail has nothing to sample or paraphrase.
    if not high.empty:
        high_subset = high.sample(frac=paraphrase_frac, random_state=42)
        subset_texts = high_subset['catalog_content'].tolist()
        paraphrased = []
        for start in range(0, len(subset_texts), batch_size):
            paraphrased.extend(paraphrase_batch(subset_texts[start:start + batch_size]))
            gc.collect()  # Free memory after each batch
        # frac can round down to zero rows; only write back when there is output.
        if paraphrased:
            high.loc[high_subset.index, 'catalog_content'] = [p + ' luxury edition' for p in paraphrased]

    low['catalog_content'] = low['catalog_content'] + ' bulk carton'
    low['quantity'] = low['quantity'] * 2

    # Empty tails are left out of the concat so they cannot affect result dtypes.
    extras = [part for part in [high] * replicate + [low] * replicate if not part.empty]
    augmented = pd.concat([df] + extras, ignore_index=True)
    gc.collect()
    return augmented

# Load and preprocess
# Read the train/test tables and the pickled embeddings from input/.
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
# NOTE(security): pickle.load can execute arbitrary code contained in the file;
# only load embeddings produced by a trusted source.
with open('input/final_embeddings.pkl', 'rb') as f:
    embeddings = pickle.load(f)
# NOTE(review): this assumes the pickle converts to a frame with one row per
# image whose index holds the image_link. If it is a {image_link: vector} dict,
# pd.DataFrame would place the links in the columns instead — confirm.
embeddings = pd.DataFrame(embeddings).reset_index().rename(columns={'index': 'image_link'})
# Every column other than the first is renamed emb_0 .. emb_{d-1}; the width d
# is taken from the loaded data rather than hard-coded.
embed_cols = [f'emb_{i}' for i in range(embeddings.shape[1] - 1)]
embeddings.columns = ['image_link'] + embed_cols
# Cast the join key to str on all three frames so the merge compares like with like.
train['image_link'] = train['image_link'].astype(str)
test['image_link'] = test['image_link'].astype(str)
embeddings['image_link'] = embeddings['image_link'].astype(str)
# Left merges keep every train/test row; a row with no matching embedding gets
# NaN in all emb_* columns.
train = train.merge(embeddings, on='image_link', how='left')
test = test.merge(embeddings, on='image_link', how='left')
gc.collect()
# Append the hand-crafted text features (one extract_features call per row).
train = pd.concat([train, train.apply(extract_features, axis=1)], axis=1)
test = pd.concat([test, test.apply(extract_features, axis=1)], axis=1)
gc.collect()

# Split (val has no synthetic data)
X = train.drop('price', axis=1)
y = train['price']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
# Re-attach the prices while X_* and y_* still share their original index
# labels. DataFrame.assign aligns a Series on labels, so resetting y first
# (as was done before) paired each row with another row's price and left NaN
# wherever a label was >= len(y).
train = X_train.assign(price=y_train).reset_index(drop=True)
valid = X_valid.assign(price=y_valid).reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_valid = y_valid.reset_index(drop=True)
# Only the training part is augmented; validation stays free of synthetic rows.
train = augment_data(train)
gc.collect()

# Hand-crafted numeric feature columns produced by extract_features.
num_cols = ['quantity'] + [f'feat_{u}' for u in units] + ['premium_keyword_count', 'condition_flag', 'title_length', 'content_word_count']
imputer = SimpleImputer(strategy='constant', fill_value=0)
scaler = StandardScaler()
train[num_cols] = train[num_cols].astype(np.float32)
valid[num_cols] = valid[num_cols].astype(np.float32)
test[num_cols] = test[num_cols].astype(np.float32)
# Imputer and scaler are fitted on the (augmented) training rows only and then
# applied unchanged to validation and test.
train[num_cols] = imputer.fit_transform(train[num_cols])
train[num_cols] = scaler.fit_transform(train[num_cols])
valid[num_cols] = imputer.transform(valid[num_cols])
valid[num_cols] = scaler.transform(valid[num_cols])
test[num_cols] = imputer.transform(test[num_cols])
test[num_cols] = scaler.transform(test[num_cols])
# Rows whose image_link had no embedding carry NaN in every emb_* column after
# the left merge. Zero-fill them (the same constant the numeric imputer uses)
# so NaN cannot reach the neural regressor and turn its loss into NaN.
train[embed_cols] = train[embed_cols].fillna(0.0).astype(np.float32)
valid[embed_cols] = valid[embed_cols].fillna(0.0).astype(np.float32)
test[embed_cols] = test[embed_cols].fillna(0.0).astype(np.float32)
gc.collect()

# Classifier Dataset
class ClassifierDataset(Dataset):
    """Text-only dataset for the binary "is this a high-price item" classifier.

    Items are dicts with 'input_ids' and 'attention_mask' for one
    'catalog_content' string, padded/truncated to ``max_length``. When the
    frame has a 'price' column, a 'labels' entry is added: 1 if price > 100,
    else 0.
    """

    def __init__(self, df, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = df['catalog_content'].values
        if 'price' in df.columns:
            self.labels = (df['price'] > 100).astype(int).values
        else:
            # Unlabelled data (e.g. the test set): items carry no 'labels'.
            self.labels = None

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        # squeeze() drops the size-1 batch axis the tokenizer returns.
        sample = {
            'input_ids': encoded['input_ids'].squeeze(),
            'attention_mask': encoded['attention_mask'].squeeze(),
        }
        if self.labels is None:
            return sample
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# High-Price Dataset
class HighPriceDataset(Dataset):
    """Multi-input dataset feeding the high-price regressor.

    Each item bundles the tokenised 'catalog_content' text with the row's
    numeric features (``num_cols``) and its embedding vector
    (``embed_cols``), both as float32 tensors. When the frame has a 'price'
    column the item also carries 'price', exactly as stored in the frame.
    """

    def __init__(self, df, tokenizer, max_length=128):
        has_price = 'price' in df.columns
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = df['catalog_content'].values
        self.numerics = df[num_cols].values.astype(np.float32)
        self.embeds = df[embed_cols].values.astype(np.float32)
        self.prices = df['price'].values if has_price else None

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.tokenizer(
            self.texts[idx],
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        # squeeze() drops the size-1 batch axis the tokenizer returns.
        sample = {
            'input_ids': tokens['input_ids'].squeeze(),
            'attention_mask': tokens['attention_mask'].squeeze(),
            'numerics': torch.tensor(self.numerics[idx], dtype=torch.float32),
            'embeds': torch.tensor(self.embeds[idx], dtype=torch.float32),
        }
        if self.prices is None:
            return sample
        sample['price'] = torch.tensor(self.prices[idx], dtype=torch.float32)
        return sample

# Models
class TweedieLoss(nn.Module):
    """Mean Tweedie-style loss with variance power ``p``.

    For prediction ``mu`` and target ``y`` the per-element loss is
    ``-y * mu**(1 - p) / (1 - p) + mu**(2 - p) / (2 - p)``. Predictions are
    clamped to at least 1e-6 before the powers are taken. ``p`` equal to 1
    or 2 would divide by zero.
    """

    def __init__(self, p=1.8):
        super().__init__()
        self.p = p

    def forward(self, pred, target):
        power = self.p
        mu = pred.clamp(min=1e-6)
        target_term = -target * mu.pow(1 - power) / (1 - power)
        mean_term = mu.pow(2 - power) / (2 - power)
        return (target_term + mean_term).mean()

class FocalLoss(nn.Module):
    """Mean of ``|pred - target| ** (1 + gamma)``, computed as ``l * l**gamma``.

    Larger errors are weighted more heavily as ``gamma`` grows. Predictions
    are clamped to at least 1e-6 before the error is taken.
    """

    def __init__(self, gamma=1.0):
        super().__init__()
        self.gamma = gamma

    def forward(self, pred, target):
        clamped = pred.clamp(min=1e-6)
        abs_err = (clamped - target).abs()
        weighted = abs_err * abs_err.pow(self.gamma)
        return weighted.mean()

class HighPriceRegressor(nn.Module):
    """DistilBERT text encoder fused with numeric and embedding inputs.

    The first-token hidden state (768 wide) is concatenated with the numeric
    features and the pre-computed embedding along dim 1, then passed through
    a 256 -> 64 -> 1 MLP with ReLU after the first two layers. The final
    layer's output (one value per row) is returned without an activation.
    """

    def __init__(self):
        super().__init__()
        self.text_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
        fused_width = 768 + len(num_cols) + len(embed_cols)
        self.fc1 = nn.Linear(fused_width, 256)
        self.fc2 = nn.Linear(256, 64)
        self.fc3 = nn.Linear(64, 1)
        self.relu = nn.ReLU()

    def forward(self, input_ids, attention_mask, numerics, embeds):
        hidden_states = self.text_model(input_ids, attention_mask=attention_mask).last_hidden_state
        first_token = hidden_states[:, 0, :]
        fused = torch.cat([first_token, numerics, embeds], dim=1)
        out = self.relu(self.fc1(fused))
        out = self.relu(self.fc2(out))
        return self.fc3(out)

# Preprocessing for LightGBM
# Text branch: unigram + bigram TF-IDF over 'catalog_content', vocabulary
# capped at 20000 terms (reduced).
text_transformer = TfidfVectorizer(max_features=20000, ngram_range=(1, 2))
# Numeric branch: constant-0 imputation followed by standardisation.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(fill_value=0, strategy='constant')),
    ('scaler', StandardScaler()),
])
# Both branches are applied side by side; columns not listed here are dropped
# by ColumnTransformer's default remainder setting.
preprocessor = ColumnTransformer([
    ('text', text_transformer, 'catalog_content'),
    ('num', num_transformer, num_cols),
])

# Train Classifier
# Fine-tunes DistilBERT plus a 2-way linear head to flag high-price (> 100) rows.
device = torch.device('cpu')  # M1 Pro, no GPU
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_clf_ds = ClassifierDataset(train, tokenizer)
valid_clf_ds = ClassifierDataset(valid, tokenizer)
train_clf_loader = DataLoader(train_clf_ds, batch_size=2, shuffle=True)  # Reduced batch size
valid_clf_loader = DataLoader(valid_clf_ds, batch_size=2)
clf_model = DistilBertModel.from_pretrained('distilbert-base-uncased').to(device)
clf_fc = nn.Linear(768, 2).to(device)
optimizer = torch.optim.AdamW(list(clf_model.parameters()) + list(clf_fc.parameters()), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()
best_val_loss = float('inf')
clf_checkpoint_saved = False
for epoch in range(2):
    clf_model.train()
    train_loss = 0
    for batch in train_clf_loader:
        optimizer.zero_grad()
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)
        emb = clf_model(**inputs).last_hidden_state[:, 0, :]
        logits = clf_fc(emb)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    clf_model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in valid_clf_loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            labels = batch['labels'].to(device)
            emb = clf_model(**inputs).last_hidden_state[:, 0, :]
            logits = clf_fc(emb)
            val_loss += loss_fn(logits, labels).item()
    print(f'Classifier epoch {epoch}: train loss sum {train_loss}, valid loss sum {val_loss}')
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # The backbone is fine-tuned together with the head, so both must be
        # checkpointed; saving only clf_fc would later pair the best head
        # with the last epoch's backbone weights.
        torch.save(clf_fc.state_dict(), 'clf_fc.pt')
        torch.save(clf_model.state_dict(), 'clf_model.pt')
        clf_checkpoint_saved = True
    gc.collect()
# Put the in-memory classifier back at its best epoch so every later use of
# clf_model / clf_fc sees a matching backbone and head.
if clf_checkpoint_saved:
    clf_model.load_state_dict(torch.load('clf_model.pt'))
    clf_fc.load_state_dict(torch.load('clf_fc.pt'))
clf_model.eval()

# Train Low/Mid Models
# Both LightGBM models are fitted on log1p(price) over TF-IDF + numeric features.
X_train_lgb = preprocessor.fit_transform(train)
X_valid_lgb = preprocessor.transform(valid)
X_test_lgb = preprocessor.transform(test)
gc.collect()
y_train_lgb = train['price']
y_valid_lgb = valid['price']
low_mask = y_train_lgb < 10
mid_mask = (y_train_lgb >= 10) & (y_train_lgb <= 100)
params_low = {'objective': 'tweedie', 'tweedie_variance_power': 1.2, 'learning_rate': 0.08, 'num_leaves': 31, 'min_data_in_leaf': 20, 'feature_pre_filter': False, 'verbose': -1}
params_mid = {'objective': 'regression', 'learning_rate': 0.05, 'num_leaves': 31, 'min_data_in_leaf': 20, 'feature_pre_filter': False, 'verbose': -1}
# Index the feature matrix with plain boolean arrays; it may be a scipy sparse
# matrix, which is not guaranteed to accept a pandas Series as an indexer.
dtrain_low = lgb.Dataset(X_train_lgb[low_mask.to_numpy()], label=np.log1p(y_train_lgb[low_mask]))
dtrain_mid = lgb.Dataset(X_train_lgb[mid_mask.to_numpy()], label=np.log1p(y_train_lgb[mid_mask]))


def _valid_bucket(mask, reference):
    """Build a validation Dataset from the validation rows selected by ``mask``.

    Falls back to every validation row when the bucket is empty, so early
    stopping always has something to score.
    """
    rows = mask.to_numpy()
    if not rows.any():
        rows = np.ones(len(rows), dtype=bool)
    return lgb.Dataset(
        X_valid_lgb[rows],
        label=np.log1p(y_valid_lgb.to_numpy()[rows]),
        reference=reference,
    )


# Each model gets its own validation Dataset, restricted to the price range it
# is trained on. Sharing one Dataset between both lgb.train calls fails once
# the first call has constructed it against dtrain_low (LightGBM cannot
# re-reference a Dataset whose raw data has been freed), and scoring the low
# model on mid/high prices makes the early-stopping signal meaningless.
dvalid_low = _valid_bucket(y_valid_lgb < 10, dtrain_low)
dvalid_mid = _valid_bucket((y_valid_lgb >= 10) & (y_valid_lgb <= 100), dtrain_mid)
model_low = lgb.train(params_low, dtrain_low, num_boost_round=1000, valid_sets=[dvalid_low], callbacks=[lgb.early_stopping(50)])
model_mid = lgb.train(params_mid, dtrain_mid, num_boost_round=1000, valid_sets=[dvalid_mid], callbacks=[lgb.early_stopping(50)])
gc.collect()

# Train High Model
# The regressor is trained on log1p(price) for rows priced above 100. Every
# consumer of its output applies expm1, so the training target has to live in
# the same log space; feeding raw prices made expm1 overflow and the
# validation SMAPE NaN.
train_high_ds = HighPriceDataset(train[train['price'] > 100], tokenizer)
valid_high_ds = HighPriceDataset(valid[valid['price'] > 100], tokenizer)
train_high_loader = DataLoader(train_high_ds, batch_size=2, shuffle=True)  # Reduced batch size
valid_high_loader = DataLoader(valid_high_ds, batch_size=2)
high_model = HighPriceRegressor().to(device)
optimizer = torch.optim.AdamW(high_model.parameters(), lr=2e-5)
tweedie_loss = TweedieLoss(p=1.8)
focal_loss = FocalLoss(gamma=1)
best_val_smape = float('inf')
high_checkpoint_saved = False
for epoch in range(2):
    high_model.train()
    for batch in train_high_loader:
        optimizer.zero_grad()
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'price'}
        target = torch.log1p(batch['price'].to(device))
        # squeeze(-1) removes only the output axis. A bare squeeze() also
        # removes the batch axis when the final batch holds a single row.
        pred = high_model(**inputs).squeeze(-1)
        loss = 0.7 * tweedie_loss(pred, target) + 0.3 * focal_loss(pred, target)
        loss.backward()
        optimizer.step()
    high_model.eval()
    preds, actuals = [], []
    with torch.no_grad():
        for batch in valid_high_loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'price'}
            pred = high_model(**inputs).squeeze(-1)
            # Back to price space for scoring; the targets are already raw prices.
            preds.extend(np.expm1(pred.cpu().numpy()))
            actuals.extend(batch['price'].numpy())
    # With no high-priced validation rows there is nothing to score.
    val_smape = smape(np.array(actuals), np.array(preds)) if preds else float('nan')
    improved = val_smape < best_val_smape
    if improved:
        best_val_smape = val_smape
    # Always write a checkpoint on the first epoch so a file from this run
    # exists even if the score is NaN; afterwards only overwrite on improvement.
    if improved or not high_checkpoint_saved:
        torch.save(high_model.state_dict(), 'high_model.pt')
        high_checkpoint_saved = True
    gc.collect()

# Predict
clf_model.eval()
clf_fc.load_state_dict(torch.load('clf_fc.pt'))
high_model.load_state_dict(torch.load('high_model.pt'))
high_model.eval()
pred_low = np.expm1(model_low.predict(X_valid_lgb))
pred_mid = np.expm1(model_mid.predict(X_valid_lgb))

# Gate: the classifier decides which validation rows go to the high model.
valid_clf_ds = ClassifierDataset(valid, tokenizer)
valid_clf_loader = DataLoader(valid_clf_ds, batch_size=2)  # Reduced
is_high = []
with torch.no_grad():
    for batch in valid_clf_loader:
        # valid has a 'price' column, so batches carry 'labels'; DistilBERT's
        # forward does not accept that keyword and it must be dropped.
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        emb = clf_model(**inputs).last_hidden_state[:, 0, :]
        logits = clf_fc(emb)
        is_high.extend(torch.softmax(logits, dim=1)[:, 1].cpu().numpy() > 0.5)
is_high = np.array(is_high, dtype=bool)

# Run the high model on exactly the rows the classifier flagged (not on rows
# picked by their true price) and scatter the results into a full-length
# vector, so np.where below sees arrays of equal length. Unflagged positions
# keep the mid prediction and are never selected from pred_high.
# NOTE(review): expm1 assumes high_model outputs log1p(price) — confirm the
# training loop feeds it log1p targets.
pred_high = pred_mid.copy()
valid_high_loader = DataLoader(HighPriceDataset(valid[is_high], tokenizer), batch_size=2)  # Reduced
gated_high = []
with torch.no_grad():
    for batch in valid_high_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'price'}
        # squeeze(-1) keeps the batch axis even for a single-row batch.
        pred = high_model(**inputs).squeeze(-1)
        gated_high.extend(np.expm1(pred.cpu().numpy()))
if gated_high:
    pred_high[is_high] = gated_high

# NOTE(review): is_low is derived from the TRUE price, which does not exist at
# test time, so smape_score below is optimistic for low-priced rows. The
# second figure uses the same label-free routing as test inference.
is_low = (valid['price'] < 10).to_numpy()
is_mid = (~is_high) & (~is_low)
preds = np.where(is_high, pred_high, np.where(is_low, pred_low, pred_mid))
smape_score = smape(valid['price'], preds)
print(f'T4 Validation SMAPE: {smape_score}')
smape_no_oracle = smape(valid['price'], np.where(is_high, pred_high, pred_mid))
print(f'T4 Validation SMAPE (test-time routing, no oracle low gate): {smape_no_oracle}')
gc.collect()

# Error Analysis
# Per-row errors on the validation set, then summaries per true-price bin.
errors = pd.DataFrame({
    'actual': valid['price'],
    'pred': preds,
    'diff': preds - valid['price'],
    # Per-row symmetric APE on the same 0-200 scale that smape() reports.
    'ape': 2 * 100 * np.abs(preds - valid['price']) / (np.abs(valid['price']) + np.abs(preds))
})
bins = [0, 10, 50, 100, 500, np.inf]
labels = ['0-10 (Low/Bulk)', '10-50', '50-100', '100-500 (High)', '500+ (Extreme)']
# NOTE(review): pd.cut intervals are right-closed by default, so a price of
# exactly 10 lands in '0-10' although the low model is trained on price < 10.
errors['price_bin'] = pd.cut(errors['actual'], bins=bins, labels=labels)
# observed=False keeps empty bins in the report.
by_bin = errors.groupby('price_bin', observed=False)
# The mean of 'ape' already is the bin's SMAPE. The former "/ 2" halved it and
# made the per-bin numbers incomparable with the overall validation SMAPE.
bin_smape = by_bin['ape'].mean()
print('SMAPE per bin:\n', bin_smape)
bin_bias = by_bin['diff'].mean()
print('Bias per bin:\n', bin_bias)
bin_var = by_bin['diff'].var()
print('Error variance per bin:\n', bin_var)
top_errors = errors.sort_values('ape', ascending=False).head(10)
print('Top 10 worst predictions:\n', top_errors)
gc.collect()

# Save
# Everything goes into a fresh, timestamped directory under output/.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
out_dir = f'output/T4_{timestamp}'
os.makedirs(out_dir, exist_ok=True)

# Model artefacts: the neural high-price regressor and both LightGBM boosters.
torch.save(high_model.state_dict(), os.path.join(out_dir, 'high_model.pt'))
model_low.save_model(os.path.join(out_dir, 'low_model.txt'))
model_mid.save_model(os.path.join(out_dir, 'mid_model.txt'))

# Validation predictions next to their true prices.
valid_preds_frame = pd.DataFrame({'actual': valid['price'], 'pred': preds})
valid_preds_frame.to_csv(os.path.join(out_dir, 'preds_valid.csv'), index=False)

# Plain-text copy of the error analysis printed earlier.
report_lines = [
    f'Validation SMAPE: {smape_score}\n',
    'SMAPE per bin:\n' + str(bin_smape) + '\n',
    'Bias per bin:\n' + str(bin_bias) + '\n',
    'Error variance per bin:\n' + str(bin_var) + '\n',
    'Top 10 worst:\n' + str(top_errors) + '\n',
]
with open(os.path.join(out_dir, 'error_analysis.txt'), 'w') as f:
    f.writelines(report_lines)

# Test inference
clf_model.eval()
high_model.eval()
test_clf_ds = ClassifierDataset(test, tokenizer)
test_clf_loader = DataLoader(test_clf_ds, batch_size=2)  # Reduced
is_high_test = []
with torch.no_grad():
    for batch in test_clf_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        emb = clf_model(**inputs).last_hidden_state[:, 0, :]
        logits = clf_fc(emb)
        is_high_test.extend(torch.softmax(logits, dim=1)[:, 1].cpu().numpy() > 0.5)
is_high_test = np.array(is_high_test, dtype=bool)
# NOTE(review): pred_low is computed but never routed to, because no
# label-free "low" gate exists at test time; low-priced items therefore
# receive the mid model's prediction.
pred_low = np.expm1(model_low.predict(X_test_lgb))
pred_mid = np.expm1(model_mid.predict(X_test_lgb))

# Score only the classifier-flagged rows with the high model, then scatter the
# results into a full-length vector. Padding or truncating a short vector to
# is_high_test.sum() entries cannot be broadcast against the full-length mask
# in np.where and misaligns predictions with rows.
# NOTE(review): expm1 assumes high_model outputs log1p(price) — confirm the
# training loop feeds it log1p targets.
test_high_ds = HighPriceDataset(test[is_high_test], tokenizer)
test_high_loader = DataLoader(test_high_ds, batch_size=2)  # Reduced
gated_high = []
with torch.no_grad():
    for batch in test_high_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'price'}
        # squeeze(-1) keeps the batch axis even for a single-row batch.
        pred = high_model(**inputs).squeeze(-1)
        gated_high.extend(np.expm1(pred.cpu().numpy()))
pred_high = pred_mid.copy()
if gated_high:
    pred_high[is_high_test] = gated_high
pred_test = np.where(is_high_test, pred_high, pred_mid)
submission = pd.DataFrame({'sample_id': test['sample_id'], 'price': pred_test})
submission.to_csv(os.path.join(out_dir, 'submission.csv'), index=False)
print(f'Outputs saved to {out_dir}')
gc.collect()