# Product Length Prediction
Text Encoder + Product Type Embedding → MLP → Product Length

In [1]:
# Install dependencies (Colab)
!pip install -q pytorch-lightning transformers wandb sentence-transformers

In [4]:
import os
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
from tqdm import tqdm
import wandb

wandb.login()

pl.seed_everything(42)

[34m[1mwandb[0m: Currently logged in as: [33mbhanu-prasanna2001[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
Seed set to 42


42

In [5]:
# Update these paths to your Drive location
DATA_DIR = '/kaggle/input/amazon-ml-challenge-2023/total_sentence_data/total_sentence_data/'
TRAIN_PATH = DATA_DIR + 'total_sentence_train.csv'
TEST_PATH = DATA_DIR + 'total_sentence_test.csv'

## Config

In [6]:
from dataclasses import dataclass

@dataclass
class Config:
    train_path: str = TRAIN_PATH
    test_path: str = TEST_PATH
    max_length: int = 256
    text_encoder: str = "sentence-transformers/all-MiniLM-L6-v2"
    product_type_emb_dim: int = 64
    hidden_dims: tuple = (256, 64)
    dropout: float = 0.1
    batch_size: int = 64
    lr: float = 2e-5
    weight_decay: float = 0.01
    epochs: int = 2
    warmup_ratio: float = 0.1
    num_workers: int = 2
    seed: int = 42

config = Config()

## Dataset

In [7]:
class ProductDataset(Dataset):
    def __init__(self, df, tokenizer, max_length, product_type_map, is_test=False):
        self.texts = df['TOTAL_SENTENCE'].tolist()
        self.product_types = df['PRODUCT_TYPE_ID'].map(product_type_map).fillna(0).astype(int).tolist()
        self.targets = None if is_test else df['PRODUCT_LENGTH'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_test = is_test

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )
        item = {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'product_type': torch.tensor(self.product_types[idx], dtype=torch.long)
        }
        if not self.is_test:
            item['target'] = torch.tensor(self.targets[idx], dtype=torch.float32)
        return item

In [8]:
class ProductDataModule(pl.LightningDataModule):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(config.text_encoder)
        self.product_type_map = None

    def setup(self, stage=None):
        train_df = pd.read_csv(self.config.train_path)
        
        # Shuffle the dataframe for random split
        train_df = train_df.sample(frac=1, random_state=self.config.seed).reset_index(drop=True)
        
        all_types = train_df['PRODUCT_TYPE_ID'].unique()
        self.product_type_map = {t: i+1 for i, t in enumerate(all_types)}
        self.num_product_types = len(all_types) + 1
        
        # Split train/val/test (80/10/10)
        n = len(train_df)
        train_end = int(0.8 * n)
        val_end = int(0.9 * n)
        
        self.train_df = train_df.iloc[:train_end]
        self.val_df = train_df.iloc[train_end:val_end]
        self.test_df = train_df.iloc[val_end:]
        
        self.train_ds = ProductDataset(
            self.train_df, self.tokenizer, self.config.max_length, self.product_type_map
        )
        self.val_ds = ProductDataset(
            self.val_df, self.tokenizer, self.config.max_length, self.product_type_map
        )
        self.test_ds = ProductDataset(
            self.test_df, self.tokenizer, self.config.max_length, self.product_type_map
        )

    def train_dataloader(self):
        return DataLoader(
            self.train_ds,
            batch_size=self.config.batch_size,
            shuffle=True,
            num_workers=self.config.num_workers,
            pin_memory=True,
            persistent_workers=True
        )

    def val_dataloader(self):
        return DataLoader(
            self.val_ds,
            batch_size=self.config.batch_size,
            shuffle=False,
            num_workers=self.config.num_workers,
            pin_memory=True,
            persistent_workers=True
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_ds,
            batch_size=self.config.batch_size,
            shuffle=False,
            num_workers=self.config.num_workers,
            pin_memory=True,
            persistent_workers=True
        )

## Model

In [9]:
class ProductLengthModel(pl.LightningModule):
    def __init__(self, config, num_product_types):
        super().__init__()
        self.save_hyperparameters()
        self.config = config

        self.text_encoder = AutoModel.from_pretrained(config.text_encoder)
        self.text_dim = self.text_encoder.config.hidden_size

        self.product_emb = nn.Embedding(num_product_types, config.product_type_emb_dim)

        input_dim = self.text_dim + config.product_type_emb_dim
        layers = []
        for hidden_dim in config.hidden_dims:
            layers.extend([
                nn.Linear(input_dim, hidden_dim),
                nn.ReLU(),
                nn.Dropout(config.dropout)
            ])
            input_dim = hidden_dim
        layers.append(nn.Linear(input_dim, 1))
        self.head = nn.Sequential(*layers)

    def forward(self, input_ids, attention_mask, product_type):
        text_out = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        text_emb = (text_out.last_hidden_state * attention_mask.unsqueeze(-1)).sum(1)
        text_emb = text_emb / attention_mask.sum(-1, keepdim=True)

        type_emb = self.product_emb(product_type)

        combined = torch.cat([text_emb, type_emb], dim=-1)
        return self.head(combined).squeeze(-1)

    def _step(self, batch, stage):
        pred = self(batch['input_ids'], batch['attention_mask'], batch['product_type'])
        target = batch['target']

        pred_log = torch.log1p(torch.clamp(pred, min=0))
        target_log = torch.log1p(target)
        loss = nn.functional.mse_loss(pred_log, target_log)

        with torch.no_grad():
            mape = torch.mean(torch.abs((target - pred) / target)) * 100
            rmsle = torch.sqrt(loss)

        self.log(f'{stage}_loss', loss, prog_bar=True)
        self.log(f'{stage}_mape', mape, prog_bar=True)
        self.log(f'{stage}_rmsle', rmsle, prog_bar=True)
        return loss

    def training_step(self, batch, batch_idx):
        return self._step(batch, 'train')

    def validation_step(self, batch, batch_idx):
        return self._step(batch, 'val')

    def test_step(self, batch, batch_idx):
        return self._step(batch, 'test')

    def configure_optimizers(self):
        encoder_params = list(self.text_encoder.parameters())
        other_params = list(self.product_emb.parameters()) + list(self.head.parameters())

        optimizer = torch.optim.AdamW([
            {'params': encoder_params, 'lr': self.config.lr * 0.1},
            {'params': other_params, 'lr': self.config.lr}
        ], weight_decay=self.config.weight_decay)

        total_steps = self.trainer.estimated_stepping_batches
        warmup_steps = int(self.config.warmup_ratio * total_steps)

        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer,
            max_lr=[self.config.lr * 0.1, self.config.lr],
            total_steps=total_steps,
            pct_start=warmup_steps / total_steps
        )

        return {
            'optimizer': optimizer,
            'lr_scheduler': {'scheduler': scheduler, 'interval': 'step'}
        }

## Training

In [10]:
# Initialize data module
dm = ProductDataModule(config)
dm.setup()

print(f"Train samples: {len(dm.train_ds)}")
print(f"Val samples: {len(dm.val_ds)}")
print(f"Test samples: {len(dm.test_ds)}")
print(f"Product types: {dm.num_product_types}")

Train samples: 1738559
Val samples: 217320
Test samples: 217320
Product types: 12773


In [11]:
# Initialize model
model = ProductLengthModel(config, dm.num_product_types)
print(f"Text encoder dim: {model.text_dim}")
print(f"Combined dim: {model.text_dim + config.product_type_emb_dim}")

2026-01-22 08:39:25.047714: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1769071165.087741     253 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1769071165.100618     253 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1769071165.130299     253 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769071165.130323     253 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769071165.130329     253 computation_placer.cc:177] computation placer alr

Text encoder dim: 384
Combined dim: 448


In [12]:
# Setup logger and callbacks
wandb_logger = WandbLogger(
    project="amazon-product-length",
    name="text_encoder_v1",
    config=config.__dict__
)

callbacks = [
    ModelCheckpoint(
        dirpath="checkpoints",
        filename="best-{epoch}-{val_rmsle:.4f}",
        monitor="val_rmsle",
        mode="min",
        save_top_k=1
    ),
    EarlyStopping(monitor="val_rmsle", patience=3, mode="min"),
    LearningRateMonitor(logging_interval="step")
]

In [13]:
# Initialize trainer
trainer = pl.Trainer(
    max_epochs=config.epochs,
    accelerator="auto",
    devices=1,
    logger=wandb_logger,
    callbacks=callbacks,
    gradient_clip_val=1.0,
    accumulate_grad_batches=2,
    val_check_interval=0.5,
    log_every_n_steps=50
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores


In [14]:
# Train!
trainer.fit(model, dm)

# Test on held-out test set from train data
print("\n--- Testing on held-out test set ---")
trainer.test(model, dm)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.


Output()

`Trainer.fit` stopped: `max_epochs=2` reached.



--- Testing on held-out test set ---


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

[{'test_loss': 0.5071058869361877,
  'test_mape': 156.53073120117188,
  'test_rmsle': 0.6964890956878662}]

In [None]:
wandb.finish()

## Inference

In [None]:
def predict(model, config, dm):
    model.eval()
    model.cuda()
    
    test_df = pd.read_csv(config.test_path)
    tokenizer = AutoTokenizer.from_pretrained(config.text_encoder)
    test_ds = ProductDataset(test_df, tokenizer, config.max_length, dm.product_type_map, is_test=True)
    test_loader = DataLoader(test_ds, batch_size=config.batch_size * 2, shuffle=False, num_workers=2)
    
    predictions = []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Predicting"):
            pred = model(
                batch['input_ids'].cuda(),
                batch['attention_mask'].cuda(),
                batch['product_type'].cuda()
            )
            predictions.extend(pred.cpu().numpy().tolist())
    
    submission = pd.DataFrame({
        'PRODUCT_ID': test_df['PRODUCT_ID'],
        'PRODUCT_LENGTH': predictions
    })
    return submission

In [None]:
# Snapping function - round predictions to nearest training value
def snap_to_nearest(pred, values):
    idx = np.searchsorted(values, pred)
    if idx == 0: return values[0]
    if idx == len(values): return values[-1]
    before, after = values[idx-1], values[idx]
    return before if (pred - before) < (after - pred) else after

In [None]:
# Load best checkpoint and predict
best_model = ProductLengthModel.load_from_checkpoint(
    trainer.checkpoint_callback.best_model_path,
    config=config,
    num_product_types=dm.num_product_types,
    weights_only=False
)

# Get raw predictions
submission = predict(best_model, config, dm)

# Load train data to get unique lengths for snapping
train_df = pd.read_csv(config.train_path)
train_lengths = sorted(train_df['PRODUCT_LENGTH'].unique())

# Apply snapping
submission['PRODUCT_LENGTH'] = submission['PRODUCT_LENGTH'].apply(
    lambda x: snap_to_nearest(x, train_lengths)
)

submission.to_csv('submission.csv', index=False)
print(f"Saved submission.csv with {len(submission)} predictions")
submission.head()

In [None]:
# Copy submission to Drive
!cp submission.csv /content/drive/MyDrive/amazon-ml-challenge/