<a href="https://colab.research.google.com/github/anujac1206/AppDemo/blob/main/ebay_ner_final_(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# eBay NER Competition - Complete Pipeline
## BEFORE RUNNING: Runtime > Change runtime type > T4 GPU

### Your files:
- `Listing_Titles.tsv.gz` — 2M titles (records 1-2,000,000). Train=1-5000, Quiz=5001-30000
- `Tagged_Titles_Train.tsv.gz` — training labels (records 1-5000, one row per token)

### What this notebook does:
1. Loads and parses your exact file format
2. Trains gbert-large with 3-fold CV
3. Tunes the prediction-confidence threshold to maximise the F-beta score with β = 0.2 (precision-weighted)
4. Generates submission for Quiz records 5001-30000

## CELL 1 — Install & Check GPU

In [None]:
!pip uninstall -y torch torchvision torchaudio
!pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu118 -q

Found existing installation: torch 2.10.0+cpu
Uninstalling torch-2.10.0+cpu:
  Successfully uninstalled torch-2.10.0+cpu
Found existing installation: torchvision 0.25.0+cpu
Uninstalling torchvision-0.25.0+cpu:
  Successfully uninstalled torchvision-0.25.0+cpu
Found existing installation: torchaudio 2.10.0+cpu
Uninstalling torchaudio-2.10.0+cpu:
  Successfully uninstalled torchaudio-2.10.0+cpu
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m819.1/819.1 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m79.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.2/23.2 MB[0m [31m50.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m875.6/875.6 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[2

In [None]:
import torch

# Report the installed PyTorch build and whether a CUDA device is visible.
print(f"Torch: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.12/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.12/dist-package

Torch: 2.2.2+cu118
CUDA available: False


In [None]:
# Pick the compute device: the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
!pip install transformers datasets -q

import os, csv, gzip, torch, numpy as np, pandas as pd
from pathlib import Path

# Abort early when no CUDA device is visible; the message tells the user how to enable one in Colab.
if not torch.cuda.is_available():
    raise RuntimeError('NO GPU! Go to Runtime > Change runtime type > T4 GPU and re-run.')
# Report the name and total memory (bytes / 1e9, rounded to one decimal) of CUDA device 0.
print('GPU:', torch.cuda.get_device_name(0))
print('Memory:', round(torch.cuda.get_device_properties(0).total_memory/1e9, 1), 'GB')

RuntimeError: NO GPU! Go to Runtime > Change runtime type > T4 GPU and re-run.

## CELL 2 — Mount Drive & Set Paths

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# ================================================================
# EDIT THESE to match where you put the files in Google Drive
# ================================================================
BASE_DIR      = '/content/drive/MyDrive/ebay_ner'   # folder containing your files
LISTING_FILE  = f'{BASE_DIR}/Listing_Titles.tsv'
TRAIN_FILE    = f'{BASE_DIR}/Tagged_Titles_Train.tsv'
OUTPUT_DIR    = f'{BASE_DIR}/outputs'
# ================================================================

Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)


def _resolve_data_file(path):
    """Return `path`, or its '.gz' / non-'.gz' sibling, whichever exists; None if neither does."""
    sibling = path[:-len('.gz')] if path.endswith('.gz') else f'{path}.gz'
    for candidate in (path, sibling):
        if os.path.exists(candidate):
            return candidate
    return None


# Check files exist. The data may be stored compressed (.tsv.gz) or already
# decompressed (.tsv), so both spellings are tried and the path variables are
# updated to whichever one is actually present (the loaders accept either form).
resolved_files = []
for f in [LISTING_FILE, TRAIN_FILE]:
    found = _resolve_data_file(f)
    exists = found is not None
    if exists and found != f:
        print(f'Found under alternate name: {found}')
    print(f'{"OK" if exists else "MISSING"}: {f}')
    resolved_files.append(found if exists else f)

LISTING_FILE, TRAIN_FILE = resolved_files

Mounted at /content/drive
OK: /content/drive/MyDrive/ebay_ner/Listing_Titles.tsv
OK: /content/drive/MyDrive/ebay_ner/Tagged_Titles_Train.tsv


## CELL 3 — Load Training Data
Format: Record Number, Category Id, Title, Token, Tag (5 columns, one row per token)
Empty Tag = continuation of previous entity

In [None]:
def load_train_file(filepath):
    """
    Read the tagged training file (one row per token) into a DataFrame.

    Accepts either a gzip-compressed file (name ends in '.gz') or a plain
    tab-separated file. Empty cells are kept as empty strings rather than NaN,
    because an empty Tag marks a continuation token. Column names are
    normalised to stripped, lower-case, underscore-separated form.

    Prints the column names, the row count and the first 15 rows, then returns
    the DataFrame.
    """
    is_gzip = filepath.endswith('.gz')
    if is_gzip:
        handle = gzip.open(filepath, 'rt', encoding='utf-8')
    else:
        handle = open(filepath, 'r', encoding='utf-8')

    with handle as stream:
        frame = pd.read_csv(
            stream,
            sep='\t',
            header=0,                      # first row holds the column names
            quoting=csv.QUOTE_MINIMAL,     # CSV-style quoting inside the TSV
            keep_default_na=False,         # empty tag must stay '' (not NaN)
            na_values=None,                # no extra strings are treated as NaN
        )

    # e.g. 'Record Number' -> 'record_number'
    frame = frame.rename(columns=lambda name: name.strip().lower().replace(' ', '_'))

    print('Train columns:', frame.columns.tolist())
    print(f'Rows: {len(frame)}')
    print(frame.head(15).to_string())
    return frame

# Load the tagged training rows (one row per token) from TRAIN_FILE.
df_train_raw = load_train_file(TRAIN_FILE)

Train columns: ['record_number', 'category', 'title', 'token', 'tag']
Rows: 56812
    record_number  category                                                                             title             token                          tag
0               1         2  MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz 11311485400 Steuerkette FEBI 31803              MINI    Kompatible_Fahrzeug_Marke
1               1         2  MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz 11311485400 Steuerkette FEBI 31803               1.6  Kompatibles_Fahrzeug_Modell
2               1         2  MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz 11311485400 Steuerkette FEBI 31803           W10B16A             Herstellernummer
3               1         2  MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz 11311485400 Steuerkette FEBI 31803           W11B16A                             
4               1         2  MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz 11311485400 Steuerkette FEBI 31803               R50      

## CELL 4 — Fix Column Names (Edit if needed)
Look at the output above and confirm column names. Edit this cell if they differ.

In [None]:
# After seeing the printout above, set the correct column names here:
# Common variations the competition uses:
# 'record_number' or 'record number' or 'record_id'
# 'category_id' or 'category id'
# 'title', 'token', 'tag'

# Auto-detect columns (works for most cases)
cols = df_train_raw.columns.tolist()

# Columns are picked purely by POSITION (not by name); edit the indices below
# if the file's column order differs from record / category / title / token / tag.
COL_RECORD   = cols[0]   # first column  = record number
COL_CATEGORY = cols[1]   # second column = category id
COL_TITLE    = cols[2]   # third column  = title
COL_TOKEN    = cols[3]   # fourth column = token
COL_TAG      = cols[4]   # fifth column  = tag

print(f'Using columns:')
print(f'  Record:   {COL_RECORD}')
print(f'  Category: {COL_CATEGORY}')
print(f'  Title:    {COL_TITLE}')
print(f'  Token:    {COL_TOKEN}')
print(f'  Tag:      {COL_TAG}')

# Show all unique raw tag values. Note this list INCLUDES the empty string ''
# (continuation marker) and 'O', so the count is not the number of aspect names.
all_raw_tags = df_train_raw[COL_TAG].unique()
print(f'\nUnique tags ({len(all_raw_tags)}):', sorted(all_raw_tags))

Using columns:
  Record:   record_number
  Category: category
  Title:    title
  Token:    token
  Tag:      tag

Unique tags (31): ['', 'Anwendung', 'Anzahl_Der_Einheiten', 'Besonderheiten', 'Breite', 'Bremsscheiben-Aussendurchmesser', 'Bremsscheibenart', 'Einbauposition', 'Farbe', 'Größe', 'Hersteller', 'Herstellernummer', 'Herstellungsland_Und_-Region', 'Im_Lieferumfang_Enthalten', 'Kompatible_Fahrzeug_Marke', 'Kompatibles_Fahrzeug_Jahr', 'Kompatibles_Fahrzeug_Modell', 'Länge', 'Material', 'Maßeinheit', 'Menge', 'Modell', 'O', 'Oberflächenbeschaffenheit', 'Oe/Oem_Referenznummer(N)', 'Produktart', 'Produktlinie', 'SAE_Viskosität', 'Stärke', 'Technologie', 'Zähnezahl']


## CELL 5 — Parse Into Training Examples
This handles the empty-tag continuation logic correctly.

In [None]:
def parse_train_data(df):
    """
    Group the token-per-row training frame into one example per record.

    Returns a list of dicts with keys 'record_id', 'category_id', 'title',
    'tokens' and 'tags', in order of first appearance of each record.

    Raw tags are converted to BIO form:
      * a named tag opens a new span            -> 'B-<tag>'
      * an empty tag continues the open span     -> 'I-<tag>'
      * 'O', or an empty tag with no open span
        (or following an 'O')                    -> 'O'
    """

    def _to_bio(raw_tags):
        # `open_entity` is the most recent explicit tag seen (None before the first one).
        converted = []
        open_entity = None
        for raw in raw_tags:
            if raw == '':
                # Continuation: inherit the open entity, unless there is none or it is 'O'.
                inside_span = open_entity is not None and open_entity != 'O'
                converted.append(f'I-{open_entity}' if inside_span else 'O')
                continue
            open_entity = raw
            converted.append('O' if raw == 'O' else f'B-{raw}')
        return converted

    parsed = []
    for record_id, rows in df.groupby(COL_RECORD, sort=False):
        tokens = rows[COL_TOKEN].tolist()
        bio_tags = _to_bio(rows[COL_TAG].tolist())

        assert len(tokens) == len(bio_tags), \
            f'Length mismatch in record {record_id}: {len(tokens)} tokens vs {len(bio_tags)} tags'

        # Category and title repeat on every row of a record; take them from the first row.
        parsed.append({
            'record_id':   str(record_id),
            'category_id': str(rows[COL_CATEGORY].iloc[0]),
            'title':       rows[COL_TITLE].iloc[0],
            'tokens':      tokens,
            'tags':        bio_tags
        })

    return parsed


train_examples = parse_train_data(df_train_raw)

print(f'Parsed {len(train_examples)} training examples')
print('\n--- Example 1 (should match Annexure Example 1) ---')
ex = train_examples[0]
print('Title:', ex['title'])
for tok, tag in zip(ex['tokens'], ex['tags']):
    print(f'  {tok:30s} {tag}')

# Collect every BIO tag that occurs anywhere in the training examples
all_tags = set()
for ex in train_examples:
    all_tags |= set(ex['tags'])

# 'O' always gets id 0; the remaining tags follow in sorted order
sorted_labels = ['O', *sorted(all_tags - {'O'})]
label2id = {label: idx for idx, label in enumerate(sorted_labels)}
id2label  = dict(enumerate(sorted_labels))
NUM_LABELS = len(sorted_labels)

print(f'\nTotal labels: {NUM_LABELS}')
print('Labels:', sorted_labels)

Parsed 5000 training examples

--- Example 1 (should match Annexure Example 1) ---
Title: MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz 11311485400 Steuerkette FEBI 31803
  MINI                           B-Kompatible_Fahrzeug_Marke
  1.6                            B-Kompatibles_Fahrzeug_Modell
  W10B16A                        B-Herstellernummer
  W11B16A                        I-Herstellernummer
  R50                            I-Herstellernummer
  Steuerkettensatz               B-Produktart
  11311485400                    B-Herstellernummer
  Steuerkette                    B-Im_Lieferumfang_Enthalten
  FEBI                           B-Hersteller
  31803                          B-Herstellernummer

Total labels: 48
Labels: ['O', 'B-Anwendung', 'B-Anzahl_Der_Einheiten', 'B-Besonderheiten', 'B-Breite', 'B-Bremsscheiben-Aussendurchmesser', 'B-Bremsscheibenart', 'B-Einbauposition', 'B-Farbe', 'B-Größe', 'B-Hersteller', 'B-Herstellernummer', 'B-Herstellungsland_Und_-Region', 'B-Im_Lieferum

In [None]:
# Build allowed aspect names per category from training data

from collections import defaultdict

allowed_aspects_per_category = defaultdict(set)

# Every entity span produced by parse_train_data opens with a 'B-<aspect>' tag,
# so the B- tags alone enumerate the aspect names seen in each category.
# Reading them straight from the tags keeps this cell runnable top-to-bottom:
# it no longer depends on bio_to_spans, which is only defined in a later cell.
for ex in train_examples:
    cat = ex['category_id']
    for tag in ex['tags']:
        if tag.startswith('B-'):
            allowed_aspects_per_category[cat].add(tag[2:])

print("Allowed aspects per category:")
for cat, aspects in allowed_aspects_per_category.items():
    print(f"Category {cat}:")
    for a in sorted(aspects):
        print("  ", a)

Allowed aspects per category:
Category 2:
   Anwendung
   Anzahl_Der_Einheiten
   Besonderheiten
   Breite
   Einbauposition
   Größe
   Hersteller
   Herstellernummer
   Im_Lieferumfang_Enthalten
   Kompatible_Fahrzeug_Marke
   Kompatibles_Fahrzeug_Jahr
   Kompatibles_Fahrzeug_Modell
   Länge
   Maßeinheit
   Menge
   Modell
   Oe/Oem_Referenznummer(N)
   Produktart
   SAE_Viskosität
   Zähnezahl
Category 1:
   Anzahl_Der_Einheiten
   Besonderheiten
   Bremsscheiben-Aussendurchmesser
   Bremsscheibenart
   Einbauposition
   Farbe
   Größe
   Hersteller
   Herstellernummer
   Herstellungsland_Und_-Region
   Im_Lieferumfang_Enthalten
   Kompatible_Fahrzeug_Marke
   Kompatibles_Fahrzeug_Jahr
   Kompatibles_Fahrzeug_Modell
   Material
   Maßeinheit
   Modell
   Oberflächenbeschaffenheit
   Oe/Oem_Referenznummer(N)
   Produktart
   Produktlinie
   Stärke
   Technologie


In [None]:
from sklearn.model_selection import train_test_split

# Single shuffled 90/10 hold-out split of the training examples, with a fixed seed.
train_exs, val_exs = train_test_split(
    train_examples, test_size=0.1, shuffle=True, random_state=42
)

print(f"Train: {len(train_exs)}")
print(f"Val: {len(val_exs)}")

Train: 4500
Val: 500


## CELL 6 — Load Listing Titles (for Quiz/Test set)
Listing_Titles.tsv.gz: Record Number, Category Id, Title (3 columns)
Quiz = records 5001-30000, Test = disclosed later

In [None]:
def load_listing_file(filepath):
    """
    Read the listing-titles file into a DataFrame.

    Accepts either a gzip-compressed file (name ends in '.gz') or a plain
    tab-separated file. Empty cells stay empty strings (nothing is converted
    to NaN). Column names are normalised to stripped, lower-case,
    underscore-separated form.

    Prints the column names, the record count and the first 3 rows, then
    returns the DataFrame.
    """
    if filepath.endswith('.gz'):
        handle = gzip.open(filepath, 'rt', encoding='utf-8')
    else:
        handle = open(filepath, 'r', encoding='utf-8')

    with handle as stream:
        listing = pd.read_csv(
            stream,
            sep='\t',
            header=0,
            quoting=csv.QUOTE_MINIMAL,
            keep_default_na=False,
            na_values=None,
        )

    # e.g. 'Record Number' -> 'record_number'
    listing = listing.rename(columns=lambda name: name.strip().lower().replace(' ', '_'))

    print('Listing columns:', listing.columns.tolist())
    print(f'Total records: {len(listing)}')
    print(listing.head(3).to_string())
    return listing

df_listing = load_listing_file(LISTING_FILE)

# Column names are taken by position: record number, category id, title
listing_cols = df_listing.columns.tolist()
LIST_COL_RECORD, LIST_COL_CATEGORY, LIST_COL_TITLE = (
    listing_cols[0], listing_cols[1], listing_cols[2]
)

# Quiz set = record numbers 5001..30000 (both ends inclusive)
quiz_df = df_listing[df_listing[LIST_COL_RECORD].between(5001, 30000)].copy()

print(f'\nQuiz records: {len(quiz_df)}')

# One example per quiz record. Titles are split on whitespace only
# (competition rule) with no other processing.
quiz_examples = []
for record_number, category, raw_title in zip(
    quiz_df[LIST_COL_RECORD], quiz_df[LIST_COL_CATEGORY], quiz_df[LIST_COL_TITLE]
):
    title = str(raw_title)
    tokens = title.split()
    quiz_examples.append({
        'record_id':   str(int(record_number)),
        'category_id': str(int(category)),
        'title':       title,
        'tokens':      tokens
    })

print(f'Quiz examples ready: {len(quiz_examples)}')
print('Sample quiz example:', quiz_examples[0])

Listing columns: ['record_number', 'category', 'title']
Total records: 2000000
   record_number  category                                                                             title
0              1         2  MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz 11311485400 Steuerkette FEBI 31803
1              2         1           ATE Power Disc Bremsenset Mercedes SLC + SLK Vorne 295MM + Hinten 300MM
2              3         1        Textar Bremsscheiben + Bremsbeläge hinten für Fiat Freemont Lancia Voyager

Quiz records: 25000
Quiz examples ready: 25000
Sample quiz example: {'record_id': '5001', 'category_id': '1', 'title': 'OPEL ASTRA H 1.7 CDTI-SET 2 Bremsscheiben 4 Beläge VA', 'tokens': ['OPEL', 'ASTRA', 'H', '1.7', 'CDTI-SET', '2', 'Bremsscheiben', '4', 'Beläge', 'VA']}


## CELL 7 — Tokenizer & Subword Alignment

In [None]:
from transformers import AutoTokenizer

# ================================================================
# MODEL CHOICE (try in order if you get OOM errors):
# 1. 'deepset/gbert-large'      — best quality, needs ~13GB VRAM
# 2. 'deepset/gbert-base'       — good quality, safe on T4
# 3. 'bert-base-german-cased'   — fallback
# ================================================================
# Checkpoint identifier shared by the tokenizer (below) and by every model load.
MODEL_NAME = 'deepset/gbert-large'
# Passed as max_length when tokenizing: sequences are truncated and padded to this many subtokens.
MAX_LEN = 128  # titles are short, 128 is more than enough

# The tokenizer must come from the same checkpoint as the model so the vocabularies match.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(f'Loaded: {MODEL_NAME}')

def tokenize_and_align_labels(example):
    """
    Encode one example's word tokens and align its word-level tags to subtokens.

    The tokenizer may split a word into several subtokens. Only the FIRST
    subtoken of each word carries the word's label id; special/padding
    positions and the remaining subtokens get -100 so the loss ignores them.

    Returns a dict with 1-D tensors 'input_ids', 'attention_mask' and 'labels'.
    """
    enc = tokenizer(
        example['tokens'],
        is_split_into_words=True,   # input is already split into words
        truncation=True,
        max_length=MAX_LEN,
        padding='max_length',
        return_tensors='pt'
    )

    labels = []
    previous = None
    for wid in enc.word_ids(batch_index=0):
        # wid is None for [CLS], [SEP], [PAD]; a repeated wid is a later subtoken of the same word
        starts_word = wid is not None and wid != previous
        labels.append(label2id[example['tags'][wid]] if starts_word else -100)
        previous = wid

    input_ids = enc['input_ids'].squeeze()
    attention_mask = enc['attention_mask'].squeeze()
    return {
        'input_ids':      input_ids,
        'attention_mask': attention_mask,
        'labels':         torch.tensor(labels)
    }

# Verify alignment works on first example
sample = tokenize_and_align_labels(train_examples[0])
print('input_ids shape:', sample['input_ids'].shape)
print('labels shape:   ', sample['labels'].shape)

# Actually check the result instead of only printing it:
# 1) both tensors are flat and padded to MAX_LEN
assert sample['input_ids'].shape == sample['labels'].shape == (MAX_LEN,), \
    'input_ids / labels should both have shape (MAX_LEN,)'
# 2) exactly one labelled position (label != -100) per word of the title
n_labelled = int((sample['labels'] != -100).sum())
assert n_labelled == len(train_examples[0]['tokens']), \
    f"{n_labelled} labelled subtokens for {len(train_examples[0]['tokens'])} words"
print('Tokenizer alignment OK')

Loaded: deepset/gbert-large
input_ids shape: torch.Size([128])
labels shape:    torch.Size([128])
Tokenizer alignment OK


## CELL 8 — Dataset & Model Classes

In [None]:
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForTokenClassification

class NERDataset(Dataset):
    """In-memory dataset: every example is tokenized and label-aligned once, at construction."""

    def __init__(self, examples):
        print(f'Building dataset from {len(examples)} examples...')
        # Eagerly encode everything so __getitem__ is a plain list lookup.
        self.data = list(map(tokenize_and_align_labels, examples))
        print('Done.')

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

def load_fresh_model():
    """Load MODEL_NAME as a token-classification model with a NUM_LABELS-way head and the label maps attached."""
    head_config = dict(
        num_labels=NUM_LABELS,
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True,
    )
    return AutoModelForTokenClassification.from_pretrained(MODEL_NAME, **head_config)

# Test model loads
# Auto-detect the device instead of hard-coding CUDA, so this cell agrees with
# train_fold (which also auto-detects) and still runs on a CPU-only runtime.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
m = load_fresh_model()
params = sum(p.numel() for p in m.parameters()) / 1e6
print(f'Model OK: {MODEL_NAME} ({params:.0f}M params)')
# Drop the throw-away model; release cached GPU memory only when a GPU is in use.
del m
if torch.cuda.is_available():
    torch.cuda.empty_cache()

NameError: name 'MODEL_NAME' is not defined

In [None]:
# Reload saved fold models (no retraining)

N_SAVED_FOLDS = 3  # number of fold checkpoints expected in OUTPUT_DIR
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

fold_models = []
checkpoint_paths = [
    f'{OUTPUT_DIR}/fold{fold_num}_best.pt' for fold_num in range(1, N_SAVED_FOLDS + 1)
]
missing_checkpoints = [p for p in checkpoint_paths if not os.path.exists(p)]

if missing_checkpoints:
    # On a fresh run nothing has been trained yet. Skip the reload (rather than
    # crash) so the training cells below can create the checkpoints.
    print('Skipping reload - missing checkpoints:')
    for p in missing_checkpoints:
        print('  ', p)
else:
    for fold_num, path in enumerate(checkpoint_paths, 1):
        model = load_fresh_model().to(DEVICE)
        # weights_only=True restricts unpickling to tensors/containers, which is
        # all a state_dict saved via torch.save(model.state_dict(), ...) holds.
        model.load_state_dict(torch.load(path, map_location=DEVICE, weights_only=True))
        model.eval()
        fold_models.append(model)
        print(f'Loaded fold {fold_num}')

    print("All folds loaded.")

NameError: name 'load_fresh_model' is not defined

## CELL 9 — Training Loop (One Fold)

In [None]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
import torch
import gc

def train_fold(train_exs, val_exs, fold_num, num_epochs=4, batch_size=16):
    """
    Fine-tune a freshly loaded model on one train/validation split.

    Args:
        train_exs:  list of training examples (each is fed to NERDataset).
        val_exs:    list of validation examples.
        fold_num:   fold identifier; used in log lines and in the checkpoint
                    file name `{OUTPUT_DIR}/fold{fold_num}_best.pt`.
        num_epochs: number of passes over the training set.
        batch_size: batch size for both the train and validation loaders.

    Returns:
        The model with the weights of the epoch that had the lowest mean
        validation loss reloaded from the checkpoint, in eval mode.

    Raises:
        ValueError: if either example list is empty.

    Side effects: prints progress, and writes/overwrites the checkpoint file
    every time the validation loss improves.
    """

    print(f'\n========== FOLD {fold_num} ==========')
    print(f'Train: {len(train_exs)} | Val: {len(val_exs)}')

    if len(train_exs) == 0 or len(val_exs) == 0:
        raise ValueError("train_exs or val_exs is empty")

    # Local to this function: it does not read or modify any module-level DEVICE.
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", DEVICE)

    # Both datasets are tokenized eagerly inside NERDataset.__init__.
    train_dl = DataLoader(
        NERDataset(train_exs),
        batch_size=batch_size,
        shuffle=True,
        pin_memory=torch.cuda.is_available()
    )

    val_dl = DataLoader(
        NERDataset(val_exs),
        batch_size=batch_size,
        shuffle=False,
        pin_memory=torch.cuda.is_available()
    )

    print("Train batches:", len(train_dl))
    print("Val batches:", len(val_dl))

    model = load_fresh_model().to(DEVICE)

    # Smaller learning rate when the checkpoint name contains 'large'.
    lr = 1e-5 if 'large' in MODEL_NAME else 2e-5
    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=0.01)

    total_steps = len(train_dl) * num_epochs

    # Linear warm-up over the first 10% of steps, then linear decay.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps
    )

    # Mixed precision is enabled only when CUDA is available; with enabled=False
    # the scaler and autocast calls below are pass-throughs.
    # NOTE(review): the captured cell output echoes a warning on the
    # torch.cuda.amp.autocast line - presumably a deprecation notice in favour of
    # torch.amp; confirm against the installed PyTorch version.
    use_amp = torch.cuda.is_available()
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    save_path = f'{OUTPUT_DIR}/fold{fold_num}_best.pt'
    best_val_loss = float('inf')

    for epoch in range(num_epochs):

        model.train()
        total_loss = 0

        for step, batch in enumerate(train_dl):

            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            optimizer.zero_grad()

            with torch.cuda.amp.autocast(enabled=use_amp):
                loss = model(**batch).loss

            # Order matters: scale + backward, unscale so clipping sees the true
            # gradient norms, clip, then let the scaler perform the optimizer step.
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            total_loss += loss.item()

            # Log every 30th step.
            if step % 30 == 0:
                print(f'  Ep{epoch+1} step{step}/{len(train_dl)} loss={loss.item():.4f}')

        # --- Validate ---
        model.eval()
        val_loss = 0

        with torch.no_grad():
            for batch in val_dl:
                batch = {k: v.to(DEVICE) for k, v in batch.items()}

                with torch.cuda.amp.autocast(enabled=use_amp):
                    val_loss += model(**batch).loss.item()

        # Means over batches (not over tokens or examples).
        avg_train = total_loss / len(train_dl)
        avg_val   = val_loss   / len(val_dl)

        print(f'  >> Epoch {epoch+1}: train_loss={avg_train:.4f}  val_loss={avg_val:.4f}')

        # Checkpoint only when validation loss improves (model selection is by loss).
        if avg_val < best_val_loss:
            best_val_loss = avg_val
            torch.save(model.state_dict(), save_path)
            print(f'  ** Saved best (val_loss={best_val_loss:.4f})')

    # Restore the best epoch's weights. This expects at least one checkpoint to
    # have been written to save_path during the loop above.
    model.load_state_dict(torch.load(save_path, map_location=DEVICE))
    model.eval()

    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return model

In [None]:
# Single hold-out run on the 90/10 split. It is saved as fold 1, i.e. to the same
# `fold1_best.pt` checkpoint that fold 1 of the K-fold cell below will overwrite.
model = train_fold(train_exs, val_exs, fold_num=1)


Train: 4500 | Val: 500
Using device: cuda
Building dataset from 4500 examples...
Done.
Building dataset from 500 examples...
Done.
Train batches: 282
Val batches: 32


Loading weights:   0%|          | 0/389 [00:00<?, ?it/s]

BertForTokenClassification LOAD REPORT from: deepset/gbert-large
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
bert.pooler.dense.bias                     | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
bert.pooler.dense.weight                   | UNEXPECTED | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized 

  Ep1 step0/282 loss=4.2598
  Ep1 step30/282 loss=3.4358
  Ep1 step60/282 loss=1.5059
  Ep1 step90/282 loss=0.8992
  Ep1 step120/282 loss=0.5938
  Ep1 step150/282 loss=0.4702
  Ep1 step180/282 loss=0.5627
  Ep1 step210/282 loss=0.4098
  Ep1 step240/282 loss=0.3302
  Ep1 step270/282 loss=0.2411


  with torch.cuda.amp.autocast(enabled=use_amp):


  >> Epoch 1: train_loss=1.1270  val_loss=0.3700
  ** Saved best (val_loss=0.3700)
  Ep2 step0/282 loss=0.2821
  Ep2 step30/282 loss=0.4403
  Ep2 step60/282 loss=0.3597
  Ep2 step90/282 loss=0.3354
  Ep2 step120/282 loss=0.4480
  Ep2 step150/282 loss=0.2859
  Ep2 step180/282 loss=0.5128
  Ep2 step210/282 loss=0.4500
  Ep2 step240/282 loss=0.2860
  Ep2 step270/282 loss=0.3855
  >> Epoch 2: train_loss=0.3240  val_loss=0.3119
  ** Saved best (val_loss=0.3119)
  Ep3 step0/282 loss=0.2921
  Ep3 step30/282 loss=0.1900
  Ep3 step60/282 loss=0.2446
  Ep3 step90/282 loss=0.1511
  Ep3 step120/282 loss=0.2205
  Ep3 step150/282 loss=0.3476
  Ep3 step180/282 loss=0.3124
  Ep3 step210/282 loss=0.1990
  Ep3 step240/282 loss=0.2939
  Ep3 step270/282 loss=0.1754
  >> Epoch 3: train_loss=0.2627  val_loss=0.3006
  ** Saved best (val_loss=0.3006)
  Ep4 step0/282 loss=0.2579
  Ep4 step30/282 loss=0.2285
  Ep4 step60/282 loss=0.1964
  Ep4 step90/282 loss=0.1063
  Ep4 step120/282 loss=0.3900
  Ep4 step150/28

## CELL 10 — Run K-Fold Training
Adjust N_FOLDS and NUM_EPOCHS based on time available.
3 folds × 3 epochs ≈ 1.5–2 hours on T4

In [None]:
from sklearn.model_selection import KFold

# ================================================================
# TIME vs QUALITY:
# N_FOLDS=3, NUM_EPOCHS=4  → best, ~2-3 hours
# N_FOLDS=3, NUM_EPOCHS=3  → good, ~1.5 hours
# NOTE: N_FOLDS=1 does NOT work here - KFold requires n_splits >= 2.
#       For a fast single model, use the hold-out train_fold cell above instead.
# BATCH_SIZE: reduce to 8 if CUDA OOM
# ================================================================
N_FOLDS    = 3
NUM_EPOCHS = 4
BATCH_SIZE = 16

# Shuffled K-fold with a fixed seed, so fold membership is reproducible.
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
fold_models = []       # one trained model per fold, in fold order
fold_val_indices = []  # save for threshold tuning

# Folds are numbered from 1; train_fold writes one checkpoint per fold number.
for fold_num, (tr_idx, va_idx) in enumerate(kf.split(train_examples), 1):
    fold_val_indices.append(va_idx)
    tr_exs = [train_examples[i] for i in tr_idx]
    va_exs = [train_examples[i] for i in va_idx]

    model = train_fold(tr_exs, va_exs,
                       fold_num=fold_num,
                       num_epochs=NUM_EPOCHS,
                       batch_size=BATCH_SIZE)
    fold_models.append(model)
    # `gc` comes from the import in the training-loop cell.
    gc.collect()
    torch.cuda.empty_cache()

print(f'\nAll {N_FOLDS} folds complete!')


Train: 3333 | Val: 1667
Using device: cuda
Building dataset from 3333 examples...
Done.
Building dataset from 1667 examples...
Done.
Train batches: 209
Val batches: 105


Loading weights:   0%|          | 0/389 [00:00<?, ?it/s]

BertForTokenClassification LOAD REPORT from: deepset/gbert-large
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
bert.pooler.dense.bias                     | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
bert.pooler.dense.weight                   | UNEXPECTED | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized 

  Ep1 step0/209 loss=4.0818
  Ep1 step30/209 loss=3.2397
  Ep1 step60/209 loss=1.0804
  Ep1 step90/209 loss=0.8424
  Ep1 step120/209 loss=0.6207
  Ep1 step150/209 loss=0.5250
  Ep1 step180/209 loss=0.3865


  with torch.cuda.amp.autocast(enabled=use_amp):


  >> Epoch 1: train_loss=1.2360  val_loss=0.3919
  ** Saved best (val_loss=0.3919)
  Ep2 step0/209 loss=0.3395
  Ep2 step30/209 loss=0.5906
  Ep2 step60/209 loss=0.3077
  Ep2 step90/209 loss=0.4280
  Ep2 step120/209 loss=0.5283
  Ep2 step150/209 loss=0.3293
  Ep2 step180/209 loss=0.2378
  >> Epoch 2: train_loss=0.3360  val_loss=0.3397
  ** Saved best (val_loss=0.3397)
  Ep3 step0/209 loss=0.2966
  Ep3 step30/209 loss=0.2120
  Ep3 step60/209 loss=0.4318
  Ep3 step90/209 loss=0.1770
  Ep3 step120/209 loss=0.3104
  Ep3 step150/209 loss=0.2918
  Ep3 step180/209 loss=0.2600
  >> Epoch 3: train_loss=0.2870  val_loss=0.3256
  ** Saved best (val_loss=0.3256)
  Ep4 step0/209 loss=0.3048
  Ep4 step30/209 loss=0.3011
  Ep4 step60/209 loss=0.2571
  Ep4 step90/209 loss=0.2097
  Ep4 step120/209 loss=0.2200
  Ep4 step150/209 loss=0.2123
  Ep4 step180/209 loss=0.2197
  >> Epoch 4: train_loss=0.2452  val_loss=0.3201
  ** Saved best (val_loss=0.3201)

Train: 3333 | Val: 1667
Using device: cuda
Building 

Loading weights:   0%|          | 0/389 [00:00<?, ?it/s]

BertForTokenClassification LOAD REPORT from: deepset/gbert-large
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
bert.pooler.dense.bias                     | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
bert.pooler.dense.weight                   | UNEXPECTED | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized 

  Ep1 step0/209 loss=3.8932
  Ep1 step30/209 loss=3.1379
  Ep1 step60/209 loss=1.4645
  Ep1 step90/209 loss=1.0188
  Ep1 step120/209 loss=0.4713
  Ep1 step150/209 loss=0.3290
  Ep1 step180/209 loss=0.3047
  >> Epoch 1: train_loss=1.2814  val_loss=0.3744
  ** Saved best (val_loss=0.3744)
  Ep2 step0/209 loss=0.4708
  Ep2 step30/209 loss=0.3702
  Ep2 step60/209 loss=0.3086
  Ep2 step90/209 loss=0.3121
  Ep2 step120/209 loss=0.2706
  Ep2 step150/209 loss=0.3121
  Ep2 step180/209 loss=0.3234
  >> Epoch 2: train_loss=0.3411  val_loss=0.2991
  ** Saved best (val_loss=0.2991)
  Ep3 step0/209 loss=0.3489
  Ep3 step30/209 loss=0.2166
  Ep3 step60/209 loss=0.1942
  Ep3 step90/209 loss=0.4346
  Ep3 step120/209 loss=0.3529
  Ep3 step150/209 loss=0.3030
  Ep3 step180/209 loss=0.2432
  >> Epoch 3: train_loss=0.2782  val_loss=0.2834
  ** Saved best (val_loss=0.2834)
  Ep4 step0/209 loss=0.2942
  Ep4 step30/209 loss=0.3282
  Ep4 step60/209 loss=0.2628
  Ep4 step90/209 loss=0.1960
  Ep4 step120/209 los

Loading weights:   0%|          | 0/389 [00:00<?, ?it/s]

BertForTokenClassification LOAD REPORT from: deepset/gbert-large
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
bert.pooler.dense.bias                     | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
bert.pooler.dense.weight                   | UNEXPECTED | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized 

  Ep1 step0/209 loss=3.8803
  Ep1 step30/209 loss=3.0854
  Ep1 step60/209 loss=1.2511
  Ep1 step90/209 loss=0.8752
  Ep1 step120/209 loss=0.6724
  Ep1 step150/209 loss=0.5506
  Ep1 step180/209 loss=0.5680
  >> Epoch 1: train_loss=1.2651  val_loss=0.3895
  ** Saved best (val_loss=0.3895)
  Ep2 step0/209 loss=0.4154
  Ep2 step30/209 loss=0.2989
  Ep2 step60/209 loss=0.2804
  Ep2 step90/209 loss=0.2589
  Ep2 step120/209 loss=0.3835
  Ep2 step150/209 loss=0.2827
  Ep2 step180/209 loss=0.4234
  >> Epoch 2: train_loss=0.3673  val_loss=0.4165
  Ep3 step0/209 loss=0.5709
  Ep3 step30/209 loss=0.4306
  Ep3 step60/209 loss=0.4465
  Ep3 step90/209 loss=0.3658
  Ep3 step120/209 loss=0.3278
  Ep3 step150/209 loss=0.3852
  Ep3 step180/209 loss=0.3470
  >> Epoch 3: train_loss=0.3352  val_loss=0.3618
  ** Saved best (val_loss=0.3618)
  Ep4 step0/209 loss=0.2793
  Ep4 step30/209 loss=0.1991
  Ep4 step60/209 loss=0.4304
  Ep4 step90/209 loss=0.3635
  Ep4 step120/209 loss=0.3820
  Ep4 step150/209 loss=0.

## CELL 11 — Inference Helper Functions
These handle: ensemble logit averaging, threshold, subword→word mapping, span extraction

In [None]:
def get_word_level_predictions(models, tokens, threshold):
    """
    Run ensemble inference on one pre-tokenised title and return word-level BIO tags.

    Args:
        models: iterable of token-classification models; each is called with
            ``input_ids``/``attention_mask`` and must expose ``.logits``.
        tokens: list of word strings for a single title.
        threshold: minimum softmax probability a non-'O' label needs to be kept.

    Returns:
        A list with one tag per entry of ``tokens``. A word is tagged 'O' when
        its top label is 'O', when the confidence is below ``threshold``, or
        when it received no subtoken at all (e.g. cut off by truncation).
    """
    enc = tokenizer(
        tokens,
        is_split_into_words=True,
        truncation=True,
        max_length=MAX_LEN,
        padding='max_length',
        return_tensors='pt'
    )
    word_ids = enc.word_ids(batch_index=0)

    input_ids = enc['input_ids'].to(DEVICE)
    attn_mask = enc['attention_mask'].to(DEVICE)

    # Ensemble: average logits across all fold models
    all_logits = []
    with torch.no_grad():
        for m in models:
            # torch.amp.autocast('cuda') is the non-deprecated spelling of
            # torch.cuda.amp.autocast() and matches the other inference helpers.
            with torch.amp.autocast('cuda'):
                logits = m(input_ids=input_ids, attention_mask=attn_mask).logits
            # Back to float32 on the CPU before averaging
            all_logits.append(logits.cpu().float())
    avg_logits = torch.stack(all_logits).mean(0).squeeze(0)  # (seq_len, num_labels)

    probs = torch.softmax(avg_logits, dim=-1)
    max_probs, pred_ids = probs.max(dim=-1)

    # Map subtokens → words (first subtoken only)
    word_preds = {}  # word_idx → predicted label
    for pos, wid in enumerate(word_ids):
        # Skip special/padding positions and every subtoken after the first
        if wid is None or wid in word_preds:
            continue
        label = id2label[pred_ids[pos].item()]
        conf = max_probs[pos].item()
        word_preds[wid] = label if (label != 'O' and conf >= threshold) else 'O'

    return [word_preds.get(i, 'O') for i in range(len(tokens))]


def bio_to_spans(tokens, bio_tags):
    """
    Collapse a BIO tag sequence into a list of (entity_type, aspect_value) pairs.

    A span opens at every ``B-<type>`` tag and absorbs the directly following
    ``I-<type>`` tags of the same type. The aspect value is the covered tokens
    joined by a single ASCII space. Tags that are not part of such a span
    (plain 'O', orphan ``I-`` tags) produce no output.
    """
    spans = []
    n_tags = len(bio_tags)
    pos = 0
    while pos < n_tags:
        current = bio_tags[pos]
        if not current.startswith('B-'):
            pos += 1
            continue

        etype = current[2:]
        continuation = f'I-{etype}'
        # Advance `end` past every consecutive continuation tag of this type
        end = pos + 1
        while end < n_tags and bio_tags[end] == continuation:
            end += 1

        # Single-space join is a competition formatting requirement
        spans.append((etype, ' '.join([tokens[k] for k in range(pos, end)])))
        pos = end
    return spans


# Visible confirmation that both helper functions above have been defined.
print('Inference helpers ready.')

Inference helpers ready.


In [None]:
# All-in-one cell: every remaining step combined into a single cell
# ================================================================
# COMPLETE FINAL SUBMISSION CELL
# Run this after Cell 11 (inference helpers)
# Does: per-entity threshold tuning + category filtering + submission
# ================================================================

import csv as csv_mod
import datetime
import numpy as np
from collections import Counter

# ----------------------------------------------------------------
# STEP 1: CATEGORY-ASPECT RULES (from Annexure)
# ----------------------------------------------------------------
# Aspect names that may be submitted for each category ID (keys are strings).
# NOTE(review): 'Breite' was previously listed for category '1' here, but the
# later cell marked "CORRECTED / definitive from Annexure" omits it. It is
# removed here so both cells agree — confirm against the competition Annexure.
VALID_ASPECTS = {
    '1': {  # Car Brake Component Kits
        'Anzahl_Der_Einheiten', 'Besonderheiten',
        'Bremsscheiben-Aussendurchmesser', 'Bremsscheibenart',
        'Einbauposition', 'Farbe', 'Größe', 'Hersteller',
        'Herstellernummer', 'Herstellungsland_Und_-Region',
        'Im_Lieferumfang_Enthalten', 'Kompatible_Fahrzeug_Marke',
        'Kompatibles_Fahrzeug_Jahr', 'Kompatibles_Fahrzeug_Modell',
        'Material', 'Maßeinheit', 'Modell', 'O',
        'Oberflächenbeschaffenheit', 'Oe/Oem_Referenznummer(N)',
        'Produktart', 'Produktlinie', 'Stärke', 'Technologie',
    },
    '2': {  # Car Engine Timing Kits
        'Anwendung', 'Anzahl_Der_Einheiten', 'Besonderheiten',
        'Breite', 'Einbauposition', 'Größe', 'Hersteller',
        'Herstellernummer', 'Im_Lieferumfang_Enthalten',
        'Kompatible_Fahrzeug_Marke', 'Kompatibles_Fahrzeug_Jahr',
        'Kompatibles_Fahrzeug_Modell', 'Länge', 'Maßeinheit',
        'Menge', 'Modell', 'O', 'Oe/Oem_Referenznummer(N)',
        'Produktart', 'SAE_Viskosität', 'Zähnezahl',
    }
}

# ----------------------------------------------------------------
# STEP 2: UPDATED INFERENCE WITH FIXED AUTOCAST WARNING
# ----------------------------------------------------------------
def get_word_level_predictions_per_entity(models, tokens, entity_thresholds, default_threshold=0.5):
    """
    Ensemble inference for one title with a separate confidence cut-off per entity type.

    Args:
        models: iterable of token-classification models; their logits are averaged.
        tokens: list of word strings for a single title.
        entity_thresholds: dict mapping an entity type (label without its
            'B-'/'I-' prefix) to the minimum softmax probability required.
        default_threshold: cut-off for entity types missing from
            ``entity_thresholds``.

    Returns:
        One tag per entry of ``tokens``. A word takes the prediction of its
        first subtoken; predictions below the cut-off and words that received
        no subtoken (e.g. cut off by truncation) become 'O'.
    """
    enc = tokenizer(
        tokens,
        is_split_into_words=True,
        truncation=True,
        max_length=MAX_LEN,
        padding='max_length',
        return_tensors='pt'
    )
    word_ids = enc.word_ids(batch_index=0)
    input_ids = enc['input_ids'].to(DEVICE)
    attn_mask = enc['attention_mask'].to(DEVICE)

    all_logits = []
    for m in models:
        with torch.no_grad():
            with torch.amp.autocast('cuda'):  # non-deprecated autocast API
                logits = m(input_ids=input_ids, attention_mask=attn_mask).logits
        all_logits.append(logits.cpu().float())
    # Average over models, then drop the batch dimension of size 1
    avg_logits = torch.stack(all_logits).mean(0).squeeze(0)

    probs = torch.softmax(avg_logits, dim=-1)
    max_probs, pred_ids = probs.max(dim=-1)

    word_preds = {}
    for pos, wid in enumerate(word_ids):
        # Skip special/padding positions and every subtoken after the first
        if wid is None or wid in word_preds:
            continue
        label = id2label[pred_ids[pos].item()]
        conf = max_probs[pos].item()
        if label == 'O':
            word_preds[wid] = 'O'
            continue
        # Strip only the leading BIO prefix. str.replace would also delete
        # 'B-'/'I-' occurring inside an entity name and break the lookup.
        etype = label[2:] if label.startswith(('B-', 'I-')) else label
        threshold = entity_thresholds.get(etype, default_threshold)
        word_preds[wid] = label if conf >= threshold else 'O'

    return [word_preds.get(i, 'O') for i in range(len(tokens))]


# ----------------------------------------------------------------
# STEP 3: F-BETA SCORER
# ----------------------------------------------------------------
def compute_fbeta_score(pred_spans_list, gold_spans_list, beta=0.2):
    """
    Micro-averaged span-level precision, recall and F-beta.

    Each element of the two lists holds the spans of one example. Spans are
    compared as multisets, so a repeated span only matches as often as it
    occurs on the other side. A 1e-9 term in every denominator prevents
    division by zero.

    Returns:
        Tuple ``(precision, recall, fbeta)``.
    """
    hits = n_pred = n_gold = 0
    for preds, golds in zip(pred_spans_list, gold_spans_list):
        pred_counts, gold_counts = Counter(preds), Counter(golds)
        # Multiset intersection keeps min(pred count, gold count) per span
        hits += sum((pred_counts & gold_counts).values())
        n_pred += sum(pred_counts.values())
        n_gold += sum(gold_counts.values())

    # n_pred equals TP + FP, n_gold equals TP + FN
    precision = hits / (n_pred + 1e-9)
    recall = hits / (n_gold + 1e-9)
    fbeta = (1 + beta**2) * precision * recall / (beta**2 * precision + recall + 1e-9)
    return precision, recall, fbeta


# ----------------------------------------------------------------
# STEP 4: GLOBAL THRESHOLD TUNING
# ----------------------------------------------------------------
print('='*60)
print('STEP 4: Global threshold tuning...')
print('='*60)

# Tune on the validation split of the last fold.
# NOTE(review): val_exs is not defined in this cell; presumably it is the
# variable left behind by the final training-loop iteration — confirm it is
# still in scope after a kernel restart.
tune_examples = val_exs
print(f'Tuning on {len(tune_examples)} validation examples...')

# Gold spans per example (the 'O' pseudo-entity is excluded from scoring here)
gold_spans_all = []
for ex in tune_examples:
    spans = bio_to_spans(ex['tokens'], ex['tags'])
    gold_spans_all.append([(t, v) for t, v in spans if t != 'O'])

print(f'\n{"Threshold":>10} | {"Precision":>10} | {"Recall":>8} | {"F-beta(0.2)":>12}')
print('-' * 50)

best_threshold = 0.5
best_fbeta     = 0.0

# The entity list does not depend on the candidate threshold → build it once
entity_names = [label.replace('B-', '') for label in label2id if label.startswith('B-')]

for threshold in np.arange(0.40, 0.97, 0.03):
    # Same cut-off for every entity type during the global sweep
    simple_thresholds = dict.fromkeys(entity_names, threshold)
    pred_spans_all = []
    for ex in tune_examples:
        bio_preds = get_word_level_predictions_per_entity(
            fold_models, ex['tokens'], simple_thresholds, threshold)
        spans = bio_to_spans(ex['tokens'], bio_preds)
        pred_spans_all.append([(t, v) for t, v in spans if t != 'O'])

    p, r, f = compute_fbeta_score(pred_spans_all, gold_spans_all)
    # Strict '>' keeps the lowest threshold among ties
    marker = ' <-- BEST' if f > best_fbeta else ''
    print(f'{threshold:>10.2f} | {p:>10.4f} | {r:>8.4f} | {f:>12.4f}{marker}')

    if f > best_fbeta:
        best_fbeta     = f
        best_threshold = threshold

print(f'\nBest global threshold: {best_threshold:.2f}  (F-beta={best_fbeta:.4f})')


# ----------------------------------------------------------------
# STEP 5: PER-ENTITY THRESHOLD TUNING
# ----------------------------------------------------------------
print('\n' + '='*60)
print('STEP 5: Per-entity threshold tuning...')
print('='*60)

# Entities with lots of FPs from your error analysis → push threshold UP
HIGH_FP_ENTITIES = [
    'Kompatibles_Fahrzeug_Modell',
    'Einbauposition',
    'Produktart',
    'Im_Lieferumfang_Enthalten'
]

# Start every entity at global best
tuned_thresholds = {
    label.replace('B-', ''): best_threshold
    for label in label2id if label.startswith('B-')
}

# Gold spans depend neither on the entity being tuned nor on the candidate
# threshold, so they are extracted once instead of once per threshold.
gold_spans_by_example = [bio_to_spans(ex['tokens'], ex['tags']) for ex in tune_examples]

for entity in HIGH_FP_ENTITIES:
    if f'B-{entity}' not in label2id:
        print(f'  Skipping {entity} (not in label set)')
        continue

    # Only spans of the entity under tuning are scored
    gold_spans_local = [
        [(t2, v) for t2, v in gold_spans if t2 == entity]
        for gold_spans in gold_spans_by_example
    ]

    best_e_t     = best_threshold
    best_e_fbeta = 0.0

    # Search upwards only: the goal is to trade recall for precision
    for t in np.arange(best_threshold, 0.97, 0.02):
        test_thresholds = dict(tuned_thresholds)
        test_thresholds[entity] = t

        pred_spans_all = []
        for ex in tune_examples:
            bio_preds = get_word_level_predictions_per_entity(
                fold_models, ex['tokens'], test_thresholds)
            spans = bio_to_spans(ex['tokens'], bio_preds)
            pred_spans_all.append([(t2, v) for t2, v in spans if t2 == entity])

        p, r, f = compute_fbeta_score(pred_spans_all, gold_spans_local)
        if f > best_e_fbeta:
            best_e_fbeta = f
            best_e_t     = t

    # Later entities are tuned with this entity's new threshold already applied
    tuned_thresholds[entity] = best_e_t
    print(f'  {entity}: {best_threshold:.2f} → {best_e_t:.2f}  (fbeta={best_e_fbeta:.4f})')

print('\nAll tuned thresholds:')
for entity, t in sorted(tuned_thresholds.items()):
    changed = ' *' if abs(t - best_threshold) > 0.01 else ''
    print(f'  {entity}: {t:.2f}{changed}')


# ----------------------------------------------------------------
# STEP 6: GENERATE SUBMISSION WITH CATEGORY FILTERING
# ----------------------------------------------------------------
print('\n' + '='*60)
print('STEP 6: Generating submission...')
print('='*60)

# Fixed column order. Passing it to the DataFrame constructor also guarantees
# the columns exist when no prediction survives filtering (rows == []), so the
# summary and validation code cannot fail with a KeyError.
submission_columns = ['record_number', 'category_id', 'aspect_name', 'aspect_value']

rows    = []
skipped = 0

for i, ex in enumerate(quiz_examples):
    if i % 3000 == 0:
        print(f'  Processing {i}/{len(quiz_examples)}...')

    # VALID_ASPECTS is keyed by string category IDs
    category           = str(ex['category_id'])
    valid_for_category = VALID_ASPECTS.get(category, set())

    bio_preds = get_word_level_predictions_per_entity(
        fold_models, ex['tokens'], tuned_thresholds)
    spans = bio_to_spans(ex['tokens'], bio_preds)

    for entity_type, aspect_value in spans:
        # 'O' spans are never submitted
        if entity_type == 'O':
            continue
        # Drop aspects that are not allowed for this listing's category
        if entity_type not in valid_for_category:
            skipped += 1
            continue
        rows.append({
            'record_number': ex['record_id'],
            'category_id':   ex['category_id'],
            'aspect_name':   entity_type,
            'aspect_value':  aspect_value
        })

df_sub = pd.DataFrame(rows, columns=submission_columns)
print(f'\nSkipped {skipped} predictions (wrong category)')
print(f'Total submission rows: {len(df_sub)}')
print(f'Records with predictions: {df_sub["record_number"].nunique()} / {len(quiz_examples)}')
print('\nAspect distribution:')
print(df_sub['aspect_name'].value_counts().to_string())


# ----------------------------------------------------------------
# STEP 7: VALIDATE & SAVE
# ----------------------------------------------------------------
print('\n' + '='*60, 'STEP 7: Validating & saving...', '='*60, sep='\n')

# Sanity checks on the assembled submission frame
total_nulls = df_sub.isnull().sum().sum()
assert total_nulls == 0, 'NULL values found!'
has_tab = df_sub['aspect_value'].str.contains('\t').any()
assert not has_tab, 'TAB in aspect values!'

# Every (category, aspect) pair must be allowed by VALID_ASPECTS
for raw_cat, aspect in zip(df_sub['category_id'], df_sub['aspect_name']):
    cat = str(raw_cat)
    allowed = VALID_ASPECTS.get(cat, set())
    assert aspect in allowed, \
        f'Invalid aspect {aspect} for category {cat}!'

print('All validation checks passed!')

# Minute-resolution timestamp keeps successive submissions apart
ts = datetime.datetime.now().strftime('%Y%m%d_%H%M')
sub_path = f'{OUTPUT_DIR}/submission_final_{ts}.tsv'

# Tab-separated, no index column, no quoting
df_sub.to_csv(sub_path, sep='\t', index=False,
              quoting=csv_mod.QUOTE_NONE, encoding='utf-8')
print(f'Saved to: {sub_path}')

# Download to your computer (Colab only)
from google.colab import files
files.download(sub_path)
print('\nDone! Submit the downloaded .tsv file to EvalAI.')

STEP 4: Global threshold tuning...
Tuning on 500 validation examples...
Tuning on 500 validation examples

 Threshold |  Precision |   Recall |  F-beta(0.2)
--------------------------------------------------
      0.40 |     0.8964 |   0.8820 |       0.8958 <-- BEST
      0.43 |     0.8968 |   0.8815 |       0.8962 <-- BEST
      0.46 |     0.8986 |   0.8802 |       0.8979 <-- BEST
      0.49 |     0.8990 |   0.8794 |       0.8983 <-- BEST
      0.52 |     0.9009 |   0.8762 |       0.9000 <-- BEST
      0.55 |     0.9048 |   0.8724 |       0.9035 <-- BEST
      0.58 |     0.9072 |   0.8690 |       0.9057 <-- BEST
      0.61 |     0.9089 |   0.8652 |       0.9071 <-- BEST
      0.64 |     0.9144 |   0.8612 |       0.9123 <-- BEST
      0.67 |     0.9182 |   0.8564 |       0.9156 <-- BEST
      0.70 |     0.9194 |   0.8521 |       0.9166 <-- BEST
      0.73 |     0.9197 |   0.8468 |       0.9167 <-- BEST
      0.76 |     0.9204 |   0.8420 |       0.9171 <-- BEST
      0.79 |     0.9204 |

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Done! Submit the downloaded .tsv file to EvalAI.


In [None]:
# ================================================================
# COMPLETE FINAL SUBMISSION CELL - CORRECTED
# ================================================================

import csv as csv_mod
import datetime
import numpy as np
from collections import Counter

# ----------------------------------------------------------------
# VALID ASPECTS PER CATEGORY (definitive from Annexure)
# ----------------------------------------------------------------
# Allowed aspect names per category ID (string keys), one name per line.
VALID_ASPECTS = {
    '1': {
        'Anzahl_Der_Einheiten',
        'Besonderheiten',
        'Bremsscheiben-Aussendurchmesser',
        'Bremsscheibenart',
        'Einbauposition',
        'Farbe',
        'Größe',
        'Hersteller',
        'Herstellernummer',
        'Herstellungsland_Und_-Region',
        'Im_Lieferumfang_Enthalten',
        'Kompatible_Fahrzeug_Marke',
        'Kompatibles_Fahrzeug_Jahr',
        'Kompatibles_Fahrzeug_Modell',
        'Material',
        'Maßeinheit',
        'Modell',
        'O',
        'Oberflächenbeschaffenheit',
        'Oe/Oem_Referenznummer(N)',
        'Produktart',
        'Produktlinie',
        'Stärke',
        'Technologie',
    },
    '2': {
        'Anwendung',
        'Anzahl_Der_Einheiten',
        'Besonderheiten',
        'Breite',
        'Einbauposition',
        'Größe',
        'Hersteller',
        'Herstellernummer',
        'Im_Lieferumfang_Enthalten',
        'Kompatible_Fahrzeug_Marke',
        'Kompatibles_Fahrzeug_Jahr',
        'Kompatibles_Fahrzeug_Modell',
        'Länge',
        'Maßeinheit',
        'Menge',
        'Modell',
        'O',
        'Oe/Oem_Referenznummer(N)',
        'Produktart',
        'SAE_Viskosität',
        'Zähnezahl',
    },
}

# ----------------------------------------------------------------
# INFERENCE FUNCTION (fixed autocast warning)
# ----------------------------------------------------------------
def get_word_level_predictions_per_entity(models, tokens, entity_thresholds,
                                           default_threshold=0.5):
    """
    Ensemble inference for one title with a separate confidence cut-off per entity type.

    Args:
        models: iterable of token-classification models; their logits are averaged.
        tokens: list of word strings for a single title.
        entity_thresholds: dict mapping an entity type (label without its
            'B-'/'I-' prefix) to the minimum softmax probability required.
        default_threshold: cut-off for entity types missing from
            ``entity_thresholds``.

    Returns:
        One tag per entry of ``tokens``. A word takes the prediction of its
        first subtoken; predictions below the cut-off and words that received
        no subtoken (e.g. cut off by truncation) become 'O'.
    """
    enc = tokenizer(
        tokens,
        is_split_into_words=True,
        truncation=True,
        max_length=MAX_LEN,
        padding='max_length',
        return_tensors='pt'
    )
    word_ids  = enc.word_ids(batch_index=0)
    input_ids = enc['input_ids'].to(DEVICE)
    attn_mask = enc['attention_mask'].to(DEVICE)

    all_logits = []
    for m in models:
        with torch.no_grad():
            with torch.amp.autocast('cuda'):
                logits = m(input_ids=input_ids,
                           attention_mask=attn_mask).logits
        all_logits.append(logits.cpu().float())
    # Average over models, then drop the batch dimension of size 1
    avg_logits = torch.stack(all_logits).mean(0).squeeze(0)

    probs               = torch.softmax(avg_logits, dim=-1)
    max_probs, pred_ids = probs.max(dim=-1)

    word_preds = {}
    for pos, wid in enumerate(word_ids):
        # Skip special/padding positions and every subtoken after the first
        if wid is None or wid in word_preds:
            continue
        label = id2label[pred_ids[pos].item()]
        conf  = max_probs[pos].item()
        if label == 'O':
            word_preds[wid] = 'O'
            continue
        # Strip only the leading BIO prefix. str.replace would also delete
        # 'B-'/'I-' occurring inside an entity name and break the lookup.
        etype     = label[2:] if label.startswith(('B-', 'I-')) else label
        threshold = entity_thresholds.get(etype, default_threshold)
        word_preds[wid] = label if conf >= threshold else 'O'

    return [word_preds.get(i, 'O') for i in range(len(tokens))]


# ----------------------------------------------------------------
# F-BETA SCORER
# ----------------------------------------------------------------
def compute_fbeta_score(pred_spans_list, gold_spans_list, beta=0.2):
    """
    Span-level precision, recall and F-beta, micro-averaged over all examples.

    Predicted and gold spans of each example are matched as multisets: a span
    that occurs k times on one side can match at most k occurrences on the
    other. Denominators carry a 1e-9 term so empty inputs yield 0.0.

    Returns:
        Tuple ``(precision, recall, fbeta)``.
    """
    matched_total = predicted_total = gold_total = 0
    for preds, golds in zip(pred_spans_list, gold_spans_list):
        pred_counts = Counter(preds)
        gold_counts = Counter(golds)
        overlap = pred_counts & gold_counts  # per-span min of both counts
        matched_total   += sum(overlap.values())
        predicted_total += sum(pred_counts.values())
        gold_total      += sum(gold_counts.values())

    # predicted_total equals TP + FP, gold_total equals TP + FN
    precision = matched_total / (predicted_total + 1e-9)
    recall    = matched_total / (gold_total + 1e-9)
    numerator   = (1 + beta**2) * precision * recall
    denominator = beta**2 * precision + recall + 1e-9
    fbeta = numerator / denominator
    return precision, recall, fbeta


# ----------------------------------------------------------------
# STEP 1: GLOBAL THRESHOLD TUNING
# ----------------------------------------------------------------
print('='*60, 'STEP 1: Global threshold tuning', '='*60, sep='\n')

# Validation split of the last fold serves as the tuning set
tune_val_idx  = fold_val_indices[-1]
tune_examples = [train_examples[i] for i in tune_val_idx]

# Gold spans per example, without the 'O' pseudo-entity
gold_spans_all = [
    [(t, v) for t, v in bio_to_spans(ex['tokens'], ex['tags']) if t != 'O']
    for ex in tune_examples
]

print(f'Validation set: {len(tune_examples)} examples\n')
print(f'{"Threshold":>10} | {"Precision":>10} | {"Recall":>8} | {"F-beta":>10}')
print('-'*46)

best_threshold = 0.5
best_fbeta     = 0.0

for threshold in np.arange(0.40, 0.97, 0.03):
    # One uniform cut-off for every entity type
    simple_t = {}
    for label in label2id:
        if label.startswith('B-'):
            simple_t[label.replace('B-', '')] = threshold

    pred_spans_all = []
    for ex in tune_examples:
        preds = get_word_level_predictions_per_entity(
            fold_models, ex['tokens'], simple_t, threshold)
        pred_spans_all.append(
            [(t, v) for t, v in bio_to_spans(ex['tokens'], preds) if t != 'O'])

    p, r, f = compute_fbeta_score(pred_spans_all, gold_spans_all)

    # Strict comparison: the first (lowest) threshold wins ties
    improved = f > best_fbeta
    marker   = ' <-- BEST' if improved else ''
    print(f'{threshold:>10.2f} | {p:>10.4f} | {r:>8.4f} | {f:>10.4f}{marker}')
    if improved:
        best_fbeta, best_threshold = f, threshold

print(f'\nBest global threshold: {best_threshold:.2f}')


# ----------------------------------------------------------------
# STEP 2: PER-ENTITY THRESHOLD TUNING
# ----------------------------------------------------------------
print('\n' + '='*60)
print('STEP 2: Per-entity threshold tuning')
print('='*60)

# Based on your error analysis - these had most FPs
HIGH_FP_ENTITIES = [
    'Kompatibles_Fahrzeug_Modell',
    'Einbauposition',
    'Produktart',
    'Im_Lieferumfang_Enthalten',
]

# Start every entity at global best
tuned_thresholds = {
    label.replace('B-', ''): best_threshold
    for label in label2id if label.startswith('B-')
}

# Gold spans depend neither on the entity being tuned nor on the candidate
# threshold, so they are extracted once instead of once per threshold.
gold_spans_by_example = [bio_to_spans(ex['tokens'], ex['tags']) for ex in tune_examples]

for entity in HIGH_FP_ENTITIES:
    if f'B-{entity}' not in label2id:
        print(f'  Skipping {entity} (not in training labels)')
        continue

    # Only spans of the entity under tuning are scored
    gold_spans_local = [
        [(t2, v) for t2, v in gold if t2 == entity]
        for gold in gold_spans_by_example
    ]

    best_e_t     = best_threshold
    best_e_fbeta = 0.0

    # Search upwards only: the goal is to trade recall for precision
    for t in np.arange(best_threshold, 0.97, 0.02):
        test_t = dict(tuned_thresholds)
        test_t[entity] = t

        pred_spans_all = []
        for ex in tune_examples:
            preds = get_word_level_predictions_per_entity(
                fold_models, ex['tokens'], test_t)
            spans = bio_to_spans(ex['tokens'], preds)
            pred_spans_all.append(
                [(t2, v) for t2, v in spans if t2 == entity])

        p, r, f = compute_fbeta_score(pred_spans_all, gold_spans_local)
        if f > best_e_fbeta:
            best_e_fbeta = f
            best_e_t     = t

    # Later entities are tuned with this entity's new threshold already applied
    tuned_thresholds[entity] = best_e_t
    changed = f'{best_threshold:.2f} → {best_e_t:.2f}'
    print(f'  {entity}: {changed}  (fbeta={best_e_fbeta:.4f})')

print('\nFinal thresholds (changed only):')
for entity, t in sorted(tuned_thresholds.items()):
    if abs(t - best_threshold) > 0.01:
        print(f'  {entity}: {t:.2f}')


# ----------------------------------------------------------------
# STEP 3: GENERATE SUBMISSION FOR ALL 25,000 QUIZ RECORDS
# ----------------------------------------------------------------
print('\n' + '='*60)
print('STEP 3: Generating submission (records 5001-30000)')
print('='*60)
print(f'Total quiz records to process: {len(quiz_examples)}')

# Fixed column order. Passing it to the DataFrame constructor also guarantees
# the columns exist when no prediction survives filtering (rows == []), so the
# summary and validation code cannot fail with a KeyError.
submission_columns = ['record_number', 'category_id', 'aspect_name', 'aspect_value']

rows    = []
skipped = 0

for i, ex in enumerate(quiz_examples):
    if i % 3000 == 0:
        print(f'  {i}/{len(quiz_examples)} processed...')

    # VALID_ASPECTS is keyed by string category IDs
    category           = str(ex['category_id'])
    valid_for_category = VALID_ASPECTS.get(category, set())

    bio_preds = get_word_level_predictions_per_entity(
        fold_models, ex['tokens'], tuned_thresholds)
    spans = bio_to_spans(ex['tokens'], bio_preds)

    for entity_type, aspect_value in spans:
        # 'O' spans are never submitted
        if entity_type == 'O':
            continue
        # Drop aspects that are not allowed for this listing's category
        if entity_type not in valid_for_category:
            skipped += 1
            continue
        rows.append({
            'record_number': ex['record_id'],
            'category_id':   ex['category_id'],
            'aspect_name':   entity_type,
            'aspect_value':  aspect_value
        })

df_sub = pd.DataFrame(rows, columns=submission_columns)

print(f'\nSkipped {skipped} predictions (wrong category)')
print(f'Total rows in submission: {len(df_sub)}')
print(f'Records with ≥1 prediction: {df_sub["record_number"].nunique()} / {len(quiz_examples)}')
print(f'Records with 0 predictions: {len(quiz_examples) - df_sub["record_number"].nunique()}')
print('\nAspect distribution:')
print(df_sub['aspect_name'].value_counts().to_string())


# ----------------------------------------------------------------
# STEP 4: FULL VALIDATION (checks every single row)
# ----------------------------------------------------------------
print('\n' + '='*60)
print('STEP 4: Full validation (every row)')
print('='*60)

# Check 1: No nulls
assert df_sub.isnull().sum().sum() == 0, \
    'FAIL: NULL values found in submission!'
print(f'PASSED: No null values ({len(df_sub)} rows checked)')

# Check 2: No TAB characters in aspect values (TAB is the column separator)
tab_mask = df_sub['aspect_value'].str.contains('\t', regex=False)
assert not tab_mask.any(), \
    f'FAIL: TAB in aspect_value at rows: {df_sub[tab_mask].index.tolist()[:5]}'
print('PASSED: No TAB characters in aspect values')

# Check 3: Every submitted record ID belongs to a quiz example.
# Both sides are normalised to str; comparing raw record_id values with the
# astype(str) column would flag every row if record_id is stored as int.
valid_record_ids = {str(ex['record_id']) for ex in quiz_examples}
sub_record_ids   = set(df_sub['record_number'].astype(str))
invalid_ids      = sub_record_ids - valid_record_ids
assert len(invalid_ids) == 0, \
    f'FAIL: {len(invalid_ids)} record IDs not in quiz range! Examples: {list(invalid_ids)[:5]}'
print('PASSED: All record IDs valid (range 5001-30000)')

# Check 4: EVERY row has a valid aspect for its category.
# All offending rows are collected so they can be reported together.
invalid_rows = []
for idx, row in df_sub.iterrows():
    cat    = str(row['category_id'])
    aspect = row['aspect_name']
    if aspect not in VALID_ASPECTS.get(cat, set()):
        invalid_rows.append({
            'row':          idx,
            'record':       row['record_number'],
            'category':     cat,
            'bad_aspect':   aspect,
            'value':        row['aspect_value']
        })

if invalid_rows:
    print(f'\nFAIL: {len(invalid_rows)} rows have invalid category-aspect pairs!')
    print('First 10 bad rows:')
    for r in invalid_rows[:10]:
        print(f"  Row {r['row']}: record={r['record']} cat={r['category']} "
              f"aspect={r['bad_aspect']} value={r['value']}")
    raise AssertionError('Fix the VALID_ASPECTS dict or category filtering before submitting!')
else:
    print(f'PASSED: All {len(df_sub)} rows have valid category-aspect pairs')

# Check 5: aspect_value contains no leading/trailing whitespace (auto-fixed if found)
stripped = df_sub['aspect_value'].str.strip()
whitespace_issues = (stripped != df_sub['aspect_value']).sum()
if whitespace_issues > 0:
    print(f'WARNING: {whitespace_issues} aspect values have leading/trailing whitespace')
    print('Auto-fixing...')
    df_sub['aspect_value'] = stripped
else:
    print('PASSED: No leading/trailing whitespace in aspect values')

# Check 6: No empty aspect values (runs after the strip above, so
# whitespace-only values are caught too)
empty_mask = df_sub['aspect_value'].str.len() == 0
assert not empty_mask.any(), \
    f'FAIL: {empty_mask.sum()} empty aspect values found!'
print('PASSED: No empty aspect values')

# Summary
print(f'\n=== VALIDATION SUMMARY ===')
print(f'Total rows:              {len(df_sub)}')
print(f'Unique records:          {df_sub["record_number"].nunique()}')
print(f'Records with 0 aspects:  {len(quiz_examples) - df_sub["record_number"].nunique()}')
print(f'Category 1 rows:         {(df_sub["category_id"].astype(str) == "1").sum()}')
print(f'Category 2 rows:         {(df_sub["category_id"].astype(str) == "2").sum()}')
print(f'All checks passed. Safe to submit.')

# ----------------------------------------------------------------
# STEP 5: SAVE & DOWNLOAD
# ----------------------------------------------------------------
print('\n' + '='*60, 'STEP 5: Saving submission', '='*60, sep='\n')

# Minute-resolution timestamp keeps successive submissions apart
ts = datetime.datetime.now().strftime('%Y%m%d_%H%M')
sub_path = f'{OUTPUT_DIR}/submission_{ts}.tsv'

# Tab-separated, no index column, no quoting
df_sub.to_csv(sub_path, sep='\t', index=False,
              quoting=csv_mod.QUOTE_NONE, encoding='utf-8')

print(f'Saved: {sub_path}')

# Browser download of the saved file (Colab only)
from google.colab import files
files.download(sub_path)
print('\nDone! Upload the downloaded .tsv to EvalAI.',
      'Filename must contain only letters, digits, underscores and end in .tsv',
      sep='\n')

## CELL 12 — Threshold Tuning (Most Important Step!)
With β = 0.2, the F-beta score weights precision 1/β² = 25× as heavily as recall, so the metric is dominated by precision.
We find the threshold that maximises F-beta on the validation set.

In [None]:
def compute_fbeta_score(pred_spans_list, gold_spans_list, beta=0.2):
    """
    Span-level F-beta. Each span is (entity_type, aspect_value).

    Counts are accumulated over all examples (micro-average). Spans are
    compared as multisets, so duplicate entries (e.g. HONDA twice in one
    title) each have to be matched separately.

    Args:
        pred_spans_list: per-example lists of predicted spans.
        gold_spans_list: per-example lists of gold spans, aligned with
            ``pred_spans_list``.
        beta: weight of recall relative to precision.

    Returns:
        Tuple ``(precision, recall, fbeta)``; 1e-9 in each denominator avoids
        division by zero.
    """
    # Imported once per call instead of once per example inside the loop
    from collections import Counter

    tp = fp = fn = 0
    for preds, golds in zip(pred_spans_list, gold_spans_list):
        pred_counts = Counter(preds)
        gold_counts = Counter(golds)
        # Multiset intersection keeps min(pred count, gold count) per span
        matched = sum((pred_counts & gold_counts).values())
        tp += matched
        fp += sum(pred_counts.values()) - matched
        fn += sum(gold_counts.values()) - matched

    precision = tp / (tp + fp + 1e-9)
    recall    = tp / (tp + fn + 1e-9)
    fbeta = (1 + beta**2) * precision * recall / (beta**2 * precision + recall + 1e-9)
    return precision, recall, fbeta


# Tuning set: validation examples of the last fold
tune_examples = val_exs

print(f'Tuning on {len(tune_examples)} validation examples...')

# Gold spans for validation set ('O' spans are not evaluated)
gold_spans_all = [
    [(t, v) for t, v in bio_to_spans(ex['tokens'], ex['tags']) if t != 'O']
    for ex in tune_examples
]

# Sweep candidate thresholds
print('\nThreshold | Precision | Recall  | F-beta(0.2)')
print('-' * 52)

best_threshold = 0.5
best_fbeta = 0.0
results = []

for threshold in np.arange(0.40, 0.97, 0.03):
    pred_spans_all = []
    for ex in tune_examples:
        predicted_tags = get_word_level_predictions(fold_models, ex['tokens'], threshold)
        pred_spans_all.append(
            [(t, v) for t, v in bio_to_spans(ex['tokens'], predicted_tags) if t != 'O'])

    p, r, f = compute_fbeta_score(pred_spans_all, gold_spans_all)
    results.append((threshold, p, r, f))

    # Strict comparison: the first (lowest) threshold wins ties
    improved = f > best_fbeta
    marker = ' <-- BEST' if improved else ''
    print(f'  {threshold:.2f}    | {p:.4f}    | {r:.4f}  | {f:.4f}{marker}')
    if improved:
        best_fbeta, best_threshold = f, threshold

print(f'\n>>> BEST THRESHOLD = {best_threshold:.2f}')
print(f'>>> F-beta(0.2) = {best_fbeta:.4f}')

# A slightly stricter cut-off (capped at 0.95) as a second submission candidate
aggressive_threshold = min(best_threshold + 0.06, 0.95)
print(f'>>> Also try aggressive threshold = {aggressive_threshold:.2f} (submit both, compare on LB)')

Tuning on 500 validation examples...

Threshold | Precision | Recall  | F-beta(0.2)
----------------------------------------------------
  0.40    | 0.8964    | 0.8820  | 0.8958 <-- BEST
  0.43    | 0.8968    | 0.8815  | 0.8962 <-- BEST
  0.46    | 0.8986    | 0.8802  | 0.8979 <-- BEST
  0.49    | 0.8990    | 0.8794  | 0.8983 <-- BEST
  0.52    | 0.9009    | 0.8762  | 0.9000 <-- BEST
  0.55    | 0.9048    | 0.8724  | 0.9035 <-- BEST
  0.58    | 0.9072    | 0.8690  | 0.9057 <-- BEST


KeyboardInterrupt: 

In [None]:
# ================================================================
# CELL 12B: Per-entity threshold tuning
# Different entity types need different confidence cutoffs
# ================================================================

# Start with the global best threshold for all entities.
# Keys are bare entity types (no 'B-' prefix) because
# get_word_level_predictions_per_entity looks thresholds up by the stripped
# type. Keying by 'B-<type>' would never match, and every entity would
# silently fall back to the function's default of 0.5.
entity_thresholds = {
    (label[2:] if label.startswith('B-') else label): best_threshold
    for label in label2id
    if label != 'O' and not label.startswith('I-')
}

# Entity types with lots of FPs need HIGHER thresholds
# Entity types with lots of FNs and few FPs need LOWER thresholds
# Based on your error analysis:
HIGH_FP_ENTITIES = ['Kompatibles_Fahrzeug_Modell', 'Einbauposition', 'Produktart', 'Im_Lieferumfang_Enthalten']
LOW_FP_ENTITIES  = ['Herstellernummer', 'Hersteller', 'Bremsscheibenart', 'Produktlinie']

def get_word_level_predictions_per_entity(models, tokens, entity_thresholds, default_threshold=0.5):
    """
    Predict one BIO label per word with an ensemble, using per-entity thresholds.

    Like get_word_level_predictions, but the confidence cutoff is looked up per
    entity type instead of being a single global value.

    Args:
        models: Iterable of models; each is called with ``input_ids`` and
            ``attention_mask`` and must return an object with ``.logits``.
            Their logits are averaged before the softmax.
        tokens: The words of one title, already split (the tokenizer is called
            with ``is_split_into_words=True``).
        entity_thresholds: Dict mapping an entity type (label WITHOUT its
            ``B-``/``I-`` prefix) to the minimum softmax probability required
            to keep a prediction of that type.
        default_threshold: Cutoff used for entity types that are missing from
            ``entity_thresholds``.

    Returns:
        A list of ``len(tokens)`` labels. A word is labelled ``'O'`` when the
        model predicts ``'O'``, when its confidence is below the threshold for
        its entity type, or when it received no sub-token (for example because
        it was cut off by ``MAX_LEN`` truncation).
    """
    enc = tokenizer(
        tokens,
        is_split_into_words=True,
        truncation=True,
        max_length=MAX_LEN,
        padding='max_length',
        return_tensors='pt'
    )
    word_ids = enc.word_ids(batch_index=0)
    input_ids = enc['input_ids'].to(DEVICE)
    attn_mask  = enc['attention_mask'].to(DEVICE)

    all_logits = []
    with torch.no_grad():
        for m in models:
            with torch.amp.autocast('cuda'):  # updated API to fix FutureWarning
                logits = m(input_ids=input_ids, attention_mask=attn_mask).logits
            # Average in float32 on the CPU regardless of the autocast dtype.
            all_logits.append(logits.cpu().float())
    avg_logits = torch.stack(all_logits).mean(0).squeeze(0)

    probs = torch.softmax(avg_logits, dim=-1)
    max_probs, pred_ids = probs.max(dim=-1)

    word_preds = {}
    for pos, wid in enumerate(word_ids):
        # Skip special/padding positions and every sub-token after the first
        # one of a word: the first sub-token decides the word's label.
        if wid is None or wid in word_preds:
            continue
        label = id2label[pred_ids[pos].item()]
        conf  = max_probs[pos].item()

        if label == 'O':
            word_preds[wid] = 'O'
        else:
            # Strip only the LEADING B-/I- prefix. A plain str.replace would
            # also delete 'B-'/'I-' occurring inside the entity name itself.
            etype = label[2:] if label.startswith(('B-', 'I-')) else label
            threshold = entity_thresholds.get(etype, default_threshold)
            word_preds[wid] = label if conf >= threshold else 'O'

    return [word_preds.get(i, 'O') for i in range(len(tokens))]


# Tune each high-FP entity's threshold independently
print('Tuning per-entity thresholds...')
print('(Only tuning the problematic entities to save time)\n')

tuned_thresholds = dict(entity_thresholds)  # start from global best

# Gold spans do not depend on any threshold, so decode them once instead of
# once per (entity, threshold) pair.
# Assumes bio_to_spans is pure and compute_fbeta_score does not mutate its inputs - TODO confirm.
gold_spans_by_example = [bio_to_spans(ex['tokens'], ex['tags']) for ex in tune_examples]

# NOTE(review): every candidate threshold still re-runs the whole ensemble over
# tune_examples; caching each word's predicted label and confidence once would
# make this sweep far cheaper.
for entity in HIGH_FP_ENTITIES:
    b_entity = f'B-{entity}'
    if b_entity not in label2id:
        continue

    # Gold spans restricted to the entity being tuned (fixed for the whole sweep).
    gold_spans_all_local = [
        [(t2, v) for t2, v in gold_spans if t2 == entity]
        for gold_spans in gold_spans_by_example
    ]

    best_e_threshold = best_threshold
    best_e_fbeta = 0.0

    # Only thresholds at or above the global best are tried: these entities
    # were picked because they over-predict.
    for t in np.arange(best_threshold, 0.97, 0.02):
        test_thresholds = dict(tuned_thresholds)
        test_thresholds[entity] = t

        pred_spans_all = []
        for ex in tune_examples:
            bio_preds = get_word_level_predictions_per_entity(
                fold_models, ex['tokens'], test_thresholds)
            spans = bio_to_spans(ex['tokens'], bio_preds)
            pred_spans_all.append([(t2, v) for t2, v in spans if t2 == entity])

        p, r, f = compute_fbeta_score(pred_spans_all, gold_spans_all_local)
        if f > best_e_fbeta:
            best_e_fbeta = f
            best_e_threshold = t

    # Later entities are tuned with this entity's chosen threshold in place.
    tuned_thresholds[entity] = best_e_threshold
    print(f'  {entity}: {best_threshold:.2f} → {best_e_threshold:.2f} (fbeta={best_e_fbeta:.4f})')

print('\nFinal per-entity thresholds:')
for entity, t in tuned_thresholds.items():
    if t != best_threshold:
        print(f'  {entity}: {t:.2f}  (changed from {best_threshold:.2f})')

NameError: name 'label2id' is not defined

## CELL 13 — Generate Quiz Submission
This generates the submission file for records 5001-30000.

In [None]:
## CELL 13 — Generate Quiz Submission (Correct EvalAI Format)

import datetime
import csv
import pandas as pd

# generate_submission_v2 is the per-entity-threshold variant of generate_submission; call it with tuned_thresholds.

def generate_submission_v2(examples, models, entity_thresholds, output_path):
    """
    Predict aspects for every example and write them as a tab-separated file.

    Args:
        examples: Sized sequence of dicts with the keys 'tokens', 'category_id'
            and 'record_id'.
        models: Ensemble passed through to get_word_level_predictions_per_entity.
        entity_thresholds: Per-entity confidence cutoffs, passed through to
            get_word_level_predictions_per_entity.
        output_path: Destination of the TSV file.

    Returns:
        The DataFrame that was written, with the columns record_number,
        category_id, aspect_name and aspect_value. The columns are present
        even when no span survives filtering, so the file always has a header.
    """
    columns = ['record_number', 'category_id', 'aspect_name', 'aspect_value']

    print('Generating submission (per-entity thresholds)...')
    rows = []
    for i, ex in enumerate(examples):
        if i % 2000 == 0:
            print(f'  {i}/{len(examples)}')
        bio_preds = get_word_level_predictions_per_entity(
            models, ex['tokens'], entity_thresholds)
        spans = bio_to_spans(ex['tokens'], bio_preds)
        cat = ex['category_id']

        for entity_type, aspect_value in spans:
            if entity_type == 'O':
                continue

            # Filter invalid aspects for this category
            if entity_type not in allowed_aspects_per_category[cat]:
                continue

            rows.append({
                'record_number': ex['record_id'],
                'category_id':   ex['category_id'],
                'aspect_name':   entity_type,
                'aspect_value':  aspect_value
            })

    # Passing `columns` keeps the schema (and the TSV header) when `rows` is empty.
    df_sub = pd.DataFrame(rows, columns=columns)
    # QUOTE_NONE: values are written verbatim, without CSV quoting.
    df_sub.to_csv(output_path, sep='\t', index=False,
                  quoting=csv.QUOTE_NONE, encoding='utf-8')
    print(f'Saved: {output_path} ({len(df_sub)} rows)')
    return df_sub

# Build the per-entity-threshold submission under OUTPUT_DIR.
# Needs tuned_thresholds from CELL 12B; if that cell has not completed, this
# line fails with a NameError. quiz_examples, fold_models and OUTPUT_DIR come
# from earlier cells as well.
sub_path_v3 = f'{OUTPUT_DIR}/submission_per_entity_thresholds.tsv'
df_sub_v3 = generate_submission_v2(quiz_examples, fold_models, tuned_thresholds, sub_path_v3)

# Hand the written file to Colab's download helper.
from google.colab import files
files.download(sub_path_v3)

NameError: name 'tuned_thresholds' is not defined

## CELL 14 — Generate Aggressive Threshold Submission
Because the metric is F-beta with β = 0.2, which weights precision far more heavily than recall, the stricter threshold often scores higher on the leaderboard.

In [None]:
# Second submission: same ensemble, stricter global cutoff (aggressive_threshold).
# Relies on ts, generate_submission, df_sub_v1 and best_threshold from earlier cells.
sub_path_v2 = f'{OUTPUT_DIR}/submission_t{aggressive_threshold:.2f}_{ts}.tsv'
df_sub_v2 = generate_submission(
    quiz_examples,
    fold_models,
    aggressive_threshold,
    sub_path_v2,
)

# Side-by-side row counts of the two submissions, emitted as one message.
print('\n'.join([
    f'\nv1 (t={best_threshold:.2f}): {len(df_sub_v1)} rows (more recall)',
    f'v2 (t={aggressive_threshold:.2f}): {len(df_sub_v2)} rows (more precision)',
    'Submit v1 first. If LB score disappoints, try v2.',
]))

Generating submission for 25000 examples...
Threshold: 0.8500000000000003
  Processing 0/25000...


  with torch.cuda.amp.autocast():


  Processing 2000/25000...
  Processing 4000/25000...
  Processing 6000/25000...
  Processing 8000/25000...
  Processing 10000/25000...
  Processing 12000/25000...
  Processing 14000/25000...
  Processing 16000/25000...
  Processing 18000/25000...
  Processing 20000/25000...
  Processing 22000/25000...
  Processing 24000/25000...

Saved: /content/drive/MyDrive/ebay_ner/outputs/submission_t0.85_1444.tsv
Total rows: 159608
Unique records with predictions: 24885

v1 (t=0.79): 163692 rows (more recall)
v2 (t=0.85): 159608 rows (more precision)
Submit v1 first. If LB score disappoints, try v2.


## CELL 15 — Validate & Download Submission Files

In [None]:
def validate_submission(df_sub, quiz_examples):
    """
    Sanity-check a submission DataFrame and print a short report.

    Args:
        df_sub: DataFrame expected to contain the columns record_number,
            category_id, aspect_name and aspect_value.
        quiz_examples: Iterable of dicts with a 'record_id' key; these ids are
            the only record numbers considered valid.

    Raises:
        AssertionError: If a required column is missing, any cell is null, or
            an aspect value contains a TAB character.

    Record numbers that are not quiz ids only produce a WARNING line; they do
    not raise.
    """
    print('=== Submission Validation ===')

    # 1. Required columns
    required = ['record_number', 'category_id', 'aspect_name', 'aspect_value']
    for col in required:
        assert col in df_sub.columns, f'Missing column: {col}'
    print('OK: All required columns present')

    # 2. No null values
    nulls = df_sub.isnull().sum().sum()
    assert nulls == 0, f'{nulls} null values found!'
    print('OK: No null values')

    # 3. Record numbers are in range
    # Compare both sides as strings so integer and string ids match each other.
    valid_ids = {str(ex['record_id']) for ex in quiz_examples}
    sub_ids = set(df_sub['record_number'].astype(str))
    invalid = sub_ids - valid_ids
    if invalid:
        # Sorted so the sample shown is deterministic across runs.
        print(f'WARNING: {len(invalid)} record numbers not in quiz set: {sorted(invalid)[:5]}')
    else:
        print(f'OK: All record numbers are valid quiz IDs')

    # 4. Aspect values use only ASCII space (not tab)
    # Literal (non-regex) match; astype(str) keeps the check valid for
    # non-string values such as bare numbers.
    has_tab = df_sub['aspect_value'].astype(str).str.contains('\t', regex=False).any()
    assert not has_tab, 'TAB found in aspect values!'
    print('OK: No TAB characters in aspect values')

    # 5. Show sample
    print(f'\nTotal rows: {len(df_sub)}')
    print('Sample rows:')
    print(df_sub.head(10).to_string())

# Validate the v1 submission against the quiz examples.
# NOTE(review): only df_sub_v1 is validated here, although both the v1 and the
# v2 file are downloaded below - consider validating df_sub_v2 too.
validate_submission(df_sub_v1, quiz_examples)

# Download both files
# sub_path_v1 comes from an earlier cell; sub_path_v2 is set in CELL 14.
from google.colab import files
print('\nDownloading submission files...')
files.download(sub_path_v1)
files.download(sub_path_v2)

=== Submission Validation ===
OK: All required columns present
OK: No null values
OK: All record numbers are valid quiz IDs
OK: No TAB characters in aspect values

Total rows: 163692
Sample rows:
   record_number  category_id                  aspect_name          aspect_value
0           5001            1    Kompatible_Fahrzeug_Marke                  OPEL
1           5001            1  Kompatibles_Fahrzeug_Modell  ASTRA H 1.7 CDTI-SET
2           5001            1         Anzahl_Der_Einheiten                     2
3           5001            1    Im_Lieferumfang_Enthalten         Bremsscheiben
4           5001            1         Anzahl_Der_Einheiten                     4
5           5001            1    Im_Lieferumfang_Enthalten                Beläge
6           5001            1               Einbauposition                    VA
7           5002            1                   Produktart                  Satz
8           5002            1    Im_Lieferumfang_Enthalten           Gabelf

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## CELL 16 — Quick Error Analysis on Validation Set (Optional)
Run this to understand what your model gets wrong before submitting.

In [None]:
from collections import Counter

# Per-entity-type tallies of wrong spans, plus up to five sample errors of each
# kind for manual inspection.
fp_types = Counter()
fn_types = Counter()
fp_examples = []
fn_examples = []

# Uses the single global best_threshold (get_word_level_predictions), not the
# per-entity thresholds.
for ex in tune_examples[:200]:  # Sample 200 val examples
    bio_preds = get_word_level_predictions(fold_models, ex['tokens'], best_threshold)
    # Spans are compared as sets, so duplicate spans within one title count once.
    pred_spans = set(bio_to_spans(ex['tokens'], bio_preds))
    gold_spans = set(bio_to_spans(ex['tokens'], ex['tags']))
    # Drop 'O' spans; only real entity spans are scored.
    pred_spans = {s for s in pred_spans if s[0] != 'O'}
    gold_spans = {s for s in gold_spans if s[0] != 'O'}

    # False positive: predicted span with no exactly matching gold span.
    for span in pred_spans - gold_spans:
        fp_types[span[0]] += 1
        if len(fp_examples) < 5:
            fp_examples.append({'title': ex['title'], 'predicted': span})

    # False negative: gold span with no exactly matching predicted span.
    for span in gold_spans - pred_spans:
        fn_types[span[0]] += 1
        if len(fn_examples) < 5:
            fn_examples.append({'title': ex['title'], 'missed': span})

# Report counts per entity type, most frequent first.
print('=== FALSE POSITIVES by type (over-predicted) ===')
for t, c in fp_types.most_common():
    print(f'  {t}: {c}')

print('\n=== FALSE NEGATIVES by type (under-predicted) ===')
for t, c in fn_types.most_common():
    print(f'  {t}: {c}')

# Show the collected sample errors together with their titles.
print('\n--- Sample False Positives (wrong predictions) ---')
for x in fp_examples:
    print(f'  Title: {x["title"]}')
    print(f'  Incorrectly predicted: {x["predicted"]}\n')

print('--- Sample False Negatives (missed entities) ---')
for x in fn_examples:
    print(f'  Title: {x["title"]}')
    print(f'  Missed: {x["missed"]}\n')

  with torch.cuda.amp.autocast():


=== FALSE POSITIVES by type (over-predicted) ===
  Kompatibles_Fahrzeug_Modell: 62
  Produktart: 10
  Einbauposition: 9
  Im_Lieferumfang_Enthalten: 8
  Herstellernummer: 3
  Kompatible_Fahrzeug_Marke: 3
  Kompatibles_Fahrzeug_Jahr: 2
  Bremsscheiben-Aussendurchmesser: 2
  Hersteller: 1

=== FALSE NEGATIVES by type (under-predicted) ===
  Kompatibles_Fahrzeug_Modell: 127
  Produktart: 30
  Im_Lieferumfang_Enthalten: 25
  Einbauposition: 14
  Herstellernummer: 10
  Kompatible_Fahrzeug_Marke: 7
  Hersteller: 5
  Bremsscheibenart: 3
  Produktlinie: 3
  Kompatibles_Fahrzeug_Jahr: 3
  Oe/Oem_Referenznummer(N): 3
  Material: 2
  Größe: 2
  Besonderheiten: 2
  Modell: 2
  Breite: 1
  Länge: 1
  Stärke: 1
  Bremsscheiben-Aussendurchmesser: 1
  Menge: 1
  Anwendung: 1

--- Sample False Positives (wrong predictions) ---
  Title: Bremsscheiben Bremsbeläge vorne hinten 5-Loch für Opel Astra G Zafira A 2.0 OPC
  Incorrectly predicted: ('Kompatibles_Fahrzeug_Modell', 'Astra G')

  Title: Bremsscheib

## CELL 17 — OOM Fixes & Troubleshooting
Run this cell if anything breaks.

In [None]:
# ============================================================
# PROBLEM: CUDA out of memory
# FIX 1: Reduce batch size
#   BATCH_SIZE = 8   (in Cell 10)
#
# FIX 2: Switch to smaller model
#   MODEL_NAME = 'deepset/gbert-base'   (in Cell 7)
#
# FIX 3: Enable gradient checkpointing
#   Add after model = load_fresh_model().to(DEVICE):
#   model.gradient_checkpointing_enable()
#
# FIX 4: Clear memory between folds
import gc
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print('Memory cleared')
    # "used" is memory currently allocated by tensors; "free" is the device
    # total minus that figure.
    used_bytes = torch.cuda.memory_allocated()
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print('GPU memory used:', round(used_bytes/1e9,2), 'GB')
    print('GPU memory free:', round((total_bytes - used_bytes)/1e9,2), 'GB')
else:
    # Querying device properties without CUDA raises, which would make this
    # troubleshooting cell fail exactly when the GPU runtime is missing.
    print('Memory cleared')
    print('GPU memory: n/a (CUDA is not available - check Runtime > Change runtime type)')

# ============================================================
# PROBLEM: Colab disconnected mid-training
# FIX: Models are saved to Drive after each fold.
# Re-run Cell 10 - already-saved folds just need to be reloaded:
#
# fold_models = []
# for fold_num in range(1, N_FOLDS+1):
#     m = load_fresh_model().to(DEVICE)
#     m.load_state_dict(torch.load(f'{OUTPUT_DIR}/fold{fold_num}_best.pt'))
#     m.eval()
#     fold_models.append(m)
#     print(f'Loaded fold {fold_num}')

# ============================================================
# PROBLEM: All predictions are 'O'
# Your threshold is too high. Try threshold=0.3 first to confirm
# the model is predicting anything at all, then tune up.

# ============================================================
# PROBLEM: Empty tag not reading correctly
# Make sure you used keep_default_na=False, na_values=None in read_csv
# Check: print(df_train_raw[COL_TAG].value_counts(dropna=False))

print('See comments above for fixes.')

In [None]:
# CELL A: Fix Fahrzeug_Modell + generate better submission
# Run RIGHT NOW

import csv as csv_mod, datetime
import numpy as np
from collections import Counter

# --- Find best threshold for Fahrzeug_Modell ---
# Sweeps the Kompatibles_Fahrzeug_Modell cutoff while every other entity keeps
# its value from tuned_thresholds (CELL 12B must have completed first).
print('Finding best Fahrzeug_Modell threshold...')
print(f'{"threshold":>10} | {"Precision":>10} | {"Recall":>8} | {"F-beta":>10} | {"# preds":>8}')
print('-'*58)

best_modell_t     = 0.76
best_modell_fbeta = 0.0

# Gold spans for this entity do not depend on the threshold: decode them once
# instead of on every sweep step.
# Assumes bio_to_spans is pure and compute_fbeta_score does not mutate its inputs - TODO confirm.
gold_spans_local = []
for ex in tune_examples:
    gold = bio_to_spans(ex['tokens'], ex['tags'])
    gold_spans_local.append(
        [(t2, v) for t2, v in gold
         if t2 == 'Kompatibles_Fahrzeug_Modell'])

for t_modell in np.arange(0.76, 0.97, 0.02):
    test_t = dict(tuned_thresholds)
    test_t['Kompatibles_Fahrzeug_Modell'] = t_modell

    pred_spans_all = []
    for ex in tune_examples:
        preds = get_word_level_predictions_per_entity(
            fold_models, ex['tokens'], test_t)
        spans = bio_to_spans(ex['tokens'], preds)
        pred_spans_all.append(
            [(t2, v) for t2, v in spans
             if t2 == 'Kompatibles_Fahrzeug_Modell'])

    p, r, f = compute_fbeta_score(pred_spans_all, gold_spans_local)
    count   = sum(len(s) for s in pred_spans_all)
    # The marker flags every row that improves on the best score seen so far.
    marker  = ' <-- BEST' if f > best_modell_fbeta else ''
    print(f'{t_modell:>10.2f} | {p:>10.4f} | {r:>8.4f} | '
          f'{f:>10.4f} | {count:>8}{marker}')

    if f > best_modell_fbeta:
        best_modell_fbeta = f
        best_modell_t     = t_modell

print(f'\nBest threshold: {best_modell_t:.2f}')

# Update threshold
tuned_thresholds['Kompatibles_Fahrzeug_Modell'] = best_modell_t

# --- Generate new submission ---
# Same row schema as the other submission files. Aspects that VALID_ASPECTS
# does not list for the record's category are dropped, and a category missing
# from VALID_ASPECTS yields no rows at all.
rows = []
for ex in quiz_examples:
    category           = str(ex['category_id'])
    valid_for_category = VALID_ASPECTS.get(category, set())
    bio_preds          = get_word_level_predictions_per_entity(
        fold_models, ex['tokens'], tuned_thresholds)
    spans              = bio_to_spans(ex['tokens'], bio_preds)
    for entity_type, aspect_value in spans:
        if entity_type == 'O':
            continue
        if entity_type not in valid_for_category:
            continue
        rows.append({
            'record_number': ex['record_id'],
            'category_id':   ex['category_id'],
            'aspect_name':   entity_type,
            'aspect_value':  aspect_value
        })

# Passing `columns` keeps the schema when `rows` is empty; without it the
# aspect_name lookup below raises KeyError and the TSV is written headerless.
df_new   = pd.DataFrame(
    rows, columns=['record_number', 'category_id', 'aspect_name', 'aspect_value'])
ts       = datetime.datetime.now().strftime('%Y%m%d_%H%M')
new_path = f'{OUTPUT_DIR}/submission_modell_fix_{ts}.tsv'
df_new.to_csv(new_path, sep='\t', index=False,
              quoting=csv_mod.QUOTE_NONE, encoding='utf-8')

# NOTE(review): the "old" count is a hard-coded figure, not computed in this
# cell - confirm it still describes the submission being compared against.
print(f'Old Fahrzeug_Modell count: 35,509')
print(f'New Fahrzeug_Modell count: '
      f'{(df_new["aspect_name"]=="Kompatibles_Fahrzeug_Modell").sum():,}')
print(f'Total rows: {len(df_new):,}')

from google.colab import files
files.download(new_path)
print('\nDone! Submit this file to EvalAI now.')

Finding best Fahrzeug_Modell threshold...
 threshold |  Precision |   Recall |     F-beta |  # preds
----------------------------------------------------------


NameError: name 'tuned_thresholds' is not defined