In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/lib/kaggle/gcp.py
/kaggle/input/synthetic-gemini-created-dataset-for-finteuning/val.jsonl
/kaggle/input/synthetic-gemini-created-dataset-for-finteuning/test.jsonl
/kaggle/input/synthetic-gemini-created-dataset-for-finteuning/train.jsonl
/kaggle/input/synthetic-gemini-created-dataset-for-finteuning/llm_intent_dataset.jsonl
/kaggle/input/bonds-query-classifier-finetuning-slm-dataset/train_final.jsonl
/kaggle/input/bonds-query-classifier-finetuning-slm-dataset/test_final.jsonl
/kaggle/input/bonds-query-classifier-finetuning-slm-dataset/equity_edge_cases.jsonl
/kaggle/input/bonds-query-classifier-finetuning-slm-dataset/val_final.jsonl
/kaggle/input/bonds-query-classifier-finetuning-slm-dataset/bond_edge_cases.jsonl
/kaggle/input/bonds-query-classifier-finetuning-slm-dataset/general_non_bond_edge_cases.jsonl


In [34]:
# ===== GEMINI SETUP CELL =====
!pip install -q google-genai

import os

# Paste your key here (or use Kaggle secrets and set env there)
os.environ["GEMINI_API_KEY"] = "AIzaSyD-9r0N-YGJfL_hZR2elzwpc6m4f6PPb2Y"

print("GEMINI_API_KEY set:", "GEMINI_API_KEY" in os.environ)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


GEMINI_API_KEY set: True


In [4]:
import os
import json
import random
from pathlib import Path
from typing import List, Dict, Any, Optional

from sklearn.model_selection import train_test_split
from google import genai

# ============================================================
# CONFIG
# ============================================================
DATA_DIR = Path("/kaggle/input/synthetic-gemini-created-dataset-for-finteuning")
OUTPUT_DIR = Path("/kaggle/working/extended_intent_dataset")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

NON_BOND_CONFIG = {
    "model": "gemini-2.0-flash",
    "samples_per_intent": 400,   # per non-bond intent (tune if needed)
    "max_per_call": 40,          # how many examples per Gemini call
    "augmentation_factor": 0.4,  # 40% extra augmented rows on combined data
    "train_frac": 0.7,
    "val_frac": 0.15,            # test_frac = 1 - train_frac - val_frac
}

assert os.environ.get("GEMINI_API_KEY"), "Set GEMINI_API_KEY in Kaggle before running this cell."

# ============================================================
# UTILS: LOAD EXISTING BOND DATA
# ============================================================

def load_jsonl(path: Path) -> List[Dict[str, Any]]:
    data = []
    if not path.exists():
        print(f"⚠ File not found, skipping: {path}")
        return data
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError:
                continue
    return data


bond_train = load_jsonl(DATA_DIR / "train.jsonl")
bond_val   = load_jsonl(DATA_DIR / "val.jsonl")
bond_test  = load_jsonl(DATA_DIR / "test.jsonl")
bond_llm   = load_jsonl(DATA_DIR / "llm_intent_dataset.jsonl")

bond_all = bond_train + bond_val + bond_test + bond_llm

# Deduplicate on (text, intent)
seen = set()
deduped_bond: List[Dict[str, Any]] = []
for row in bond_all:
    key = (row.get("text", "").strip(), row.get("intent"))
    if key in seen:
        continue
    seen.add(key)
    deduped_bond.append(row)

print("=" * 60)
print(f"Loaded bond data: {len(bond_all)} rows -> {len(deduped_bond)} unique rows")
print("=" * 60)


# ============================================================
# SIMPLE AUGMENTER (same spirit as original DataAugmenter)
# ============================================================

class SimpleAugmenter:
    @staticmethod
    def augment_dataset(dataset: List[Dict[str, Any]], factor: float = 0.4) -> List[Dict[str, Any]]:
        num_to_augment = int(len(dataset) * factor)
        if num_to_augment <= 0:
            return dataset
        samples = random.sample(dataset, min(num_to_augment, len(dataset)))
        augmented = []
        for sample in samples:
            aug_type = random.choice(["synonym", "insertion", "deletion"])
            if aug_type == "synonym":
                augmented.append(SimpleAugmenter._synonym_replacement(sample))
            elif aug_type == "insertion":
                augmented.append(SimpleAugmenter._random_insertion(sample))
            else:
                augmented.append(SimpleAugmenter._random_deletion(sample))
        print(f"✓ Augmented {len(augmented)} extra rows")
        return dataset + augmented

    @staticmethod
    def _synonym_replacement(sample: Dict[str, Any]) -> Dict[str, Any]:
        synonyms = {
            "find": ["locate", "search for", "look for"],
            "show": ["display", "list", "present"],
            "get": ["fetch", "retrieve"],
            "explain": ["describe", "clarify", "break down"],
        }
        words = sample["text"].split()
        for i, w in enumerate(words):
            low = w.lower().strip(".,!?")
            if low in synonyms and random.random() < 0.3:
                words[i] = random.choice(synonyms[low])
        new_sample = sample.copy()
        new_sample["text"] = " ".join(words)
        return new_sample

    @staticmethod
    def _random_insertion(sample: Dict[str, Any]) -> Dict[str, Any]:
        words = sample["text"].split()
        if len(words) > 3:
            pos = random.randint(0, len(words))
            words.insert(pos, random.choice(["please", "kindly", "also"]))
        new_sample = sample.copy()
        new_sample["text"] = " ".join(words)
        return new_sample

    @staticmethod
    def _random_deletion(sample: Dict[str, Any]) -> Dict[str, Any]:
        words = sample["text"].split()
        words = [w for w in words if w.lower() not in ["please", "kindly"] or random.random() > 0.5]
        new_sample = sample.copy()
        new_sample["text"] = " ".join(words) if words else sample["text"]
        return new_sample


# ============================================================
# GEMINI: NON-BOND DATA GENERATOR
# ============================================================

class NonBondLLMDataGenerator:
    """
    Generate labelled non-bond queries for two new intents:
      - non_bond_search : things you want a search / retrieval backend to handle
      - non_bond_llm    : things better answered by an LLM directly
    """
    def __init__(self, model_name: str, samples_per_intent: int, max_per_call: int = 40):
        self.client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])
        self.model_name = model_name
        self.samples_per_intent = samples_per_intent
        self.max_per_call = max_per_call

        self.intent_descriptions = {
            "non_bond_search": (
                "Queries that are clearly NOT about bonds or fixed income. "
                "They ask to look up or fetch external information: news, "
                "documents, data, web pages, PDFs, regulations, etc. "
                "Examples verbs: 'search', 'find', 'show me', 'look up', 'get me', 'download'."
            ),
            "non_bond_llm": (
                "Queries that are clearly NOT about bonds or fixed income. "
                "They ask the assistant to reason, explain, draft, summarize, compare, "
                "or brainstorm. They are best answered directly by an LLM instead of search. "
                "Examples verbs: 'explain', 'summarize', 'compare', 'rewrite', 'draft', 'brainstorm'."
            ),
        }

        self.forbidden_keywords = [
            "bond", "bonds", "g-sec", "gsec", "debenture", "coupon rate",
            "yield", "duration", "sovereign bond", "corporate bond", "fixed income"
        ]

    def generate_dataset(self) -> List[Dict[str, Any]]:
        all_rows: List[Dict[str, Any]] = []
        print("=" * 60)
        print("GENERATING NON-BOND DATA WITH GEMINI")
        print("=" * 60)

        for intent_name in ["non_bond_search", "non_bond_llm"]:
            needed = self.samples_per_intent
            print(f"\n→ Intent: {intent_name}  target={needed}")
            while needed > 0:
                batch_size = min(self.max_per_call, needed)
                batch = self._generate_batch_for_intent(intent_name, batch_size)
                all_rows.extend(batch)
                needed -= len(batch)
                print(f"  Collected {self.samples_per_intent - needed}/{self.samples_per_intent} for {intent_name}")

        print(f"\n✓ Total non-bond rows: {len(all_rows)}")
        return all_rows

    def _generate_batch_for_intent(self, intent_name: str, batch_size: int) -> List[Dict[str, Any]]:
        description = self.intent_descriptions[intent_name]

        system_msg = (
            "You are generating synthetic training data for an intent classifier. "
            "Your job is to create realistic user queries that are NOT about bonds or fixed income."
        )

        user_msg = f"""
Generate {batch_size} diverse user queries whose PRIMARY intent is: "{intent_name}".

Intent description:
{description}

Important constraints:
- The queries MUST NOT be about bonds, fixed income, yields, coupons, debentures, or interest-rate products.
- They can be about anything else: equity markets, crypto, general knowledge, tech, sports, travel, etc.
- Mix short (3-8 words) and longer (1-3 sentences) queries.
- Vary user persona: casual, professional, student, etc.

Return the output in JSON Lines format (one JSON object per line, no array, no extra text).

Each JSON object MUST have exactly these keys:
- "text": string, the user query
- "intent": string, MUST be exactly "{intent_name}"
- "sectors": array of strings, MUST be an empty array [] for non-bond queries
- "rating": null
- "duration": string, use "medium"
- "constraints": object with boolean fields, all false:
  {{
    "preserve_yield": false,
    "maintain_liquidity": false,
    "avoid_downgrades": false,
    "sector_diversity": false,
    "rating_above_aa": false
  }}
"""

        full_prompt = system_msg + "\n\n" + user_msg

        resp = self.client.models.generate_content(
            model=self.model_name,
            contents=full_prompt,
        )

        raw = resp.text or ""
        return self._parse_jsonl(raw, intent_name)

    def _parse_jsonl(self, raw: str, intent_name: str) -> List[Dict[str, Any]]:
        rows: List[Dict[str, Any]] = []
        for line in raw.splitlines():
            line = line.strip()
            if not line or line.startswith("```"):
                continue
            if line.startswith("-"):
                line = line.lstrip("-").strip()
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue
            sample = self._normalize_record(obj, intent_name)
            if sample is not None:
                rows.append(sample)
        return rows

    def _normalize_record(self, obj: Dict[str, Any], intent_name: str) -> Optional[Dict[str, Any]]:
        text = str(obj.get("text", "")).strip()
        if not text:
            return None

        # Filter out anything that still looks bond-ish
        lower = text.lower()
        if any(kw in lower for kw in self.forbidden_keywords):
            return None

        intent = obj.get("intent", intent_name)
        if intent != intent_name:
            intent = intent_name

        # Force schema consistent with bond dataset
        sample = {
            "text": text,
            "intent": intent,
            "sectors": [],           # always empty for non-bond
            "rating": None,          # always null
            "duration": "medium",    # keep it simple
            "constraints": {
                "preserve_yield": False,
                "maintain_liquidity": False,
                "avoid_downgrades": False,
                "sector_diversity": False,
                "rating_above_aa": False,
            },
        }
        return sample


# ============================================================
# RUN GENERATION + MERGE + AUGMENT + SPLIT
# ============================================================

nonbond_gen = NonBondLLMDataGenerator(
    model_name=NON_BOND_CONFIG["model"],
    samples_per_intent=NON_BOND_CONFIG["samples_per_intent"],
    max_per_call=NON_BOND_CONFIG["max_per_call"],
)

nonbond_data = nonbond_gen.generate_dataset()
print(f"\nNon-bond label counts:")
from collections import Counter
print(Counter(row["intent"] for row in nonbond_data))

# Merge with bond data
combined = deduped_bond + nonbond_data
print(f"\nCombined dataset size before augmentation: {len(combined)}")

# Augment the combined dataset
combined_aug = SimpleAugmenter.augment_dataset(
    combined,
    factor=NON_BOND_CONFIG["augmentation_factor"],
)
print(f"Combined dataset size AFTER augmentation: {len(combined_aug)}")

# ------------------------------------------------------------
# Final stratified split by intent into train/val/test
# ------------------------------------------------------------
intents = [row["intent"] for row in combined_aug]

train_tmp, test_data = train_test_split(
    combined_aug,
    test_size=1.0 - (NON_BOND_CONFIG["train_frac"] + NON_BOND_CONFIG["val_frac"]),
    stratify=intents,
    random_state=42,
)

intents_tmp = [row["intent"] for row in train_tmp]
val_size = NON_BOND_CONFIG["val_frac"] / (NON_BOND_CONFIG["train_frac"] + NON_BOND_CONFIG["val_frac"])

train_data, val_data = train_test_split(
    train_tmp,
    test_size=val_size,
    stratify=intents_tmp,
    random_state=42,
)

print("\nFinal split sizes (all intents: bond + non-bond)")
print(f"Train: {len(train_data)}")
print(f"Val:   {len(val_data)}")
print(f"Test:  {len(test_data)}")

# Save out new JSONL files
def save_jsonl(path: Path, rows: List[Dict[str, Any]]):
    with path.open("w", encoding="utf-8") as f:
        for r in rows:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

save_jsonl(OUTPUT_DIR / "train_extended.jsonl", train_data)
save_jsonl(OUTPUT_DIR / "val_extended.jsonl",   val_data)
save_jsonl(OUTPUT_DIR / "test_extended.jsonl",  test_data)

print(f"\n✓ Saved extended datasets (with non-bond intents) to: {OUTPUT_DIR}")
print("  - train_extended.jsonl")
print("  - val_extended.jsonl")
print("  - test_extended.jsonl")

Loaded bond data: 22100 rows -> 5067 unique rows
GENERATING NON-BOND DATA WITH GEMINI

→ Intent: non_bond_search  target=400
  Collected 40/400 for non_bond_search
  Collected 80/400 for non_bond_search
  Collected 120/400 for non_bond_search
  Collected 160/400 for non_bond_search
  Collected 199/400 for non_bond_search
  Collected 238/400 for non_bond_search
  Collected 278/400 for non_bond_search
  Collected 317/400 for non_bond_search
  Collected 356/400 for non_bond_search
  Collected 396/400 for non_bond_search
  Collected 400/400 for non_bond_search

→ Intent: non_bond_llm  target=400
  Collected 40/400 for non_bond_llm
  Collected 80/400 for non_bond_llm
  Collected 120/400 for non_bond_llm
  Collected 160/400 for non_bond_llm
  Collected 200/400 for non_bond_llm
  Collected 239/400 for non_bond_llm
  Collected 279/400 for non_bond_llm
  Collected 318/400 for non_bond_llm
  Collected 357/400 for non_bond_llm
  Collected 397/400 for non_bond_llm
  Collected 400/400 for non_bond_

In [4]:
!pip install -q transformers accelerate scikit-learn

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [9]:
"""
Bond Query Classifier - Training on extended (bond + non-bond) dataset
======================================================================

Assumes these files exist (your newly created dataset):

/kaggle/input/bonds-query-classifier-finetuning-slm-dataset/train_extended.jsonl
/kaggle/input/bonds-query-classifier-finetuning-slm-dataset/val_extended.jsonl
/kaggle/input/bonds-query-classifier-finetuning-slm-dataset/test_extended.jsonl
"""

# ==================== INSTALL DEPENDENCIES ====================
import sys
import subprocess

print("=" * 60)
print("INSTALLING DEPENDENCIES")
print("=" * 60)

subprocess.check_call([
    sys.executable, "-m", "pip", "install", "-q",
    "transformers", "accelerate", "scikit-learn"
])

print("✓ Dependencies installed!\n")


# ==================== IMPORTS ====================
import os
import json
import random
from pathlib import Path
from typing import List, Dict, Any

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AutoTokenizer,
    AutoModel,
    get_cosine_schedule_with_warmup
)
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
from tqdm.auto import tqdm


# ==================== GPU CHECK ====================
print("=" * 60)
print("GPU CHECK")
print("=" * 60)

if torch.cuda.is_available():
    print(f"✓ GPU detected: {torch.cuda.get_device_name(0)}")
    print(f"✓ GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    print(f"✓ CUDA version: {torch.version.cuda}")
else:
    print("⚠ WARNING: No GPU detected! Training will be very slow.")

print()


# ==================== CONFIGURATION ====================
CONFIG = {
    'base_model': 'microsoft/deberta-v3-small',

    # ---- extended dataset paths ----
    'dataset_dir': '/kaggle/input/bonds-query-classifier-finetuning-slm-dataset',
    'train_file': 'train_extended.jsonl',
    'val_file':   'val_extended.jsonl',
    'test_file':  'test_extended.jsonl',

    'batch_size': 32,
    'num_epochs': 12,          # you can bump to 8–10 once it trains fine
    'learning_rate': 2e-5,
    'warmup_ratio': 0.1,
    'max_length': 128,
    'output_dir': '/kaggle/working/bond_classifier_v3',
    'seed': 42,
}

# Set seeds (same as old pipeline)
random.seed(CONFIG['seed'])
np.random.seed(CONFIG['seed'])
torch.manual_seed(CONFIG['seed'])
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(CONFIG['seed'])


# ==================== PYTORCH DATASET (NOW 15 INTENTS) ====================

class BondQueryDataset(Dataset):
    """PyTorch Dataset (bond + non-bond router)"""
    
    def __init__(self, data: List[Dict[str, Any]], tokenizer, max_length: int = 128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        
        # 13 original intents + 2 new router intents
        self.intent_to_id = {
            'buy_recommendation': 0,
            'sell_recommendation': 1,
            'portfolio_analysis': 2,
            'reduce_duration': 3,
            'increase_yield': 4,
            'hedge_volatility': 5,
            'sector_rebalance': 6,
            'barbell_strategy': 7,
            'switch_bonds': 8,
            'explain_recommendation': 9,
            'market_outlook': 10,
            'credit_analysis': 11,
            'forecast_prices': 12,
            'non_bond_search': 13,  # NEW
            'non_bond_llm': 14,     # NEW
        }
        
        self.sector_to_id = {
            'Sovereign': 0, 'PSU Energy': 1, 'Financial': 2,
            'Corporate': 3, 'Infrastructure': 4, 'NBFC': 5, 'Banking': 6
        }
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        sample = self.data[idx]
        
        encoding = self.tokenizer(
            sample['text'],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        # ---- intent ----
        intent_label = self.intent_to_id[sample['intent']]
        
        # ---- sectors (multi-label) ----
        sector_labels = torch.zeros(len(self.sector_to_id))
        for sector in sample.get('sectors', []):
            if sector in self.sector_to_id:
                sector_labels[self.sector_to_id[sector]] = 1
        
        # ---- rating ----
        rating = sample.get('rating')
        rating_map = {'AAA': 0, 'AA+': 1, 'AA': 2, 'A+': 3, 'A': 4, 'BBB': 5}
        rating_label = rating_map.get(rating, 6)   # 6 = "other / none"
        
        # ---- duration ----
        duration_map = {'short': 0, 'medium': 1, 'long': 2}
        duration_label = duration_map.get(sample.get('duration', 'medium'), 1)
        
        # ---- constraints (5 binary flags) ----
        constraints = sample.get('constraints', {})
        constraint_labels = torch.tensor([
            float(constraints.get('preserve_yield', False)),
            float(constraints.get('maintain_liquidity', False)),
            float(constraints.get('avoid_downgrades', False)),
            float(constraints.get('sector_diversity', False)),
            float(constraints.get('rating_above_aa', False))
        ])
        
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'intent_label': torch.tensor(intent_label),
            'sector_labels': sector_labels,
            'rating_label': torch.tensor(rating_label),
            'duration_label': torch.tensor(duration_label),
            'constraint_labels': constraint_labels
        }


# ==================== MODEL (SAME ARCH, 15 INTENT OUTPUTS) ====================

class ProductionBondClassifier(nn.Module):
    """Multi-task classifier"""
    
    def __init__(self, base_model: str = 'distilbert-base-uncased', dropout: float = 0.15):
        super().__init__()
        
        self.bert = AutoModel.from_pretrained(base_model)
        hidden_size = self.bert.config.hidden_size
        
        self.feature_layer = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.Dropout(dropout),
            nn.GELU(),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.LayerNorm(hidden_size // 2),
            nn.Dropout(dropout),
            nn.GELU()
        )
        
        feature_size = hidden_size // 2
        
        # ⬇️ changed from 13 → 15 intents
        self.intent_head = nn.Linear(feature_size, 15)
        self.sector_head = nn.Linear(feature_size, 7)
        self.rating_head = nn.Linear(feature_size, 7)
        self.duration_head = nn.Linear(feature_size, 3)
        self.constraint_head = nn.Linear(feature_size, 5)
        
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]
        features = self.feature_layer(cls_output)
        features = self.dropout(features)
        
        return {
            'intent_logits': self.intent_head(features),
            'sector_logits': self.sector_head(features),
            'rating_logits': self.rating_head(features),
            'duration_logits': self.duration_head(features),
            'constraint_logits': self.constraint_head(features),
        }


# ==================== LOSS (UNCHANGED) ====================

class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
    
    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss
        return focal_loss.mean()


class MultiTaskLoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.intent_loss_fn = FocalLoss(gamma=2.0)
        self.sector_loss_fn = nn.BCEWithLogitsLoss()
        self.rating_loss_fn = nn.CrossEntropyLoss()
        self.duration_loss_fn = nn.CrossEntropyLoss()
        self.constraint_loss_fn = nn.BCEWithLogitsLoss()
    
    def forward(self, outputs, labels):
        intent_loss = self.intent_loss_fn(outputs['intent_logits'], labels['intent_label'])
        sector_loss = self.sector_loss_fn(outputs['sector_logits'], labels['sector_labels'])
        rating_loss = self.rating_loss_fn(outputs['rating_logits'], labels['rating_label'])
        duration_loss = self.duration_loss_fn(outputs['duration_logits'], labels['duration_label'])
        constraint_loss = self.constraint_loss_fn(outputs['constraint_logits'], labels['constraint_labels'])
        
        total = intent_loss + 0.5*sector_loss + 0.3*rating_loss + 0.3*duration_loss + 0.4*constraint_loss
        
        return {'total': total, 'intent': intent_loss}


# ==================== TRAINER (UNCHANGED) ====================

class Trainer:
    def __init__(self, model, train_loader, val_loader, optimizer, scheduler, criterion, device, output_dir):
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.criterion = criterion
        self.device = device
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.best_acc = 0.0
    
    def train_epoch(self, epoch):
        self.model.train()
        total_loss = 0
        all_preds, all_labels = [], []
        
        pbar = tqdm(self.train_loader, desc=f'Epoch {epoch}')
        for batch in pbar:
            input_ids = batch['input_ids'].to(self.device)
            attention_mask = batch['attention_mask'].to(self.device)
            
            labels = {k: v.to(self.device) for k, v in batch.items() 
                      if k not in ['input_ids', 'attention_mask']}
            
            self.optimizer.zero_grad()
            outputs = self.model(input_ids, attention_mask)
            loss_dict = self.criterion(outputs, labels)
            loss = loss_dict['total']
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            self.optimizer.step()
            self.scheduler.step()
            
            total_loss += loss.item()
            preds = outputs['intent_logits'].argmax(dim=-1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels['intent_label'].cpu().numpy())
            
            pbar.set_postfix({
                'loss': f'{loss.item():.4f}',
                'acc': f'{accuracy_score(all_labels[-len(preds):], preds):.3f}'
            })
        
        return total_loss / len(self.train_loader)
    
    def evaluate(self):
        self.model.eval()
        total_loss = 0
        all_preds, all_labels = [], []
        
        with torch.no_grad():
            for batch in tqdm(self.val_loader, desc='Evaluating'):
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = {k: v.to(self.device) for k, v in batch.items() 
                          if k not in ['input_ids', 'attention_mask']}
                
                outputs = self.model(input_ids, attention_mask)
                loss_dict = self.criterion(outputs, labels)
                total_loss += loss_dict['total'].item()
                
                preds = outputs['intent_logits'].argmax(dim=-1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels['intent_label'].cpu().numpy())
        
        return {
            'loss': total_loss / len(self.val_loader),
            'accuracy': accuracy_score(all_labels, all_preds),
            'f1_macro': f1_score(all_labels, all_preds, average='macro')
        }
    
    def save_checkpoint(self, epoch, metrics):
        if metrics['accuracy'] > self.best_acc:
            self.best_acc = metrics['accuracy']
            torch.save(self.model.state_dict(), self.output_dir / 'pytorch_model.bin')
            print(f"✓ New best model saved (acc: {metrics['accuracy']:.4f})")
    
    def train(self, num_epochs):
        print("=" * 60)
        print("TRAINING")
        print("=" * 60)
        
        for epoch in range(1, num_epochs + 1):
            train_loss = self.train_epoch(epoch)
            print(f"\nEpoch {epoch}/{num_epochs} - Train Loss: {train_loss:.4f}")
            
            val_metrics = self.evaluate()
            print(f"Val Loss: {val_metrics['loss']:.4f}")
            print(f"Val Accuracy: {val_metrics['accuracy']:.4f}")
            print(f"Val F1: {val_metrics['f1_macro']:.4f}\n")
            
            self.save_checkpoint(epoch, val_metrics)
        
        print(f"Training complete! Best accuracy: {self.best_acc:.4f}\n")


# ==================== JSONL LOADER ====================

def load_jsonl(path: str) -> List[Dict[str, Any]]:
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError:
                continue
    return rows


# ==================== MAIN TRAINING FUNCTION ====================

def train_model():
    """Train on pre-built extended dataset (bond + non-bond)."""
    
    dataset_dir = CONFIG['dataset_dir']
    train_path = os.path.join(dataset_dir, CONFIG['train_file'])
    val_path   = os.path.join(dataset_dir, CONFIG['val_file'])
    test_path  = os.path.join(dataset_dir, CONFIG['test_file'])

    print("=" * 60)
    print("LOADING EXTENDED DATASET")
    print("=" * 60)
    train_data = load_jsonl(train_path)
    val_data   = load_jsonl(val_path)
    test_data  = load_jsonl(test_path)

    print(f"Train rows: {len(train_data)}")
    print(f"Val rows:   {len(val_data)}")
    print(f"Test rows:  {len(test_data)}\n")

    # ---- Tokenizer & Datasets ----
    print("=" * 60)
    print("LOADING MODEL & TOKENIZER")
    print("=" * 60)
    tokenizer = AutoTokenizer.from_pretrained(CONFIG['base_model'])
    
    train_dataset = BondQueryDataset(train_data, tokenizer, CONFIG['max_length'])
    val_dataset   = BondQueryDataset(val_data,   tokenizer, CONFIG['max_length'])
    test_dataset  = BondQueryDataset(test_data,  tokenizer, CONFIG['max_length'])
    
    train_loader = DataLoader(train_dataset, batch_size=CONFIG['batch_size'], shuffle=True)
    val_loader   = DataLoader(val_dataset,   batch_size=CONFIG['batch_size'])
    test_loader  = DataLoader(test_dataset,  batch_size=CONFIG['batch_size'])
    
    # ---- Model ----
    model = ProductionBondClassifier(CONFIG['base_model'])
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    
    print(f"✓ Model loaded: {sum(p.numel() for p in model.parameters()):,} parameters")
    print(f"✓ Device: {device}\n")
    
    # ---- Optimizer & Scheduler ----
    optimizer = torch.optim.AdamW(model.parameters(), lr=CONFIG['learning_rate'], weight_decay=0.01)
    num_training_steps = len(train_loader) * CONFIG['num_epochs']
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(num_training_steps * CONFIG['warmup_ratio']),
        num_training_steps=num_training_steps
    )
    
    criterion = MultiTaskLoss()
    
    # ---- Train ----
    trainer = Trainer(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        scheduler=scheduler,
        criterion=criterion,
        device=device,
        output_dir=CONFIG['output_dir'],
    )
    trainer.train(CONFIG['num_epochs'])
    
    # ---- Save tokenizer ----
    tokenizer.save_pretrained(CONFIG['output_dir'])
    
    # ---- Final test ----
    print("=" * 60)
    print("FINAL TEST")
    print("=" * 60)
    model.eval()
    all_preds, all_labels = [], []
    
    with torch.no_grad():
        for batch in tqdm(test_loader, desc='Testing'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask)
            preds = outputs['intent_logits'].argmax(dim=-1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch['intent_label'].numpy())
    
    test_acc = accuracy_score(all_labels, all_preds)
    test_f1 = f1_score(all_labels, all_preds, average='macro')
    
    print(f"\n✓ Test Accuracy: {test_acc:.4f}")
    print(f"✓ Test F1 Macro: {test_f1:.4f}\n")
    
    print("=" * 60)
    print("TRAINING COMPLETE!")
    print("=" * 60)
    print(f"\n✓ Model saved to: {CONFIG['output_dir']}")
    print(f"✓ Files: pytorch_model.bin, tokenizer files")
    print("\nTo download:")
    print("  1. Go to Output tab")
    print("  2. Download 'bond_classifier_v3' folder")
    print("  3. Use locally with your inference notebook")


# ==================== RUN ====================

if __name__ == '__main__':
    train_model()
 

INSTALLING DEPENDENCIES
✓ Dependencies installed!

GPU CHECK
✓ GPU detected: Tesla T4
✓ GPU memory: 15.83 GB
✓ CUDA version: 12.4

LOADING EXTENDED DATASET
Train rows: 5749
Val rows:   1232
Test rows:  1232

LOADING MODEL & TOKENIZER




✓ Model loaded: 142,206,757 parameters
✓ Device: cuda

TRAINING


Epoch 1:   0%|          | 0/180 [00:00<?, ?it/s]


Epoch 1/12 - Train Loss: 3.4867


Evaluating:   0%|          | 0/39 [00:00<?, ?it/s]

Val Loss: 1.9907
Val Accuracy: 0.6266
Val F1: 0.5090

✓ New best model saved (acc: 0.6266)


Epoch 2:   0%|          | 0/180 [00:00<?, ?it/s]


Epoch 2/12 - Train Loss: 1.2776


Evaluating:   0%|          | 0/39 [00:00<?, ?it/s]

Val Loss: 0.7625
Val Accuracy: 0.9886
Val F1: 0.9900

✓ New best model saved (acc: 0.9886)


Epoch 3:   0%|          | 0/180 [00:00<?, ?it/s]


Epoch 3/12 - Train Loss: 0.7386


Evaluating:   0%|          | 0/39 [00:00<?, ?it/s]

Val Loss: 0.6087
Val Accuracy: 0.9927
Val F1: 0.9921

✓ New best model saved (acc: 0.9927)


Epoch 4:   0%|          | 0/180 [00:00<?, ?it/s]


Epoch 4/12 - Train Loss: 0.5986


Evaluating:   0%|          | 0/39 [00:00<?, ?it/s]

Val Loss: 0.5192
Val Accuracy: 0.9951
Val F1: 0.9950

✓ New best model saved (acc: 0.9951)


Epoch 5:   0%|          | 0/180 [00:00<?, ?it/s]


Epoch 5/12 - Train Loss: 0.5137


Evaluating:   0%|          | 0/39 [00:00<?, ?it/s]

Val Loss: 0.4520
Val Accuracy: 0.9968
Val F1: 0.9965

✓ New best model saved (acc: 0.9968)


Epoch 6:   0%|          | 0/180 [00:00<?, ?it/s]


Epoch 6/12 - Train Loss: 0.4510


Evaluating:   0%|          | 0/39 [00:00<?, ?it/s]

Val Loss: 0.3984
Val Accuracy: 0.9943
Val F1: 0.9941



Epoch 7:   0%|          | 0/180 [00:00<?, ?it/s]


Epoch 7/12 - Train Loss: 0.4046


Evaluating:   0%|          | 0/39 [00:00<?, ?it/s]

Val Loss: 0.3597
Val Accuracy: 0.9951
Val F1: 0.9949



Epoch 8:   0%|          | 0/180 [00:00<?, ?it/s]


Epoch 8/12 - Train Loss: 0.3745


Evaluating:   0%|          | 0/39 [00:00<?, ?it/s]

Val Loss: 0.3411
Val Accuracy: 0.9951
Val F1: 0.9949



Epoch 9:   0%|          | 0/180 [00:00<?, ?it/s]


Epoch 9/12 - Train Loss: 0.3549


Evaluating:   0%|          | 0/39 [00:00<?, ?it/s]

Val Loss: 0.3323
Val Accuracy: 0.9943
Val F1: 0.9943



Epoch 10:   0%|          | 0/180 [00:00<?, ?it/s]


Epoch 10/12 - Train Loss: 0.3434


Evaluating:   0%|          | 0/39 [00:00<?, ?it/s]

Val Loss: 0.3241
Val Accuracy: 0.9943
Val F1: 0.9943



Epoch 11:   0%|          | 0/180 [00:00<?, ?it/s]


Epoch 11/12 - Train Loss: 0.3376


Evaluating:   0%|          | 0/39 [00:00<?, ?it/s]

Val Loss: 0.3219
Val Accuracy: 0.9951
Val F1: 0.9951



Epoch 12:   0%|          | 0/180 [00:00<?, ?it/s]


Epoch 12/12 - Train Loss: 0.3354


Evaluating:   0%|          | 0/39 [00:00<?, ?it/s]

Val Loss: 0.3209
Val Accuracy: 0.9951
Val F1: 0.9951

Training complete! Best accuracy: 0.9968

FINAL TEST


Testing:   0%|          | 0/39 [00:00<?, ?it/s]


✓ Test Accuracy: 0.9959
✓ Test F1 Macro: 0.9960

TRAINING COMPLETE!

✓ Model saved to: /kaggle/working/bond_classifier_v3
✓ Files: pytorch_model.bin, tokenizer files

To download:
  1. Go to Output tab
  2. Download 'bond_classifier_v3' folder
  3. Use locally with your inference notebook


In [28]:
import torch
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F
import time
import torch.nn as nn

# --- Model architecture (must match training) ---
class ProductionBondClassifier(nn.Module):
    """Same architecture as training: DeBERTa-v3-small + multi-task heads."""
    
    def __init__(self, base_model: str = 'microsoft/deberta-v3-small', dropout: float = 0.15):
        super().__init__()
        
        self.bert = AutoModel.from_pretrained(base_model)
        hidden_size = self.bert.config.hidden_size
        
        self.feature_layer = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.Dropout(dropout),
            nn.GELU(),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.LayerNorm(hidden_size // 2),
            nn.Dropout(dropout),
            nn.GELU()
        )
        
        feature_size = hidden_size // 2
        
        # Heads: intent + sectors + rating + duration + constraints
        # ⬇️ 15 intents now (13 bond + 2 non-bond)
        self.intent_head = nn.Linear(feature_size, 15)
        self.sector_head = nn.Linear(feature_size, 7)
        self.rating_head = nn.Linear(feature_size, 7)
        self.duration_head = nn.Linear(feature_size, 3)
        self.constraint_head = nn.Linear(feature_size, 5)
        
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]
        features = self.feature_layer(cls_output)
        features = self.dropout(features)
        
        return {
            'intent_logits': self.intent_head(features),
            'sector_logits': self.sector_head(features),
            'rating_logits': self.rating_head(features),
            'duration_logits': self.duration_head(features),
            'constraint_logits': self.constraint_head(features),
        }


class BondClassifier:
    def __init__(self, model_path: str, base_model: str = "microsoft/deberta-v3-small"):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        # Load tokenizer saved during training
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        
        # Load model + weights
        self.model = ProductionBondClassifier(base_model=base_model)
        # NOTE: training script saved as 'pytorch_model.bin'
        state_dict = torch.load(f"{model_path}/pytorch_model.bin", map_location=self.device)
        self.model.load_state_dict(state_dict)
        self.model.to(self.device)
        self.model.eval()

        # This order MUST match BondQueryDataset.intent_to_id used in training
        self.intent_names = [
            'buy_recommendation',     # 0
            'sell_recommendation',    # 1
            'portfolio_analysis',     # 2
            'reduce_duration',        # 3
            'increase_yield',         # 4
            'hedge_volatility',       # 5
            'sector_rebalance',       # 6
            'barbell_strategy',       # 7
            'switch_bonds',           # 8
            'explain_recommendation', # 9
            'market_outlook',         # 10
            'credit_analysis',        # 11
            'forecast_prices',        # 12
            'non_bond_search',        # 13 (NEW)
            'non_bond_llm',           # 14 (NEW)
        ]
    
    def _route_from_intent(self, intent: str) -> str:
        """
        Simple router decision:
        - 'non_bond_search' -> 'search'
        - 'non_bond_llm'    -> 'llm'
        - everything else   -> 'bond'
        """
        if intent == 'non_bond_search':
            return 'search'
        elif intent == 'non_bond_llm':
            return 'llm'
        else:
            return 'bond'
    
    def classify(self, query: str):
        # Tokenize
        enc = self.tokenizer(
            query,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=128
        )
        input_ids = enc["input_ids"].to(self.device)
        attention_mask = enc["attention_mask"].to(self.device)
        
        # Run model
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            intent_logits = outputs['intent_logits']
            intent_probs = F.softmax(intent_logits, dim=-1)
            
        # Get the predicted intent
        intent_idx = intent_probs.argmax(dim=-1).item()
        intent = self.intent_names[intent_idx]
        confidence = intent_probs.max().item()
        
        route = self._route_from_intent(intent)
        
        return intent, confidence, route


# --- Initialize the classifier ---
MODEL_PATH = '/kaggle/working/bond_classifier_v3'  # Path to your trained model directory
classifier = BondClassifier(MODEL_PATH)

# --- Input your query here ---
query = input("Enter your query: ")

start_time = time.time()
intent, confidence, route = classifier.classify(query)
end_time = time.time()

print(f"Predicted Intent: {intent}")
print(f"Confidence: {confidence:.3f}")
print(f"Router decision: {route}  (bond/search/llm)")
print(f"Time taken for classification: {end_time - start_time:.4f} seconds")

Enter your query:  which stocks should i buy


Predicted Intent: buy_recommendation
Confidence: 0.595
Router decision: bond  (bond/search/llm)
Time taken for classification: 0.0141 seconds


In [30]:
import os
import json
from pathlib import Path
from collections import Counter
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, get_cosine_schedule_with_warmup
from google import genai

# ------------------ EDGE-STAGE CONFIG ------------------ #
EDGE_CONFIG = {
    "train_path": "/kaggle/input/bonds-query-classifier-finetuning-slm-dataset/train_extended.jsonl",
    "val_path":   "/kaggle/input/bonds-query-classifier-finetuning-slm-dataset/val_extended.jsonl",
    "test_path":  "/kaggle/input/bonds-query-classifier-finetuning-slm-dataset/test_extended.jsonl",

    # Stage-1 model dir (where pytorch_model.bin + tokenizer are)
    "base_model_dir": "/kaggle/working/bond_classifier_v3",

    # New dir for the edge-tuned model
    "output_dir": "/kaggle/working/bond_classifier_v3_edge",

    # Training hyperparams for stage-2
    "batch_size": 32,
    "num_epochs": 3,          # small number, this is just refinement
    "learning_rate": 1e-5,    # a bit lower for continued fine-tuning
    "warmup_ratio": 0.1,
    "max_length": 128,

    # Gemini edge-case generation
    "edge_model_name": "gemini-2.0-flash",
    "edge_samples_per_intent": 200,   # per non-bond intent
    "max_per_call": 40,
    "seed": 42,
}

# Make sure output dir exists
Path(EDGE_CONFIG["output_dir"]).mkdir(parents=True, exist_ok=True)

# Re-seed
random.seed(EDGE_CONFIG["seed"])
torch.manual_seed(EDGE_CONFIG["seed"])
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(EDGE_CONFIG["seed"])

def load_jsonl(path: str | Path):
    path = Path(path)
    data = []
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError:
                continue
    return data

In [31]:
class NonBondEquityEdgeGenerator:
    """
    Generate 'hard' non-bond queries that mention stocks / equities / cashflow / P&L,
    explicitly labeled as:
      - non_bond_search  (needs web / retrieval)
      - non_bond_llm     (needs reasoning / drafting / explanation)
    Schema matches the extended dataset (sectors = [], rating = null, etc.)
    """

    def __init__(self, model_name: str, samples_per_intent: int, max_per_call: int = 40):
        api_key = os.environ.get("GEMINI_API_KEY")
        if not api_key:
            raise RuntimeError("GEMINI_API_KEY not set in environment.")
        self.client = genai.Client(api_key=api_key)

        self.model_name = model_name
        self.samples_per_intent = samples_per_intent
        self.max_per_call = max_per_call

        # Things that should be present (equity world)
        self.equity_keywords = [
            "stock", "stocks", "equity", "equities", "share", "shares",
            "cash flow", "cashflow", "income statement", "balance sheet",
            "p&l", "profit and loss", "eps", "earnings", "dividend",
            "ipo", "market cap", "price to earnings", "p/e"
        ]
        # Things that MUST NOT appear (bond / fixed-income terms)
        self.forbidden_keywords = [
            "bond", "bonds", "debenture", "g-sec", "gsec",
            "coupon", "fixed income", "yield curve", "sovereign bond"
        ]

        self.intent_descriptions = {
            "non_bond_search": (
                "Queries about stocks/equities/company financials that clearly ask to "
                "search / fetch external information (news, filings, PDFs, data)."
            ),
            "non_bond_llm": (
                "Queries about stocks/equities/company financials that are best answered "
                "by reasoning / explanation / summarization / drafting, without explicit search."
            ),
        }

    def generate_dataset(self):
        all_rows = []
        print("=" * 60)
        print("GENERATING EQUITY EDGE-CASE NON-BOND DATA (Gemini)")
        print("=" * 60)

        for intent_name in ["non_bond_search", "non_bond_llm"]:
            needed = self.samples_per_intent
            print(f"\n→ Intent: {intent_name}  target={needed}")
            while needed > 0:
                batch_size = min(self.max_per_call, needed * 2)  # a bit extra to survive filtering
                batch = self._generate_batch_for_intent(intent_name, batch_size)
                all_rows.extend(batch)
                needed -= len(batch)
                print(f"  Collected {self.samples_per_intent - needed}/{self.samples_per_intent} for {intent_name}")

        print(f"\n✓ Total equity edge-case rows: {len(all_rows)}")
        print("Label distribution:", Counter(r["intent"] for r in all_rows))
        return all_rows

    def _generate_batch_for_intent(self, intent_name: str, batch_size: int):
        description = self.intent_descriptions[intent_name]

        system_msg = (
            "You are generating synthetic training data for an intent classifier. "
            "Create realistic user queries that are about STOCKS / EQUITIES / COMPANY FINANCIALS, "
            "and NOT about bonds or fixed income. Output JSON Lines."
        )

        user_msg = f"""
Generate {batch_size} diverse user queries whose PRIMARY intent is: "{intent_name}".

Intent description:
{description}

Hard constraints:
- Queries MUST be about stocks, equities, company fundamentals, earnings, P&L, cash flow,
  financial statements, valuations, etc.
- Queries MUST NOT be about bonds, debentures, sovereign debt, coupons, G-Secs or fixed income.
- Make some explicitly data-fetch/search-like, others explanation/analysis-like as appropriate.

For "non_bond_search":
- The user should clearly request search / fetch / lookup, e.g. "search", "find", "look up",
  "download", "get me the latest ...".

For "non_bond_llm":
- The user should clearly ask for explanation / comparison / summarization / drafting / brainstorming
  where an LLM can answer from knowledge & reasoning (without explicit web search being required).

Return EXACTLY one JSON object per line (JSONL), no array, no commentary.

Each JSON object MUST have:
- "text": string, the user query
- "intent": string, MUST be exactly "{intent_name}"
- "sectors": empty array []
- "rating": null
- "duration": "medium"
- "constraints": object with:
  {{
    "preserve_yield": false,
    "maintain_liquidity": false,
    "avoid_downgrades": false,
    "sector_diversity": false,
    "rating_above_aa": false
  }}
"""

        full_prompt = system_msg + "\n\n" + user_msg
        resp = self.client.models.generate_content(
            model=self.model_name,
            contents=full_prompt,
        )
        raw = resp.text or ""
        return self._parse_jsonl(raw, intent_name)

    def _parse_jsonl(self, raw: str, intent_name: str):
        rows = []
        for line in raw.splitlines():
            line = line.strip()
            if not line or line.startswith("```"):
                continue
            if line.startswith("-"):
                line = line.lstrip("-").strip()
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue
            sample = self._normalize_record(obj, intent_name)
            if sample is not None:
                rows.append(sample)
        return rows

    def _normalize_record(self, obj, intent_name: str):
        text = str(obj.get("text", "")).strip()
        if not text:
            return None

        lower = text.lower()

        # Must contain at least one equity keyword
        if not any(kw in lower for kw in self.equity_keywords):
            return None

        # Must NOT contain any bond keyword
        if any(kw in lower for kw in self.forbidden_keywords):
            return None

        # Force schema
        sample = {
            "text": text,
            "intent": intent_name,
            "sectors": [],
            "rating": None,
            "duration": "medium",
            "constraints": {
                "preserve_yield": False,
                "maintain_liquidity": False,
                "avoid_downgrades": False,
                "sector_diversity": False,
                "rating_above_aa": False,
            },
        }
        return sample

In [35]:
edge_gen = NonBondEquityEdgeGenerator(
    model_name=EDGE_CONFIG["edge_model_name"],
    samples_per_intent=EDGE_CONFIG["edge_samples_per_intent"],
    max_per_call=EDGE_CONFIG["max_per_call"],
)

equity_edge_cases = edge_gen.generate_dataset()

# (Optional) Save for inspection
edge_path = Path("/kaggle/working/equity_edge_cases.jsonl")
with edge_path.open("w", encoding="utf-8") as f:
    for r in equity_edge_cases:
        f.write(json.dumps(r, ensure_ascii=False) + "\n")

print(f"Saved equity edge cases to {edge_path}")

GENERATING EQUITY EDGE-CASE NON-BOND DATA (Gemini)

→ Intent: non_bond_search  target=200
  Collected 17/200 for non_bond_search
  Collected 34/200 for non_bond_search
  Collected 53/200 for non_bond_search
  Collected 77/200 for non_bond_search
  Collected 94/200 for non_bond_search
  Collected 115/200 for non_bond_search
  Collected 135/200 for non_bond_search
  Collected 155/200 for non_bond_search
  Collected 176/200 for non_bond_search
  Collected 205/200 for non_bond_search

→ Intent: non_bond_llm  target=200
  Collected 34/200 for non_bond_llm
  Collected 58/200 for non_bond_llm
  Collected 94/200 for non_bond_llm
  Collected 126/200 for non_bond_llm
  Collected 161/200 for non_bond_llm
  Collected 196/200 for non_bond_llm
  Collected 204/200 for non_bond_llm

✓ Total equity edge-case rows: 409
Label distribution: Counter({'non_bond_search': 205, 'non_bond_llm': 204})
Saved equity edge cases to /kaggle/working/equity_edge_cases.jsonl


In [36]:
# Must match training + inference order
INTENT_LABELS = [
    "buy_recommendation",
    "sell_recommendation",
    "portfolio_analysis",
    "reduce_duration",
    "increase_yield",
    "hedge_volatility",
    "sector_rebalance",
    "barbell_strategy",
    "switch_bonds",
    "explain_recommendation",
    "market_outlook",
    "credit_analysis",
    "forecast_prices",
    "non_bond_search",
    "non_bond_llm",
]

SECTOR_TO_ID = {
    "Sovereign": 0,
    "PSU Energy": 1,
    "Financial": 2,
    "Corporate": 3,
    "Infrastructure": 4,
    "NBFC": 5,
    "Banking": 6,
}

RATING_TO_ID = {
    "AAA": 0,
    "AA+": 1,
    "AA": 2,
    "A+": 3,
    "A": 4,
    "BBB": 5,
    # 6 = "other"/None
}

DURATION_TO_ID = {
    "short": 0,
    "medium": 1,
    "long": 2,
}

class ExtendedBondQueryDataset(Dataset):
    def __init__(self, records, tokenizer, max_length: int = 128):
        self.records = records
        self.tokenizer = tokenizer
        self.max_length = max_length

        self.intent_to_id = {name: i for i, name in enumerate(INTENT_LABELS)}

    def __len__(self):
        return len(self.records)

    def __getitem__(self, idx):
        sample = self.records[idx]

        text = sample["text"]
        enc = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        intent = sample["intent"]
        intent_id = self.intent_to_id[intent]

        # sectors (multi-label)
        sectors = sample.get("sectors", []) or []
        sector_labels = torch.zeros(len(SECTOR_TO_ID))
        for s in sectors:
            if s in SECTOR_TO_ID:
                sector_labels[SECTOR_TO_ID[s]] = 1.0

        # rating (single class)
        rating = sample.get("rating")
        rating_id = RATING_TO_ID.get(rating, 6)  # 6 = "other"/None

        # duration
        duration = str(sample.get("duration", "medium")).lower()
        duration_id = DURATION_TO_ID.get(duration, 1)

        # constraints
        constraints = sample.get("constraints", {}) or {}
        constraint_labels = torch.tensor([
            float(constraints.get("preserve_yield", False)),
            float(constraints.get("maintain_liquidity", False)),
            float(constraints.get("avoid_downgrades", False)),
            float(constraints.get("sector_diversity", False)),
            float(constraints.get("rating_above_aa", False)),
        ])

        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "intent_label": torch.tensor(intent_id, dtype=torch.long),
            "sector_labels": sector_labels,
            "rating_label": torch.tensor(rating_id, dtype=torch.long),
            "duration_label": torch.tensor(duration_id, dtype=torch.long),
            "constraint_labels": constraint_labels,
        }

In [37]:
def continue_finetune_with_equity_edges():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device:", device)

    # ---- 1) Load base splits ----
    train_records = load_jsonl(EDGE_CONFIG["train_path"])
    val_records   = load_jsonl(EDGE_CONFIG["val_path"])
    test_records  = load_jsonl(EDGE_CONFIG["test_path"])

    print(f"Base train size: {len(train_records)}")
    print(f"Base val size:   {len(val_records)}")
    print(f"Base test size:  {len(test_records)}")

    # ---- 2) Generate & append equity edge-cases (already created above) ----
    global equity_edge_cases
    if "equity_edge_cases" not in globals():
        edge_gen = NonBondEquityEdgeGenerator(
            model_name=EDGE_CONFIG["edge_model_name"],
            samples_per_intent=EDGE_CONFIG["edge_samples_per_intent"],
            max_per_call=EDGE_CONFIG["max_per_call"],
        )
        equity_edge_cases = edge_gen.generate_dataset()

    print(f"Appending {len(equity_edge_cases)} equity edge-case samples to TRAIN only.")
    train_records_extended = train_records + equity_edge_cases

    # ---- 3) Tokenizer ----
    # Use tokenizer saved from stage-1 (or base model)
    if Path(EDGE_CONFIG["base_model_dir"]).exists():
        tokenizer = AutoTokenizer.from_pretrained(EDGE_CONFIG["base_model_dir"])
    else:
        tokenizer = AutoTokenizer.from_pretrained(CONFIG["base_model"])

    # ---- 4) Datasets & loaders ----
    train_dataset = ExtendedBondQueryDataset(
        train_records_extended, tokenizer, EDGE_CONFIG["max_length"]
    )
    val_dataset   = ExtendedBondQueryDataset(
        val_records, tokenizer, EDGE_CONFIG["max_length"]
    )
    test_dataset  = ExtendedBondQueryDataset(
        test_records, tokenizer, EDGE_CONFIG["max_length"]
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size=EDGE_CONFIG["batch_size"],
        shuffle=True,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=EDGE_CONFIG["batch_size"],
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=EDGE_CONFIG["batch_size"],
    )

    # ---- 5) Load model from previous checkpoint ----
    from pathlib import Path as _Path

    base_ckpt = _Path(EDGE_CONFIG["base_model_dir"]) / "pytorch_model.bin"
    if not base_ckpt.exists():
        raise FileNotFoundError(f"Base checkpoint not found: {base_ckpt}")

    model = ProductionBondClassifier(base_model=CONFIG["base_model"])
    state_dict = torch.load(base_ckpt, map_location=device)
    model.load_state_dict(state_dict)
    model.to(device)

    print(f"Loaded base model from {base_ckpt}")

    # ---- 6) Optimizer & scheduler ----
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=EDGE_CONFIG["learning_rate"],
        weight_decay=0.01,
    )

    num_training_steps = len(train_loader) * EDGE_CONFIG["num_epochs"]
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(num_training_steps * EDGE_CONFIG["warmup_ratio"]),
        num_training_steps=num_training_steps,
    )

    criterion = MultiTaskLoss()

    # ---- 7) Trainer (reuse your existing Trainer class) ----
    trainer = Trainer(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        scheduler=scheduler,
        criterion=criterion,
        device=device,
        output_dir=EDGE_CONFIG["output_dir"],
    )

    trainer.train(EDGE_CONFIG["num_epochs"])

    # ---- 8) Final test evaluation ----
    print("=" * 60)
    print("FINAL TEST (after equity edge-case fine-tuning)")
    print("=" * 60)

    model.eval()
    all_preds, all_labels = [], []

    from sklearn.metrics import accuracy_score, f1_score
    from tqdm.auto import tqdm as _tqdm

    with torch.no_grad():
        for batch in _tqdm(test_loader, desc="Testing"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = outputs["intent_logits"].argmax(dim=-1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch["intent_label"].numpy())

    test_acc = accuracy_score(all_labels, all_preds)
    test_f1 = f1_score(all_labels, all_preds, average="macro")

    print(f"\n✓ Test Accuracy: {test_acc:.4f}")
    print(f"✓ Test F1 Macro: {test_f1:.4f}\n")

    # ---- 9) Save edge-tuned checkpoint ----
    edge_ckpt = Path(EDGE_CONFIG["output_dir"]) / "pytorch_model.bin"
    edge_ckpt_edge_name = Path(EDGE_CONFIG["output_dir"]) / "pytorch_model_edge.bin"

    if edge_ckpt.exists():
        import shutil
        shutil.copy(edge_ckpt, edge_ckpt_edge_name)
        print(f"✓ Edge-tuned model saved to: {edge_ckpt}")
        print(f"✓ Also copied to: {edge_ckpt_edge_name}")
    else:
        print("⚠ Trainer did not save a checkpoint named 'pytorch_model.bin'?! Check Trainer.save_checkpoint logic.")


In [38]:
continue_finetune_with_equity_edges()

Device: cuda
Base train size: 5749
Base val size:   1232
Base test size:  1232
Appending 409 equity edge-case samples to TRAIN only.
Loaded base model from /kaggle/working/bond_classifier_v3/pytorch_model.bin
TRAINING


Epoch 1:   0%|          | 0/193 [00:00<?, ?it/s]


Epoch 1/3 - Train Loss: 0.4444


Evaluating:   0%|          | 0/39 [00:00<?, ?it/s]

Val Loss: 0.4198
Val Accuracy: 0.9943
Val F1: 0.9940

✓ New best model saved (acc: 0.9943)


Epoch 2:   0%|          | 0/193 [00:00<?, ?it/s]


Epoch 2/3 - Train Loss: 0.4035


Evaluating:   0%|          | 0/39 [00:00<?, ?it/s]

Val Loss: 0.3909
Val Accuracy: 0.9951
Val F1: 0.9948

✓ New best model saved (acc: 0.9951)


Epoch 3:   0%|          | 0/193 [00:00<?, ?it/s]


Epoch 3/3 - Train Loss: 0.3849


Evaluating:   0%|          | 0/39 [00:00<?, ?it/s]

Val Loss: 0.3850
Val Accuracy: 0.9943
Val F1: 0.9940

Training complete! Best accuracy: 0.9951

FINAL TEST (after equity edge-case fine-tuning)


Testing:   0%|          | 0/39 [00:00<?, ?it/s]


✓ Test Accuracy: 0.9943
✓ Test F1 Macro: 0.9940

✓ Edge-tuned model saved to: /kaggle/working/bond_classifier_v3_edge/pytorch_model.bin
✓ Also copied to: /kaggle/working/bond_classifier_v3_edge/pytorch_model_edge.bin


In [43]:
import os
import torch
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F
import time
import torch.nn as nn

# ---------------------------------------------------------
# CONFIG: choose which model to load
# ---------------------------------------------------------
USE_EDGE_MODEL = True  # True = 15-intent edge model, False = original 13-intent model

BASE_MODEL_DIR = "/kaggle/working/bond_classifier_v3"
EDGE_MODEL_DIR = "/kaggle/working/bond_classifier_v3_edge"
BASE_MODEL_NAME = "microsoft/deberta-v3-small"
CHECKPOINT_NAME = "pytorch_model.bin"  # both training scripts save under this name


# ---------------------------------------------------------
# MODEL ARCHITECTURE (shared, but num_intents is configurable)
# ---------------------------------------------------------
class ProductionBondClassifier(nn.Module):
    """
    Same backbone as training (DeBERTa-v3-small + multi-task heads),
    but with configurable number of intent classes so we can support:
      - 13 intents  (original model)
      - 15 intents  (edge model: + non_bond_search, non_bond_llm)
    """
    def __init__(
        self,
        base_model: str = BASE_MODEL_NAME,
        dropout: float = 0.15,
        num_intents: int = 13,
    ):
        super().__init__()
        
        self.bert = AutoModel.from_pretrained(base_model)
        hidden_size = self.bert.config.hidden_size
        
        self.feature_layer = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.Dropout(dropout),
            nn.GELU(),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.LayerNorm(hidden_size // 2),
            nn.Dropout(dropout),
            nn.GELU()
        )
        
        feature_size = hidden_size // 2
        
        # Heads: intent + sectors + rating + duration + constraints
        self.intent_head = nn.Linear(feature_size, num_intents)
        self.sector_head = nn.Linear(feature_size, 7)
        self.rating_head = nn.Linear(feature_size, 7)
        self.duration_head = nn.Linear(feature_size, 3)
        self.constraint_head = nn.Linear(feature_size, 5)
        
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]
        features = self.feature_layer(cls_output)
        features = self.dropout(features)
        
        return {
            'intent_logits': self.intent_head(features),
            'sector_logits': self.sector_head(features),
            'rating_logits': self.rating_head(features),
            'duration_logits': self.duration_head(features),
            'constraint_logits': self.constraint_head(features),
        }


# ---------------------------------------------------------
# WRAPPER CLASS
# ---------------------------------------------------------
class BondClassifier:
    def __init__(self, use_edge_model: bool = True, base_model: str = BASE_MODEL_NAME):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.use_edge_model = use_edge_model
        
        # Choose which directory & label set to use
        if self.use_edge_model:
            self.model_path = EDGE_MODEL_DIR
            self.num_intents = 15
            # 13 bond intents + 2 non-bond intents
            self.intent_names = [
                'buy_recommendation',     # 0
                'sell_recommendation',    # 1
                'portfolio_analysis',     # 2
                'reduce_duration',        # 3
                'increase_yield',         # 4
                'hedge_volatility',       # 5
                'sector_rebalance',       # 6
                'barbell_strategy',       # 7
                'switch_bonds',           # 8
                'explain_recommendation', # 9
                'market_outlook',         # 10
                'credit_analysis',        # 11
                'forecast_prices',        # 12
                'non_bond_search',        # 13
                'non_bond_llm',           # 14
            ]
        else:
            self.model_path = BASE_MODEL_DIR
            self.num_intents = 13
            # Original 13 bond intents
            self.intent_names = [
                'buy_recommendation',     # 0
                'sell_recommendation',    # 1
                'portfolio_analysis',     # 2
                'reduce_duration',        # 3
                'increase_yield',         # 4
                'hedge_volatility',       # 5
                'sector_rebalance',       # 6
                'barbell_strategy',       # 7
                'switch_bonds',           # 8
                'explain_recommendation', # 9
                'market_outlook',         # 10
                'credit_analysis',        # 11
                'forecast_prices',        # 12
            ]
        
        # ---- Load tokenizer ----
        # In training you used AutoTokenizer.from_pretrained(CONFIG['base_model']),
        # so for inference we can safely do the same:
        self.tokenizer = AutoTokenizer.from_pretrained(base_model)
        
        # ---- Load model + weights ----
        self.model = ProductionBondClassifier(
            base_model=base_model,
            num_intents=self.num_intents,
        )
        ckpt_path = os.path.join(self.model_path, CHECKPOINT_NAME)
        if not os.path.exists(ckpt_path):
            raise FileNotFoundError(f"Checkpoint not found at: {ckpt_path}")
        
        state_dict = torch.load(ckpt_path, map_location=self.device)
        self.model.load_state_dict(state_dict)
        self.model.to(self.device)
        self.model.eval()
    
    def _route_from_intent(self, intent: str) -> str:
        """
        Simple router decision:
        - 'non_bond_search' -> 'search'
        - 'non_bond_llm'    -> 'llm'
        - everything else   -> 'bond'
        
        For the original 13-intent model, you will never see non_bond_*,
        so route will always be 'bond'.
        """
        if intent == 'non_bond_search':
            return 'search'
        elif intent == 'non_bond_llm':
            return 'llm'
        else:
            return 'bond'
    
    def classify(self, query: str):
        # Tokenize
        enc = self.tokenizer(
            query,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=128
        )
        input_ids = enc["input_ids"].to(self.device)
        attention_mask = enc["attention_mask"].to(self.device)
        
        # Run model
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            intent_logits = outputs['intent_logits']
            intent_probs = F.softmax(intent_logits, dim=-1)
            
        # Get the predicted intent
        intent_idx = intent_probs.argmax(dim=-1).item()
        intent = self.intent_names[intent_idx]
        confidence = intent_probs.max().item()
        
        route = self._route_from_intent(intent)
        
        return intent, confidence, route


# ---------------------------------------------------------
# USAGE
# ---------------------------------------------------------
if __name__ == "__main__":
    # Toggle USE_EDGE_MODEL at the top to switch between:
    # - original 13-intent model
    # - 15-intent edge model with non-bond routing
    classifier = BondClassifier(use_edge_model=USE_EDGE_MODEL)
    
    query = input("Enter your query: ")
    
    start_time = time.time()
    intent, confidence, route = classifier.classify(query)
    end_time = time.time()
    
    print(f"\nModel: {'EDGE (15 intents)' if USE_EDGE_MODEL else 'BASE (13 intents)'}")
    print(f"Predicted Intent: {intent}")
    print(f"Confidence: {confidence:.3f}")
    print(f"Router decision: {route}  (bond/search/llm)")
    print(f"Time taken for classification: {end_time - start_time:.4f} seconds")




Enter your query:  which stocks should i buy



Model: EDGE (15 intents)
Predicted Intent: buy_recommendation
Confidence: 0.428
Router decision: bond  (bond/search/llm)
Time taken for classification: 0.0145 seconds


In [None]:
import os
import json
from pathlib import Path
from typing import List, Dict, Any, Optional
from collections import Counter

from google import genai
from tqdm.auto import tqdm

# ============================================================
# GENERAL NON-BOND EDGE CASE CONFIG
# ============================================================
GENERAL_NON_BOND_CONFIG = {
    "model": "gemini-2.0-flash",
    "samples_per_intent": 500,      # 500 non_bond_search + 500 non_bond_llm = 1000 rows
    "max_per_call": 50,             # 50 per request -> 10 calls per intent -> 20 calls total
    "output_path": Path("/kaggle/working/general_non_bond_edge_cases.jsonl"),
}

assert os.environ.get("GEMINI_API_KEY"), "Set GEMINI_API_KEY in Kaggle before running this cell."


# ============================================================
# GENERAL NON-BOND EDGE GENERATOR
# ============================================================

class GeneralNonBondEdgeGenerator:
    """
    Generate HARD / GENERAL non-bond edge cases, complementary to your existing
    equity_edge_cases.jsonl and extended bond datasets.

    Intents:
      - non_bond_search : clearly needs external data / retrieval
      - non_bond_llm    : primarily needs LLM reasoning / generation
    """

    def __init__(self, model_name: str, samples_per_intent: int, max_per_call: int = 50):
        self.client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])
        self.model_name = model_name
        self.samples_per_intent = samples_per_intent
        self.max_per_call = max_per_call

    # ---------------- CORE PUBLIC API ----------------

    def generate_dataset(self) -> List[Dict[str, Any]]:
        print("=" * 60)
        print("GENERATING GENERAL NON-BOND EDGE CASE DATA (Gemini)")
        print("=" * 60)

        all_rows: List[Dict[str, Any]] = []

        # 1) non_bond_search
        print("\n→ Generating for intent: non_bond_search")
        all_rows.extend(
            self._generate_for_intent(
                intent_name="non_bond_search",
                total_needed=self.samples_per_intent,
            )
        )

        # 2) non_bond_llm
        print("\n→ Generating for intent: non_bond_llm")
        all_rows.extend(
            self._generate_for_intent(
                intent_name="non_bond_llm",
                total_needed=self.samples_per_intent,
            )
        )

        print(f"\n✓ Total general non-bond rows: {len(all_rows)}")
        return all_rows

    # ---------------- PROMPT BUILDING ----------------

    def _general_edge_scenarios_block(self) -> str:
        """
        Big-ass description of all the non-bond edge-case themes.
        We DELIBERATELY de-emphasize pure bond stuff, and also avoid
        duplicating only equity cases (you already have those).
        """
        return """
You are generating HARD / GENERAL non-bond edge cases for a router in a bond assistant.

The model must learn that the following are NOT bond-specific tasks, even if they mention
rates, interest, duration, yield, or "bond" in a non-finance sense.

Spread examples across these themes (roughly evenly):

1) Banking & Retail Credit (non-bond)
   - Home loans, car loans, personal loans, credit cards, overdrafts, BNPL, EMIs,
     education loans, loan refinancing, etc.
   - Example: "Should I prepay my home loan or invest extra money in an index fund?"
   - Example: "Compare SBI and HDFC credit card reward programs for frequent flyers"
   - Example: "Refinance my 12% car loan to 9%, is it worth it?"

2) Deposits & Cash Management
   - Savings accounts, fixed deposits (FDs), recurring deposits (RDs), liquid funds,
     sweep accounts, money parked in current accounts.
   - Example: "Find top 5 bank FDs above 7.5% for 2 years"
   - Example: "Is a liquid fund better than savings account for my emergency corpus?"

3) Personal Finance & Planning
   - Budgeting, emergency funds, retirement planning (EPF, NPS, 401k equivalents),
     insurance (life, health, term, ULIPs), tax saving choices.
   - Example: "How much emergency fund should I keep as a salaried employee in India?"
   - Example: "Compare ELSS vs PPF vs NPS for tax-saving and long-term returns"
   - Example: "What health insurance riders should I consider for my parents?"

4) Corporate Finance & Business Decisions (non-bond focus)
   - CAPEX decisions, payback, NPV, IRR, working capital management, pricing, unit economics,
     project appraisal, capital structure with minimal bond focus.
   - Example: "Calculate NPV and IRR for this factory project with given cashflows"
   - Example: "Explain working capital cycle for an FMCG distributor"
   - Example: "How does issuing ESOPs dilute existing shareholders?"

5) Accounting & Reporting
   - Income statement, balance sheet, cashflow statement, ratios, audit issues,
     revenue recognition, lease accounting, provisions.
   - Example: "Why can net profit be positive while operating cashflow is negative?"
   - Example: "Explain difference between EBITDA, EBIT, and EBT using examples"
   - Example: "How are leases treated under Ind AS / IFRS 16?"

6) Macroeconomics, FX, Commodities, Crypto
   - Inflation, GDP, currency exchange, oil prices, gold, crypto, but not bond portfolios
     as the main topic.
   - Example: "How does rupee depreciation impact import-heavy companies?"
   - Example: "Explain halvings in Bitcoin and their historical impact on price"
   - Example: "What happens to emerging market currencies when US Fed hikes rates?"

7) General ChatGPT-style Non-Finance Tasks
   - Coding, data science, math, essays, productivity, career advice, travel itineraries,
     email drafting, etc.
   - Example: "Write a Python script to compute XIRR for irregular cashflows"
   - Example: "Plan a 4-day trip in Kerala with a budget of 25k"
   - Example: "Draft a professional email to negotiate a higher salary offer"

8) Non-Financial Uses of the Word "bond"
   - Chemical bonds, social bonds, emotional bonding, James Bond, adhesives, materials.
   - Example: "Explain covalent vs ionic bonds with daily life analogies"
   - Example: "Rank James Bond actors by critical acclaim"
   - Example: "How to build a stronger emotional bond with my team at work?"

9) Mixed / Ambiguous Queries where bonds are NOT the primary focus
   - Multi-asset portfolios where equities, gold, real estate dominate and bonds are
     a side note.
   - Example: "I have 60% in equity mutual funds, 20% in gold, 10% in FDs, 10% in RBI bonds,
               is my allocation too risky?"
   - Example: "Design a barbell between growth stocks and high-quality FDs"
   - Here, ensure the main question is broader than just 'optimize bonds'.

CRITICAL RULE:
- The PRIMARY topic must NOT be a pure fixed-income bond portfolio task.
- It's okay if the user mentions 'interest rate', 'yield', or even 'bond' casually,
  but the assistant should NOT treat this as a core bond-investing query.
"""

    def _intent_hint_block(self, intent_name: str) -> str:
        if intent_name == "non_bond_search":
            return """
For this batch, the \"intent\" is \"non_bond_search\".

queries MUST:
- clearly ask to LOOK UP or FETCH external information:
  * live or historical prices, charts, rates
  * bank product details, loan offers, FD rates
  * PDFs, statements, filings, FAQs, support pages, regulations
  * lists and screeners (e.g., top funds, best FDs above X%, etc.)
- Use verbs like: "find", "search", "look up", "show me", "get", "download", "list",
  "screen for", "pull data for", "fetch".
"""
        else:
            return """
For this batch, the \"intent\" is \"non_bond_llm\".

queries MUST:
- primarily ask for REASONING, EXPLANATION, or GENERATION, not raw data fetch:
  * explain, summarize, compare, analyze, interpret, critique
  * calculate NPV/IRR on given numbers
  * draft / rewrite emails or documents
  * provide frameworks, pros/cons, structured advice
- Use verbs like: "explain", "summarize", "compare", "analyze", "walk me through",
  "draft", "rewrite", "brainstorm", "advise", "help me understand".
"""

    # ---------------- GENERATION LOOPS ----------------

    def _generate_for_intent(self, intent_name: str, total_needed: int) -> List[Dict[str, Any]]:
        rows: List[Dict[str, Any]] = []
        remaining = total_needed

        print(f"  Target samples for {intent_name}: {total_needed}")
        pbar = tqdm(total=total_needed, desc=f"Generating {intent_name}")

        while remaining > 0:
            batch_size = min(self.max_per_call, remaining)
            batch = self._generate_batch_for_intent(intent_name, batch_size)
            rows.extend(batch)
            got = len(batch)
            remaining -= got
            pbar.update(got)

        pbar.close()
        print(f"  ✓ Collected {len(rows)} samples for {intent_name}")
        return rows

    def _generate_batch_for_intent(self, intent_name: str, batch_size: int) -> List[Dict[str, Any]]:
        system_msg = (
            "You are generating synthetic training data for an intent classifier used "
            "by a bond-investing assistant.\n"
            "The goal is to produce non-bond edge cases that should be routed AWAY from "
            "the bond-specific engine and towards either: search or a general LLM."
        )

        user_msg = f"""
Generate {batch_size} diverse user queries whose PRIMARY intent is: "{intent_name}".

{self._intent_hint_block(intent_name)}

{self._general_edge_scenarios_block()}

Global constraints:
- HARD RULE: Do NOT make the main task "optimize a bond portfolio", "find bonds", or
  "recommend which bond to buy/sell". Those are handled elsewhere.
- It's okay to mention bonds in passing, but the core question must be about:
  banking, loans, deposits, accounting, macro, personal finance, generic tasks, or
  non-financial 'bond'.
- Vary user persona: retail, HNI, student, small business owner, CFO, startup founder,
  non-finance person, etc.
- Vary style and length: short prompts (4–8 words) and longer 1–3 sentence prompts.

Return the output STRICTLY in JSON Lines format:
- One JSON object per line
- No surrounding array
- No backticks or code fences
- No commentary

Each JSON object MUST have exactly these keys:
- "text": string, the user query
- "intent": string, MUST be exactly "{intent_name}"
- "sectors": array of strings, MUST be an empty array [] for these non-bond queries
- "rating": null
- "duration": string, ALWAYS "medium" for these non-bond queries
- "constraints": object with boolean fields, ALL false:
  {{
    "preserve_yield": false,
    "maintain_liquidity": false,
    "avoid_downgrades": false,
    "sector_diversity": false,
    "rating_above_aa": false
  }}
"""

        full_prompt = system_msg + "\n\n" + user_msg

        resp = self.client.models.generate_content(
            model=self.model_name,
            contents=full_prompt,
        )

        raw = resp.text or ""
        return self._parse_jsonl(raw, intent_name)

    # ---------------- PARSING & NORMALIZATION ----------------

    def _parse_jsonl(self, raw: str, intent_name: str) -> List[Dict[str, Any]]:
        rows: List[Dict[str, Any]] = []
        for line in raw.splitlines():
            line = line.strip()
            if not line or line.startswith("```"):
                continue
            if line.startswith("-"):
                line = line.lstrip("-").strip()
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue
            sample = self._normalize_record(obj, intent_name)
            if sample is not None:
                rows.append(sample)
        return rows

    def _normalize_record(self, obj: Dict[str, Any], intent_name: str) -> Optional[Dict[str, Any]]:
        text = str(obj.get("text", "")).strip()
        if not text:
            return None

        # We override everything to match your schema exactly
        sample = {
            "text": text,
            "intent": intent_name,
            "sectors": [],          # always empty (non-bond)
            "rating": None,         # always null
            "duration": "medium",   # fixed
            "constraints": {
                "preserve_yield": False,
                "maintain_liquidity": False,
                "avoid_downgrades": False,
                "sector_diversity": False,
                "rating_above_aa": False,
            },
        }
        return sample


# ============================================================
# RUN GENERATION + SAVE
# ============================================================

gen_nonbond = GeneralNonBondEdgeGenerator(
    model_name=GENERAL_NON_BOND_CONFIG["model"],
    samples_per_intent=GENERAL_NON_BOND_CONFIG["samples_per_intent"],
    max_per_call=GENERAL_NON_BOND_CONFIG["max_per_call"],
)

general_edge_data = gen_nonbond.generate_dataset()

print("\nLabel counts in general non-bond edge dataset:")
print(Counter(row["intent"] for row in general_edge_data))

out_path = GENERAL_NON_BOND_CONFIG["output_path"]
out_path.parent.mkdir(parents=True, exist_ok=True)

with out_path.open("w", encoding="utf-8") as f:
    for row in general_edge_data:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

print(f"\n✓ Saved general non-bond edge dataset to: {out_path}")
print(f"  Total rows: {len(general_edge_data)}")


GENERATING GENERAL NON-BOND EDGE CASE DATA (Gemini)

→ Generating for intent: non_bond_search
  Target samples for non_bond_search: 500


Generating non_bond_search:   0%|          | 0/500 [00:00<?, ?it/s]

  ✓ Collected 500 samples for non_bond_search

→ Generating for intent: non_bond_llm
  Target samples for non_bond_llm: 500


Generating non_bond_llm:   0%|          | 0/500 [00:00<?, ?it/s]

In [45]:
import os
import json
from pathlib import Path
from typing import List, Dict, Any, Optional
from collections import Counter

from google import genai
from tqdm.auto import tqdm

# ============================================================
# EXTRA BOND EDGE-CASE CONFIG
# ============================================================
BOND_EDGE_CONFIG = {
    "model": "gemini-2.0-flash",
    "samples_per_intent": 180,       # per bond intent (13 intents -> 2340 rows)
    "max_per_call": 60,              # 3 calls/intent -> 39 calls total
    "output_path": Path("/kaggle/working/bond_edge_cases.jsonl"),
}

assert os.environ.get("GEMINI_API_KEY"), "Set GEMINI_API_KEY in Kaggle before running this cell."


# ============================================================
# EDGE-CASE BOND INTENT GENERATOR (GEMINI)
# ============================================================

class BondEdgeLLMDataGenerator:
    """
    Generate *extra* synthetic bond queries for all 13 bond intents.

    Goal:
    - Harder, more realistic, messy edge cases
    - Multi-constraint queries, combined objectives, follow-ups, ambiguous wording
    - Still CLEAN, labelled, and consistent with your 13-intent schema.

    Output schema (per row):
      - text: str
      - intent: one of the 13 bond intents
      - sectors: list[str] subset of
            ["Sovereign", "PSU Energy", "Financial", "Corporate",
             "Infrastructure", "NBFC", "Banking"]
      - rating: null or one of ["AAA","AA+","AA","A+","A","BBB"]
      - duration: "short" | "medium" | "long"
      - constraints: dict with 5 boolean keys
    """

    BOND_INTENT_NAMES = [
        "buy_recommendation",
        "sell_recommendation",
        "portfolio_analysis",
        "reduce_duration",
        "increase_yield",
        "hedge_volatility",
        "sector_rebalance",
        "barbell_strategy",
        "switch_bonds",
        "explain_recommendation",
        "market_outlook",
        "credit_analysis",
        "forecast_prices",
    ]

    def __init__(self, model_name: str, samples_per_intent: int, max_per_call: int = 60):
        api_key = os.environ.get("GEMINI_API_KEY")
        if not api_key:
            raise RuntimeError("GEMINI_API_KEY not set in environment")

        self.client = genai.Client(api_key=api_key)
        self.model_name = model_name
        self.samples_per_intent = samples_per_intent
        self.max_per_call = max_per_call

        self.allowed_sectors = [
            "Sovereign", "PSU Energy", "Financial",
            "Corporate", "Infrastructure", "NBFC", "Banking"
        ]
        self.allowed_ratings = ["AAA", "AA+", "AA", "A+", "A", "BBB"]
        self.allowed_durations = ["short", "medium", "long"]

        # Base descriptions (reusing your intent semantics)
        self.intent_descriptions = {
            "buy_recommendation": "User wants recommendations of which bonds to BUY, often with preferences about sector, rating, duration, yield, liquidity, or risk tolerance.",
            "sell_recommendation": "User wants to know what to SELL or whether to exit certain bonds or positions.",
            "portfolio_analysis": "User wants analysis or diagnosis of their current bond portfolio: exposures, duration, sector mix, risks, diversification.",
            "reduce_duration": "User wants to reduce INTEREST-RATE risk/duration of their bond portfolio while staying invested.",
            "increase_yield": "User wants to increase portfolio YIELD, often trading off some quality/liquidity or duration.",
            "hedge_volatility": "User wants to hedge or reduce the impact of bond price volatility or interest-rate moves.",
            "sector_rebalance": "User wants to rebalance SECTOR allocation or reduce concentration in particular sectors/issuers.",
            "barbell_strategy": "User wants a BARBELL strategy (mix of short + long duration bonds, sometimes with constraints).",
            "switch_bonds": "User wants to switch from one BOND or issuer to another similar but better bond.",
            "explain_recommendation": "User wants EXPLANATION / rationale behind some earlier recommendation or trade idea.",
            "market_outlook": "User wants OUTLOOK on bond markets, yields, spreads, central bank policy, etc.",
            "credit_analysis": "User wants analysis of CREDIT quality, default risk, downgrade risk or rating outlook.",
            "forecast_prices": "User wants explicit FORECASTS of future bond prices, yields, or total returns.",
        }

    # ---------------- PUBLIC API ----------------

    def generate_dataset(self) -> List[Dict[str, Any]]:
        print("=" * 60)
        print("GENERATING EXTRA BOND EDGE-CASE DATA (Gemini)")
        print("=" * 60)

        all_samples: List[Dict[str, Any]] = []

        for intent_name in self.BOND_INTENT_NAMES:
            print(f"\n→ Intent: {intent_name} (target {self.samples_per_intent})")
            intent_samples = self._generate_for_intent(intent_name, self.samples_per_intent)
            all_samples.extend(intent_samples)

        print(f"\n✓ Generated {len(all_samples)} extra bond edge-case samples\n")
        return all_samples

    # ---------------- PROMPT HELPERS ----------------

    def _edge_case_hint_block(self, intent_name: str) -> str:
        """
        Extra guidance per intent to push Gemini towards edge-case scenarios.
        """
        if intent_name == "buy_recommendation":
            return """
Edge-case patterns:
- Conflicting constraints: high yield AND AAA-only AND < 2-year maturity
- Liquidity vs yield trade-offs
- Tax constraints (tax-free vs taxable)
- Lot-size / ticket size constraints (e.g., minimum 10 lakh)
- Very specific sectors/issuers, concentration concerns, ESG filters
"""
        if intent_name == "sell_recommendation":
            return """
Edge-case patterns:
- Deciding whether to exit after downgrade, spread widening, or macro scare
- Partial exit vs full exit decisions
- Tax-loss harvesting reasons
- Multiple bonds in question, but primary decision around 1-2 names
"""
        if intent_name == "portfolio_analysis":
            return """
Edge-case patterns:
- Ugly portfolios: concentrated in 1–2 issuers, weird duration profiles
- Mismatch between stated risk tolerance and current holdings
- Asking for scenario analysis (rate hikes/cuts, spread shocks)
- Holdings described in text, not in perfect tabular form
"""
        if intent_name == "reduce_duration":
            return """
Edge-case patterns:
- Need to cut duration without sacrificing yield too much
- Want to maintain sector or credit quality while reducing duration
- Already barbelled portfolios that need duration trimming on one side
"""
        if intent_name == "increase_yield":
            return """
Edge-case patterns:
- User chasing yield but with specific max drawdown or rating floor
- Asking about callable/perpetual structures vs plain vanilla
- Tension between liquidity and higher yield
"""
        if intent_name == "hedge_volatility":
            return """
Edge-case patterns:
- Hedge using barbell, cash, or inverse interest-rate exposures
- Portfolio with large MTM swings after policy shocks
- Desire to hedge only a sub-portfolio or certain maturity bucket
"""
        if intent_name == "sector_rebalance":
            return """
Edge-case patterns:
- Overweight PSU Energy, NBFC, or one corporate group
- Need to bring sectors within target bands
- Sector views changing due to regulation, ESG, or macro themes
"""
        if intent_name == "barbell_strategy":
            return """
Edge-case patterns:
- Specific short/long buckets (e.g., <1 year and >10 year)
- Target average duration & yield for the overall barbell
- Barbell plus constraint on issuer or sector concentration
"""
        if intent_name == "switch_bonds":
            return """
Edge-case patterns:
- Switch from downgraded or illiquid bonds into better quality/liquidity
- Same issuer switch (old series to new series)
- Switch to capture curve rolldown or spread compression
"""
        if intent_name == "explain_recommendation":
            return """
Edge-case patterns:
- User quoting a past recommendation and asking 'why'
- Wanting explanation in simple language for senior management / board
- Disagreeing with the recommendation and asking for justification
"""
        if intent_name == "market_outlook":
            return """
Edge-case patterns:
- Multi-scenario outlooks: soft landing, hard landing, stagflation
- Views on specific parts of the curve (2Y vs 10Y)
- Outlook conditional on upcoming policy events or inflation prints
"""
        if intent_name == "credit_analysis":
            return """
Edge-case patterns:
- Complex capital structures (AT1, perpetuals, sub debt)
- Parent-subsidiary support, implicit guarantees
- Early warning signals, covenant breaches, rating watch
"""
        if intent_name == "forecast_prices":
            return """
Edge-case patterns:
- Explicit price/return forecast over 3–12 months
- Total return including carry and rolldown, not just price moves
- Asking for forecast under 2–3 different rate paths
"""
        return ""

    # ---------------- GENERATION LOOPS ----------------

    def _generate_for_intent(self, intent_name: str, total_needed: int) -> List[Dict[str, Any]]:
        samples: List[Dict[str, Any]] = []
        remaining = total_needed

        pbar = tqdm(total=total_needed, desc=f"Generating {intent_name}")
        while remaining > 0:
            batch_size = min(self.max_per_call, remaining)
            batch = self._generate_batch_for_intent(intent_name, batch_size)
            samples.extend(batch)
            got = len(batch)
            remaining -= got
            pbar.update(got)
        pbar.close()

        print(f"  ✓ Collected {len(samples)} samples for {intent_name}")
        return samples

    def _generate_batch_for_intent(self, intent_name: str, batch_size: int) -> List[Dict[str, Any]]:
        base_desc = self.intent_descriptions[intent_name]
        edge_hint = self._edge_case_hint_block(intent_name)

        system_msg = (
            "You are generating synthetic TRAINING DATA for a supervised bond-intent classifier.\n"
            "You MUST produce realistic, messy, institutional-grade bond queries.\n"
            "The examples should be HARDER / EDGE-CASE variants, not simple textbook prompts."
        )

        user_msg = f"""
Generate {batch_size} diverse user queries whose PRIMARY intent is exactly: "{intent_name}".

Intent semantics:
{base_desc}

Edge-case guidance:
{edge_hint}

Global rules:
- The queries MUST BE about bonds / fixed-income investing, portfolios, or markets.
- Do NOT generate generic equity-only or generic personal-finance prompts here.
- Explicitly use bond jargon: spreads, duration, YTM, callable, perpetual, roll-down, MTM, etc. where natural.
- Vary:
  * investor type (retail, HNI, treasury, CIO, risk officer, family office),
  * tone (short commands, long descriptive questions, follow-ups),
  * complexity (single objective vs multi-constraint trade-offs).

Labeling rules:
- The PRIMARY task must match the intent "{intent_name}".
- Even if the query touches multiple ideas, decide the dominant intention.

Output FORMAT:
Return the output STRICTLY in JSON Lines format:
- One JSON object per line
- No surrounding array
- No commentary, no backticks

Each JSON object MUST have exactly these keys:
- "text": string, the user query
- "intent": string, MUST be exactly "{intent_name}"
- "sectors": array of strings, each must be one of:
  ["Sovereign", "PSU Energy", "Financial", "Corporate", "Infrastructure", "NBFC", "Banking"]
  (can be empty if sectors are not clearly specified)
- "rating": null or one of ["AAA","AA+","AA","A+","A","BBB"]
- "duration": string, one of ["short","medium","long"]
- "constraints": object with EXACTLY these boolean fields:
  {{
    "preserve_yield": <true/false>,
    "maintain_liquidity": <true/false>,
    "avoid_downgrades": <true/false>,
    "sector_diversity": <true/false>,
    "rating_above_aa": <true/false>
  }}

Constraints defaults:
- If unsure, use null for "rating".
- If not mentioned, use "medium" for duration.
- All constraint flags default to false unless explicitly implied.
"""

        full_prompt = system_msg + "\n\n" + user_msg

        resp = self.client.models.generate_content(
            model=self.model_name,
            contents=full_prompt,
        )

        raw = resp.text or ""
        return self._parse_jsonl(raw, intent_name)

    # ---------------- PARSING & NORMALIZATION ----------------

    def _parse_jsonl(self, raw: str, intent_name: str) -> List[Dict[str, Any]]:
        rows: List[Dict[str, Any]] = []
        for line in raw.splitlines():
            line = line.strip()
            if not line or line.startswith("```"):
                continue
            if line.startswith("-"):
                line = line.lstrip("-").strip()
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue
            sample = self._normalize_record(obj, intent_name)
            if sample is not None:
                rows.append(sample)
        return rows

    def _normalize_record(self, obj: Dict[str, Any], intent_name: str) -> Optional[Dict[str, Any]]:
        text = str(obj.get("text", "")).strip()
        if not text:
            return None

        # Force canonical intent
        intent = obj.get("intent", intent_name)
        if intent != intent_name:
            intent = intent_name

        # Sectors
        sectors_raw = obj.get("sectors") or []
        sectors: List[str] = []
        if isinstance(sectors_raw, list):
            for s in sectors_raw:
                ss = str(s).strip()
                if ss in self.allowed_sectors and ss not in sectors:
                    sectors.append(ss)

        # Rating
        rating_raw = obj.get("rating")
        if rating_raw is None:
            rating_norm = None
        else:
            r = str(rating_raw).strip()
            rating_norm = r if r in self.allowed_ratings else None

        # Duration
        duration_raw = str(obj.get("duration", "medium")).strip().lower()
        duration = duration_raw if duration_raw in self.allowed_durations else "medium"

        # Constraints
        cons_raw = obj.get("constraints") or {}
        constraints = {
            "preserve_yield": bool(cons_raw.get("preserve_yield", False)),
            "maintain_liquidity": bool(cons_raw.get("maintain_liquidity", False)),
            "avoid_downgrades": bool(cons_raw.get("avoid_downgrades", False)),
            "sector_diversity": bool(cons_raw.get("sector_diversity", False)),
            "rating_above_aa": bool(cons_raw.get("rating_above_aa", False)),
        }

        # If rating is AAA/AA+/AA, it's reasonable to flip rating_above_aa
        if rating_norm in ("AAA", "AA+", "AA"):
            constraints["rating_above_aa"] = True

        return {
            "text": text,
            "intent": intent,
            "sectors": sectors,
            "rating": rating_norm,
            "duration": duration,
            "constraints": constraints,
        }


# ============================================================
# RUN GENERATION + SAVE BOND EDGE DATA
# ============================================================

bond_edge_gen = BondEdgeLLMDataGenerator(
    model_name=BOND_EDGE_CONFIG["model"],
    samples_per_intent=BOND_EDGE_CONFIG["samples_per_intent"],
    max_per_call=BOND_EDGE_CONFIG["max_per_call"],
)

bond_edge_data = bond_edge_gen.generate_dataset()

print("Intent counts in new bond edge dataset:")
print(Counter(row["intent"] for row in bond_edge_data))

out_path_bond = BOND_EDGE_CONFIG["output_path"]
out_path_bond.parent.mkdir(parents=True, exist_ok=True)

with out_path_bond.open("w", encoding="utf-8") as f:
    for row in bond_edge_data:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

print(f"\n✓ Saved extra bond edge data to: {out_path_bond}")
print(f"  Total rows: {len(bond_edge_data)}")

GENERATING EXTRA BOND EDGE-CASE DATA (Gemini)

→ Intent: buy_recommendation (target 180)


Generating buy_recommendation:   0%|          | 0/180 [00:00<?, ?it/s]

  ✓ Collected 182 samples for buy_recommendation

→ Intent: sell_recommendation (target 180)


Generating sell_recommendation:   0%|          | 0/180 [00:00<?, ?it/s]

  ✓ Collected 180 samples for sell_recommendation

→ Intent: portfolio_analysis (target 180)


Generating portfolio_analysis:   0%|          | 0/180 [00:00<?, ?it/s]

  ✓ Collected 181 samples for portfolio_analysis

→ Intent: reduce_duration (target 180)


Generating reduce_duration:   0%|          | 0/180 [00:00<?, ?it/s]

  ✓ Collected 180 samples for reduce_duration

→ Intent: increase_yield (target 180)


Generating increase_yield:   0%|          | 0/180 [00:00<?, ?it/s]

  ✓ Collected 180 samples for increase_yield

→ Intent: hedge_volatility (target 180)


Generating hedge_volatility:   0%|          | 0/180 [00:00<?, ?it/s]

  ✓ Collected 180 samples for hedge_volatility

→ Intent: sector_rebalance (target 180)


Generating sector_rebalance:   0%|          | 0/180 [00:00<?, ?it/s]

  ✓ Collected 180 samples for sector_rebalance

→ Intent: barbell_strategy (target 180)


Generating barbell_strategy:   0%|          | 0/180 [00:00<?, ?it/s]

  ✓ Collected 180 samples for barbell_strategy

→ Intent: switch_bonds (target 180)


Generating switch_bonds:   0%|          | 0/180 [00:00<?, ?it/s]

  ✓ Collected 180 samples for switch_bonds

→ Intent: explain_recommendation (target 180)


Generating explain_recommendation:   0%|          | 0/180 [00:00<?, ?it/s]

  ✓ Collected 180 samples for explain_recommendation

→ Intent: market_outlook (target 180)


Generating market_outlook:   0%|          | 0/180 [00:00<?, ?it/s]

  ✓ Collected 180 samples for market_outlook

→ Intent: credit_analysis (target 180)


Generating credit_analysis:   0%|          | 0/180 [00:00<?, ?it/s]

  ✓ Collected 180 samples for credit_analysis

→ Intent: forecast_prices (target 180)


Generating forecast_prices:   0%|          | 0/180 [00:00<?, ?it/s]

  ✓ Collected 180 samples for forecast_prices

✓ Generated 2343 extra bond edge-case samples

Intent counts in new bond edge dataset:
Counter({'buy_recommendation': 182, 'portfolio_analysis': 181, 'sell_recommendation': 180, 'reduce_duration': 180, 'increase_yield': 180, 'hedge_volatility': 180, 'sector_rebalance': 180, 'barbell_strategy': 180, 'switch_bonds': 180, 'explain_recommendation': 180, 'market_outlook': 180, 'credit_analysis': 180, 'forecast_prices': 180})

✓ Saved extra bond edge data to: /kaggle/working/bond_edge_cases.jsonl
  Total rows: 2343


In [48]:
import json
from pathlib import Path
from typing import List, Dict, Any, Tuple
from collections import Counter

from sklearn.model_selection import train_test_split

# ============================================================
# PATHS: ALL 6 FILES
# ============================================================

BASE_DATA_DIR = Path("/kaggle/input/bonds-query-classifier-finetuning-slm-dataset")

TRAIN_EXT = BASE_DATA_DIR / "train_extended.jsonl"
VAL_EXT   = BASE_DATA_DIR / "val_extended.jsonl"
TEST_EXT  = BASE_DATA_DIR / "test_extended.jsonl"

# Adjust these if your earlier generators used different paths
EQUITY_EDGE_PATH       = Path("/kaggle/working/equity_edge_cases.jsonl")
GENERAL_NONBOND_PATH   = Path("/kaggle/working/general_non_bond_edge_cases.jsonl")
BOND_EDGE_PATH         = Path("/kaggle/working/bond_edge_cases.jsonl")

FINAL_OUT_DIR = Path("/kaggle/working/final_intent_dataset")
FINAL_OUT_DIR.mkdir(parents=True, exist_ok=True)


def load_jsonl(path: Path) -> List[Dict[str, Any]]:
    data: List[Dict[str, Any]] = []
    if not path.exists():
        print(f"⚠ File not found: {path}")
        return data
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError:
                continue
    return data


def save_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None:
    with path.open("w", encoding="utf-8") as f:
        for r in rows:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")


# ============================================================
# LOAD ALL 6 DATASETS
# ============================================================

print("=" * 60)
print("LOADING ALL 6 DATA FILES")
print("=" * 60)

train_base = load_jsonl(TRAIN_EXT)
val_base   = load_jsonl(VAL_EXT)
test_base  = load_jsonl(TEST_EXT)

equity_edge      = load_jsonl(EQUITY_EDGE_PATH)
general_nonbond  = load_jsonl(GENERAL_NONBOND_PATH)
bond_edge        = load_jsonl(BOND_EDGE_PATH)

print(f"train_extended: {len(train_base)}")
print(f"val_extended:   {len(val_base)}")
print(f"test_extended:  {len(test_base)}")
print(f"equity_edge:    {len(equity_edge)}")
print(f"general_nonbond_edge: {len(general_nonbond)}")
print(f"bond_edge:      {len(bond_edge)}")

# Combine everything
all_data_raw = train_base + val_base + test_base + equity_edge + general_nonbond + bond_edge
print(f"\nCombined raw size (before dedupe): {len(all_data_raw)}")


# ============================================================
# DEDUPLICATE BY (text, intent)
# ============================================================

seen_keys = set()
all_data: List[Dict[str, Any]] = []
for row in all_data_raw:
    text = str(row.get("text", "")).strip()
    intent = row.get("intent")
    if not text or intent is None:
        continue
    key = (text, intent)
    if key in seen_keys:
        continue
    seen_keys.add(key)
    all_data.append(row)

print(f"✓ After dedupe: {len(all_data)} rows")


# Quick sanity: intent distribution
intent_counts = Counter(r["intent"] for r in all_data if "intent" in r)
print("\nIntent distribution (all_data):")
for k, v in sorted(intent_counts.items()):
    print(f"  {k:20s} : {v}")


# ============================================================
# FINAL STRATIFIED SPLIT (train / val / test)
# ============================================================

TRAIN_FRAC = 0.7
VAL_FRAC   = 0.15  # test = 0.15

intents = [r["intent"] for r in all_data]

train_all, test_final = train_test_split(
    all_data,
    test_size=1.0 - (TRAIN_FRAC + VAL_FRAC),
    stratify=intents,
    random_state=42,
)

intents_train_all = [r["intent"] for r in train_all]
val_size = VAL_FRAC / (TRAIN_FRAC + VAL_FRAC)

train_final, val_final = train_test_split(
    train_all,
    test_size=val_size,
    stratify=intents_train_all,
    random_state=42,
)

print("\nFinal split sizes:")
print(f"  train_final: {len(train_final)}")
print(f"  val_final:   {len(val_final)}")
print(f"  test_final:  {len(test_final)}")


# Save
train_out_path = FINAL_OUT_DIR / "train_final.jsonl"
val_out_path   = FINAL_OUT_DIR / "val_final.jsonl"
test_out_path  = FINAL_OUT_DIR / "test_final.jsonl"

save_jsonl(train_out_path, train_final)
save_jsonl(val_out_path,   val_final)
save_jsonl(test_out_path,  test_final)

print("\n✓ Saved final combined splits to:")
print(f"  {train_out_path}")
print(f"  {val_out_path}")
print(f"  {test_out_path}")

LOADING ALL 6 DATA FILES
train_extended: 5749
val_extended:   1232
test_extended:  1232
equity_edge:    409
general_nonbond_edge: 1000
bond_edge:      2343

Combined raw size (before dedupe): 11965
✓ After dedupe: 10243 rows

Intent distribution (all_data):
  barbell_strategy     : 552
  buy_recommendation   : 1021
  credit_analysis      : 563
  explain_recommendation : 503
  forecast_prices      : 638
  hedge_volatility     : 529
  increase_yield       : 760
  market_outlook       : 571
  non_bond_llm         : 1017
  non_bond_search      : 1102
  portfolio_analysis   : 535
  reduce_duration      : 578
  sector_rebalance     : 721
  sell_recommendation  : 564
  switch_bonds         : 589

Final split sizes:
  train_final: 7169
  val_final:   1537
  test_final:  1537

✓ Saved final combined splits to:
  /kaggle/working/final_intent_dataset/train_final.jsonl
  /kaggle/working/final_intent_dataset/val_final.jsonl
  /kaggle/working/final_intent_dataset/test_final.jsonl


In [2]:
"""
Bond Query Classifier - Training on FINAL master dataset (bond + non-bond + edge cases)
=======================================================================================

Assumes these files exist:

/kaggle/input/bonds-query-classifier-finetuning-slm-dataset/train_final.jsonl
/kaggle/input/bonds-query-classifier-finetuning-slm-dataset/val_final.jsonl
/kaggle/input/bonds-query-classifier-finetuning-slm-dataset/test_final.jsonl
"""

# ==================== INSTALL DEPENDENCIES ====================
import sys
import subprocess

print("=" * 60)
print("INSTALLING DEPENDENCIES")
print("=" * 60)

subprocess.check_call([
    sys.executable, "-m", "pip", "install", "-q",
    "transformers", "accelerate", "scikit-learn"
])

print("✓ Dependencies installed!\n")


# ==================== IMPORTS ====================
import os
import json
import random
from pathlib import Path
from typing import List, Dict, Any

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AutoTokenizer,
    AutoModel,
    get_cosine_schedule_with_warmup
)
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
from tqdm.auto import tqdm


# ==================== GPU CHECK ====================
print("=" * 60)
print("GPU CHECK")
print("=" * 60)

if torch.cuda.is_available():
    print(f"✓ GPU detected: {torch.cuda.get_device_name(0)}")
    print(f"✓ GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    print(f"✓ CUDA version: {torch.version.cuda}")
else:
    print("⚠ WARNING: No GPU detected! Training will be very slow.")

print()


# ==================== CONFIGURATION ====================
CONFIG = {
    'base_model': 'microsoft/deberta-v3-small',

    # ---- FINAL dataset paths ----
    'dataset_dir': '/kaggle/input/bonds-query-classifier-finetuning-slm-dataset',
    'train_file': 'train_final.jsonl',
    'val_file':   'val_final.jsonl',
    'test_file':  'test_final.jsonl',

    'batch_size': 32,
    'num_epochs': 10,
    'learning_rate': 2e-5,
    'warmup_ratio': 0.1,
    'max_length': 128,
    'output_dir': '/kaggle/working/bond_classifier_v3_final',
    'seed': 42,
}

# Set seeds
random.seed(CONFIG['seed'])
np.random.seed(CONFIG['seed'])
torch.manual_seed(CONFIG['seed'])
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(CONFIG['seed'])


# ==================== PYTORCH DATASET (15 INTENTS) ====================

class BondQueryDataset(Dataset):
    """PyTorch Dataset (bond + non-bond router)"""
    
    def __init__(self, data: List[Dict[str, Any]], tokenizer, max_length: int = 128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        
        # 13 original intents + 2 non-bond router intents
        self.intent_to_id = {
            'buy_recommendation': 0,
            'sell_recommendation': 1,
            'portfolio_analysis': 2,
            'reduce_duration': 3,
            'increase_yield': 4,
            'hedge_volatility': 5,
            'sector_rebalance': 6,
            'barbell_strategy': 7,
            'switch_bonds': 8,
            'explain_recommendation': 9,
            'market_outlook': 10,
            'credit_analysis': 11,
            'forecast_prices': 12,
            'non_bond_search': 13,
            'non_bond_llm': 14,
        }
        
        self.sector_to_id = {
            'Sovereign': 0, 'PSU Energy': 1, 'Financial': 2,
            'Corporate': 3, 'Infrastructure': 4, 'NBFC': 5, 'Banking': 6
        }
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        sample = self.data[idx]
        
        encoding = self.tokenizer(
            sample['text'],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        # ---- intent ----
        intent_label = self.intent_to_id[sample['intent']]
        
        # ---- sectors (multi-label) ----
        sector_labels = torch.zeros(len(self.sector_to_id))
        for sector in sample.get('sectors', []):
            if sector in self.sector_to_id:
                sector_labels[self.sector_to_id[sector]] = 1
        
        # ---- rating ----
        rating = sample.get('rating')
        rating_map = {'AAA': 0, 'AA+': 1, 'AA': 2, 'A+': 3, 'A': 4, 'BBB': 5}
        rating_label = rating_map.get(rating, 6)   # 6 = "none / other"
        
        # ---- duration ----
        duration_map = {'short': 0, 'medium': 1, 'long': 2}
        duration_label = duration_map.get(sample.get('duration', 'medium'), 1)
        
        # ---- constraints (5 binary flags) ----
        constraints = sample.get('constraints', {})
        constraint_labels = torch.tensor([
            float(constraints.get('preserve_yield', False)),
            float(constraints.get('maintain_liquidity', False)),
            float(constraints.get('avoid_downgrades', False)),
            float(constraints.get('sector_diversity', False)),
            float(constraints.get('rating_above_aa', False))
        ])
        
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'intent_label': torch.tensor(intent_label),
            'sector_labels': sector_labels,
            'rating_label': torch.tensor(rating_label),
            'duration_label': torch.tensor(duration_label),
            'constraint_labels': constraint_labels
        }


# ==================== MODEL (SAME ARCH, 15-CLASS INTENT HEAD) ====================

class ProductionBondClassifier(nn.Module):
    """Multi-task classifier: intent + sectors + rating + duration + constraints"""
    
    def __init__(self, base_model: str = 'microsoft/deberta-v3-small', dropout: float = 0.15):
        super().__init__()
        
        self.bert = AutoModel.from_pretrained(base_model)
        hidden_size = self.bert.config.hidden_size
        
        self.feature_layer = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.Dropout(dropout),
            nn.GELU(),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.LayerNorm(hidden_size // 2),
            nn.Dropout(dropout),
            nn.GELU()
        )
        
        feature_size = hidden_size // 2
        
        # 15 intent classes (13 bond + 2 non-bond)
        self.intent_head = nn.Linear(feature_size, 15)
        self.sector_head = nn.Linear(feature_size, 7)
        self.rating_head = nn.Linear(feature_size, 7)
        self.duration_head = nn.Linear(feature_size, 3)
        self.constraint_head = nn.Linear(feature_size, 5)
        
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]
        features = self.feature_layer(cls_output)
        features = self.dropout(features)
        
        return {
            'intent_logits': self.intent_head(features),
            'sector_logits': self.sector_head(features),
            'rating_logits': self.rating_head(features),
            'duration_logits': self.duration_head(features),
            'constraint_logits': self.constraint_head(features),
        }


# ==================== LOSS ====================

class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
    
    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss
        return focal_loss.mean()


class MultiTaskLoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.intent_loss_fn = FocalLoss(gamma=2.0)
        self.sector_loss_fn = nn.BCEWithLogitsLoss()
        self.rating_loss_fn = nn.CrossEntropyLoss()
        self.duration_loss_fn = nn.CrossEntropyLoss()
        self.constraint_loss_fn = nn.BCEWithLogitsLoss()
    
    def forward(self, outputs, labels):
        intent_loss = self.intent_loss_fn(outputs['intent_logits'], labels['intent_label'])
        sector_loss = self.sector_loss_fn(outputs['sector_logits'], labels['sector_labels'])
        rating_loss = self.rating_loss_fn(outputs['rating_logits'], labels['rating_label'])
        duration_loss = self.duration_loss_fn(outputs['duration_logits'], labels['duration_label'])
        constraint_loss = self.constraint_loss_fn(outputs['constraint_logits'], labels['constraint_labels'])
        
        total = intent_loss + 0.5 * sector_loss + 0.3 * rating_loss + 0.3 * duration_loss + 0.4 * constraint_loss
        
        return {'total': total, 'intent': intent_loss}


# ==================== TRAINER ====================

class Trainer:
    def __init__(self, model, train_loader, val_loader, optimizer, scheduler, criterion, device, output_dir):
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.criterion = criterion
        self.device = device
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.best_acc = 0.0
    
    def train_epoch(self, epoch):
        self.model.train()
        total_loss = 0
        all_preds, all_labels = [], []
        
        pbar = tqdm(self.train_loader, desc=f'Epoch {epoch}')
        for batch in pbar:
            input_ids = batch['input_ids'].to(self.device)
            attention_mask = batch['attention_mask'].to(self.device)
            
            labels = {k: v.to(self.device) for k, v in batch.items()
                      if k not in ['input_ids', 'attention_mask']}
            
            self.optimizer.zero_grad()
            outputs = self.model(input_ids, attention_mask)
            loss_dict = self.criterion(outputs, labels)
            loss = loss_dict['total']
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            self.optimizer.step()
            self.scheduler.step()
            
            total_loss += loss.item()
            preds = outputs['intent_logits'].argmax(dim=-1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels['intent_label'].cpu().numpy())
            
            pbar.set_postfix({
                'loss': f'{loss.item():.4f}',
                'acc': f'{accuracy_score(all_labels[-len(preds):], preds):.3f}'
            })
        
        return total_loss / len(self.train_loader)
    
    def evaluate(self):
        self.model.eval()
        total_loss = 0
        all_preds, all_labels = [], []
        
        with torch.no_grad():
            for batch in tqdm(self.val_loader, desc='Evaluating'):
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = {k: v.to(self.device) for k, v in batch.items()
                          if k not in ['input_ids', 'attention_mask']}
                
                outputs = self.model(input_ids, attention_mask)
                loss_dict = self.criterion(outputs, labels)
                total_loss += loss_dict['total'].item()
                
                preds = outputs['intent_logits'].argmax(dim=-1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels['intent_label'].cpu().numpy())
        
        return {
            'loss': total_loss / len(self.val_loader),
            'accuracy': accuracy_score(all_labels, all_preds),
            'f1_macro': f1_score(all_labels, all_preds, average='macro')
        }
    
    def save_checkpoint(self, epoch, metrics):
        if metrics['accuracy'] > self.best_acc:
            self.best_acc = metrics['accuracy']
            torch.save(self.model.state_dict(), self.output_dir / 'pytorch_model.bin')
            print(f"✓ New best model saved (acc: {metrics['accuracy']:.4f})")
    
    def train(self, num_epochs):
        print("=" * 60)
        print("TRAINING")
        print("=" * 60)
        
        for epoch in range(1, num_epochs + 1):
            train_loss = self.train_epoch(epoch)
            print(f"\nEpoch {epoch}/{num_epochs} - Train Loss: {train_loss:.4f}")
            
            val_metrics = self.evaluate()
            print(f"Val Loss: {val_metrics['loss']:.4f}")
            print(f"Val Accuracy: {val_metrics['accuracy']:.4f}")
            print(f"Val F1: {val_metrics['f1_macro']:.4f}\n")
            
            self.save_checkpoint(epoch, val_metrics)
        
        print(f"Training complete! Best accuracy: {self.best_acc:.4f}\n")


# ==================== JSONL LOADER + OPTIONAL CLEANING ====================

def load_jsonl(path: str) -> List[Dict[str, Any]]:
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError:
                continue
    return rows


def clean_nonbond_rows(rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Optional tiny cleanup:
    - Drop non_bond_* rows whose text still clearly mentions bonds/g-secs.
    If you don't want this, just return rows unchanged.
    """
    cleaned = []
    dropped = 0
    bondish = ("bond", "bonds", "g-sec", "gsec", "debenture", "coupon", "yield")
    for r in rows:
        intent = r.get("intent", "")
        text = (r.get("text") or "").lower()
        if intent in ("non_bond_search", "non_bond_llm") and any(k in text for k in bondish):
            dropped += 1
            continue
        cleaned.append(r)
    if dropped > 0:
        print(f"  ➜ Dropped {dropped} noisy non-bond rows mentioning bonds")
    return cleaned


# ==================== MAIN TRAINING FUNCTION ====================

def train_model():
    """Train on FINAL master dataset (bond + non-bond + edge cases)."""
    
    dataset_dir = CONFIG['dataset_dir']
    train_path = os.path.join(dataset_dir, CONFIG['train_file'])
    val_path   = os.path.join(dataset_dir, CONFIG['val_file'])
    test_path  = os.path.join(dataset_dir, CONFIG['test_file'])

    print("=" * 60)
    print("LOADING FINAL DATASET")
    print("=" * 60)
    train_data = load_jsonl(train_path)
    val_data   = load_jsonl(val_path)
    test_data  = load_jsonl(test_path)

    print(f"Raw Train rows: {len(train_data)}")
    print(f"Raw Val rows:   {len(val_data)}")
    print(f"Raw Test rows:  {len(test_data)}")

    # --- Optional: small clean-up for non-bond rows ---
    train_data = clean_nonbond_rows(train_data)
    val_data   = clean_nonbond_rows(val_data)
    test_data  = clean_nonbond_rows(test_data)

    print(f"Clean Train rows: {len(train_data)}")
    print(f"Clean Val rows:   {len(val_data)}")
    print(f"Clean Test rows:  {len(test_data)}\n")

    # ---- Tokenizer & Datasets ----
    print("=" * 60)
    print("LOADING MODEL & TOKENIZER")
    print("=" * 60)
    tokenizer = AutoTokenizer.from_pretrained(CONFIG['base_model'])
    
    train_dataset = BondQueryDataset(train_data, tokenizer, CONFIG['max_length'])
    val_dataset   = BondQueryDataset(val_data,   tokenizer, CONFIG['max_length'])
    test_dataset  = BondQueryDataset(test_data,  tokenizer, CONFIG['max_length'])
    
    train_loader = DataLoader(train_dataset, batch_size=CONFIG['batch_size'], shuffle=True)
    val_loader   = DataLoader(val_dataset,   batch_size=CONFIG['batch_size'])
    test_loader  = DataLoader(test_dataset,  batch_size=CONFIG['batch_size'])
    
    # ---- Model ----
    model = ProductionBondClassifier(CONFIG['base_model'])
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    
    print(f"✓ Model loaded: {sum(p.numel() for p in model.parameters()):,} parameters")
    print(f"✓ Device: {device}\n")
    
    # ---- Optimizer & Scheduler ----
    optimizer = torch.optim.AdamW(model.parameters(), lr=CONFIG['learning_rate'], weight_decay=0.01)
    num_training_steps = len(train_loader) * CONFIG['num_epochs']
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(num_training_steps * CONFIG['warmup_ratio']),
        num_training_steps=num_training_steps
    )
    
    criterion = MultiTaskLoss()
    
    # ---- Train ----
    trainer = Trainer(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        scheduler=scheduler,
        criterion=criterion,
        device=device,
        output_dir=CONFIG['output_dir'],
    )
    trainer.train(CONFIG['num_epochs'])
    
    # ---- Save tokenizer ----
    tokenizer.save_pretrained(CONFIG['output_dir'])
    
    # ---- Final test ----
    print("=" * 60)
    print("FINAL TEST")
    print("=" * 60)
    model.eval()
    all_preds, all_labels = [], []
    
    with torch.no_grad():
        for batch in tqdm(test_loader, desc='Testing'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask)
            preds = outputs['intent_logits'].argmax(dim=-1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch['intent_label'].numpy())
    
    test_acc = accuracy_score(all_labels, all_preds)
    test_f1 = f1_score(all_labels, all_preds, average='macro')
    
    print(f"\n✓ Test Accuracy: {test_acc:.4f}")
    print(f"✓ Test F1 Macro: {test_f1:.4f}\n")
    
    print("=" * 60)
    print("TRAINING COMPLETE!")
    print("=" * 60)
    print(f"\n✓ Model saved to: {CONFIG['output_dir']}")
    print(f"✓ Files: pytorch_model.bin, tokenizer files")
    print("\nTo download:")
    print("  1. Go to Output tab")
    print("  2. Download 'bond_classifier_v3_final' folder")
    print("  3. Use locally with your inference notebook")


# ==================== RUN ====================

if __name__ == '__main__':
    train_model()

INSTALLING DEPENDENCIES
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 363.4/363.4 MB 4.4 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13.8/13.8 MB 113.4 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 24.6/24.6 MB 91.3 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 883.7/883.7 kB 45.9 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 664.8/664.8 MB 2.0 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 211.5/211.5 MB 8.7 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 56.3/56.3 MB 33.4 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 127.9/127.9 MB 14.5 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 207.5/207.5 MB 8.9 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 21.1/21.1 MB 87.6 MB/s eta 0:00:00


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
libcugraph-cu12 25.6.0 requires libraft-cu12==25.6.*, but you have libraft-cu12 25.2.0 which is incompatible.
pylibcugraph-cu12 25.6.0 requires pylibraft-cu12==25.6.*, but you have pylibraft-cu12 25.2.0 which is incompatible.
pylibcugraph-cu12 25.6.0 requires rmm-cu12==25.6.*, but you have rmm-cu12 25.2.0 which is incompatible.


✓ Dependencies installed!

GPU CHECK
✓ GPU detected: Tesla T4
✓ GPU memory: 15.83 GB
✓ CUDA version: 12.4

LOADING FINAL DATASET
Raw Train rows: 7169
Raw Val rows:   1537
Raw Test rows:  1537
  ➜ Dropped 67 noisy non-bond rows mentioning bonds
  ➜ Dropped 12 noisy non-bond rows mentioning bonds
  ➜ Dropped 16 noisy non-bond rows mentioning bonds
Clean Train rows: 7102
Clean Val rows:   1525
Clean Test rows:  1521

LOADING MODEL & TOKENIZER


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

2025-11-30 16:31:18.856630: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764520279.062789      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764520279.127261      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/286M [00:00<?, ?B/s]

✓ Model loaded: 142,206,757 parameters
✓ Device: cuda

TRAINING


Epoch 1:   0%|          | 0/222 [00:00<?, ?it/s]


Epoch 1/10 - Train Loss: 3.2768


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]

Val Loss: 1.4890
Val Accuracy: 0.7508
Val F1: 0.7226

✓ New best model saved (acc: 0.7508)


Epoch 2:   0%|          | 0/222 [00:00<?, ?it/s]


Epoch 2/10 - Train Loss: 1.1423


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]

Val Loss: 0.7941
Val Accuracy: 0.9390
Val F1: 0.9477

✓ New best model saved (acc: 0.9390)


Epoch 3:   0%|          | 0/222 [00:00<?, ?it/s]


Epoch 3/10 - Train Loss: 0.7708


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]

Val Loss: 0.6658
Val Accuracy: 0.9430
Val F1: 0.9500

✓ New best model saved (acc: 0.9430)


Epoch 4:   0%|          | 0/222 [00:00<?, ?it/s]


Epoch 4/10 - Train Loss: 0.6376


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]

Val Loss: 0.5868
Val Accuracy: 0.9554
Val F1: 0.9611

✓ New best model saved (acc: 0.9554)


Epoch 5:   0%|          | 0/222 [00:00<?, ?it/s]


Epoch 5/10 - Train Loss: 0.5607


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]

Val Loss: 0.5520
Val Accuracy: 0.9489
Val F1: 0.9562



Epoch 6:   0%|          | 0/222 [00:00<?, ?it/s]


Epoch 6/10 - Train Loss: 0.5147


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]

Val Loss: 0.5266
Val Accuracy: 0.9600
Val F1: 0.9652

✓ New best model saved (acc: 0.9600)


Epoch 7:   0%|          | 0/222 [00:00<?, ?it/s]


Epoch 7/10 - Train Loss: 0.4842


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]

Val Loss: 0.5111
Val Accuracy: 0.9554
Val F1: 0.9615



Epoch 8:   0%|          | 0/222 [00:00<?, ?it/s]


Epoch 8/10 - Train Loss: 0.4671


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]

Val Loss: 0.5007
Val Accuracy: 0.9554
Val F1: 0.9622



Epoch 9:   0%|          | 0/222 [00:00<?, ?it/s]


Epoch 9/10 - Train Loss: 0.4537


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]

Val Loss: 0.4958
Val Accuracy: 0.9561
Val F1: 0.9626



Epoch 10:   0%|          | 0/222 [00:00<?, ?it/s]


Epoch 10/10 - Train Loss: 0.4499


Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]

Val Loss: 0.4954
Val Accuracy: 0.9541
Val F1: 0.9609

Training complete! Best accuracy: 0.9600

FINAL TEST


Testing:   0%|          | 0/48 [00:00<?, ?it/s]


✓ Test Accuracy: 0.9619
✓ Test F1 Macro: 0.9660

TRAINING COMPLETE!

✓ Model saved to: /kaggle/working/bond_classifier_v3_final
✓ Files: pytorch_model.bin, tokenizer files

To download:
  1. Go to Output tab
  2. Download 'bond_classifier_v3_final' folder
  3. Use locally with your inference notebook


In [6]:
# ==================== EVAL: BOND + NON-BOND ROUTER MODEL ====================
import json
import time
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import accuracy_score, f1_score, classification_report


# ==================== MODEL (MUST MATCH TRAINING) ====================

class ProductionBondClassifier(nn.Module):
    """DeBERTa-v3-small + multi-task heads, 15 intents (13 bond + 2 non-bond)."""
    
    def __init__(self, base_model: str = 'microsoft/deberta-v3-small', dropout: float = 0.15):
        super().__init__()
        
        self.bert = AutoModel.from_pretrained(base_model)
        hidden_size = self.bert.config.hidden_size
        
        self.feature_layer = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.Dropout(dropout),
            nn.GELU(),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.LayerNorm(hidden_size // 2),
            nn.Dropout(dropout),
            nn.GELU()
        )
        
        feature_size = hidden_size // 2
        
        # 15 intents: 13 bond + 2 router
        self.intent_head = nn.Linear(feature_size, 15)
        self.sector_head = nn.Linear(feature_size, 7)
        self.rating_head = nn.Linear(feature_size, 7)
        self.duration_head = nn.Linear(feature_size, 3)
        self.constraint_head = nn.Linear(feature_size, 5)
        
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]
        features = self.feature_layer(cls_output)
        features = self.dropout(features)
        
        return {
            'intent_logits': self.intent_head(features),
            'sector_logits': self.sector_head(features),
            'rating_logits': self.rating_head(features),
            'duration_logits': self.duration_head(features),
            'constraint_logits': self.constraint_head(features),
        }


# ==================== CLASSIFIER WRAPPER ====================

class BondClassifier:
    def __init__(self, model_path: str, base_model: str = "microsoft/deberta-v3-small"):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        # ✅ Load tokenizer from base model (NOT from local dir to avoid HFValidationError)
        self.tokenizer = AutoTokenizer.from_pretrained(base_model)
        
        # Load model + weights from local folder
        self.model = ProductionBondClassifier(base_model=base_model)
        state_dict = torch.load(Path(model_path) / "pytorch_model.bin", map_location=self.device)
        self.model.load_state_dict(state_dict)
        self.model.to(self.device)
        self.model.eval()

        # MUST match BondQueryDataset.intent_to_id used during training
        self.intent_names = [
            'buy_recommendation',     # 0
            'sell_recommendation',    # 1
            'portfolio_analysis',     # 2
            'reduce_duration',        # 3
            'increase_yield',         # 4
            'hedge_volatility',       # 5
            'sector_rebalance',       # 6
            'barbell_strategy',       # 7
            'switch_bonds',           # 8
            'explain_recommendation', # 9
            'market_outlook',         # 10
            'credit_analysis',        # 11
            'forecast_prices',        # 12
            'non_bond_search',        # 13
            'non_bond_llm',           # 14
        ]
        self.intent_to_id = {name: i for i, name in enumerate(self.intent_names)}
    
    def _route_from_intent(self, intent: str) -> str:
        """
        Router decision:
        - 'non_bond_search' -> 'search'
        - 'non_bond_llm'    -> 'llm'
        - everything else   -> 'bond'
        """
        if intent == 'non_bond_search':
            return 'search'
        elif intent == 'non_bond_llm':
            return 'llm'
        else:
            return 'bond'
    
    def classify(self, query: str):
        enc = self.tokenizer(
            query,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=128
        )
        input_ids = enc["input_ids"].to(self.device)
        attention_mask = enc["attention_mask"].to(self.device)
        
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            intent_logits = outputs['intent_logits']
            intent_probs = F.softmax(intent_logits, dim=-1)
        
        intent_idx = intent_probs.argmax(dim=-1).item()
        intent = self.intent_names[intent_idx]
        confidence = intent_probs.max().item()
        route = self._route_from_intent(intent)
        
        return intent, confidence, route


# ==================== JSONL LOADER ====================

def load_jsonl(path: str):
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError:
                continue
    return rows


# ==================== EVALUATION ON TEST JSONL ====================

def evaluate_on_jsonl(classifier: BondClassifier, jsonl_path: str):
    data = load_jsonl(jsonl_path)
    print(f"Loaded {len(data)} test rows from {jsonl_path}")
    
    y_true_idx = []
    y_pred_idx = []
    routes = []

    start = time.time()
    for row in data:
        text = row["text"]
        true_intent_str = row["intent"]

        pred_intent, conf, route = classifier.classify(text)
        routes.append(route)

        # Map both true & pred to indices
        if true_intent_str not in classifier.intent_to_id:
            # If some weird label sneaks in, skip that row
            continue
        y_true_idx.append(classifier.intent_to_id[true_intent_str])
        y_pred_idx.append(classifier.intent_to_id[pred_intent])
    
    elapsed = time.time() - start
    print(f"\nInference time on test set: {elapsed:.2f}s "
          f"({len(y_true_idx) / max(elapsed,1e-6):.1f} samples/sec)")
    
    # Overall metrics
    acc = accuracy_score(y_true_idx, y_pred_idx)
    f1_macro = f1_score(y_true_idx, y_pred_idx, average="macro")
    print(f"\n=== Overall Intent Metrics ===")
    print(f"Accuracy: {acc:.4f}")
    print(f"Macro F1: {f1_macro:.4f}")
    
    # Per-intent report
    print("\n=== Per-intent Classification Report ===")
    print(classification_report(
        y_true_idx, 
        y_pred_idx, 
        target_names=classifier.intent_names, 
        digits=4
    ))
    
    # Router stats
    from collections import Counter
    print("\n=== Router Decision Distribution (pred side) ===")
    print(Counter(routes))


# ==================== RUN EVAL + INTERACTIVE TEST ====================

if __name__ == "__main__":
    MODEL_PATH = "/kaggle/working/bond_classifier_v3_final"  # folder with pytorch_model.bin
    TEST_JSONL = "/kaggle/input/bonds-query-classifier-finetuning-slm-dataset/test_final.jsonl"
    
    classifier = BondClassifier(MODEL_PATH)
    
    # 1) Full evaluation on your final test set
    evaluate_on_jsonl(classifier, TEST_JSONL)
    
    # 2) Quick manual query check
    # print("\n================ Manual sanity check ================")
    # while True:
    #     q = input("\nEnter a query (or 'q' to quit): ").strip()
    #     if q.lower() in {"q", "quit", "exit"}:
    #         break
    #     t0 = time.time()
    #     intent, conf, route = classifier.classify(q)
    #     dt = time.time() - t0
    #     print(f"Predicted Intent : {intent}")
    #     print(f"Confidence       : {conf:.3f}")
    #     print(f"Router decision  : {route}  (bond/search/llm)")
    #     print(f"Latency          : {dt*1000:.1f} ms")


Loaded 1537 test rows from /kaggle/input/bonds-query-classifier-finetuning-slm-dataset/test_final.jsonl

Inference time on test set: 18.15s (84.7 samples/sec)

=== Overall Intent Metrics ===
Accuracy: 0.9623
Macro F1: 0.9676

=== Per-intent Classification Report ===
                        precision    recall  f1-score   support

    buy_recommendation     0.9733    0.9542    0.9637       153
   sell_recommendation     0.9884    1.0000    0.9942        85
    portfolio_analysis     0.9747    0.9625    0.9686        80
       reduce_duration     0.9765    0.9540    0.9651        87
        increase_yield     0.9652    0.9737    0.9694       114
      hedge_volatility     0.9351    0.9114    0.9231        79
      sector_rebalance     0.9817    0.9907    0.9862       108
      barbell_strategy     0.9765    1.0000    0.9881        83
          switch_bonds     0.9667    0.9886    0.9775        88
explain_recommendation     1.0000    1.0000    1.0000        75
        market_outlook     0


Enter a query (or 'q' to quit):  Show me stocks increase


Predicted Intent : non_bond_search
Confidence       : 0.867
Router decision  : search  (bond/search/llm)
Latency          : 13.8 ms


KeyboardInterrupt: Interrupted by user

In [10]:
while True:
        q = input("\nEnter a query (or 'q' to quit): ").strip()
        if q.lower() in {"q", "quit", "exit"}:
            break
        t0 = time.time()
        intent, conf, route = classifier.classify(q)
        dt = time.time() - t0
        print(f"Predicted Intent : {intent}")
        print(f"Confidence       : {conf:.3f}")
        print(f"Router decision  : {route}  (bond/search/llm)")
        print(f"Latency          : {dt*1000:.1f} ms")


Enter a query (or 'q' to quit):  Aryan jain bonds


Predicted Intent : buy_recommendation
Confidence       : 0.668
Router decision  : bond  (bond/search/llm)
Latency          : 14.4 ms



Enter a query (or 'q' to quit):  Hi how are u


Predicted Intent : non_bond_search
Confidence       : 0.316
Router decision  : search  (bond/search/llm)
Latency          : 12.6 ms



Enter a query (or 'q' to quit):  fuck y


Predicted Intent : non_bond_llm
Confidence       : 0.456
Router decision  : llm  (bond/search/llm)
Latency          : 13.5 ms



Enter a query (or 'q' to quit):  what s up


Predicted Intent : non_bond_search
Confidence       : 0.410
Router decision  : search  (bond/search/llm)
Latency          : 14.0 ms



Enter a query (or 'q' to quit):  can you help me increase my stock


Predicted Intent : increase_yield
Confidence       : 0.620
Router decision  : bond  (bond/search/llm)
Latency          : 13.0 ms



Enter a query (or 'q' to quit):  can you help me increase my stocks


Predicted Intent : increase_yield
Confidence       : 0.734
Router decision  : bond  (bond/search/llm)
Latency          : 13.2 ms



Enter a query (or 'q' to quit):  is stock in my portfolio


Predicted Intent : non_bond_search
Confidence       : 0.681
Router decision  : search  (bond/search/llm)
Latency          : 12.9 ms



Enter a query (or 'q' to quit):  can i see my stocks


Predicted Intent : non_bond_search
Confidence       : 0.767
Router decision  : search  (bond/search/llm)
Latency          : 12.4 ms



Enter a query (or 'q' to quit):  q
