In [None]:
# Install PyTorch with CUDA 11.8
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# Install the rest
!pip install numpy pandas tqdm transformers scikit-learn datasets spacy sentence-transformers nltk



Looking in indexes: https://download.pytorch.org/whl/cu118


In [None]:
#!/usr/bin/env python3
# Uncomment these commands if you want a fresh run:
#!rm -rf stage3_cache_transformer_ner/
#!rm -f preprocessed_hotpotqa_stable.parquet
#!rm -rf ./models/neural_ranker_final_hotpotqa/
# NOTE(review): the message below is printed unconditionally, so it is shown
# even while the cleanup commands above are still commented out and nothing
# has been removed.
# NOTE(review): the model directory named above differs from the `output_dir`
# passed to `main()` at the bottom of this cell
# ("./models/neural_ranker_hotpot_stable") - confirm which one should be wiped.
print("All old cache files have been deleted.")
"""
Neural Ranker Training Script for Multi-Hop RAG Pipeline
Trains a cross-encoder model on HotpotQA dataset with pipeline-specific features.
This version uses a high-performance, GPU-accelerated Transformer for NER and includes
robust, transactional caching and error handling.
"""

import os
import re
import multiprocessing as mp
from multiprocessing import Pool
import shutil
from concurrent.futures import ThreadPoolExecutor, as_completed
import json
import logging
from typing import List, Dict, Tuple, Optional, Any
from dataclasses import dataclass
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, EarlyStoppingCallback,
    pipeline as hf_pipeline
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
import nltk
from nltk.tokenize import sent_tokenize

# Fetch the NLTK Punkt resources up front (quietly); `sent_tokenize`, imported
# above, is what later splits document text into sentences.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Configure logging once ("timestamp - level - message") and create the
# module-level logger used by every class and function below.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# ==============================================================================
# --- CONFIGURATION ---
# ==============================================================================

@dataclass
class TrainingConfig:
    """Hyperparameters and feature switches for training the neural ranker.

    The optimisation fields are forwarded to ``TrainingArguments`` in
    ``NeuralRankerTrainer.train``; the data-processing fields are read by
    ``HotpotQADataProcessor``.
    """
    # Checkpoint handed to AutoTokenizer / AutoModelForSequenceClassification.
    model_name: str = "microsoft/deberta-v3-base"
    # Truncation / padding length used when RankerDataset tokenises an example.
    max_length: int = 512
    # Used for both the per-device train and eval batch size.
    batch_size: int = 12
    learning_rate: float = 1.5e-5
    num_epochs: int = 4

    warmup_ratio: float = 0.20
    weight_decay: float = 0.05
    # Share of the selected negatives taken from the most query-similar
    # ("hard") sentences: num_hard = int(len(positives) * hard_negative_ratio).
    hard_negative_ratio: float = 0.1 # Crucial for preventing trivial solutions

    # Directory receiving checkpoints, the tokenizer and training_config.json.
    output_dir: str = "./models/neural_ranker"
    save_steps: int = 250
    eval_steps: int = 250
    logging_steps: int = 100
    load_best_model_at_end: bool = True
    metric_for_best_model: str = "eval_f1"
    greater_is_better: bool = True
    dataloader_num_workers: int = 2
    # Evaluated once, when the class body is executed - not per instance.
    fp16: bool = torch.cuda.is_available()
    gradient_accumulation_steps: int = 3
    max_grad_norm: float = 1.0

    # Data processing features
    # Sentences whose character length is not greater than this are dropped.
    # NOTE(review): the parse worker reads `TrainingConfig.min_sentence_length`
    # (the class attribute), so a per-instance override is not seen there.
    min_sentence_length: int = 20
    # The use_* flags switch the corresponding tag on or off in the structured
    # model input; use_entity_features also decides whether the NER pipeline
    # is loaded at all.
    use_hop_features: bool = True
    use_rhetorical_features: bool = True
    use_entity_features: bool = True
    use_reasoning_features: bool = True
    # Batch sizes for SBERT encoding and for the NER pipeline, respectively.
    embedding_batch_size: int = 128
    ner_batch_size: int = 64

@dataclass
class SentenceWithMetadata:
    # A candidate sentence together with the annotations used to build the
    # structured ranker input.

    # Required fields (positional order matters to callers).
    text: str
    doc_title: str
    doc_id: int
    sent_id: int
    entities: List[str]

    # Optional annotations with their defaults.
    rhetorical_role: str = "Background"
    hop_depth: int = 0
    has_reasoning_signals: bool = False
    is_causal: bool = False

# ==============================================================================
# --- HIGH-PERFORMANCE DATA PROCESSOR ---
# ==============================================================================
class HotpotQADataProcessor:
    rhetorical_patterns = {
        'Main Claim': ['argue', 'claim', 'assert', 'believe', 'conclude'],
        'Supporting Evidence': ['evidence', 'data', 'research', 'study', 'found'],
        'Expert Opinion': ['expert', 'according to', 'stated', 'opinion'],
    }

    def __init__(self, config: TrainingConfig, sbert_model: SentenceTransformer):
        self.config = config
        self.sbert_model = sbert_model
        if self.config.use_entity_features:
            logger.info("Loading GPU-accelerated Transformer NER pipeline...")
            self.ner_pipeline = hf_pipeline(
                "ner", model="dslim/bert-base-NER", tokenizer="dslim/bert-base-NER",
                device=0 if torch.cuda.is_available() else -1, grouped_entities=True
            )
        else:
            self.ner_pipeline = None

    @staticmethod
    def _initial_parse_worker(row: Dict) -> Optional[Dict]:
        try:
            query = row.get('query')
            if not isinstance(query, str) or not query.strip(): return None
            sp_titles = set(row.get('sp', []))
            raw_context = row.get('context', "")
            if not isinstance(raw_context, str) or not raw_context.strip(): return None
            docs_raw = re.split(r'\n\nTitle: ', raw_context)
            sentences = []
            if docs_raw and docs_raw[0].strip():
                first_doc = docs_raw[0].strip()
                if not first_doc.startswith("Title:"):
                    docs_raw[0] = "Title: " + first_doc
            for doc_part in docs_raw:
                if not doc_part or not doc_part.strip(): continue
                lines = doc_part.split('\n', 1)
                doc_title = lines[0].replace("Title: ", "").strip()
                doc_text = lines[1].strip() if len(lines) > 1 else ""
                for sent_text in sent_tokenize(doc_text):
                    if len(sent_text) > TrainingConfig.min_sentence_length:
                        sentences.append({
                            "text": sent_text, "doc_title": doc_title,
                            "is_positive_source": doc_title in sp_titles
                        })
            return {"query": query, "sentences": sentences}
        except Exception as e:
            logger.error(f"Critical error parsing row ID {row.get('query_id', 'N/A')}: {e}")
            return None

    def _batch_process_sentences(self, all_sentences: List[Dict], chunk_size: int = 20000):
        """Embed every sentence with SBERT and, if enabled, tag entities with NER.

        Args:
            all_sentences: Sentence dicts; only their ``text`` value is used.
            chunk_size: Number of sentences handed to each encode / NER call.

        Returns:
            ``(sentence_embeddings, all_entities)``: an array with one row per
            sentence and a parallel list of ``(word, entity_group)`` tuples.
            If encoding fails the embeddings are all zeros; a failed NER chunk
            leaves the entity lists of that chunk empty.
        """
        logger.info(f"Batch processing {len(all_sentences)} unique sentences (chunk_size={chunk_size})...")
        texts = [item['text'] for item in all_sentences]
        chunk_starts = range(0, len(texts), chunk_size)

        # --- SBERT embeddings -------------------------------------------------
        try:
            encoded_chunks = []
            for start in tqdm(chunk_starts, desc="SBERT Encoding Chunks"):
                encoded = self.sbert_model.encode(
                    texts[start:start + chunk_size],
                    batch_size=self.config.embedding_batch_size,
                    show_progress_bar=False,
                    convert_to_numpy=True,
                )
                encoded_chunks.append(encoded)
            if encoded_chunks:
                sentence_embeddings = np.vstack(encoded_chunks)
            else:
                sentence_embeddings = np.zeros((0, self.sbert_model.get_sentence_embedding_dimension()))
        except Exception as e:
            # Fall back to zero vectors so the rest of the pipeline can proceed.
            logger.error(f"SBERT encoding failed: {e}")
            width = self.sbert_model.get_sentence_embedding_dimension()
            sentence_embeddings = np.zeros((len(texts), width))

        # --- Named entities ---------------------------------------------------
        all_entities = [[] for _ in texts]
        if not self.ner_pipeline:
            return sentence_embeddings, all_entities

        logger.info("Running Transformer NER batch processing...")
        for start in tqdm(chunk_starts, desc="Transformer NER Chunks"):
            batch = texts[start:start + chunk_size]
            try:
                ner_output = self.ner_pipeline(batch, batch_size=self.config.ner_batch_size)
                for position, found in enumerate(ner_output, start=start):
                    all_entities[position] = [(hit['word'], hit['entity_group']) for hit in found]
            except Exception as e:
                logger.warning(f"NER chunk failed for sentences {start} to {start+len(batch)}: {e}")
        return sentence_embeddings, all_entities

    @staticmethod
    def _assemble_examples_worker(args: Tuple) -> List[Tuple[str, int, Dict]]:
        """Build balanced ``(input_text, label, metadata)`` examples for one parsed row.

        Every sentence is turned into a structured input string (feature tags,
        query, ``[SEP]``, sentence) and labelled with its ``is_positive_source``
        flag. All positives are kept; negatives are ranked by cosine similarity
        to the query and at most ``len(positives)`` of them are selected: the
        most similar ("hard") ones according to ``config.hard_negative_ratio``,
        the rest from the least similar end.

        Args:
            args: Tuple ``(parsed_row, query_embedding, sentence_data_map,
                config, embedding_dim)``. ``sentence_data_map`` maps sentence
                text to a dict with ``embedding`` and ``entities``; sentences
                missing from it get a zero vector and no entities.

        Returns:
            The positives followed by the selected negatives, or an empty list
            when the row has no sentences or no positive sentence.
        """
        parsed_row, query_embedding, sentence_data_map, config, embedding_dim = args
        query, sentences = parsed_row['query'], parsed_row['sentences']
        if not sentences:
            return []

        def _classify_rhetorical_role(text):
            # First role whose keyword list matches wins.
            text_lower = text.lower()
            for role, patterns in HotpotQADataProcessor.rhetorical_patterns.items():
                if any(p in text_lower for p in patterns):
                    return role
            return "Background_Information"

        def _detect_causal_language(text):
            text_lower = text.lower()
            return any(word in text_lower for word in ['cause', 'because', 'due to', 'result', 'lead to'])

        def _create_structured_input(q, s_meta):
            # Prefix the query with whichever feature tags the config enables.
            components = []
            if config.use_hop_features: components.append(f"[HOP:{s_meta.hop_depth}]")
            if config.use_rhetorical_features: components.append(f"[ROLE:{s_meta.rhetorical_role.replace(' ', '_')}]")
            if config.use_entity_features and s_meta.entities: components.append(f"[ENT:{','.join(s_meta.entities[:3])}]")
            if config.use_reasoning_features:
                if s_meta.is_causal: components.append("[CAUSAL]")
                if s_meta.has_reasoning_signals: components.append("[REASONING]")
            return f"{' '.join(components)} {q} [SEP] {s_meta.text}"

        positives, negatives = [], []
        sent_emb_list = [sentence_data_map.get(s['text'], {}).get('embedding', np.zeros(embedding_dim)) for s in sentences]
        sentence_embeddings = np.array(sent_emb_list)
        # Cosine similarity; the epsilon keeps all-zero vectors from dividing by zero.
        q_norm = np.linalg.norm(query_embedding) + 1e-9
        s_norms = np.linalg.norm(sentence_embeddings, axis=1) + 1e-9
        similarities = (sentence_embeddings @ query_embedding) / (s_norms * q_norm)
        for i, sentence in enumerate(sentences):
            sent_text = sentence['text']
            entry = sentence_data_map.get(sent_text, {})
            entities = entry.get('entities', [])
            is_causal = _detect_causal_language(sent_text)
            # hop_depth is not supplied here, so the dataclass default applies.
            sentence_meta = SentenceWithMetadata(
                text=sent_text, doc_title=sentence.get('doc_title', ''),
                entities=[e[0] for e in entities], rhetorical_role=_classify_rhetorical_role(sent_text),
                is_causal=is_causal, has_reasoning_signals=(len(entities) > 1 or is_causal),
                doc_id=sentence.get('doc_id', 0), sent_id=sentence.get('sent_idx', i)
            )
            structured_input = _create_structured_input(query, sentence_meta)
            metadata = {'similarity': float(similarities[i]), 'doc_title': sentence.get('doc_title', '')}
            is_positive = bool(sentence.get('is_positive_source', False))
            example = (structured_input, int(is_positive), metadata)
            (positives if is_positive else negatives).append(example)
        if not positives:
            return []

        negatives.sort(key=lambda x: x[2]['similarity'], reverse=True)
        num_positives = len(positives)
        num_hard = max(0, min(int(num_positives * config.hard_negative_ratio), num_positives))
        num_easy = num_positives - num_hard
        hard_negatives = negatives[:num_hard]
        # Take the easy negatives only from what is left after the hard ones, so
        # a short negative list cannot contribute the same example twice, and
        # guard the zero case: a bare `negatives[-0:]` would select *every*
        # negative instead of none.
        remaining = negatives[num_hard:]
        easy_negatives = remaining[-num_easy:] if num_easy > 0 else []
        return positives + hard_negatives + easy_negatives

    def process_dataset(self, dataset: List[Dict], num_workers: int) -> List[Tuple[str, int, Dict]]:
        """Turn raw dataset rows into training examples, caching the expensive stages.

        If every cache file exists in ``stage3_cache_transformer_ner`` the parsed
        rows, embeddings and entities are loaded from disk. Otherwise rows are
        parsed in a process pool, unique sentences and queries are embedded
        (and entity-tagged), and the results are written to the cache via
        temporary files. Finally the examples are assembled in a thread pool.

        Args:
            dataset: Raw rows, each a mapping as expected by
                ``_initial_parse_worker``.
            num_workers: Size of the parsing process pool and of the assembly
                thread pool.

        Returns:
            A flat list of ``(input_text, label, metadata)`` tuples; the order
            follows task completion and is therefore not deterministic.

        Raises:
            ValueError: If parsing leaves no row with at least one sentence.
        """
        cache_dir = "stage3_cache_transformer_ner"
        os.makedirs(cache_dir, exist_ok=True)
        cache_files = {"parsed_rows.json", "sentence_embeddings.npy", "sentence_texts.json", "sentence_meta.json", "query_embeddings.npy", "query_texts.json"}

        def cache_path(name: str) -> str:
            return os.path.join(cache_dir, name)

        if all(os.path.exists(cache_path(f)) for f in cache_files):
            logger.info("Loading Stage 3 cache to accelerate processing...")
            with open(cache_path("parsed_rows.json"), "r") as f: parsed_rows = json.load(f)
            sentence_embeddings = np.load(cache_path("sentence_embeddings.npy"))
            with open(cache_path("sentence_texts.json"), "r") as f: sentence_texts = json.load(f)
            with open(cache_path("sentence_meta.json"), "r") as f: sentence_meta = json.load(f)
            query_embeddings = np.load(cache_path("query_embeddings.npy"))
            with open(cache_path("query_texts.json"), "r") as f: query_texts = json.load(f)
            query_embedding_map = {q: emb for q, emb in zip(query_texts, query_embeddings)}
            sentence_data_map = {text: {"embedding": sentence_embeddings[i], "entities": meta.get("entities", [])} for i, (text, meta) in enumerate(zip(sentence_texts, sentence_meta))}
        else:
            logger.info("No complete cache found. Running full preprocessing pipeline...")
            with Pool(processes=num_workers) as pool:
                parsed_rows = list(tqdm(pool.imap(self._initial_parse_worker, dataset), total=len(dataset), desc="Stage 1: Parsing Rows"))
            parsed_rows = [row for row in parsed_rows if row and row.get('sentences')]
            if not parsed_rows: raise ValueError("Stage 1 parsing resulted in zero processable rows.")
            # De-duplicate by text so each sentence is embedded / tagged once.
            unique_sentences = {s['text']: s for row in parsed_rows for s in row['sentences']}
            unique_queries = sorted({row['query'] for row in parsed_rows})
            all_unique_sentences = list(unique_sentences.values())
            sentence_embeddings, all_entities = self._batch_process_sentences(all_unique_sentences)
            query_embeddings = self.sbert_model.encode([f"query: {q}" for q in unique_queries], batch_size=self.config.embedding_batch_size, show_progress_bar=True, convert_to_numpy=True)
            query_embedding_map = {q: emb for q, emb in zip(unique_queries, query_embeddings)}
            sentence_texts = [s['text'] for s in all_unique_sentences]
            sentence_meta = [{"entities": entities} for entities in all_entities]
            sentence_data_map = {text: {"embedding": sentence_embeddings[i], "entities": sentence_meta[i]["entities"]} for i, text in enumerate(sentence_texts)}

            # Write everything to "<name>.tmp" first and rename only once all
            # writes succeeded, so an interrupted run never leaves a cache that
            # looks complete.
            temp_files = {f: f"{f}.tmp" for f in cache_files}
            json_payloads = {
                "parsed_rows.json": parsed_rows,
                "sentence_texts.json": sentence_texts,
                "sentence_meta.json": sentence_meta,
                "query_texts.json": unique_queries,
            }
            array_payloads = {
                "sentence_embeddings.npy": sentence_embeddings,
                "query_embeddings.npy": query_embeddings,
            }
            try:
                for name, payload in json_payloads.items():
                    with open(cache_path(temp_files[name]), "w") as f:
                        json.dump(payload, f)
                for name, array in array_payloads.items():
                    # np.save appends ".npy" to a *path* that lacks that suffix,
                    # which would turn "x.npy.tmp" into "x.npy.tmp.npy" and make
                    # the rename below fail. Saving through an open file handle
                    # writes exactly to the temp file.
                    with open(cache_path(temp_files[name]), "wb") as f:
                        np.save(f, array)
                for final, temp in temp_files.items():
                    os.replace(cache_path(temp), cache_path(final))
                logger.info(f"Saved Stage 3 cache to '{cache_dir}'.")
            except Exception as e:
                # Caching is best-effort: log, remove leftovers and carry on
                # with the in-memory data.
                logger.error(f"Failed to write cache files: {e}. Cleaning up.")
                for temp in temp_files.values():
                    if os.path.exists(cache_path(temp)):
                        os.remove(cache_path(temp))

        logger.info("Stage 4: Assembling final training examples using threads...")
        all_examples = []
        assembly_args = [(row, query_embedding_map[row['query']], sentence_data_map, self.config, sentence_embeddings.shape[1]) for row in parsed_rows]
        with ThreadPoolExecutor(max_workers=num_workers) as executor:
            futures = [executor.submit(self._assemble_examples_worker, arg) for arg in assembly_args]
            for future in tqdm(as_completed(futures), total=len(futures), desc="Stage 4: Assembling Examples"):
                all_examples.extend(future.result())
        return all_examples

# ==============================================================================
# --- DATASET & TRAINER CLASSES ---
# ==============================================================================
class RankerDataset(Dataset):
    """Torch dataset that tokenises ``(text, label, metadata)`` examples on access."""

    def __init__(self, examples: List[Tuple[str, int, Dict]], tokenizer, max_length: int):
        self.examples = examples
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        # The metadata element is carried in the example but not fed to the model.
        sample_text, sample_label, _unused_meta = self.examples[idx]
        encoded = self.tokenizer(
            sample_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        # The tokenizer returns a leading batch axis; flatten() removes it.
        item = {field: encoded[field].flatten() for field in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(sample_label, dtype=torch.float)
        return item

class WeightedTrainer(Trainer):
    """Custom Trainer to apply a weight to the positive class in the loss function."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute a positive-class-weighted binary cross-entropy on the logits.

        Args:
            model: The model being trained (possibly wrapped by the Trainer).
            inputs: Batch mapping containing ``labels`` plus the model inputs.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted for Trainer API compatibility; unused.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs`` is set.
        """
        labels = inputs.get("labels")
        # Keep the labels out of the forward pass so the model does not also
        # compute its own built-in loss; `inputs` itself is left untouched.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get("logits")
        # pos_weight > 1 scales the loss on positive targets, i.e. missing a
        # positive (a false negative) costs 20% more than a false positive.
        # The tensor is created on the logits' device because a wrapped model
        # (e.g. nn.DataParallel) does not expose a `.device` attribute.
        pos_weight = torch.tensor([1.2], device=logits.device)
        loss_fct = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
        loss = loss_fct(logits.view(-1).float(), labels.view(-1).float())
        return (loss, outputs) if return_outputs else loss

class NeuralRankerTrainer:
    def __init__(self, config: TrainingConfig):
        """Load tokenizer, sentence encoder, data processor and ranker model.

        The feature-tag strings are registered as additional special tokens
        when the tokenizer vocabulary lacks them, and the model's embedding
        matrix is resized to the resulting tokenizer length.
        """
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(config.model_name)

        sbert_device = "cuda" if torch.cuda.is_available() else "cpu"
        self.sbert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=sbert_device)
        self.data_processor = HotpotQADataProcessor(config, self.sbert_model)

        # Tag strings emitted by the data processor's structured input.
        feature_tags = [
            '[HOP:0]', '[HOP:1]',
            '[ROLE:Main_Claim]', '[ROLE:Supporting_Evidence]',
            '[ROLE:Expert_Opinion]', '[ROLE:Background_Information]',
            '[ENT:', '[CAUSAL]', '[REASONING]',
        ]
        known_vocab = self.tokenizer.vocab
        missing_tags = [tag for tag in feature_tags if tag not in known_vocab]
        if missing_tags:
            self.tokenizer.add_special_tokens({'additional_special_tokens': missing_tags})

        self.model = AutoModelForSequenceClassification.from_pretrained(
            config.model_name, num_labels=1, problem_type="regression"
        )
        self.model.resize_token_embeddings(len(self.tokenizer))
        logger.info(f"Initialized model: {config.model_name}")

    def prepare_datasets(self, dataset_name: str, subset: str, train_split: str, max_train_samples: int, cache_file: str, num_workers: int):
        """Return ``(train_dataset, eval_dataset)`` built from cached or fresh examples.

        When ``cache_file`` exists the examples are read from that parquet
        file. Otherwise the dataset is loaded, optionally truncated to
        ``max_train_samples`` rows, processed, and written to ``cache_file``.
        The examples are then shuffled and split 90/10.

        Raises:
            RuntimeError: If processing produces no examples.
        """
        if not os.path.exists(cache_file):
            logger.info(f"Loading dataset: {dataset_name}, subset: {subset}")
            dataset = load_dataset(dataset_name, name=subset, split=train_split)
            if max_train_samples:
                keep = min(max_train_samples, len(dataset))
                dataset = dataset.select(range(keep))
            all_examples = self.data_processor.process_dataset(list(dataset), num_workers=num_workers)
            if not all_examples:
                raise RuntimeError("Data processing yielded zero examples. Aborting.")
            # Metadata dicts are stored as JSON strings in the parquet file.
            records = [
                {"text": text, "label": label, "metadata": json.dumps(meta)}
                for text, label, meta in all_examples
            ]
            pd.DataFrame(records).to_parquet(cache_file, index=False)
            logger.info(f"Saved preprocessed data to {cache_file}")
        else:
            logger.info(f"Loading preprocessed dataset from {cache_file}...")
            df = pd.read_parquet(cache_file)
            all_examples = [
                (text, int(label), json.loads(meta))
                for text, label, meta in zip(df['text'], df['label'], df['metadata'])
            ]

        np.random.shuffle(all_examples)
        split_point = int(0.9 * len(all_examples))
        train_examples = all_examples[:split_point]
        eval_examples = all_examples[split_point:]
        logger.info(f"Total examples: {len(all_examples)}, Train: {len(train_examples)}, Eval: {len(eval_examples)}")
        train_dataset = RankerDataset(train_examples, self.tokenizer, self.config.max_length)
        eval_dataset = RankerDataset(eval_examples, self.tokenizer, self.config.max_length)
        return (train_dataset, eval_dataset)

    def compute_metrics(self, eval_pred):
        """Compute classification metrics from raw logits and binary labels.

        Args:
            eval_pred: ``(predictions, labels)`` pair; predictions are logits,
                which are passed through a sigmoid and thresholded at 0.5.

        Returns:
            Dict with ``accuracy``, ``precision``, ``recall``, ``f1`` and
            ``auc``. ``auc`` is NaN when the labels contain a single class,
            because ROC AUC is undefined in that case.
        """
        predictions, labels = eval_pred
        scores = torch.sigmoid(torch.tensor(predictions)).numpy().flatten()
        preds_binary = (scores > 0.5).astype(int)
        # Labels are stored as floats for the BCE loss; the metrics want ints.
        labels = np.asarray(labels).flatten().astype(int)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds_binary, average='binary', zero_division=0)
        # roc_auc_score raises ValueError when only one class is present in the
        # labels (possible with very small evaluation sets), which would abort
        # the evaluation step; report NaN instead.
        if np.unique(labels).size > 1:
            auc = roc_auc_score(labels, scores)
        else:
            auc = float('nan')
        return {'accuracy': accuracy_score(labels, preds_binary), 'precision': precision, 'recall': recall, 'f1': f1, 'auc': auc}

    def train(self, train_dataset, eval_dataset):
        """Fine-tune the model, then save model, tokenizer and config.

        Args:
            train_dataset: Dataset used for optimisation.
            eval_dataset: Dataset evaluated every ``eval_steps`` steps.

        Returns:
            The ``WeightedTrainer`` instance after training.
        """
        cfg = self.config
        os.makedirs(cfg.output_dir, exist_ok=True)

        training_args = TrainingArguments(
            # Where to write and how long to train.
            output_dir=cfg.output_dir,
            num_train_epochs=cfg.num_epochs,
            # Batching.
            per_device_train_batch_size=cfg.batch_size,
            per_device_eval_batch_size=cfg.batch_size,
            gradient_accumulation_steps=cfg.gradient_accumulation_steps,
            # Optimiser schedule.
            learning_rate=cfg.learning_rate,
            weight_decay=cfg.weight_decay,
            warmup_ratio=cfg.warmup_ratio,
            # Logging, evaluation and checkpointing cadence.
            logging_steps=cfg.logging_steps,
            save_steps=cfg.save_steps,
            eval_strategy="steps",
            eval_steps=cfg.eval_steps,
            save_strategy="steps",
            # Best-checkpoint selection.
            load_best_model_at_end=cfg.load_best_model_at_end,
            metric_for_best_model=cfg.metric_for_best_model,
            greater_is_better=cfg.greater_is_better,
            # Runtime settings.
            fp16=cfg.fp16,
            dataloader_num_workers=cfg.dataloader_num_workers,
            max_grad_norm=cfg.max_grad_norm,
            report_to="none",
        )

        # Stop once the tracked metric has failed to improve for 7 evaluations.
        early_stopping = EarlyStoppingCallback(early_stopping_patience=7)
        trainer = WeightedTrainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=self.compute_metrics,
            callbacks=[early_stopping],
        )

        logger.info("Starting training...")
        trainer.train()
        trainer.save_model()
        self.tokenizer.save_pretrained(cfg.output_dir)

        # Keep the exact hyperparameters next to the saved model.
        config_path = os.path.join(cfg.output_dir, "training_config.json")
        with open(config_path, 'w') as f:
            json.dump(cfg.__dict__, f, indent=2)
        logger.info(f"Training completed. Model saved to {cfg.output_dir}")
        return trainer

# ==============================================================================
# --- MAIN EXECUTION BLOCK ---
# ==============================================================================
def main(model_name: str,
         output_dir: str,
         batch_size: int,
         learning_rate: float,
         num_epochs: int,
         max_train_samples: Optional[int],
         cache_file: str):
    """Main training function for notebook environments.

    Builds the training configuration, prepares the datasets, trains the
    ranker, runs a final evaluation and prints a short summary.

    Args:
        model_name: Checkpoint to fine-tune.
        output_dir: Directory for checkpoints and the final model.
        batch_size: Per-device batch size for training and evaluation.
        learning_rate: Optimiser learning rate.
        num_epochs: Number of training epochs.
        max_train_samples: Optional cap on the number of dataset rows;
            ``None`` (or 0) uses every row.
        cache_file: Parquet file used to cache the preprocessed examples.

    Returns:
        The metrics dict returned by the final ``trainer.evaluate()`` call.
    """
    # Leave two cores free for the main process; never go below one worker.
    num_workers = max(1, mp.cpu_count() - 2)
    config = TrainingConfig(
        model_name=model_name, output_dir=output_dir, batch_size=batch_size,
        learning_rate=learning_rate, num_epochs=num_epochs,
        dataloader_num_workers=num_workers, embedding_batch_size=128,
        ner_batch_size=64, hard_negative_ratio=0.1, weight_decay=0.05,
        warmup_ratio=0.15
    )
    trainer_wrapper = NeuralRankerTrainer(config)
    train_dataset, eval_dataset = trainer_wrapper.prepare_datasets(
        dataset_name="TIGER-Lab/LongRAG", subset="hotpot_qa", train_split="full",
        max_train_samples=max_train_samples, cache_file=cache_file, num_workers=num_workers
    )
    trainer = trainer_wrapper.train(train_dataset, eval_dataset)
    eval_results = trainer.evaluate()
    logger.info(f"Final evaluation metrics: {eval_results}")

    def _format_metric(key: str) -> str:
        # A missing metric must print as "N/A"; applying the ":.4f" format
        # spec to that string would raise ValueError.
        value = eval_results.get(key)
        if isinstance(value, (int, float)):
            return f"{value:.4f}"
        return "N/A"

    print("\n" + "="*50)
    print("TRAINING COMPLETED SUCCESSFULLY!")
    print(f"Model saved to: {config.output_dir}")
    print(f"Final F1 Score: {_format_metric('eval_f1')}")
    print(f"Final AUC Score: {_format_metric('eval_auc')}")
    print("="*50)
    return eval_results

if __name__ == "__main__":
    print("--- Starting STABLE Training for HotpotQA Base Model ---")
    # Settings for the base HotpotQA run, listed in main()'s parameter order.
    run_settings = dict(
        model_name="microsoft/deberta-v3-base",
        output_dir="./models/neural_ranker_hotpot_stable",
        batch_size=12,
        learning_rate=1.5e-5,
        num_epochs=6,
        max_train_samples=None,  # no cap: every row of the split is used
        cache_file="preprocessed_hotpotqa_stable.parquet",
    )
    main(**run_settings)

All old cache files have been deleted.
--- Starting STABLE Training for HotpotQA Base Model ---


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
250,0.7612,0.730126,0.536979,0.518922,0.996764,0.682519,0.83233
500,0.229,0.183538,0.950777,0.967467,0.932786,0.94981,0.98203
750,0.1546,0.175238,0.957862,0.977911,0.936769,0.956898,0.98768
1000,0.1683,0.127021,0.971535,0.965585,0.977844,0.971676,0.988273
1250,0.135,0.137986,0.970044,0.974849,0.964899,0.969849,0.988677
1500,0.1358,0.114189,0.975513,0.969749,0.981578,0.975628,0.990237
1750,0.1336,0.111739,0.9734,0.967314,0.979836,0.973535,0.991668
2000,0.1265,0.088794,0.978247,0.971295,0.985561,0.978376,0.991994
2250,0.1004,0.113417,0.973897,0.973861,0.973861,0.973861,0.991692
2500,0.081,0.101769,0.976259,0.96432,0.989047,0.976527,0.99227



TRAINING COMPLETED SUCCESSFULLY!
Model saved to: ./models/neural_ranker_hotpot_stable
Final F1 Score: 0.9814
Final AUC Score: 0.9932


In [None]:
#!/usr/bin/env python3
#!rm -f preprocessed_nq_full_dataset.parquet
"""
Neural Ranker Continual Finetuning Script
Adapts a pre-trained HotpotQA neural ranker to work on Natural Questions (NQ) subset.
This version includes a weighted loss function, tuned hyperparameters, and fixes for
API argument mismatches.
"""

import os
import re
import multiprocessing as mp
from multiprocessing import Pool
import shutil
from concurrent.futures import ThreadPoolExecutor, as_completed
import json
import logging
from typing import List, Dict, Tuple, Optional, Any
from dataclasses import dataclass
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, EarlyStoppingCallback,
    pipeline as hf_pipeline
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
import nltk
from nltk.tokenize import sent_tokenize

# Fetch the NLTK Punkt resources up front (quietly); `sent_tokenize`, imported
# above, is what later splits document text into sentences.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Configure logging once ("timestamp - level - message") and create the
# module-level logger for this cell.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# ==============================================================================
# --- CONFIGURATION ---
# ==============================================================================

@dataclass
class ContinualTrainingConfig:
    """Hyperparameters and feature switches for the continual-finetuning run."""
    # Defaults to the directory the HotpotQA base run above saves its model to.
    model_name: str = "./models/neural_ranker_hotpot_stable"
    max_length: int = 512
    batch_size: int = 12
    learning_rate: float = 2e-6
    num_epochs: int = 4
    warmup_ratio: float = 0.2
    weight_decay: float = 0.05
    output_dir: str = "./models/neural_ranker_hotpot_nq"
    save_steps: int = 250
    eval_steps: int = 250
    logging_steps: int = 50
    load_best_model_at_end: bool = True
    metric_for_best_model: str = "eval_f1"
    greater_is_better: bool = True
    dataloader_num_workers: int = 2
    # Evaluated once, when the class body is executed - not per instance.
    fp16: bool = torch.cuda.is_available()
    gradient_accumulation_steps: int = 3
    max_grad_norm: float = 1.0
    hard_negative_ratio: float = 0.1
    # Sentences whose character length is not greater than this are skipped.
    # NOTE(review): the parse worker reads this through the class
    # (`ContinualTrainingConfig.min_sentence_length`), so a per-instance
    # override is not seen there.
    min_sentence_length: int = 20
    use_hop_features: bool = True
    use_rhetorical_features: bool = True
    # Also decides whether AdaptiveDataProcessor loads the NER pipeline.
    use_entity_features: bool = True
    use_reasoning_features: bool = True
    embedding_batch_size: int = 128
    ner_batch_size: int = 64

@dataclass
class SentenceWithMetadata:
    # A candidate sentence together with the annotations used to build the
    # structured ranker input.

    # Required fields (positional order matters to callers).
    text: str
    doc_title: str
    doc_id: int
    sent_id: int
    entities: List[str]

    # Optional annotations with their defaults.
    rhetorical_role: str = "Background"
    hop_depth: int = 0
    has_reasoning_signals: bool = False
    is_causal: bool = False

# ==============================================================================
# --- ADAPTIVE DATA PROCESSOR ---
# ==============================================================================
class AdaptiveDataProcessor:
    """Build labelled ranker examples from rows holding a query, answers and a titled context.

    Processing happens in stages: rows are parsed into sentences and positives
    are marked; unique sentences and queries are embedded with SBERT and
    (optionally) tagged by a transformer NER pipeline; the intermediate
    artefacts are cached on disk; finally feature-tagged inputs are assembled
    together with sampled negatives.
    """

    # Keyword cues for a coarse rhetorical role; the first role with a matching cue wins.
    rhetorical_patterns = {
        'Main Claim': ['argue', 'claim', 'assert', 'believe', 'conclude'],
        'Supporting Evidence': ['evidence', 'data', 'research', 'study', 'found'],
        'Expert Opinion': ['expert', 'according to', 'stated', 'opinion'],
    }

    def __init__(self, config: ContinualTrainingConfig, sbert_model: SentenceTransformer):
        """Store the config and SBERT model; load the NER pipeline only if entity features are on."""
        self.config = config
        self.sbert_model = sbert_model
        if self.config.use_entity_features:
            # `aggregation_strategy="simple"` is the documented replacement for the
            # deprecated `grouped_entities=True`; results keep the 'word' and
            # 'entity_group' keys consumed in _batch_process_sentences.
            self.ner_pipeline = hf_pipeline(
                "ner",
                model="dslim/bert-base-NER",
                device=0 if torch.cuda.is_available() else -1,
                aggregation_strategy="simple",
            )
        else:
            self.ner_pipeline = None

    @staticmethod
    def _adaptive_parse_worker(row: Dict, min_sentence_length: Optional[int] = None) -> Optional[Dict]:
        """Split one raw row into sentences and flag the positive ones.

        Args:
            row: Mapping with a 'query' string, an optional 'answer' list (only
                the first entry is used), an optional 'sp' collection of
                supporting document titles and a 'context' string whose
                documents are separated by "\\n\\nTitle: ".
            min_sentence_length: Sentences whose character length is <= this
                value are dropped. ``None`` falls back to the class-level
                default of ``ContinualTrainingConfig``.

        Returns:
            ``{"query": ..., "sentences": [...]}`` where every sentence dict has
            'text', 'doc_title' and a boolean 'is_positive_source', or ``None``
            when the row is unusable or contains no positive sentence.
        """
        if min_sentence_length is None:
            min_sentence_length = ContinualTrainingConfig.min_sentence_length
        try:
            query = row.get('query')
            if not isinstance(query, str) or not query.strip():
                return None

            answer_text = ""
            answers = row.get('answer')
            if isinstance(answers, list) and answers:
                answer_text = answers[0].lower().strip()

            # A row may carry 'sp' explicitly set to None; treat that as "no titles"
            # instead of letting set(None) raise and silently drop the row.
            supporting = row.get('sp')
            sp_titles = set(supporting) if supporting is not None else set()
            if not answer_text and not sp_titles:
                return None

            raw_context = row.get('context', "")
            if not isinstance(raw_context, str) or not raw_context.strip():
                return None

            docs_raw = re.split(r'\n\nTitle: ', raw_context)
            if docs_raw and docs_raw[0].strip():
                # Normalise the first document so it starts with a "Title:" header
                # line (and carries no leading whitespace), like the split-off rest.
                first_doc = docs_raw[0].strip()
                docs_raw[0] = first_doc if first_doc.startswith("Title:") else "Title: " + first_doc

            sentences = []
            positive_count = 0
            for doc_part in docs_raw:
                if not doc_part or not doc_part.strip():
                    continue
                lines = doc_part.split('\n', 1)
                title_line = lines[0]
                # Strip only a leading marker; a blanket .replace() would also eat
                # "Title: " occurring inside a real title and break the 'sp' lookup.
                if title_line.startswith("Title: "):
                    title_line = title_line[len("Title: "):]
                doc_title = title_line.strip()
                doc_text = lines[1].strip() if len(lines) > 1 else ""
                in_supporting_doc = doc_title in sp_titles
                for sent_text in sent_tokenize(doc_text):
                    if len(sent_text) <= min_sentence_length:
                        continue
                    # bool() matters: the bare and/or chain can evaluate to "" (empty
                    # answer), which later breaks int(...) when labels are built.
                    is_positive = bool(
                        in_supporting_doc
                        or (answer_text and answer_text in sent_text.lower())
                    )
                    if is_positive:
                        positive_count += 1
                    sentences.append({"text": sent_text, "doc_title": doc_title, "is_positive_source": is_positive})
            return {"query": query, "sentences": sentences} if positive_count > 0 else None
        except Exception as e:
            # Best-effort parsing: a malformed row is skipped, but leave a trace.
            logger.debug("Skipping unparsable row: %s", e)
            return None

    def _batch_process_sentences(self, all_sentences: List[Dict], chunk_size: int = 20000):
        """Embed all sentences with SBERT and extract entities with the NER pipeline.

        Args:
            all_sentences: Sentence dicts; only their 'text' entry is read.
            chunk_size: Number of sentences handed to SBERT / NER per call.

        Returns:
            ``(sentence_embeddings, all_entities)``: a 2-D array with one row per
            sentence (all zeros if SBERT encoding failed) and, per sentence, a
            list of ``(word, entity_group)`` pairs (empty when NER is disabled or
            the chunk containing the sentence failed).
        """
        sentence_texts = [s['text'] for s in all_sentences]
        embedding_dim = self.sbert_model.get_sentence_embedding_dimension
        embeddings_list = []
        try:
            for i in tqdm(range(0, len(sentence_texts), chunk_size), desc="SBERT Encoding"):
                chunk = sentence_texts[i:i + chunk_size]
                embeddings_list.append(
                    self.sbert_model.encode(
                        chunk,
                        batch_size=self.config.embedding_batch_size,
                        show_progress_bar=False,
                        convert_to_numpy=True,
                    )
                )
            sentence_embeddings = np.vstack(embeddings_list) if embeddings_list else np.zeros((0, embedding_dim()))
        except Exception as e:
            # Deliberate degradation: keep the run alive with zero vectors.
            logger.error(f"SBERT encoding failed: {e}")
            sentence_embeddings = np.zeros((len(sentence_texts), embedding_dim()))

        all_entities = [[] for _ in sentence_texts]
        if self.ner_pipeline:
            for i in tqdm(range(0, len(sentence_texts), chunk_size), desc="Transformer NER"):
                chunk_texts = sentence_texts[i:i + chunk_size]
                try:
                    results = self.ner_pipeline(chunk_texts, batch_size=self.config.ner_batch_size)
                    for j, ner_result in enumerate(results):
                        all_entities[i + j] = [(entity['word'], entity['entity_group']) for entity in ner_result]
                except Exception as e:
                    # A failed chunk simply leaves its sentences without entities.
                    logger.warning(f"NER chunk failed: {e}")
        return sentence_embeddings, all_entities

    @staticmethod
    def _assemble_examples_worker(args: Tuple) -> List[Tuple[str, int, Dict]]:
        """Create the training examples for a single parsed query.

        Args:
            args: ``(parsed_row, query_embedding, sentence_data_map, config,
                embedding_dim)``. ``sentence_data_map`` maps sentence text to a
                dict with 'embedding' and 'entities'.

        Returns:
            All positive examples followed by the selected negatives, each as
            ``(structured_input, label, metadata)``; an empty list when the row
            has no sentences or no positives. Negatives are ranked by cosine
            similarity to the query: the top ``int(n_pos * hard_negative_ratio)``
            are taken as hard negatives and the remainder is sampled at evenly
            spaced ranks, never using the same negative twice.
        """
        parsed_row, query_embedding, sentence_data_map, config, embedding_dim = args
        query, sentences = parsed_row['query'], parsed_row['sentences']
        if not sentences:
            return []

        def _classify_rhetorical_role(text):
            text_lower = text.lower()
            for role, patterns in AdaptiveDataProcessor.rhetorical_patterns.items():
                if any(p in text_lower for p in patterns):
                    return role
            return "Background_Information"

        def _detect_causal_language(text):
            return any(word in text.lower() for word in ['cause', 'because', 'due to', 'result', 'lead to'])

        def _create_structured_input(q, s_meta):
            components = []
            if config.use_hop_features:
                components.append(f"[HOP:{s_meta.hop_depth}]")
            if config.use_rhetorical_features:
                components.append(f"[ROLE:{s_meta.rhetorical_role.replace(' ', '_')}]")
            if config.use_entity_features and s_meta.entities:
                components.append(f"[ENT:{','.join(s_meta.entities[:3])}]")
            if config.use_reasoning_features:
                if s_meta.is_causal:
                    components.append("[CAUSAL]")
                if s_meta.has_reasoning_signals:
                    components.append("[REASONING]")
            return f"{' '.join(components)} {q} [SEP] {s_meta.text}"

        # Cosine similarity of every sentence to the query; the epsilon keeps the
        # all-zero fallback embeddings from dividing by zero.
        sent_emb_list = [sentence_data_map.get(s['text'], {}).get('embedding', np.zeros(embedding_dim)) for s in sentences]
        sentence_embeddings = np.array(sent_emb_list)
        q_norm = np.linalg.norm(query_embedding) + 1e-9
        s_norms = np.linalg.norm(sentence_embeddings, axis=1) + 1e-9
        similarities = (sentence_embeddings @ query_embedding) / (s_norms * q_norm)

        positives, negatives = [], []
        for i, sentence in enumerate(sentences):
            entry = sentence_data_map.get(sentence['text'], {})
            entities = entry.get('entities', [])
            is_causal = _detect_causal_language(sentence['text'])
            # hop_depth is not set here, so the [HOP:n] tag always carries the default 0.
            sentence_meta = SentenceWithMetadata(
                text=sentence['text'],
                doc_title=sentence.get('doc_title', ''),
                entities=[e[0] for e in entities],
                rhetorical_role=_classify_rhetorical_role(sentence['text']),
                is_causal=is_causal,
                has_reasoning_signals=(len(entities) > 1 or is_causal),
                doc_id=0,
                sent_id=i,
            )
            structured_input = _create_structured_input(query, sentence_meta)
            metadata = {'similarity': float(similarities[i]), 'doc_title': sentence.get('doc_title', '')}
            # bool() first so a non-boolean flag (e.g. "" from an older cache) maps to 0
            # instead of raising in int().
            is_positive = bool(sentence.get('is_positive_source', False))
            example = (structured_input, int(is_positive), metadata)
            (positives if is_positive else negatives).append(example)
        if not positives:
            return []

        negatives.sort(key=lambda x: x[2]['similarity'], reverse=True)
        num_hard = int(len(positives) * config.hard_negative_ratio)
        selected_negatives = negatives[:num_hard]
        tail = negatives[num_hard:]
        # Cap at the number of remaining negatives: asking linspace for more points
        # than there are candidates repeats indices and duplicates examples.
        num_easy = min(len(positives) - num_hard, len(tail))
        if num_easy > 0:
            indices = np.unique(np.linspace(0, len(tail) - 1, num_easy).astype(int))
            selected_negatives.extend(tail[idx] for idx in indices)
        return positives + selected_negatives

    def process_dataset(self, dataset: List[Dict], num_workers: int, cache_prefix: str) -> List[Tuple[str, int, Dict]]:
        """Run the full preprocessing pipeline, reusing the on-disk cache when complete.

        Args:
            dataset: Raw rows as accepted by ``_adaptive_parse_worker``.
            num_workers: Process count for parsing and thread count for assembly.
            cache_prefix: Suffix of the cache directory ``stage3_cache_<prefix>``.

        Returns:
            All assembled ``(structured_input, label, metadata)`` examples.

        Raises:
            ValueError: If parsing leaves no usable rows.
        """
        from functools import partial  # local import: only needed for the parsing stage

        cache_dir = f"stage3_cache_{cache_prefix}"
        os.makedirs(cache_dir, exist_ok=True)
        cache_files = {"parsed_rows.json", "sentence_embeddings.npy", "sentence_texts.json", "sentence_meta.json", "query_embeddings.npy", "query_texts.json"}
        if all(os.path.exists(os.path.join(cache_dir, f)) for f in cache_files):
            logger.info(f"Loading {cache_prefix.upper()} cache...")
            with open(os.path.join(cache_dir, "parsed_rows.json"), "r") as f:
                parsed_rows = json.load(f)
            sentence_embeddings = np.load(os.path.join(cache_dir, "sentence_embeddings.npy"))
            with open(os.path.join(cache_dir, "sentence_texts.json"), "r") as f:
                sentence_texts = json.load(f)
            with open(os.path.join(cache_dir, "sentence_meta.json"), "r") as f:
                sentence_meta = json.load(f)
            query_embeddings = np.load(os.path.join(cache_dir, "query_embeddings.npy"))
            with open(os.path.join(cache_dir, "query_texts.json"), "r") as f:
                query_texts = json.load(f)
            query_embedding_map = {q: emb for q, emb in zip(query_texts, query_embeddings)}
            sentence_data_map = {text: {"embedding": sentence_embeddings[i], "entities": meta.get("entities", [])} for i, (text, meta) in enumerate(zip(sentence_texts, sentence_meta))}
        else:
            logger.info(f"No complete {cache_prefix.upper()} cache found. Running full preprocessing...")
            # Forward the configured threshold; the worker otherwise only sees the class default.
            parse_row = partial(self._adaptive_parse_worker, min_sentence_length=self.config.min_sentence_length)
            with Pool(processes=num_workers) as pool:
                parsed_rows = list(tqdm(pool.imap(parse_row, dataset), total=len(dataset), desc=f"Stage 1: Parsing {cache_prefix.upper()}"))
            parsed_rows = [row for row in parsed_rows if row]
            if not parsed_rows:
                raise ValueError("Stage 1 parsing resulted in zero processable rows.")

            # Embed / tag every distinct sentence and query exactly once.
            unique_sentences = {s['text']: s for row in parsed_rows for s in row['sentences']}
            unique_queries = sorted({row['query'] for row in parsed_rows})
            all_unique_sentences = list(unique_sentences.values())
            sentence_embeddings, all_entities = self._batch_process_sentences(all_unique_sentences)
            query_embeddings = self.sbert_model.encode([f"query: {q}" for q in unique_queries], batch_size=self.config.embedding_batch_size, show_progress_bar=True, convert_to_numpy=True)
            query_embedding_map = {q: emb for q, emb in zip(unique_queries, query_embeddings)}
            sentence_texts = [s['text'] for s in all_unique_sentences]
            sentence_meta = [{"entities": entities} for entities in all_entities]
            sentence_data_map = {text: {"embedding": sentence_embeddings[i], "entities": sentence_meta[i]["entities"]} for i, text in enumerate(sentence_texts)}

            # Write everything to *.tmp first and rename afterwards, so the cache is
            # only picked up on a later run when all files were written.
            temp_files = {name: f"{name}.tmp" for name in cache_files}
            json_payloads = {
                "parsed_rows.json": parsed_rows,
                "sentence_texts.json": sentence_texts,
                "sentence_meta.json": sentence_meta,
                "query_texts.json": unique_queries,
            }
            array_payloads = {
                "sentence_embeddings.npy": sentence_embeddings,
                "query_embeddings.npy": query_embeddings,
            }
            try:
                for name, payload in json_payloads.items():
                    with open(os.path.join(cache_dir, temp_files[name]), "w") as f:
                        json.dump(payload, f)
                for name, array in array_payloads.items():
                    # Hand np.save an open file object: given a *path* that does not end
                    # in ".npy" it appends the extension, so "<name>.npy.tmp" would be
                    # written as "<name>.npy.tmp.npy" and the rename below would fail.
                    with open(os.path.join(cache_dir, temp_files[name]), "wb") as f:
                        np.save(f, array)
                for final, temp in temp_files.items():
                    os.replace(os.path.join(cache_dir, temp), os.path.join(cache_dir, final))
                logger.info(f"Saved {cache_prefix.upper()} cache to '{cache_dir}'.")
            except Exception as e:
                # Caching is best-effort; the in-memory results are still used below.
                logger.error(f"Failed to write cache files: {e}. Cleaning up.")
                for temp in temp_files.values():
                    temp_path = os.path.join(cache_dir, temp)
                    if os.path.exists(temp_path):
                        os.remove(temp_path)

        embedding_dim = sentence_embeddings.shape[1]
        assembly_args = []
        for row in parsed_rows:
            query_embedding = query_embedding_map.get(row['query'])
            if query_embedding is not None:
                assembly_args.append((row, query_embedding, sentence_data_map, self.config, embedding_dim))

        all_examples = []
        with ThreadPoolExecutor(max_workers=num_workers) as executor:
            futures = [executor.submit(self._assemble_examples_worker, arg) for arg in assembly_args]
            for future in tqdm(as_completed(futures), total=len(futures), desc=f"Stage 4: Assembling {cache_prefix.upper()}"):
                all_examples.extend(future.result())
        return all_examples

# ==============================================================================
# --- DATASET & TRAINER CLASSES ---
# ==============================================================================
class RankerDataset(Dataset):
    """Torch dataset that tokenizes ``(text, label, metadata)`` examples on access."""

    def __init__(self, examples: List[Tuple[str, int, Dict]], tokenizer, max_length: int):
        self.examples = examples
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        """Return flattened ``input_ids``/``attention_mask`` and a float ``labels`` tensor."""
        text, label, _ = self.examples[idx]
        encoded = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        # The tokenizer output is flattened to 1-D so the default collate can stack items.
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(label, dtype=torch.float)
        return item

class WeightedTrainer(Trainer):
    """Custom Trainer that scores the logits with a positive-class-weighted BCE loss."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute ``BCEWithLogitsLoss`` between the flattened logits and labels.

        Args:
            model: The (possibly wrapped) model being trained.
            inputs: Batch dict; must contain "labels". The labels stay in
                ``inputs``, so the model call receives them as well, but only
                its logits are used for the loss returned here.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted for signature compatibility with the
                base class; not used.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # pos_weight > 1 multiplies the loss term of *positive* (label 1) examples,
        # i.e. a missed positive (false negative) costs 20% more than a false positive.
        # NOTE(review): an earlier comment here said this weight teaches the model to
        # "say no"; that would require pos_weight < 1 -- confirm the intended direction.
        # The device is taken from the logits: `model` may be an nn.DataParallel
        # wrapper, which has no `.device` attribute.
        pos_weight = torch.tensor([1.2], device=logits.device)
        loss_fct = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
        loss = loss_fct(logits.view(-1), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

class ContinualNeuralRankerTrainer:
    """Load a previously trained ranker checkpoint and continue finetuning it on new data."""

    def __init__(self, config: ContinualTrainingConfig):
        """Load SBERT, the data processor, and the tokenizer/model from ``config.model_name``.

        Feature-tag tokens that the tokenizer does not know yet are registered as
        additional special tokens and the model's embedding matrix is resized.
        """
        self.config = config
        self.sbert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cuda" if torch.cuda.is_available() else "cpu")
        self.data_processor = AdaptiveDataProcessor(config, self.sbert_model)
        logger.info(f"Loading pre-trained model from: {config.model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(config.model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(config.model_name, num_labels=1, problem_type="regression")
        special_tokens = ['[HOP:0]', '[HOP:1]', '[ROLE:Main_Claim]', '[ROLE:Supporting_Evidence]', '[ROLE:Expert_Opinion]', '[ROLE:Background_Information]', '[ENT:', '[CAUSAL]', '[REASONING]']
        # get_vocab() is the tokenizer-agnostic accessor and includes tokens added by
        # an earlier training stage; `.vocab` is not available on every tokenizer class.
        known_tokens = self.tokenizer.get_vocab()
        new_tokens = [token for token in special_tokens if token not in known_tokens]
        if new_tokens:
            self.tokenizer.add_special_tokens({'additional_special_tokens': new_tokens})
            self.model.resize_token_embeddings(len(self.tokenizer))

    def prepare_datasets(self, dataset_name: str, subset: str, train_split: str, max_train_samples: int, cache_file: str, num_workers: int):
        """Return ``(train_dataset, eval_dataset)`` built from the cache file or from scratch.

        Args:
            dataset_name: Name passed to ``load_dataset``.
            subset: Dataset configuration name; also used as the cache prefix.
            train_split: Split passed to ``load_dataset``.
            max_train_samples: If truthy, only the first N rows are processed.
            cache_file: Parquet file holding the fully assembled examples.
            num_workers: Worker count forwarded to the data processor.

        Raises:
            RuntimeError: If preprocessing yields no examples.
        """
        if os.path.exists(cache_file):
            logger.info(f"Loading preprocessed dataset from {cache_file}...")
            df = pd.read_parquet(cache_file)
            # Column-wise iteration gives the same tuples as iterrows() without
            # building a Series per row.
            all_examples = [
                (text, int(label), json.loads(metadata))
                for text, label, metadata in zip(df['text'], df['label'], df['metadata'])
            ]
        else:
            logger.info(f"Loading dataset: {dataset_name}, subset: {subset}")
            dataset = load_dataset(dataset_name, name=subset, split=train_split)
            if max_train_samples:
                dataset = dataset.select(range(min(max_train_samples, len(dataset))))
            all_examples = self.data_processor.process_dataset(list(dataset), num_workers=num_workers, cache_prefix=subset)
            if not all_examples:
                raise RuntimeError("Data processing yielded zero examples. Aborting.")
            df_to_cache = pd.DataFrame([{"text": t, "label": l, "metadata": json.dumps(m)} for t, l, m in all_examples])
            df_to_cache.to_parquet(cache_file, index=False)
            logger.info(f"Saved preprocessed data to {cache_file}")
        # NOTE(review): the shuffle is not seeded here, so the 90/10 split differs
        # between runs and examples of one query can land on both sides.
        np.random.shuffle(all_examples)
        split_point = int(0.9 * len(all_examples))
        train_examples, eval_examples = all_examples[:split_point], all_examples[split_point:]
        logger.info(f"Total examples: {len(all_examples)}, Train: {len(train_examples)}, Eval: {len(eval_examples)}")
        return (RankerDataset(train_examples, self.tokenizer, self.config.max_length),
                RankerDataset(eval_examples, self.tokenizer, self.config.max_length))

    def compute_metrics(self, eval_pred):
        """Turn raw logits into accuracy / precision / recall / F1 / AUC.

        Logits are passed through a sigmoid and thresholded at 0.5. AUC is
        reported as NaN when the labels contain a single class, because ROC AUC
        is undefined in that case and must not abort the evaluation step.
        """
        predictions, labels = eval_pred
        scores = torch.sigmoid(torch.tensor(predictions)).numpy().flatten()
        preds_binary = (scores > 0.5).astype(int)
        labels = labels.flatten()
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds_binary, average='binary', zero_division=0)
        if np.unique(labels).size < 2:
            auc = float('nan')
        else:
            auc = roc_auc_score(labels, scores)
        return {'accuracy': accuracy_score(labels, preds_binary), 'precision': precision, 'recall': recall, 'f1': f1, 'auc': auc}

    def train(self, train_dataset, eval_dataset):
        """Finetune the model, then save model, tokenizer and config to ``output_dir``.

        Returns:
            The ``WeightedTrainer`` instance after training, so the caller can
            run a final evaluation.
        """
        os.makedirs(self.config.output_dir, exist_ok=True)
        training_args = TrainingArguments(
            output_dir=self.config.output_dir,
            num_train_epochs=self.config.num_epochs,
            per_device_train_batch_size=self.config.batch_size,
            per_device_eval_batch_size=self.config.batch_size,
            gradient_accumulation_steps=self.config.gradient_accumulation_steps,
            learning_rate=self.config.learning_rate,
            weight_decay=self.config.weight_decay,
            warmup_ratio=self.config.warmup_ratio,
            logging_steps=self.config.logging_steps,
            save_steps=self.config.save_steps,
            eval_strategy="steps",
            eval_steps=self.config.eval_steps,
            save_strategy="steps",
            load_best_model_at_end=self.config.load_best_model_at_end,
            metric_for_best_model=self.config.metric_for_best_model,
            greater_is_better=self.config.greater_is_better,
            fp16=self.config.fp16,
            dataloader_num_workers=self.config.dataloader_num_workers,
            max_grad_norm=self.config.max_grad_norm,
            report_to="none"
        )
        trainer = WeightedTrainer(
            model=self.model, args=training_args, train_dataset=train_dataset,
            eval_dataset=eval_dataset, compute_metrics=self.compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
        )
        logger.info("Starting continual finetuning...")
        trainer.train()
        trainer.save_model()
        self.tokenizer.save_pretrained(self.config.output_dir)
        with open(os.path.join(self.config.output_dir, "continual_training_config.json"), 'w') as f:
            json.dump(self.config.__dict__, f, indent=2)
        logger.info(f"Continual finetuning completed. Model saved to {self.config.output_dir}")
        return trainer

# ==============================================================================
# --- MAIN EXECUTION FUNCTION ---
# ==============================================================================
def continual_finetune(pretrained_model_path: str,
                      output_dir: str,
                      dataset_subset: str,
                      batch_size: int,
                      learning_rate: float,
                      num_epochs: int,
                      max_train_samples: Optional[int],
                      cache_file: str,
                      gradient_accumulation_steps: int):
    """Continue finetuning an existing ranker checkpoint on a LongRAG subset.

    Args:
        pretrained_model_path: Checkpoint to start from.
        output_dir: Directory the finetuned model is written to.
        dataset_subset: Configuration name of "TIGER-Lab/LongRAG" (e.g. "nq").
        batch_size: Per-device train/eval batch size.
        learning_rate: Learning rate for finetuning.
        num_epochs: Number of training epochs.
        max_train_samples: Optional cap on the number of raw rows; ``None``
            uses the whole split.
        cache_file: Parquet file for the preprocessed examples.
        gradient_accumulation_steps: Steps to accumulate before an optimizer update.

    Returns:
        The metrics dict from the final ``trainer.evaluate()`` call.
    """

    def _format_metric(value) -> str:
        # A missing or non-numeric metric prints as "N/A"; applying ".4f" to the
        # "N/A" placeholder itself raises ValueError.
        try:
            return f"{float(value):.4f}"
        except (TypeError, ValueError):
            return "N/A"

    logger.info("="*80)
    logger.info(f"STARTING CONTINUAL FINETUNING ON '{dataset_subset.upper()}' SUBSET")
    logger.info("="*80)

    # Leave two cores free for the main process / system, but never go below one worker.
    num_workers = max(1, mp.cpu_count() - 2)
    config = ContinualTrainingConfig(
        model_name=pretrained_model_path,
        output_dir=output_dir,
        batch_size=batch_size,
        learning_rate=learning_rate,
        num_epochs=num_epochs,
        gradient_accumulation_steps=gradient_accumulation_steps,
        dataloader_num_workers=num_workers,
        embedding_batch_size=128,
        ner_batch_size=64
    )

    trainer_wrapper = ContinualNeuralRankerTrainer(config)

    train_dataset, eval_dataset = trainer_wrapper.prepare_datasets(
        dataset_name="TIGER-Lab/LongRAG",
        subset=dataset_subset,
        train_split="full",
        max_train_samples=max_train_samples,
        cache_file=cache_file,
        num_workers=num_workers
    )

    trainer = trainer_wrapper.train(train_dataset, eval_dataset)
    eval_results = trainer.evaluate()
    logger.info(f"Final evaluation metrics on {dataset_subset.upper()}: {eval_results}")

    print("\n" + "="*80)
    print("CONTINUAL FINETUNING COMPLETED SUCCESSFULLY!")
    print(f"Original model: {pretrained_model_path}")
    print(f"Enhanced model saved to: {config.output_dir}")
    print(f"Final F1 Score on {dataset_subset.upper()}: {_format_metric(eval_results.get('eval_f1'))}")
    print(f"Final AUC Score on {dataset_subset.upper()}: {_format_metric(eval_results.get('eval_auc'))}")
    print("Your neural ranker now handles both complex reasoning AND factual questions!")
    print("="*80)

    return eval_results

# ==============================================================================
# --- MAIN EXECUTION BLOCK ---
# ==============================================================================
if __name__ == "__main__":
    # Settings for the second-stage run: start from the HotpotQA ranker and
    # continue finetuning on the NQ subset.
    run_settings = dict(
        # Checkpoints
        pretrained_model_path="./models/neural_ranker_hotpot_stable",  # The successfully trained HotpotQA model
        output_dir="./models/neural_ranker_hotpot_nq_final3",
        # Data
        dataset_subset="nq",
        max_train_samples=None,  # Use the entire NQ dataset
        cache_file="preprocessed_nq_full_dataset.parquet",
        # Optimisation
        num_epochs=2,
        batch_size=12,
        gradient_accumulation_steps=3,
        learning_rate=2e-6,
    )
    continual_finetune(**run_settings)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
250,0.6844,0.647508,0.804555,0.761579,0.883506,0.818024,0.871173
500,0.4511,0.436985,0.834447,0.793565,0.901564,0.844124,0.898016
750,0.4079,0.38441,0.858207,0.820498,0.914997,0.865174,0.918643
1000,0.3525,0.348749,0.872112,0.834822,0.926007,0.878054,0.931231
1250,0.3562,0.319746,0.883609,0.851171,0.92821,0.888023,0.940842
1500,0.337,0.305844,0.888974,0.850248,0.942744,0.89411,0.945165
1750,0.3316,0.292743,0.892916,0.847881,0.956177,0.898779,0.94926
2000,0.3292,0.276657,0.901237,0.865582,0.94869,0.905232,0.954524
2250,0.2779,0.271594,0.904413,0.881765,0.932834,0.906581,0.958086
2500,0.2767,0.265102,0.905836,0.869801,0.953314,0.909645,0.958486



CONTINUAL FINETUNING COMPLETED SUCCESSFULLY!
Original model: ./models/neural_ranker_hotpot_stable
Enhanced model saved to: ./models/neural_ranker_hotpot_nq_final3
Final F1 Score on NQ: 0.9182
Final AUC Score on NQ: 0.9644
Your neural ranker now handles both complex reasoning AND factual questions!
