## 1. Fine-Tuning

In [None]:
import os
# Set before torch is imported: "-1" hides every GPU from CUDA, so the
# training section runs on CPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
# Disable the tokenizers library's own thread parallelism.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# CUDA caching-allocator hint; presumably has no effect while GPUs are hidden.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import sys
import subprocess
def _ensure(p):
    try:
        __import__(p)
    except Exception:
        subprocess.check_call([sys.executable, "-m", "pip", "install", p, "-q"])
# Install each third-party dependency that cannot be imported yet.
for _p in ["torch", "transformers", "peft", "tqdm"]:
    _ensure(_p)

import json
from pathlib import Path
from dataclasses import dataclass
from typing import Union, List, Dict, Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModel, DataCollatorForLanguageModeling, get_linear_schedule_with_warmup
from peft import LoraConfig, get_peft_model, PeftModel
from tqdm.auto import tqdm

from peft import LoraConfig, TaskType
from peft import PeftModel, PeftConfig, TaskType
from transformers import AutoModelForMaskedLM

# Monkey-patch torch so every later CUDA probe reports "no GPU": the
# availability check, the initialisation check and the stream-capture check
# are all replaced by functions returning False.
# NOTE: the patch is process-wide, so code that runs later in the same kernel
# (e.g. pick_device("cuda") in the evaluation section) also sees no CUDA.
if hasattr(torch, "cuda"):
    try:
        import torch.cuda.graphs as _cg
        torch.cuda.is_available = lambda : False
        torch.cuda.is_initialized = lambda : False
        _cg.is_current_stream_capturing = lambda : False
    except Exception:
        # Best effort: any failure while patching is ignored.
        pass

In [None]:
@dataclass
class Cfg:
    """Paths and hyper-parameters for the two-stage (MLM, then contrastive) run."""

    # --- data / model locations ---
    # Directory scanned for *.json files by the datasets.
    qa_dir: str = "/kaggle/input/province-dataset"
    # Checkpoint id passed to from_pretrained for tokenizer and model.
    base_model: str = "cis-lmu/glot500-base"
    # Root output directory; stages write to <out_dir>/mlm and <out_dir>/final.
    out_dir: str = "/kaggle/working/glot500-2stage"
    # Padding granularity handed to the contrastive collator.
    pad_to_multiple_of: int = 8

    # --- stage 1: masked language modelling ---
    # Tokenizer truncation length.
    mlm_max_len: int = 128
    # Micro-batch size of the DataLoader.
    mlm_batch: int = 4
    mlm_epochs: int = 2
    mlm_lr: float = 2e-4
    # Fraction of scheduler steps used for linear warm-up.
    mlm_warmup_ratio: float = 0.06
    # Masking probability given to DataCollatorForLanguageModeling.
    mlm_mask_prob: float = 0.15
    # Early stopping: optimizer steps without improvement before stopping.
    mlm_patience: int = 30
    # Early stopping: minimum loss decrease that counts as an improvement.
    mlm_min_delta: float = 1e-4

    # --- stage 2: contrastive training ---
    ctr_max_len: int = 192
    ctr_batch: int = 4
    ctr_epochs: int = 1
    ctr_lr: float = 2e-4
    ctr_warmup_ratio: float = 0.06
    # Similarity logits are divided by this value before cross-entropy.
    temperature: float = 0.05
    # Output width of the projection head.
    proj_dim: int = 128

    # --- LoRA ---
    lora_r: int = 16
    lora_alpha: int = 32
    lora_dropout: float = 0.05

In [None]:
class QAPairDataset(Dataset):
    """(question, passage) pairs used by the contrastive stage.

    Every ``*.json`` file directly under ``qa_dir`` is expected to contain a
    list of records shaped like
    ``{"text": ..., "qa_pairs": [{"question": ...}, ...]}``.  Each non-empty
    question is paired with the stripped ``text`` of its record; no other
    field of a QA pair is read.

    Files are visited in sorted order so sample indices are reproducible
    across runs.  Loading is best-effort: unreadable files, non-list
    payloads, non-dict records and non-string fields are skipped.

    Args:
        qa_dir: Directory containing the JSON files.
        max_len: Stored on the instance; not used inside this class.
    """

    def __init__(self, qa_dir: Union[str, Path], max_len: int = 192):
        self.samples: List[Tuple[str, str]] = []
        self.max_len = max_len
        for fp in sorted(Path(qa_dir).glob("*.json")):
            try:
                arr = json.loads(fp.read_text(encoding="utf-8"))
            except (OSError, ValueError):
                # ValueError covers both JSON and UTF-8 decoding errors.
                continue
            if not isinstance(arr, list):
                continue
            for rec in arr:
                if not isinstance(rec, dict):
                    continue
                base_text = self._clean(rec.get("text"))
                if not base_text:
                    continue
                for qa in rec.get("qa_pairs") or []:
                    if not isinstance(qa, dict):
                        continue
                    q = self._clean(qa.get("question"))
                    if q:
                        self.samples.append((q, base_text))

    @staticmethod
    def _clean(value) -> str:
        """Return ``value`` stripped, or ``""`` when it is not a string."""
        return value.strip() if isinstance(value, str) else ""

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        q, a = self.samples[idx]
        return {"question": q, "answer": a}

In [None]:
class MLMDataset(Dataset):
    """De-duplicated passages and questions for masked-LM training.

    Collects the stripped ``text`` of every record and the stripped
    ``question`` of every QA pair found in the ``*.json`` files directly
    under ``qa_dir``.  Each distinct string is kept once.

    Files are read in sorted order and strings keep first-seen order, so
    ``texts`` is identical across runs (a plain ``set`` would not guarantee
    that).  Loading is best-effort: unreadable files and malformed records
    are skipped.
    """

    def __init__(self, qa_dir: Union[str, Path]):
        # dict keys give de-duplication with stable insertion order.
        seen: dict = {}
        for fp in sorted(Path(qa_dir).glob("*.json")):
            try:
                arr = json.loads(fp.read_text(encoding="utf-8"))
            except (OSError, ValueError):
                # ValueError covers both JSON and UTF-8 decoding errors.
                continue
            if not isinstance(arr, list):
                continue
            for rec in arr:
                if not isinstance(rec, dict):
                    continue
                t = self._clean(rec.get("text"))
                if t:
                    seen.setdefault(t, None)
                for qa in rec.get("qa_pairs") or []:
                    if not isinstance(qa, dict):
                        continue
                    q = self._clean(qa.get("question"))
                    if q:
                        seen.setdefault(q, None)
        self.texts = list(seen)

    @staticmethod
    def _clean(value) -> str:
        """Return ``value`` stripped, or ``""`` when it is not a string."""
        return value.strip() if isinstance(value, str) else ""

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return {"text": self.texts[idx]}

In [None]:
def mean_pool(h, m):
    """Masked mean of ``h`` over dimension 1, weighted by mask ``m``.

    ``m`` gets a trailing axis and is cast to ``h``'s type before weighting.
    The divisor is clamped at 1e-6, so a row whose mask is all zeros yields
    zeros instead of NaN.
    """
    weights = m.unsqueeze(-1).type_as(h)
    weighted_sum = torch.sum(h * weights, dim=1)
    weight_total = torch.clamp(torch.sum(weights, dim=1), min=1e-6)
    return weighted_sum / weight_total

In [None]:
class ContrastiveModel(nn.Module):
    """LoRA-adapted encoder plus projection head for contrastive training.

    The backbone is the masked-LM base model with the stage-1 adapter loaded
    on top; the head is a two-layer MLP (hidden -> hidden // 2 -> proj_dim).

    Args:
        base_model_name: Checkpoint id of the base model.
        proj_dim: Output width of the projection head.
        mlm_adapter_dir: Directory holding the stage-1 PEFT adapter.
        trainable: Forwarded to ``PeftModel.from_pretrained`` as
            ``is_trainable``.  It defaults to True because PEFT loads
            adapters frozen (inference mode) unless told otherwise, which
            would leave only the projection head training in stage 2.
    """

    def __init__(self, base_model_name: str, proj_dim: int, mlm_adapter_dir: str,
                 trainable: bool = True):
        super().__init__()

        # Warn (but continue) when the adapter was trained on another base.
        acfg = PeftConfig.from_pretrained(mlm_adapter_dir)
        if acfg.base_model_name_or_path != base_model_name:
            print(f"⚠️ Adapter was trained on {acfg.base_model_name_or_path}, "
                  f"but you're loading {base_model_name}.")

        base = AutoModelForMaskedLM.from_pretrained(base_model_name)
        if hasattr(base, "gradient_checkpointing_enable"):
            base.gradient_checkpointing_enable()
        if hasattr(base.config, "use_cache"):
            base.config.use_cache = False

        # is_trainable keeps the LoRA weights differentiable; the default
        # (False) would freeze them.
        self.backbone = PeftModel.from_pretrained(
            base, mlm_adapter_dir, is_trainable=trainable
        )
        hidden = self.backbone.config.hidden_size

        self.proj = nn.Sequential(
            nn.Linear(hidden, hidden // 2),
            nn.GELU(),
            nn.Linear(hidden // 2, proj_dim),
        )

    def forward(self, input_ids, attention_mask):
        """Return L2-normalised projections of the mean-pooled last hidden state."""
        out = self.backbone(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            return_dict=True
        )
        hidden = out.hidden_states[-1]
        pooled = mean_pool(hidden, attention_mask)
        z = F.normalize(self.proj(pooled), dim=-1)
        return z

In [None]:
class ContrastiveCollator:
    """Turn a list of ``{"question", "answer"}`` samples into padded tensors.

    Questions and answers are tokenised separately with the same settings
    (truncation to ``max_len``, padding to a multiple of ``pad_to_multiple_of``).
    """

    def __init__(self, tokenizer: AutoTokenizer, max_len: int = 192, pad_to_multiple_of: int = 8):
        self.tok = tokenizer
        self.max_len = max_len
        self.pad = pad_to_multiple_of

    def _encode(self, texts):
        """Tokenise ``texts`` with the collator's shared settings."""
        return self.tok(
            texts,
            padding=True,
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
            pad_to_multiple_of=self.pad,
        )

    def __call__(self, batch: List[Dict]) -> Dict[str, torch.Tensor]:
        questions = [sample["question"] for sample in batch]
        answers = [sample["answer"] for sample in batch]
        q_enc = self._encode(questions)
        a_enc = self._encode(answers)
        return {
            "q_ids": q_enc["input_ids"],
            "q_mask": q_enc["attention_mask"],
            "a_ids": a_enc["input_ids"],
            "a_mask": a_enc["attention_mask"],
        }

In [None]:
def lora_cfg(r, a, d):
    """Build a ``LoraConfig`` with rank ``r``, alpha ``a`` and dropout ``d``.

    The adapter targets modules named query/key/value/dense, trains no bias
    terms and uses the feature-extraction task type.
    """
    settings = {
        "r": r,
        "lora_alpha": a,
        "lora_dropout": d,
        "bias": "none",
        "target_modules": ["query", "key", "value", "dense"],
        "task_type": TaskType.FEATURE_EXTRACTION,
    }
    return LoraConfig(**settings)

In [None]:
def train_mlm(cfg: Cfg, tok: AutoTokenizer, lcfg: LoraConfig) -> str:
    """Stage 1: LoRA fine-tuning of the base model with a masked-LM objective.

    Trains on CPU with gradient accumulation (8 micro-batches per optimizer
    step).  Training stops early once the mean loss per optimizer step has
    failed to improve by more than ``cfg.mlm_min_delta`` for
    ``cfg.mlm_patience`` consecutive optimizer steps.

    Args:
        cfg: Run configuration (paths, MLM hyper-parameters).
        tok: Tokenizer of the base model; also saved to ``cfg.out_dir``.
        lcfg: LoRA configuration applied to the base model.

    Returns:
        Path of ``<out_dir>/mlm``.  When no training text is found the
        directory is created but no adapter is written.
    """
    ds = MLMDataset(cfg.qa_dir)
    print(f"mlm texts: {len(ds)}")
    if len(ds) == 0:
        Path(cfg.out_dir).mkdir(parents=True, exist_ok=True)
        tok.save_pretrained(cfg.out_dir)
        p = Path(cfg.out_dir) / "mlm"
        p.mkdir(parents=True, exist_ok=True)
        return str(p)

    coll = DataCollatorForLanguageModeling(tokenizer=tok, mlm=True, mlm_probability=cfg.mlm_mask_prob)

    def collate(feats):
        # Tokenise without padding; the MLM collator pads and applies masking.
        texts = [f["text"] for f in feats]
        t = tok(texts, truncation=True, max_length=cfg.mlm_max_len, return_special_tokens_mask=True)
        return coll([{"input_ids": i, "special_tokens_mask": m} for i, m in zip(t["input_ids"], t["special_tokens_mask"])])

    dl = DataLoader(ds, batch_size=cfg.mlm_batch, shuffle=True, num_workers=0, pin_memory=False, collate_fn=collate)

    dev = torch.device("cpu")
    model = AutoModelForMaskedLM.from_pretrained(cfg.base_model)
    if hasattr(model, "gradient_checkpointing_enable"):
        model.gradient_checkpointing_enable()
    if hasattr(model.config, "use_cache"):
        model.config.use_cache = False
    model = get_peft_model(model, lcfg)
    model.to(dev)
    print(sum("lora_A" in n for n, _ in model.named_parameters()), "LoRA A params")

    ga = 8  # micro-batches accumulated per optimizer step

    # The scheduler advances once per optimizer step, so its horizon must be
    # counted in optimizer steps.  Counting micro-batches would stretch the
    # warm-up `ga`-fold and the learning rate would never finish decaying.
    micro_total = cfg.mlm_epochs * len(dl)
    steps = max(1, (micro_total + ga - 1) // ga)
    warm = int(cfg.mlm_warmup_ratio * steps)
    trainable = [p for p in model.parameters() if p.requires_grad]
    opt = torch.optim.AdamW(trainable, lr=cfg.mlm_lr, weight_decay=0.01, foreach=False)
    sch = get_linear_schedule_with_warmup(opt, num_warmup_steps=warm, num_training_steps=steps)

    def _optimizer_step() -> None:
        """Apply accumulated gradients and advance the LR schedule."""
        opt.step()
        opt.zero_grad(set_to_none=True)
        sch.step()

    model.train()

    best_loss = float("inf")
    no_improve = 0
    stopped_early = False

    accum_loss = 0.0     # sum of raw losses in the current accumulation group
    micro_in_batch = 0   # micro-batches seen in the current accumulation group

    for e in range(cfg.mlm_epochs):
        pbar = tqdm(dl, total=len(dl), desc=f"mlm {e+1}/{cfg.mlm_epochs}", leave=True)
        for b in pbar:
            b = {k: v.to(dev) for k, v in b.items()}
            out = model(**b)
            # Scale so the accumulated gradient is the mean over the group.
            (out.loss / ga).backward()

            raw_loss = float(out.loss.item())
            accum_loss += raw_loss
            micro_in_batch += 1

            if micro_in_batch < ga:
                pbar.set_postfix(loss=raw_loss)
                continue

            _optimizer_step()
            mean_step_loss = accum_loss / max(1, micro_in_batch)
            accum_loss = 0.0
            micro_in_batch = 0

            improved = (best_loss - mean_step_loss) > cfg.mlm_min_delta
            if improved:
                best_loss = mean_step_loss
                no_improve = 0
            else:
                no_improve += 1

            pbar.set_postfix(loss=mean_step_loss, best=best_loss, patience=f"{no_improve}/{cfg.mlm_patience}")

            if no_improve >= cfg.mlm_patience:
                print(f"⏹️ Early stopping MLM: no improvement for {cfg.mlm_patience} steps. Best loss={best_loss:.6f}")
                stopped_early = True
                break

        if stopped_early:
            break

    # Apply gradients from a trailing, incomplete accumulation group instead
    # of discarding them.  They were scaled by 1/ga, so rescale to the mean
    # over the micro-batches actually seen.
    if not stopped_early and micro_in_batch > 0:
        scale = ga / micro_in_batch
        for p in trainable:
            if p.grad is not None:
                p.grad.mul_(scale)
        _optimizer_step()

    d = Path(cfg.out_dir) / "mlm"
    model.save_pretrained(str(d))
    tok.save_pretrained(cfg.out_dir)
    del model, opt, sch, dl
    return str(d)

In [None]:
def train_ctr(cfg: Cfg, tok: AutoTokenizer, mlm_dir: str) -> str:
    """Stage 2: contrastive training of question/passage embeddings.

    Uses a symmetric in-batch cross-entropy over temperature-scaled
    similarities: the i-th question should match the i-th passage and vice
    versa.  Runs on CPU with gradient accumulation (16 micro-batches per
    optimizer step).

    Args:
        cfg: Run configuration (paths, contrastive hyper-parameters).
        tok: Tokenizer of the base model; saved next to the adapter.
        mlm_dir: Directory of the stage-1 adapter to start from.

    Returns:
        Path of ``<out_dir>/final`` holding the adapter (or ``backbone.pt``),
        ``projection_head.pt`` and the tokenizer.  When no pair is found only
        the tokenizer is written.
    """
    ds = QAPairDataset(cfg.qa_dir, max_len=cfg.ctr_max_len)
    print(f"contrastive pairs: {len(ds)}")
    if len(ds) > 0:
        print(f"sample Q: {ds.samples[0][0]}")
        print(f"sample A: {ds.samples[0][1][:120]} ...")
    if len(ds) == 0:
        p = Path(cfg.out_dir) / "final"
        p.mkdir(parents=True, exist_ok=True)
        tok.save_pretrained(str(p))
        return str(p)

    coll = ContrastiveCollator(tok, max_len=cfg.ctr_max_len, pad_to_multiple_of=cfg.pad_to_multiple_of)
    dl = DataLoader(ds, batch_size=cfg.ctr_batch, shuffle=True, num_workers=0, pin_memory=False, collate_fn=coll)
    dev = torch.device("cpu")
    enc = ContrastiveModel(cfg.base_model, cfg.proj_dim, mlm_dir)
    enc.to(dev)
    enc.train()

    # Make a frozen backbone visible: an adapter loaded in inference mode
    # leaves only the projection head trainable.
    if not any(p.requires_grad for p in enc.backbone.parameters()):
        print("⚠️ No backbone parameter requires grad; only the projection head will be trained.")

    ga = 16  # micro-batches accumulated per optimizer step

    # The scheduler advances once per optimizer step, so its horizon must be
    # counted in optimizer steps.  Counting micro-batches would stretch the
    # warm-up `ga`-fold and the learning rate would never finish decaying.
    micro_total = cfg.ctr_epochs * len(dl)
    steps = max(1, (micro_total + ga - 1) // ga)
    warm = int(cfg.ctr_warmup_ratio * steps)
    trainable = [p for p in enc.parameters() if p.requires_grad]
    opt = torch.optim.AdamW(trainable, lr=cfg.ctr_lr, weight_decay=0.01, foreach=False)
    sch = get_linear_schedule_with_warmup(opt, num_warmup_steps=warm, num_training_steps=steps)

    def _optimizer_step() -> None:
        """Apply accumulated gradients and advance the LR schedule."""
        opt.step()
        opt.zero_grad(set_to_none=True)
        sch.step()

    pending = 0  # micro-batches accumulated since the last optimizer step
    for e in range(cfg.ctr_epochs):
        pbar = tqdm(dl, total=len(dl), desc=f"contrastive {e+1}/{cfg.ctr_epochs}", leave=True)
        for b in pbar:
            q_ids = b["q_ids"].to(dev)
            q_mask = b["q_mask"].to(dev)
            a_ids = b["a_ids"].to(dev)
            a_mask = b["a_mask"].to(dev)
            qz = enc(q_ids, q_mask)
            az = enc(a_ids, a_mask)
            sims = (qz @ az.t()) / cfg.temperature
            # Row i of `sims` should peak at column i, in both directions.
            y = torch.arange(sims.size(0), device=dev)
            loss = (F.cross_entropy(sims, y) + F.cross_entropy(sims.t(), y)) / 2.0
            (loss / ga).backward()
            pending += 1
            if pending == ga:
                _optimizer_step()
                pending = 0
            pbar.set_postfix(loss=float(loss.item()))

    # Apply gradients from a trailing, incomplete accumulation group instead
    # of discarding them; rescale from 1/ga to the mean over what was seen.
    if pending > 0:
        scale = ga / pending
        for p in trainable:
            if p.grad is not None:
                p.grad.mul_(scale)
        _optimizer_step()

    p = Path(cfg.out_dir) / "final"
    p.mkdir(parents=True, exist_ok=True)
    if isinstance(enc.backbone, PeftModel):
        enc.backbone.save_pretrained(str(p))
    else:
        torch.save(enc.backbone.state_dict(), str(p / "backbone.pt"))
    torch.save(enc.proj.state_dict(), str(p / "projection_head.pt"))
    tok.save_pretrained(str(p))
    del enc, opt, sch, dl
    return str(p)

In [None]:
def main():
    """Run both stages: MLM adaptation first, then contrastive training."""
    settings = Cfg()
    Path(settings.out_dir).mkdir(parents=True, exist_ok=True)

    tokenizer = AutoTokenizer.from_pretrained(settings.base_model, use_fast=True)
    if not tokenizer.pad_token:
        # Fall back to the EOS token, or a literal "[PAD]" if there is none.
        tokenizer.pad_token = tokenizer.eos_token or "[PAD]"

    lora = lora_cfg(settings.lora_r, settings.lora_alpha, settings.lora_dropout)
    adapter_dir = train_mlm(settings, tokenizer, lora)
    train_ctr(settings, tokenizer, adapter_dir)

In [None]:
# Run the full two-stage training pipeline when this cell executes.
main()

mlm texts: 9011
73 LoRA A params


mlm 1/2:   0%|          | 0/2253 [00:00<?, ?it/s]

⏹️ Early stopping MLM: no improvement for 30 steps. Best loss=2.285179
contrastive pairs: 7550
sample Q: گنبدهای نمکی در کدام نواحی استان فارس قرار دارند؟
sample A: گنبدهای نمکی ساختارهای زمین‌شناسی گنبدی‌شکل با هستهٔ مرکزی نمک هستند که در نواحی جنوبی و شرقی استان فارس، به‌ویژه در منط ...


contrastive 1/1:   0%|          | 0/1888 [00:00<?, ?it/s]

## 2. Evaluation

In [None]:
from __future__ import annotations
import json
from dataclasses import dataclass
from pathlib import Path
from typing import List, Tuple, Dict, Any
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
from peft import PeftModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

from transformers import AutoTokenizer, AutoModelForMaskedLM
from peft import PeftModel
import torch
import torch.nn as nn
import torch.nn.functional as F
from pathlib import Path

In [None]:
# Evaluation questions; each item is read for "id", "question" and "answer".
eval_json_path = "/kaggle/input/evaluation/evaluation_selected.json"
# Candidate passages; the "text" field of each record is used.
merged_json_path = "/kaggle/input/evaluation/merged.json"
# Where the per-question top-k results are written.
out_json_path = "/kaggle/working/eval_top3_results.json"
base_model_id = "cis-lmu/glot500-base"
# Output directory of the contrastive stage (adapter, tokenizer, projection head).
lora_dir = "/kaggle/working/glot500-2stage/final"
# Tokenizer truncation length used by both embedders.
max_length = 128
# Requested device; pick_device falls back to "cpu" when CUDA is unavailable.
device = "cuda"
# Number of candidate texts encoded per forward pass.
batch_size = 64
# Number of candidates kept per question and method.
top_k = 3
# TF-IDF baseline: character n-grams of length 3 to 5.
tfidf_analyzer = "char"
tfidf_ngram_min = 3
tfidf_ngram_max = 5

In [None]:
def pick_device(name: str | None) -> str:
    """Resolve a requested device name to one that is actually usable.

    ``"cpu"``, ``"cuda"`` and ``"mps"`` (case-insensitive) are honoured when
    available and otherwise fall back to ``"cpu"``.  ``None``, ``""`` and any
    other value mean "auto": CUDA first, then MPS, then CPU.
    """

    def _mps_ready() -> bool:
        return hasattr(torch.backends, "mps") and torch.backends.mps.is_available()

    requested = (name or "auto").lower()

    if requested == "cpu":
        return "cpu"
    if requested == "mps":
        return "mps" if _mps_ready() else "cpu"
    if requested == "cuda":
        return "cuda" if torch.cuda.is_available() else "cpu"

    # Auto-detection, in order of preference.
    if torch.cuda.is_available():
        return "cuda"
    return "mps" if _mps_ready() else "cpu"

In [None]:
@dataclass
class EmbedConfig:
    """Settings shared by the embedders in this section."""

    # Checkpoint id of the base model.
    base_model_id: str
    # Directory with the fine-tuned adapter, tokenizer and projection head.
    model_dir: str
    # Tokenizer truncation length.
    max_length: int
    # Requested device name; resolved through pick_device (None means auto).
    device: str | None

class _BaseEmbedder:
    """Holds the state common to the embedders: tokenizer, max length, device."""

    def __init__(self, tokenizer: AutoTokenizer, max_length: int, device: str):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.device = device

In [None]:
class Glot500ZeroShot(_BaseEmbedder):
    """Baseline embedder: the base checkpoint without any fine-tuning.

    A text is represented by the L2-normalised last-layer hidden state of
    its first token.
    """

    def __init__(self, cfg: EmbedConfig):
        resolved = pick_device(cfg.device)
        base_tokenizer = AutoTokenizer.from_pretrained(cfg.base_model_id, use_fast=True)
        super().__init__(base_tokenizer, cfg.max_length, resolved)

        encoder = AutoModel.from_pretrained(cfg.base_model_id, add_pooling_layer=False)
        self.model = encoder.to(resolved)
        self.model.eval()

    @torch.inference_mode()
    def embed_texts(self, texts: List[str]) -> torch.Tensor:
        """Return one unit-length embedding per input text."""
        encoded = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        encoded = encoded.to(self.device)
        outputs = self.model(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            output_hidden_states=True,
            return_dict=True,
        )
        first_token = outputs.hidden_states[-1][:, 0]
        return F.normalize(first_token, dim=-1)

In [None]:
class Glot500LoRA(_BaseEmbedder):
    """Embedder built from the fine-tuned adapter and its projection head.

    A text is represented by the masked mean of the last hidden layer,
    passed through the projection MLP and L2-normalised.
    """

    def __init__(self, cfg: EmbedConfig):
        resolved = pick_device(cfg.device)
        tuned_tokenizer = AutoTokenizer.from_pretrained(cfg.model_dir, use_fast=True)
        super().__init__(tuned_tokenizer, cfg.max_length, resolved)

        # Base masked-LM weights with the saved adapter applied on top.
        mlm_base = AutoModelForMaskedLM.from_pretrained(cfg.base_model_id)
        self.model: PeftModel = PeftModel.from_pretrained(mlm_base, cfg.model_dir).to(resolved)
        self.model.eval()

        # Rebuild the projection head with the layer widths found in the
        # saved state dict, then load the weights into it.
        head_state = torch.load(Path(cfg.model_dir) / "projection_head.pt", map_location="cpu")
        in_width = self.model.config.hidden_size
        mid_width = head_state["0.weight"].shape[0]
        out_width = head_state["2.weight"].shape[0]

        head = nn.Sequential(
            nn.Linear(in_width, mid_width),
            nn.GELU(),
            nn.Linear(mid_width, out_width),
        )
        head.load_state_dict(head_state)
        self.proj = head.to(resolved).eval()

    @torch.inference_mode()
    def embed_texts(self, texts: List[str]) -> torch.Tensor:
        """Return one unit-length projected embedding per input text."""
        encoded = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        ).to(self.device)

        outputs = self.model(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            output_hidden_states=True,
            return_dict=True,
        )
        last_hidden = outputs.hidden_states[-1]

        # Mean over real tokens only; padding is zeroed out by the mask.
        token_mask = encoded["attention_mask"].unsqueeze(-1).float()
        summed = (last_hidden * token_mask).sum(1)
        counts = token_mask.sum(1).clamp(min=1e-6)
        pooled = summed / counts

        return F.normalize(self.proj(pooled), dim=-1)

In [None]:
class TFIDFBaseline:
    """Lexical retrieval baseline: TF-IDF vectors ranked by cosine similarity."""

    def __init__(self, analyzer="char", ngram_range=(3, 5), min_df=1):
        self.answer_matrix = None  # populated by fit()
        self.vectorizer = TfidfVectorizer(
            analyzer=analyzer,
            ngram_range=ngram_range,
            min_df=min_df,
            lowercase=False,
        )

    def fit(self, questions, candidates):
        """Fit the vocabulary on questions plus candidates; index the candidates."""
        corpus = questions + candidates
        self.vectorizer.fit(corpus)
        self.answer_matrix = self.vectorizer.transform(candidates)

    def topk(self, question, k=3):
        """Return up to ``k`` ``(candidate_index, similarity)`` pairs, best first."""
        query_vec = self.vectorizer.transform([question])
        scores = cosine_similarity(query_vec, self.answer_matrix).ravel()
        ranked = scores.argsort()[::-1][:k]
        return [(int(pos), float(scores[pos])) for pos in ranked]

In [None]:
def embed_all_with_progress(texts, embedder, batch_size=64, desc="Encode"):
    """Embed ``texts`` in slices of ``batch_size`` behind a progress bar.

    Each slice goes through ``embedder.embed_texts``; the per-slice results
    are concatenated along dimension 0.
    """
    starts = range(0, len(texts), batch_size)
    chunks = [
        embedder.embed_texts(texts[begin : begin + batch_size])
        for begin in tqdm(starts, desc=desc)
    ]
    return torch.cat(chunks, dim=0)

def topk_from_precomputed(q_emb, cand_embs, k=3):
    """Rank candidates by dot product with the query embedding.

    Returns at most ``k`` ``(candidate_index, score)`` pairs, best first;
    ``k`` is capped at the number of candidates.
    """
    scores = torch.matmul(cand_embs, q_emb.T).squeeze(-1)
    keep = min(k, cand_embs.size(0))
    best = torch.topk(scores, k=keep)
    positions = best.indices.tolist()
    values = best.values.tolist()
    return [(int(pos), float(val)) for pos, val in zip(positions, values)]

def load_eval(path):
    """Parse the UTF-8 JSON file at ``path`` and return its content."""
    with open(path, encoding="utf-8") as handle:
        raw = handle.read()
    return json.loads(raw)

def load_candidates_from_merged(merged_path):
    """Load the non-empty candidate texts from a merged JSON file.

    The file may be a list of records, or a dict that holds such a list
    under ``"records"``, ``"items"`` or ``"data"``.  The ``"text"`` field of
    each dict record is stripped and kept when non-empty.  Non-dict records
    and missing or ``null`` texts are skipped; other non-string values are
    stringified.  The number of texts found is printed.

    Args:
        merged_path: Path of the JSON file.

    Returns:
        Candidate texts in file order.

    Raises:
        ValueError: If the JSON has an unsupported shape or contains no
            non-empty text.
    """
    data = json.loads(Path(merged_path).read_text(encoding="utf-8"))

    if isinstance(data, list):
        records = data
    elif isinstance(data, dict):
        records = data.get("records") or data.get("items") or data.get("data")
        if not isinstance(records, list):
            raise ValueError("merged.json is a dict but doesn't contain a list under known keys.")
    else:
        raise ValueError("merged.json must be a list or dict containing a list.")

    texts = []
    for rec in records:
        if not isinstance(rec, dict):
            continue
        raw = rec.get("text")
        if raw is None:
            # str(None) would otherwise produce the bogus candidate "None".
            continue
        cleaned = str(raw).strip()
        if cleaned:
            texts.append(cleaned)

    if not texts:
        raise ValueError("No non-empty 'text' fields found in merged.json.")
    print(len(texts))
    return texts

def save_json(rows, out_path):
    """Write ``rows`` as indented UTF-8 JSON, creating parent directories.

    Non-ASCII characters are written as-is rather than escaped.
    """
    target = Path(out_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(rows, ensure_ascii=False, indent=2)
    with target.open("w", encoding="utf-8") as handle:
        handle.write(payload)

In [None]:
# Load the evaluation questions and the candidate pool.
items = load_eval(eval_json_path)
questions = [entry["question"] for entry in items]
candidate_texts = load_candidates_from_merged(merged_json_path)

# Lexical baseline.
tfidf = TFIDFBaseline(analyzer=tfidf_analyzer, ngram_range=(tfidf_ngram_min, tfidf_ngram_max))
tfidf.fit(questions, candidate_texts)

# Neural retrievers: untouched base model vs. fine-tuned adapter.
cfg = EmbedConfig(base_model_id=base_model_id, model_dir=lora_dir, max_length=max_length, device=device)
glot_base = Glot500ZeroShot(cfg)
glot_lora = Glot500LoRA(cfg)

# Candidate embeddings are computed once and reused for every question.
cand_embs_base = embed_all_with_progress(candidate_texts, glot_base, batch_size=batch_size, desc="Encode candidates (base)")
cand_embs_lora = embed_all_with_progress(candidate_texts, glot_lora, batch_size=batch_size, desc="Encode candidates (LoRA)")


def pack(topk_list):
    """Expand ``(index, score)`` pairs into JSON-ready candidate records."""
    packed = []
    for idx, score in topk_list:
        packed.append({"candidate_id": int(idx), "text": candidate_texts[idx], "similarity": float(score)})
    return packed


results_json = []
for it in tqdm(items, desc="Questions"):
    q_id = it["id"]
    q_text = it["question"]
    gold_answer = it.get("answer")

    tfidf_topk = tfidf.topk(q_text, k=top_k)

    q_emb_base = glot_base.embed_texts([q_text])
    base_topk = topk_from_precomputed(q_emb_base, cand_embs_base, k=top_k)

    q_emb_lora = glot_lora.embed_texts([q_text])
    lora_topk = topk_from_precomputed(q_emb_lora, cand_embs_lora, k=top_k)

    results_json.append(
        {
            "id": q_id,
            "question": q_text,
            "gold_answer": gold_answer,
            "results": {
                "tfidf_top3": pack(tfidf_topk),
                "glot500_base_top3": pack(base_topk),
                "glot500_lora_top3": pack(lora_topk),
            },
        }
    )

save_json(results_json, out_json_path)
print(f"saved: {Path(out_json_path).resolve()}")

1510


Encode candidates (base): 100%|██████████| 24/24 [02:44<00:00,  6.84s/it]
Encode candidates (LoRA): 100%|██████████| 24/24 [14:09<00:00, 35.39s/it]
Questions: 100%|██████████| 50/50 [00:19<00:00,  2.60it/s]

saved: /kaggle/working/eval_top3_results.json





## 3. Duplicate Detection

In [None]:
import csv, json
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForMaskedLM
from peft import PeftModel, PeftConfig
from sklearn.neighbors import NearestNeighbors
from tqdm.auto import tqdm

In [None]:
# Directory scanned for *.json records whose "text" fields are compared.
qa_dir = "/kaggle/input/province-dataset"
# Directory with the fine-tuned adapter, tokenizer and projection_head.pt.
model_dir = "/kaggle/working/glot500-2stage/final"
base_model = "cis-lmu/glot500-base"
# Output width of the projection head; the saved projection_head.pt is
# loaded into a head of this size.
proj_dim = 128
# CSV file that receives the detected duplicate pairs.
out_csv = "duplicates_finetuned.csv"
# Minimum cosine similarity for two texts to be reported as duplicates.
threshold = 0.90
# Number of texts encoded per forward pass.
batch_size = 32
# Tokenizer truncation length.
max_len = 192

In [None]:
def mean_pool(h, m):
    """Average ``h`` along dimension 1, counting only positions where ``m`` is set.

    The mask is expanded with a trailing axis and cast to ``h``'s type.  The
    divisor is floored at 1e-6, so an all-zero mask row gives zeros, not NaN.
    """
    mask = m.unsqueeze(-1).type_as(h)
    masked_total = (h * mask).sum(dim=1)
    mask_count = mask.sum(dim=1).clamp(min=1e-6)
    return masked_total / mask_count

In [None]:
class ContrastiveModel(nn.Module):
    """Adapter-equipped encoder plus projection head, as used for encoding here.

    The backbone is the masked-LM base model with the adapter from
    ``adapter_dir`` applied; the head is a two-layer MLP
    (hidden -> hidden // 2 -> proj_dim) whose weights the caller loads.
    """

    def __init__(self, base_model_name, proj_dim, adapter_dir):
        super().__init__()

        # Warn (but continue) when the adapter was trained on another base.
        adapter_cfg = PeftConfig.from_pretrained(adapter_dir)
        if adapter_cfg.base_model_name_or_path != base_model_name:
            print(f"⚠️ Adapter trained on {adapter_cfg.base_model_name_or_path}, "
                  f"but loading base {base_model_name}.")

        mlm_base = AutoModelForMaskedLM.from_pretrained(base_model_name)
        if hasattr(mlm_base, "gradient_checkpointing_enable"):
            mlm_base.gradient_checkpointing_enable()
        if hasattr(mlm_base.config, "use_cache"):
            mlm_base.config.use_cache = False

        self.backbone = PeftModel.from_pretrained(mlm_base, adapter_dir)

        width = self.backbone.config.hidden_size
        half_width = width // 2
        self.proj = nn.Sequential(
            nn.Linear(width, half_width),
            nn.GELU(),
            nn.Linear(half_width, proj_dim),
        )

    def forward(self, input_ids, attention_mask):
        """Return L2-normalised projections of the mean-pooled last hidden state."""
        outputs = self.backbone(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            return_dict=True,
        )
        last_layer = outputs.hidden_states[-1]
        sentence_vec = mean_pool(last_layer, attention_mask)
        return F.normalize(self.proj(sentence_vec), dim=-1)

In [None]:
def load_texts(qa_dir_path):
    """Collect the non-empty ``text`` fields from every ``*.json`` file.

    Files directly under ``qa_dir_path`` are read in sorted order; files that
    cannot be read or parsed are skipped.  Returns a list of ``(index, text)``
    tuples where ``index`` counts the kept texts from 0.
    """
    collected = []
    for json_file in sorted(Path(qa_dir_path).glob("*.json")):
        try:
            records = json.loads(json_file.read_text(encoding="utf-8"))
        except Exception:
            continue
        for record in records:
            body = (record.get("text") or "").strip()
            if not body:
                continue
            # The running index equals the number of texts kept so far.
            collected.append((len(collected), body))
    return collected

In [None]:
@torch.no_grad()
def encode_texts(texts, tokenizer, model, max_len=192, pad_to_multiple_of=8, batch_size=32, device="cpu"):
    """Encode ``texts`` in batches and return the stacked embeddings on CPU.

    The model is switched to eval mode and moved to ``device``.  Each batch
    is tokenised (truncated to ``max_len``, padded to a multiple of
    ``pad_to_multiple_of``) and passed to the model as
    ``(input_ids, attention_mask)``.
    """
    model.eval().to(device)
    chunks = []
    for begin in tqdm(range(0, len(texts), batch_size), desc="Encoding"):
        encoded = tokenizer(
            texts[begin:begin + batch_size],
            padding=True,
            truncation=True,
            max_length=max_len,
            return_tensors="pt",
            pad_to_multiple_of=pad_to_multiple_of,
        )
        on_device = {key: tensor.to(device) for key, tensor in encoded.items()}
        vectors = model(on_device["input_ids"], on_device["attention_mask"])
        chunks.append(vectors.cpu())
    return torch.cat(chunks, dim=0)

In [None]:
# Find near-duplicate texts: embed every text with the fine-tuned model and
# report each pair whose cosine similarity reaches `threshold`.
data = load_texts(qa_dir)
if not data:
    print("No texts found.")
else:
    ids, texts = zip(*data)
    print(f"Loaded {len(texts)} texts")

    # Prefer the tokenizer saved with the fine-tuned model when present.
    tok_path = model_dir if (Path(model_dir) / "tokenizer.json").exists() else base_model
    tokenizer = AutoTokenizer.from_pretrained(tok_path, use_fast=True)
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token if tokenizer.eos_token else "[PAD]"

    model = ContrastiveModel(base_model, proj_dim, str(model_dir))
    proj_file = Path(model_dir) / "projection_head.pt"
    if proj_file.exists():
        model.proj.load_state_dict(torch.load(proj_file, map_location="cpu"))
    else:
        raise FileNotFoundError(f"{proj_file} not found.")

    device = "cpu"
    Z = encode_texts(list(texts), tokenizer, model, max_len=max_len, batch_size=batch_size, device=device)
    Z = F.normalize(Z, dim=-1)
    Z_np = Z.numpy()

    # Cosine distance = 1 - cosine similarity, so the similarity threshold
    # becomes a search radius.
    radius = 1.0 - threshold
    # Deliberately not called `nn`: that would rebind the module-level
    # `torch.nn` alias and break any later re-run of the ContrastiveModel cell.
    neighbor_index = NearestNeighbors(metric="cosine", radius=radius, n_jobs=-1)
    neighbor_index.fit(Z_np)

    # One batched query for all rows instead of one query per row.
    all_distances, all_indices = neighbor_index.radius_neighbors(
        Z_np, radius=radius, return_distance=True
    )

    pairs = []
    rows = zip(all_indices, all_distances)
    for i, (row_indices, row_distances) in enumerate(tqdm(rows, total=len(all_indices), desc="Searching")):
        for j, dist in zip(row_indices, row_distances):
            j = int(j)
            # Keep each unordered pair once and drop self-matches.
            if j <= i:
                continue
            sim = 1.0 - float(dist)
            if sim >= threshold:
                pairs.append((ids[i], ids[j], texts[i], texts[j], sim))

    with open(out_csv, "w", encoding="utf-8", newline="") as f:
        w = csv.writer(f)
        w.writerow(["id1", "id2", "text1", "text2", "similarity"])
        for id1, id2, text1, text2, sim in pairs:
            w.writerow([id1, id2, text1, text2, f"{sim:.6f}"])

    print(f"Wrote {len(pairs)} pairs to {out_csv}")