In [None]:
from google.colab import drive
drive.mount('/content/drive')

!pip -q install scikit-learn transformers accelerate sentencepiece

import os, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split

# Project folder on the mounted Drive and the merged CSV file inside it.
BASE_DIR = "/content/drive/MyDrive/colab_notebooks"
MERGED_CSV = os.path.join(BASE_DIR, "mil_pairwise_merged_all.csv")
# Fail fast, reporting the missing path, if the CSV is not where expected.
assert os.path.exists(MERGED_CSV), f"Bulunamadı: {MERGED_CSV}"

# Full table; it is split into train/valid further down in this cell.
df = pd.read_csv(MERGED_CSV)

# --- Golden/weak flag (heuristic) ---
# Golden: rows that are likely to have ev2/ev3 filled in
def is_nonempty(x):
    """Tell whether ``x`` holds usable text.

    Returns False for ``None``, for a float NaN, and for values whose
    ``str(x).strip()`` is the empty string; True otherwise.
    """
    if x is None:
        return False
    if isinstance(x, float) and np.isnan(x):
        return False
    return str(x).strip() != ""

# A row is flagged golden (1) when any of the second/third evidence columns,
# on either the positive or the negative side, is non-empty; otherwise 0.
# Columns missing from the frame count as empty (row.get returns None).
df["is_golden"] = df.apply(
    lambda row: any(
        is_nonempty(row.get(col))
        for col in ("pos_ev2", "pos_ev3", "neg_ev2", "neg_ev3")
    ),
    axis=1,
).astype(int)

print("rows:", len(df), "golden%:", df["is_golden"].mean(), flush=True)

# Stratified split: keep the golden ratio the same in train and valid.
# Both parts get a fresh 0..n-1 index.
# NOTE(review): this is a row-level split; rows that share a query_id can end
# up on both sides. Confirm that this is intended for the validation metrics.
train_df, valid_df = (
    part.reset_index(drop=True)
    for part in train_test_split(
        df,
        test_size=0.2,
        random_state=42,
        shuffle=True,
        stratify=df["is_golden"],
    )
)

print("train rows:", len(train_df), "golden%:", train_df["is_golden"].mean(), flush=True)
print("valid rows:", len(valid_df), "golden%:", valid_df["is_golden"].mean(), flush=True)

# The is_golden column must not leak into the training dataset.
train_df = train_df.drop(columns="is_golden")
valid_df = valid_df.drop(columns="is_golden")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
rows: 10757 golden%: 0.19085246816026774
train rows: 8605 golden%: 0.19081929110981988
valid rows: 2152 golden%: 0.19098513011152415


In [None]:
import os, math
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from functools import partial
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup

# ---- PATHS ----
BASE_DIR  = "/content/drive/MyDrive/colab_notebooks"
MODEL_DIR = os.path.join(BASE_DIR, "LegalBertTurk")
# Fail fast, reporting the missing path, if the checkpoint folder is absent.
assert os.path.exists(MODEL_DIR), f"Bulunamadı: {MODEL_DIR}"

# ---- HYPERPARAMS (as requested) ----
MAX_LEN  = 256     # max_length handed to the tokenizer for each (query, evidence) pair
BATCH    = 32      # rows (bags) per DataLoader batch
GRAD_ACC = 2       # micro-batches accumulated per optimizer step
LR       = 1.2e-6
EPOCHS   = 10
SEED     = 42      # same value as the random_state used for the train/valid split

# Seed the RNGs before the model is built: the classification head is freshly
# initialised and the training DataLoader shuffles, so without a seed two runs
# of this notebook do not produce the same numbers.
np.random.seed(SEED)
torch.manual_seed(SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"
print("device:", device, flush=True)

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)
# num_labels=1: the model emits a single score per (query, evidence) pair.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR, num_labels=1).to(device)

# ---- COLS ----
# Evidence columns forming the positive and the negative bag of each row.
POS_COLS = ["pos_ev1", "pos_ev2", "pos_ev3"]
NEG_COLS = ["neg_ev1", "neg_ev2", "neg_ev3"]

def _clean(x):
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return ""
    return str(x).strip()

def _nonempty_list(xs):
    """Clean every value in ``xs`` with ``_clean`` and keep, in order, the non-empty results."""
    cleaned = (_clean(value) for value in xs)
    return [text for text in cleaned if text]

class PairwiseMILDataset(Dataset):
    """Row-per-sample dataset yielding a query with a positive and a negative evidence bag.

    Each item is a dict ``{"q": str, "pos": [str, ...], "neg": [str, ...]}``.
    The bags hold the cleaned, non-empty values of ``pos_cols`` / ``neg_cols``
    for that row; a bag that would be empty is replaced by ``[""]``.
    """

    def __init__(self, df, pos_cols, neg_cols):
        # Re-index so that positional access in __getitem__ is 0..n-1.
        self.df = df.reset_index(drop=True)
        self.pos_cols = pos_cols
        self.neg_cols = neg_cols

    def __len__(self):
        return len(self.df)

    def _bag(self, row, cols):
        """Collect the non-empty texts of ``cols`` from ``row``; never returns an empty list."""
        texts = _nonempty_list([row.get(col, "") for col in cols])
        # dummy fallback: keep one (empty) instance so every bag can be pooled
        return texts or [""]

    def __getitem__(self, i):
        row = self.df.iloc[i]
        return {
            "q": _clean(row["query_text"]),
            "pos": self._bag(row, self.pos_cols),
            "neg": self._bag(row, self.neg_cols),
        }

def collate_fn(batch, tokenizer, max_len, max_pos=3, max_neg=3):
    """Flatten a batch of bags into (query, evidence) pairs and tokenize them together.

    For each item the positive evidences (at most ``max_pos``) come first,
    followed by the negative ones (at most ``max_neg``); the query text is
    repeated once per evidence.

    Returns:
        ``(enc, meta)`` where ``enc`` is whatever ``tokenizer`` returns for the
        flattened pairs and ``meta`` holds the long tensors ``pos_sizes`` and
        ``neg_sizes`` with the per-item bag sizes needed to un-flatten scores.
    """
    queries, evidences = [], []
    n_pos, n_neg = [], []

    for sample in batch:
        pos_bag = sample["pos"][:max_pos]
        neg_bag = sample["neg"][:max_neg]
        n_pos.append(len(pos_bag))
        n_neg.append(len(neg_bag))

        # Positives first, then negatives: the loss relies on this layout.
        ordered = [*pos_bag, *neg_bag]
        evidences.extend(ordered)
        queries.extend([sample["q"]] * len(ordered))

    enc = tokenizer(
        queries,
        evidences,
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors="pt",
    )
    meta = {
        "pos_sizes": torch.tensor(n_pos, dtype=torch.long),
        "neg_sizes": torch.tensor(n_neg, dtype=torch.long),
    }
    return enc, meta

def pairwise_mil_loss(logits, pos_sizes, neg_sizes):
    """Pairwise logistic loss between log-sum-exp pooled positive and negative bags.

    ``logits`` is squeezed on its last dimension into a flat score vector laid
    out bag by bag: for each item its ``pos_sizes[i]`` positive scores followed
    by its ``neg_sizes[i]`` negative scores. Each bag is pooled with
    ``logsumexp`` and the result is ``mean(softplus(-(pos - neg)))``.
    """
    scores = logits.squeeze(-1)  # [N]
    margins = []
    cursor = 0

    for n_pos, n_neg in zip(pos_sizes.tolist(), neg_sizes.tolist()):
        pos_end = cursor + n_pos
        neg_end = pos_end + n_neg
        pos_bag = torch.logsumexp(scores[cursor:pos_end], dim=0)
        neg_bag = torch.logsumexp(scores[pos_end:neg_end], dim=0)
        margins.append(pos_bag - neg_bag)
        cursor = neg_end

    return F.softplus(-torch.stack(margins, dim=0)).mean()

def make_loader(df, shuffle):
    """Wrap ``df`` in a ``PairwiseMILDataset`` and return a DataLoader over it.

    Uses the module-level ``BATCH``, ``MAX_LEN``, ``POS_COLS``, ``NEG_COLS``
    and ``tokenizer``; batches are built by ``collate_fn`` with at most three
    positive and three negative evidences per row.
    """
    collate = partial(collate_fn, tokenizer=tokenizer, max_len=MAX_LEN, max_pos=3, max_neg=3)
    dataset = PairwiseMILDataset(df, POS_COLS, NEG_COLS)
    return DataLoader(
        dataset,
        batch_size=BATCH,
        shuffle=shuffle,
        num_workers=0,
        collate_fn=collate,
    )

@torch.no_grad()
def eval_pairwise_loss_acc(model, loader):
    """Evaluate the pairwise MIL loss and bag-level accuracy over ``loader``.

    Args:
        model: sequence-classification model whose output exposes ``.logits``.
        loader: iterable of ``(enc, meta)`` batches as built by ``collate_fn``.

    Returns:
        ``(loss, acc)`` as floats. ``loss`` is the mean loss per bag pair: each
        batch's mean loss is weighted by the number of pairs in that batch, so
        a shorter final batch is not over-weighted. ``acc`` is the fraction of
        pairs whose pooled positive score is strictly greater than the pooled
        negative score. Both are ``0.0`` when the loader yields nothing.
    """
    model.eval()
    loss_sum = 0.0
    correct, total = 0, 0

    for enc, meta in loader:
        enc = {k: v.to(device) for k, v in enc.items()}
        pos_sizes = meta["pos_sizes"].to(device)
        neg_sizes = meta["neg_sizes"].to(device)
        n_bags = int(pos_sizes.numel())

        out = model(**enc)
        loss = pairwise_mil_loss(out.logits, pos_sizes, neg_sizes)
        # pairwise_mil_loss returns a per-batch mean; undo it to accumulate a sum.
        loss_sum += loss.item() * n_bags

        # bag-level valid_acc: pos_score > neg_score
        scores = out.logits.squeeze(-1)
        offset = 0
        pos_list, neg_list = [], []
        for p, n in zip(pos_sizes.tolist(), neg_sizes.tolist()):
            pos_scores = scores[offset: offset + p]; offset += p
            neg_scores = scores[offset: offset + n]; offset += n
            pos_list.append(torch.logsumexp(pos_scores, dim=0))
            neg_list.append(torch.logsumexp(neg_scores, dim=0))

        pos = torch.stack(pos_list, dim=0)
        neg = torch.stack(neg_list, dim=0)
        correct += int((pos > neg).sum().item())
        total += n_bags

    if total == 0:
        return 0.0, 0.0
    return loss_sum / total, correct / total

# ---- nDCG@10 + MRR@10 (derived from the pairwise validation set) ----
def dcg_at_k(rels, k):
    """Discounted cumulative gain of the first ``k`` relevance grades.

    Uses gain ``2**rel - 1`` and discount ``log2(position + 1)`` with
    1-based positions. Returns ``0`` for an empty list.
    """
    total = 0
    for rank, rel in enumerate(rels[:k]):
        total += (2**rel - 1) / np.log2(rank + 2)
    return total

def ndcg_at_k(rels, k):
    """DCG@k of ``rels`` divided by the DCG@k of the same grades sorted descending.

    Returns ``0.0`` when the ideal DCG is not positive.
    """
    best = dcg_at_k(sorted(rels, reverse=True), k)
    if best > 0:
        return dcg_at_k(rels, k) / best
    return 0.0

def mrr_at_k(rels, k, rel_threshold=2):
    """Reciprocal rank of the first grade ``>= rel_threshold`` within the top ``k``.

    Ranks are 1-based; returns ``0.0`` when no such grade is found.
    """
    hit_ranks = (
        rank
        for rank, rel in enumerate(rels[:k], start=1)
        if rel >= rel_threshold
    )
    first_hit = next(hit_ranks, None)
    return 0.0 if first_hit is None else 1.0 / first_hit

@torch.no_grad()
def eval_ranking_metrics_from_pairwise(model, df, K=10, max_ev=3):
    """Compute nDCG@K and MRR@K by re-ranking the cases found in a pairwise frame.

    Every row contributes its positive and its negative case to the candidate
    pool of its ``(query_id, query_text)``; when a case id appears more than
    once for a query, the first occurrence is kept. Each case is scored as the
    log-sum-exp of the model logits over at most ``max_ev`` evidence texts.
    Cases without any evidence text are not ranked, and queries left with no
    ranked case are skipped. Query and evidence texts are cleaned with the
    same helpers (``_clean`` / ``_nonempty_list``) used when building training
    batches, so the model sees identically prepared text in both paths.

    Args:
        model: sequence-classification model whose output exposes ``.logits``.
        df: frame with ``query_id``, ``query_text``, ``{pos,neg}_case_id``,
            ``{pos,neg}_label`` and optionally ``{pos,neg}_ev1..3`` columns.
        K: rank cutoff for both metrics.
        max_ev: maximum number of evidence texts scored per case.

    Returns:
        dict with ``ndcg@K``, ``mrr@K`` (means over evaluated queries, ``0.0``
        if none) and ``queries_eval`` (number of evaluated queries). MRR
        counts a case as a hit when its label is ``>= 2``.
    """
    model.eval()

    # (query_id, query_text) -> {case_id: {"label": int, "evs": [str, ...]}}
    qmap = {}
    for r in df.itertuples(index=False):
        cases = qmap.setdefault((getattr(r, "query_id"), getattr(r, "query_text")), {})
        for side in ("pos", "neg"):
            case_id = getattr(r, f"{side}_case_id")
            if case_id in cases:
                continue  # first occurrence of a case wins
            cases[case_id] = {
                "label": int(getattr(r, f"{side}_label")),
                "evs": _nonempty_list(
                    [getattr(r, f"{side}_ev{j}", "") for j in (1, 2, 3)]
                ),
            }

    ndcgs, mrrs = [], []

    for (qid, qtext), cases in qmap.items():
        query = _clean(qtext)
        items = []
        for case_id, obj in cases.items():
            evs = obj["evs"][:max_ev]
            if not evs:
                continue

            enc = tokenizer([query] * len(evs), evs, padding=True, truncation=True,
                            max_length=MAX_LEN, return_tensors="pt")
            enc = {k: v.to(device) for k, v in enc.items()}
            logits = model(**enc).logits.squeeze(-1)
            score = torch.logsumexp(logits, dim=0).item()

            items.append((score, obj["label"]))

        if not items:
            continue

        # Rank by score, best first; the labels in that order are the relevance list.
        items.sort(key=lambda x: x[0], reverse=True)
        rels = [lbl for _, lbl in items]
        ndcgs.append(ndcg_at_k(rels, K))
        mrrs.append(mrr_at_k(rels, K, rel_threshold=2))

    return {
        f"ndcg@{K}": float(np.mean(ndcgs)) if ndcgs else 0.0,
        f"mrr@{K}": float(np.mean(mrrs)) if mrrs else 0.0,
        "queries_eval": len(ndcgs)
    }

# Training batches are reshuffled on every pass; validation order stays fixed.
train_loader = make_loader(train_df, shuffle=True)
valid_loader = make_loader(valid_df, shuffle=False)

# Number of batches per pass over each loader.
print("train batches:", len(train_loader), "valid batches:", len(valid_loader), flush=True)


device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/colab_notebooks/LegalBertTurk and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train batches: 269 valid batches: 68


In [None]:
# Checkpoints of this run are written under OUT_DIR.
# NOTE(review): the folder name contains "lr12e-6" while LR is set to 1.2e-6;
# confirm the name is meant to encode that value.
OUT_DIR = os.path.join(BASE_DIR, "reranker_runs", "merged_all_bs32_lr12e-6")
os.makedirs(OUT_DIR, exist_ok=True)

optim = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=0.0)

# Scheduler length in optimizer steps: GRAD_ACC micro-batches per step, rounded
# up when the number of batches is not a multiple of GRAD_ACC.
steps_per_epoch = int(math.ceil(len(train_loader) / GRAD_ACC))
total_steps = EPOCHS * steps_per_epoch

# Linear warmup over 10% of the steps (at least 10), then linear decay.
sched = get_linear_schedule_with_warmup(
    optim,
    num_warmup_steps=max(10, int(0.1 * total_steps)),
    num_training_steps=total_steps
)

# torch.cuda.amp.GradScaler is deprecated; torch.amp.GradScaler("cuda", ...)
# is the equivalent spelling. Scaling is a no-op when not running on CUDA.
scaler = torch.amp.GradScaler("cuda", enabled=(device == "cuda"))

# Best validation metrics seen so far; each one drives its own checkpoint.
best_loss = 1e9
best_acc = -1.0
best_ndcg = -1.0

# Training: mixed-precision forward/backward with gradient accumulation, then
# per-epoch validation and "best so far" checkpointing on three metrics.
for epoch in range(1, EPOCHS+1):
    model.train()
    optim.zero_grad(set_to_none=True)
    running = []
    num_batches = len(train_loader)

    for step, (enc, meta) in enumerate(train_loader, start=1):
        enc = {k: v.to(device) for k, v in enc.items()}
        pos_sizes = meta["pos_sizes"].to(device)
        neg_sizes = meta["neg_sizes"].to(device)

        # torch.autocast(device_type="cuda") replaces the deprecated
        # torch.cuda.amp.autocast; it is disabled when not running on CUDA.
        with torch.autocast(device_type="cuda", enabled=(device == "cuda")):
            out = model(**enc)
            # Divide so that the accumulated gradient is an average over the group.
            loss = pairwise_mil_loss(out.logits, pos_sizes, neg_sizes) / GRAD_ACC

        scaler.scale(loss).backward()
        running.append(loss.item() * GRAD_ACC)  # log the un-divided loss

        # Step every GRAD_ACC micro-batches AND on the last batch of the epoch.
        # Without the second condition, the gradients of a trailing incomplete
        # group are discarded by the zero_grad at the start of the next epoch
        # and the scheduler ends up with fewer steps than it was sized for.
        # The trailing group's loss is still divided by GRAD_ACC, so that final
        # update is proportionally smaller.
        if step % GRAD_ACC == 0 or step == num_batches:
            scaler.step(optim)
            scaler.update()
            optim.zero_grad(set_to_none=True)
            sched.step()

    tr_loss = float(np.mean(running)) if running else 0.0
    va_loss, va_acc = eval_pairwise_loss_acc(model, valid_loader)
    rm = eval_ranking_metrics_from_pairwise(model, valid_df, K=10)

    print(
        f"Epoch {epoch} | train={tr_loss:.4f} | valid_loss={va_loss:.4f} | valid_acc={va_acc:.4f} | "
        f"ndcg@10={rm['ndcg@10']:.4f} | mrr@10={rm['mrr@10']:.4f} | queries_eval={rm['queries_eval']}",
        flush=True
    )

    # Collect the checkpoint folders whose metric improved this epoch.
    improved_tags = []

    # best-by-loss
    if va_loss < best_loss:
        best_loss = va_loss
        improved_tags.append("best_by_valid_loss")

    # best-by-acc
    if va_acc > best_acc:
        best_acc = va_acc
        improved_tags.append("best_by_valid_acc")

    # best-by-ndcg
    if rm["ndcg@10"] > best_ndcg:
        best_ndcg = rm["ndcg@10"]
        improved_tags.append("best_by_ndcg10")

    # Save model and tokenizer together so each folder can be loaded on its own.
    for tag in improved_tags:
        save_dir = os.path.join(OUT_DIR, tag)
        model.save_pretrained(save_dir)
        tokenizer.save_pretrained(save_dir)

# last checkpoint: state after the final epoch, saved regardless of metrics
model.save_pretrained(os.path.join(OUT_DIR, "last"))
tokenizer.save_pretrained(os.path.join(OUT_DIR, "last"))

# Summary of the best validation values reached during the run.
print("DONE | best_loss:", best_loss, "| best_acc:", best_acc, "| best_ndcg@10:", best_ndcg, flush=True)


  scaler = torch.cuda.amp.GradScaler(enabled=(device == "cuda"))
  with torch.cuda.amp.autocast(enabled=(device=="cuda")):


Epoch 1 | train=0.6966 | valid_loss=0.6875 | valid_acc=0.6162 | ndcg@10=0.7070 | mrr@10=0.5075 | queries_eval=119
Epoch 2 | train=0.6703 | valid_loss=0.6389 | valid_acc=0.6645 | ndcg@10=0.7287 | mrr@10=0.4924 | queries_eval=119
Epoch 3 | train=0.6155 | valid_loss=0.5708 | valid_acc=0.7110 | ndcg@10=0.7673 | mrr@10=0.5169 | queries_eval=119
Epoch 4 | train=0.5481 | valid_loss=0.4992 | valid_acc=0.7756 | ndcg@10=0.8112 | mrr@10=0.5465 | queries_eval=119
Epoch 5 | train=0.4701 | valid_loss=0.4204 | valid_acc=0.8271 | ndcg@10=0.8535 | mrr@10=0.5802 | queries_eval=119
Epoch 6 | train=0.3945 | valid_loss=0.3551 | valid_acc=0.8499 | ndcg@10=0.8771 | mrr@10=0.5963 | queries_eval=119
Epoch 7 | train=0.3397 | valid_loss=0.3177 | valid_acc=0.8676 | ndcg@10=0.8891 | mrr@10=0.6058 | queries_eval=119
Epoch 8 | train=0.3076 | valid_loss=0.2964 | valid_acc=0.8741 | ndcg@10=0.8929 | mrr@10=0.5967 | queries_eval=119
Epoch 9 | train=0.2898 | valid_loss=0.2841 | valid_acc=0.8806 | ndcg@10=0.8976 | mrr@10=