In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/legalbertturk/pytorch/default/1/LegalBertTurk/config.json
/kaggle/input/legalbertturk/pytorch/default/1/LegalBertTurk/training_args.bin
/kaggle/input/legalbertturk/pytorch/default/1/LegalBertTurk/tokenizer.json
/kaggle/input/legalbertturk/pytorch/default/1/LegalBertTurk/tokenizer_config.json
/kaggle/input/legalbertturk/pytorch/default/1/LegalBertTurk/model.safetensors
/kaggle/input/legalbertturk/pytorch/default/1/LegalBertTurk/special_tokens_map.json
/kaggle/input/legalbertturk/pytorch/default/1/LegalBertTurk/vocab.txt
/kaggle/input/pairwise-merged-valid/mil_pairwise_merged_valid.csv
/kaggle/input/golden-pairwise21/mil_pairwise_dataset_golden.csv
/kaggle/input/pairwise-merged-train/mil_pairwise_merged_train.csv
/kaggle/input/weak-pairwise100/mil_pairwise_dataset_2v1_2v0_1v0_from_feedback.csv


# Checked the JSON content


In [2]:
# import json

# path = "/kaggle/input/feedback-json/feedback (4).jsonl"
# N = 5  # kaç satır görmek istiyorsan artır/azalt

# items = []
# with open(path, "r", encoding="utf-8") as f:
#     for i, line in enumerate(f):
#         if i >= N:
#             break
#         items.append(json.loads(line))

# # Notebook/terminalde okunabilir yazdırma
# print(json.dumps(items, ensure_ascii=False, indent=2))

# # İstersen dosyaya da yaz (JSON array olarak)
# with open("feedback_preview.json", "w", encoding="utf-8") as out:
#     json.dump(items, out, ensure_ascii=False, indent=2)

# print(f"\nYazıldı: feedback_preview.json (ilk {N} satır)")


# Generated files
 Golden labeled and weak labeled (LLM) data. Each row contains query_text, case_id, label, and 1-3 evidence texts (ev1/ev2/ev3).

#The pairwise dataset contains approximately 10,000 pairs within the same query in the format (2 > 1, 2 > 0, 1 > 0).

# How will this pairwise data be used? (brief)
 For each pair:

 Score the positive case’s 1-3 evidences using LegalBERT cross-encoder → aggregate into a single score with LogSumExp.

 Do the same for the negative case.

 Loss: Ensure pos_score > neg_score (pairwise ranking loss).


In [3]:
import numpy as np
import math
import torch
from torch.utils.data import Dataset
from transformers import AutoModelForSequenceClassification
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import DataLoader



POS_COLS = ["pos_ev1", "pos_ev2", "pos_ev3"]
NEG_COLS = ["neg_ev1", "neg_ev2", "neg_ev3"]

def _clean(x):
    if x is None:
        return ""
    if isinstance(x, float) and np.isnan(x):
        return ""
    return str(x).strip()

def _nonempty_list(xs):
    return [t for t in (_clean(v) for v in xs) if t]

class MilPairwiseDataset(Dataset):
    def __init__(self, df, require_nonempty=True):
        self.df = df.reset_index(drop=True)
        self.require_nonempty = require_nonempty

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        q = _clean(row.get("query_text", ""))

        pos = _nonempty_list([row.get(c, "") for c in POS_COLS])
        neg = _nonempty_list([row.get(c, "") for c in NEG_COLS])

        # Seçenek A: boş gelirse ya dummy koy, ya da örneği dışarıda bırakacak şekilde handle et.
        # En pratik (train loop bozulmasın): dummy ekle.
        if self.require_nonempty:
            if len(pos) == 0:
                pos = [""]
            if len(neg) == 0:
                neg = [""]

        return {"q": q, "pos": pos, "neg": neg}


In [4]:
MODEL_PATH = "/kaggle/input/legalbertturk/pytorch/default/1/LegalBertTurk"

# weak training set kullanıyorsan bunu train'e ver:
TRAIN_PATH = "/kaggle/input/pairwise-merged-train/mil_pairwise_merged_train.csv"
VALID_PATH = "/kaggle/input/pairwise-merged-valid/mil_pairwise_merged_valid.csv"

MAX_LEN = 256
BATCH = 16
GRAD_ACC = 2          # efektif batch = BATCH * GRAD_ACC
LR = 5e-6
EPOCHS = 6

device = "cuda" if torch.cuda.is_available() else "cpu"


# Cell 2: DataFrame + tokenizer


In [5]:
import pandas as pd
from transformers import AutoTokenizer

train_df = pd.read_csv(TRAIN_PATH)
valid_df = pd.read_csv(VALID_PATH)

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_PATH,
    local_files_only=True,
    use_fast=True
)

# Not: _clean/_nonempty_list zaten önceki cell'de tanımlı kalsın.
# Eğer yine de burada kalsın istiyorsan strip'li sürüm:
def _clean(x):
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return ""
    return str(x).strip()


In [6]:
import torch
from torch.utils.data import Dataset, DataLoader
from functools import partial

class PairwiseMILDataset(Dataset):
    def __init__(self, df, pos_cols, neg_cols):
        self.df = df.reset_index(drop=True)
        self.pos_cols = pos_cols
        self.neg_cols = neg_cols

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i):
        r = self.df.iloc[i]
        q = _clean(r["query_text"])
        pos = _nonempty_list([r.get(c, "") for c in self.pos_cols])
        neg = _nonempty_list([r.get(c, "") for c in self.neg_cols])

        # Not: pos/neg boşsa bu örnek veri kalitesi açısından sorunlu olabilir.
        # İstersen burada "" koymak yerine bu satırları drop ederek dataset'i temizleyebiliriz.
        if len(pos) == 0:
            pos = [""]
        if len(neg) == 0:
            neg = [""]

        return {"q": q, "pos": pos, "neg": neg}


def collate_fn(batch, tokenizer, max_len, max_pos=3, max_neg=3):
    """
    Çıkış:
      - enc: tokenizer output for ALL pairs (flatten edilmiş)
      - meta: bag yapısını geri kurmak için
          pos_sizes: (B,) her örnekte kaç pos evidence
          neg_sizes: (B,) her örnekte kaç neg evidence
    """
    pairs_q, pairs_e = [], []
    pos_sizes, neg_sizes = [], []

    for item in batch:
        q = item["q"]
        pos = item["pos"][:max_pos]
        neg = item["neg"][:max_neg]

        pos_sizes.append(len(pos))
        neg_sizes.append(len(neg))

        for ev in pos:
            pairs_q.append(q)
            pairs_e.append(ev)

        for ev in neg:
            pairs_q.append(q)
            pairs_e.append(ev)

    enc = tokenizer(
        pairs_q,
        pairs_e,
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors="pt",
    )

    meta = {
        "pos_sizes": torch.tensor(pos_sizes, dtype=torch.long),
        "neg_sizes": torch.tensor(neg_sizes, dtype=torch.long),
    }
    return enc, meta


# ---- DataLoader'lar ----
train_ds = PairwiseMILDataset(train_df, POS_COLS, NEG_COLS)
valid_ds = PairwiseMILDataset(valid_df, POS_COLS, NEG_COLS)

train_loader = DataLoader(
    train_ds,
    batch_size=BATCH,
    shuffle=True,
    num_workers=0,
    collate_fn=partial(collate_fn, tokenizer=tokenizer, max_len=MAX_LEN, max_pos=3, max_neg=3),
)

valid_loader = DataLoader(
    valid_ds,
    batch_size=BATCH,
    shuffle=False,
    num_workers=0,
    collate_fn=partial(collate_fn, tokenizer=tokenizer, max_len=MAX_LEN, max_pos=3, max_neg=3),
)


In [7]:
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH, local_files_only=True, num_labels=1
).to(device)

def pairwise_mil_loss(logits, pos_sizes, neg_sizes):
    """
    logits: [N, 1] (flatten edilmiş tüm (q,ev) çiftleri)
    pos_sizes: [B]
    neg_sizes: [B]
    """
    scores = logits.squeeze(-1)  # [N]

    pos_pooled = []
    neg_pooled = []

    offset = 0
    for p, n in zip(pos_sizes.tolist(), neg_sizes.tolist()):
        pos_scores = scores[offset : offset + p]
        offset += p
        neg_scores = scores[offset : offset + n]
        offset += n

        # Pooling: logsumexp = "softmax pooling"
        pos_pooled.append(torch.logsumexp(pos_scores, dim=0))
        neg_pooled.append(torch.logsumexp(neg_scores, dim=0))

    pos = torch.stack(pos_pooled, dim=0)  # [B]
    neg = torch.stack(neg_pooled, dim=0)  # [B]

    return F.softplus(-(pos - neg)).mean()


2026-01-01 21:43:04.125840: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767303784.148611     100 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767303784.155373     100 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767303784.173621     100 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767303784.173644     100 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767303784.173647     100 computation_placer.cc:177] computation placer alr

In [8]:
import torch.nn.functional as F

# Optimizer
optim = torch.optim.AdamW(model.parameters(), lr=1.1e-6, weight_decay=0.00)

# Scheduler
steps_per_epoch = math.ceil(len(train_loader) / GRAD_ACC)
total_steps = EPOCHS * steps_per_epoch
sched = get_linear_schedule_with_warmup(
    optim,
    num_warmup_steps=max(10, int(0.1 * total_steps)),
    num_training_steps=total_steps
)

scaler = torch.cuda.amp.GradScaler(enabled=(device == "cuda"))


@torch.no_grad()
def eval_metrics():
    model.eval()
    losses = []
    correct = 0
    total = 0

    for enc, meta in valid_loader:
        enc = {k: v.to(device) for k, v in enc.items()}
        pos_sizes = meta["pos_sizes"].to(device)
        neg_sizes = meta["neg_sizes"].to(device)

        with torch.cuda.amp.autocast(enabled=(device == "cuda")):
            out = model(**enc)

            # Seçenek A loss
            loss = pairwise_mil_loss(out.logits, pos_sizes, neg_sizes)

            # Seçenek A pairwise accuracy: bag-level pos vs neg
            scores = out.logits.squeeze(-1)  # [N]
            offset = 0
            pos_list, neg_list = [], []
            for p, n in zip(pos_sizes.tolist(), neg_sizes.tolist()):
                pos_scores = scores[offset: offset + p]; offset += p
                neg_scores = scores[offset: offset + n]; offset += n
                pos_list.append(torch.logsumexp(pos_scores, dim=0))
                neg_list.append(torch.logsumexp(neg_scores, dim=0))

            pos = torch.stack(pos_list, dim=0)  # [B]
            neg = torch.stack(neg_list, dim=0)  # [B]

        losses.append(loss.item())
        correct += (pos > neg).sum().item()
        total += pos.size(0)

    return float(np.mean(losses)), (correct / total if total > 0 else 0.0)


best_loss = 1e9
best_acc = -1.0

for epoch in range(1, EPOCHS + 1):
    model.train()
    optim.zero_grad(set_to_none=True)
    running = []

    for step, (enc, meta) in enumerate(train_loader, start=1):
        enc = {k: v.to(device) for k, v in enc.items()}
        pos_sizes = meta["pos_sizes"].to(device)
        neg_sizes = meta["neg_sizes"].to(device)

        with torch.cuda.amp.autocast(enabled=(device == "cuda")):
            out = model(**enc)
            loss = pairwise_mil_loss(out.logits, pos_sizes, neg_sizes) / GRAD_ACC

        scaler.scale(loss).backward()
        running.append(loss.item() * GRAD_ACC)

        if step % GRAD_ACC == 0:
            scaler.step(optim)
            scaler.update()
            optim.zero_grad(set_to_none=True)
            sched.step()

    tr = float(np.mean(running)) if len(running) else 0.0
    va_loss, va_acc = eval_metrics()

    print(f"Epoch {epoch} | train={tr:.4f} | valid_loss={va_loss:.4f} | valid_acc={va_acc:.4f}")

    if va_loss < best_loss:
        best_loss = va_loss
        model.save_pretrained("best_reranker_loss")
        tokenizer.save_pretrained("best_reranker_loss")

    if va_acc > best_acc:
        best_acc = va_acc
        model.save_pretrained("best_reranker_acc")
        tokenizer.save_pretrained("best_reranker_acc")


  scaler = torch.cuda.amp.GradScaler(enabled=(device == "cuda"))
  with torch.cuda.amp.autocast(enabled=(device == "cuda")):
  with torch.cuda.amp.autocast(enabled=(device == "cuda")):


Epoch 1 | train=0.6884 | valid_loss=0.6879 | valid_acc=0.5912
Epoch 2 | train=0.6241 | valid_loss=0.6605 | valid_acc=0.5866
Epoch 3 | train=0.5386 | valid_loss=0.6099 | valid_acc=0.6611
Epoch 4 | train=0.4416 | valid_loss=0.5544 | valid_acc=0.7104
Epoch 5 | train=0.3697 | valid_loss=0.5377 | valid_acc=0.7346
Epoch 6 | train=0.3383 | valid_loss=0.5342 | valid_acc=0.7356
