In [None]:
import ir_datasets
import pyterrier as pt
import pandas as pd
from pyterrier.transformer import Transformer
import numpy as np

# Initialize PyTerrier
if not pt.started():
    pt.init()

# Load Cranfield dataset
dataset = ir_datasets.load("cranfield")

# Convert qrels, queries, and docs into lists
qrels = list(dataset.qrels_iter())
queries = list(dataset.queries_iter())
docs = list(dataset.docs_iter())

# Prepare dataframes
queries_df = pd.DataFrame([{ "qid": q.query_id, "query": q.text } for q in queries])
docs_df = pd.DataFrame([{ "docno": d.doc_id, "text": d.text, "title": d.title } for d in docs])
qrels_df = pd.DataFrame([{ "qid": q.query_id, "docno": q.doc_id, "label": q.relevance } for q in qrels])

In [None]:
import ir_datasets
dataset = ir_datasets.load("cranfield")
for query in dataset.queries_iter():
    display(query) # namedtuple<query_id, text>

In [8]:
mapping = {-1: 0, 1: 1, 2: 1, 3: 1, 4: 1}
qrels_df['label'] = qrels_df['label'].map(mapping)

In [10]:
lst = queries_df['qid'].tolist()
print(lst)

['1', '2', '4', '8', '9', '10', '12', '13', '15', '18', '22', '23', '26', '27', '29', '31', '32', '33', '34', '35', '39', '40', '41', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '61', '62', '66', '67', '68', '69', '71', '72', '74', '79', '80', '81', '82', '83', '84', '85', '86', '87', '93', '94', '95', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '116', '118', '119', '120', '121', '122', '123', '126', '128', '130', '131', '132', '133', '135', '136', '137', '138', '139', '140', '141', '142', '143', '145', '146', '147', '148', '149', '150', '152', '153', '154', '155', '156', '157', '158', '160', '161', '163', '164', '165', '167', '168', '169', '170', '171', '173', '175', '176', '177', '181', '182', '183', '184', '187', '189', '190', '196', '200', '201', '202', '203', '204', '205', '206', '208', '209', '210', '211', '212', '213', '214', '215', '216', '217', '218', '219', '223', '224', '225'

In [13]:
qrels_df = qrels_df[qrels_df['qid'].isin(lst)]

In [17]:
# indexer = pt.IterDictIndexer(
#     "./index3",
#     meta={
#         "docno": 60,   # cukup panjang untuk docno
#         "text": "text" # simpan isi text
#     }
# )
!rm -rf ./index
indexer = pt.IterDictIndexer(
    "./index",
    meta={
        "docno": 60,   # docno max length 60 chars
        "text": 10000  # text max length 10,000 chars (misalnya)
    }
)


index_ref = indexer.index(docs_df.to_dict(orient="records"))

# import shutil

# # Specify the folder to save
# folder_path = "./index"  # Replace with the path to your folder
# output_zip = "index.zip"  # Name for the compressed file

# # Compress the folder
# shutil.make_archive("index", 'zip', folder_path)

# # Download the file
# from google.colab import files
# files.download(output_zip)



10:39:07.837 [ForkJoinPool-1-worker-3] WARN org.terrier.structures.indexing.Indexer -- Indexed 2 empty documents


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [18]:
# Baseline BM25
bm25 = pt.BatchRetrieve(index_ref, wmodel="BM25")

  bm25 = pt.BatchRetrieve(index_ref, wmodel="BM25")


In [19]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

# Kita gunakan tokenizer BERT.
# Ganti "bert-base-uncased" menjadi model "modern BERT"
# atau varian BERT lain sesuai kebutuhan.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Buat mapping dokumen docno -> text agar mudah diakses
doc_map = dict(zip(docs_df["docno"], docs_df["text"]))

# Buat mapping query qid -> query text
query_map = dict(zip(queries_df["qid"], queries_df["query"]))

class RankDataset(Dataset):
    def __init__(self, qrels_df, query_map, doc_map, tokenizer, max_len=256):
        self.samples = []
        self.tokenizer = tokenizer
        self.max_len = max_len

        for row in qrels_df.itertuples(index=False):
            qid = row.qid
            docno = row.docno
            label = row.label

            query_text = query_map[qid]
            doc_text = doc_map.get(docno, "")

            self.samples.append((query_text, doc_text, label))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        query_text, doc_text, label = self.samples[idx]

        # Tokenisasi dengan format [CLS] query [SEP] doc ...
        encoded = self.tokenizer(
            query_text,
            doc_text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "token_type_ids": encoded["token_type_ids"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }

# Siapkan dataset
train_dataset = RankDataset(qrels_df, query_map, doc_map, tokenizer, max_len=128)

# Contoh: split train-valid 80-20 (opsional, karena dataset Cranfield kecil)
train_size = int(0.8 * len(train_dataset))
valid_size = len(train_dataset) - train_size
train_dataset, valid_dataset = torch.utils.data.random_split(train_dataset, [train_size, valid_size])

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=8, shuffle=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Inisialisasi model BERT untuk classification
# Ganti "bert-base-uncased" dengan "modern BERT" yang ingin digunakan
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Gunakan GPU jika tersedia
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Fungsi training sederhana
def train_one_epoch(model, dataloader, optimizer):
    model.train()
    total_loss = 0
    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=labels
        )
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(dataloader)

# Fungsi validasi sederhana
def evaluate(model, dataloader):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            token_type_ids = batch["token_type_ids"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                labels=labels
            )
            loss = outputs.loss
            logits = outputs.logits

            total_loss += loss.item()

            preds = torch.argmax(logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    return total_loss / len(dataloader), correct / total

# Contoh training loop singkat 1-3 epoch (silakan sesuaikan)
num_epochs = 3
for epoch in range(num_epochs):
    train_loss = train_one_epoch(model, train_loader, optimizer)
    val_loss, val_acc = evaluate(model, valid_loader)

    print(f"Epoch: {epoch+1}, Train Loss: {train_loss:.4f}, Valid Loss: {val_loss:.4f}, Valid Acc: {val_acc:.4f}")

# Setelah training, kita asumsikan model siap untuk dipakai re-ranking
# Simpan model kalau mau digunakan kembali
model.save_pretrained("path/to/modern_bert_reranker")
tokenizer.save_pretrained("path/to/modern_bert_reranker")


In [21]:
import shutil

# Specify the folder to save
folder_path = "./path"  # Replace with the path to your folder
output_zip = "path.zip"  # Name for the compressed file

# Compress the folder
shutil.make_archive("path", 'zip', folder_path)

# Download the file
from google.colab import files
files.download(output_zip)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [22]:
class BertReranker(Transformer):
    def __init__(self, model, tokenizer, device, max_len=128):
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.max_len = max_len
        self.model.eval()

    def transform(self, df):
        # df akan berisi kolom [qid, query, docno, text, rank, score] dsb.
        # Kita perlu menghitung skor relevansi BERT lalu menimpa kolom "score".

        # Buat list of queries, list of docs
        queries = df["query"].tolist()
        docs = df["text"].tolist()

        all_scores = []

        with torch.no_grad():
            for query_text, doc_text in zip(queries, docs):
                inputs = self.tokenizer(
                    query_text,
                    doc_text,
                    truncation=True,
                    max_length=self.max_len,
                    padding="max_length",
                    return_tensors="pt"
                )
                input_ids = inputs["input_ids"].to(self.device)
                attention_mask = inputs["attention_mask"].to(self.device)
                token_type_ids = inputs["token_type_ids"].to(self.device)

                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids
                )
                # logits shape: [batch_size=1, num_labels=2]
                logits = outputs.logits
                # Kita ambil logit untuk label "relevan" (label=1), misal index=1
                score = logits[0, 1].item()
                all_scores.append(score)

        df["score"] = all_scores
        return df


In [23]:
from transformers import BertForSequenceClassification, BertTokenizer

model_path = "/content/path/to/modern_bert_reranker"
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)



In [None]:
import re

# Preprocess queries to remove invalid characters
def clean_query(query):
    # Hapus karakter yang tidak valid seperti "/"
    return re.sub(r"[^a-zA-Z0-9\s]", "", query)

# Terapkan pembersihan pada kueri
queries_df["query"] = queries_df["query"].apply(clean_query)

# Baseline BM25
bm25 = pt.terrier.Retriever(index_ref,
                            metadata=["docno", "text"],
                            wmodel="BM25")

# Define pipelines
bm25_pipeline = bm25 % 30
bm25_bert_pipeline = bm25 % 50 >> bert_reranker % 30

# Perform retrieval for both pipelines
bm25_results = bm25_pipeline.transform(queries_df)
bm25_bert_results = bm25_bert_pipeline.transform(queries_df)

# Evaluate both pipelines
evaluator = pt.Utils.evaluate
metrics = ["map", "ndcg", "P@10"]

bm25_scores = evaluator(bm25_results, qrels_df, metrics)
bm25_bert_scores = evaluator(bm25_bert_results, qrels_df, metrics)

# Print comparison
print("BM25 Scores:")
print(bm25_scores)
print("\nBM25 + BERT Reranker Scores:")
print(bm25_bert_scores)

# Create a comparison DataFrame
comparison_df = pd.DataFrame({
    "Metric": metrics,
    "BM25": [bm25_scores[metric] for metric in metrics],
    "BM25 + Reranker": [bm25_bert_scores[metric] for metric in metrics]
})

print("\nComparison of Metrics:")
print(comparison_df)


In [24]:
test_queries_df = pd.DataFrame([
    {"qid": "q_test", "query": "transport"}
])


In [None]:
retriever = pt.terrier.Retriever(index_ref, wmodel="BM25", metadata=["docno","text"])
bm25 = pt.BatchRetrieve(
    index_ref,
    wmodel="BM25",
    metadata=["docno", "text"]
)
bm25_top50 = bm25 % 50

bert_reranker = BertReranker(model, tokenizer, device=device)

pipeline = bm25_top50 >> bert_reranker

res = pipeline.transform(test_queries_df)

# 6) Ambil top 30
res_top30 = res.groupby("qid", as_index=False).apply(lambda df: df.nlargest(30, "score")).reset_index(drop=True)


In [26]:
res_top30

Unnamed: 0,qid,docid,docno,text,rank,score,query
0,q_test,189,190,on magnetohydrodynamic shock waves . in the e...,13,2.165906,transport
1,q_test,622,623,on the coupling between heat and mass transfer...,17,2.028297,transport
2,q_test,102,103,theory of mixing and chemical reaction in the ...,11,1.96295,transport
3,q_test,301,302,approximations for the thermodynamic and trans...,4,1.948233,transport
4,q_test,404,405,tables of thermal properties of gases .tables ...,2,1.908118,transport
5,q_test,327,328,the boundary layer near the stagnation point i...,20,1.891811,transport
6,q_test,809,810,the shock wave noise problem of supersonic air...,14,1.877725,transport
7,q_test,578,579,further developments of new methods in heat fl...,15,1.843284,transport
8,q_test,690,691,calculation procedure for thermodynamic transp...,3,1.834661,transport
9,q_test,96,97,a mixing theory for the interaction between di...,19,1.819655,transport


In [27]:
res_top30['text']

Unnamed: 0,text
0,on magnetohydrodynamic shock waves . in the e...
1,on the coupling between heat and mass transfer...
2,theory of mixing and chemical reaction in the ...
3,approximations for the thermodynamic and trans...
4,tables of thermal properties of gases .tables ...
5,the boundary layer near the stagnation point i...
6,the shock wave noise problem of supersonic air...
7,further developments of new methods in heat fl...
8,calculation procedure for thermodynamic transp...
9,a mixing theory for the interaction between di...
