In [3]:
from datasets import load_dataset

ds = load_dataset("allenai/multixscience_sparse_mean")

In [4]:
train_df = ds["train"].to_pandas()

train_df.head()

Unnamed: 0,aid,mid,abstract,related_work,ref_abstract
0,math9912167,1631980677,"Author(s): Kuperberg, Greg; Thurston, Dylan P....",Two other generalizations that can be consider...,"{'cite_N': ['@cite_16', '@cite_26'], 'mid': ['..."
1,cs9910011,2168463568,A statistical model for segmentation and word ...,"Model Based Dynamic Programming, hereafter ref...","{'cite_N': ['@cite_0'], 'mid': ['2074546930'],..."
2,cs9911003,2950670108,We solve the subgraph isomorphism problem in p...,Recently we were able to characterize the grap...,"{'cite_N': ['@cite_41'], 'mid': ['2074992286']..."
3,hep-th9908200,2160091034,Daviau showed the equivalence of matrix Dirac ...,A further genuine and important approach to th...,"{'cite_N': ['@cite_6'], 'mid': ['2082565556'],..."
4,cs9903014,1612660921,We present an open architecture for just-in-ti...,Pioneering research in dynamic runtime optimiz...,"{'cite_N': ['@cite_8'], 'mid': ['2101776604'],..."


In [None]:
import json
import os
import hashlib
from tqdm import tqdm

# Setup
output_dir = "final_dataset_retrieval"
os.makedirs(output_dir, exist_ok=True)

def get_hash(text):
    """Membuat ID unik berdasarkan konten teks"""
    return hashlib.md5(text.encode('utf-8')).hexdigest()

# Variabel penampung
seen_train_pairs = set()
corpus_set = set() # Untuk menampung semua dokumen unik yang akan masuk Chroma
train_data = []
val_data = []
test_data = []

# Proses TRAIN Split (Untuk Fine-Tuning Model)
print(">>> Memproses TRAIN Split (Membuat Pasangan)...")
for row in tqdm(ds['train']):
    anchor = row['abstract']
    refs = row['ref_abstract']['abstract']
    
    if not anchor or len(anchor) < 50: continue
    if not refs: continue

    for ref in refs:
        if not ref or len(ref) < 50: continue
        if anchor == ref: continue 
        
        # Simpan ke Corpus (Database Chroma nanti)
        corpus_set.add(ref)
        
        # Deduplikasi Pasangan untuk Training
        pair_sig = (get_hash(anchor), get_hash(ref))
        if pair_sig not in seen_train_pairs:
            seen_train_pairs.add(pair_sig)
            train_data.append({
                "anchor": anchor,
                "positive": ref
            })

# Proses VALIDATION & TEST Split (Untuk Evaluasi)
# Di sini kita TIDAK flatten jadi pasangan, tapi simpan format:
# { "query": "...", "ground_truths": ["...", "..."] }
# Agar kita bisa hitung Recall@10 (berapa banyak GT yang ketangkap)

def process_eval_split(split_name, dataset_split):
    processed = []
    print(f">>> Memproses {split_name} Split...")
    for row in tqdm(dataset_split):
        query = row['abstract']
        refs = row['ref_abstract']['abstract']
        
        if not query or len(query) < 50: continue
        
        valid_refs = []
        for ref in refs:
            if ref and len(ref) > 50:
                valid_refs.append(ref)
                corpus_set.add(ref)
        
        if valid_refs:
            processed.append({
                "query": query,
                "ground_truths": valid_refs
            })
    return processed

val_data = process_eval_split("VALIDATION", ds['validation'])
test_data = process_eval_split("TEST", ds['test'])

# Finalisasi Corpus (Isi ChromaDB)
print(">>> Menyusun Corpus Unik...")
# Ubah set menjadi list of dict biar rapi
corpus_list = [{"id": get_hash(text), "text": text} for text in corpus_set]

# Simpan ke File
print(f"\n--- RINGKASAN DATA ---")
print(f"1. Training Pairs (Finetune) : {len(train_data)} pasang")
print(f"2. Validation Queries        : {len(val_data)} soal")
print(f"3. Test Queries (Ujian)      : {len(test_data)} soal")
print(f"4. Corpus (Isi ChromaDB)     : {len(corpus_list)} dokumen unik")

def save_json(data, filename):
    path = os.path.join(output_dir, filename)
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print(f"Tersimpan: {path}")

save_json(train_data, "train_pairs.json")
save_json(val_data, "val_queries.json")
save_json(test_data, "test_queries.json")
save_json(corpus_list, "corpus.json")

print("\nSELESAI! Gunakan file di folder 'final_dataset_retrieval'.")

>>> Memproses TRAIN Split (Membuat Pasangan)...


100%|██████████| 30369/30369 [00:03<00:00, 10027.02it/s]


>>> Memproses VALIDATION Split...


100%|██████████| 5066/5066 [00:00<00:00, 10136.34it/s]


>>> Memproses TEST Split...


100%|██████████| 5093/5093 [00:00<00:00, 12539.49it/s]


>>> Menyusun Corpus Unik...

--- RINGKASAN DATA ---
1. Training Pairs (Finetune) : 97292 pasang
2. Validation Queries        : 5066 soal
3. Test Queries (Ujian)      : 5093 soal
4. Corpus (Isi ChromaDB)     : 74015 dokumen unik
Tersimpan: final_dataset_retrieval\train_pairs.json
Tersimpan: final_dataset_retrieval\val_queries.json
Tersimpan: final_dataset_retrieval\test_queries.json
Tersimpan: final_dataset_retrieval\corpus.json

SELESAI! Gunakan file di folder 'final_dataset_skripsi'.


In [7]:
import json
import os

output_dir = "final_dataset_retrieval"

def load_and_inspect(filename, file_type):
    path = os.path.join(output_dir, filename)
    
    if not os.path.exists(path):
        print(f"File tidak ditemukan: {path}")
        return

    print(f"\n{'='*20} {filename.upper()} {'='*20}")
    
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    print(f"Tipe Data  : {file_type}")
    print(f"Total Data : {len(data)} item")
    
    sample = data[0]
    
    print("\nCONTOH DATA PERTAMA:")
    if file_type == "TRAIN PAIRS":
        print(f"   [Anchor]   : {sample['anchor'][:100]}...")
        print(f"   [Positive] : {sample['positive'][:100]}...")
        
    elif file_type == "QUERIES (Test/Val)":
        print(f"   [Query]         : {sample['query'][:100]}...")
        print(f"   [Ground Truths] : Ada {len(sample['ground_truths'])} jawaban benar.")
        print(f"     -> Jwbn 1     : {sample['ground_truths'][0][:100]}...")
        
    elif file_type == "CORPUS":
        print(f"   [ID Unik] : {sample['id']}")
        print(f"   [Text]    : {sample['text'][:100]}...")

load_and_inspect("train_pairs.json", "TRAIN PAIRS")
load_and_inspect("val_queries.json", "QUERIES (Test/Val)")
load_and_inspect("test_queries.json", "QUERIES (Test/Val)")
load_and_inspect("corpus.json", "CORPUS")


Tipe Data  : TRAIN PAIRS
Total Data : 97292 item

CONTOH DATA PERTAMA:
   [Anchor]   : Author(s): Kuperberg, Greg; Thurston, Dylan P. | Abstract: We give a purely topological definition o...
   [Positive] : This note is a sequel to our earlier paper of the same title [4] and describes invariants of rationa...

Tipe Data  : QUERIES (Test/Val)
Total Data : 5066 item

CONTOH DATA PERTAMA:
   [Query]         : One of the key concepts in testing is that of adequate test sets. A test selection criterion decides...
   [Ground Truths] : Ada 1 jawaban benar.
     -> Jwbn 1     : An approach to functional testing is described in which the design of a program is viewed as an inte...

Tipe Data  : QUERIES (Test/Val)
Total Data : 5093 item

CONTOH DATA PERTAMA:
   [Query]         : We present our approach to the problem of how an agent, within an economic Multi-Agent System, can d...
   [Ground Truths] : Ada 4 jawaban benar.
     -> Jwbn 1     : Cooperative multi-agent systems (MAS) are ones in wh

In [6]:
import json
import os
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses, models, evaluation
from torch.utils.data import DataLoader
import torch

# Config
DATA_DIR = "final_dataset_retrieval"
OUTPUT_DIR = "models-retrieval"
BATCH_SIZE = 16
EPOCHS = 3

# Daftar Model yang akan diuji
MODELS_TO_TEST = [
    "sentence-transformers/all-MiniLM-L6-v2",
    "sentence-transformers/all-mpnet-base-v2"
]

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load Data Function
def load_json(filename):
    with open(os.path.join(DATA_DIR, filename), 'r', encoding='utf-8') as f:
        return json.load(f)

print("1. Loading Dataset...")
train_raw = load_json("train_pairs.json")
test_queries_raw = load_json("test_queries.json")
corpus_raw = load_json("corpus.json")

# Evaluator Setup
# Evaluator butuh format khusus: Corpus Dict, Queries Dict, dan Relevance Dict
print("Menyiapkan Evaluator Standard")

# Buat Dictionary Corpus {id: text} & Reverse Map {text: id} untuk lookup
corpus_dict = {}
text_to_id_map = {}

for item in corpus_raw:
    doc_id = item['id']
    text = item['text']
    corpus_dict[doc_id] = text
    text_to_id_map[text] = doc_id

# Buat Dictionary Queries & Relevance
queries_dict = {}
relevant_docs = {}

for i, item in enumerate(test_queries_raw):
    qid = f"q_{i}"
    queries_dict[qid] = item['query']
    
    # Cari ID dokumen untuk setiap ground truth text
    ground_truth_ids = set()
    for gt_text in item['ground_truths']:
        found_id = text_to_id_map.get(gt_text)
        if found_id:
            ground_truth_ids.add(found_id)
            
    if ground_truth_ids:
        relevant_docs[qid] = ground_truth_ids

# Setup Object Evaluator
ir_evaluator = evaluation.InformationRetrievalEvaluator(
    queries_dict,
    corpus_dict,
    relevant_docs,
    show_progress_bar=True,
    name="test",
    mrr_at_k=[10],
    ndcg_at_k=[10],
    accuracy_at_k=[1, 5, 10], # Ini sama dengan Recall@K / Hit Rate
    precision_recall_at_k=[10]
)

print(f"   - Siap mengevaluasi {len(queries_dict)} queries terhadap {len(corpus_dict)} dokumen.")

# FUNGSI TRAINING & EVALUASI 
results_table = []

def run_experiment(model_name):
    short_name = model_name.split("/")[-1]
    print(f"\n{'='*50}")
    print(f"EXPERIMENT: {short_name}")
    print(f"{'='*50}")

    # --- PHASE A: BASELINE EVALUATION (Sebelum Fine-tune) ---
    print(f"A. Menguji Baseline {short_name}...")
    model_baseline = SentenceTransformer(model_name)
    
    # Run Evaluator
    metrics_base = ir_evaluator(model_baseline)

    print(metrics_base.keys())
    
    # Simpan hasil
    results_table.append({
        "Model": short_name,
        "Type": "Baseline (Pre-trained)",
        "Recall@10": metrics_base['test_accuracy@10'], # Hit Rate
        "MRR@10": metrics_base['test_mrr@10']
    })
    print(f"   -> Baseline Recall@10: {metrics_base['test_accuracy@10']:.4f}")
    print(f"   -> Baseline MRR@10   : {metrics_base['test_mrr@10']:.4f}")
    
    del model_baseline # Hapus dari RAM
    torch.cuda.empty_cache() # Bersihkan GPU

    # --- PHASE B: FINE-TUNING ---
    print(f"B. Melakukan Fine-Tuning {short_name}...")
    
    # Setup Model (Max Length 300 agar aman memori)
    word_emb = models.Transformer(model_name, max_seq_length=300)
    pooling = models.Pooling(word_emb.get_word_embedding_dimension())
    model_finetune = SentenceTransformer(modules=[word_emb, pooling])
    
    # Setup Data Loader
    train_examples = [InputExample(texts=[d['anchor'], d['positive']]) for d in train_raw]
    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=BATCH_SIZE)
    
    # Loss
    train_loss = losses.MultipleNegativesRankingLoss(model_finetune)
    
    # Train
    save_path = os.path.join(OUTPUT_DIR, f"{short_name}-finetuned")
    model_finetune.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=EPOCHS,
        warmup_steps=int(len(train_dataloader) * 0.1),
        show_progress_bar=True
    )
    
    # PHASE C: EVALUASI FINETUNED MODEL
    print(f"Menguji Model Finetuned...")
    metrics_ft = ir_evaluator(model_finetune)
    
    results_table.append({
        "Model": short_name,
        "Type": "Fine-Tuned",
        "Recall@10": metrics_ft['test_accuracy@10'],
        "MRR@10": metrics_ft['test_mrr@10']
    })
    print(f"   -> Finetuned Recall@10: {metrics_ft['test_accuracy@10']:.4f}")
    print(f"   -> Finetuned MRR@10   : {metrics_ft['test_mrr@10']:.4f}")
    
    # Save model lokal jika mau dipakai nanti
    model_finetune.save(save_path)
    
    del model_finetune
    torch.cuda.empty_cache()

# EKSEKUSI
for model_name in MODELS_TO_TEST:
    run_experiment(model_name)

# TAMPILKAN HASIL AKHIR
print("\n\n" + "="*20 + " HASIL AKHIR RETRIEVAL " + "="*20)
df_results = pd.DataFrame(results_table)
# Format tampilan agar cantik
pd.options.display.float_format = '{:,.4f}'.format
print(df_results)

# Opsional: Simpan ke CSV untuk laporan
df_results.to_csv("hasil_benchmark_retrieval.csv", index=False)

1. Loading Dataset...
Menyiapkan Evaluator Standard
   - Siap mengevaluasi 5093 queries terhadap 74015 dokumen.

EXPERIMENT: all-MiniLM-L6-v2
A. Menguji Baseline all-MiniLM-L6-v2...


Batches: 100%|██████████| 160/160 [00:08<00:00, 19.73it/s]
Batches: 100%|██████████| 1563/1563 [01:14<00:00, 20.97it/s]
Batches: 100%|██████████| 751/751 [00:36<00:00, 20.31it/s]t]
Corpus Chunks: 100%|██████████| 2/2 [01:54<00:00, 57.29s/it]


dict_keys(['test_cosine_accuracy@1', 'test_cosine_accuracy@5', 'test_cosine_accuracy@10', 'test_cosine_precision@10', 'test_cosine_recall@10', 'test_cosine_ndcg@10', 'test_cosine_mrr@10', 'test_cosine_map@100'])


KeyError: 'test_accuracy@10'

In [2]:
import json
import os
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses, models, evaluation
from torch.utils.data import DataLoader
import torch
import gc

# --- CONFIG ---
DATA_DIR = "final_dataset_retrieval"
OUTPUT_DIR = "models-retrieval"
BATCH_SIZE = 64
EPOCHS = 3

# Daftar Model
MODELS_TO_TEST = [
    "sentence-transformers/all-MiniLM-L6-v2",
    # "sentence-transformers/all-mpnet-base-v2"
]

os.makedirs(OUTPUT_DIR, exist_ok=True)

# LOAD DATA
def load_json(filename):
    print(f"Loading {filename}...")
    with open(os.path.join(DATA_DIR, filename), 'r', encoding='utf-8') as f:
        return json.load(f)

train_raw = load_json("train_pairs.json")
test_queries_raw = load_json("test_queries.json")
corpus_raw = load_json("corpus.json")

# SETUP EVALUATOR
print("\nMenyiapkan Evaluator...")

# Corpus Dictionary
corpus_dict = {}
text_to_id_map = {}
for item in corpus_raw:
    corpus_dict[item['id']] = item['text']
    text_to_id_map[item['text']] = item['id']

# Queries & Relevance Dictionary
queries_dict = {}
relevant_docs = {}

for i, item in enumerate(test_queries_raw):
    qid = f"q_{i}"
    queries_dict[qid] = item['query']
    
    ground_truth_ids = set()
    for gt_text in item['ground_truths']:
        found_id = text_to_id_map.get(gt_text)
        if found_id:
            ground_truth_ids.add(found_id)
            
    if ground_truth_ids:
        relevant_docs[qid] = ground_truth_ids

# Init Evaluator
ir_evaluator = evaluation.InformationRetrievalEvaluator(
    queries_dict,
    corpus_dict,
    relevant_docs,
    show_progress_bar=True,
    name="test",  # Prefix key nanti jadi "test_cosine_..."
    mrr_at_k=[10],
    ndcg_at_k=[10],
    accuracy_at_k=[1, 5, 10],      # Hit Rate @ K
    precision_recall_at_k=[10]     # Precision & Recall @ K
)
print(f"   -> Evaluator siap: {len(queries_dict)} queries vs {len(corpus_dict)} corpus.")

# FUNGSI EKSPERIMEN
results_table = []

def extract_metrics(metrics_dict, model_name, run_type):
    """Helper untuk mengambil semua metrik yang diminta user"""
    return {
        "Model": model_name,
        "Type": run_type,
        "Hit@1": metrics_dict.get('test_cosine_accuracy@1', 0),
        "Hit@5": metrics_dict.get('test_cosine_accuracy@5', 0),
        "Hit@10": metrics_dict.get('test_cosine_accuracy@10', 0),
        "MRR@10": metrics_dict.get('test_cosine_mrr@10', 0),
        "NDCG@10": metrics_dict.get('test_cosine_ndcg@10', 0),
        "Precision@10": metrics_dict.get('test_cosine_precision@10', 0),
        "Recall@10": metrics_dict.get('test_cosine_recall@10', 0)
    }

def run_experiment(model_name):
    short_name = model_name.split("/")[-1]
    print(f"\n{'='*60}")
    print(f"EXPERIMENT: {short_name}")
    print(f"{'='*60}")

    # BASELINE
    print(f"Evaluasi Baseline (Pre-trained)...")
    model_base = SentenceTransformer(model_name)
    metrics_base = ir_evaluator(model_base)
    
    # Ambil & Simpan Metrik
    res_base = extract_metrics(metrics_base, short_name, "Baseline")
    results_table.append(res_base)
    
    print(f"   [Base] Hit@10: {res_base['Hit@10']:.4f} | MRR@10: {res_base['MRR@10']:.4f} | NDCG@10: {res_base['NDCG@10']:.4f} | Recall@10: {res_base['Recall@10']:.4f} | Precision@10: {res_base['Precision@10']:.4f}")
    
    del model_base
    torch.cuda.empty_cache()

    # FINE-TUNING
    print(f"Training (Fine-tuning)...")
    # Init Model
    word_emb = models.Transformer(model_name, max_seq_length=256)
    pooling = models.Pooling(word_emb.get_word_embedding_dimension())
    model_ft = SentenceTransformer(modules=[word_emb, pooling])
    
    # DataLoader
    train_ex = [InputExample(texts=[d['anchor'], d['positive']]) for d in train_raw]
    train_dl = DataLoader(train_ex, shuffle=True, batch_size=BATCH_SIZE)
    train_loss = losses.MultipleNegativesRankingLoss(model_ft)
    
    # Fit
    save_path = os.path.join(OUTPUT_DIR, f"{short_name}-finetuned")
    model_ft.fit(
        train_objectives=[(train_dl, train_loss)],
        epochs=EPOCHS,
        warmup_steps=int(len(train_dl) * 0.1),
        optimizer_params={'lr': 2e-5},
        use_amp=True,
        show_progress_bar=True
    )
    
    # EVALUASI FINETUNED
    print(f"C. Evaluasi Model Finetuned...")
    metrics_ft = ir_evaluator(model_ft)
    
    # Ambil & Simpan Metrik
    res_ft = extract_metrics(metrics_ft, short_name, "Fine-Tuned")
    results_table.append(res_ft)
    
    print(f"   [Fine] Hit@10: {res_ft['Hit@10']:.4f} | MRR@10: {res_ft['MRR@10']:.4f} | NDCG@10: {res_ft['NDCG@10']:.4f} | Recall@10: {res_ft['Recall@10']:.4f} | Precision@10: {res_ft['Precision@10']:.4f}")

    # Save
    model_ft.save(save_path)
    del model_ft
    torch.cuda.empty_cache()
    gc.collect()

# EKSEKUSI
for model in MODELS_TO_TEST:
    run_experiment(model)

# HASIL AKHIR
print("\n\n" + "="*30 + " HASIL AKHIR LENGKAP " + "="*30)
df = pd.DataFrame(results_table)

# Urutkan kolom agar rapi
cols = ["Model", "Type", "Hit@1", "Hit@5", "Hit@10", "MRR@10", "NDCG@10", "Precision@10", "Recall@10"]
df = df[cols]

# Print Tabel
pd.options.display.float_format = '{:,.4f}'.format
print(df)

# Simpan ke CSV
csv_path = "hasil_retrieval.csv"
df.to_csv(csv_path, index=False)
print(f"\nHasil lengkap tersimpan di: {csv_path}")

Loading train_pairs.json...
Loading test_queries.json...
Loading corpus.json...

Menyiapkan Evaluator...
   -> Evaluator siap: 5093 queries vs 74015 corpus.

EXPERIMENT: all-MiniLM-L6-v2
Evaluasi Baseline (Pre-trained)...


Batches: 100%|██████████| 160/160 [00:07<00:00, 20.06it/s]
Batches: 100%|██████████| 1563/1563 [01:12<00:00, 21.44it/s]
Batches: 100%|██████████| 751/751 [00:35<00:00, 21.28it/s]t]
Corpus Chunks: 100%|██████████| 2/2 [01:51<00:00, 55.54s/it]


   [Base] Hit@10: 0.4129 | MRR@10: 0.2572 | NDCG@10: 0.1663 | Recall@10: 0.1797 | Precision@10: 0.0719
Training (Fine-tuning)...


                                                                     

Step,Training Loss
500,1.0739
1000,0.9937
1500,0.944
2000,0.8453
2500,0.835
3000,0.8333
3500,0.7786
4000,0.7708
4500,0.7629


C. Evaluasi Model Finetuned...


Batches: 100%|██████████| 160/160 [00:03<00:00, 52.06it/s]
Batches: 100%|██████████| 1563/1563 [00:29<00:00, 53.34it/s]
Batches: 100%|██████████| 751/751 [00:14<00:00, 52.88it/s]t]
Corpus Chunks: 100%|██████████| 2/2 [00:47<00:00, 23.69s/it]


   [Fine] Hit@10: 0.4127 | MRR@10: 0.2557 | NDCG@10: 0.1653 | Recall@10: 0.1802 | Precision@10: 0.0721


              Model        Type  Hit@1  Hit@5  Hit@10  MRR@10  NDCG@10  \
0  all-MiniLM-L6-v2    Baseline 0.1932 0.3442  0.4129  0.2572   0.1663   
1  all-MiniLM-L6-v2  Fine-Tuned 0.1907 0.3405  0.4127  0.2557   0.1653   

   Precision@10  Recall@10  
0        0.0719     0.1797  
1        0.0721     0.1802  

Hasil lengkap tersimpan di: hasil_retrieval.csv


In [1]:
from datasets import load_dataset

ds = load_dataset("allenai/multixscience_sparse_mean")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import json
import os
import hashlib
from tqdm import tqdm

# Setup
output_dir = "final_dataset_retrieval_split"
os.makedirs(output_dir, exist_ok=True)

def get_hash(text):
    """Membuat ID unik berdasarkan konten teks"""
    return hashlib.md5(text.encode('utf-8')).hexdigest()

# Load Dataset
print("Loading Dataset...")

# --- FUNGSI HELPER UNTUK MEMPROSES TIAP SPLIT ---
def process_split_complete(dataset_split, split_name):
    pairs = []          # Untuk Training (Anchor, Positive)
    queries = []        # Untuk Evaluasi (Query, Ground Truth IDs)
    split_corpus = set() # Corpus KHUSUS untuk split ini
    
    seen_pairs = set()
    
    print(f">>> Memproses {split_name} Split...")
    
    for row in tqdm(dataset_split):
        anchor = row['abstract']
        refs = row['ref_abstract']['abstract']
        
        if not anchor or len(anchor) < 50: continue
        if not refs: continue
        
        # Simpan Query (untuk Val/Test)
        valid_refs = []
        for ref in refs:
            if ref and len(ref) > 50 and ref != anchor:
                valid_refs.append(ref)
                split_corpus.add(ref) # Masukkan ke corpus spesifik split ini
        
        if valid_refs:
            # 1. Simpan format Query (untuk Evaluasi)
            queries.append({
                "query": anchor,
                "ground_truths": valid_refs
            })
            
            # 2. Simpan format Pair (hanya untuk TRAIN)
            if split_name == 'train':
                for ref in valid_refs:
                    pair_sig = (get_hash(anchor), get_hash(ref))
                    if pair_sig not in seen_pairs:
                        seen_pairs.add(pair_sig)
                        pairs.append({
                            "anchor": anchor,
                            "positive": ref
                        })

    # Convert corpus set to list of dicts
    corpus_list = [{"id": get_hash(text), "text": text} for text in split_corpus]
    
    return pairs, queries, corpus_list

# --- EKSEKUSI PROSES ---

# 1. TRAIN
train_pairs, _, corpus_train = process_split_complete(ds['train'], 'train')

# 2. VALIDATION
# (Pairs kosong karena val tidak butuh training pairs, cuma butuh queries & corpus)
_, val_queries, corpus_val = process_split_complete(ds['validation'], 'validation')

# 3. TEST
_, test_queries, corpus_test = process_split_complete(ds['test'], 'test')

# --- SIMPAN DATA ---
def save_json(data, filename):
    path = os.path.join(output_dir, filename)
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print(f"Tersimpan: {path} ({len(data)} items)")

print(f"\n--- PENYIMPANAN DATA ---")

# Train Data
save_json(train_pairs, "train_pairs.json")
save_json(corpus_train, "corpus_train.json")

# Validation Data
save_json(val_queries, "val_queries.json")
save_json(corpus_val, "corpus_val.json")

# Test Data
save_json(test_queries, "test_queries.json")
save_json(corpus_test, "corpus_test.json")

print("\nSELESAI! Folder 'final_dataset_retrieval_split' berisi corpus yang terpisah.")

Loading Dataset...
>>> Memproses train Split...


100%|██████████| 30369/30369 [00:03<00:00, 8311.42it/s]


>>> Memproses validation Split...


100%|██████████| 5066/5066 [00:00<00:00, 13677.96it/s]


>>> Memproses test Split...


100%|██████████| 5093/5093 [00:00<00:00, 13528.06it/s]



--- PENYIMPANAN DATA ---
Tersimpan: final_dataset_retrieval_split\train_pairs.json (97292 items)
Tersimpan: final_dataset_retrieval_split\corpus_train.json (62989 items)
Tersimpan: final_dataset_retrieval_split\val_queries.json (5066 items)
Tersimpan: final_dataset_retrieval_split\corpus_val.json (13612 items)
Tersimpan: final_dataset_retrieval_split\test_queries.json (5093 items)
Tersimpan: final_dataset_retrieval_split\corpus_test.json (11568 items)

SELESAI! Folder 'final_dataset_retrieval_split' berisi corpus yang terpisah.


In [3]:
import json
import os
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses, models, evaluation
from torch.utils.data import DataLoader
import torch
import gc

# --- CONFIG ---
DATA_DIR = "final_dataset_retrieval_split" # Pastikan folder ini benar
OUTPUT_DIR = "models-retrieval"
BATCH_SIZE = 64
EPOCHS = 1 # Kita set agak lama, nanti 'save_best_model' yang akan pilih yang terbaik

MODELS_TO_TEST = [
    "sentence-transformers/all-MiniLM-L6-v2",
    # "sentence-transformers/all-mpnet-base-v2"
]

os.makedirs(OUTPUT_DIR, exist_ok=True)

# LOAD DATA HELPER
def load_json(filename):
    print(f"Loading {filename}...")
    path = os.path.join(DATA_DIR, filename)
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

# --- 1. LOAD DATA ---
train_raw = load_json("train_pairs.json")

# Data Test (Ujian Akhir)
test_queries_raw = load_json("test_queries.json")
corpus_test_raw = load_json("corpus_test.json") 

# Data Validation (Try Out saat Training) - BARU DITAMBAHKAN
val_queries_raw = load_json("val_queries.json")
corpus_val_raw = load_json("corpus_val.json") 

# --- 2. SETUP EVALUATOR ---

def create_evaluator(queries_raw, corpus_raw, name_prefix):
    """Fungsi helper untuk bikin evaluator biar tidak copy-paste kode"""
    print(f"Menyiapkan Evaluator: {name_prefix}...")
    
    corpus_dict = {item['id']: item['text'] for item in corpus_raw}
    text_to_id_map = {item['text']: item['id'] for item in corpus_raw}
    
    queries_dict = {}
    relevant_docs = {}

    for i, item in enumerate(queries_raw):
        qid = f"{name_prefix}_q_{i}"
        queries_dict[qid] = item['query']
        
        ground_truth_ids = set()
        for gt_text in item['ground_truths']:
            found_id = text_to_id_map.get(gt_text)
            if found_id:
                ground_truth_ids.add(found_id)
        
        if ground_truth_ids:
            relevant_docs[qid] = ground_truth_ids
            
    return evaluation.InformationRetrievalEvaluator(
        queries_dict,
        corpus_dict,
        relevant_docs,
        show_progress_bar=False, # False biar log training tidak berantakan
        name=name_prefix,
        mrr_at_k=[10],
        accuracy_at_k=[10],
        precision_recall_at_k=[10]
    )

# Buat 2 Evaluator Berbeda
print("\n--- BUILDING EVALUATORS ---")
# 1. Evaluator VALIDATION (Dipakai saat training berjalan)
val_evaluator = create_evaluator(val_queries_raw, corpus_val_raw, "val")

# 2. Evaluator TEST (Dipakai setelah training selesai)
test_evaluator = create_evaluator(test_queries_raw, corpus_test_raw, "test")


# --- 3. FUNGSI EKSPERIMEN ---
results_table = []

def extract_metrics(metrics_dict, model_name, run_type):
    # Perhatikan prefix key-nya menyesuaikan nama evaluator ('test_' atau 'val_')
    prefix = "test_" if "test" in run_type.lower() else "val_"
    return {
        "Model": model_name,
        "Type": run_type,
        "Hit@10": metrics_dict.get(f'{prefix}cosine_accuracy@10', 0),
        "MRR@10": metrics_dict.get(f'{prefix}cosine_mrr@10', 0),
        "Recall@10": metrics_dict.get(f'{prefix}cosine_recall@10', 0)
    }

def run_experiment(model_name):
    short_name = model_name.split("/")[-1]
    print(f"\n{'='*60}\nEXPERIMENT: {short_name}\n{'='*60}")

    # A. BASELINE (Cek performa awal di Test Set)
    print(f"Evaluasi Baseline...")
    model_base = SentenceTransformer(model_name)
    metrics_base = test_evaluator(model_base)
    
    res_base = extract_metrics(metrics_base, short_name, "Baseline")
    results_table.append(res_base)
    print(f" [Base] Hit@10: {res_base['Hit@10']:.4f} | MRR@10: {res_base['MRR@10']:.4f}")
    
    del model_base
    torch.cuda.empty_cache()
    gc.collect()

    # B. FINE-TUNING
    print(f"Training (Fine-tuning)...")
    word_emb = models.Transformer(model_name, max_seq_length=256)
    pooling = models.Pooling(word_emb.get_word_embedding_dimension())
    model_ft = SentenceTransformer(modules=[word_emb, pooling])
    
    train_ex = [InputExample(texts=[d['anchor'], d['positive']]) for d in train_raw]
    train_dl = DataLoader(train_ex, shuffle=True, batch_size=BATCH_SIZE)
    train_loss = losses.MultipleNegativesRankingLoss(model_ft)
    
    save_path = os.path.join(OUTPUT_DIR, f"{short_name}-finetuned")
    
    # --- PERUBAHAN PENTING DI SINI ---
    model_ft.fit(
        train_objectives=[(train_dl, train_loss)],
        epochs=EPOCHS,
        warmup_steps=int(len(train_dl) * 0.1),
        optimizer_params={'lr': 2e-5},
        use_amp=True,
        show_progress_bar=True,
        
        # Masukkan Evaluator Validasi
        evaluator=val_evaluator,
        evaluation_steps=500,     # Cek nilai validasi setiap 500 step
        save_best_model=True,     # Simpan model HANYA jika skor validasi naik
        output_path=save_path     # Folder tempat nyimpan model terbaik
    )
    
    # C. EVALUASI MODEL TERBAIK (TEST SET)
    print(f"Evaluasi Model Finetuned (Best Checkpoint)...")
    
    # Kita load ulang model dari folder save_path 
    # untuk memastikan kita pakai model TERBAIK (bukan model epoch terakhir)
    best_model = SentenceTransformer(save_path)
    
    metrics_ft = test_evaluator(best_model)
    
    res_ft = extract_metrics(metrics_ft, short_name, "Fine-Tuned")
    results_table.append(res_ft)
    print(f" [Fine] Hit@10: {res_ft['Hit@10']:.4f} | MRR@10: {res_ft['MRR@10']:.4f}")

    del model_ft
    del best_model
    torch.cuda.empty_cache()
    gc.collect()

# EKSEKUSI
for model in MODELS_TO_TEST:
    run_experiment(model)

# HASIL AKHIR
print("\n\n" + "="*30 + " HASIL AKHIR " + "="*30)
df = pd.DataFrame(results_table)
cols = ["Model", "Type", "Hit@10", "MRR@10", "Recall@10"]
print(df[cols])

csv_path = "hasil_retrieval_final_validated.csv"
df.to_csv(csv_path, index=False)

Loading train_pairs.json...
Loading test_queries.json...
Loading corpus_test.json...
Loading val_queries.json...
Loading corpus_val.json...

--- BUILDING EVALUATORS ---
Menyiapkan Evaluator: val...
Menyiapkan Evaluator: test...

EXPERIMENT: all-MiniLM-L6-v2
Evaluasi Baseline...
 [Base] Hit@10: 0.0000 | MRR@10: 0.0000
Training (Fine-tuning)...


                                                                     

Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [1]:
import json
import os
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses, models, evaluation
from torch.utils.data import DataLoader
import torch
import gc

# --- CONFIG ---
# Folder data hasil split sebelumnya
DATA_DIR = "final_dataset_retrieval_split" 
OUTPUT_DIR = "models-retrieval"
BATCH_SIZE = 64  # Batch besar = Bagus untuk Contrastive Learning
EPOCHS = 3       # Model akan stop otomatis jika validasi tidak membaik

# Daftar Model
MODELS_TO_TEST = [
    "sentence-transformers/all-MiniLM-L6-v2",
    # "sentence-transformers/all-mpnet-base-v2" # Uncomment jika ingin tes MPNet
]

os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- 1. LOAD DATA FUNCTION ---
def load_json(filename):
    print(f"Loading {filename}...")
    path = os.path.join(DATA_DIR, filename)
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Load Data Training
train_raw = load_json("train_pairs.json")

# Load Data Validasi (Untuk pemantauan saat training)
val_queries_raw = load_json("val_queries.json")
corpus_val_raw = load_json("corpus_val.json")

# Load Data Test (Untuk evaluasi akhir)
test_queries_raw = load_json("test_queries.json")
corpus_test_raw = load_json("corpus_test.json")

# --- 2. HELPER UNTUK MEMBUAT EVALUATOR ---
def create_evaluator(queries_raw, corpus_raw, name_prefix):
    """
    Membuat object InformationRetrievalEvaluator.
    name_prefix: 'val' atau 'test' (mempengaruhi nama key di hasil)
    """
    print(f"Menyiapkan Evaluator: {name_prefix.upper()}...")
    
    # 1. Corpus Dictionary
    corpus_dict = {item['id']: item['text'] for item in corpus_raw}
    text_to_id_map = {item['text']: item['id'] for item in corpus_raw}
    
    # 2. Queries & Ground Truths
    queries_dict = {}
    relevant_docs = {}

    for i, item in enumerate(queries_raw):
        qid = f"{name_prefix}_q_{i}" # ID unik per split
        queries_dict[qid] = item['query']
        
        ground_truth_ids = set()
        for gt_text in item['ground_truths']:
            found_id = text_to_id_map.get(gt_text)
            if found_id:
                ground_truth_ids.add(found_id)
        
        if ground_truth_ids:
            relevant_docs[qid] = ground_truth_ids
            
    # 3. Return Evaluator Object
    return evaluation.InformationRetrievalEvaluator(
        queries_dict,
        corpus_dict,
        relevant_docs,
        show_progress_bar=False, # Matikan progress bar evaluasi agar log training bersih
        name=name_prefix,        # Prefix key: test_cosine_... atau val_cosine_...
        mrr_at_k=[10],
        ndcg_at_k=[10],
        accuracy_at_k=[1, 5, 10],      # Hit Rate
        precision_recall_at_k=[10]     # Precision & Recall
    )

# Buat dua evaluator
val_evaluator = create_evaluator(val_queries_raw, corpus_val_raw, "val")
test_evaluator = create_evaluator(test_queries_raw, corpus_test_raw, "test")

# --- 3. FUNGSI EKSTRAKSI METRIK LENGKAP ---
def extract_metrics(metrics_dict, model_name, run_type):
    """Mengambil semua metrik spesifik yang diminta user"""
    
    # Tentukan prefix key berdasarkan tipe run (test atau val)
    # Default library: '{name}_cosine_{metric}@{k}'
    prefix = "test_" if "Baseline" in run_type or "Fine-Tuned" in run_type else "val_"
    
    return {
        "Model": model_name,
        "Type": run_type,
        "Hit@1": metrics_dict.get(f'{prefix}cosine_accuracy@1', 0),
        "Hit@5": metrics_dict.get(f'{prefix}cosine_accuracy@5', 0),
        "Hit@10": metrics_dict.get(f'{prefix}cosine_accuracy@10', 0),
        "MRR@10": metrics_dict.get(f'{prefix}cosine_mrr@10', 0),
        "NDCG@10": metrics_dict.get(f'{prefix}cosine_ndcg@10', 0),
        "Precision@10": metrics_dict.get(f'{prefix}cosine_precision@10', 0),
        "Recall@10": metrics_dict.get(f'{prefix}cosine_recall@10', 0)
    }

# --- 4. CORE EXPERIMENT LOOP ---
results_table = []

def run_experiment(model_name):
    short_name = model_name.split("/")[-1]
    print(f"\n{'='*60}\nEXPERIMENT: {short_name}\n{'='*60}")

    # --- A. BASELINE EVALUATION (Zero-Shot) ---
    print(f"A. Evaluasi Baseline (Pre-trained)...")
    model_base = SentenceTransformer(model_name)
    metrics_base = test_evaluator(model_base)
    
    res_base = extract_metrics(metrics_base, short_name, "Baseline")
    results_table.append(res_base)
    
    print(f"   [Base] Hit@10: {res_base['Hit@10']:.4f} | MRR@10: {res_base['MRR@10']:.4f} | Recall@10: {res_base['Recall@10']:.4f}")
    
    del model_base
    torch.cuda.empty_cache()
    gc.collect()

    # --- B. FINE-TUNING ---
    print(f"B. Training (Fine-tuning)...")
    
    # Setup Model
    word_emb = models.Transformer(model_name, max_seq_length=256)
    pooling = models.Pooling(word_emb.get_word_embedding_dimension())
    model_ft = SentenceTransformer(modules=[word_emb, pooling])
    
    # DataLoader
    train_ex = [InputExample(texts=[d['anchor'], d['positive']]) for d in train_raw]
    train_dl = DataLoader(train_ex, shuffle=True, batch_size=BATCH_SIZE)
    
    # Loss Function
    train_loss = losses.MultipleNegativesRankingLoss(model_ft)
    
    save_path = os.path.join(OUTPUT_DIR, f"{short_name}-finetuned")
    
    # Training dengan Validasi
    model_ft.fit(
        train_objectives=[(train_dl, train_loss)],
        epochs=EPOCHS,
        warmup_steps=int(len(train_dl) * 0.1),
        optimizer_params={'lr': 2e-5},
        use_amp=True,
        show_progress_bar=True,
        
        # Validasi Integrasi
        evaluator=val_evaluator,
        evaluation_steps=500,     # Cek validasi tiap 500 steps
        save_best_model=True,     # Simpan checkpoint terbaik berdasarkan validasi
        output_path=save_path
    )
    
    # --- C. FINAL EVALUATION (Best Model) ---
    print(f"C. Evaluasi Model Finetuned (Best Checkpoint)...")
    
    # Load model terbaik dari hasil save_best_model
    best_model = SentenceTransformer(save_path)
    metrics_ft = test_evaluator(best_model)
    
    res_ft = extract_metrics(metrics_ft, short_name, "Fine-Tuned")
    results_table.append(res_ft)
    
    print(f"   [Fine] Hit@10: {res_ft['Hit@10']:.4f} | MRR@10: {res_ft['MRR@10']:.4f} | Recall@10: {res_ft['Recall@10']:.4f}")

    del model_ft
    del best_model
    torch.cuda.empty_cache()
    gc.collect()

# --- EKSEKUSI ---
for model in MODELS_TO_TEST:
    run_experiment(model)

# --- SAVE RESULT ---
print("\n\n" + "="*30 + " HASIL AKHIR SKRIPSI " + "="*30)
df = pd.DataFrame(results_table)

# Urutkan kolom sesuai permintaan
cols = ["Model", "Type", "Hit@1", "Hit@5", "Hit@10", "MRR@10", "NDCG@10", "Precision@10", "Recall@10"]
df = df[cols]

# Print & Save
pd.options.display.float_format = '{:,.4f}'.format
print(df)

csv_path = "hasil_retrieval_final_lengkap.csv"
df.to_csv(csv_path, index=False)
print(f"\n✅ Hasil lengkap tersimpan di: {csv_path}")

  from .autonotebook import tqdm as notebook_tqdm


Loading train_pairs.json...
Loading val_queries.json...
Loading corpus_val.json...
Loading test_queries.json...
Loading corpus_test.json...
Menyiapkan Evaluator: VAL...
Menyiapkan Evaluator: TEST...

EXPERIMENT: all-MiniLM-L6-v2
A. Evaluasi Baseline (Pre-trained)...
   [Base] Hit@10: 0.5307 | MRR@10: 0.3230 | Recall@10: 0.2715
B. Training (Fine-tuning)...


                                                                     

Step,Training Loss,Validation Loss,Val Cosine Accuracy@1,Val Cosine Accuracy@5,Val Cosine Accuracy@10,Val Cosine Precision@10,Val Cosine Recall@10,Val Cosine Ndcg@10,Val Cosine Mrr@10,Val Cosine Map@100
500,1.0657,No log,0.224832,0.508291,0.632254,0.119049,0.404492,0.303725,0.343669,0.25274
1000,0.9939,No log,0.22878,0.510067,0.63107,0.120114,0.40676,0.306328,0.346431,0.255234
1500,0.9409,No log,0.22957,0.501974,0.635215,0.120114,0.407959,0.306647,0.346601,0.255486
1521,0.9409,No log,0.226411,0.506317,0.632057,0.120351,0.406992,0.3052,0.344171,0.254223
2000,0.8454,No log,0.227201,0.505922,0.637386,0.120786,0.411006,0.306812,0.345004,0.254911
2500,0.8411,No log,0.229175,0.509672,0.635807,0.120707,0.408708,0.307771,0.347395,0.257104
3000,0.8263,No log,0.226806,0.508488,0.634426,0.120845,0.408421,0.306626,0.34529,0.256028
3042,0.8263,No log,0.227201,0.511054,0.634426,0.120944,0.407723,0.306762,0.34598,0.256153
3500,0.7817,No log,0.227793,0.506514,0.631859,0.120292,0.406238,0.305928,0.344895,0.256093
4000,0.7639,No log,0.225424,0.504935,0.637189,0.121674,0.411293,0.307302,0.34443,0.255817


C. Evaluasi Model Finetuned (Best Checkpoint)...
   [Fine] Hit@10: 0.5417 | MRR@10: 0.3299 | Recall@10: 0.2860


              Model        Type  Hit@1  Hit@5  Hit@10  MRR@10  NDCG@10  \
0  all-MiniLM-L6-v2    Baseline 0.2348 0.4420  0.5307  0.3230   0.2377   
1  all-MiniLM-L6-v2  Fine-Tuned 0.2382 0.4516  0.5417  0.3299   0.2474   

   Precision@10  Recall@10  
0        0.1069     0.2715  
1        0.1126     0.2860  

✅ Hasil lengkap tersimpan di: hasil_retrieval_final_lengkap.csv


In [1]:
import json
import os
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses, models, evaluation
from torch.utils.data import DataLoader
import torch
import gc

# --- CONFIG KHUSUS VRAM 6GB ---
DATA_DIR = "final_dataset_retrieval_split"
OUTPUT_DIR = "models-retrieval-mpnet"

# Batch Size 8-10 adalah batas aman MPNet di 6GB VRAM.
# Jika masih OOM, turunkan ke 6 atau 4.
BATCH_SIZE = 8  

EPOCHS = 2 # MPNet cepat pintar, 1 epoch biasanya cukup untuk data sitasi

MODELS_TO_TEST = [
    "sentence-transformers/all-mpnet-base-v2"
]

os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- 1. LOAD DATA ---
def load_json(filename):
    print(f"Loading {filename}...")
    path = os.path.join(DATA_DIR, filename)
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Load Data
train_raw = load_json("train_pairs.json")
test_queries_raw = load_json("test_queries.json")
corpus_test_raw = load_json("corpus_test.json")
val_queries_raw = load_json("val_queries.json")
corpus_val_raw = load_json("corpus_val.json")

# --- 2. SETUP EVALUATOR ---
def create_evaluator(queries_raw, corpus_raw, name_prefix):
    print(f"Menyiapkan Evaluator: {name_prefix.upper()}...")
    corpus_dict = {item['id']: item['text'] for item in corpus_raw}
    text_to_id_map = {item['text']: item['id'] for item in corpus_raw}
    
    queries_dict = {}
    relevant_docs = {}

    for i, item in enumerate(queries_raw):
        qid = f"{name_prefix}_q_{i}"
        queries_dict[qid] = item['query']
        ground_truth_ids = set()
        for gt_text in item['ground_truths']:
            found_id = text_to_id_map.get(gt_text)
            if found_id:
                ground_truth_ids.add(found_id)
        if ground_truth_ids:
            relevant_docs[qid] = ground_truth_ids
            
    return evaluation.InformationRetrievalEvaluator(
        queries_dict, corpus_dict, relevant_docs,
        show_progress_bar=False, name=name_prefix,
        mrr_at_k=[10], ndcg_at_k=[10], accuracy_at_k=[1, 5, 10], precision_recall_at_k=[10]
    )

val_evaluator = create_evaluator(val_queries_raw, corpus_val_raw, "val")
test_evaluator = create_evaluator(test_queries_raw, corpus_test_raw, "test")

# --- 3. METRIK ---
results_table = []

def extract_metrics(metrics_dict, model_name, run_type):
    prefix = "test_" if "Baseline" in run_type or "Fine-Tuned" in run_type else "val_"
    return {
        "Model": model_name,
        "Type": run_type,
        "Hit@1": metrics_dict.get(f'{prefix}cosine_accuracy@1', 0),
        "Hit@5": metrics_dict.get(f'{prefix}cosine_accuracy@5', 0),
        "Hit@10": metrics_dict.get(f'{prefix}cosine_accuracy@10', 0),
        "MRR@10": metrics_dict.get(f'{prefix}cosine_mrr@10', 0),
        "NDCG@10": metrics_dict.get(f'{prefix}cosine_ndcg@10', 0),
        "Precision@10": metrics_dict.get(f'{prefix}cosine_precision@10', 0),
        "Recall@10": metrics_dict.get(f'{prefix}cosine_recall@10', 0)
    }

# --- 4. EXPERIMENT LOOP (HEMAT MEMORI) ---
def run_experiment(model_name):
    short_name = model_name.split("/")[-1]
    print(f"\n{'='*60}\nEXPERIMENT: {short_name}\n{'='*60}")

    # Bersihkan memori seagresif mungkin
    gc.collect()
    torch.cuda.empty_cache()

    # A. BASELINE
    print(f"Evaluasi Baseline (Pre-trained)...")
    model_base = SentenceTransformer(model_name)
    metrics_base = test_evaluator(model_base)
    res_base = extract_metrics(metrics_base, short_name, "Baseline")
    results_table.append(res_base)
    print(f" [Base] Hit@10: {res_base['Hit@10']:.4f} | MRR@10: {res_base['MRR@10']:.4f}")
    
    del model_base
    gc.collect()
    torch.cuda.empty_cache()

    # B. FINE-TUNING
    print(f"Training (Fine-tuning)...")
    
    # 1. Batasi Panjang Sequence (Wajib untuk 6GB VRAM)
    # 256 adalah batas aman. Jika masih OOM, turunkan ke 200.
    word_emb = models.Transformer(model_name, max_seq_length=256)
    pooling = models.Pooling(word_emb.get_word_embedding_dimension())
    model_ft = SentenceTransformer(modules=[word_emb, pooling])
    
    train_ex = [InputExample(texts=[d['anchor'], d['positive']]) for d in train_raw]
    train_dl = DataLoader(train_ex, shuffle=True, batch_size=BATCH_SIZE)
    train_loss = losses.MultipleNegativesRankingLoss(model_ft)
    
    save_path = os.path.join(OUTPUT_DIR, f"{short_name}-finetuned")
    
    # 2. Fit dengan Trik Hemat Memori
    model_ft.fit(
        train_objectives=[(train_dl, train_loss)],
        epochs=EPOCHS,
        warmup_steps=int(len(train_dl) * 0.1),
        
        # Learning rate standar untuk MPNet
        optimizer_params={'lr': 2e-5},
        
        # WAJIB: Mixed Precision (Hemat 40% VRAM)
        use_amp=True,
        
        show_progress_bar=True,
        evaluator=val_evaluator,
        evaluation_steps=500,
        save_best_model=True,
        output_path=save_path
    )
    
    # C. FINAL EVALUATION
    print(f"Evaluasi Model Finetuned (Best Checkpoint)...")
    
    # Hapus model lama dari memori dulu sebelum load yang baru
    del model_ft
    gc.collect()
    torch.cuda.empty_cache()
    
    best_model = SentenceTransformer(save_path)
    metrics_ft = test_evaluator(best_model)
    res_ft = extract_metrics(metrics_ft, short_name, "Fine-Tuned")
    results_table.append(res_ft)
    print(f" [Fine] Hit@10: {res_ft['Hit@10']:.4f} | MRR@10: {res_ft['MRR@10']:.4f}")

    del best_model
    gc.collect()
    torch.cuda.empty_cache()

# --- EKSEKUSI DENGAN ERROR HANDLING ---
for model in MODELS_TO_TEST:
    try:
        run_experiment(model)
    except RuntimeError as e:
        if "out of memory" in str(e).lower():
            print(f"\n❌ GPU OOM (Out of Memory)!")
            print("Saran: Turunkan BATCH_SIZE dari {BATCH_SIZE} menjadi {int(BATCH_SIZE/2)}.")
            print("Atau turunkan max_seq_length di word_emb menjadi 200.")
        else:
            raise e

# HASIL AKHIR
print("\n\n" + "="*30 + " HASIL AKHIR SKRIPSI " + "="*30)
df = pd.DataFrame(results_table)
cols = ["Model", "Type", "Hit@1", "Hit@5", "Hit@10", "MRR@10", "NDCG@10", "Precision@10", "Recall@10"]
df = df[cols]

pd.options.display.float_format = '{:,.4f}'.format
print(df)

csv_path = "hasil_retrieval_mpnet_final.csv"
df.to_csv(csv_path, index=False)
print(f"\n✅ Hasil lengkap tersimpan di: {csv_path}")

  from .autonotebook import tqdm as notebook_tqdm


Loading train_pairs.json...
Loading test_queries.json...
Loading corpus_test.json...
Loading val_queries.json...
Loading corpus_val.json...
Menyiapkan Evaluator: VAL...
Menyiapkan Evaluator: TEST...

EXPERIMENT: all-mpnet-base-v2
Evaluasi Baseline (Pre-trained)...
 [Base] Hit@10: 0.5447 | MRR@10: 0.3319
Training (Fine-tuning)...


                                                                     

Step,Training Loss,Validation Loss,Val Cosine Accuracy@1,Val Cosine Accuracy@5,Val Cosine Accuracy@10,Val Cosine Precision@10,Val Cosine Recall@10,Val Cosine Ndcg@10,Val Cosine Mrr@10,Val Cosine Map@100
500,0.2771,No log,0.227991,0.52803,0.644887,0.123826,0.422869,0.317109,0.351705,0.264804
1000,0.2829,No log,0.225622,0.516186,0.640742,0.122365,0.418806,0.313083,0.347654,0.259996
1500,0.3088,No log,0.221871,0.501382,0.618634,0.117687,0.400327,0.30092,0.337989,0.250532
2000,0.2684,No log,0.218121,0.498816,0.620608,0.117687,0.399945,0.297802,0.335014,0.246123
2500,0.2817,No log,0.221674,0.495065,0.618437,0.117884,0.3998,0.298461,0.335567,0.247712
3000,0.2655,No log,0.225424,0.495065,0.623766,0.11741,0.40094,0.300601,0.341318,0.249271
3500,0.2951,No log,0.219108,0.49625,0.617055,0.116818,0.395478,0.297041,0.335518,0.247651
4000,0.2413,No log,0.217331,0.495855,0.618634,0.11664,0.397736,0.296727,0.333537,0.246923
4500,0.2703,No log,0.222858,0.496842,0.61745,0.116088,0.396141,0.296368,0.337157,0.245625
5000,0.241,No log,0.218121,0.490722,0.611923,0.114706,0.392136,0.292891,0.331954,0.242937


Evaluasi Model Finetuned (Best Checkpoint)...
 [Fine] Hit@10: 0.5513 | MRR@10: 0.3364


               Model        Type  Hit@1  Hit@5  Hit@10  MRR@10  NDCG@10  \
0  all-mpnet-base-v2    Baseline 0.2417 0.4530  0.5447  0.3319   0.2468   
1  all-mpnet-base-v2  Fine-Tuned 0.2446 0.4598  0.5513  0.3364   0.2510   

   Precision@10  Recall@10  
0        0.1116     0.2835  
1        0.1137     0.2886  

✅ Hasil lengkap tersimpan di: hasil_retrieval_mpnet_final.csv


In [1]:
import json
import os
import random
from tqdm import tqdm

# --- CONFIG ---
DATA_DIR = "final_dataset_retrieval_split"
OUTPUT_FILE = "train_triplets_random.json"

# --- LOAD DATA ---
def load_json(filename):
    path = os.path.join(DATA_DIR, filename)
    print(f"Loading {path}...")
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

train_pairs = load_json("train_pairs.json")
corpus_train = load_json("corpus_train.json")

# Ambil semua teks dari corpus untuk dijadikan kolam pengambilan acak
corpus_texts = [item['text'] for item in corpus_train]

print(f"Total Pasangan: {len(train_pairs)}")
print(f"Total Corpus: {len(corpus_texts)}")

# --- GENERATE RANDOM NEGATIVES ---
print("\n>>> Membuat Triplets dengan Random Negatives...")
triplets = []

for item in tqdm(train_pairs):
    anchor = item['anchor']
    positive = item['positive']
    
    # Ambil 1 Negative secara Acak
    while True:
        negative = random.choice(corpus_texts)
        
        # Pastikan Negative tidak sama dengan Anchor maupun Positive
        if negative != anchor and negative != positive:
            break
            
    triplets.append({
        "anchor": anchor,
        "positive": positive,
        "negative": negative
    })

# --- SIMPAN ---
save_path = os.path.join(DATA_DIR, OUTPUT_FILE)
with open(save_path, 'w', encoding='utf-8') as f:
    json.dump(triplets, f, ensure_ascii=False, indent=4)

print(f"\n✅ Selesai! File tersimpan di: {save_path}")
print(f"Contoh data: {triplets[0]}")

Loading final_dataset_retrieval_split\train_pairs.json...
Loading final_dataset_retrieval_split\corpus_train.json...
Total Pasangan: 97292
Total Corpus: 62989

>>> Membuat Triplets dengan Random Negatives...


100%|██████████| 97292/97292 [00:00<00:00, 707428.29it/s]



✅ Selesai! File tersimpan di: final_dataset_retrieval_split\train_triplets_random.json
Contoh data: {'anchor': 'Author(s): Kuperberg, Greg; Thurston, Dylan P. | Abstract: We give a purely topological definition of the perturbative quantum invariants of links and 3-manifolds associated with Chern-Simons field theory. Our definition is as close as possible to one given by Kontsevich. We will also establish some basic properties of these invariants, in particular that they are universally finite type with respect to algebraically split surgery and with respect to Torelli surgery. Torelli surgery is a mutual generalization of blink surgery of Garoufalidis and Levine and clasper surgery of Habiro.', 'positive': 'This note is a sequel to our earlier paper of the same title [4] and describes invariants of rational homology 3-spheres associated to acyclic orthogonal local systems. Our work is in the spirit of the Axelrod–Singer papers [1], generalizes some of their results, and furnishes a ne

In [1]:
import json
import os
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses, models, evaluation
from torch.utils.data import DataLoader
import torch
import gc

# --- CONFIG ---
DATA_DIR = "final_dataset_retrieval_split" 
OUTPUT_DIR = "models-retrieval-minilm-triplets"

# File Training (Gunakan yang Triplets/Random Negatives)
TRAIN_FILE = "train_triplets_random.json" 

# MiniLM ringan, bisa pakai Batch Size besar
BATCH_SIZE = 64  
EPOCHS = 3       

MODELS_TO_TEST = [
    "sentence-transformers/all-MiniLM-L6-v2"
]

os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- 1. LOAD DATA ---
def load_json(filename):
    print(f"Loading {filename}...")
    path = os.path.join(DATA_DIR, filename)
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Load Data Training (Triplets)
train_raw = load_json(TRAIN_FILE)

# Load Data Validasi & Test
val_queries_raw = load_json("val_queries.json")
corpus_val_raw = load_json("corpus_val.json")

test_queries_raw = load_json("test_queries.json")
corpus_test_raw = load_json("corpus_test.json")

# --- 2. SETUP EVALUATOR ---
def create_evaluator(queries_raw, corpus_raw, name_prefix):
    print(f"Menyiapkan Evaluator: {name_prefix.upper()}...")
    corpus_dict = {item['id']: item['text'] for item in corpus_raw}
    text_to_id_map = {item['text']: item['id'] for item in corpus_raw}
    
    queries_dict = {}
    relevant_docs = {}

    for i, item in enumerate(queries_raw):
        qid = f"{name_prefix}_q_{i}"
        queries_dict[qid] = item['query']
        ground_truth_ids = set()
        for gt_text in item['ground_truths']:
            found_id = text_to_id_map.get(gt_text)
            if found_id:
                ground_truth_ids.add(found_id)
        if ground_truth_ids:
            relevant_docs[qid] = ground_truth_ids
            
    return evaluation.InformationRetrievalEvaluator(
        queries_dict, corpus_dict, relevant_docs,
        show_progress_bar=False, name=name_prefix,
        mrr_at_k=[10], ndcg_at_k=[10], accuracy_at_k=[1, 5, 10], precision_recall_at_k=[10]
    )

val_evaluator = create_evaluator(val_queries_raw, corpus_val_raw, "val")
test_evaluator = create_evaluator(test_queries_raw, corpus_test_raw, "test")

# --- 3. METRIK ---
results_table = []

def extract_metrics(metrics_dict, model_name, run_type):
    prefix = "test_" if "Baseline" in run_type or "Fine-Tuned" in run_type else "val_"
    return {
        "Model": model_name,
        "Type": run_type,
        "Hit@1": metrics_dict.get(f'{prefix}cosine_accuracy@1', 0),
        "Hit@5": metrics_dict.get(f'{prefix}cosine_accuracy@5', 0),
        "Hit@10": metrics_dict.get(f'{prefix}cosine_accuracy@10', 0),
        "MRR@10": metrics_dict.get(f'{prefix}cosine_mrr@10', 0),
        "NDCG@10": metrics_dict.get(f'{prefix}cosine_ndcg@10', 0),
        "Precision@10": metrics_dict.get(f'{prefix}cosine_precision@10', 0),
        "Recall@10": metrics_dict.get(f'{prefix}cosine_recall@10', 0)
    }

# --- 4. EXPERIMENT LOOP ---
def run_experiment(model_name):
    short_name = model_name.split("/")[-1]
    print(f"\n{'='*60}\nEXPERIMENT: {short_name} (Triplets)\n{'='*60}")

    gc.collect()
    torch.cuda.empty_cache()

    # A. BASELINE
    print(f"Evaluasi Baseline...")
    model_base = SentenceTransformer(model_name)
    metrics_base = test_evaluator(model_base)
    res_base = extract_metrics(metrics_base, short_name, "Baseline")
    results_table.append(res_base)
    print(f" [Base] Hit@10: {res_base['Hit@10']:.4f} | MRR@10: {res_base['MRR@10']:.4f}")
    
    del model_base
    gc.collect()
    torch.cuda.empty_cache()

    # B. FINE-TUNING
    print(f"Training (Fine-tuning with Triplets)...")
    
    # Init Model (Max Length 512 agar baca abstrak full)
    # MiniLM kuat handle 512 token
    word_emb = models.Transformer(model_name, max_seq_length=216)
    pooling = models.Pooling(word_emb.get_word_embedding_dimension())
    model_ft = SentenceTransformer(modules=[word_emb, pooling])
    
    # --- PERUBAHAN UTAMA: FORMAT DATA TRIPLET ---
    train_ex = []
    for d in train_raw:
        # Kita masukkan 3 kolom: [Anchor, Positive, Negative]
        train_ex.append(InputExample(texts=[d['anchor'], d['positive'], d['negative']]))
        
    train_dl = DataLoader(train_ex, shuffle=True, batch_size=BATCH_SIZE)
    
    # Loss Function otomatis tahu kalau inputnya 3 kolom, dia pakai Triplet Logic
    train_loss = losses.MultipleNegativesRankingLoss(model_ft)
    
    save_path = os.path.join(OUTPUT_DIR, f"{short_name}-finetuned")
    
    model_ft.fit(
        train_objectives=[(train_dl, train_loss)],
        epochs=EPOCHS,
        warmup_steps=int(len(train_dl) * 0.1),
        optimizer_params={'lr': 2e-5},
        use_amp=True,
        show_progress_bar=True,
        evaluator=val_evaluator,
        evaluation_steps=500,
        save_best_model=True,
        output_path=save_path
    )
    
    # C. EVALUASI
    print(f"Evaluasi Model Finetuned (Best Checkpoint)...")
    
    del model_ft
    gc.collect()
    torch.cuda.empty_cache()
    
    best_model = SentenceTransformer(save_path)
    metrics_ft = test_evaluator(best_model)
    res_ft = extract_metrics(metrics_ft, short_name, "Fine-Tuned")
    results_table.append(res_ft)
    print(f" [Fine] Hit@10: {res_ft['Hit@10']:.4f} | MRR@10: {res_ft['MRR@10']:.4f}")

    del best_model
    gc.collect()
    torch.cuda.empty_cache()

# EKSEKUSI
for model in MODELS_TO_TEST:
    run_experiment(model)

# HASIL AKHIR
print("\n\n" + "="*30 + " HASIL AKHIR LENGKAP " + "="*30)
df = pd.DataFrame(results_table)
cols = ["Model", "Type", "Hit@1", "Hit@5", "Hit@10", "MRR@10", "NDCG@10", "Precision@10", "Recall@10"]
df = df[cols]

pd.options.display.float_format = '{:,.4f}'.format
print(df)

csv_path = "hasil_retrieval_minilm_triplets.csv"
df.to_csv(csv_path, index=False)
print(f"\n✅ Hasil lengkap tersimpan di: {csv_path}")

  from .autonotebook import tqdm as notebook_tqdm


Loading train_triplets_random.json...
Loading val_queries.json...
Loading corpus_val.json...
Loading test_queries.json...
Loading corpus_test.json...
Menyiapkan Evaluator: VAL...
Menyiapkan Evaluator: TEST...

EXPERIMENT: all-MiniLM-L6-v2 (Triplets)
Evaluasi Baseline...
 [Base] Hit@10: 0.5307 | MRR@10: 0.3230
Training (Fine-tuning with Triplets)...


                                                                             

Step,Training Loss,Validation Loss,Val Cosine Accuracy@1,Val Cosine Accuracy@5,Val Cosine Accuracy@10,Val Cosine Precision@10,Val Cosine Recall@10,Val Cosine Ndcg@10,Val Cosine Mrr@10,Val Cosine Map@100
500,1.4256,No log,0.222661,0.505922,0.627912,0.119266,0.402493,0.301585,0.339831,0.250103
1000,1.3201,No log,0.22345,0.511646,0.636597,0.120134,0.408043,0.30502,0.344333,0.253021
1500,1.2584,No log,0.223648,0.504145,0.635413,0.120371,0.407484,0.30371,0.342071,0.251774
1521,1.2584,No log,0.223648,0.505922,0.633241,0.119779,0.405764,0.303349,0.342276,0.252092
2000,1.1464,No log,0.224832,0.50908,0.641334,0.121931,0.412311,0.307114,0.344443,0.254344
2500,1.1485,No log,0.224635,0.508685,0.640347,0.121575,0.411153,0.306124,0.343862,0.253904
3000,1.1209,No log,0.227398,0.508488,0.637386,0.121141,0.407987,0.305982,0.346252,0.254727
3042,1.1209,No log,0.228188,0.508488,0.639953,0.121457,0.411575,0.307509,0.34669,0.255622
3500,1.0598,No log,0.226214,0.506119,0.638373,0.121141,0.408842,0.306045,0.344626,0.255208
4000,1.0425,No log,0.228188,0.510857,0.636794,0.121319,0.409269,0.306492,0.345992,0.255341


Evaluasi Model Finetuned (Best Checkpoint)...
 [Fine] Hit@10: 0.5453 | MRR@10: 0.3291


              Model        Type  Hit@1  Hit@5  Hit@10  MRR@10  NDCG@10  \
0  all-MiniLM-L6-v2    Baseline 0.2348 0.4420  0.5307  0.3230   0.2377   
1  all-MiniLM-L6-v2  Fine-Tuned 0.2370 0.4526  0.5453  0.3291   0.2466   

   Precision@10  Recall@10  
0        0.1069     0.2715  
1        0.1123     0.2853  

✅ Hasil lengkap tersimpan di: hasil_retrieval_minilm_triplets.csv


In [1]:
import json
import os
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

# --- 1. SETUP CLEANING TOOLS ---
print("Downloading NLTK resources...")
try:
    nltk.data.find('corpora/stopwords')
    nltk.data.find('corpora/wordnet')
    nltk.data.find('corpora/omw-1.4')
except LookupError:
    nltk.download('stopwords')
    nltk.download('wordnet')
    nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

# Stopwords Custom + Bawaan
custom_stopwords = set([
    'based', 'proposed', 'using', 'paper', 'data', 'results', 'method', 
    'model', 'approach', 'analysis', 'study', 'performance', 'new', 
    'presented', 'show', 'demonstrate', 'investigate', 'we', 'our'
])
stop_words = set(stopwords.words('english')) | custom_stopwords

def clean_text(text):
    if not isinstance(text, str): return ""
    
    # 1. Lowercase
    text = text.lower()
    
    # 2. Hapus karakter non-alphanumeric (sisakan spasi)
    text = re.sub(r'[^a-z0-9\s]', '', text)
    
    # 3. Tokenize (Split by space)
    tokens = text.split()
    
    # 4. Remove Stopwords & Lemmatization
    cleaned_tokens = [
        lemmatizer.lemmatize(word) 
        for word in tokens 
        if word not in stop_words and len(word) > 2 # Hapus kata kependekan (1-2 huruf)
    ]
    
    # 5. Join kembali
    return " ".join(cleaned_tokens)

# --- 2. CONFIG IO ---
INPUT_DIR = "final_dataset_retrieval_split"
OUTPUT_DIR = "final_dataset_retrieval_cleaned"
os.makedirs(OUTPUT_DIR, exist_ok=True)

def load_json(filename):
    path = os.path.join(INPUT_DIR, filename)
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

def save_json(data, filename):
    path = os.path.join(OUTPUT_DIR, filename)
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print(f"✅ Tersimpan: {path}")

# --- 3. PROSES CLEANING ---

# A. Clean CORPUS (Val & Test)
# PENTING: Corpus harus bersih, agar pencarian match dengan query yang bersih
for corp_file in ["corpus_val.json", "corpus_test.json", "corpus_train.json"]:
    if not os.path.exists(os.path.join(INPUT_DIR, corp_file)): continue
    
    print(f"\nCleaning {corp_file}...")
    raw_data = load_json(corp_file)
    cleaned_data = []
    
    for item in tqdm(raw_data):
        # ID jangan diubah! Hanya text yang dibersihkan
        cleaned_text = clean_text(item['text'])
        if cleaned_text:
            cleaned_data.append({"id": item['id'], "text": cleaned_text})
            
    save_json(cleaned_data, corp_file)

# B. Clean TRAIN PAIRS
print(f"\nCleaning train_pairs.json...")
train_raw = load_json("train_triplets_random.json")
train_cleaned = []

for item in tqdm(train_raw):
    # Bersihkan Anchor & Positive
    anc = clean_text(item['anchor'])
    pos = clean_text(item['positive'])
    neg = clean_text(item['negative'])  # Negative tidak dipakai di training sekarang
    
    if anc and pos:
        train_cleaned.append({"anchor": anc, "positive": pos, "negative": neg})

save_json(train_cleaned, "train_triplets_random.json")

# C. Clean QUERIES (Val & Test)
# PENTING: Ground Truths (teks jawaban) JUGA harus dibersihkan agar match dengan Corpus bersih
for query_file in ["val_queries.json", "test_queries.json"]:
    print(f"\nCleaning {query_file}...")
    q_raw = load_json(query_file)
    q_cleaned = []
    
    for item in tqdm(q_raw):
        q_text = clean_text(item['query'])
        
        # Bersihkan list ground truths juga!
        gt_cleaned = [clean_text(gt) for gt in item['ground_truths']]
        gt_cleaned = [gt for gt in gt_cleaned if gt] # Hapus yang kosong
        
        if q_text and gt_cleaned:
            q_cleaned.append({
                "query": q_text,
                "ground_truths": gt_cleaned
            })
            
    save_json(q_cleaned, query_file)

print("\n🎉 SEMUA DATA SELESAI DIBERSIHKAN! Siap untuk Training.")

Downloading NLTK resources...


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Iskandar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Iskandar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Iskandar\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!



Cleaning corpus_val.json...


100%|██████████| 13612/13612 [00:08<00:00, 1551.56it/s]


✅ Tersimpan: final_dataset_retrieval_cleaned\corpus_val.json

Cleaning corpus_test.json...


100%|██████████| 11568/11568 [00:05<00:00, 2220.90it/s]


✅ Tersimpan: final_dataset_retrieval_cleaned\corpus_test.json

Cleaning corpus_train.json...


100%|██████████| 62989/62989 [00:20<00:00, 3077.51it/s]


✅ Tersimpan: final_dataset_retrieval_cleaned\corpus_train.json

Cleaning train_pairs.json...


100%|██████████| 97292/97292 [01:30<00:00, 1075.14it/s]


✅ Tersimpan: final_dataset_retrieval_cleaned\train_triplets_random.json

Cleaning val_queries.json...


100%|██████████| 5066/5066 [00:07<00:00, 708.19it/s]


✅ Tersimpan: final_dataset_retrieval_cleaned\val_queries.json

Cleaning test_queries.json...


100%|██████████| 5093/5093 [00:08<00:00, 586.08it/s]


✅ Tersimpan: final_dataset_retrieval_cleaned\test_queries.json

🎉 SEMUA DATA SELESAI DIBERSIHKAN! Siap untuk Training.


In [1]:
import json
import os
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses, models, evaluation
from torch.utils.data import DataLoader
import torch
import gc

# --- CONFIG ---
DATA_DIR = "final_dataset_retrieval_cleaned" 
OUTPUT_DIR = "models-retrieval-minilm-triplets"

# File Training (Gunakan yang Triplets/Random Negatives)
TRAIN_FILE = "train_triplets_random.json" 

# MiniLM ringan, bisa pakai Batch Size besar
BATCH_SIZE = 64  
EPOCHS = 3       

MODELS_TO_TEST = [
    "sentence-transformers/all-MiniLM-L6-v2"
]

os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- 1. LOAD DATA ---
def load_json(filename):
    print(f"Loading {filename}...")
    path = os.path.join(DATA_DIR, filename)
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Load Data Training (Triplets)
train_raw = load_json(TRAIN_FILE)

# Load Data Validasi & Test
val_queries_raw = load_json("val_queries.json")
corpus_val_raw = load_json("corpus_val.json")

test_queries_raw = load_json("test_queries.json")
corpus_test_raw = load_json("corpus_test.json")

# --- 2. SETUP EVALUATOR ---
def create_evaluator(queries_raw, corpus_raw, name_prefix):
    print(f"Menyiapkan Evaluator: {name_prefix.upper()}...")
    corpus_dict = {item['id']: item['text'] for item in corpus_raw}
    text_to_id_map = {item['text']: item['id'] for item in corpus_raw}
    
    queries_dict = {}
    relevant_docs = {}

    for i, item in enumerate(queries_raw):
        qid = f"{name_prefix}_q_{i}"
        queries_dict[qid] = item['query']
        ground_truth_ids = set()
        for gt_text in item['ground_truths']:
            found_id = text_to_id_map.get(gt_text)
            if found_id:
                ground_truth_ids.add(found_id)
        if ground_truth_ids:
            relevant_docs[qid] = ground_truth_ids
            
    return evaluation.InformationRetrievalEvaluator(
        queries_dict, corpus_dict, relevant_docs,
        show_progress_bar=False, name=name_prefix,
        mrr_at_k=[10], ndcg_at_k=[10], accuracy_at_k=[1, 5, 10], precision_recall_at_k=[10]
    )

val_evaluator = create_evaluator(val_queries_raw, corpus_val_raw, "val")
test_evaluator = create_evaluator(test_queries_raw, corpus_test_raw, "test")

# --- 3. METRIK ---
results_table = []

def extract_metrics(metrics_dict, model_name, run_type):
    prefix = "test_" if "Baseline" in run_type or "Fine-Tuned" in run_type else "val_"
    return {
        "Model": model_name,
        "Type": run_type,
        "Hit@1": metrics_dict.get(f'{prefix}cosine_accuracy@1', 0),
        "Hit@5": metrics_dict.get(f'{prefix}cosine_accuracy@5', 0),
        "Hit@10": metrics_dict.get(f'{prefix}cosine_accuracy@10', 0),
        "MRR@10": metrics_dict.get(f'{prefix}cosine_mrr@10', 0),
        "NDCG@10": metrics_dict.get(f'{prefix}cosine_ndcg@10', 0),
        "Precision@10": metrics_dict.get(f'{prefix}cosine_precision@10', 0),
        "Recall@10": metrics_dict.get(f'{prefix}cosine_recall@10', 0)
    }

# --- 4. EXPERIMENT LOOP ---
def run_experiment(model_name):
    short_name = model_name.split("/")[-1]
    print(f"\n{'='*60}\nEXPERIMENT: {short_name} (Triplets)\n{'='*60}")

    gc.collect()
    torch.cuda.empty_cache()

    # A. BASELINE
    print(f"Evaluasi Baseline...")
    model_base = SentenceTransformer(model_name)
    metrics_base = test_evaluator(model_base)
    res_base = extract_metrics(metrics_base, short_name, "Baseline")
    results_table.append(res_base)
    print(f" [Base] Hit@10: {res_base['Hit@10']:.4f} | MRR@10: {res_base['MRR@10']:.4f}")
    
    del model_base
    gc.collect()
    torch.cuda.empty_cache()

    # B. FINE-TUNING
    print(f"Training (Fine-tuning with Triplets)...")
    
    # Init Model (Max Length 512 agar baca abstrak full)
    # MiniLM kuat handle 512 token
    word_emb = models.Transformer(model_name, max_seq_length=216)
    pooling = models.Pooling(word_emb.get_word_embedding_dimension())
    model_ft = SentenceTransformer(modules=[word_emb, pooling])
    
    # --- PERUBAHAN UTAMA: FORMAT DATA TRIPLET ---
    train_ex = []
    for d in train_raw:
        # Kita masukkan 3 kolom: [Anchor, Positive, Negative]
        train_ex.append(InputExample(texts=[d['anchor'], d['positive'], d['negative']]))
        
    train_dl = DataLoader(train_ex, shuffle=True, batch_size=BATCH_SIZE)
    
    # Loss Function otomatis tahu kalau inputnya 3 kolom, dia pakai Triplet Logic
    train_loss = losses.MultipleNegativesRankingLoss(model_ft)
    
    save_path = os.path.join(OUTPUT_DIR, f"{short_name}-finetuned")
    
    model_ft.fit(
        train_objectives=[(train_dl, train_loss)],
        epochs=EPOCHS,
        warmup_steps=int(len(train_dl) * 0.1),
        optimizer_params={'lr': 2e-5},
        use_amp=True,
        show_progress_bar=True,
        evaluator=val_evaluator,
        evaluation_steps=500,
        save_best_model=True,
        output_path=save_path
    )
    
    # C. EVALUASI
    print(f"Evaluasi Model Finetuned (Best Checkpoint)...")
    
    del model_ft
    gc.collect()
    torch.cuda.empty_cache()
    
    best_model = SentenceTransformer(save_path)
    metrics_ft = test_evaluator(best_model)
    res_ft = extract_metrics(metrics_ft, short_name, "Fine-Tuned")
    results_table.append(res_ft)
    print(f" [Fine] Hit@10: {res_ft['Hit@10']:.4f} | MRR@10: {res_ft['MRR@10']:.4f}")

    del best_model
    gc.collect()
    torch.cuda.empty_cache()

# EKSEKUSI
for model in MODELS_TO_TEST:
    run_experiment(model)

# HASIL AKHIR
print("\n\n" + "="*30 + " HASIL AKHIR LENGKAP " + "="*30)
df = pd.DataFrame(results_table)
cols = ["Model", "Type", "Hit@1", "Hit@5", "Hit@10", "MRR@10", "NDCG@10", "Precision@10", "Recall@10"]
df = df[cols]

pd.options.display.float_format = '{:,.4f}'.format
print(df)

csv_path = "hasil_retrieval_minilm_triplets_cleaned.csv"
df.to_csv(csv_path, index=False)
print(f"\n✅ Hasil lengkap tersimpan di: {csv_path}")

  from .autonotebook import tqdm as notebook_tqdm


Loading train_triplets_random.json...
Loading val_queries.json...
Loading corpus_val.json...
Loading test_queries.json...
Loading corpus_test.json...
Menyiapkan Evaluator: VAL...
Menyiapkan Evaluator: TEST...

EXPERIMENT: all-MiniLM-L6-v2 (Triplets)
Evaluasi Baseline...
 [Base] Hit@10: 0.4962 | MRR@10: 0.2998
Training (Fine-tuning with Triplets)...


                                                                     

Step,Training Loss,Validation Loss,Val Cosine Accuracy@1,Val Cosine Accuracy@5,Val Cosine Accuracy@10,Val Cosine Precision@10,Val Cosine Recall@10,Val Cosine Ndcg@10,Val Cosine Mrr@10,Val Cosine Map@100
500,1.5268,No log,0.216739,0.495263,0.617647,0.114686,0.389826,0.292851,0.334827,0.241964
1000,1.3978,No log,0.220292,0.498223,0.623569,0.115476,0.393255,0.294458,0.337229,0.24347
1500,1.3232,No log,0.218121,0.50533,0.623766,0.116403,0.393437,0.295295,0.337728,0.245066
1521,1.3232,No log,0.21891,0.50079,0.627319,0.116305,0.395835,0.296322,0.338302,0.245284
2000,1.2212,No log,0.221082,0.50454,0.624753,0.116463,0.393338,0.296374,0.339522,0.246323
2500,1.1915,No log,0.217923,0.505922,0.628504,0.117608,0.397122,0.297175,0.338689,0.245888
3000,1.1756,No log,0.219503,0.507501,0.629688,0.117075,0.396334,0.297338,0.339785,0.247057
3042,1.1756,No log,0.218516,0.506711,0.627122,0.117252,0.395736,0.296986,0.338839,0.246852
3500,1.1019,No log,0.220095,0.507896,0.628306,0.117193,0.396476,0.297298,0.339459,0.247037
4000,1.1084,No log,0.218713,0.506119,0.630872,0.117233,0.396516,0.296978,0.339449,0.246392


Evaluasi Model Finetuned (Best Checkpoint)...
 [Fine] Hit@10: 0.5380 | MRR@10: 0.3260


              Model        Type  Hit@1  Hit@5  Hit@10  MRR@10  NDCG@10  \
0  all-MiniLM-L6-v2    Baseline 0.2148 0.4151  0.4962  0.2998   0.2154   
1  all-MiniLM-L6-v2  Fine-Tuned 0.2348 0.4489  0.5380  0.3260   0.2436   

   Precision@10  Recall@10  
0        0.0963     0.2462  
1        0.1106     0.2825  

✅ Hasil lengkap tersimpan di: hasil_retrieval_minilm_triplets_cleaned.csv


In [None]:
import json
import os
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses, models, evaluation
from torch.utils.data import DataLoader
import torch
import gc

# --- CONFIG ---
DATA_DIR = "final_dataset_retrieval_cleaned" 
OUTPUT_DIR = "models-retrieval-minilm-triplets"

# File Training (Gunakan yang Triplets/Random Negatives)
TRAIN_FILE = "train_triplets_random.json" 

# MiniLM ringan, bisa pakai Batch Size besar
BATCH_SIZE = 16  
EPOCHS = 1       

MODELS_TO_TEST = [
    "sentence-transformers/all-mpnet-base-v2"
]

os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- 1. LOAD DATA ---
def load_json(filename):
    print(f"Loading {filename}...")
    path = os.path.join(DATA_DIR, filename)
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Load Data Training (Triplets)
train_raw = load_json(TRAIN_FILE)

# Load Data Validasi & Test
val_queries_raw = load_json("val_queries.json")
corpus_val_raw = load_json("corpus_val.json")

test_queries_raw = load_json("test_queries.json")
corpus_test_raw = load_json("corpus_test.json")

# --- 2. SETUP EVALUATOR ---
def create_evaluator(queries_raw, corpus_raw, name_prefix):
    print(f"Menyiapkan Evaluator: {name_prefix.upper()}...")
    corpus_dict = {item['id']: item['text'] for item in corpus_raw}
    text_to_id_map = {item['text']: item['id'] for item in corpus_raw}
    
    queries_dict = {}
    relevant_docs = {}

    for i, item in enumerate(queries_raw):
        qid = f"{name_prefix}_q_{i}"
        queries_dict[qid] = item['query']
        ground_truth_ids = set()
        for gt_text in item['ground_truths']:
            found_id = text_to_id_map.get(gt_text)
            if found_id:
                ground_truth_ids.add(found_id)
        if ground_truth_ids:
            relevant_docs[qid] = ground_truth_ids
            
    return evaluation.InformationRetrievalEvaluator(
        queries_dict, corpus_dict, relevant_docs,
        show_progress_bar=False, name=name_prefix,
        mrr_at_k=[10], ndcg_at_k=[10], accuracy_at_k=[1, 5, 10], precision_recall_at_k=[10]
    )

val_evaluator = create_evaluator(val_queries_raw, corpus_val_raw, "val")
test_evaluator = create_evaluator(test_queries_raw, corpus_test_raw, "test")

# --- 3. METRIK ---
results_table = []

def extract_metrics(metrics_dict, model_name, run_type):
    prefix = "test_" if "Baseline" in run_type or "Fine-Tuned" in run_type else "val_"
    return {
        "Model": model_name,
        "Type": run_type,
        "Hit@1": metrics_dict.get(f'{prefix}cosine_accuracy@1', 0),
        "Hit@5": metrics_dict.get(f'{prefix}cosine_accuracy@5', 0),
        "Hit@10": metrics_dict.get(f'{prefix}cosine_accuracy@10', 0),
        "MRR@10": metrics_dict.get(f'{prefix}cosine_mrr@10', 0),
        "NDCG@10": metrics_dict.get(f'{prefix}cosine_ndcg@10', 0),
        "Precision@10": metrics_dict.get(f'{prefix}cosine_precision@10', 0),
        "Recall@10": metrics_dict.get(f'{prefix}cosine_recall@10', 0)
    }

# --- 4. EXPERIMENT LOOP ---
def run_experiment(model_name):
    short_name = model_name.split("/")[-1]
    print(f"\n{'='*60}\nEXPERIMENT: {short_name} (Triplets)\n{'='*60}")

    gc.collect()
    torch.cuda.empty_cache()

    # A. BASELINE
    print(f"Evaluasi Baseline...")
    model_base = SentenceTransformer(model_name)
    metrics_base = test_evaluator(model_base)
    res_base = extract_metrics(metrics_base, short_name, "Baseline")
    results_table.append(res_base)
    print(f" [Base] Hit@10: {res_base['Hit@10']:.4f} | MRR@10: {res_base['MRR@10']:.4f}")
    
    del model_base
    gc.collect()
    torch.cuda.empty_cache()

    # B. FINE-TUNING
    print(f"Training (Fine-tuning with Triplets)...")
    
    # Init Model (Max Length 512 agar baca abstrak full)
    # MiniLM kuat handle 512 token
    word_emb = models.Transformer(model_name, max_seq_length=216)
    pooling = models.Pooling(word_emb.get_word_embedding_dimension())
    model_ft = SentenceTransformer(modules=[word_emb, pooling])
    
    # --- PERUBAHAN UTAMA: FORMAT DATA TRIPLET ---
    train_ex = []
    for d in train_raw:
        # Kita masukkan 3 kolom: [Anchor, Positive, Negative]
        train_ex.append(InputExample(texts=[d['anchor'], d['positive'], d['negative']]))
        
    train_dl = DataLoader(train_ex, shuffle=True, batch_size=BATCH_SIZE)
    
    # Loss Function otomatis tahu kalau inputnya 3 kolom, dia pakai Triplet Logic
    train_loss = losses.MultipleNegativesRankingLoss(model_ft)
    
    save_path = os.path.join(OUTPUT_DIR, f"{short_name}-finetuned")
    
    model_ft.fit(
        train_objectives=[(train_dl, train_loss)],
        epochs=EPOCHS,
        warmup_steps=int(len(train_dl) * 0.1),
        optimizer_params={'lr': 2e-5},
        use_amp=True,
        show_progress_bar=True,
        evaluator=val_evaluator,
        evaluation_steps=500,
        save_best_model=True,
        output_path=save_path
    )
    
    # C. EVALUASI
    print(f"Evaluasi Model Finetuned (Best Checkpoint)...")
    
    del model_ft
    gc.collect()
    torch.cuda.empty_cache()
    
    best_model = SentenceTransformer(save_path)
    metrics_ft = test_evaluator(best_model)
    res_ft = extract_metrics(metrics_ft, short_name, "Fine-Tuned")
    results_table.append(res_ft)
    print(f" [Fine] Hit@10: {res_ft['Hit@10']:.4f} | MRR@10: {res_ft['MRR@10']:.4f}")

    del best_model
    gc.collect()
    torch.cuda.empty_cache()

# EKSEKUSI
for model in MODELS_TO_TEST:
    run_experiment(model)

# HASIL AKHIR
print("\n\n" + "="*30 + " HASIL AKHIR LENGKAP " + "="*30)
df = pd.DataFrame(results_table)
cols = ["Model", "Type", "Hit@1", "Hit@5", "Hit@10", "MRR@10", "NDCG@10", "Precision@10", "Recall@10"]
df = df[cols]

pd.options.display.float_format = '{:,.4f}'.format
print(df)

csv_path = "hasil_retrieval_mpnet_triplets_cleaned.csv"
df.to_csv(csv_path, index=False)
print(f"\n✅ Hasil lengkap tersimpan di: {csv_path}")

  from .autonotebook import tqdm as notebook_tqdm


Loading train_triplets_random.json...
Loading val_queries.json...
Loading corpus_val.json...
Loading test_queries.json...
Loading corpus_test.json...
Menyiapkan Evaluator: VAL...
Menyiapkan Evaluator: TEST...

EXPERIMENT: all-mpnet-base-v2 (Triplets)
Evaluasi Baseline...
 [Base] Hit@10: 0.5087 | MRR@10: 0.3092
Training (Fine-tuning with Triplets)...


                                                                     

Step,Training Loss,Validation Loss,Val Cosine Accuracy@1,Val Cosine Accuracy@5,Val Cosine Accuracy@10,Val Cosine Precision@10,Val Cosine Recall@10,Val Cosine Ndcg@10,Val Cosine Mrr@10,Val Cosine Map@100
500,0.7165,No log,0.22424,0.493486,0.623569,0.116088,0.398826,0.298515,0.339568,0.247492
1000,0.7059,No log,0.220095,0.495263,0.626925,0.117035,0.397223,0.29628,0.336403,0.24555
1500,0.6719,No log,0.228977,0.502566,0.630478,0.119029,0.40404,0.30391,0.344741,0.252438
2000,0.6116,No log,0.224635,0.505724,0.634228,0.119424,0.406716,0.302966,0.343942,0.250688
2500,0.6146,No log,0.22345,0.508685,0.632057,0.119049,0.406127,0.302871,0.343312,0.251267
3000,0.5769,No log,0.223648,0.501974,0.628899,0.118358,0.404006,0.301631,0.34208,0.250825
3500,0.5688,No log,0.226411,0.514607,0.633439,0.120549,0.406185,0.304729,0.345533,0.253602
4000,0.5555,No log,0.224437,0.501382,0.637386,0.120312,0.409071,0.303789,0.342345,0.251953
4500,0.5335,No log,0.230557,0.50908,0.63936,0.120529,0.410707,0.30785,0.348747,0.25728
5000,0.5398,No log,0.229175,0.507698,0.638571,0.120707,0.412128,0.307363,0.348229,0.25602


Evaluasi Model Finetuned (Best Checkpoint)...
 [Fine] Hit@10: 0.5376 | MRR@10: 0.3297


               Model        Type  Hit@1  Hit@5  Hit@10  MRR@10  NDCG@10  \
0  all-mpnet-base-v2    Baseline 0.2229 0.4274  0.5087  0.3092   0.2252   
1  all-mpnet-base-v2  Fine-Tuned 0.2374 0.4557  0.5376  0.3297   0.2461   

   Precision@10  Recall@10  
0        0.1010     0.2581  
1        0.1111     0.2839  

✅ Hasil lengkap tersimpan di: hasil_retrieval_mpnet_triplets_cleaned.csv


In [1]:
import json
import os
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses, models, evaluation
from torch.utils.data import DataLoader
import torch
import gc

# --- CONFIG ---
DATA_DIR = "final_dataset_retrieval_cleaned" 
OUTPUT_DIR = "models-retrieval-minilm-triplets"

# File Training (Gunakan yang Triplets/Random Negatives)
TRAIN_FILE = "train_triplets_random.json" 

# MiniLM ringan, bisa pakai Batch Size besar
BATCH_SIZE = 64  
EPOCHS = 3       

MODELS_TO_TEST = [
    "Snowflake/snowflake-arctic-embed-xs"
]

os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- 1. LOAD DATA ---
def load_json(filename):
    print(f"Loading {filename}...")
    path = os.path.join(DATA_DIR, filename)
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Load Data Training (Triplets)
train_raw = load_json(TRAIN_FILE)

# Load Data Validasi & Test
val_queries_raw = load_json("val_queries.json")
corpus_val_raw = load_json("corpus_val.json")

test_queries_raw = load_json("test_queries.json")
corpus_test_raw = load_json("corpus_test.json")

# --- 2. SETUP EVALUATOR ---
def create_evaluator(queries_raw, corpus_raw, name_prefix):
    print(f"Menyiapkan Evaluator: {name_prefix.upper()}...")
    corpus_dict = {item['id']: item['text'] for item in corpus_raw}
    text_to_id_map = {item['text']: item['id'] for item in corpus_raw}
    
    queries_dict = {}
    relevant_docs = {}

    for i, item in enumerate(queries_raw):
        qid = f"{name_prefix}_q_{i}"
        queries_dict[qid] = item['query']
        ground_truth_ids = set()
        for gt_text in item['ground_truths']:
            found_id = text_to_id_map.get(gt_text)
            if found_id:
                ground_truth_ids.add(found_id)
        if ground_truth_ids:
            relevant_docs[qid] = ground_truth_ids
            
    return evaluation.InformationRetrievalEvaluator(
        queries_dict, corpus_dict, relevant_docs,
        show_progress_bar=False, name=name_prefix,
        mrr_at_k=[10], ndcg_at_k=[10], accuracy_at_k=[1, 5, 10], precision_recall_at_k=[10]
    )

val_evaluator = create_evaluator(val_queries_raw, corpus_val_raw, "val")
test_evaluator = create_evaluator(test_queries_raw, corpus_test_raw, "test")

# --- 3. METRIK ---
results_table = []

def extract_metrics(metrics_dict, model_name, run_type):
    prefix = "test_" if "Baseline" in run_type or "Fine-Tuned" in run_type else "val_"
    return {
        "Model": model_name,
        "Type": run_type,
        "Hit@1": metrics_dict.get(f'{prefix}cosine_accuracy@1', 0),
        "Hit@5": metrics_dict.get(f'{prefix}cosine_accuracy@5', 0),
        "Hit@10": metrics_dict.get(f'{prefix}cosine_accuracy@10', 0),
        "MRR@10": metrics_dict.get(f'{prefix}cosine_mrr@10', 0),
        "NDCG@10": metrics_dict.get(f'{prefix}cosine_ndcg@10', 0),
        "Precision@10": metrics_dict.get(f'{prefix}cosine_precision@10', 0),
        "Recall@10": metrics_dict.get(f'{prefix}cosine_recall@10', 0)
    }

# --- 4. EXPERIMENT LOOP ---
def run_experiment(model_name):
    short_name = model_name.split("/")[-1]
    print(f"\n{'='*60}\nEXPERIMENT: {short_name} (Triplets)\n{'='*60}")

    gc.collect()
    torch.cuda.empty_cache()

    # A. BASELINE
    print(f"Evaluasi Baseline...")
    model_base = SentenceTransformer(model_name)
    metrics_base = test_evaluator(model_base)
    res_base = extract_metrics(metrics_base, short_name, "Baseline")
    results_table.append(res_base)
    print(f" [Base] Hit@10: {res_base['Hit@10']:.4f} | MRR@10: {res_base['MRR@10']:.4f}")
    
    del model_base
    gc.collect()
    torch.cuda.empty_cache()

    # B. FINE-TUNING
    print(f"Training (Fine-tuning with Triplets)...")
    
    # Init Model (Max Length 512 agar baca abstrak full)
    # MiniLM kuat handle 512 token
    word_emb = models.Transformer(model_name, max_seq_length=216)
    pooling = models.Pooling(word_emb.get_word_embedding_dimension())
    model_ft = SentenceTransformer(modules=[word_emb, pooling])
    
    # --- PERUBAHAN UTAMA: FORMAT DATA TRIPLET ---
    train_ex = []
    for d in train_raw:
        # Kita masukkan 3 kolom: [Anchor, Positive, Negative]
        train_ex.append(InputExample(texts=[d['anchor'], d['positive'], d['negative']]))
        
    train_dl = DataLoader(train_ex, shuffle=True, batch_size=BATCH_SIZE)
    
    # Loss Function otomatis tahu kalau inputnya 3 kolom, dia pakai Triplet Logic
    train_loss = losses.MultipleNegativesRankingLoss(model_ft)
    
    save_path = os.path.join(OUTPUT_DIR, f"{short_name}-finetuned")
    
    model_ft.fit(
        train_objectives=[(train_dl, train_loss)],
        epochs=EPOCHS,
        warmup_steps=int(len(train_dl) * 0.1),
        optimizer_params={'lr': 2e-5},
        use_amp=True,
        show_progress_bar=True,
        evaluator=val_evaluator,
        save_best_model=True,
        output_path=save_path
    )
    
    # C. EVALUASI
    print(f"Evaluasi Model Finetuned (Best Checkpoint)...")
    
    del model_ft
    gc.collect()
    torch.cuda.empty_cache()
    
    best_model = SentenceTransformer(save_path)
    metrics_ft = test_evaluator(best_model)
    res_ft = extract_metrics(metrics_ft, short_name, "Fine-Tuned")
    results_table.append(res_ft)
    print(f" [Fine] Hit@10: {res_ft['Hit@10']:.4f} | MRR@10: {res_ft['MRR@10']:.4f}")

    del best_model
    gc.collect()
    torch.cuda.empty_cache()

# EKSEKUSI
for model in MODELS_TO_TEST:
    run_experiment(model)

# HASIL AKHIR
print("\n\n" + "="*30 + " HASIL AKHIR LENGKAP " + "="*30)
df = pd.DataFrame(results_table)
cols = ["Model", "Type", "Hit@1", "Hit@5", "Hit@10", "MRR@10", "NDCG@10", "Precision@10", "Recall@10"]
df = df[cols]

pd.options.display.float_format = '{:,.4f}'.format
print(df)

csv_path = "hasil_retrieval_snowflake_triplets_cleaned.csv"
df.to_csv(csv_path, index=False)
print(f"\n✅ Hasil lengkap tersimpan di: {csv_path}")

  from .autonotebook import tqdm as notebook_tqdm


Loading train_triplets_random.json...
Loading val_queries.json...
Loading corpus_val.json...
Loading test_queries.json...
Loading corpus_test.json...
Menyiapkan Evaluator: VAL...
Menyiapkan Evaluator: TEST...

EXPERIMENT: snowflake-arctic-embed-xs (Triplets)
Evaluasi Baseline...
 [Base] Hit@10: 0.4765 | MRR@10: 0.2949
Training (Fine-tuning with Triplets)...


Some weights of BertModel were not initialized from the model checkpoint at Snowflake/snowflake-arctic-embed-xs and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                                     

Step,Training Loss,Validation Loss,Val Cosine Accuracy@1,Val Cosine Accuracy@5,Val Cosine Accuracy@10,Val Cosine Precision@10,Val Cosine Recall@10,Val Cosine Ndcg@10,Val Cosine Mrr@10,Val Cosine Map@100
500,2.0427,,,,,,,,,
1000,1.65,,,,,,,,,
1500,1.554,,,,,,,,,
1521,1.554,No log,0.212396,0.474931,0.597710,0.110422,0.371832,0.279819,0.323464,0.229532
2000,1.4542,No Log,No Log,No Log,No Log,No Log,No Log,No Log,No Log,No Log
2500,1.4121,No Log,No Log,No Log,No Log,No Log,No Log,No Log,No Log,No Log
3000,1.38,No Log,No Log,No Log,No Log,No Log,No Log,No Log,No Log,No Log
3042,1.38,No log,0.214568,0.480458,0.599487,0.111153,0.373200,0.283003,0.327443,0.234204
3500,1.3236,No Log,No Log,No Log,No Log,No Log,No Log,No Log,No Log,No Log
4000,1.3117,No Log,No Log,No Log,No Log,No Log,No Log,No Log,No Log,No Log


Evaluasi Model Finetuned (Best Checkpoint)...
 [Fine] Hit@10: 0.5299 | MRR@10: 0.3215


                       Model        Type  Hit@1  Hit@5  Hit@10  MRR@10  \
0  snowflake-arctic-embed-xs    Baseline 0.2158 0.4039  0.4765  0.2949   
1  snowflake-arctic-embed-xs  Fine-Tuned 0.2303 0.4443  0.5299  0.3215   

   NDCG@10  Precision@10  Recall@10  
0   0.2066        0.0899     0.2301  
1   0.2390        0.1075     0.2748  

✅ Hasil lengkap tersimpan di: hasil_retrieval_snowflake_triplets_cleaned.csv
