In [1]:
import os
from datasets import load_from_disk

data_path = "processed_multixscience_data"

print(f"Memuat dataset dari {data_path}...")
try:
    tokenized_datasets = load_from_disk(data_path)
    print("‚úÖ Dataset berhasil dimuat!")
    print(tokenized_datasets)
except FileNotFoundError:
    print("‚ùå Error: Path salah. Cek kembali lokasi folder di panel 'Input' sebelah kanan.")

print("\nContoh Kolom Dataset:", tokenized_datasets.column_names)

  from .autonotebook import tqdm as notebook_tqdm


Memuat dataset dari processed_multixscience_data...
‚úÖ Dataset berhasil dimuat!
Dataset({
    features: ['input_text', 'related_work'],
    num_rows: 30369
})

Contoh Kolom Dataset: ['input_text', 'related_work']


In [2]:
# EOS_TOKEN biasanya diperlukan agar model tahu kapan harus berhenti menulis
# Jika pakai Unsloth, biasanya sudah otomatis, tapi definisikan manual untuk aman.
EOS_TOKEN = "<|eot_id|>"

def format_prompt_llama3(examples):
    # Ambil list data dari batch
    inputs = examples["input_text"]       # Context (Abstract + Refs)
    outputs = examples["related_work"]    # Ground Truth (Target)

    prompts = []

    # System Prompt: Instruksi peran untuk AI
    system_msg = (
        "You are an academic writing assistant. "
        "Write a 'Related Work' section based on the provided text. "
        "The input contains the Current Abstract followed by References (marked with @cite_n). "
        "Synthesize these references and highlight the novelty of the Current Abstract."
    )

    for input_text, output_text in zip(inputs, outputs):
        # Struktur Llama 3 Instruct Resmi
        text = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_msg}<|eot_id|><|start_header_id|>user<|end_header_id|>

{input_text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{output_text}{EOS_TOKEN}"""

        prompts.append(text)

    # Kembalikan dalam kolom baru bernama 'text' (biasanya SFTTrainer mencari kolom ini)
    return { "text": prompts }

# --- CARA PAKAI ---
# Asumsi 'tokenized_datasets' adalah HuggingFace Dataset yang sudah Anda load
# Terapkan fungsi format_prompt_llama3 ke seluruh dataset
formatted_datasets = tokenized_datasets.map(format_prompt_llama3, batched=True)
# print(formatted_datasets['text'][0])

In [3]:
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    BitsAndBytesConfig
)
from peft import (
    LoraConfig, 
    get_peft_model, 
    prepare_model_for_kbit_training
)

# 1. Cek Device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"PyTorch: {torch.__version__}, Device: {device}")

# 2. Konfigurasi Model & Quantization (Pengganti load_in_4bit=True di Unsloth)
model_id = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit" # Kita tetap pakai model ini karena sudah optimized

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,           # Aktifkan 4-bit loading
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",   # Tipe quantisasi standar Llama
    bnb_4bit_compute_dtype=torch.float16, # Gunakan float16 untuk komputasi di GPU
)

print("Sedang meload model...")

# 3. Load Model Base
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",           # Otomatis sebar ke GPU
    use_cache=False              # False saat training untuk hemat VRAM
)

# 4. Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token # Fix untuk Llama 3
tokenizer.padding_side = "right"

# 5. Persiapan Model untuk Training 4-bit
model = prepare_model_for_kbit_training(model)

# 6. Konfigurasi LoRA (Pengganti FastLanguageModel.get_peft_model)
peft_config = LoraConfig(
    r=16,                        # Rank LoRA (sama seperti kode Anda)
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"], # Bisa ditambah: "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# 7. Pasang Adapter LoRA ke Model
model = get_peft_model(model, peft_config)

print("\n‚úÖ Model Llama 3 (Windows Native) siap! Berikut parameter yang akan dilatih:")
model.print_trainable_parameters()

# --- BATAS KODE INISIALISASI ---
# Di bawah ini nanti Anda bisa lanjut ke kode Training (SFTTrainer)

PyTorch: 2.5.1+cu121, Device: cuda
Sedang meload model...





‚úÖ Model Llama 3 (Windows Native) siap! Berikut parameter yang akan dilatih:
trainable params: 1,703,936 || all params: 1,237,518,336 || trainable%: 0.1377


In [4]:
from transformers import DataCollatorForLanguageModeling

# 1. Pastikan Tokenizer sudah diset padding-nya (PENTING buat Llama 3)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix untuk fp16

# 2. Fungsi Mapping
def tokenize_function(examples):
    # Ambil teks dari kolom "text"
    outputs = tokenizer(
        examples["text"],
        truncation=True,
        max_length=1024, # Sesuaikan panjang konteks
        padding=False    # Kita padding nanti di DataCollator biar hemat memori
    )
    # Untuk Causal LM (text generation), labels = input_ids
    outputs["labels"] = outputs["input_ids"].copy()
    return outputs

# 3. Terapkan ke Dataset
print("‚è≥ Sedang men-tokenisasi dataset...")
tokenized_dataset = formatted_datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=formatted_datasets.column_names # Hapus kolom teks asli agar tidak error masuk model
)

print("‚úÖ Tokenisasi Selesai!")
print(f"Contoh data: {tokenized_dataset[0].keys()}") 
# Harusnya muncul: dict_keys(['input_ids', 'attention_mask', 'labels'])

‚è≥ Sedang men-tokenisasi dataset...
‚úÖ Tokenisasi Selesai!
Contoh data: dict_keys(['input_ids', 'attention_mask', 'labels'])


In [6]:
import accelerate
from trl import SFTTrainer
from transformers import TrainingArguments

# Define Training Arguments
training_args = TrainingArguments(
    per_device_train_batch_size = 2, # Batch size sesuai VRAM Anda
    gradient_accumulation_steps = 4, # Jumlah step akumulasi gradien
    warmup_steps = 5, # Warmup steps awal
    # max_steps = 60, # Total step training (sesuaikan dengan budget Anda)
    learning_rate = 2e-4, # Learning rate optimal
    fp16 = not torch.cuda.is_bf16_supported(), # Menggunakan fp16 jika bf16 tidak didukung
    bf16 = torch.cuda.is_bf16_supported(), # Menggunakan bf16 jika didukung (lebih baik)
    logging_steps = 50, # Logging setiap 1 step
    optim = "adamw_8bit", # Optimizer
    weight_decay = 0.01, # Weight decay
    lr_scheduler_type = "linear", # Linear scheduler
    num_train_epochs=1,
    seed = 3407, # Random seed
    output_dir = "outputs", # Direktori output
    save_strategy="epoch",
    push_to_hub=False,
    report_to=[]
)

# Trainer
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = formatted_datasets, # Menggunakan dataset yang sudah diformat
    dataset_text_field = "text", # Menunjukkan kolom 'text' sebagai input utama
    max_seq_length = 1024,
    args = training_args,
)

# Train the model
trainer.train()

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30369/30369 [00:15<00:00, 2015.32 examples/s]
  super().__init__(
  return fn(*args, **kwargs)


Step,Training Loss
50,2.7441
100,2.48
150,2.4692
200,2.4717
250,2.4406
300,2.4502
350,2.4297
400,2.4478
450,2.4336
500,2.4333


TrainOutput(global_step=3797, training_loss=2.3992343789061215, metrics={'train_runtime': 27412.2762, 'train_samples_per_second': 1.108, 'train_steps_per_second': 0.139, 'total_flos': 1.4058449996064768e+17, 'train_loss': 2.3992343789061215, 'epoch': 1.0})

In [9]:
import torch

model.eval()
print("Model set to evaluation mode.")

eval_dataset = formatted_datasets.select(range(10))

def format_prompt_llama3_val(examples):
    # Ambil list data dari batch
    inputs = examples["input_text"]       # Context (Abstract + Refs)
    outputs = examples["related_work"]    # Ground Truth (Target)

    prompts = []

    # System Prompt: Instruksi peran untuk AI
    system_msg = (
        "You are an academic writing assistant. "
        "Write a 'Related Work' section based on the provided text. "
        "The input contains the Current Abstract followed by References (marked with @cite_n). "
        "Synthesize these references and highlight the novelty of the Current Abstract."
    )

    for input_text, output_text in zip(inputs, outputs):
        # Struktur Llama 3 Instruct Resmi
        text = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_msg}<|eot_id|><|start_header_id|>user<|end_header_id|>

{input_text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

        prompts.append(text)

    # Kembalikan dalam kolom baru bernama 'text' (biasanya SFTTrainer mencari kolom ini)
    return { "text": prompts }

eval_dataset = eval_dataset.map(format_prompt_llama3_val, batched=True)

Model set to evaluation mode.


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:00<00:00, 404.36 examples/s]


In [10]:
### 5. Evaluate Model Performance

# After training, we evaluate the model's performance on a held-out validation set. We generate predictions for a subset of the validation data and compare them against the original 'Related Work' sections using various metrics like ROUGE, BERTScore, and length analysis.



print(f"Selected {len(eval_dataset)} samples for evaluation.")

# 3. Define the generation function
def generate_related_work(input_text):
    # Ensure pad_token_id is set for the tokenizer to prevent reorder_cache error
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    print(input_text)
    inputs = tokenizer(input_text, return_tensors="pt", add_special_tokens=False)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

  # 2. Jalankan Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,                         # Unpack input_ids & attention_mask
            # **generation_kwargs,              # Unpack config di atas
            use_cache=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id, # Gunakan pad token yang benar
        )

    generated_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    decoded_output = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    return decoded_output.strip()

predictions = []
original_texts = []
print("Generating predictions...")
for i, sample in enumerate(eval_dataset):
    generated_text = generate_related_work(sample["input_text"])
    predictions.append(generated_text)
    original_texts.append(sample["related_work"])
    if (i + 1) % 10 == 0:
        print(f"Generated prediction for {i + 1}/{len(eval_dataset)} samples.")

print("Prediction generation complete.")
print(f"Generated {len(predictions)} predictions.")

print("\n--- First Generated Prediction ---")
print(predictions[0])
print("\n--- First Original Related Work ---")
print(original_texts[0])

Selected 10 samples for evaluation.
Generating predictions...
Author(s): Kuperberg, Greg; Thurston, Dylan P. | Abstract: We give a purely topological definition of the perturbative quantum invariants of links and 3-manifolds associated with Chern-Simons field theory. Our definition is as close as possible to one given by Kontsevich. We will also establish some basic properties of these invariants, in particular that they are universally finite type with respect to algebraically split surgery and with respect to Torelli surgery. Torelli surgery is a mutual generalization of blink surgery of Garoufalidis and Levine and clasper surgery of Habiro. <doc-sep> @cite_0 This note is a sequel to our earlier paper of the same title [4] and describes invariants of rational homology 3-spheres associated to acyclic orthogonal local systems. Our work is in the spirit of the Axelrod‚ÄìSinger papers [1], generalizes some of their results, and furnishes a new setting for the purely topological implicati

KeyboardInterrupt: 

In [14]:
# presaved_model = model.merge_and_unload()

save_path = "summarization/related_works_generation_model"

os.makedirs(save_path, exist_ok=True)
# Save full merged model
presaved_model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"Model saved to {save_path}")

Model saved to summarization/related_works_generation_model


In [None]:
!pip install evaluate rouge-score bert-score

In [None]:
### 6. Calculate Comprehensive Evaluation Metrics

#This section defines a function to compute ROUGE scores (for lexical similarity), BERTScore (for semantic similarity), and analyze the length of generated texts compared to the references. These metrics provide a holistic view of the model's generation quality.

def calculate_comprehensive_metrics(predictions, references):
    """
    Menghitung ROUGE, BERTScore, dan Rasio Panjang.

    Args:
        predictions (list): List string hasil output model.
        references (list): List string kunci jawaban asli (ground truth).

    Returns:
        dict: Dictionary berisi semua skor evaluasi.
    """

    print(f"üìä Memulai Evaluasi untuk {len(predictions)} sampel data...")
    results = {}

    # --- 1. ROUGE SCORE (Lexical / Kata) ---
    print("‚è≥ Menghitung ROUGE...")
    rouge_metric = evaluate.load("rouge")
    rouge_scores = rouge_metric.compute(
        predictions=predictions,
        references=references,
        use_stemmer=True # Penting untuk bahasa Inggris
    )
    # Konversi ke Persen (0-100)
    results['ROUGE-1'] = round(rouge_scores['rouge1'] * 100, 2)
    results['ROUGE-2'] = round(rouge_scores['rouge2'] * 100, 2)
    results['ROUGE-L'] = round(rouge_scores['rougeL'] * 100, 2)

    # --- 2. BERTSCORE (Semantic / Makna) ---
    print("‚è≥ Menghitung BERTScore (Mungkin butuh waktu & download model)...")
    bertscore_metric = evaluate.load("bertscore")
    # Gunakan batch_size agar tidak OOM
    bert_scores = bertscore_metric.compute(
        predictions=predictions,
        references=references,
        lang="en",
        batch_size=16
    )
    # Kita ambil rata-rata F1 Score dari semua data
    results['BERTScore-F1'] = round(np.mean(bert_scores['f1']) * 100, 2)
    results['BERTScore-Precision'] = round(np.mean(bert_scores['precision']) * 100, 2)
    results['BERTScore-Recall'] = round(np.mean(bert_scores['recall']) * 100, 2)

    # --- 3. LENGTH ANALYSIS (Analisis Panjang) ---
    print("‚è≥ Menghitung Statistik Panjang Teks...")
    pred_lens = [len(p.split()) for p in predictions]
    ref_lens = [len(r.split()) for r in references]

    avg_pred_len = np.mean(pred_lens)
    avg_ref_len = np.mean(ref_lens)
    length_ratio = (avg_pred_len / avg_ref_len) * 100

    results['Avg Gen Length'] = round(avg_pred_len, 1)
    results['Avg Ref Length'] = round(avg_ref_len, 1)
    results['Length Ratio (%)'] = round(length_ratio, 2)

    return results

# Jalankan Fungsi
final_metrics = calculate_comprehensive_metrics(predictions, original_texts)

# Tampilkan Hasil Rapih
print("\n" + "="*40)
print("      LAPORAN HASIL EVALUASI AKHIR      ")
print("="*40)
for metric, score in final_metrics.items():
    print(f"{metric:<20} : {score}")
print("="*40)

# Interpretasi Singkat
print("\n--- Interpretasi Cepat ---")
if final_metrics['BERTScore-F1'] > 85:
    print("Kualitas Makna SANGAT BAIK (Mirip manusia).")
elif final_metrics['BERTScore-F1'] > 80:
    print("Kualitas Makna CUKUP BAIK.")
else:
    print("Kualitas Makna KURANG (Model mungkin halusinasi/tidak nyambung).")

if final_metrics['Length Ratio (%)'] < 80:
    print("WARNING: Output model terlalu pendek dibanding referensi asli.")