In [1]:
# Cell 0: load SQuAD dataset (try corey-johnson namespace, fallback to Hugging Face "squad")
# Run this cell in your notebook
!pip install -q datasets

from datasets import load_dataset

try:
    squad_dataset = load_dataset("corey-johnson/squad")
except Exception:
    squad_dataset = load_dataset("squad")

squad_dataset

/home/apalah/Documents/uasdl/task2/virtualenvdl/bin/pip: 2: exec: /home/apalah/Documents/uasdl/task someting/vritualenvdl/bin/python3: not found


  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

## 2. Load dan eksplorasi dataset
Lihat ukuran dataset dan contoh untuk memahami format (context, question, answers).

In [2]:
# Quick look at the splits, their sizes, and one training record.
from pprint import pprint

print(squad_dataset)
for size_label, split_name in (('Train size:', 'train'), ('Validation size:', 'validation')):
    print(size_label, len(squad_dataset[split_name]))

# Show only the fields that matter for question answering.
sample = squad_dataset['train'][0]
pprint({field: sample[field] for field in ('context', 'question', 'answers')})

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})
Train size: 87599
Validation size: 10570
{'answers': {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']},
 'context': 'Architecturally, the school has a Catholic character. Atop the '
            "Main Building's gold dome is a golden statue of the Virgin Mary. "
            'Immediately in front of the Main Building and facing it, is a '
            'copper statue of Christ with arms upraised with the legend '
            '"Venite Ad Me Omnes". Next to the Main Building is the Basilica '
            'of the Sacred Heart. Immediately behind the basilica is the '
            'Grotto, a Marian place of prayer and reflection. It is a replica '
            'of the grotto at Lourdes, France where the Virgin Mary reputed

## 3. Tokenisasi dengan T5 tokenizer
Kita siapkan input sebagai satu sekuens teks dengan format `question: ...  context: ...`, dan target berupa teks jawaban. Gunakan tokenizer `t5-base` karena model yang akan kita fine-tune adalah T5-base.

In [3]:
from transformers import AutoTokenizer
# Tokenizer for the checkpoint that is fine-tuned later in the notebook.
checkpoint_name = 't5-base'
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

# Maximum token lengths applied when tokenizing inputs and targets.
max_input_length, max_target_length = 512, 64

def make_input_examples(example):
    """Turn one SQuAD record into a text-to-text pair.

    Args:
        example: Mapping with 'question', 'context' and 'answers' keys, where
            example['answers']['text'] is a (possibly empty) list of answers.

    Returns:
        dict with 'input_text' (question and context joined into one prompt)
        and 'target_text' (the first listed answer, or '' when there is none).
    """
    answer_texts = example['answers']['text']
    if answer_texts:
        target = answer_texts[0]
    else:
        target = ''
    prompt = f'question: {example["question"]}  context: {example["context"]}'
    return {'input_text': prompt, 'target_text': target}

# Build the text-only dataset: every original column is dropped and replaced
# by the 'input_text' / 'target_text' pair produced by make_input_examples.
raw_column_names = squad_dataset['train'].column_names
squad_text = squad_dataset.map(make_input_examples, remove_columns=raw_column_names)

def preprocess_function(examples):
    """Tokenize a batch of prompt/answer pairs for seq2seq training.

    Args:
        examples: Batch mapping with 'input_text' and 'target_text' lists.

    Returns:
        The tokenizer output for the inputs, with an added 'labels' entry
        holding the target token ids where padding positions are set to -100.
    """
    source_texts = examples['input_text']
    target_texts = examples['target_text']

    # Both sides are truncated and padded to their fixed maximum length.
    model_inputs = tokenizer(source_texts, max_length=max_input_length, truncation=True, padding='max_length')
    labels = tokenizer(target_texts, max_length=max_target_length, truncation=True, padding='max_length')

    # Swap pad ids for -100, the same value the notebook passes to the data
    # collator as label_pad_token_id.
    pad_id = tokenizer.pad_token_id
    masked_ids = []
    for sequence in labels['input_ids']:
        masked_ids.append([-100 if token_id == pad_id else token_id for token_id in sequence])
    labels['input_ids'] = masked_ids

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Tokenize both splits in batches and expose the columns as torch tensors.
tokenized_datasets = squad_text.map(preprocess_function, batched=True)
tokenized_datasets.set_format(type='torch')

# Shuffle only the training split, with a fixed seed so the order is repeatable.
shuffled_train = tokenized_datasets['train'].shuffle(seed=42)
tokenized_datasets['train'] = shuffled_train

print(tokenized_datasets)

Map: 100%|██████████| 10570/10570 [00:03<00:00, 2901.32 examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 10570
    })
})





## 4. Konfigurasi model untuk fine-tuning
Siapkan `T5ForConditionalGeneration`, data collator, dan fungsi metrik sederhana (Exact Match, token-level F1).

In [4]:
from transformers import T5ForConditionalGeneration, DataCollatorForSeq2Seq, TrainingArguments, Trainer
import numpy as np

# Pretrained T5-base weights to fine-tune.
model = T5ForConditionalGeneration.from_pretrained('t5-base')

# Collator for seq2seq batches; label padding uses -100, matching the value
# written into the labels during preprocessing.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100,
)

def normalize_answer(s):
    """Normalize an answer string the way the SQuAD evaluation does.

    Steps, in order: lowercase, strip ASCII punctuation, drop the English
    articles "a", "an" and "the", then collapse runs of whitespace. Without
    the punctuation/article steps, answers such as "The Denver Broncos." and
    "Denver Broncos" would be scored as different.

    Args:
        s: Answer text; any non-string value is converted with str().

    Returns:
        The normalized string (may be empty).
    """
    # Function-scope imports keep this cell self-contained.
    import re
    import string

    text = str(s).lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\b(a|an|the)\b', ' ', text)
    return ' '.join(text.split())

def compute_exact(a_gold, a_pred):
    """Return 1 if the gold and predicted answers are equal after normalization, else 0."""
    gold_norm = normalize_answer(a_gold)
    pred_norm = normalize_answer(a_pred)
    return 1 if gold_norm == pred_norm else 0

def compute_f1(a_gold, a_pred):
    """Token-level F1 between a gold answer and a predicted answer.

    Both strings are normalized and split on whitespace. Overlap is counted
    per token with multiplicity (the smaller of the two occurrence counts).

    Returns:
        1.0 when both token lists are empty, 0.0 when exactly one is empty or
        nothing overlaps, otherwise the harmonic mean of precision and recall.
    """
    reference = normalize_answer(a_gold).split()
    candidate = normalize_answer(a_pred).split()

    # Degenerate cases: at least one side has no tokens.
    if not (reference and candidate):
        return 1.0 if not (reference or candidate) else 0.0

    overlap = 0
    for token in set(reference).intersection(candidate):
        overlap += min(reference.count(token), candidate.count(token))
    if overlap == 0:
        return 0.0

    precision = overlap / len(candidate)
    recall = overlap / len(reference)
    return 2 * precision * recall / (precision + recall)

def compute_metrics(eval_preds):
    """Compute mean Exact Match and token-level F1 for an evaluation pass.

    Args:
        eval_preds: Pair (predictions, labels) of token-id arrays. If the
            predictions arrive as a tuple, its first element is used.

    Returns:
        dict with float entries 'exact_match' and 'f1', averaged over examples.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding marker, not a vocabulary id, so the tokenizer cannot
    # decode it. Labels always carry it; generated predictions may carry it too
    # once batches of different lengths are gathered, so map both to pad first.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    pairs = list(zip(decoded_labels, decoded_preds))
    exacts = [compute_exact(gold, pred) for gold, pred in pairs]
    f1s = [compute_f1(gold, pred) for gold, pred in pairs]
    return {'exact_match': float(np.mean(exacts)), 'f1': float(np.mean(f1s))}

## 5. Training dengan Trainer API
Konfigurasi `Seq2SeqTrainingArguments` dan `Seq2SeqTrainer`, lalu jalankan `trainer.train()` (jalankan hanya kalau sudah siap — training butuh GPU dan waktu yang cukup lama).

In [5]:
import torch
import inspect
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

use_cuda = torch.cuda.is_available()
n_gpu = torch.cuda.device_count() if use_cuda else 0
device = 'cuda' if use_cuda else 'cpu'
print(f'Using device: {device}, n_gpu: {n_gpu}')

# Larger batches and mixed precision are only enabled when a GPU is available.
per_device_train_batch_size = 8 if use_cuda else 4
per_device_eval_batch_size = 16 if use_cuda else 8
fp16_flag = use_cuda

# Common kwargs for Seq2SeqTrainingArguments
common_kwargs = dict(
    output_dir='t5-finetuned-squad',
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    eval_steps=500,
    save_steps=500,
    logging_steps=100,
    num_train_epochs=4,
    weight_decay=0.01,
    predict_with_generate=True,  # evaluate on generated text, not logits
    fp16=fp16_flag,
)

# The step-wise evaluation switch has had different names across Transformers
# releases. Probe the installed signature and prefer the current name, so that
# periodic evaluation is actually requested instead of silently falling back.
sig_params = inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
if 'eval_strategy' in sig_params:
    common_kwargs['eval_strategy'] = 'steps'
elif 'evaluation_strategy' in sig_params:
    common_kwargs['evaluation_strategy'] = 'steps'
else:
    # Last resort for versions exposing neither name.
    common_kwargs['do_eval'] = True

training_args = Seq2SeqTrainingArguments(**common_kwargs)

trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Same idea for the tokenizer argument: releases that accept `processing_class`
# warn about (and plan to remove) `tokenizer`, while older ones only know `tokenizer`.
trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
if 'processing_class' in trainer_params:
    trainer_kwargs['processing_class'] = tokenizer
else:
    trainer_kwargs['tokenizer'] = tokenizer

trainer = Seq2SeqTrainer(**trainer_kwargs)

# To start training, uncomment the next line. Training requires resources (GPU/TPU) and time.
# trainer.train()

Using device: cuda, n_gpu: 1


  trainer = Seq2SeqTrainer(


## 6. Evaluasi dan inference
Contoh: generate jawaban dari validation set dan hitung metrik yang sudah didefinisikan.

In [6]:
# Run inference on a few validation examples
examples = tokenized_datasets['validation'].select(range(16))
preds = trainer.predict(examples, max_length=64)

# Metrics returned by the predict call for this small subset.
print('METRICS:', preds.metrics)

# Mirror compute_metrics: accept a tuple of outputs and map the -100 padding
# marker back to the pad token, since -100 is not a decodable vocabulary id.
pad_id = tokenizer.pad_token_id
generated_ids = preds.predictions[0] if isinstance(preds.predictions, tuple) else preds.predictions
generated_ids = np.where(generated_ids != -100, generated_ids, pad_id)
decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

labels = np.where(preds.label_ids != -100, preds.label_ids, pad_id)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

# Print the first eight prompt / prediction / gold triples.
for q, p, g in zip(examples['input_text'][:8], decoded_preds[:8], decoded_labels[:8]):
    print('INPUT:', q)
    print('PRED :', p)
    print('GOLD :', g)
    print('---')

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


INPUT: question: Which NFL team represented the AFC at Super Bowl 50?  context: Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi's Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.
PRED : Denver Broncos
GOLD : Denver Broncos
---
INPUT: question: Which NFL team represented the NFC at Super Bowl 50?  context: Super Bowl 50 wa