In [None]:
! pip install transformers datasets peft accelerate bitsandbytes fair-esm nltk rouge-score bertscore

import nltk
nltk.download('punkt')

In [None]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    Trainer, TrainingArguments,
    DataCollatorForLanguageModeling
)
from transformers.trainer_callback import TrainerCallback
from sklearn.metrics import accuracy_score
import torch
import json

# Module-level histories: LossCallback appends to loss_history and
# compute_metrics appends to accuracy_history; both are dumped to JSON after training.
loss_history = []
accuracy_history = []

# Base checkpoint used for both the tokenizer and the model.
MODEL_NAME = "gpt2"

# Load the JSONL file and select its single "train" split directly.
dataset = load_dataset(
    "json",
    data_files={"train": "/content/drive/MyDrive/swissprot_llm_dataset.jsonl"},
    split="train",
)

def format_prompt(example):
    """Build the full training text for one record.

    Args:
        example: Mapping with "prompt" and "response" entries.

    Returns:
        A dict with a single "text" key holding the instruction/response
        template filled in with the record's prompt and response.
    """
    formatted = f"### Instruction:\n{example['prompt']}\n\n### Response:\n{example['response']}"
    return {"text": formatted}

# Apply format_prompt to every record; each call returns a {"text": ...} dict.
dataset = dataset.map(format_prompt)

# Tokenizer and model are both loaded from the same MODEL_NAME checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the EOS token for padding (presumably because this tokenizer has no
# pad token of its own — TODO confirm). tokenize_function pads to max_length,
# so a pad token must be set before it runs.
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    """Tokenize a batch of records using their "text" column.

    Every sequence is truncated or padded to exactly 512 tokens.

    Args:
        examples: Batch mapping that contains a "text" entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that batch.
    """
    encode_options = dict(padding="max_length", truncation=True, max_length=512)
    return tokenizer(examples["text"], **encode_options)

# Tokenize the whole dataset in batches.
tokenized_dataset = dataset.map(tokenize_function, batched=True)
def compute_metrics(eval_pred):
    """Compute token-level next-token accuracy for a causal LM evaluation pass.

    For a causal language model the output at position ``i`` is the prediction
    for the token at position ``i + 1``, so predictions and labels are shifted
    by one position before they are compared. Label positions equal to -100
    (the ignore index used for padded/masked tokens) are excluded from the score.

    Args:
        eval_pred: ``(predictions, labels)`` pair. ``predictions`` may be raw
            logits of rank 3, already-argmaxed token ids of rank 2, or a tuple
            whose first element is one of those. ``labels`` are token ids.

    Returns:
        ``{"accuracy": float}``. The value is also appended to the
        module-level ``accuracy_history`` list.
    """
    predictions, labels = eval_pred

    # Some models hand back a tuple of outputs; the logits come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Accept raw logits as well as ids that were argmaxed upstream.
    preds = predictions.argmax(-1) if predictions.ndim == 3 else predictions

    # Align "prediction made at position i" with "token at position i + 1".
    shifted_preds = preds[..., :-1]
    shifted_labels = labels[..., 1:]

    # Score only real target tokens, never ignored (-100) positions.
    scored = shifted_labels != -100
    if int(scored.sum()) == 0:
        acc = 0.0  # nothing to score; avoid the mean of an empty selection
    else:
        acc = float((shifted_preds[scored] == shifted_labels[scored]).mean())

    accuracy_history.append(acc)
    return {"accuracy": acc}

class LossCallback(TrainerCallback):
    """Trainer callback that records every logged "loss" value."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Append ``logs["loss"]`` to the module-level ``loss_history``.

        Log events with no payload, or without a "loss" entry, are ignored.
        """
        if not logs:
            return
        if "loss" in logs:
            loss_history.append(logs["loss"])

# Training configuration: 3 epochs, per-device batch size 2, a checkpoint saved
# each epoch (only the latest one kept), loss logged every 10 steps.
training_args = TrainingArguments(
    output_dir="./gpt2-annotation",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_steps=10,
    save_total_limit=1,
    save_strategy="epoch",
    # Mixed precision is enabled only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
    report_to="none",
    logging_dir="./logs",
)

# mlm=False selects the non-masked (causal) language-modeling collation.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# NOTE(review): compute_metrics is registered, but no eval_dataset is passed
# and no evaluation is requested anywhere in this cell. Presumably
# compute_metrics never runs and accuracy_history is written out empty —
# confirm, and add a held-out split if accuracy tracking is wanted.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[LossCallback()],
    compute_metrics=compute_metrics
)

trainer.train()

# Persist the fine-tuned weights and the tokenizer side by side; the
# evaluation cell reloads both from this directory.
model.save_pretrained("gpt2-functional-annotator")
tokenizer.save_pretrained("gpt2-functional-annotator")


# Dump the histories collected during training; the plotting cell reads these files.
with open("loss_history.json", "w") as f:
    json.dump(loss_history, f)
with open("accuracy_history.json", "w") as f:
    json.dump(accuracy_history, f)


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from tqdm import tqdm
import torch
import numpy as np
import os
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 looks like a leftover debugging switch
# (it is expected to make CUDA kernel launches synchronous) — confirm it is
# still wanted for a full evaluation run.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

model_path = "gpt2-functional-annotator"  # path to your fine-tuned model
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Fall back to EOS as the pad token when the saved tokenizer has none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id
model.eval()
# Use the GPU when available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# NOTE(review): this is the same JSONL file the training cell loads as "train",
# so the metrics below are measured on training data, not a held-out set.
dataset = load_dataset("json", data_files={"test": "/content/drive/MyDrive/swissprot_llm_dataset.jsonl"})["test"]

def format_prompt(example):
    """Build the inference prompt for one record.

    Unlike the training-time template, the text stops right after the
    "### Response:" marker so the model has to produce the answer.

    Args:
        example: Mapping with a "prompt" entry.

    Returns:
        The prompt string.
    """
    inference_prompt = f"### Instruction:\n{example['prompt']}\n\n### Response:"
    return inference_prompt

# One inference prompt and one gold response per record, in dataset order.
prompts = list(map(format_prompt, dataset))
references = list(map(lambda record: record["response"], dataset))

def generate_response(prompt):
    """Greedily generate the model's answer for a single prompt.

    The prompt is tokenized with truncation to at most 512 tokens, up to 100
    new tokens are generated without sampling, and only the newly generated
    tokens are decoded. Slicing by prompt length (instead of splitting the
    decoded text on "### Response:") keeps the result correct even when
    truncation has cut the marker off the end of a long prompt; the old split
    then returned the whole prompt plus continuation.

    Args:
        prompt: Prompt text, normally ending in "### Response:".

    Returns:
        The generated continuation, with special tokens removed and
        surrounding whitespace stripped.
    """
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
    prompt_length = inputs["input_ids"].shape[1]
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=False,
            # Pass the pad id explicitly so generation does not have to guess it.
            pad_token_id=tokenizer.pad_token_id,
        )
    # A decoder-only model returns prompt tokens followed by the new tokens.
    new_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

print("Generating predictions...")
# One generation per prompt, with a progress bar over the prompts.
predictions = list(map(generate_response, tqdm(prompts)))

# Metric 1: BLEU — whitespace-tokenized, one reference per prediction, smoothed.
smoothie = SmoothingFunction().method1
bleu_scores = list(map(
    lambda ref, pred: sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothie),
    references,
    predictions,
))
bleu_avg = np.mean(bleu_scores)

# Metric 2: ROUGE — ROUGE-L F-measure with stemming enabled.
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
rouge_l_f1_scores = list(map(
    lambda ref, pred: scorer.score(ref, pred)["rougeL"].fmeasure,
    references,
    predictions,
))
rouge_l_f1 = np.mean(rouge_l_f1_scores)

# Metric 3: Exact Match — case-insensitive, ignoring surrounding whitespace.
exact_matches = list(map(
    lambda pred, ref: int(pred.strip().lower() == ref.strip().lower()),
    predictions,
    references,
))
exact_match_acc = np.mean(exact_matches)

# Report the three corpus-level averages.
print(f"BLEU Score (avg): {bleu_avg:.4f}")
print(f"ROUGE-L F1 Score: {rouge_l_f1:.4f}")
print(f"Exact Match Accuracy: {exact_match_acc:.4f}")


In [None]:
from bert_score import score
# Score the generated predictions (first argument) against the gold
# references (second argument); the three results are unpacked as P, R, F1.
P, R, F1 = score(predictions, references, lang="en", verbose=True)
# Report the mean of F1 over all pairs.
print(f"BERTScore (F1 avg): {F1.mean().item():.4f}")


In [None]:
import json
import matplotlib.pyplot as plt

def _plot_history(values, label, xlabel, ylabel, title, color=None):
    """Draw one labelled 8x5 line plot of ``values`` and show it."""
    plt.figure(figsize=(8, 5))
    plt.plot(values, color=color, label=label)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.grid(True)
    plt.legend()
    plt.show()


# Reload the histories written by the training cell so this cell also works
# on a fresh kernel.
with open("loss_history.json") as f:
    loss_history = json.load(f)
with open("accuracy_history.json") as f:
    accuracy_history = json.load(f)

# Plot Loss
_plot_history(
    loss_history,
    label="Training Loss",
    xlabel="Log Steps",
    ylabel="Loss",
    title="Training Loss",
)

# Plot Accuracy (if any recorded).
# accuracy_history holds the per-token accuracy produced by compute_metrics,
# not a sequence-level exact match, so the title says "Token-Level".
if accuracy_history:
    _plot_history(
        accuracy_history,
        label="Accuracy",
        xlabel="Evaluation Steps",
        ylabel="Accuracy",
        title="Token-Level Accuracy",
        color="green",
    )
