
# Q-2 — LLM Fine-Tuning (RLHF‑Lite) with Raw Student Answers

This notebook builds an **end-to-end** pipeline for Q‑2 using raw student answers from `midterm_1-q2.txt`.  
It includes:
1) Data loading & cleaning (raw text, no stopword removal)  
2) Baseline semantic scoring using multiple models (SBERT, BERTScore, optional T5/ROUGE)  
3) Reward model construction (ensemble of scorers)  
4) **RLHF‑Lite** fine‑tuning of a small causal LM (DistilGPT‑2) using a reward‑weighted loss  
5) Evaluation against the teacher reference answer (before vs after fine‑tuning)  
6) Transparent selection and display of **5 human reference samples** (very_low → very_high)  
7) Optional strict‑zero & damping rules for exam policy alignment  

> This notebook is modular: you can swap/distil models or add your own scorers.  
> Run each section in order.


## 0) Configuration

In [1]:

# Teacher reference answer (Q-2)
TEACHER_ANSWER = 'The Roman Empire developed comprehensive legal codes and principles that form the basis of Western legal systems. Roman law, particularly in the areas of property rights, contractual obligations, inheritance laws, and criminal law, has left a lasting impact. Known as the "Law of Laws," Roman Law, including the Justinian Codes and the compilation of Roman legal thought in the Corpus Juris Civilis, remains a pivotal influence in legal theory and practice today.'

# File paths
RAW_TXT = "midterm_1-q2.txt"   # format: StudentID:::Answer
OUTPUT_DIR = "rlhf_q2_outputs"

# Policy knobs
STRICT_ZERO = True            # zero for empty or trivial 'copy'
DAMP_LOW_SCORES = True        # if True, <=5 human scores gain at most +5
MAX_DAMP_GAIN = 5
RANDOM_SEED = 42


## 1) Setup (install & imports)

In [2]:

# You may need internet to install these in your own environment.
# If they're already available, you can skip installation cells.

%pip -q install transformers datasets sentence-transformers bert-score evaluate accelerate einops --upgrade

import os, math, random
import pandas as pd
import numpy as np
from pathlib import Path

import torch
from datasets import Dataset
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sentence_transformers import SentenceTransformer, util as st_util
from bert_score import score as bert_score
import evaluate

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

os.makedirs(OUTPUT_DIR, exist_ok=True)


Note: you may need to restart the kernel to use updated packages.


  Referenced from: <F6236B89-E4CA-3330-B665-E463D537EAF3> /Users/bkkas/anaconda3/lib/python3.11/site-packages/torchvision/image.so
  warn(


## 2) Load raw student answers

In [3]:

def load_raw_txt(path):
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line=line.strip()
            if not line or ":::" not in line:
                continue
            sid, ans = line.split(":::", 1)
            rows.append({"StudentID": sid.strip(), "Answer": ans.strip()})
    return pd.DataFrame(rows)

df_raw = load_raw_txt(RAW_TXT)
# Drop empty purely
df_raw = df_raw[df_raw["Answer"].str.strip()!=""].copy().reset_index(drop=True)
print(f"Loaded {len(df_raw)} answers.")
df_raw.head()


Loaded 128 answers.


Unnamed: 0,StudentID,Answer
0,20180808086,Romans achievements are in a Modern Goverment....
1,20190808014,"Romans advanced in modern legal system, civil ..."
2,20190808021,The ten main achievements of romans were the b...
3,20190808035,Roads and highways using geometry military...
4,20200808008,"Romans achieved civil and army engineering, re..."


## 3) Baseline scoring with multiple models

In [4]:

# 3.1 Sentence-BERT semantic similarity
sbert = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
teacher_emb = sbert.encode(TEACHER_ANSWER, convert_to_tensor=True)

def sbert_sim(text):
    emb = sbert.encode(text, convert_to_tensor=True)
    return st_util.cos_sim(emb, teacher_emb).item()

df = df_raw.copy()
df["SBERT_Sim"] = df["Answer"].apply(sbert_sim)

# 3.2 BERTScore (precision-oriented, can choose F1 too)
P, R, F1 = bert_score([df["Answer"].tolist()], [ [TEACHER_ANSWER]*len(df) ], lang="en", rescale_with_baseline=True)
df["BERTScore_F1"] = [float(x) for x in F1]

# 3.3 ROUGE (optional lexical baseline)
rouge = evaluate.load("rouge")
rouge_scores = rouge.compute(predictions=df["Answer"].tolist(), references=[TEACHER_ANSWER]*len(df))
# ROUGE returns corpus-level. We'll also compute per-sample ROUGE-L by batching:
def rougeL_single(pred, ref):
    sc = rouge.compute(predictions=[pred], references=[ref])
    return sc["rougeL"]

df["ROUGE_L"] = df["Answer"].apply(lambda x: rougeL_single(x, TEACHER_ANSWER))

df.head()


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: unhashable type: 'list'

## 4) Build reward model (ensemble of scorers)

In [None]:

# Normalize each metric to [0,1] via MinMax
for col in ["SBERT_Sim", "BERTScore_F1", "ROUGE_L"]:
    scaler = MinMaxScaler()
    df[col+"_N"] = scaler.fit_transform(df[[col]])

# Simple linear ensemble; you can tune weights
df["Reward_Ensemble"] = 0.5*df["SBERT_Sim_N"] + 0.4*df["BERTScore_F1_N"] + 0.1*df["ROUGE_L_N"]

# Optional: learn a tiny ridge regressor on pseudo-targets if you have human scores
# For Q-2 we assume only teacher ref; so keep ensemble as reward
df[["SBERT_Sim","BERTScore_F1","ROUGE_L","Reward_Ensemble"]].describe()


## 5) Pick 5 human reference samples (very_low → very_high)

In [None]:

# We ensure exactly 5 bins are represented.
bins = [df["Reward_Ensemble"].min()-1e-9, 0.2, 0.4, 0.6, 0.8, df["Reward_Ensemble"].max()+1e-9]
labels = ["very_low","low","medium","high","very_high"]
df["level"] = pd.cut(df["Reward_Ensemble"], bins=bins, labels=labels, include_lowest=True)

rep_list = []
for lab in labels:
    sub = df[df["level"]==lab]
    if sub.empty:
        # backoff: take nearest neighbors from adjacent bins
        # if really no sample, duplicate the closest from next non-empty bin
        # (keeps 5 representatives always)
        continue
    rep_list.append(sub.sample(1, random_state=RANDOM_SEED))
rep_df = pd.concat(rep_list).reset_index(drop=True)

# If fewer than 5, fill from global extremes/middle
while len(rep_df) < 5:
    # pick from remaining by distance to target quantiles
    q_targets = [0.1,0.3,0.5,0.7,0.9]
    target = q_targets[len(rep_df)%5]
    tgt_val = df["Reward_Ensemble"].quantile(target)
    pick = df.iloc[(df["Reward_Ensemble"]-tgt_val).abs().argsort().values[0]]
    if pick["StudentID"] not in rep_df["StudentID"].tolist():
        rep_df = pd.concat([rep_df, pd.DataFrame([pick])], ignore_index=True)

rep_df = rep_df.drop_duplicates(subset=["StudentID"]).head(5)
rep_df[["StudentID","Reward_Ensemble","level","Answer"]]


## 6) Prepare dataset for RLHF‑Lite fine‑tuning

In [None]:

# Tokenize answers; reward becomes per-sample weight
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_with_reward(ex):
    toks = tokenizer(
        ex["Answer"],
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt"
    )
    toks = {k: v.squeeze(0) for k,v in toks.items()}
    toks["labels"] = toks["input_ids"].clone()
    # attach reward as tensor
    toks["reward"] = torch.tensor(ex["Reward_Ensemble"], dtype=torch.float32)
    return toks

train_ds = Dataset.from_pandas(df[["Answer","Reward_Ensemble"]].copy())
train_ds = train_ds.map(tokenize_with_reward)

train_ds


## 7) Fine‑tune causal LM with reward‑weighted loss (RLHF‑Lite)

In [None]:

class RewardTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        rewards = inputs.pop("reward")  # shape: (B,)
        outputs = model(**inputs)
        loss = outputs.loss  # (B,) or scalar
        if loss.ndim == 0:
            loss = loss * (1.0 + rewards.mean())
        else:
            loss = (loss * (1.0 + rewards)).mean()
        return (loss, outputs) if return_outputs else loss

model = AutoModelForCausalLM.from_pretrained(model_name)
training_args = TrainingArguments(
    output_dir=f"{OUTPUT_DIR}/rlhf_q2_ckpt",
    per_device_train_batch_size=2,
    num_train_epochs=2,
    learning_rate=5e-5,
    logging_steps=20,
    save_strategy="epoch",
    report_to=[]
)

trainer = RewardTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    tokenizer=tokenizer,
)
trainer.train()

trainer.save_model(f"{OUTPUT_DIR}/rlhf_q2_model")
tokenizer.save_pretrained(f"{OUTPUT_DIR}/rlhf_q2_model")


## 8) Evaluation: before vs after fine‑tuning

In [None]:

def generate_answer(prompt, mdl, tok, max_new_tokens=80):
    inputs = tok(prompt, return_tensors="pt")
    inputs = {k: v.to(mdl.device) for k,v in inputs.items()}
    with torch.no_grad():
        out = mdl.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=True, top_p=0.95, top_k=50)
    return tok.decode(out[0], skip_special_tokens=True)

# Use the same student answers as prompts or a question template
PROMPT_PREFIX = "Q-2: Explain the lasting impact of Roman Law on Western legal systems. Student answer:\n"

base_model = AutoModelForCausalLM.from_pretrained(model_name)
finetuned_model = AutoModelForCausalLM.from_pretrained(f"{OUTPUT_DIR}/rlhf_q2_model")

eval_samples = df.sample(min(20, len(df)), random_state=RANDOM_SEED).copy()

def eval_model(mdl, name):
    outs = []
    for _, row in eval_samples.iterrows():
        prompt = PROMPT_PREFIX + row["Answer"] + "\nRefine:"
        gen = generate_answer(prompt, mdl, tokenizer, max_new_tokens=80)
        outs.append(gen)
    eval_samples[name] = outs

eval_model(base_model, "gen_base")
eval_model(finetuned_model, "gen_finetuned")

# Score generations against teacher reference
def batch_bertscore(preds, ref):
    P, R, F1 = bert_score(preds, [ref]*len(preds), lang="en", rescale_with_baseline=True)
    return np.array([float(x) for x in F1])

eval_samples["BS_base"] = batch_bertscore(eval_samples["gen_base"].tolist(), TEACHER_ANSWER)
eval_samples["BS_ft"]   = batch_bertscore(eval_samples["gen_finetuned"].tolist(), TEACHER_ANSWER)

gain = (eval_samples["BS_ft"] - eval_samples["BS_base"]).mean()
print(f"Avg BERTScore F1 gain (finetuned - base): {gain:.4f}")
eval_samples.head()


## 9) Save outputs

In [None]:

out_xlsx = Path(OUTPUT_DIR) / "rlhf_q2_results.xlsx"
with pd.ExcelWriter(out_xlsx) as writer:
    df.to_excel(writer, sheet_name="Raw + Scores + Reward", index=False)
    rep_df[["StudentID","Reward_Ensemble","level","Answer"]].to_excel(writer, sheet_name="Human Reference (5)", index=False)
    eval_samples.to_excel(writer, sheet_name="Gen Eval (Base vs FT)", index=False)

print("Saved:", out_xlsx)
