# Fine-Tuning a Small LLM with LoRA for Automated Paper Review Insights (code)

In [10]:

# ---------------------- IMPORTS ----------------------
%pip install -r requirements.txt
import pandas as pd
import os
import torch
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from sklearn.model_selection import train_test_split
from datasets import Dataset
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error,root_mean_squared_error,r2_score
from scipy.stats import pearsonr, spearmanr
from bert_score import score as bertscore
from huggingface_hub import login
login("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")


Collecting transformers==4.30.0 (from -r requirements.txt (line 1))
  Using cached transformers-4.30.0-py3-none-any.whl.metadata (113 kB)
Collecting torch==2.8.0 (from -r requirements.txt (line 3))
  Using cached torch-2.8.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting bitsandbytes==0.38.1 (from -r requirements.txt (line 4))
  Using cached bitsandbytes-0.38.1-py3-none-any.whl.metadata (9.8 kB)
Collecting peft==0.4.0 (from -r requirements.txt (line 5))
  Using cached peft-0.4.0-py3-none-any.whl.metadata (21 kB)
Collecting tqdm==4.65.0 (from -r requirements.txt (line 6))
  Using cached tqdm-4.65.0-py3-none-any.whl.metadata (56 kB)
Collecting pandas==2.0.1 (from -r requirements.txt (line 10))
  Using cached pandas-2.0.1-cp312-cp312-linux_x86_64.whl
Collecting openpyxl==3.1.2 (from -r requirements.txt (line 11))
  Using cached openpyxl-3.1.2-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.30.0->-r requirements.t

This cell loads and preprocesses the paper review dataset. It reads the Excel file, removes incomplete entries, cleans text fields, merges reviews for duplicate titles, and prepares the data for training and evaluation. Several utility functions are defined for cleaning and extracting relevant information from the dataset.

In [None]:
# ---------------------- LOAD & CLEAN DATA ----------------------

df = pd.read_excel("../data/tp_2017conference.xlsx")

# Drop rows missing title/abstract/review
df = df.dropna(subset=["title", "abstract", "review"])

# Clean abstract field
df["abstract"] = df["abstract"].str.replace("Abstract:###", "", regex=False).str.strip()

# Deduplicate by title (merge reviews)
grouped = df.groupby("title").agg({
    "abstract": "first",  # assume same abstract
    "review": lambda r: "\n\n".join(r),  # concat reviews
    "rate": list,
    "confidence": list,
    "decision": "first"
}).reset_index()

# CLEANING FUNCTIONS

def clean_text(text):
    '''Rimuove caratteri non stampabili, tag HTML, codifiche Unicode e spazi extra'''
    if not isinstance(text, str):
        return ""
    text = re.sub(r"[\x00-\x1F\x7F]", " ", text)
    text = re.sub(r"<[^>]+>", "", text)
    text = re.sub(r"_x[0-9a-fA-F]{4}_", " ", text)
    text = re.sub(r"\\s+", " ", text)
    return text.strip()

def clean_dataset(df):
    '''Pulisce il DataFrame rimuovendo righe con campi essenziali mancanti e applicando clean_text'''
    # Tieni solo righe con i campi essenziali
    df = df.dropna(subset=["title", "abstract", "review"]).reset_index(drop=True)

    # Applica clean_text su tutte le colonne testuali
    for col in ["title", "abstract", "review"]:
        df[col] = df[col].map(clean_text)

    return df

def clean_response(text):
    """Rimuove token speciali e fallback in caso di errore"""
    if "<|start_header_id|>assistant<|end_header_id|>\n" in text:
        text = text.split("<|start_header_id|>assistant<|end_header_id|>\n", 1)[-1]
    return text.replace("<|eot_id|>", "").strip()

def extract_number(text):
    """Estrae numero da rate/confidence tipo 'Rating:###7: ...' """
    if pd.isna(text):
        return None
    match = re.search(r"(\d+)", str(text))
    return int(match.group(1)) if match else None

# APPLY CLEANING
grouped = clean_dataset(grouped)

# Rimozione del campo decision (evita leakage)
if "decision" in grouped.columns:
    grouped = grouped.drop(columns=["decision"])

# Normalizzazione numerica rate/confidence
grouped["rating_num"] = grouped["rate"].apply(lambda lst: extract_number(lst[0]) if isinstance(lst, list) and lst else None)
grouped["confidence_num"] = grouped["confidence"].apply(lambda lst: extract_number(lst[0]) if isinstance(lst, list) and lst else None)

# Split into train/val/test
train_val, test = train_test_split(grouped, test_size=0.2, random_state=42)
train, val = train_test_split(train_val, test_size=0.1, random_state=42)

# Save for future use
train.to_csv("../data/train.csv", index=False)
val.to_csv("../data/val.csv", index=False)
test.to_csv("../data/test.csv", index=False)

This cell initializes the tokenizer and loads the Llama-3 model for causal language modeling. It sets up quantization with BitsAndBytes for efficient memory usage and configures the tokenizer for padding and device placement. The model is loaded with automatic device mapping to utilize available GPU resources.

In [None]:
# ---------------------- TOKENIZER & MODEL LLAMA ----------------------

model_name = "meta-llama/Llama-3.2-3B-Instruct"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"


model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # this will choose GPU
    quantization_config=bnb_config,
    torch_dtype=torch.float16
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

KeyboardInterrupt: 

This cell prints the device map used by the loaded Llama-3 model, showing how model layers are distributed across available hardware (such as GPUs or CPUs). This helps verify that the model is utilizing the intended devices for inference and training.

In [None]:
print(model.hf_device_map)

{'': 0}


This cell defines the function used to build prompts for the language model. It formats the paper's title, abstract, and review into a structured input, instructing the model to generate strengths, weaknesses, a numeric rating, and a confidence score in a specific format.

In [None]:
# ---------------------- PROMPT FUNCTION ----------------------
def build_prompt(example):
    title = clean_text(example["title"])
    abstract = clean_text(example["abstract"])
    review = clean_text(example["review"])
    return (
        f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n"
        f"Title: {title}\n"
        f"Abstract: {abstract}\n"
        f"Review: {review}\n"
        "Please write a structured peer review with this exact format:\n\n"
        "Strengths:\n1. ...\n2. ...\n\n"
        "Weaknesses:\n1. ...\n2. ...<|eot_id|>\n"
        "Then, give a numeric rating (1-10) and a confidence score (1-5) in this format:\n"
        "Rating: <number>\n"
        "Confidence: <number><|eot_id|>\n"
        "<|start_header_id|>assistant<|end_header_id|>\n"
    )

This cell provides functions to generate structured peer reviews using the language model. It takes each paper, builds a prompt, and uses the model to produce strengths, weaknesses, rating, and confidence predictions. The second function applies this process to an entire DataFrame, saving the generated reviews and handling errors during generation.

In [None]:
# ---------------------- GENERATE STRENGTHS AND WEAKNESSES (both zero-shot/ft) ----------------------
def generate_review(model, tokenizer, paper, max_new_tokens=300):
    prompt = build_prompt(paper)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            temperature=0.2,
            top_p=0.9,
            use_cache=True
        )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=False)
    decoded = clean_response(decoded)
    return decoded

def generate_reviews_for_df(df, model, tokenizer, output_col="generated_review", output_file=None):
    preds = []

    print(f"Generating predictions for {len(df)} papers")

    model.eval()
    model.config.use_cache = True
    if hasattr(model, "gradient_checkpointing_disable"):
        model.gradient_checkpointing_disable()

    for i, row in tqdm(df.iterrows(), total=len(df)):
        try:
            response = generate_review(model, tokenizer, row)
            preds.append(response)
        except Exception as e:
            preds.append(f"[ERROR: {e}]")
            print(f"❌ Error on row {i}: {e}")

    df[output_col] = preds

    if output_file:
        df.to_csv(output_file, index=False)
        print(f"✅ Saved to {output_file}")

    return df

These two cells generate structured peer reviews for the training and validation sets using the base language model. They clean the training and validation data, apply the review generation function, and save the resulting structured reviews to two CSV files for use as supervised targets in fine-tuning.

In [None]:
# ---------------------- STRUCTURE REVIEW (Distillation) ----------------------

# Generate structured reviews (this is your supervised target)
train = pd.read_csv("../data/train.csv")

#CLEANING
train= clean_dataset(train)
train_df = generate_reviews_for_df(train, model, tokenizer, output_file="train_structured.csv")
train_df.to_csv("../data/train_structured.csv", index=False)


Generating predictions for 350 papers


  0%|          | 0/350 [00:00<?, ?it/s]

100%|██████████| 350/350 [1:18:33<00:00, 13.47s/it]

✅ Saved to train_structured.csv





In [None]:
val = pd.read_csv("../data/val.csv")
val = clean_dataset(val)
val_df = generate_reviews_for_df(val, model, tokenizer, output_file="val_structured.csv")
val_df.to_csv("../data/val_structured.csv", index=False)

Generating predictions for 40 papers


  0%|          | 0/40 [00:00<?, ?it/s]

100%|██████████| 40/40 [08:56<00:00, 13.40s/it]

✅ Saved to val_structured.csv





This cell configures the model for parameter-efficient fine-tuning using LoRA (Low-Rank Adaptation). It prepares the model for k-bit training and sets up the LoRA configuration, specifying target modules and hyperparameters. The LoRA adapter is then applied to the model to enable efficient adaptation during fine-tuning.

In [None]:
# ---------------------- LoRA CONFIG ----------------------

model_ft = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model_ft = get_peft_model(model_ft, lora_config)

This cell prepares the training and validation datasets for fine-tuning. It builds prompt-response pairs from the generated reviews, tokenizes the data for the language model, and removes unnecessary columns to create datasets suitable for supervised training.

In [None]:
# ---------------------- FINE-TUNING PREP ----------------------
def ft_prompt(example):
    prompt = build_prompt(example)
    response = example["generated_review"].strip() + "\n<|eot_id|>"
    return {"prompt": prompt, "response": response}

train_dataset = Dataset.from_pandas(train_df).map(ft_prompt)
val_dataset = Dataset.from_pandas(val_df).map(ft_prompt)

def tokenize(example):
    tokens= tokenizer(
        example["prompt"] + example["response"], 
        truncation=True, 
        padding="max_length", 
        max_length=300
    )
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

train_dataset = train_dataset.map(tokenize)
val_dataset = val_dataset.map(tokenize)

drop_cols = list(set(train_df.columns) | {"prompt", "response"})
train_dataset = train_dataset.remove_columns([c for c in drop_cols if c in train_dataset.column_names])
val_dataset = val_dataset.remove_columns([c for c in drop_cols if c in val_dataset.column_names])

Map:   0%|          | 0/350 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/350 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

This cell sets up and runs the training loop for fine-tuning the model using the Hugging Face Trainer API. It defines training arguments such as batch size, evaluation strategy, number of epochs, and learning rate, then starts the training process and saves the fine-tuned model and tokenizer.

In [None]:
# ---------------------- TRAINING ----------------------
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./finetuned-llama3",
    remove_unused_columns=False,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    eval_strategy="steps",
    save_strategy="steps",
    logging_steps=10,
    save_steps=100,
    eval_steps=100,
    num_train_epochs=3,
    learning_rate=2e-5,
    fp16=True,
    report_to="none"
)

trainer = Trainer(
    model=model_ft,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

trainer.train()

# Save model
model_ft.save_pretrained("finetuned-llama3-lora")
tokenizer.save_pretrained("finetuned-llama3-lora")

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
100,2.7525,2.618532
200,2.5855,2.538239
300,2.6513,2.521123
400,2.5451,2.514292
500,2.518,2.511523




('finetuned-llama3-lora/tokenizer_config.json',
 'finetuned-llama3-lora/special_tokens_map.json',
 'finetuned-llama3-lora/chat_template.jinja',
 'finetuned-llama3-lora/tokenizer.json')

This cell performs inference on the test set using the base language model. It cleans the test data, generates zero-shot structured peer reviews, and saves the predictions to a CSV file for later evaluation.

In [None]:
# ---------------------- INFERENCE ON TEST SET ----------------------
# Generate zero-shot reviews
test_df = pd.read_csv("../data/test.csv")
test_df = clean_dataset(test_df)

generate_reviews_for_df(
    df=test_df,
    model=model,
    tokenizer=tokenizer,
    output_col="zero_shot_review",
    output_file="zero_shot_predictions.csv"
)


Generating predictions for 98 papers


  0%|          | 0/98 [00:00<?, ?it/s]

100%|██████████| 98/98 [27:02<00:00, 16.55s/it]

✅ Saved to zero_shot_predictions.csv





Unnamed: 0,title,abstract,review,rate,confidence,rating_num,confidence_num,zero_shot_review
0,Training deep neural-networks using a noise ad...,The availability of large datsets has enabled ...,This paper looks at how to train if there are ...,['Rating:###5: Marginally below acceptance thr...,['Confidence:###4: The reviewer is confident b...,5,4,Strengths: 1. The paper addresses a very commo...
1,Deep Character-Level Neural Machine Translatio...,Neural machine translation aims at building a ...,* Summary: This paper proposes a neural machin...,['Rating:###6: Marginally above acceptance thr...,['Confidence:###4: The reviewer is confident b...,6,4,Strengths: 1. The paper is well-written. 2. Th...
2,Third Person Imitation Learning | OpenReview,Reinforcement learning (RL) makes it possible ...,This paper proposed a novel adversarial framew...,['Rating:###5: Marginally below acceptance thr...,['Confidence:###3: The reviewer is fairly conf...,5,3,Strengths: 1. The paper presents a novel appro...
3,Unsupervised Learning of State Representations...,We present an approach for learning state repr...,This paper is about learning unsupervised stat...,['Rating:###6: Marginally above acceptance thr...,['Confidence:###4: The reviewer is confident b...,6,4,Strengths:\n1. The paper is clearly written an...
4,The Neural Noisy Channel | OpenReview,We formulate sequence to sequence transduction...,This paper proposes to use an SSNT model of p(...,"['Rating:###7: Good paper, accept', 'Rating:##...",['Confidence:###4: The reviewer is confident b...,7,4,Strengths: 1. The paper proposes a novel appro...
...,...,...,...,...,...,...,...,...
93,OMG: Orthogonal Method of Grouping With Applic...,Training a classifier with only a few examples...,This paper proposes a k-shot learning framewor...,['Rating:###4: Ok but not good enough - reject...,['Confidence:###4: The reviewer is confident b...,4,4,Strengths: 1. The paper proposes a novel appro...
94,Rethinking Numerical Representations for Deep ...,With ever-increasing computational demand for ...,The paper studies the impact of using customiz...,['Rating:###6: Marginally above acceptance thr...,['Confidence:###3: The reviewer is fairly conf...,6,3,Strengths: 1. The paper is well written and ea...
95,Calibrating Energy-based Generative Adversaria...,"In this paper, we propose to equip Generative ...",This paper addresses one of the major shortcom...,"['Rating:###8: Top 50% of accepted papers, cle...",['Confidence:###4: The reviewer is confident b...,8,4,Strengths: 1. The paper presents a novel appro...
96,Machine Solver for Physics Word Problems | Ope...,We build a machine solver for word problems on...,The authors describe a system for solving phys...,['Rating:###4: Ok but not good enough - reject...,['Confidence:###4: The reviewer is confident b...,4,4,Strengths: 1. The paper is well-written and ea...


This cell generates structured peer reviews for the test set using the fine-tuned model. It cleans the test data, applies the review generation function, and saves the resulting predictions to a CSV file for evaluation.

In [None]:
# Generate reviews using fine-tuned model

test_df = pd.read_csv("../data/test.csv")
test_df = clean_dataset(test_df)

generate_reviews_for_df(
    df=test_df, 
    model=model_ft, 
    tokenizer=tokenizer, 
    output_col="fine_tuned_review", 
    output_file="finetuned_predictions.csv"
)

NameError: name 'pd' is not defined

This cell evaluates the zero-shot and fine-tuned model predictions using regression metrics. It extracts numeric ratings and confidence scores from the generated reviews, computes metrics such as MAE, RMSE, R², Pearson, and Spearman correlations, and calculates BERTScore to assess the textual similarity between generated and reference reviews.

In [None]:
# ---------------------- EVALUATION ----------------------
import re

# Carica predizioni
zero = pd.read_csv("zero_shot_predictions.csv")
ft = pd.read_csv("finetuned_predictions.csv")

# Funzione per estrarre numeri dal testo
def extract_pred_numbers(text):
    rating, confidence = None, None
    if isinstance(text, str):
        match_r = re.search(r"Rating:\s*(\d+)", text)
        match_c = re.search(r"Confidence:\s*(\d+)", text)
        if match_r:
            rating = int(match_r.group(1))
        if match_c:
            confidence = int(match_c.group(1))
    return rating, confidence

# Aggiungi colonne predette
zero[["rating_pred", "confidence_pred"]] = zero["zero_shot_review"].apply(
    lambda x: pd.Series(extract_pred_numbers(x))
)
ft[["rating_pred", "confidence_pred"]] = ft["fine_tuned_review"].apply(
    lambda x: pd.Series(extract_pred_numbers(x))
)

# Funzione per regression metrics
def regression_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = root_mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    pr, _ = pearsonr(y_true, y_pred)
    sr, _ = spearmanr(y_true, y_pred)
    return dict(MAE=mae, RMSE=rmse, R2=r2, Pearson=pr, Spearman=sr)

# Drop rows with missing predictions
zero_eval = zero.dropna(subset=["rating_pred", "confidence_pred"])
ft_eval = ft.dropna(subset=["rating_pred", "confidence_pred"])

print("Zero-shot usable samples:", len(zero_eval), "/", len(zero))
print("Fine-tuned usable samples:", len(ft_eval), "/", len(ft))

# Rating
print("Zero-shot Rating Metrics:", regression_metrics(zero_eval["rating_num"], zero_eval["rating_pred"]))
print("Fine-tuned Rating Metrics:", regression_metrics(ft_eval["rating_num"], ft_eval["rating_pred"]))

# Confidence
print("Zero-shot Confidence Metrics:", regression_metrics(zero_eval["confidence_num"], zero_eval["confidence_pred"]))
print("Fine-tuned Confidence Metrics:", regression_metrics(ft_eval["confidence_num"], ft_eval["confidence_pred"]))

# BERTScore su review testuali
P, R, F1 = bertscore(
    cands=ft["fine_tuned_review"].fillna("").tolist(),
    refs=ft["review"].fillna("").tolist(),
    lang="en"
)
print("Fine-tuned BERTScore F1:", F1.mean().item())


Zero-shot usable samples: 71 / 98
Fine-tuned usable samples: 65 / 98
Zero-shot Rating Metrics: {'MAE': 1.2394366197183098, 'RMSE': 1.5833642201089195, 'R2': -0.050365691489361764, 'Pearson': 0.44940425697779995, 'Spearman': 0.49971386386603706}
Fine-tuned Rating Metrics: {'MAE': 1.1846153846153846, 'RMSE': 1.4728308691872156, 'R2': -0.03489159891598925, 'Pearson': 0.4272394633667977, 'Spearman': 0.42107337422528374}
Zero-shot Confidence Metrics: {'MAE': 0.8873239436619719, 'RMSE': 1.1196075771271663, 'R2': -1.256785714285714, 'Pearson': 0.05224773559977741, 'Spearman': 0.048965873759822386}
Fine-tuned Confidence Metrics: {'MAE': 0.8769230769230769, 'RMSE': 1.1024448355290233, 'R2': -0.9018518518518521, 'Pearson': 0.16858544608470485, 'Spearman': 0.20658396443117913}


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fine-tuned BERTScore F1: 0.8384987711906433
