In [1]:
%pip install -U transformers

Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install ipywidgets

from huggingface_hub import login
login("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")

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Load model directly
%pip install hf_xet

Note: you may need to restart the kernel to use updated packages.


In [4]:
%pip install torch torchvision transformers accelerate bitsandbytes pandas openpyxl


Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd

# Load Excel
df = pd.read_excel("tp_2017conference.xlsx")

# Drop rows missing title/abstract/review
df = df.dropna(subset=["title", "abstract", "review"])

# Clean abstract field
df["abstract"] = df["abstract"].str.replace("Abstract:###", "", regex=False).str.strip()

# Deduplicate by title (merge reviews)
grouped = df.groupby("title").agg({
    "abstract": "first",  # assume same abstract
    "review": lambda r: "\n\n".join(r),  # concat reviews
    "rate": list,
    "confidence": list,
    "decision": "first"
}).reset_index()


In [6]:
from sklearn.model_selection import train_test_split

train_val, test = train_test_split(grouped, test_size=0.2, random_state=42)
train, val = train_test_split(train_val, test_size=0.1, random_state=42)

# Save for future use
train.to_csv("train.csv", index=False)
val.to_csv("val.csv", index=False)
test.to_csv("test.csv", index=False)

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

model_name = "meta-llama/Llama-3.2-3B-Instruct"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token


model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # 👈 now this will choose GPU
    quantization_config=bnb_config,
    torch_dtype=torch.float16
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

ZERO-SHOT BASELINE EVAL:
Here’s the template structure you should follow (based on the format Llama models expect):

prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Title: {paper_title}
Abstract: {paper_abstract}
What are the main strengths and weaknesses of this paper?<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>"""


In training, your target (label) will be the review_summary (or the review)

First you need to clean the dataset to deduplicate the papers (group by title, take first abstract, and merge reviews)

1. Generate Zero-Shot Predictions
Use your cleaned dataset to prompt the base model without fine-tuning, and generate its assessment for each paper.

In [8]:
def generate_zero_shot(paper):
    prompt = f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n"
    prompt += f"Title: {paper['title']}\nAbstract: {paper['abstract']}\n"
    prompt += "What are the strengths and weaknesses of this paper?<|eot_id|>\n"
    prompt += "<|start_header_id|>assistant<|end_header_id|>\n"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=False, # Faster, deterministic
            temperature=1.0,
            top_p=1.0
        )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=False)
    response = decoded.split("<|start_header_id|>assistant<|end_header_id|>\n")[-1]
    return response.strip()


Sequential GPU inference

In [9]:
import pandas as pd
import time

# Load test set
test_df = pd.read_csv("test.csv")  # or use the uploaded path
print(f"Generating zero-shot reviews for {len(test_df)} test papers")

# Inference loop
preds = []
timings = []

for i, row in test_df.iterrows():
    print(f"[{i+1}/{len(test_df)}] {row['title'][:60]}...")

    try:
        start = time.time()
        response = generate_zero_shot(row)
        duration = time.time() - start
        preds.append(response)
        timings.append(duration)
        print(f"   ✅ Done in {duration:.2f}s")

    except Exception as e:
        preds.append(f"[ERROR: {e}]")
        timings.append(None)
        print(f"   ❌ Error: {e}")

# Save
test_df["zero_shot_review"] = preds
test_df["generation_time"] = timings
test_df.to_csv("zero_shot_predictions.csv", index=False)

print(f"\n✅ Saved zero-shot reviews for {len(test_df)} papers.")
print(f"⏱️ Avg generation time: {sum(t for t in timings if t) / len(timings):.2f}s")

Generating zero-shot reviews for 98 test papers
[1/98] Training deep neural-networks using a noise adaptation layer...
   ✅ Done in 12.33s
[2/98] Deep Character-Level Neural Machine Translation By Learning ...
   ✅ Done in 11.71s
[3/98] Third Person Imitation Learning | OpenReview...
   ✅ Done in 11.68s
[4/98] Unsupervised Learning of State Representations for Multiple ...
   ✅ Done in 11.72s
[5/98] The Neural Noisy Channel | OpenReview...
   ✅ Done in 11.64s
[6/98] New Learning Approach By Genetic Algorithm In A Convolutiona...
   ✅ Done in 11.63s
[7/98] An Actor-Critic Algorithm for Sequence Prediction | OpenRevi...
   ✅ Done in 11.69s
[8/98] Joint Training of Ratings and Reviews with Recurrent Recomme...
   ✅ Done in 11.68s
[9/98] What Is the Best Practice for CNNs Applied to Visual Instanc...
   ✅ Done in 11.66s
[10/98] Learning Efficient Algorithms with Hierarchical Attentive Me...
   ✅ Done in 11.54s
[11/98] A Learned Representation For Artistic Style | OpenReview...
   ✅ Done in

Fine-tuned model

In [10]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

# Prepare for LoRA fine-tuning
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

In [17]:
import pandas as pd
from datasets import Dataset

train_df = pd.read_csv("train.csv")
val_df = pd.read_csv("val.csv")
test_df = pd.read_csv("test.csv")

# Define prompt formatting function
def format_prompt(example):
    return {
        "text": f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n"
                f"Title: {example['title']}\nAbstract: {example['abstract']}\n"
                "What are the strengths and weaknesses of this paper?<|eot_id|>\n"
                "<|start_header_id|>assistant<|end_header_id|>\n"
                f"{example['review']}"  # ground-truth target
    }

train_dataset = Dataset.from_pandas(train_df).map(format_prompt)
val_dataset = Dataset.from_pandas(val_df).map(format_prompt)


Map:   0%|          | 0/351 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [12]:
%pip install peft

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [13]:
%pip install datasets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [18]:
#TOKENIZE AND prepare datasets
def tokenize_function(example):
    tokens= tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )
    tokens["labels"] = tokens["input_ids"].copy()  # use input_ids as labels
    return tokens

train_dataset = train_dataset.map(tokenize_function)
val_dataset = val_dataset.map(tokenize_function)

# Remove unused columns
train_dataset = train_dataset.remove_columns(["text", "title", "abstract", "review"])
val_dataset = val_dataset.remove_columns(["text", "title", "abstract", "review"])

Map:   0%|          | 0/351 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [19]:
# Fine-tuning setup with Trainer
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./finetuned-llama3",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    eval_strategy="steps",
    save_strategy="steps",
    logging_steps=10,
    save_steps=100,
    eval_steps=100,
    num_train_epochs=3,
    learning_rate=2e-5,
    fp16=True,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss
100,2.6878,2.656981
200,2.6218,2.550092
300,2.6737,2.529684
400,2.5607,2.521019
500,2.567,2.517606




TrainOutput(global_step=528, training_loss=2.668295285918496, metrics={'train_runtime': 322.0242, 'train_samples_per_second': 3.27, 'train_steps_per_second': 1.64, 'total_flos': 9132968946696192.0, 'train_loss': 2.668295285918496, 'epoch': 3.0})

In [20]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 3072)
        (layers): ModuleList(
          (0-27): 28 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.

EVALUATION

In [21]:
import pandas as pd
from datasets import Dataset

# Load test data
test_df = pd.read_csv("test.csv")

# Apply prompt format
def format_prompt(example):
    return {
        "text": f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n"
                f"Title: {example['title']}\nAbstract: {example['abstract']}\n"
                "What are the strengths and weaknesses of this paper?<|eot_id|>\n"
                "<|start_header_id|>assistant<|end_header_id|>\n"
        # no answer (model must generate it)
    }

formatted_test = Dataset.from_pandas(test_df).map(format_prompt)


Map:   0%|          | 0/98 [00:00<?, ? examples/s]

In [None]:
#tokenize the test set for inference
def tokenize_test(example):
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )

tokenized_test = formatted_test.map(tokenize_test)


Map:   0%|          | 0/98 [00:00<?, ? examples/s]

In [23]:
#Generate predictions
from tqdm import tqdm
import torch

def generate_review(example):
    inputs = tokenizer(example["text"], return_tensors="pt", truncation=True, max_length=512).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=False,
            temperature=1.0,
            top_p=1.0
        )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=False)
    return decoded.split("<|start_header_id|>assistant<|end_header_id|>\n")[-1].strip()


In [26]:
model.config.use_cache = True
model.gradient_checkpointing_disable()

In [27]:
model.eval()  # Set model to evaluation mode
test_predictions = []

for example in tqdm(test_df.to_dict(orient="records")):
    prompt = format_prompt(example)["text"]

    # Tokenize and move to device
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)

    # Inference
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=False,          # deterministic, faster
            pad_token_id=tokenizer.eos_token_id,
            temperature=1.0,
            top_p=1.0,
            use_cache=True            # ✅ IMPORTANT for speed
        )

    # Decode the output
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=False)
    response = decoded.split("<|start_header_id|>assistant<|end_header_id|>\n")[-1]

    test_predictions.append(response.strip())

# Save predictions
test_df["generated_review"] = test_predictions
test_df.to_csv("finetuned_predictions.csv", index=False)


100%|██████████| 98/98 [29:53<00:00, 18.30s/it]


EVALUATION WITH REGRESSION METRICS