<a href="https://colab.research.google.com/github/allan-jt/Llamathlete/blob/aditya/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Math Question Answer Verification Competition**

This notebook is based on the starter code from the [official Unsloth implementation](https://colab.research.google.com/drive/1Ys44kVvmeZtnICzWz0xgpRnrIOjZAuxp?usp=sharing#scrollTo=MKX_XKs_BNZR).

### Team Members:
- **Aditya Azad**     (aa10878)
- **Ching Huang**     (ch4802)
- **Allan Thekkepeedika** (ajt444)


## Preliminaries

In [1]:
# Identifiers for the supported notebook environments; NOTEBOOK_ENV selects
# which environment-specific setup path the install cell takes.
COLLAB, KAGGLE = 1, 0
NOTEBOOK_ENV = COLLAB

In [None]:
# %%capture
# (Uncomment the line above to hide the installer output.)
# This cell will take time: it installs the packages the notebook needs,
# using a different recipe for each environment selected by NOTEBOOK_ENV.

if NOTEBOOK_ENV == COLLAB:
  # Install unsloth from the package index, then replace it with the
  # "colab-new" build taken directly from the GitHub repository.
  !pip install unsloth
  !pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
else:
  # Remove the preinstalled torch packages, reinstall the cu121 builds
  # together with xformers, and only then install unsloth.
  !pip install pip3-autoremove
  !pip-autoremove torch torchvision torchaudio -y
  !pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121
  !pip install unsloth

In [None]:
# Optional: mount Google Drive when loading a model you've saved there.
# The google.colab package is only importable on Colab, so the mount is skipped
# for any other NOTEBOOK_ENV instead of aborting "Run All" with an ImportError.
if NOTEBOOK_ENV == COLLAB:
    from google.colab import drive

    drive.mount('/content/drive')

In [None]:
from unsloth import FastLanguageModel
import torch
import numpy as np

# The token size of our prompt doesn't exceed 500, so 1000 is a safe value.
max_seq_length = 1_000
# None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+.
dtype = None
# Use 4bit quantization to reduce memory usage. Can be False.
load_in_4bit = True

In [None]:
# Two ways to use this cell:
#   * start fresh from the pretrained Llama checkpoint named below, or
#   * continue training your own fine-tuned model by putting its path
#     in model_name instead.
model_name = "unsloth/Meta-Llama-3.1-8B"

# The remaining loader settings come from the configuration cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
)

## Wrap model with LoRA adapters

Note: This is only required if you're starting fresh with Llama

In [5]:
# LoRA settings we experimented with; every other option stayed at its default.
class LoRAConfig:
  """Bundle of the LoRA hyperparameters that varied between experiments.

  Attributes mirror the constructor arguments: r, lora_alpha, use_rslora.
  """

  def __init__(self, r, lora_alpha, use_rslora=True):
    self.r, self.lora_alpha, self.use_rslora = r, lora_alpha, use_rslora


# Candidate configurations, listed as (r, lora_alpha); all of them enable rsLoRA.
LoRAConfig1, LoRAConfig2, LoRAConfig3 = (
    LoRAConfig(r=rank, lora_alpha=alpha, use_rslora=True)
    for rank, alpha in ((16, 16), (32, 32), (32, 64))
)

In [None]:
# Select which of the LoRA experiment configurations to apply.
loraConfig = LoRAConfig3

model = FastLanguageModel.get_peft_model(
    model,
    # Experiment-specific values.
    r=loraConfig.r,
    lora_alpha=loraConfig.lora_alpha,
    use_rslora=loraConfig.use_rslora,
    # Fixed values shared by every experiment.
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    loftq_config=None,
)

## Download competition dataset and process with selected prompt

In [None]:
# Load competition datasets and extract training data to create training and validation datasets
from datasets import load_dataset

dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp")
train_data = dataset['train']

# NOTE: the validation split below is currently disabled.
# A large validation set can't fit in memory on Colab and Kaggle,
# so we assign a small percentage (0.1%) of the training data for
# validation because the size of the training data is 1,000,000.
# train_size_percent = 0.999

# dataset_size = len(dataset['train'])
# train_size = round(train_size_percent * dataset_size)
# val_size = dataset_size - train_size

# train_data, val_data = torch.utils.data.random_split(dataset['train'], [train_size, val_size])
# train_data = dataset['train'].select(train_data.indices)
# val_data = dataset['train'].select(val_data.indices)

# Balance the dataset by dropping a random subset of the examples whose
# `is_correct` label is falsy.
NUM_FALSE_TO_DROP = 200_000
BALANCE_SEED = 3407  # same value the LoRA and trainer cells use as their seed

false_indices = [
    i for i, x in enumerate(train_data["is_correct"]) if not x
]
# A dedicated seeded generator makes the subsample, and therefore the training
# set, identical on every "Restart & Run All". Like the unseeded call it
# replaces, this raises ValueError if fewer than NUM_FALSE_TO_DROP falsy
# examples exist.
balance_rng = np.random.default_rng(BALANCE_SEED)
indices_to_remove = balance_rng.choice(
    false_indices, size=NUM_FALSE_TO_DROP, replace=False
)
keep_mask = np.ones(len(train_data), dtype=bool)
keep_mask[indices_to_remove] = False
train_data = train_data.select(np.flatnonzero(keep_mask))


In [8]:
# Prompt template shared by training and inference. It has four positional
# `{}` slots, filled in order with: the question, the given answer, the
# explanation, and the grading. Training fills the last slot with the
# `is_correct` label; inference passes an empty string so the model supplies it.
prompt = """You are a math grader tasked with evaluating whether a given answer to a math question is correct or not. Respond with 'True' if the answer is correct and 'False' if it is incorrect.

Below are the Question, the given Answer, and the Explanation of the Answer.

### Question:
{}

### Answer:
{}

### Explanation:
{}

### It’s very important to grade the Answer accurately, so you must
1.  Carefully read and understand the Question.
2.  Review the given Answer and compare it against the Explanation provided.
3.  If the Explanation correctly justifies the Answer, respond with 'True'.
4.  If the Explanation is incorrect or does not logically support the Answer, respond with 'False'.

### Grading ('True' or 'False'):
{}"""

# End-of-sequence marker of the loaded tokenizer; appended to every training text.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    """Render every row of a batched `examples` mapping into a training text.

    The "question", "answer", "solution" and "is_correct" columns are read in
    parallel and substituted, in that order, into the four slots of the
    module-level `prompt` template.

    Returns:
        A dict with a single "text" key holding one rendered string per row.
    """
    columns = (
        examples["question"],
        examples["answer"],
        examples["solution"],
        examples["is_correct"],
    )
    # EOS_TOKEN must be appended to prevent infinite generation.
    return {"text": [prompt.format(*row) + EOS_TOKEN for row in zip(*columns)]}

In [None]:
# Generate the prompt text for every training datapoint; the batched map stores
# it under the "text" key returned by formatting_prompts_func.
# The validation counterpart stays commented out while validation is disabled.
train_dataset = train_data.map(formatting_prompts_func, batched = True)
# val_dataset = val_data.map(formatting_prompts_func, batched = True)

In [None]:
# Print a sample training datapoint.
# Index the row first: `train_dataset['text']` would load the entire text
# column just to read its first entry.
train_dataset[0]['text']

In [None]:
# Print a sample validation datapoint
# val_dataset['text'][0]

## Supervised Fine-tuning Trainer (SFT)

In [11]:
# Trainer settings we experimented with; every other option stayed at its default.
class SFTConfig:
    """Bundle of the trainer hyperparameters that varied between experiments.

    Attributes mirror the constructor arguments one-to-one.
    """

    def __init__(
        self,
        batch_size,
        accumulation_steps,
        warmup_steps,
        learning_rate,
        lr_scheduler_type,
        max_steps,
        weight_decay
    ):
        # Batch shape.
        self.batch_size, self.accumulation_steps = batch_size, accumulation_steps
        # Learning-rate schedule.
        self.warmup_steps, self.learning_rate, self.lr_scheduler_type = (
            warmup_steps, learning_rate, lr_scheduler_type
        )
        # Run length and regularisation.
        self.max_steps, self.weight_decay = max_steps, weight_decay


# Learning rates to choose from
lr_slow = 1e-4
lr_medium = 2e-4
lr_fast = 3e-4

# Experiment configurations
SFTConfig1 = SFTConfig(
    batch_size=4, accumulation_steps=4,
    learning_rate=lr_slow, lr_scheduler_type="cosine",
    warmup_steps=50, max_steps=2000,
    weight_decay=0.001,
)

SFTConfig2 = SFTConfig(
    batch_size=8, accumulation_steps=4,
    learning_rate=lr_slow, lr_scheduler_type="cosine",
    warmup_steps=50, max_steps=1200,
    weight_decay=0.001,
)

SFTConfig3 = SFTConfig(
    batch_size=8, accumulation_steps=4,
    learning_rate=lr_slow, lr_scheduler_type="linear",
    warmup_steps=50, max_steps=3500,
    weight_decay=0.001,
)

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Select which of the trainer experiment configurations to run.
sftConfig = SFTConfig3
training_args = TrainingArguments(
#   Custom configurations
    per_device_train_batch_size = sftConfig.batch_size,
    gradient_accumulation_steps = sftConfig.accumulation_steps,
    # Each of these two must receive its namesake from the config. Swapped,
    # the run would stop after `warmup_steps` optimizer steps while the
    # learning-rate warmup was stretched over `max_steps`.
    warmup_steps = sftConfig.warmup_steps,
    max_steps = sftConfig.max_steps,
    learning_rate = sftConfig.learning_rate,
    lr_scheduler_type = sftConfig.lr_scheduler_type,
    weight_decay = sftConfig.weight_decay,

#   Default configurations
    # Exactly one of fp16 / bf16 is enabled, depending on bfloat16 support.
    fp16 = not is_bfloat16_supported(),
    bf16 = is_bfloat16_supported(),
    logging_steps = 1,
    optim = "adamw_8bit",
    seed = 3407,
    output_dir = "outputs",
    report_to = "none",

#   Validation configurations (disabled together with the validation dataset)
    # per_device_eval_batch_size = 2,
    # eval_strategy = "steps",
    # eval_steps = 100,
    # eval_accumulation_steps = 10,
)

# The trainer reads the rendered prompts from the "text" field of train_dataset.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    # eval_dataset = val_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 4,
    packing = False, # Can make training 5x faster for short sequences.
    args = training_args
)

In [None]:
# Set to True to continue an interrupted run from a saved checkpoint
# instead of starting the training from scratch.
RESUME_TRAINING = False

trainer_stats = (
    trainer.train(resume_from_checkpoint=True)
    if RESUME_TRAINING
    else trainer.train()
)

## Save model and training state

In [None]:
# Local saving: the model and the tokenizer both go into the "outputs" folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained("outputs")
# Persist the trainer's state as well.
trainer.save_state()

## Conducting inference

In [None]:
# Ask PyTorch to release its cached, currently unused GPU memory before
# inference starts.
torch.cuda.empty_cache()

In [None]:
# Pull the three columns that feed the prompt out of the competition's test split.
test_dataset = dataset['test']

sample_ques, sample_ans, sample_sol = (
    test_dataset[column] for column in ('question', 'answer', 'solution')
)

In [None]:
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

# Batched generation with a decoder-only model continues from the last position
# of every row, so shorter prompts must be padded on the LEFT; with right
# padding they would be continued from pad tokens. Set it explicitly rather
# than relying on whatever the tokenizer is currently configured with.
tokenizer.padding_side = "left"

# Build one prompt per test example, leaving the grading slot blank so that the
# model generates it.
input_prompts = [
    prompt.format(question, given_answer, explanation, "")
    for question, given_answer, explanation in zip(sample_ques, sample_ans, sample_sol)
]

sol = []         # one decoded grading string per test example, in dataset order
chunk_size = 16  # number of prompts per generate() call, to bound GPU memory
for start in range(0, len(input_prompts), chunk_size):
    chunk = input_prompts[start:start + chunk_size]
    # Pad to the longest prompt of the chunk; nothing is truncated.
    inputs = tokenizer(chunk, return_tensors="pt", padding=True).to("cuda")
    input_token_len = inputs['input_ids'].shape[1]  # padded prompt length (dim 0 is the batch)

    # A single new token is requested; everything past the prompt is the grading.
    outputs = model.generate(**inputs, max_new_tokens=1, use_cache=True)
    response = tokenizer.batch_decode(outputs[:, input_token_len:], skip_special_tokens=True)
    sol.extend(response)  # in-place append; avoids rebuilding the list every chunk

    # Clear cache after each chunk to free up memory
    torch.cuda.empty_cache()

In [None]:
import pandas as pd

In [None]:
# Build the submission table: one row per prediction, with the row position as ID.
ID = list(range(len(sol)))
# Strip surrounding whitespace before comparing, so a decoded " True" or
# "True\n" is not silently graded as False.
is_correct = [s.strip() == 'True' for s in sol]
# Named `submission` rather than `dict`, which would shadow the builtin for
# every cell executed afterwards in the same kernel.
submission = {'ID': ID, 'is_correct': is_correct}

df = pd.DataFrame(submission)
df.to_csv('submission.csv', index=False)