# Environment Setup

### Option 1: Use Google Colab

In [None]:
# Upgrade pip, then install Unsloth straight from its GitHub repository,
# pinned to the `September-2025-v3` git ref named in the URL below.
!pip install --upgrade pip
!pip install "unsloth@git+https://github.com/unslothai/unsloth.git@September-2025-v3"

In [None]:
import os
import sys

from google.colab import drive

# Mount Google Drive so that installed packages and downloaded model files
# can live under /content/drive/MyDrive.
drive.mount('/content/drive')

# Make the packages stored in the Drive-backed environment importable.
sys.path.append('/content/drive/MyDrive/unsloth_env')

# Point every Hugging Face cache variable at the same Drive directory.
hf_cache_dir = "/content/drive/MyDrive/hf_cache"
for cache_var in ("HF_HOME", "TRANSFORMERS_CACHE", "HUGGINGFACE_HUB_CACHE"):
    os.environ[cache_var] = hf_cache_dir

### Option 2: Use [NYUAD HPC](https://ood.hpc.abudhabi.nyu.edu/pun/sys/dashboard/)

In [None]:
# List the gcc / g++ environment modules offered by the cluster, then load
# gcc 9.2.0 and print what that module sets up.
# NOTE(review): each `!` line presumably runs in its own subshell, so the
# effect of `module load` likely does not carry over into the notebook
# kernel -- the following cell sets CC / CXX / PATH explicitly. Confirm.
!module avail gcc
!module avail g++
!module load gcc/9.2.0
!module show gcc/9.2.0

In [None]:
import os

# Directory holding the gcc 9.2.0 binaries on the cluster.
gcc_bin = "/share/apps/NYUAD5/gcc/9.2.0/bin"

# Export the compiler paths through the standard CC / CXX variables.
os.environ["CC"] = os.path.join(gcc_bin, "gcc")
os.environ["CXX"] = os.path.join(gcc_bin, "g++")

# Put gcc_bin at the front of PATH exactly once. Re-running this cell must
# not keep growing PATH, and an unset/empty PATH must not produce a trailing
# separator (an empty PATH entry is an implicit "current directory" entry).
existing_path = os.environ.get("PATH", "")
path_entries = [gcc_bin]
if existing_path:
    path_entries += [
        entry for entry in existing_path.split(os.pathsep) if entry != gcc_bin
    ]
os.environ["PATH"] = os.pathsep.join(path_entries)

print("CC =", os.environ["CC"])
print("CXX =", os.environ["CXX"])

### Check the PyTorch / CUDA setup

In [None]:
import torch

# Report the PyTorch build and which GPU (if any) is visible.
print("PyTorch version:", torch.__version__)

cuda_ok = torch.cuda.is_available()
print("CUDA available:", cuda_ok)

# The per-device queries raise when no CUDA device is usable, so only call
# them once availability is confirmed; otherwise this diagnostic cell would
# crash instead of reporting the problem.
if cuda_ok:
    device_index = torch.cuda.current_device()
    print("Current device:", device_index)
    print("Device name:", torch.cuda.get_device_name(device_index))
else:
    print("Current device: none (CUDA is not available)")

print("Number of GPUs:", torch.cuda.device_count())

# Load a trained model

In [None]:
from unsloth import FastLanguageModel
import torch

# Directory of the saved fine-tuned checkpoint to load.
save_path = "/scratch/yl11109/trained_models_masked_R2/checkpoint-3125"

# Loading options.
max_seq_length = 2048  # any sequence length can be chosen here
dtype = None           # None lets the loader auto-detect the dtype for the GPU
load_in_4bit = True    # 4-bit quantization to save memory

# Restore both the model and its tokenizer from the checkpoint directory.
load_options = {
    "model_name": save_path,
    "max_seq_length": max_seq_length,
    "dtype": dtype,
    "load_in_4bit": load_in_4bit,
}
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)


# Validation loading

In [None]:
from datasets import load_dataset

# Pull the complete "train" split of the competition dataset.
full_dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp", split="train")

# Shuffle with a fixed seed so the same rows are selected on every run.
shuffled_dataset = full_dataset.shuffle(seed=42)

# Validation slice: 2000 consecutive rows of the shuffled split, starting at
# row 800000.
# NOTE(review): presumably chosen so it does not overlap the rows used for
# training -- confirm against the training notebook.
validation_start = 800000
validation_size = 2000
validation_dataset = shuffled_dataset.select(
    range(validation_start, validation_start + validation_size)
)

# Instruction prompt used for validation; the three placeholders are filled
# with the question, the solution and the answer, in that order.
validation_prompt = """You are a great mathematician and you are tasked with finding if a solution to a given maths question is correct or not. Your response should be 'true' if the solution is correct, otherwise 'false'. Below is the Question, Solution, and the Answer.
Question:
{}
Solution:
{}
Answer:
{}
Output:
"""

# Accuracy test

In [None]:
from tqdm import tqdm

# Inference prompt: the question, solution and answer are filled in, and the
# text stops right after "Output:" so the model produces the true/false
# verdict itself (no label is included).
validation_prompt = """You are a great mathematician and you are tasked with finding if a solution to a given maths question is correct or not. Your response should be 'true' if the solution is correct, otherwise 'false'. Below is the Question, Solution, and the Answer.
Question:
{}
Solution:
{}
Answer:
{}
Output:
"""

# Switch the loaded model into Unsloth's faster inference mode.
FastLanguageModel.for_inference(model)

def parse_output(response_text):
    """Extract the model's true/false verdict from decoded text.

    Only the text after the last "Output:\\n" marker is inspected (the whole
    string when the marker is absent). The comparison is case-insensitive.

    Args:
        response_text: Decoded model output, with or without the prompt.

    Returns:
        True if "true" appears and comes before any "false"; False otherwise
        (including when neither word is present).
    """
    output_part = response_text.rsplit("Output:\n", 1)[-1].lower()
    true_at = output_part.find("true")
    if true_at == -1:
        return False
    false_at = output_part.find("false")
    # The earliest verdict wins: a continuation such as "False ... true"
    # must not be read as True merely because "true" occurs later on.
    return false_at == -1 or true_at < false_at

# Iterate over the whole validation set and compute accuracy.
total = 0
correct = 0
predictions = []  # predicted labels as lowercase strings ("true" / "false")
true_labels = []  # reference labels, normalized the same way

for example in tqdm(validation_dataset):
    question = example["question"]
    solution = example["solution"]
    answer = example["answer"]
    texts = validation_prompt.format(question, str(solution), str(answer))
    inputs = tokenizer([texts], return_tensors="pt").to("cuda")

    # Generate the model's response.
    outputs = model.generate(**inputs, max_new_tokens=8, use_cache=True)

    # The generated sequence starts with the prompt tokens. Decode only the
    # newly generated tail: the instruction text itself contains 'true', so
    # parsing must never fall back to scanning the prompt.
    prompt_length = inputs["input_ids"].shape[1]
    response_text = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    )

    # Normalize prediction and reference label to "true" / "false".
    prediction = str(parse_output(response_text)).strip().lower()
    true_label = str(example["is_correct"]).strip().lower()  # expected: "true" or "false"

    predictions.append(prediction)
    true_labels.append(true_label)

    # Update counters.
    total += 1
    if prediction == true_label:
        correct += 1

# Print final accuracy (guarding against an empty validation set).
accuracy = 100.0 * correct / total if total else 0.0
print(f"\nValidation: {correct}/{total} correct, Accuracy = {accuracy:.2f}%")


# Submission CSV generation

In [None]:
import pandas as pd
from tqdm import tqdm

# Prompt for the test set; the placeholders take the question, the solution
# and the answer, and the text ends at "Output:" for the model to complete.
test_prompt = """You are a great mathematician and you are tasked with finding if a solution to a given maths question is correct or not. Your response should be 'true' if the solution is correct, otherwise 'false'. Below is the Question, Solution, and the Answer.
Question:
{}
Solution:
{}
Answer:
{}
Output:
"""

# Collected predictions, one per test example, in dataset order.
predictions = []

# Switch the loaded model into Unsloth's faster inference mode.
FastLanguageModel.for_inference(model)

# Fetch the official "test" split of the competition dataset.
test_dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp", split="test")

# A simple function to parse 'True' or 'False' from the model's raw output
def parse_output(response_text):
    """Extract the model's true/false verdict from decoded text.

    Only the text after the last "Output:\\n" marker is inspected (the whole
    string when the marker is absent). The comparison is case-insensitive.

    Args:
        response_text: Decoded model output, with or without the prompt.

    Returns:
        True if "true" appears and comes before any "false"; False otherwise
        (including when neither word is present).
    """
    output_part = response_text.rsplit("Output:\n", 1)[-1].lower()
    true_at = output_part.find("true")
    if true_at == -1:
        return False
    false_at = output_part.find("false")
    # The earliest verdict wins: a continuation such as "False ... true"
    # must not be read as True merely because "true" occurs later on.
    return false_at == -1 or true_at < false_at

# Loop through the test dataset and generate a prediction for each example.
for example in tqdm(test_dataset):
    question = example["question"]
    solution = example["solution"]
    answer = example["answer"]

    # Format the prompt and move the encoded batch to the GPU.
    prompt = test_prompt.format(question, str(solution), str(answer))
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    # Generate the prediction.
    outputs = model.generate(**inputs, max_new_tokens=8, use_cache=True)

    # The generated sequence starts with the prompt tokens. Decode only the
    # newly generated tail: the instruction text itself contains 'true', so
    # parsing must never fall back to scanning the prompt.
    prompt_length = inputs["input_ids"].shape[1]
    response_text = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    )

    # Parse the prediction (a bool) and add it to our list.
    prediction = parse_output(response_text)
    predictions.append(prediction)

# Create the submission DataFrame: IDs are the running index 0..n-1 in
# dataset order, paired with the boolean predictions.
submission = pd.DataFrame({
    'ID': range(len(predictions)),
    'is_correct': predictions
})

# Save the DataFrame to a CSV file (without the pandas index column).
submission.to_csv('/scratch/yl11109/submission.csv', index=False)

print("\nSubmission file 'submission.csv' created successfully!")
print("You can now download this file and submit it to the Kaggle competition.")