In [None]:
# Install Unsloth together with the Hugging Face datasets/transformers packages.
!pip install -q unsloth datasets transformers
# Remove the Unsloth package installed above and reinstall it from the GitHub
# repository with the "colab-new" extra.
!pip uninstall -y unsloth && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# NOTE(review): none of these installs are version-pinned, so a fresh run may
# resolve different versions than the ones recorded in the outputs below.
# NOTE(review): `trl` is imported below but never installed explicitly —
# presumably it arrives as an Unsloth dependency; confirm.
!pip install xformers

from unsloth import FastLanguageModel
from datasets import load_dataset
from transformers import TrainingArguments
from trl import SFTTrainer
import torch
import pandas as pd


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.7/59.7 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.9/163.9 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m53.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.5/209.5 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import TrainingArguments
from unsloth import FastLanguageModel
from trl import SFTTrainer

# Environment switches intended to turn off Weights & Biases reporting.
os.environ["WANDB_MODE"] = "disabled"
os.environ["WANDB_DISABLED"] = "true"

# Single seed reused for the LoRA initialisation and the dataset split.
seed = 3407

device = "cuda" if torch.cuda.is_available() else "cpu"

# Choose the compute dtype from what the GPU actually supports. Forcing
# bfloat16 on every CUDA device makes the later `bf16=True` training flag
# invalid on GPUs without bf16 support; fall back to float16 there.
if device == "cuda":
    dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
else:
    dtype = None

# Set model parameters
max_seq_length = 900
load_in_4bit = True

# Load model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Configure LoRA adapters on the attention and MLP projection modules.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=seed,
    use_rslora=False,
    loftq_config=None,
)

# NOTE(review): gradient checkpointing is already requested above through
# use_gradient_checkpointing="unsloth"; this extra call looks redundant —
# confirm it does not replace the Unsloth variant before removing it.
model.gradient_checkpointing_enable()

# Fetch every split of the competition dataset.
dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp")

# Partition the original train split 90/10 with the shared seed, then keep a
# fixed-size window from each side.
train_val_split = dataset["train"].train_test_split(test_size=0.1, seed=seed)
train_rows = range(65000)
val_rows = range(20000, 20500)
train_dataset = train_val_split["train"].select(train_rows)
val_dataset = train_val_split["test"].select(val_rows)

# Template with four positional slots: question, answer, explanation
# (the dataset's "solution" column) and the expected True/False output.
prompt = """You are the best mathematician and you have to find if an answer to a given maths question is correct or not to save the world as everyone trusts you. Your response should be 'True' if correct, otherwise 'False'.

### Question:
{}

### Answer:
{}

### Explanation:
{}

### Output:
{}"""

# Formatting function
def formatting_prompts_func(examples):
    """Render a batch of dataset rows into prompt strings.

    Args:
        examples: Batched mapping holding parallel sequences under the keys
            "question", "answer", "solution" and "is_correct".

    Returns:
        A dict with a single "text" key containing one string per row, built
        by filling the module-level ``prompt`` template with the row's
        question, answer, solution and is_correct values (in that order).
    """
    columns = (
        examples["question"],
        examples["answer"],
        examples["solution"],
        examples["is_correct"],
    )
    # zip(*columns) walks the four columns in lockstep, one row at a time.
    return {"text": [prompt.format(*row) for row in zip(*columns)]}


# Add the rendered "text" column to both subsets. Only train_dataset is passed
# to the trainer below; val_dataset is formatted here but not consumed in this
# block.
train_dataset = train_dataset.map(formatting_prompts_func, batched=True)
val_dataset = val_dataset.map(formatting_prompts_func, batched=True)

# Training settings
training_args = TrainingArguments(
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,  # 16 * 4 = 64 sequences per optimizer step
    warmup_steps=5,
    num_train_epochs=1,
    learning_rate=1.5e-3,
    # Mixed-precision flags follow the dtype the model was loaded with.
    fp16=dtype == torch.float16,
    bf16=dtype == torch.bfloat16,
    logging_steps=10,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    # Reuse the notebook-wide seed instead of repeating the literal, so the
    # trainer stays in sync with the split and LoRA seeds if it is changed.
    seed=seed,
    output_dir="outputs",
    report_to="none",
)


# Initializing trainer: supervised fine-tuning on the "text" column, one
# example per sequence (no packing).
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=4,
    packing=False,
    args=training_args,
)

trainer.train()


==((====))==  Unsloth 2024.11.7: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Map:   0%|          | 0/65000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/65000 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 65,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 16 | Gradient Accumulation steps = 4
\        /    Total batch size = 64 | Total steps = 1,015
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
10,1.0916
20,0.732
30,0.7217
40,0.7156
50,0.683
60,0.7064
70,0.6866
80,0.6883
90,0.6808
100,0.6949


TrainOutput(global_step=1015, training_loss=0.5872226996962073, metrics={'train_runtime': 11356.514, 'train_samples_per_second': 5.724, 'train_steps_per_second': 0.089, 'total_flos': 1.787479602333352e+18, 'train_loss': 0.5872226996962073, 'epoch': 0.9992616293379276})

In [None]:
# Switch the fine-tuned model to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)
test_dataset = dataset['test']

# NOTE(review): the instruction sentence below is worded differently from the
# prompt used to build the training text earlier in the notebook — confirm the
# mismatch is intentional.
prompt = """You are the best mathematician and have to determine if an answer to a given maths question is correct. Respond with 'True' if correct, otherwise 'False'.

### Question:
{}

### Answer:
{}

### Explanation:
{}

### Output:
{}"""

results = []

# Inference loop. Iterate row by row: indexing test_dataset['question'][i]
# inside the loop materialises the whole column on every iteration.
for i, row in enumerate(test_dataset):
    question = row['question']
    answer = row['answer']
    solution = row['solution']

    # Leave the output slot empty so the model fills it in.
    input_prompt = prompt.format(question, answer, solution, "")

    # Tokenize and run inference; a single new token is generated per example.
    inputs = tokenizer([input_prompt], return_tensors="pt").to(device)
    input_token_len = inputs['input_ids'].shape[1]
    outputs = model.generate(**inputs, max_new_tokens=1, use_cache=True)

    # Decode only the newly generated part, not the echoed prompt.
    response = tokenizer.decode(outputs[0][input_token_len:], skip_special_tokens=True).strip()

    # Anything that does not contain "True" is recorded as "False".
    is_correct_pred = "True" if "True" in response else "False"
    results.append({"ID": i, "is_correct": is_correct_pred})
    print(f"Record {i}: {is_correct_pred}")

# Write the submission and report the file name that was actually written.
submission_path = "submission_optimized11.csv"
submission_df = pd.DataFrame(results)
submission_df.to_csv(submission_path, index=False)
print(f"Saved submission file as '{submission_path}'")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Record 5001: True
Record 5002: True
Record 5003: False
Record 5004: True
Record 5005: True
Record 5006: True
Record 5007: True
Record 5008: False
Record 5009: False
Record 5010: False
Record 5011: True
Record 5012: False
Record 5013: False
Record 5014: False
Record 5015: False
Record 5016: False
Record 5017: False
Record 5018: True
Record 5019: True
Record 5020: False
Record 5021: False
Record 5022: False
Record 5023: False
Record 5024: True
Record 5025: False
Record 5026: True
Record 5027: True
Record 5028: True
Record 5029: True
Record 5030: True
Record 5031: True
Record 5032: False
Record 5033: False
Record 5034: False
Record 5035: True
Record 5036: True
Record 5037: False
Record 5038: False
Record 5039: False
Record 5040: True
Record 5041: False
Record 5042: False
Record 5043: False
Record 5044: False
Record 5045: False
Record 5046: False
Record 5047: True
Record 5048: False
Record 5049: False
Record 5050: False
Recor

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd

FastLanguageModel.for_inference(model)

# Evaluate on rows from the held-out side of the seeded split. Rows taken
# directly from dataset['train'] can also be in the shuffled training subset,
# which would leak training examples into this accuracy estimate.
validation_dataset = train_val_split['test'].select(range(1000))
ground_truth = list(validation_dataset['is_correct'])

# Prompt
# NOTE(review): the instruction sentence below is worded differently from the
# prompt used to build the training text earlier in the notebook — confirm the
# mismatch is intentional.
prompt = """You are the best mathematician and have to determine if an answer to a given maths question is correct. Respond with 'True' if correct, otherwise 'False'.

### Question:
{}

### Answer:
{}

### Explanation:
{}

### Output:
{}"""

predictions = []
questions = []
answers = []
predicted_outputs = []

# Inference loop. Iterate row by row rather than indexing whole columns
# (validation_dataset['question'][i]) on every iteration.
for i, row in enumerate(validation_dataset):
    question = row['question']
    answer = row['answer']
    solution = row['solution']
    is_correct_actual = ground_truth[i]

    # Leave the output slot empty so the model fills it in.
    input_prompt = prompt.format(question, answer, solution, "")

    # Tokenize and run inference; a single new token is generated per example.
    inputs = tokenizer([input_prompt], return_tensors="pt").to(device)
    input_token_len = inputs['input_ids'].shape[1]
    outputs = model.generate(**inputs, max_new_tokens=1, use_cache=True)

    # Decode only the newly generated part, not the echoed prompt.
    response = tokenizer.decode(outputs[0][input_token_len:], skip_special_tokens=True).strip()

    # Any response that does not contain "True" counts as a False prediction.
    is_correct_pred = "True" in response

    predictions.append(is_correct_pred)
    questions.append(question)
    answers.append(answer)
    predicted_outputs.append(response)

    # Label each record with its position in validation_dataset.
    print(f"Record {i}:")
    print(f"Question: {question}")
    print(f"Answer: {answer}")
    print(f"Predicted Output: {response}")
    print(f"Predicted is_correct: {is_correct_pred}")
    print(f"Actual is_correct: {is_correct_actual}")
    print("=" * 50)

accuracy = accuracy_score(ground_truth, predictions)
print(f"\nValidation Accuracy: {accuracy * 100:.2f}%")
print("\nConfusion Matrix:")
print(confusion_matrix(ground_truth, predictions))
print("\nClassification Report:")
print(classification_report(ground_truth, predictions, target_names=["False", "True"]))


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Predicted is_correct: True
Actual is_correct: True
Record 5387:
Question: What is $\frac{~\frac{2}{5}~}{\frac{3}{7}}$?
Answer: 14/15
Predicted Output: True
Predicted is_correct: True
Actual is_correct: True
Record 5388:
Question: The graph of the parabola $x = 2y^2 - 6y + 3$ has an $x$-intercept $(a,0)$ and two $y$-intercepts $(0,b)$ and $(0,c)$.  Find $a + b + c$.
Answer: \frac{3}{2}
Predicted Output: False
Predicted is_correct: False
Actual is_correct: False
Record 5389:
Question: Find the area of the triangle with vertices $(6,5,3),$ $(3,3,1),$ and $(15,11,9).$
Answer: 0
Predicted Output: False
Predicted is_correct: False
Actual is_correct: True
Record 5390:
Question: In rectangle $ABCD$, $P$ is a point on $BC$ so that $\angle APD=90^{\circ}$. $TS$ is perpendicular to $BC$ with $BP=PT$, as shown.  $PD$ intersects $TS$ at $Q$.  Point $R$ is on $CD$ such that $RA$ passes through $Q$.  In $\triangle PQA$, $PA=20$, $AQ=25$

In [None]:
import shutil
from google.colab import files

# Directory that receives the model and tokenizer files; the zip archive is
# created next to it under the same base name.
save_directory = "saved_model_directory"
archive_name = f"{save_directory}.zip"

# Write the model and tokenizer into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_directory)

# Zip the directory contents and hand the archive to the browser for download.
shutil.make_archive(save_directory, 'zip', save_directory)
files.download(archive_name)

print(f"Model and tokenizer saved and zipped as {archive_name} for download.")




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Model and tokenizer saved and zipped as saved_model_directory.zip for download.
