In [None]:
# prompt: connect google drive

from google.colab import drive
drive.mount('/content/drive')

import sys

# Make the project folder importable. The membership check keeps the cell
# idempotent: re-running it no longer appends duplicate entries to sys.path.
project_dir = '/content/drive/MyDrive/Colab Notebooks/6785_project'
if project_dir not in sys.path:
    sys.path.append(project_dir)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

# Export the allocator configuration through the process environment.
os.environ.update(PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True")

In [None]:
import os
from getpass import getpass

from huggingface_hub import snapshot_download

# Never store an access token in the notebook itself: take it from the
# HF_TOKEN environment variable and only prompt interactively when unset.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

# Download model from Hugging Face (requires login with HF token)
snapshot_download(
    repo_id="meta-llama/Llama-3.2-1B",
    local_dir="/content/drive/MyDrive/Colab Notebooks/6785_project/llama-3.2-1b",
    token=hf_token,
)

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

'/content/drive/MyDrive/Colab Notebooks/6785_project/llama-3.2-1b'

In [None]:
import torch
from transformers import AutoModelForSequenceClassification

# Drop a model left over from an earlier run of this cell. Only the
# "name does not exist yet" case is expected, so catch NameError specifically
# instead of silencing every possible exception.
try:
  del model
  print("model deleted")
except NameError:
  print("no model to delete")

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# same cell also works for debugging on CPU-only machines.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the model with a classification head
model = AutoModelForSequenceClassification.from_pretrained(
    "/content/drive/MyDrive/Colab Notebooks/6785_project/llama-3.2-1b",
    num_labels=2  # Binary classification
).to(device)

no model to delete


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/Colab Notebooks/6785_project/llama-3.2-1b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "/content/drive/MyDrive/Colab Notebooks/6785_project/llama-3.2-1b"
)

# Reuse the end-of-sequence token as the padding token, and mirror the
# resulting pad id on the model config.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
!pip install datasets==3.3.2



In [None]:
from datasets import load_dataset

# Map each split name to the JSON file that backs it.
split_files = {
    'train': "/content/drive/MyDrive/Colab Notebooks/6785_project/train.json",
    'test': "/content/drive/MyDrive/Colab Notebooks/6785_project/test.json",
}
dataset_dict = load_dataset('json', data_files=split_files)

print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['input', 'label'],
        num_rows: 618582
    })
    test: Dataset({
        features: ['input', 'label'],
        num_rows: 232327
    })
})


In [None]:
# Example preprocessing for binary classification
def preprocess_function(examples):
    """Tokenize a batch of examples and attach integer labels.

    Args:
        examples: Batch mapping with an "input" column (the texts passed to
            the tokenizer) and a "label" column (values convertible by int()).

    Returns:
        The tokenizer output for the batch, with an extra "labels" entry
        holding the labels converted to int.
    """
    # Every sequence is truncated and padded to exactly 80 tokens; per the
    # original author, 80 is the lowest length that keeps all prompt content.
    encoded = tokenizer(
        examples["input"],
        max_length=80,
        padding="max_length",
        truncation=True,
    )
    encoded["labels"] = list(map(int, examples["label"]))
    return encoded

# Tokenize every split in batches, then drop the raw 'input' and 'label'
# columns so only the features produced by preprocess_function remain.
tokenized_dataset = (
    dataset_dict
    .map(preprocess_function, batched=True)
    .remove_columns(["input", "label"])
)

print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 618582
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 232327
    })
})


In [None]:
# Work on a copy of the tokenized splits so that a split can be replaced in
# training_dataset later on. NOTE(review): presumably a shallow, dict-style
# copy that leaves tokenized_dataset itself unchanged — confirm.
training_dataset = tokenized_dataset.copy()

In [None]:
n_samples = 20000
# Slice from the untouched tokenized_dataset rather than from training_dataset
# itself, so that re-running this cell (for example with a larger n_samples)
# always starts from the full training split instead of an already truncated one.
training_dataset['train'] = tokenized_dataset['train'].select(range(n_samples))

### Actual fine-tuning

In [None]:
from transformers import TrainingArguments, Trainer

# Single source of truth for the values that eval_steps is derived from, so
# tuning the batch size or epoch count cannot silently desynchronise it.
num_epochs = 3
train_batch_size = 64  # Reduce batch size if needed

# Define training arguments
training_args = TrainingArguments(
    output_dir=f"/content/drive/MyDrive/Colab Notebooks/6785_project/llama-finetuned_{n_samples}",
    overwrite_output_dir=True,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=train_batch_size,
    gradient_accumulation_steps=1,  # 1 = no gradient accumulation
    gradient_checkpointing=False,  # Gradient checkpointing is disabled
    save_steps=10000,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=500,
    eval_strategy="steps",
    # Roughly the total number of optimisation steps, so evaluation runs about
    # once, close to the end of training.
    eval_steps=num_epochs * len(training_dataset["train"]) // train_batch_size,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    fp16=True,  # Enable mixed precision training (FP16)
    report_to="none",
    # no_cuda=True,  # Ensure the code runs on CPU (if you want to train on CPU)
    remove_unused_columns=False  # Prevent removal of unused columns
)

In [None]:
# Ask PyTorch to empty its CUDA cache before the trainer is created
import torch
torch.cuda.empty_cache()

In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import torch.nn.functional as F
import numpy as np

# Drop a trainer left over from an earlier run of this cell. Only the
# "name does not exist yet" case is expected, so catch NameError specifically
# rather than hiding unrelated errors behind a bare except.
try:
  del trainer
  print("trainer deleted")
except NameError:
  print("no trainer to delete")

def compute_metrics(eval_pred):
    """Compute binary-classification metrics for the Trainer's evaluation.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is a NumPy array whose
            last axis indexes the classes, with column 1 treated as the
            positive class; ``labels`` holds the matching targets.

    Returns:
        dict with
          - "accuracy": accuracy of the argmax predictions,
          - "f1": F1 score of the argmax predictions,
          - "rocauc": ROC AUC of the predicted positive-class probability,
          - "Miscalibration": mean predicted positive-class probability minus
            the observed positive rate (``np.mean(labels)``).
    """
    logits, labels = eval_pred
    logits = torch.from_numpy(logits)
    probs = F.softmax(logits, dim=-1)
    positive_prob = probs[:, 1]
    predictions = logits.argmax(dim=-1).numpy()

    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions)
    # Rank by the positive-class probability: with two logits it depends on
    # logit_1 - logit_0, so the raw class-1 logit alone can order examples
    # differently from the model's actual decision score.
    rocauc = roc_auc_score(labels, positive_prob.numpy())
    miscalibration = torch.mean(positive_prob).item() - np.mean(labels)
    # f1 was previously computed but never reported; include it in the result.
    return {"accuracy": acc, "f1": f1, 'rocauc': rocauc, 'Miscalibration': miscalibration}


# Build the Trainer from the model, the training arguments, the subsampled
# training split and the test split, which serves as the evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training_dataset["train"],
    eval_dataset=training_dataset["test"],  # Test split is used for evaluation
    processing_class=tokenizer,  # Replaces the deprecated `tokenizer=` argument
    compute_metrics=compute_metrics
)

no trainer to delete


  trainer = Trainer(


In [None]:
import gc
# Run a garbage-collection pass first, then ask PyTorch to empty its CUDA
# cache, right before training starts.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Launch fine-tuning. train() is the last expression in the cell, so its
# return value is shown as the cell output.
trainer.train()


Step,Training Loss,Validation Loss,Accuracy,Rocauc,Miscalibration
937,0.5181,0.48306,0.801551,0.657549,-0.05396


TrainOutput(global_step=939, training_loss=0.4721527262109791, metrics={'train_runtime': 1120.3226, 'train_samples_per_second': 53.556, 'train_steps_per_second': 0.838, 'total_flos': 2.80267259904e+16, 'train_loss': 0.4721527262109791, 'epoch': 3.0})

In [None]:
# Save into the directory already configured as output_dir instead of
# rebuilding the same path from a second f-string that could drift from it.
trainer.save_model(training_args.output_dir)