<a href="https://colab.research.google.com/github/aaronbergfeld/w266-final-project/blob/main/LoRA_Finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
!pip install bitsandbytes

import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset
import torch
import wandb



# Mount Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Configuration

In [None]:
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
input_dir = "/content/drive/MyDrive/w266 Final Project/data/Train/"
nq_filename = "NQ-open.train_faiss_not_sanitized.jsonl"
rrb_filename = "RRB.train_faiss_not_sanitized.jsonl"
output_dir = "/content/drive/MyDrive/w266 Final Project/data/" + model_name + "/model"
max_length = 2048

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# Utility Functions

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, Dataset, concatenate_datasets
import pandas as pd
import torch
import os

def load_model_and_tokenizer(model_name: str, bnb_config: BitsAndBytesConfig = None, device: str = "auto") -> tuple:
    """
    Load a pre-trained model and tokenizer from Hugging Face.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device, torch_dtype=torch.float16, quantization_config=bnb_config)
    return model, tokenizer

def prepare_dataset(nq_path, rrb_path, tokenizer, max_length: int = 512) -> dict:
    """
    Load and preprocess a dataset for fine-tuning.
    """

    if not os.path.exists(rrb_path):
        raise FileNotFoundError(f"RRB file not found at: {rrb_path}")
    if not os.path.exists(nq_path):
        raise FileNotFoundError(f"NQ file not found at: {nq_path}")

    try:
        rrb_df = pd.read_json(rrb_path, lines=True)
        rrb_dataset = Dataset.from_pandas(rrb_df)
        rrb_dataset = rrb_dataset.shuffle(seed=42)
    except Exception as e:
        raise IOError(f"Error reading RRB file {rrb_path}: {e}")


    try:
        nq_df = pd.read_json(nq_path, lines=True)
        nq_df['answer'] = nq_df['answer'].apply(lambda x: x[0])
        nq_dataset = Dataset.from_pandas(nq_df)
        nq_dataset = nq_dataset.shuffle(seed=42)
    except Exception as e:
        raise IOError(f"Error reading NQ file {nq_path}: {e}")


    dataset = concatenate_datasets([nq_dataset.select(range(2000)), rrb_dataset.select(range(2000))])

    system_prompt = """You are an expert question-answering model that answers solely based on the information provided in the following documents:
{documents}
Every user message will be a single question.
For each question, output exactly one line containing only your best concise factual answer, derived exclusively from the documents.
Do not repeat the question, do not include any additional text, explanations, or formatting.

Examples:
Question: Who wrote the Iliad?
Answer: Homer

Question: What is the capital of France?
Answer: Paris

Question: In what year did the Titanic sink?
Answer: 1912"""

    user_prompt = "Question: {question}\nAnswer:"

    def _format(example):
        # Format as chat message
        docs = "\n".join([d['text'] for d in example['faiss'][:3]])
        input = "<|SYSTEM|>\n" + system_prompt.format(documents=docs) + "\n\n<|USER|>\n" + user_prompt.format(question=example['question']) + "\n<|ASSISTANT|>\n"
        target = example['answer']
        return {"text": input + target}

    dataset = dataset.map(_format)

    def preprocess_function(examples):
        tokens = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=max_length)
        tokens["labels"] = tokens["input_ids"].copy()
        return tokens

    tokenized_dataset = dataset.map(preprocess_function, batched=True)
    tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
    return tokenized_dataset

def configure_lora(model, r: int = 8, lora_alpha: int = 32, target_modules: list = None) -> PeftModel:
    """
    Configure and apply LoRA to the model.
    """
    if target_modules is None:
        target_modules = ["q_proj", "v_proj"]

    lora_config = LoraConfig(
        r=r,
        lora_alpha=lora_alpha,
        target_modules=target_modules,
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM"
    )
    return get_peft_model(model, lora_config)

def setup_trainer(model, tokenizer, train_dataset, eval_dataset, output_dir: str) -> Trainer:
    """
    Set up the Hugging Face Trainer for fine-tuning.
    """
    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=2e-5,
        eval_strategy="epoch",
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        # gradient_accumulation_steps=8,
        num_train_epochs=3,
        weight_decay=0.01,
        save_strategy="epoch",
        load_best_model_at_end=True,
        label_names=["labels"]
    )

    return Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
    )

def save_model_and_tokenizer(model, tokenizer, output_dir: str):
    """
    Save the fine-tuned model and tokenizer.
    """
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

def load_finetuned_model(base_model_name: str, lora_model_dir: str, device: str = "auto") -> PeftModel:
    """
    Load a fine-tuned LoRA model.
    """
    base_model = AutoModelForCausalLM.from_pretrained(base_model_name, device_map=device)
    model = PeftModel.from_pretrained(base_model, lora_model_dir)
    return model

def generate_text(model, tokenizer, prompt: str, max_length: int = 100) -> str:
    """
    Generate text using the fine-tuned model.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_length=max_length)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def merge_lora_weights(model, output_dir: str):
    """
    Merge LoRA weights with the base model and save.
    """
    merged_model = model.merge_and_unload()
    merged_model.save_pretrained(output_dir)

# Prepare Dataset

In [None]:
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType
import torch

def sweep_train():
    import wandb
    import torch
    from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
    from peft import get_peft_model, LoraConfig, TaskType

    run = wandb.init(project="w266-final-project")
    print(f"guru-{run.name}")
    cfg = run.config

    try:
      # Load base model and tokenizer
      model, tokenizer = load_model_and_tokenizer(model_name, bnb_config=bnb_config)
      datasets = prepare_dataset(input_dir + nq_filename, input_dir + rrb_filename, tokenizer, max_length)

      # Apply LoRA
      target_modules = ['q_proj', 'v_proj', 'k_proj', 'o_proj', 'gate_proj', 'down_proj', 'up_proj', 'lm_head']
      lora_config = LoraConfig(
          r=cfg.lora_r,
          lora_alpha=cfg.lora_alpha,
          target_modules=target_modules,
          lora_dropout=cfg.lora_dropout,
          bias="none",
          task_type=TaskType.CAUSAL_LM
      )
      model_peft = get_peft_model(model, lora_config)
      model_peft.print_trainable_parameters()

      # Set up training arguments
      training_args = TrainingArguments(
          output_dir=f"{output_dir}/{run.name}",
          per_device_train_batch_size=cfg.batch_size,
          per_device_eval_batch_size=cfg.batch_size,
          learning_rate=cfg.learning_rate,
          num_train_epochs=cfg.num_epochs if hasattr(cfg, 'num_epochs') else 3,
          weight_decay=cfg.weight_decay,
          eval_strategy="epoch",
          save_strategy="epoch",
          load_best_model_at_end=True,
          logging_steps=10,
          report_to="wandb",
          run_name=run.name
      )

      trainer = Trainer(
          model=model_peft,
          args=training_args,
          train_dataset=datasets["train"],
          eval_dataset=datasets["test"],
          tokenizer=tokenizer,
          data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
      )

      trainer.train()

      # Save model and tokenizer
      model_peft.save_pretrained(f"{output_dir}/{run.name}")
      tokenizer.save_pretrained(f"{output_dir}/{run.name}")
      wandb.finish()

    except Exception as e:
      wandb.finish()
      torch.cuda.empty_cache()
      torch.cuda.ipc_collect()
      print(e)
    finally:
      torch.cuda.empty_cache()
      torch.cuda.ipc_collect()

In [None]:
sweep_config = {
    "method": "bayes",  # can also be "random" or "grid"
    "metric": {"name": "eval_loss", "goal": "minimize"},
    "parameters": {
        "learning_rate": {"min": 1e-5, "max": 5e-4},
        "lora_r": {"values": [4, 8, 16]},
        "lora_alpha": {"values": [8, 16, 32]},
        "batch_size": {"values": [1, 2]},
        "weight_decay": {"values": [0.0, 0.001, 0.01]},
        "lora_dropout": {"values": [0.1, 0.2, 0.3]},
    },
    "program": "None (we use a function)"
}

sweep_id = wandb.sweep(sweep_config, project="w266-final-project")

# Run 3 experiments (adjust as needed)
for _ in range(3):
    wandb.agent(sweep_id, function=sweep_train)

Create sweep with ID: qosbbojb
Sweep URL: https://wandb.ai/aaronbergfeld-university-of-california-berkeley/w266-final-project/sweeps/qosbbojb


[34m[1mwandb[0m: Agent Starting Run: 8yte97nn with config:
[34m[1mwandb[0m: 	batch_size: 4
[34m[1mwandb[0m: 	learning_rate: 0.00024287328878618548
[34m[1mwandb[0m: 	lora_alpha: 32
[34m[1mwandb[0m: 	lora_dropout: 0.3
[34m[1mwandb[0m: 	lora_r: 4
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: Currently logged in as: [33maaronbergfeld[0m ([33maaronbergfeld-university-of-california-berkeley[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


guru-leafy-sweep-1


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 11,015,168 || all params: 8,041,276,416 || trainable%: 0.1370




Epoch,Training Loss,Validation Loss
1,0.2021,0.224026
2,0.1456,0.225478
3,0.1668,0.244787




0,1
eval/loss,▁▁█
eval/runtime,█▁▂
eval/samples_per_second,▁██
eval/steps_per_second,▁██
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇████
train/grad_norm,▆▃▁▁▄▂▄▁▁▂▃▂▃▂▄▃▁▄▂▂▂▂▃▃▄▂▃▄▂▂▁▄█▃▁▃▂▁▂▂
train/learning_rate,█████▇▇▇▇▆▆▆▆▆▆▆▅▅▅▅▄▄▄▄▄▄▄▄▄▃▃▂▂▂▂▂▂▁▁▁
train/loss,▆▇█▄▇█▇▄▆▄▆▅▅▃▅▃▅▃▂▃▂▄▄▄▄▂▂▂▂▁▁▁▁▂▂▁▁▂▁▁

0,1
eval/loss,0.24479
eval/runtime,167.474
eval/samples_per_second,10.748
eval/steps_per_second,2.687
total_flos,4.98721691271168e+17
train/epoch,3.0
train/global_step,5400.0
train/grad_norm,0.9043
train/learning_rate,0.0
train/loss,0.1668


[34m[1mwandb[0m: Agent Starting Run: k7d0txnp with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	learning_rate: 0.00024728187308868953
[34m[1mwandb[0m: 	lora_alpha: 32
[34m[1mwandb[0m: 	lora_dropout: 0.2
[34m[1mwandb[0m: 	lora_r: 4
[34m[1mwandb[0m: 	weight_decay: 0


guru-spring-sweep-2


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 11,015,168 || all params: 8,041,276,416 || trainable%: 0.1370




CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 162.88 MiB is free. Process 93228 has 39.39 GiB memory in use. Of the allocated memory 38.62 GiB is allocated by PyTorch, and 263.23 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


[34m[1mwandb[0m: Agent Starting Run: 9mnuo371 with config:
[34m[1mwandb[0m: 	batch_size: 4
[34m[1mwandb[0m: 	learning_rate: 0.0004790342722214917
[34m[1mwandb[0m: 	lora_alpha: 32
[34m[1mwandb[0m: 	lora_dropout: 0.1
[34m[1mwandb[0m: 	lora_r: 4
[34m[1mwandb[0m: 	weight_decay: 0


guru-devoted-sweep-3


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 11,015,168 || all params: 8,041,276,416 || trainable%: 0.1370




Epoch,Training Loss,Validation Loss
1,0.2431,0.283545
2,0.1943,0.264328
3,0.2082,0.264458




0,1
eval/loss,█▁▁
eval/runtime,▁▂█
eval/samples_per_second,█▇▁
eval/steps_per_second,█▆▁
train/epoch,▁▁▂▂▂▂▃▃▃▃▃▃▃▃▃▄▄▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▆▇▇█████
train/global_step,▁▂▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇████
train/grad_norm,▁▁▂▆▄▂▄▄▅▃█▃▅▅▃▃▂▁▄▂▂▅▂▁▁▂▅▃▂▂▃▄▂▄▄▃▂▂▂▁
train/learning_rate,█████▇▇▇▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▁
train/loss,▆▇▅▇█▅▅▅▆▇▃▄▇▅▇▃▃▆▄▄▁▁▂▃▃▁▂▂▂▂▂▃▁▂▂▂▁▁▁▂

0,1
eval/loss,0.26446
eval/runtime,167.5737
eval/samples_per_second,10.742
eval/steps_per_second,2.685
total_flos,4.98721691271168e+17
train/epoch,3.0
train/global_step,5400.0
train/grad_norm,1.02114
train/learning_rate,0.0
train/loss,0.2082


[34m[1mwandb[0m: Agent Starting Run: cz653zvz with config:
[34m[1mwandb[0m: 	batch_size: 4
[34m[1mwandb[0m: 	learning_rate: 5.644260160302799e-05
[34m[1mwandb[0m: 	lora_alpha: 16
[34m[1mwandb[0m: 	lora_dropout: 0.3
[34m[1mwandb[0m: 	lora_r: 16
[34m[1mwandb[0m: 	weight_decay: 0.001


guru-misunderstood-sweep-4


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 44,060,672 || all params: 8,074,321,920 || trainable%: 0.5457




Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,0.1868,0.21172
2,0.1544,0.210688
3,0.2081,0.223747




0,1
eval/loss,▂▁█
eval/runtime,▁█▆
eval/samples_per_second,█▁▃
eval/steps_per_second,█▁▁
train/epoch,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▃▃▃▃▃▃▃▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇████
train/grad_norm,▄▃▂▁▃▅▂▂▁▁▂▁▂▃▂▃▃▄▃▂▃▄▃▃▄▃▅▅▆▄▃▄▆▆▅▇█▄▆▆
train/learning_rate,████▇▇▆▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁
train/loss,▅▄▄▆█▄▄▅▄▅▃▄▄▆▅▃▃▄▄▃▄▅▄▄▄▅▃▃▄▃▄▃▁▁▁▂▃▂▁▁

0,1
eval/loss,0.22375
eval/runtime,167.8877
eval/samples_per_second,10.721
eval/steps_per_second,2.68
total_flos,5.009144322981888e+17
train/epoch,3.0
train/global_step,5400.0
train/grad_norm,0.71214
train/learning_rate,0.0
train/loss,0.2081


[34m[1mwandb[0m: Agent Starting Run: 0e9njl53 with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	learning_rate: 0.00047957234333614727
[34m[1mwandb[0m: 	lora_alpha: 32
[34m[1mwandb[0m: 	lora_dropout: 0.2
[34m[1mwandb[0m: 	lora_r: 4
[34m[1mwandb[0m: 	weight_decay: 0.01


guru-gallant-sweep-5


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 11,015,168 || all params: 8,041,276,416 || trainable%: 0.1370




CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 112.88 MiB is free. Process 93228 has 39.44 GiB memory in use. Of the allocated memory 38.70 GiB is allocated by PyTorch, and 233.21 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


[34m[1mwandb[0m: Agent Starting Run: 1h6unvgv with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	learning_rate: 0.0004044805464783769
[34m[1mwandb[0m: 	lora_alpha: 16
[34m[1mwandb[0m: 	lora_dropout: 0.1
[34m[1mwandb[0m: 	lora_r: 16
[34m[1mwandb[0m: 	weight_decay: 0.01


guru-spring-sweep-6


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 44,060,672 || all params: 8,074,321,920 || trainable%: 0.5457




CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 68.88 MiB is free. Process 93228 has 39.48 GiB memory in use. Of the allocated memory 38.78 GiB is allocated by PyTorch, and 198.19 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


[34m[1mwandb[0m: Agent Starting Run: 5pj895dh with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	learning_rate: 0.0002449099663016212
[34m[1mwandb[0m: 	lora_alpha: 32
[34m[1mwandb[0m: 	lora_dropout: 0.2
[34m[1mwandb[0m: 	lora_r: 16
[34m[1mwandb[0m: 	weight_decay: 0.01


guru-hopeful-sweep-7


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 44,060,672 || all params: 8,074,321,920 || trainable%: 0.5457




[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.
[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 50.88 MiB is free. Process 93228 has 39.50 GiB memory in use. Of the allocated memory 38.78 GiB is allocated by PyTorch, and 218.19 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


In [None]:
# Terminate runtime
from google.colab import runtime
runtime.unassign()

In [None]:
modules = [module for module in model.modules()]
all_linear_modules = ['q_proj', 'v_proj', 'k_proj', 'o_proj', 'gate_proj', 'down_proj', 'up_proj', 'lm_head']

Error in callback <bound method _WandbInit._pre_run_cell_hook of <wandb.sdk.wandb_init._WandbInit object at 0x7d1ebdd959d0>> (for pre_run_cell):


BrokenPipeError: [Errno 32] Broken pipe

NameError: name 'model' is not defined

Error in callback <bound method _WandbInit._post_run_cell_hook of <wandb.sdk.wandb_init._WandbInit object at 0x7d1ebdd959d0>> (for post_run_cell):


BrokenPipeError: [Errno 32] Broken pipe

# Configure LoRA

In [None]:

model = configure_lora(model, r=8, lora_alpha=32, target_modules=all_linear_modules)

# Set Up Trainer

In [None]:
trainer = setup_trainer(model, tokenizer, train_dataset, eval_dataset, output_dir)

# Train the Model

In [None]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

trainer.train()

# Save Fine-Tuned Model and Tokenizer

In [None]:
save_model_and_tokenizer(model, tokenizer, output_dir)