In [1]:
!pip install bitsandbytes datasets accelerate loralib wandb sentencepiece -q
!pip install git+https://github.com/huggingface/peft.git git+https://github.com/zphang/transformers@c3dc391 -q

import wandb
wandb.login()
%env WANDB_PROJECT=llama-lora-crd3

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[0m  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


[34m[1mwandb[0m: Currently logged in as: [33mbfitzgerald[0m. Use [1m`wandb login --relogin`[0m to force relogin


env: WANDB_PROJECT=llama-lora-crd3


In [2]:
import os

# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from datasets import load_dataset
import transformers
from transformers import AutoTokenizer, AutoConfig, LLaMAForCausalLM, LLaMATokenizer
from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model


# --- Training hyperparameters -------------------------------------------------
# NOTE(review): MICRO_BATCH_SIZE only feeds GRADIENT_ACCUMULATION_STEPS here; the
# Trainer cell further down passes a literal per-device batch size instead of
# this constant, so the two must be kept in sync by hand.
MICRO_BATCH_SIZE = 4  # this could actually be 5 but i like powers of 2
BATCH_SIZE = 128
# 128 // 4 = 32; passed to TrainingArguments as gradient_accumulation_steps.
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
EPOCHS = 3  # we don't need 3 tbh
LEARNING_RATE = 3e-4  # the Karpathy constant
# NOTE(review): CUTOFF_LEN is not referenced by any other visible cell; the
# tokenize() helper in the dataset cell uses max_length=512 instead. Confirm
# which sequence length is intended.
CUTOFF_LEN = 256  # 256 accounts for about 96% of the data
# --- LoRA hyperparameters (forwarded to LoraConfig below) ---------------------
LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.05

# Load the base LLaMA-7B checkpoint with load_in_8bit=True and let
# device_map="auto" decide where the weights are placed.
model = LLaMAForCausalLM.from_pretrained(
    "decapoda-research/llama-7b-hf",
    load_in_8bit=True,
    device_map="auto",
)
# Tokenizer for the same checkpoint, created with add_eos_token=True
# (presumably so every encoded sequence ends with EOS -- TODO confirm).
tokenizer = LLaMATokenizer.from_pretrained(
    "decapoda-research/llama-7b-hf", add_eos_token=True
)

# Run peft's int8-training preparation on the 8-bit model before adding adapters.
model = prepare_model_for_int8_training(model)

# LoRA adapter configuration: adapters target the "q_proj" and "v_proj" modules,
# no bias terms are trained, and the task type is causal language modelling.
config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)
# Wrap the base model with the LoRA configuration; `model` now refers to the
# peft-wrapped model that is handed to the Trainer later on.
model = get_peft_model(model, config)
tokenizer.pad_token_id = 0  # unk. we want this to be different from the eos token


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /usr/local/lib/python3.9/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

In [3]:
from datasets import load_dataset

data_source = "crd3"
SHUFFLE_SEED = 42  # fixed seed so the shuffled order is reproducible across runs


def tokenize(item):
    """Tokenize the "text" field of a batch for causal-LM training.

    Every sequence is truncated and padded to exactly 512 tokens.

    NOTE(review): the config cell defines CUTOFF_LEN = 256, but this helper
    uses 512. The value is left unchanged here; confirm which one is intended.

    Args:
        item: a batch (called through ``map(..., batched=True)``) with a
            "text" entry.

    Returns:
        dict with the tokenizer's "input_ids" and "attention_mask".
    """
    result = tokenizer(
        item["text"],
        truncation=True,
        max_length=512,
        padding="max_length",
    )
    return {
        "input_ids": result["input_ids"],
        "attention_mask": result["attention_mask"],
    }


def get_relevant_turn_data(x):
    """Flatten one example's turns into a single text string.

    The utterances of each turn are joined with a space and stripped; turns
    with no utterances are skipped; the per-turn strings are then joined with
    a space.

    Args:
        x: a single example with a "turns" list whose items carry "utterances".

    Returns:
        dict with one key, "text", holding the flattened string.
    """
    turn_texts = [
        " ".join(turn["utterances"]).strip()
        for turn in x["turns"]
        if len(turn["utterances"]) > 0
    ]
    return {"text": " ".join(turn_texts)}


if data_source == "quotes":
    data = load_dataset("Abirate/english_quotes")
    dataset_tokenized = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
    # The training cell reads `dataset["train"]`, so expose the tokenized data
    # under that name as well (previously this branch left `dataset` undefined).
    dataset = dataset_tokenized
elif data_source == "crd3":
    dataset = load_dataset("crd3")
    dataset = dataset.map(get_relevant_turn_data)
    dataset = dataset.shuffle(seed=SHUFFLE_SEED).map(tokenize, batched=True, batch_size=500)
else:
    # Fail loudly instead of silently leaving `dataset` undefined.
    raise ValueError(f"Unknown data_source: {data_source!r} (expected 'quotes' or 'crd3')")



  0%|          | 0/3 [00:00<?, ?it/s]



Map:   0%|          | 0/38969 [00:00<?, ? examples/s]

Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

Map:   0%|          | 0/6327 [00:00<?, ? examples/s]

In [4]:
# Shell out to the NVIDIA CLI to show the GPU model, driver/CUDA versions and
# current GPU memory usage.
!nvidia-smi

Sat Apr  1 20:37:30 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    48W / 400W |   8255MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [7]:
import gc

# Run a garbage-collection pass and ask PyTorch to release its cached CUDA
# memory before training starts.
gc.collect()
torch.cuda.empty_cache()

trainer = transformers.Trainer(
    model=model,
    train_dataset=dataset["train"],
    args=transformers.TrainingArguments(
        # Use the shared constant rather than a literal: GRADIENT_ACCUMULATION_STEPS
        # is derived from MICRO_BATCH_SIZE, so the two must always agree.
        per_device_train_batch_size=MICRO_BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        warmup_steps=100,
        num_train_epochs=EPOCHS,
        learning_rate=LEARNING_RATE,
        fp16=True,
        logging_steps=20,
        output_dir="lora-alpaca",
        save_total_limit=3,
    ),
    # mlm=False selects the collator's non-masked (causal LM) mode.
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
# Turn off the model's `use_cache` flag before training (presumably because
# caching is only useful for generation -- TODO confirm).
model.config.use_cache = False
# Start a fresh run instead of resuming from a checkpoint in output_dir.
trainer.train(resume_from_checkpoint=False)

Step,Training Loss
20,2.7326
40,2.6771
60,2.5686
80,2.5196
100,2.4823
120,2.4459
140,2.4175
160,2.4097
180,2.3653
200,2.3669


TrainOutput(global_step=912, training_loss=2.2042412172284043, metrics={'train_runtime': 25213.0981, 'train_samples_per_second': 4.637, 'train_steps_per_second': 0.036, 'total_flos': 2.373298808684544e+18, 'train_loss': 2.2042412172284043, 'epoch': 3.0})

In [None]:
import os
from getpass import getpass

from huggingface_hub import notebook_login, login

# SECURITY: never hard-code an access token in a notebook. A previous version of
# this cell embedded a literal Hugging Face token; that token must be treated as
# compromised and revoked in the Hugging Face account settings.
# The token is read from the HF_TOKEN environment variable when set, otherwise
# it is requested interactively without being echoed or stored in the notebook.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

# add_to_git_credential=True matches the positional `True` the original call passed.
login(token=hf_token, add_to_git_credential=True)
# Upload the trained model to the "roborovski/peft-aid" repository on the Hub.
model.push_to_hub("roborovski/peft-aid", use_auth_token=True)