In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from datasets import load_dataset
from trl import setup_chat_format, SFTTrainer
from peft import LoraConfig

import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def create_instruction(sample):
  return {
    "prompt": sample["Name"] + ". " + (sample["Description"] if sample["Description"] else ""),
    "completion": sample["Serialized"]
  }

dataset = load_dataset("csv", data_files="clean-data/20240428-rbxldata.csv", delimiter=",")
dataset = dataset.shuffle()

dataset = dataset.map(create_instruction, remove_columns=dataset["train"].column_names, batched=False)
dataset["train"].to_json("clean-data/train_dataset.json", orient="records")

Map: 100%|██████████| 4930/4930 [00:00<00:00, 9982.81 examples/s] 
Creating json from Arrow format: 100%|██████████| 5/5 [00:00<00:00, 38.51ba/s]


16861941

In [3]:
model_id = "kmfoda/gpt2-500m"

bnb_config = BitsAndBytesConfig(
  load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
  model_id,
  # device_map="auto",
  # torch_dtype=torch.bfloat16,
  # quantization_config=bnb_config
).to("cpu")

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = "right"

model, tokenizer = setup_chat_format(model, tokenizer)

In [4]:
peft_config = LoraConfig(
  lora_alpha=128,
  lora_dropout=0.05,
  r=256,
  bias="none",
  target_modules="all-linear",
  task_type="CAUSAL_LM",
)

In [5]:
args = TrainingArguments(
  output_dir="masonai-4-28", # directory to save and repository id
  num_train_epochs=3,                     # number of training epochs
  per_device_train_batch_size=3,          # batch size per device during training
  gradient_accumulation_steps=2,          # number of steps before performing a backward/update pass
  gradient_checkpointing=True,            # use gradient checkpointing to save memory
  optim="adamw_torch_fused",              # use fused adamw optimizer
  logging_steps=10,                       # log every 10 steps
  save_strategy="epoch",                  # save checkpoint every epoch
  learning_rate=2e-4,                     # learning rate, based on QLoRA paper
  # bf16=True,                              # use bfloat16 precision
  tf32=True,                              # use tf32 precision
  max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
  warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
  lr_scheduler_type="constant",           # use constant learning rate scheduler
)

In [6]:
max_seq_length = 2387

trainer = SFTTrainer(
  model=model,
  args=args,
  train_dataset=dataset["train"],
  peft_config=peft_config,
  max_seq_length=max_seq_length,
  tokenizer=tokenizer,
  packing=True,
  
  dataset_kwargs={
    "add_special_tokens": False,
    "append_concat_token": False,
  }
)

Generating train split: 0 examples [00:00, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (18551 > 1024). Running this sequence through the model will result in indexing errors
Generating train split: 5161 examples [00:07, 705.94 examples/s] 


In [7]:
# print the dimensions of the data

print("Training data dimensions:")
print(trainer.train_dataset)

Training data dimensions:
Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 5161
})


In [8]:
trainer.train()
trainer.save_model()

  0%|          | 0/2580 [00:00<?, ?it/s]

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
del model
del trainer
torch.cuda.empty_cache()