In [None]:
# install all the things
!pip install -U peft trl bitsandbytes datasets accelerate transformers

In [None]:
# import all the things
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer
from datasets import load_dataset, Dataset
import pandas as pd


In [2]:
# Name under which the fine-tuned adapter and tokenizer will be saved.
new_model = "Llama-3.2-3B-Instruct-Jerry-Seinfeld-Monologues"

# Hugging Face Hub id of the checkpoint to fine-tune.
# Browse alternatives at https://huggingface.co/models
base_model = "meta-llama/Llama-3.2-3B-Instruct"

In [3]:
# Load the raw training data.
# A CSV is used here, but any source works as long as it can end up as a
# Hugging Face Dataset.
df = pd.read_csv('cleaned_monologues.csv')

# Build one training string per row in the form "<prompt> || <monologue>".
# The double bar is a rare character combination, so fine-tuning should teach
# the model that whatever follows '||' is a Jerry Seinfeld style monologue.
df = df.assign(combined=df['prompt'] + ' || ' + df['monologue'])

In [None]:
# sanity check: display the first rows so the new 'combined' column can be inspected
df.head()

In [None]:
# convert the pandas dataframe into a huggingface Dataset, which is what the trainer consumes
dataset = Dataset.from_pandas(df)

In [8]:
# dtype used for the matrix multiplications performed on the 4-bit weights
compute_dtype = torch.float16

# 4-bit quantization settings: shrinks the model's memory footprint so it can
# be fine-tuned with less GPU memory and compute
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
)

In [10]:
# Load the base checkpoint (Llama-3.2-3B here) through AutoModelForCausalLM,
# applying the quantization config and placing the whole model on GPU 0.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map={"": 0},
    quantization_config=quant_config,
)

# Config tweaks applied before training.
model.config.pretraining_tp = 1
model.config.use_cache = False

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [None]:
# Load the tokenizer that ships with the base model.
# trust_remote_code is intentionally NOT enabled: the Llama tokenizer is
# implemented inside transformers itself, and the flag would only grant the
# model repository permission to execute its own Python code on load.
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Register "||" (the separator between prompt and desired output) as a special
# token, but only when the vocabulary does not already contain it.
# add_special_tokens returns how many tokens were actually added.
num_added_tokens = 0
if "||" not in tokenizer.get_vocab():
    num_added_tokens = tokenizer.add_special_tokens({'additional_special_tokens': ['||']})

# Reuse the end-of-sequence token for padding and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Only touch the embedding matrix when the vocabulary really grew; otherwise
# the model's embeddings already match the tokenizer and are left untouched.
if num_added_tokens > 0:
    model.resize_token_embeddings(len(tokenizer))

In [12]:
# PEFT: Parameter-Efficient Fine-Tuning / LoRA: Low-Rank Adaptation
# LoRA leaves the original weights frozen and trains a small pair of low-rank
# matrices that sit alongside each targeted weight matrix, so only a small
# number of parameters are updated during fine-tuning.
peft_params = LoraConfig(
    lora_alpha=16,  # scaling numerator: the adapter output is scaled by lora_alpha / r
    lora_dropout=0.1,  # dropout applied on the adapter path during training
    r=64,  # rank of the low-rank update matrices
    bias="none",  # do not train any bias terms
    task_type="CAUSAL_LM",  # causal language modelling
    # Llama attention layers name their projections q_proj / k_proj / v_proj / o_proj.
    # The previous "out_proj" matches no module in this architecture, and because
    # the other three names do match, PEFT raised no error and simply left the
    # attention output projection without an adapter.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

# Read the original LoRA paper here: https://arxiv.org/abs/2106.09685

In [19]:
# training arguments consumed by the SFTTrainer: optimiser, number of epochs,
# batch size, checkpoint/logging cadence, learning-rate schedule, etc.
training_params = TrainingArguments(
    output_dir="./results",  # where model checkpoints are written during training
    optim="paged_adamw_32bit",  # alternatives to try: adam_hf, adamw_torch
    num_train_epochs=5,  # how many passes over the entire dataset
    per_device_train_batch_size=4,  # sequences processed at once per device
    gradient_accumulation_steps=1,  # steps to accumulate gradients before each optimiser update
    save_steps=25,  # save a checkpoint every 25 steps
    logging_steps=25,  # log training metrics every 25 steps
    learning_rate=1e-4,  # peak learning rate for the optimiser
    weight_decay=0.001,  # weight decay for the optimiser
    max_grad_norm=0.3,  # gradient clipping threshold
    max_steps=-1,  # -1 means "derive the step count from num_train_epochs"
    warmup_ratio=0.03,  # fraction of total training steps spent warming up the learning rate
    group_by_length=True,  # batch together sequences of similar length to reduce padding
    # The plain "constant" schedule ignores warmup entirely, which made the
    # warmup_ratio above a silent no-op. "constant_with_warmup" ramps up over
    # the warmup steps and then holds the learning rate constant.
    # ("linear" is another option: it warms up and then decays linearly to 0.)
    lr_scheduler_type="constant_with_warmup",
    report_to="tensorboard"  # report the training metrics to tensorboard
)

In [None]:
# Supervised fine-tuning trainer: ties together the quantized model, its
# tokenizer, the training arguments, the LoRA config and the dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_params,
    peft_config=peft_params,
    train_dataset=dataset,
    # column holding the full "<prompt> || <monologue>" training text
    dataset_text_field="combined",
    # None defers to the trainer's own default sequence-length limit.
    # NOTE(review): that default is presumably a finite cap rather than
    # "unlimited" - confirm against the installed trl version.
    max_seq_length=None,
)

In [None]:
# now we train, ☕️
# depending on the infrastructure/GPU you're using, along with the dataset size, number of epochs, etc. this could take a while ... try to run this on an A100 or a similar high-end GPU
trainer.train()

In [None]:
# save the trained model (adapter weights + config) under the new model name
trainer.model.save_pretrained(new_model)
# Save the tokenizer next to it. The `tokenizer` object is used directly: it is
# the same instance that was handed to the trainer, and the `trainer.tokenizer`
# attribute is deprecated in recent transformers releases.
tokenizer.save_pretrained(new_model)

In [23]:
# TODO: explain visualisation of training data
# from tensorboard import notebook
# log_dir = "results/runs"
# notebook.start("--logdir {} --port 4000".format(log_dir))

In [None]:
# test out some prompts against the new model
prompt = "Airplane food || "

# trainer.train() leaves the model in training mode; switch to eval mode so the
# LoRA dropout is not applied while sampling.
model.eval()

# Tokenize and move both input_ids and attention_mask to the model's device.
# Passing the mask explicitly matters because pad and EOS share the same token
# id, so generate() cannot infer the padding positions on its own.
encoded = tokenizer(prompt, return_tensors='pt').to(model.device)
input_ids = encoded.input_ids
output_ids = model.generate(
    input_ids=input_ids,
    attention_mask=encoded.attention_mask,
    max_new_tokens=100,
    do_sample=True,
    temperature=0.1,
    top_p=0.95,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    repetition_penalty=1.2,
    use_cache=True,  # the KV cache was switched off on the config for training; re-enable it for decoding
)
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_text)

In [None]:
# test out the same prompt against the base model
# `base_model` is only the checkpoint name (a str), so calling .generate() on it
# raises AttributeError. The original base weights still live inside the
# PEFT-wrapped model, so temporarily disabling the adapter yields base-model
# output without loading a second copy of the weights onto the GPU.
# (assumes trainer.model is the PEFT-wrapped model created by passing peft_config
# to the trainer)
prompt = "Airplane food || "

peft_model = trainer.model
peft_model.eval()  # make sure dropout is off while sampling

encoded = tokenizer(prompt, return_tensors='pt').to(model.device)
input_ids = encoded.input_ids
with peft_model.disable_adapter():
    output_ids = peft_model.generate(
        input_ids=input_ids,
        attention_mask=encoded.attention_mask,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.1,
        top_p=0.95,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        repetition_penalty=1.2,
        use_cache=True,  # the KV cache was switched off on the config for training; re-enable it for decoding
    )
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_text)