In [None]:
### mount google drive to colab
# The training CSV is read from Google Drive later on, so Drive has to be
# attached to the Colab filesystem before any data is loaded.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [None]:
### install libraries
# Unsloth is installed straight from the default branch of its GitHub
# repository, with the "colab-new" extra.
# NOTE(review): no version or commit is pinned here, so a fresh run may resolve
# to different releases than the ones shown in the recorded output — consider
# pinning for reproducibility.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# --no-deps installs only the named packages and skips their dependencies.
# NOTE(review): presumably done to avoid replacing packages preinstalled in
# Colab — confirm.
!pip install --no-deps xformers trl peft accelerate bitsandbytes

Collecting unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-7819jtlz/unsloth_8ef155c4ebe64e54af606fb57a9d75e0
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-7819jtlz/unsloth_8ef155c4ebe64e54af606fb57a9d75e0
  Resolved https://github.com/unslothai/unsloth.git to commit cd1b44878686972d1de60e905215825da330f1e1
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tyro (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading tyro-0.8.4-py3-none-any.whl (102 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.4/102.4 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting datasets>=2.16.0 (from unsloth[colab-ne

In [None]:
### login with huggingface account
# Shows the interactive Hugging Face login widget inside the notebook.
# NOTE(review): presumably needed because the training cell sets
# push_to_hub = True — confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
### load model
from unsloth import FastLanguageModel
import torch

# These three settings are reused when the fine-tuned model is reloaded for
# inference, so they stay as standalone notebook-level variables.
max_seq_length = 2048
# None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
dtype = None
# Use 4bit quantization to reduce memory usage. Can be False.
load_in_4bit = True

# Gather the loader arguments in one place, then unpack them into the call.
base_model_kwargs = {
    "model_name": "unsloth/llama-3-8b-bnb-4bit",
    "max_seq_length": max_seq_length,
    "dtype": dtype,
    "load_in_4bit": load_in_4bit,
}

model, tokenizer = FastLanguageModel.from_pretrained(**base_model_kwargs)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Llama patching release 2024.5
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/464 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
### add LoRA adapters
# Adapters are attached to the attention projections (q/k/v/o) and to the
# MLP projections (gate/up/down).
LORA_TARGET_MODULES = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

lora_settings = dict(
    r = 16,              # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = LORA_TARGET_MODULES,
    lora_alpha = 16,
    lora_dropout = 0,    # Supports any, but = 0 is optimized
    bias = "none",       # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    # True or "unsloth" for very long context
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

# Rebinds `model` to the adapter-wrapped model returned by Unsloth.
model = FastLanguageModel.get_peft_model(model, **lora_settings)

Unsloth 2024.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
### data preparation
import pandas as pd
import pyarrow as pa
from datasets import Dataset

# load data from csv file
# The path is absolute and rooted at the mount point passed to drive.mount
# ('/content/gdrive'), so the read does not depend on the notebook's current
# working directory.
df = pd.read_csv('/content/gdrive/My Drive/datasets/reddit_AMA_full_export.csv')
print('Original number of rows:', len(df))

# Placeholder texts that stand in for content that is no longer available.
placeholder_texts = ['[deleted]', '[removed]']

# remove meaningless questions (missing values or placeholder text)
df = df.dropna(subset=['question'])
df = df[~df['question'].isin(placeholder_texts)].reset_index(drop=True)
print('Number of rows after removing meaningless questions:', len(df))

# remove unusable answers as well: a missing answer would be rendered as the
# literal text "nan" when the prompt is formatted, and a placeholder answer
# would be used as a training target
df = df.dropna(subset=['answer'])
df = df[~df['answer'].isin(placeholder_texts)].reset_index(drop=True)
print('Number of rows after removing unusable answers:', len(df))

# keep a small subset of the data (so that training doesn't take too long)
df = df[:100].reset_index(drop=True)
print('Remaining number of rows:', len(df))

# rename columns to the field names read by the prompt-formatting function
df = df.rename(columns={'question':'instruction', 'answer':'output'})

# add empty column (the prompt template has an "Input" slot that stays blank)
df['input'] = ''

print(df.head(10))

# convert dataframe to a Huggingface dataset; the index was just reset, so it
# carries no information and is not kept as a column
train_dataset = Dataset.from_pandas(df, preserve_index=False)

# Prompt template used for both training and inference. It has three
# positional slots that are filled in this order: instruction, input, response.
# NOTE(review): this is the Alpaca-style instruction layout (as the variable
# name says), not a Llama-specific chat template.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence token taken from the loaded tokenizer; it is appended to
# every formatted training example.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    """Render a batch of rows into full prompt strings.

    Args:
        examples: A column-oriented batch that provides the "instruction",
            "input" and "output" fields as parallel sequences.

    Returns:
        A dict with a single "text" key holding one string per row: the
        alpaca_prompt template filled with that row's instruction, input and
        output, followed by EOS_TOKEN.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    # Must add EOS_TOKEN, otherwise your generation will go on forever!
    return {
        "text": [
            alpaca_prompt.format(instruction, context, response) + EOS_TOKEN
            for instruction, context, response in rows
        ]
    }

# batched=True passes whole batches of rows to the function at once
train_dataset = train_dataset.map(formatting_prompts_func, batched=True)

Original number of rows: 28590
Number of rows after removing meaningless questions: 27746
Remaining number of rows: 100
                                               title  score      id  \
0  I am a therapist who helps people with sexual ...     24  mhmunp   
1  I am a therapist who helps people with sexual ...     24  mhmunp   
2  I am a therapist who helps people with sexual ...     24  mhmunp   
3  I am a therapist who helps people with sexual ...     24  mhmunp   
4  I am a therapist who helps people with sexual ...     24  mhmunp   
5  I am a therapist who helps people with sexual ...     24  mhmunp   
6  I am a therapist who helps people with sexual ...     24  mhmunp   
7  I am a therapist who helps people with sexual ...     24  mhmunp   
8  I am a therapist who helps people with sexual ...     24  mhmunp   
9  I am a therapist who helps people with sexual ...     24  mhmunp   

                                                 url  comms_num       created  \
0  https://www.re

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# create SFTTrainer
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Used as the trainer's output directory and, later, as the location the
# fine-tuned model is reloaded from.
output_dir = "aloysiuschan/llama3-8b-qlora-reddit-ama-300524"

# Choose the mixed-precision mode once: bf16 where supported, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    # batching: 2 examples per device x 4 accumulation steps
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    # schedule: a fixed number of optimizer steps with a short warmup
    warmup_steps = 5,
    max_steps = 50,
    learning_rate = 2e-4,
    lr_scheduler_type = "linear",
    # precision
    fp16 = not use_bf16,
    bf16 = use_bf16,
    # optimizer
    optim = "adamw_8bit",
    weight_decay = 0.01,
    seed = 3407,
    # logging, checkpointing and upload
    logging_steps = 10,
    output_dir = output_dir,
    push_to_hub = True,
    save_steps = 10
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    dataset_text_field = "text",   # column produced by formatting_prompts_func
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = training_args
)

  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# train the model
# Runs training with the configuration held by `trainer`; whatever train()
# returns is kept in trainer_stats for later inspection.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 100 | Num Epochs = 5
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 50
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
10,2.4751
20,2.0274
30,1.9266
40,1.7528
50,1.6305


In [None]:
# load model
# Reload from the training output location with the same loader settings that
# were used for the base model; this rebinds `model` and `tokenizer`.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = output_dir, # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# inference
question = "How should two highly sensitive people in a relationship navigate through simple ideological disagreements without taking it personally?"

# Fill the template exactly as in training, but leave the input and response
# slots empty so the model generates the response.
prompt = alpaca_prompt.format(question, "", "")
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
# Last expression of the cell, so the decoded text is displayed as its output.
tokenizer.batch_decode(outputs)

==((====))==  Unsloth: Fast Llama patching release 2024.5
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nHow should two highly sensitive people in a relationship navigate through simple ideological disagreements without taking it personally?\n\n### Input:\n\n\n### Response:\nThis is a tough one. I’ve been in a relationship where one of us was highly sensitive and the other was not. The not sensitive partner would take things personally that the sensitive partner would not. It was frustrating and exhausting for both of us. \n\nI’m not sure if there is a formula for this, but']