In [7]:
!pip install -qqq "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" --progress-bar off
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
!pip install -qqq --no-deps {xformers} trl peft accelerate bitsandbytes triton --progress-bar off

import torch
from trl import SFTTrainer
from datasets import load_dataset
from transformers import TrainingArguments, TextStreamer
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel, is_bfloat16_supported
import wandb

import json
from datasets import Dataset

In [4]:
wandb.login(key = '9abde3fa7ae96f05aa2b8d19c5d7fae16cd0fb74')

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/auschra/.netrc


True

## 1. Load model for PEFT

In [5]:
# Load model
max_seq_length = 1024
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    dtype=None,
)

# Prepare model for PEFT
model = FastLanguageModel.get_peft_model(
    model,
    r=8,
    lora_alpha=16,
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"],
    use_rslora=True,
    use_gradient_checkpointing="unsloth"
)
print(model.print_trainable_parameters())

==((====))==  Unsloth 2024.11.1: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA GeForce RTX 2080 Ti. Max memory: 10.571 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2024.11.1 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


trainable params: 20,971,520 || all params: 8,051,232,768 || trainable%: 0.2605
None


## 2. Prepare data and tokenizer

In [8]:
with open('datasets/ft_nogroup.json', 'r') as f:
    dataset = json.load(f)

tokenizer = get_chat_template(
    tokenizer,
    chat_template="chatml",
    mapping={"role" : "from",
             "content" : "value",
             "user" : "human",
             "assistant" : "gpt"}
)

# Function to apply the template
def apply_template(example):
    instruction = example['instruction']
    response = example['response']
    text = f"Instruction: {instruction}\nResponse: {response}"
    return {"text": text}

processed_dataset = [apply_template(example) for example in dataset]
final_dataset = Dataset.from_dict({"text": [item["text"] for item in processed_dataset]})


print(final_dataset[0])

Unsloth: Will map <|im_end|> to EOS = <|im_end|>.


{'text': 'Instruction: These are the 16 words of the NYT Connections game: boot, bounce, eject, toss, earth, ground, land, soil, baltic, black, philippine, red, big, philadelphia, splash, sully. What are the four groups and each of their four words?\nResponse: (philadelphia splash sully big) (philippine black red baltic) (ground earth soil land) (boot bounce toss eject)'}


## 3. Training

In [9]:
# init wandb
# Initialize W&B project
wandb.init(project="Connections-nogroups-sets", config={
    "learning_rate": 3e-4,
    "batch_size": 4,
    "epochs": 1,
    "optimizer": "adamw_8bit"
})

# Modify TrainingArguments to report to W&B
training_args = TrainingArguments(
    learning_rate=3e-4,
    lr_scheduler_type="linear",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    warmup_steps=10,
    output_dir="output",
    seed=0,
    report_to="wandb",  # Enable W&B logging
)


max_seq_length = 512
# Trainer setup
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=final_dataset,
    dataset_text_field="text",
    max_seq_length=512,
    dataset_num_proc=2,
    packing=True,
    args=training_args,
)

trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mauschra3[0m ([33mauschra3-massachusetts-institute-of-technology[0m). Use [1m`wandb login --relogin`[0m to force relogin


Generating train split: 942 examples [00:00, 2028.00 examples/s]
  super().__init__(
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 942 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 59
 "-____-"     Number of trainable parameters = 20,971,520
  2%|▏         | 1/59 [00:12<12:22, 12.81s/it]

{'loss': 2.1947, 'grad_norm': 3.1580238342285156, 'learning_rate': 2.9999999999999997e-05, 'epoch': 0.02}


  3%|▎         | 2/59 [00:24<11:25, 12.02s/it]

{'loss': 2.203, 'grad_norm': 2.9665374755859375, 'learning_rate': 5.9999999999999995e-05, 'epoch': 0.03}


  5%|▌         | 3/59 [00:36<11:12, 12.02s/it]

{'loss': 2.1836, 'grad_norm': 2.016535758972168, 'learning_rate': 8.999999999999999e-05, 'epoch': 0.05}


  7%|▋         | 4/59 [00:48<11:09, 12.18s/it]

{'loss': 2.0009, 'grad_norm': 1.3293216228485107, 'learning_rate': 0.00011999999999999999, 'epoch': 0.07}


  8%|▊         | 5/59 [01:01<11:02, 12.27s/it]

{'loss': 1.9751, 'grad_norm': 1.6255760192871094, 'learning_rate': 0.00015, 'epoch': 0.08}


 10%|█         | 6/59 [01:13<10:53, 12.33s/it]

{'loss': 1.8663, 'grad_norm': 1.6117225885391235, 'learning_rate': 0.00017999999999999998, 'epoch': 0.1}


 12%|█▏        | 7/59 [01:26<10:43, 12.38s/it]

{'loss': 1.7974, 'grad_norm': 1.6220439672470093, 'learning_rate': 0.00020999999999999998, 'epoch': 0.12}


 14%|█▎        | 8/59 [01:38<10:30, 12.36s/it]

{'loss': 1.6505, 'grad_norm': 1.506601095199585, 'learning_rate': 0.00023999999999999998, 'epoch': 0.14}


 15%|█▌        | 9/59 [01:51<10:21, 12.44s/it]

{'loss': 1.5531, 'grad_norm': 1.5972713232040405, 'learning_rate': 0.00027, 'epoch': 0.15}


 17%|█▋        | 10/59 [02:03<10:09, 12.44s/it]

{'loss': 1.4732, 'grad_norm': 0.9861966371536255, 'learning_rate': 0.0003, 'epoch': 0.17}


 19%|█▊        | 11/59 [02:15<09:57, 12.45s/it]

{'loss': 1.4871, 'grad_norm': 0.9691340327262878, 'learning_rate': 0.0002938775510204081, 'epoch': 0.19}


 20%|██        | 12/59 [02:28<09:43, 12.42s/it]

{'loss': 1.4458, 'grad_norm': 0.9770352244377136, 'learning_rate': 0.0002877551020408163, 'epoch': 0.2}


 22%|██▏       | 13/59 [02:40<09:32, 12.45s/it]

{'loss': 1.415, 'grad_norm': 0.8302508592605591, 'learning_rate': 0.0002816326530612245, 'epoch': 0.22}


 24%|██▎       | 14/59 [02:53<09:21, 12.47s/it]

{'loss': 1.3274, 'grad_norm': 0.9398384690284729, 'learning_rate': 0.00027551020408163264, 'epoch': 0.24}


 25%|██▌       | 15/59 [03:05<09:09, 12.49s/it]

{'loss': 1.3279, 'grad_norm': 1.2448674440383911, 'learning_rate': 0.0002693877551020408, 'epoch': 0.25}


 27%|██▋       | 16/59 [03:18<08:57, 12.50s/it]

{'loss': 1.2642, 'grad_norm': 0.9672018885612488, 'learning_rate': 0.00026326530612244894, 'epoch': 0.27}


 29%|██▉       | 17/59 [03:30<08:45, 12.51s/it]

{'loss': 1.302, 'grad_norm': 1.052605152130127, 'learning_rate': 0.0002571428571428571, 'epoch': 0.29}


 31%|███       | 18/59 [03:43<08:35, 12.57s/it]

{'loss': 1.2652, 'grad_norm': 1.156304955482483, 'learning_rate': 0.0002510204081632653, 'epoch': 0.31}


 32%|███▏      | 19/59 [03:56<08:21, 12.53s/it]

{'loss': 1.2525, 'grad_norm': 1.0360007286071777, 'learning_rate': 0.00024489795918367346, 'epoch': 0.32}


 34%|███▍      | 20/59 [04:08<08:08, 12.52s/it]

{'loss': 1.2254, 'grad_norm': 0.8986279368400574, 'learning_rate': 0.0002387755102040816, 'epoch': 0.34}


 36%|███▌      | 21/59 [04:21<07:55, 12.52s/it]

{'loss': 1.1975, 'grad_norm': 0.9487558603286743, 'learning_rate': 0.00023265306122448976, 'epoch': 0.36}


 37%|███▋      | 22/59 [04:33<07:39, 12.41s/it]

{'loss': 1.1928, 'grad_norm': 0.9182459115982056, 'learning_rate': 0.00022653061224489791, 'epoch': 0.37}


 39%|███▉      | 23/59 [04:45<07:25, 12.37s/it]

{'loss': 1.119, 'grad_norm': 1.0680190324783325, 'learning_rate': 0.00022040816326530612, 'epoch': 0.39}


 41%|████      | 24/59 [04:57<07:10, 12.30s/it]

{'loss': 1.1493, 'grad_norm': 1.1097345352172852, 'learning_rate': 0.00021428571428571427, 'epoch': 0.41}


 42%|████▏     | 25/59 [05:09<06:55, 12.22s/it]

{'loss': 1.1242, 'grad_norm': 1.1144570112228394, 'learning_rate': 0.00020816326530612243, 'epoch': 0.42}


 44%|████▍     | 26/59 [05:21<06:41, 12.18s/it]

{'loss': 1.0866, 'grad_norm': 1.013004183769226, 'learning_rate': 0.00020204081632653058, 'epoch': 0.44}


 46%|████▌     | 27/59 [05:34<06:30, 12.20s/it]

{'loss': 1.085, 'grad_norm': 1.5739003419876099, 'learning_rate': 0.00019591836734693873, 'epoch': 0.46}


 47%|████▋     | 28/59 [05:46<06:19, 12.24s/it]

{'loss': 1.0464, 'grad_norm': 1.1460976600646973, 'learning_rate': 0.00018979591836734694, 'epoch': 0.47}


 49%|████▉     | 29/59 [05:58<06:06, 12.22s/it]

{'loss': 1.0438, 'grad_norm': 1.2370675802230835, 'learning_rate': 0.0001836734693877551, 'epoch': 0.49}


 51%|█████     | 30/59 [06:10<05:54, 12.22s/it]

{'loss': 1.0354, 'grad_norm': 1.1111936569213867, 'learning_rate': 0.00017755102040816325, 'epoch': 0.51}


 53%|█████▎    | 31/59 [06:22<05:42, 12.23s/it]

{'loss': 1.0122, 'grad_norm': 1.1148709058761597, 'learning_rate': 0.0001714285714285714, 'epoch': 0.53}


 54%|█████▍    | 32/59 [06:35<05:30, 12.24s/it]

{'loss': 0.9989, 'grad_norm': 1.2572286128997803, 'learning_rate': 0.00016530612244897955, 'epoch': 0.54}


 56%|█████▌    | 33/59 [06:47<05:19, 12.27s/it]

{'loss': 0.9903, 'grad_norm': 1.126613736152649, 'learning_rate': 0.00015918367346938776, 'epoch': 0.56}


 58%|█████▊    | 34/59 [06:59<05:06, 12.25s/it]

{'loss': 0.911, 'grad_norm': 1.1675872802734375, 'learning_rate': 0.0001530612244897959, 'epoch': 0.58}


 59%|█████▉    | 35/59 [07:11<04:52, 12.17s/it]

{'loss': 0.8851, 'grad_norm': 1.2628892660140991, 'learning_rate': 0.00014693877551020406, 'epoch': 0.59}


 61%|██████    | 36/59 [07:23<04:38, 12.12s/it]

{'loss': 0.9158, 'grad_norm': 1.5132040977478027, 'learning_rate': 0.00014081632653061224, 'epoch': 0.61}


 63%|██████▎   | 37/59 [07:35<04:26, 12.13s/it]

{'loss': 0.8785, 'grad_norm': 1.3147333860397339, 'learning_rate': 0.0001346938775510204, 'epoch': 0.63}


 64%|██████▍   | 38/59 [07:47<04:14, 12.11s/it]

{'loss': 0.8327, 'grad_norm': 1.251460313796997, 'learning_rate': 0.00012857142857142855, 'epoch': 0.64}


 66%|██████▌   | 39/59 [08:00<04:02, 12.11s/it]

{'loss': 0.8199, 'grad_norm': 1.222895860671997, 'learning_rate': 0.00012244897959183673, 'epoch': 0.66}


 68%|██████▊   | 40/59 [08:12<03:49, 12.06s/it]

{'loss': 0.8493, 'grad_norm': 1.5010488033294678, 'learning_rate': 0.00011632653061224488, 'epoch': 0.68}


 69%|██████▉   | 41/59 [08:24<03:37, 12.08s/it]

{'loss': 0.8315, 'grad_norm': 1.403039813041687, 'learning_rate': 0.00011020408163265306, 'epoch': 0.69}


 71%|███████   | 42/59 [08:36<03:25, 12.06s/it]

{'loss': 0.7658, 'grad_norm': 1.307166576385498, 'learning_rate': 0.00010408163265306121, 'epoch': 0.71}


 73%|███████▎  | 43/59 [08:48<03:13, 12.07s/it]

{'loss': 0.7681, 'grad_norm': 1.2691479921340942, 'learning_rate': 9.795918367346937e-05, 'epoch': 0.73}


 75%|███████▍  | 44/59 [09:00<03:01, 12.10s/it]

{'loss': 0.7484, 'grad_norm': 1.3397983312606812, 'learning_rate': 9.183673469387755e-05, 'epoch': 0.75}


 76%|███████▋  | 45/59 [09:12<02:50, 12.20s/it]

{'loss': 0.7246, 'grad_norm': 1.2979655265808105, 'learning_rate': 8.57142857142857e-05, 'epoch': 0.76}


 78%|███████▊  | 46/59 [09:25<02:39, 12.28s/it]

{'loss': 0.74, 'grad_norm': 1.4601085186004639, 'learning_rate': 7.959183673469388e-05, 'epoch': 0.78}


 80%|███████▉  | 47/59 [09:37<02:26, 12.23s/it]

{'loss': 0.726, 'grad_norm': 1.3908170461654663, 'learning_rate': 7.346938775510203e-05, 'epoch': 0.8}


 81%|████████▏ | 48/59 [09:49<02:14, 12.23s/it]

{'loss': 0.6914, 'grad_norm': 1.3731783628463745, 'learning_rate': 6.73469387755102e-05, 'epoch': 0.81}


 83%|████████▎ | 49/59 [10:01<02:02, 12.25s/it]

{'loss': 0.7005, 'grad_norm': 1.2934080362319946, 'learning_rate': 6.122448979591836e-05, 'epoch': 0.83}


 85%|████████▍ | 50/59 [10:14<01:50, 12.25s/it]

{'loss': 0.6681, 'grad_norm': 1.3213578462600708, 'learning_rate': 5.510204081632653e-05, 'epoch': 0.85}


 86%|████████▋ | 51/59 [10:26<01:38, 12.29s/it]

{'loss': 0.6963, 'grad_norm': 1.3100751638412476, 'learning_rate': 4.897959183673468e-05, 'epoch': 0.86}


 88%|████████▊ | 52/59 [10:38<01:25, 12.24s/it]

{'loss': 0.6572, 'grad_norm': 1.3655201196670532, 'learning_rate': 4.285714285714285e-05, 'epoch': 0.88}


 90%|████████▉ | 53/59 [10:50<01:13, 12.20s/it]

{'loss': 0.6658, 'grad_norm': 1.3676999807357788, 'learning_rate': 3.6734693877551016e-05, 'epoch': 0.9}


 92%|█████████▏| 54/59 [11:02<01:00, 12.14s/it]

{'loss': 0.6289, 'grad_norm': 1.3254294395446777, 'learning_rate': 3.061224489795918e-05, 'epoch': 0.92}


 93%|█████████▎| 55/59 [11:15<00:48, 12.19s/it]

{'loss': 0.6449, 'grad_norm': 1.2778366804122925, 'learning_rate': 2.448979591836734e-05, 'epoch': 0.93}


 95%|█████████▍| 56/59 [11:27<00:36, 12.29s/it]

{'loss': 0.6735, 'grad_norm': 1.455427885055542, 'learning_rate': 1.8367346938775508e-05, 'epoch': 0.95}


 97%|█████████▋| 57/59 [11:39<00:24, 12.26s/it]

{'loss': 0.6395, 'grad_norm': 1.3254414796829224, 'learning_rate': 1.224489795918367e-05, 'epoch': 0.97}


 98%|█████████▊| 58/59 [11:51<00:12, 12.22s/it]

{'loss': 0.6369, 'grad_norm': 1.2768852710723877, 'learning_rate': 6.122448979591835e-06, 'epoch': 0.98}


100%|██████████| 59/59 [12:02<00:00, 11.74s/it]

{'loss': 0.627, 'grad_norm': 1.3656067848205566, 'learning_rate': 0.0, 'epoch': 1.0}


100%|██████████| 59/59 [12:04<00:00, 12.27s/it]

{'train_runtime': 724.1585, 'train_samples_per_second': 1.301, 'train_steps_per_second': 0.081, 'train_loss': 1.1121663986626318, 'epoch': 1.0}





TrainOutput(global_step=59, training_loss=1.1121663986626318, metrics={'train_runtime': 724.1585, 'train_samples_per_second': 1.301, 'train_steps_per_second': 0.081, 'total_flos': 2.177861902191821e+16, 'train_loss': 1.1121663986626318, 'epoch': 1.0})

## 4. Inference

In [27]:
# prompt
#These are the 16 words of the NYT Connections game: {words} What are the four groups and each of their four words?

# Load model for inference
model = FastLanguageModel.for_inference(model)

# Define the words
words = 'stop, poem, different, message, new, novel, yardstick, furniture, play, text, biped, record, correspond, original, pause, write'

# Create a prompt that includes only the question (without the instruction)
prompt = f"These are the 16 words of the NYT Connections game: {words} What are the four groups and each of their four words?"

instruction = [
    {
        "role": "human",
        "content": prompt,
    },
]
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# Stream the output from the model, limiting it to just the answer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(input_ids=inputs["input_ids"], streamer=text_streamer, max_new_tokens=128, temperature=0.7, top_p=0.9, use_cache=True)



<|begin_of_text|>These are the 16 words of the NYT Connections game: stop, poem, different, message, new, novel, yardstick, furniture, play, text, biped, record, correspond, original, pause, write What are the four groups and each of their four words? Response: (yardstick novel furniture new) (pause correspond original write) (text biped record play) (poem stop message different)<|im_end|>


## 5. Save trained model

In [None]:
model.save_pretrained_merged("model", tokenizer, save_method="merged_16bit")
model.push_to_hub_merged("mlabonne/FineLlama-3.1-8B", tokenizer, save_method="merged_16bit")

In [None]:
model.save_pretrained_gguf("model", tokenizer, "q8_0")
quant_methods = ["q2_k", "q3_k_m", "q4_k_m", "q5_k_m", "q6_k", "q8_0"]
for quant in quant_methods:
    model.push_to_hub_gguf("mlabonne/FineLlama-3.1-8B-GGUF", tokenizer, quant)