### Run Llama finetuning code

In [None]:
import torch
from trl import SFTTrainer
from datasets import load_dataset
from transformers import TrainingArguments, TextStreamer
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel, is_bfloat16_supported
import json


#### Define the file path, model name, save name, and epochs

In [None]:
# Path to the training data; read_files() parses it as JSONL (one JSON value per line).
# NOTE(review): this is an absolute, user-specific path — adjust it before running elsewhere.
file_name = "/home/users/seunghh/l2m3_revision/data/text_categorize.jsonl"
# Checkpoint identifier handed to FastLanguageModel.from_pretrained.
model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
# Destination name handed to model.save_pretrained_merged after training.
save_name = 'text_categorize_1'
# Forwarded to TrainingArguments as num_train_epochs.
num_epochs = 100

In [None]:
def read_files(file_name):
    """Load a JSONL file into a list of parsed JSON values.

    The file is read line by line. Blank or whitespace-only lines are
    skipped. A line that is not valid JSON is reported on stdout, together
    with its 1-based line number, and then skipped, so one malformed record
    does not abort the whole load.

    Args:
        file_name: Path of the JSONL file to read.

    Returns:
        A list with one parsed value per valid, non-empty line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    data_list = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default encoding.
    with open(file_name, 'r', encoding='utf-8') as g:
        for line_number, line in enumerate(g, start=1):
            line = line.strip()
            if not line:
                continue  # Nothing to parse on an empty line
            try:
                data_list.append(json.loads(line))
            except json.JSONDecodeError as e:
                # Best effort: report the bad line and keep going
                print(f"Error parsing JSON on line {line_number}: {e}")
    return data_list


#### Load the data, define the model, and apply the chat template

In [None]:
# Load the raw training records from the JSONL file
data = read_files(file_name)

# Maximum sequence length; used for model loading here and again by the trainer
max_seq_length = 2048

# Load the pretrained model and its tokenizer in 4-bit precision
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    dtype=None,
)

# Wrap the model for Parameter-Efficient Fine-Tuning (PEFT) with LoRA adapters
# on the attention and MLP projection layers
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"],
    use_rslora=True,
    use_gradient_checkpointing="unsloth"
)

# Report the number of trainable parameters. The method prints the summary
# itself and returns None, so wrapping it in print() would emit a stray "None".
model.print_trainable_parameters()

# Attach a chat template to the tokenizer.
# NOTE(review): no chat_template argument is passed, so the library default is
# used — confirm that default matches the prompt format intended for this
# Llama 3.2 Instruct checkpoint at inference time.
tokenizer = get_chat_template(
    tokenizer,
)

def apply_template(examples):
    """Render one record's chat messages into a single training string.

    Takes the conversation stored under the "messages" key, formats it with
    the module-level tokenizer's chat template (returned as text rather than
    token ids, and without a trailing generation prompt), writes the result
    to the "text" key of the same dict, and returns that dict.
    """
    examples["text"] = tokenizer.apply_chat_template(
        examples["messages"],
        add_generation_prompt=False,
        tokenize=False,
    )
    return examples

# Format every training record with the chat template (adds a "text" field to each)
dataset = [apply_template(record) for record in data]

#### Set up the Trainer

In [None]:

# Choose the mixed-precision mode once: bf16 when supported, fp16 otherwise
bf16_available = is_bfloat16_supported()

# Optimisation, logging and output settings for the run
training_args = TrainingArguments(
    output_dir="output",
    seed=0,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=3e-4,
    lr_scheduler_type="linear",
    warmup_steps=10,
    weight_decay=0.01,
    optim="adamw_8bit",
    bf16=bf16_available,
    fp16=not bf16_available,
    logging_steps=1,
    logging_dir='./logs',
    report_to=['tensorboard'],
)

# Supervised fine-tuning trainer over the templated "text" field, with packing enabled
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=True,
    args=training_args,
)

#### Train

In [None]:
# Run fine-tuning with the trainer configured above; as the last expression in
# the cell, the call's return value is displayed by the notebook.
trainer.train()

#### Save the model

In [None]:
# Save the fine-tuned model and its tokenizer under `save_name`.
# NOTE(review): save_method="merged_16bit" presumably merges the LoRA adapter
# into the base weights and writes them in 16-bit — confirm against the unsloth docs.
model.save_pretrained_merged(save_name, tokenizer, save_method="merged_16bit")