In [1]:
# One-time environment setup: uncomment the lines below and run this cell to
# install the notebook's dependencies, then re-comment them so a full re-run
# does not reinstall everything.
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# !pip install transformers datasets accelerate bitsandbytes peft trl
# !pip install kagglehub

## Select the model to fine-tune and load its tokenizer

In [2]:
# Base checkpoint to fine-tune (a Llama 3.2 instruct model, not Mistral).
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The tokenizer may come without a padding token; reuse the end-of-sequence
# token in that case so a pad token is always defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

  from .autonotebook import tqdm as notebook_tqdm


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/mistralai/Mistral-7B-v0.1.
401 Client Error. (Request ID: Root=1-687bf546-191d33b868fb440837558903;d4405c99-adcf-4f33-baa6-af4badcef568)

Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json.
Access to model mistralai/Mistral-7B-v0.1 is restricted. You must have access to it and be authenticated to access it. Please log in.

## Prepare the dataset

In [None]:
import os
import shutil

import kagglehub
from datasets import Dataset

# Download the dataset (not a model); the returned path is treated as the
# directory that holds the downloaded files.
input_file = kagglehub.dataset_download("viccalexander/kanyewestverses")
print("Path to dataset files:", input_file)

# Mirror the download into a folder under the current working directory so the
# following cells can read the files from a predictable place.
# copytree(..., dirs_exist_ok=True) creates the target if needed and merges
# into it on re-runs, copying nested directories and files alike.
custom_location = os.path.join(os.getcwd(), 'my_kanye_data')
shutil.copytree(input_file, custom_location, dirs_exist_ok=True)
print(f"Dataset copied to custom location: {custom_location}")

In [None]:
import json

# Convert the raw verses file into a JSONL file of prompt/completion pairs.
output_filepath = "./kanye_bars_prompt_completion.jsonl"
input_filepath = f"{custom_location}/kanye_verses.txt"


def _write_verse_pairs(bars, outfile):
    """Write one JSON line per consecutive, non-overlapping pair of bars.

    Bars are paired as (0, 1), (2, 3), ...: the first bar of each pair is the
    prompt and the second is the completion.

    Args:
        bars: Non-empty, stripped lines belonging to a single verse.
        outfile: Text file object opened for writing.
    """
    for i in range(0, len(bars), 2):
        prompt = bars[i]
        # NOTE(review): a verse with an odd number of bars leaves the last bar
        # unpaired; it is written with itself as the completion.
        completion = bars[i + 1] if i + 1 < len(bars) else prompt
        json_entry = {"prompt": prompt, "completion": completion}
        outfile.write(json.dumps(json_entry, ensure_ascii=False) + '\n')


with open(input_filepath, 'r', encoding='utf-8') as infile, \
     open(output_filepath, 'w', encoding='utf-8') as outfile:

    current_verse_bars = []
    for line in infile:
        stripped_line = line.strip()

        if stripped_line:
            current_verse_bars.append(stripped_line)
        elif current_verse_bars:
            # A blank line ends the current verse: flush it and start a new one.
            _write_verse_pairs(current_verse_bars, outfile)
            current_verse_bars = []

    # The file may end without a trailing blank line; flush the last verse too.
    if current_verse_bars:
        _write_verse_pairs(current_verse_bars, outfile)

print(f"Conversion complete! Output saved to '{output_filepath}'.")

In [None]:
from datasets import load_dataset

# Read the JSONL produced above; the loaded data is accessed through its
# 'train' split.
my_dataset = load_dataset('json', data_files=output_filepath)

# Hold out 20% of the pairs for evaluation; the fixed seed keeps the split
# reproducible across runs.
split_dataset = my_dataset['train'].train_test_split(test_size=0.2, seed=42)
train_set, test_set = split_dataset['train'], split_dataset['test']

# Report the split sizes.
print(
    f"Total samples in original dataset: {len(my_dataset['train'])}",
    f"Samples in training set: {len(train_set)}",
    f"Samples in test set: {len(test_set)}",
    sep="\n",
)

# Show the first record of each split as a sanity check.
print("\nTraining set examples:", train_set[0], sep="\n")
print("\nTest set examples:", test_set[0], sep="\n")

## Load model and apply quantization

In [None]:
import torch
from transformers import BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# 4-bit quantization configuration: NF4 quantization with double quantization,
# computing in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Load the model with quantization applied at load time; device_map="auto"
# leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# Prepare model for k-bit training (important for QLoRA)
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# LoRA configuration: rank-16 adapters on the attention query and value
# projections only; no bias terms are trained.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the quantized base model with the LoRA adapters.
model = get_peft_model(model, lora_config)

# Print trainable parameters (you'll see a small percentage)
model.print_trainable_parameters()

## Configure training arguments

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments


def _to_text_example(example):
    """Merge a prompt/completion pair into the single "text" field used for training.

    Args:
        example: One dataset row with "prompt" and "completion" string fields.

    Returns:
        A dict with one "text" key: the prompt and the completion on
        consecutive lines.
    """
    return {"text": f"{example['prompt']}\n{example['completion']}"}


# The splits created earlier are named train_set / test_set and only contain
# "prompt" and "completion" columns, while the trainer below reads a "text"
# column. Build that column here and drop the originals so "text" is the only
# field the trainer sees.
train_dataset = train_set.map(_to_text_example, remove_columns=train_set.column_names)
eval_dataset = test_set.map(_to_text_example, remove_columns=test_set.column_names)

# Define training arguments
# NOTE(review): `evaluation_strategy` is the argument name used by the library
# versions this notebook was written against; newer transformers releases call
# it `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results", # Directory to save checkpoints and logs
    num_train_epochs=3, # Number of training epochs
    per_device_train_batch_size=4, # Adjust based on VRAM
    gradient_accumulation_steps=2, # Accumulate gradients over multiple steps to simulate larger batch size
    gradient_checkpointing=True, # Saves memory
    optim="paged_adamw_8bit", # Paged AdamW optimizer with 8-bit optimizer states
    save_strategy="epoch", # Save checkpoint every epoch
    logging_dir="./logs", # Directory for logs
    logging_steps=10, # Log every N steps
    learning_rate=2e-4, # Fine-tuning learning rate
    bf16=True, # bfloat16 mixed precision, matching bnb_4bit_compute_dtype (was fp16)
    tf32=True, # Use TF32 for NVIDIA A100+ GPUs
    max_grad_norm=0.3, # Clip gradients to prevent exploding gradients
    warmup_ratio=0.03, # Linear warmup for learning rate
    lr_scheduler_type="cosine", # Learning rate scheduler
    disable_tqdm=False, # Enable tqdm progress bars
    evaluation_strategy="epoch", # Evaluate every epoch
    load_best_model_at_end=True, # Load the best model based on evaluation metric
    metric_for_best_model="eval_loss", # Metric to monitor for best model
    report_to="tensorboard", # Report to TensorBoard
)

# Initialize the SFTTrainer.
# `model` is already wrapped with the LoRA adapters via get_peft_model(), so
# no peft_config is passed here; passing it again would be redundant.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    args=training_args,
    max_seq_length=512, # Maximum sequence length for training (adjust based on VRAM)
    dataset_text_field="text", # The field in your dataset containing the text
)

## Train

In [None]:
# Run the fine-tuning loop with the trainer configured above.
trainer.train()
