In [18]:
# !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# !pip install transformers datasets accelerate bitsandbytes peft trl
# !pip install kagglehub
# !pip install tensorboard

Collecting tensorboard
  Downloading tensorboard-2.20.0-py3-none-any.whl.metadata (1.8 kB)
Collecting absl-py>=0.4 (from tensorboard)
  Downloading absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting grpcio>=1.48.2 (from tensorboard)
  Downloading grpcio-1.73.1-cp313-cp313-win_amd64.whl.metadata (4.0 kB)
Collecting tensorboard-data-server<0.8.0,>=0.7.0 (from tensorboard)
  Downloading tensorboard_data_server-0.7.2-py3-none-any.whl.metadata (1.1 kB)
Downloading tensorboard-2.20.0-py3-none-any.whl (5.5 MB)
   ---------------------------------------- 0.0/5.5 MB ? eta -:--:--
   --------------- ------------------------ 2.1/5.5 MB 11.5 MB/s eta 0:00:01
   ---------------------------------- ----- 4.7/5.5 MB 11.8 MB/s eta 0:00:01
   ---------------------------------------- 5.5/5.5 MB 11.6 MB/s eta 0:00:00
Downloading tensorboard_data_server-0.7.2-py3-none-any.whl (2.4 kB)
Downloading absl_py-2.3.1-py3-none-any.whl (135 kB)
Downloading grpcio-1.73.1-cp313-cp313-win_amd64.whl (4.3 MB)
 

## Slect model to fine tune and get tokenizer for that model

In [1]:
# using mistral model
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = "mistralai/Mistral-7B-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# if there is not pad token in the model add one
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

## Dataset prepare

In [7]:
from datasets import Dataset
import kagglehub
import shutil
import os

# get the model
input_file = kagglehub.dataset_download("viccalexander/kanyewestverses")
print("Path to dataset files:", input_file)

custom_location = os.path.join(os.getcwd(), 'my_kanye_data')
os.makedirs(custom_location, exist_ok=True)

for item in os.listdir(input_file):
    s = os.path.join(input_file, item)
    d = os.path.join(custom_location, item)
    if os.path.isdir(s):
        shutil.copytree(s, d, dirs_exist_ok=True)
    else:
        shutil.copy2(s, d)
print(f"Dataset copied to custom location: {custom_location}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/viccalexander/kanyewestverses?dataset_version_number=1...


100%|██████████| 106k/106k [00:00<00:00, 171kB/s]

Extracting files...
Path to dataset files: C:\Users\anubh\.cache\kagglehub\datasets\viccalexander\kanyewestverses\versions\1
Dataset copied to custom location: C:\Users\anubh\Projects\fine-tune-llm-kanye-best\my_kanye_data





In [8]:
import json

# split the bars
output_filepath = "./kanye_bars_prompt_completion.jsonl"
input_filepath = f"{custom_location}/kanye_verses.txt"

with open(input_filepath, 'r', encoding='utf-8') as infile, \
     open(output_filepath, 'w', encoding='utf-8') as outfile:

    current_verse_bars = []
    for line_num, line in enumerate(infile):
        stripped_line = line.strip()

        if stripped_line:
            current_verse_bars.append(stripped_line)
        else:
            if current_verse_bars:
                for i in range(0, len(current_verse_bars), 2):
                    prompt = current_verse_bars[i]
                    if i + 1 < len(current_verse_bars):
                        completion = current_verse_bars[i+1]
                    else:
                        completion = prompt

                    json_entry = {
                        "prompt": prompt,
                        "completion": completion
                    }
                    outfile.write(json.dumps(json_entry, ensure_ascii=False) + '\n')
                current_verse_bars = []

    if current_verse_bars:
        for i in range(0, len(current_verse_bars), 2):
            prompt = current_verse_bars[i]
            if i + 1 < len(current_verse_bars):
                completion = current_verse_bars[i+1]
            else:
                completion = prompt

            json_entry = {
                "prompt": prompt,
                "completion": completion
            }
            outfile.write(json.dumps(json_entry, ensure_ascii=False) + '\n')

print(f"Conversion complete! Output saved to '{output_filepath}'.")

Conversion complete! Output saved to './kanye_bars_prompt_completion.jsonl'.


In [9]:
from datasets import load_dataset

my_dataset = load_dataset('json', data_files=output_filepath)
split_dataset = my_dataset['train'].train_test_split(test_size=0.2, seed=42)

train_set = split_dataset['train']
test_set = split_dataset['test']

print(f"Total samples in original dataset: {len(my_dataset['train'])}")
print(f"Samples in training set: {len(train_set)}")
print(f"Samples in test set: {len(test_set)}")

print("\nTraining set examples:")
print(train_set[0])

print("\nTest set examples:")
print(test_set[0])

Generating train split: 0 examples [00:00, ? examples/s]

Total samples in original dataset: 3159
Samples in training set: 2527
Samples in test set: 632

Training set examples:
{'prompt': 'Or Jay is', 'completion': 'My favorite'}

Test set examples:
{'prompt': 'Now if my man Benzino got a Benz and they call him Benzino', 'completion': 'When I get my Bentley they gon call me Bent-lino'}


## Load model and apply quantization

In [10]:
import torch
from transformers import BitsAndBytesConfig, AutoModelForCausalLM
from transformers.modeling_utils import PreTrainedModel
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# 4-bit quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Load the model with quantization
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# Prepare model for k-bit training (important for QLoRA)
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# LoRA configuration
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Get the PEFT model
model = get_peft_model(model, lora_config)

# Print trainable parameters (you'll see a small percentage)
model.print_trainable_parameters()

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

trainable params: 6,815,744 || all params: 7,248,547,840 || trainable%: 0.0940


## Configure training arguments

In [19]:
from trl import SFTTrainer, SFTConfig
from transformers import TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results", # Directory to save checkpoints and logs
    num_train_epochs=3, # Number of training epochs
    per_device_train_batch_size=2, # Adjust based on VRAM
    gradient_accumulation_steps=4, # Accumulate gradients over multiple steps to simulate larger batch size
    gradient_checkpointing=True, # Saves memory
    optim="paged_adamw_8bit", # Optimizer optimized for 8-bit training
    save_strategy="epoch", # Save checkpoint every epoch
    logging_dir="./logs", # Directory for logs
    logging_steps=10, # Log every N steps
    learning_rate=2e-4, # Fine-tuning learning rate
    fp16=True, # Use float16 for mixed precision training
    # tf32=False, # Use TF32 for NVIDIA A100+ GPUs
    max_grad_norm=0.3, # Clip gradients to prevent exploding gradients
    warmup_ratio=0.03, # Linear warmup for learning rate
    lr_scheduler_type="cosine", # Learning rate scheduler
    disable_tqdm=False, # Enable tqdm progress bars
    eval_strategy="epoch", # Evaluate every epoch
    load_best_model_at_end=True, # Load the best model based on evaluation metric
    metric_for_best_model="eval_loss", # Metric to monitor for best model
    report_to="tensorboard", # Report to TensorBoard
)

sft_config = SFTConfig(
    max_seq_length=256,
    dataset_text_field="text",
)

# Initialize the SFTTrainer
trainer = SFTTrainer(
    model=model,
    train_dataset=train_set,
    eval_dataset=test_set,
    peft_config=lora_config,
    args=training_args,
    processing_class=tokenizer,
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


## train

In [20]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss
1,2.8546,2.848207
2,2.4821,2.884391
3,1.7583,3.044556


TrainOutput(global_step=948, training_loss=2.43133630229451, metrics={'train_runtime': 6108.0073, 'train_samples_per_second': 1.241, 'train_steps_per_second': 0.155, 'total_flos': 9247608244346880.0, 'train_loss': 2.43133630229451})

## Save model

In [21]:
# Save the adapter model
trainer.model.save_pretrained("./fine_tuned_mistral_adapter")
tokenizer.save_pretrained("./fine_tuned_mistral_adapter")

('./fine_tuned_mistral_adapter\\tokenizer_config.json',
 './fine_tuned_mistral_adapter\\special_tokens_map.json',
 './fine_tuned_mistral_adapter\\tokenizer.json')

## inference with fine tuned model

In [3]:
from transformers import pipeline
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import os

OFFLOAD_DIRECTORY = "./model_offload_cache"
os.makedirs(OFFLOAD_DIRECTORY, exist_ok=True)

# Load base model
model_inference = AutoModelForCausalLM.from_pretrained(
    "./fine_tuned_mistral_adapter",
    torch_dtype=torch.float16,
    device_map="auto",
    offload_folder=OFFLOAD_DIRECTORY,
    offload_state_dict=True
)
tokenizer_inference = AutoTokenizer.from_pretrained("./fine_tuned_mistral_adapter")
if tokenizer_inference.pad_token is None:
    tokenizer_inference.pad_token = tokenizer_inference.eos_token

# Create a text generation pipeline
generator = pipeline(
    "text-generation",
    model=model_inference,
    tokenizer=tokenizer_inference,
    torch_dtype=torch.float16, # Or bfloat16
    device_map="auto"
)

# Test the model
prompt = "### Prompt: i just woke up in the morning \n ### completion:"
# Ensure the prompt format matches your training data!

outputs = generator(
    prompt,
    max_new_tokens=100,
    num_return_sequences=1,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.1,
    pad_token_id=tokenizer_inference.pad_token_id
)

print(outputs[0]["generated_text"])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the disk and cpu.


ValueError: We need an `offload_dir` to dispatch this model according to this `device_map`, the following submodules need to be offloaded: model.layers.17, model.layers.18, model.layers.19, model.layers.20, model.layers.21, model.layers.22, model.layers.23, model.layers.24, model.layers.25, model.layers.26, model.layers.27, model.layers.28, model.layers.29, model.layers.30, model.layers.31, model.norm, model.rotary_emb, lm_head.

In [4]:
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel # Import PeftModel
import torch
import os

OFFLOAD_DIRECTORY = "./model_offload_cache"
os.makedirs(OFFLOAD_DIRECTORY, exist_ok=True)

# You NEED to specify the original base model name here
# This should be the same as 'model_name' you used during training
base_model_name_for_inference = "mistralai/Mistral-7B-v0.1" # <--- IMPORTANT: Replace with your actual base model name

# If you trained with 4-bit/8-bit quantization (QLoRA),
# you should load the base model with the same quantization config.
# If you didn't use quantization for the base model during training, remove bnb_config.
# Assuming you did use QLoRA, include this:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16, # Or torch.float16 if that's what you used
    bnb_4bit_use_double_quant=True,
)

# 1. Load the original BASE model (potentially with quantization)
base_model_inference = AutoModelForCausalLM.from_pretrained(
    base_model_name_for_inference, # Load from Hugging Face Hub
    quantization_config=bnb_config, # Include if you used QLoRA during training
    torch_dtype=torch.float16, # Match the dtype you want for operations
    device_map="auto",
    offload_folder=OFFLOAD_DIRECTORY, # Offload base model if it doesn't fit
    offload_state_dict=True
)

tokenizer_inference = AutoTokenizer.from_pretrained(base_model_name_for_inference)
if tokenizer_inference.pad_token is None:
    tokenizer_inference.pad_token = tokenizer_inference.eos_token

# 2. Load the PEFT adapter weights on top of the base model
model_inference = PeftModel.from_pretrained(base_model_inference, "./fine_tuned_mistral_adapter")

# You can optionally call .eval() for inference
model_inference = model_inference.eval()

# Create a text generation pipeline
generator = pipeline(
    "text-generation",
    model=model_inference,
    tokenizer=tokenizer_inference,
    torch_dtype=torch.float16, # Or bfloat16
    device_map="auto" # This applies to the combined model
)

# Test the model
prompt = "### Prompt: i just woke up in the morning \n ### completion:"

outputs = generator(
    prompt,
    max_new_tokens=100,
    num_return_sequences=1,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.1,
    pad_token_id=tokenizer_inference.pad_token_id
)

print(outputs[0]["generated_text"])

ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 