In [18]:
# One-time environment setup: uncomment and run these lines in a fresh environment.
# The torch line pulls wheels from the PyTorch cu118 wheel index.
# !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# !pip install transformers datasets accelerate bitsandbytes peft trl
# !pip install kagglehub
# !pip install tensorboard

Collecting tensorboard
  Downloading tensorboard-2.20.0-py3-none-any.whl.metadata (1.8 kB)
Collecting absl-py>=0.4 (from tensorboard)
  Downloading absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting grpcio>=1.48.2 (from tensorboard)
  Downloading grpcio-1.73.1-cp313-cp313-win_amd64.whl.metadata (4.0 kB)
Collecting tensorboard-data-server<0.8.0,>=0.7.0 (from tensorboard)
  Downloading tensorboard_data_server-0.7.2-py3-none-any.whl.metadata (1.1 kB)
Downloading tensorboard-2.20.0-py3-none-any.whl (5.5 MB)
   ---------------------------------------- 0.0/5.5 MB ? eta -:--:--
   --------------- ------------------------ 2.1/5.5 MB 11.5 MB/s eta 0:00:01
   ---------------------------------- ----- 4.7/5.5 MB 11.8 MB/s eta 0:00:01
   ---------------------------------------- 5.5/5.5 MB 11.6 MB/s eta 0:00:00
Downloading tensorboard_data_server-0.7.2-py3-none-any.whl (2.4 kB)
Downloading absl_py-2.3.1-py3-none-any.whl (135 kB)
Downloading grpcio-1.73.1-cp313-cp313-win_amd64.whl (4.3 MB)
 

## Select the model to fine-tune and load its tokenizer

In [1]:
# Base model to fine-tune: Llama 3.2 3B (see model_name below).
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = "meta-llama/Llama-3.2-3B"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

## Prepare the dataset

In [3]:
from datasets import Dataset
import kagglehub
import shutil
import os

# Download the Kanye West verses dataset; kagglehub returns the path of its local copy.
input_file = kagglehub.dataset_download("viccalexander/kanyewestverses")
print("Path to dataset files:", input_file)

# Mirror the downloaded files into a project-local folder so later cells read
# from a stable location. copytree creates the destination if needed and, with
# dirs_exist_ok=True, can be re-run safely over an existing copy.
custom_location = os.path.join(os.getcwd(), 'my_kanye_data')
shutil.copytree(input_file, custom_location, dirs_exist_ok=True)
print(f"Dataset copied to custom location: {custom_location}")

Path to dataset files: C:\Users\anubh\.cache\kagglehub\datasets\viccalexander\kanyewestverses\versions\1
Dataset copied to custom location: C:\Users\anubh\Projects\fine-tune-llm-kanye-best\my_kanye_data


In [4]:
import json

# Split every verse into (prompt, completion) pairs of consecutive bars and
# write them as JSON Lines.
output_filepath = "./kanye_bars_prompt_completion.jsonl"
input_filepath = f"{custom_location}/kanye_verses.txt"


def _bars_to_pairs(bars):
    """Pair the bars of one verse into prompt/completion records.

    Bars are consumed two at a time: (bars[0], bars[1]), (bars[2], bars[3]), ...

    Args:
        bars: list of non-empty, stripped lines belonging to a single verse.

    Returns:
        A list of ``{"prompt": ..., "completion": ...}`` dicts (empty when
        ``bars`` is empty).
    """
    records = []
    for i in range(0, len(bars), 2):
        prompt = bars[i]
        # A verse with an odd number of bars leaves the last bar without a
        # partner; it is then used as its own completion.
        # NOTE(review): this makes the model see "repeat the prompt" examples —
        # confirm this is intended rather than dropping the unpaired bar.
        completion = bars[i + 1] if i + 1 < len(bars) else prompt
        records.append({"prompt": prompt, "completion": completion})
    return records


def _write_verse(bars, outfile):
    """Write one JSON line per prompt/completion pair of ``bars`` to ``outfile``."""
    # NOTE(review): the records carry no separator between prompt and
    # completion. If the trainer joins the two strings directly, consecutive
    # bars will run together — confirm, and consider ending each prompt with '\n'.
    for record in _bars_to_pairs(bars):
        outfile.write(json.dumps(record, ensure_ascii=False) + '\n')


with open(input_filepath, 'r', encoding='utf-8') as infile, \
     open(output_filepath, 'w', encoding='utf-8') as outfile:

    current_verse_bars = []
    for line in infile:
        stripped_line = line.strip()

        if stripped_line:
            current_verse_bars.append(stripped_line)
        elif current_verse_bars:
            # A blank line closes the verse collected so far.
            _write_verse(current_verse_bars, outfile)
            current_verse_bars = []

    # The file may end without a trailing blank line; flush the last verse.
    if current_verse_bars:
        _write_verse(current_verse_bars, outfile)

print(f"Conversion complete! Output saved to '{output_filepath}'.")

Conversion complete! Output saved to './kanye_bars_prompt_completion.jsonl'.


In [5]:
from datasets import load_dataset

# Read the JSONL pairs back; the records are accessed through the 'train' split.
my_dataset = load_dataset('json', data_files=output_filepath)

# Hold out 20% of the pairs for evaluation; the fixed seed keeps the split repeatable.
split_dataset = my_dataset['train'].train_test_split(test_size=0.2, seed=42)
train_set, test_set = split_dataset['train'], split_dataset['test']

print(f"Total samples in original dataset: {len(my_dataset['train'])}")
print(f"Samples in training set: {len(train_set)}")
print(f"Samples in test set: {len(test_set)}")

# Show the first record of each subset as a quick sanity check.
for heading, subset in (
    ("\nTraining set examples:", train_set),
    ("\nTest set examples:", test_set),
):
    print(heading)
    print(subset[0])

Generating train split: 0 examples [00:00, ? examples/s]

Total samples in original dataset: 3159
Samples in training set: 2527
Samples in test set: 632

Training set examples:
{'prompt': 'Or Jay is', 'completion': 'My favorite'}

Test set examples:
{'prompt': 'Now if my man Benzino got a Benz and they call him Benzino', 'completion': 'When I get my Bentley they gon call me Bent-lino'}


## Load model and apply quantization

In [6]:
import torch
from transformers import BitsAndBytesConfig, AutoModelForCausalLM
from transformers.modeling_utils import PreTrainedModel
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# 4-bit NF4 quantization with double quantization; matmuls are computed in fp16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

print(f"Loading model {model_name} with quantization...")
try:
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map={'': 0},  # put every module on GPU 0
        torch_dtype=torch.float16,
    )
    print("Model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")
    print("This likely means the model, even quantized, cannot fit into 6GB VRAM.")
    print("Consider a smaller model or running on a cloud GPU.")
    # Re-raise instead of calling exit(): exit() ends the interpreter/kernel
    # session and hides the traceback, while re-raising stops this cell with
    # the real error (which may not be memory-related) and keeps the session alive.
    raise


model.gradient_checkpointing_enable()
# PEFT helper that readies a quantized model for training.
model = prepare_model_for_kbit_training(model)

# LoRA configuration: rank-8 adapters on the attention query/value projections.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# From here on `model` is a PEFT-wrapped model; only the adapter weights train.
model = get_peft_model(model, lora_config)

model.print_trainable_parameters()

Loading model meta-llama/Llama-3.2-3B with quantization...


config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Model loaded successfully.
trainable params: 2,293,760 || all params: 3,215,043,584 || trainable%: 0.0713


## Configure training arguments

In [13]:
from trl import SFTTrainer, SFTConfig

# Use SFTConfig to consolidate all training arguments.
# Effective batch size is per_device_train_batch_size * gradient_accumulation_steps = 8.
sft_config = SFTConfig(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={'use_reentrant': False},
    optim="paged_adamw_8bit",
    # Saving and evaluating on the same schedule is required for
    # load_best_model_at_end to pick a checkpoint by eval_loss.
    save_strategy="epoch",
    eval_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    logging_dir="./logs",
    logging_steps=50,
    learning_rate=2e-4,
    fp16=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    disable_tqdm=False,
    report_to="tensorboard",
    # eval_steps was dropped: it only applies to eval_strategy="steps".
    # dataset_text_field="text" was dropped: the dataset has "prompt" and
    # "completion" columns and no "text" column ("text" is the default anyway).
    max_seq_length=256,
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_set,
    eval_dataset=test_set,
    # Train with the same tokenizer object (pad token already set) that is
    # later saved next to the adapter.
    processing_class=tokenizer,
    # No peft_config here: `model` was already wrapped with get_peft_model, so
    # passing the LoRA config again is redundant (and, depending on the TRL
    # version, may be handled differently from a plain model — verify before
    # re-adding).
    args=sft_config,
)

Adding EOS to train dataset:   0%|          | 0/2527 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2527 [00:00<?, ? examples/s]



Truncating train dataset:   0%|          | 0/2527 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/632 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/632 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/632 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


## Train

In [17]:
# Run fine-tuning. sft_config evaluates and saves once per epoch and, because
# load_best_model_at_end=True, reloads the best checkpoint by eval_loss afterwards.
trainer.train()

Epoch,Training Loss,Validation Loss
1,3.2026,3.373966
2,3.2241,3.377975
3,2.9786,3.403653


TrainOutput(global_step=948, training_loss=3.176914617482117, metrics={'train_runtime': 4585.4902, 'train_samples_per_second': 1.653, 'train_steps_per_second': 0.207, 'total_flos': 3049263381196800.0, 'train_loss': 3.176914617482117})

## Save model

In [18]:
# Persist the trained adapter and the tokenizer side by side in one folder.
adapter_output_dir = "./fine_tuned_llama_adapter"
trainer.model.save_pretrained(adapter_output_dir)
tokenizer.save_pretrained(adapter_output_dir)

('./fine_tuned_llama_adapter\\tokenizer_config.json',
 './fine_tuned_llama_adapter\\special_tokens_map.json',
 './fine_tuned_llama_adapter\\tokenizer.json')

## Inference with the fine-tuned model

In [1]:
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer, pipeline
import torch
import os

model_name = "meta-llama/Llama-3.2-3B"
OFFLOAD_DIRECTORY = "./model_offload_cache"
os.makedirs(OFFLOAD_DIRECTORY, exist_ok=True)

print(f"Loading base model: {model_name}...")
# Same 4-bit NF4 settings as used for training, so the adapter sees the same base weights.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16,
    # The offload directory was created above but never handed to the loader;
    # pass it so it is actually available if "auto" placement needs to offload.
    offload_folder=OFFLOAD_DIRECTORY,
)
print("Base model loaded.")

tokenizer_inference = AutoTokenizer.from_pretrained(model_name)
if tokenizer_inference.pad_token is None:
    tokenizer_inference.pad_token = tokenizer_inference.eos_token
# Decoder-only models continue from the last token of the input, so padding
# must go on the left for generation (right padding would put pad tokens
# between the prompt and the generated text in batched calls).
tokenizer_inference.padding_side = "left"

print("Loading PEFT adapter and attaching to base model...")
model_inference = PeftModel.from_pretrained(base_model, "./fine_tuned_llama_adapter")
print("PEFT adapter loaded and attached.")

model_inference.eval()

# The model is already instantiated and placed on its device(s) above, so the
# loading-only options (torch_dtype, device_map) are not repeated here.
generator = pipeline(
    "text-generation",
    model=model_inference,
    tokenizer=tokenizer_inference,
)

# Test the model
prompt = "Bougie girl, grab my hand "

# Sampling is stochastic; seed torch so re-running the cell reproduces the output.
torch.manual_seed(42)

outputs = generator(
    prompt,
    max_new_tokens=100,
    num_return_sequences=1,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.1,
    pad_token_id=tokenizer_inference.pad_token_id
)

print(outputs[0]["generated_text"])

Loading base model: meta-llama/Llama-3.2-3B...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Base model loaded.


Device set to use cuda:0


Loading PEFT adapter and attaching to base model...
PEFT adapter loaded and attached.
Bougie girl, grab my hand 24/7I know I'm the man


## Push to Hugging Face

In [3]:
from huggingface_hub import HfApi
import os

repo_id = "anubhutiv1/llama_kanye_best"

# Uses the locally stored Hugging Face credentials (e.g. from `huggingface-cli login`).
api = HfApi()

local_adapter_path = "./fine_tuned_llama_adapter"

# upload_folder requires the target repo to exist; create it on first run and
# do nothing when it is already there.
api.create_repo(repo_id=repo_id, repo_type="model", exist_ok=True)

print(f"Uploading files from {local_adapter_path} to {repo_id}...")

api.upload_folder(
    folder_path=local_adapter_path,
    repo_id=repo_id,
    repo_type="model",
    commit_message="Upload initial LoRA adapter and tokenizer",
)

# Reached only when the upload did not raise.
print("LoRA adapter and tokenizer successfully pushed to Hugging Face Hub!")
print(f"You can view your model at: https://huggingface.co/{repo_id}")

Uploading files from ./fine_tuned_llama_adapter to anubhutiv1/llama_kanye_best...


adapter_model.safetensors:   0%|          | 0.00/9.19M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

LoRA adapter and tokenizer successfully pushed to Hugging Face Hub!
You can view your model at: https://huggingface.co/anubhutiv1/llama_kanye_best
