In [82]:
import torch
import glob
import pandas as pd
import numpy as np
import re
from peft import get_peft_model, PeftConfig, PeftModel, LoraConfig, prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, GenerationConfig
from trl import SFTTrainer
from datasets import Dataset

In [99]:
#installed libraries (venv used exclusively for this notebook)
%pip list

Package                 Version
----------------------- ------------
absl-py                 2.1.0
accelerate              0.27.2
aiohttp                 3.9.5
aiosignal               1.3.1
asttokens               2.4.1
attrs                   23.2.0
bitsandbytes            0.43.1
certifi                 2024.2.2
charset-normalizer      3.3.2
colorama                0.4.6
comm                    0.2.2
datasets                2.19.0
debugpy                 1.8.1
decorator               5.1.1
dill                    0.3.8
executing               2.0.1
filelock                3.13.1
frozenlist              1.4.1
fsspec                  2024.2.0
grpcio                  1.62.2
huggingface-hub         0.22.2
idna                    3.7
intel-openmp            2021.4.0
ipykernel               6.29.4
ipython                 8.24.0
ipywidgets              8.1.2
jedi                    0.19.1
Jinja2                  3.1.3
jupyter_client          8.6.1
jupyter_core            5.7.2
jupyterlab_wid

In [83]:
#Dataset
df = pd.read_csv('Metallica_songs.csv', sep=';')
df.head()

Unnamed: 0,Song,Author_band,Songwriters,Duration,Album,Album_type,No_on_album,Release_date,Link,Lyrics
0,2 x 4,Metallica,"James Hetfield,Lars Ulrich,Kirk Hammett",5:28,Load,Album,2.0,1996-06-04,https://www.metallica.com/songs/2x4.html,"I’m gonna make you shake you take you,I’m gonn..."
1,53rd & 3rd,The Ramones,Douglas Colvin,2:21,We're a Happy Family: A Tribute to Ramones,Compilation,4.0,2003-02-11,https://www.metallica.com/songs/53rd-and-3rd.html,"If you think you can well come on man,I was a ..."
2,72 Seasons,Metallica,"James Hetfield,Lars Ulrich,Kirk Hammett",7:39,72 Seasons,Album,1.0,2023-04-14,https://www.metallica.com/songs/72-seasons.html,"Feeding on the wrath of man,Shot down,Traumati..."
3,Ain’t My Bitch,Metallica,"James Hetfield,Lars Ulrich",5:04,Load,Album,1.0,1996-06-04,https://www.metallica.com/songs/aint-my-bitch....,"Outta my way,Outta my day,Out of your mind and..."
4,All Day and All of the Night,The Kinks,Ray Davies,,,,,,https://www.metallica.com/songs/all-day-and-al...,"I’m not content to be with you in the daytime,..."


In [85]:
#Checking lyrics
lyrics = '\n'.join(df.loc[:,'Lyrics']).replace(",", "\n")
print(lyrics[:200])

I’m gonna make you shake you take you
I’m gonna be the one who breaks you
Put the screws to you yeah my way
Yeah come on and come on come and make my day
Make my day
 
Got some hell to pay I steal you


In [86]:
#All characters
print(' '.join(sorted(set(lyrics))))

	 
   ! & ' ( ) - . 1 2 3 4 5 6 7 8 9 : ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z a b c d e f g h i j k l m n o p q r s t u v w x y z   Æ é – ‘ ’ “ ” …


In [87]:
#Cleaning special characters
cleaned_lyrics = re.sub(r"[^a-zA-Z0-9 ,'.!\n]", '', lyrics)
print(''.join(sorted(set(cleaned_lyrics))))


 !'.123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [88]:
#Splitting
split_point = int(len(cleaned_lyrics)*0.95)
train_data = cleaned_lyrics[:split_point]
test_data = cleaned_lyrics[split_point:]
train_data_seg = []
for i in range(0, len(train_data), 500):
        text = train_data[i:min(i+500, len(train_data))]
        train_data_seg.append(text)
train_data_seg = Dataset.from_dict({'text':train_data_seg})
print(len(train_data_seg))

448


In [44]:
#Log in to HugFace
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [45]:
#Pulling model
model_name = "PY007/TinyLlama-1.1B-step-50K-105b"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,           
    bnb_4bit_quant_type="nf4",    
    bnb_4bit_use_double_quant=True, 
    bnb_4bit_compute_dtype=torch.bfloat16, 
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config, 
    device_map="auto",  
    trust_remote_code=True, 
)

config.json:   0%|          | 0.00/607 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/4.40G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

In [46]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [91]:
# Generating lyrics with the base model
device = 'cuda' if torch.cuda.is_available() else 'cpu'
def generate_lyrics(query, model):
    encoding = tokenizer(query, return_tensors="pt").to(device)
    generation_config = GenerationConfig(max_new_tokens=250, pad_token_id = tokenizer.eos_token_id,repetition_penalty=1.3, eos_token_id = tokenizer.eos_token_id)
    outputs = model.generate(input_ids=encoding.input_ids, generation_config=generation_config)
    text_output = tokenizer.decode(outputs[0],skip_special_tokens=True)    
    output = text_output[len(query):]
    return output

text_output = generate_lyrics(test_data[2100:2600], model)



In [93]:
print(text_output.replace(",", "\n"))

unk or sober I feel like Im dead
 
Its all about money and power
And if you dont have your cake and eat it too
Youll never know whats going on inside
 
So get up off that rock and run away
Get up off that rock and run away
From those who would steal your life
Now isnt time for that man
Too late to save him from his own wrath
 
The world has changed since then
He was born into poverty but he grew up rich
Into an unjust society
A new kind of oppression
That no one can escape
 
Were gonna kill them with our guns
We gotta make sure nobody gets hurt
Just because we are poor doesnt mean we cant fight back
 
Shocking news coming through loud and clear
It seems the whole world is against us
Everyone wants their piece of the pie
All want their share of the loot
 
Theres nothing wrong with being rich
No matter how much you earn
There is always something wrong with being poor
Because everyone else does aswell
 
Poverty is a lie
Lies told by thieves


In [94]:
# Setting parameters
model = prepare_model_for_kbit_training(model)

lora_alpha = 32
lora_dropout = 0.05 
lora_rank = 32 

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_rank,
    bias="none",  
    task_type="CAUSAL_LM")

peft_model = get_peft_model(model, peft_config)

In [95]:
# Setting train args
output_dir = "bnalyv/tinyllama" # Model repo HugFace destination
per_device_train_batch_size = 3
gradient_accumulation_steps = 2  
optim = "paged_adamw_32bit" 
save_strategy="steps" 
save_steps = 10 
logging_steps = 10  
learning_rate = 2e-3  
max_grad_norm = 0.3 
max_steps = 200     
warmup_ratio = 0.03 
lr_scheduler_type = "cosine" 

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    push_to_hub=True,
    report_to='none'
)

In [96]:
trainer = SFTTrainer(
    model=peft_model,
    train_dataset=train_data_seg,
    peft_config=peft_config,
    max_seq_length=500,
    dataset_text_field='text',
    tokenizer=tokenizer,
    args=training_arguments
)
peft_model.config.use_cache = False

Map:   0%|          | 0/448 [00:00<?, ? examples/s]

In [97]:
#Training
trainer.train()

  0%|          | 0/200 [00:00<?, ?it/s]

Checkpoint destination directory bnalyv/tinyllama\checkpoint-10 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 3.1583, 'grad_norm': 0.4162677228450775, 'learning_rate': 0.0019979028262377117, 'epoch': 0.13}


Checkpoint destination directory bnalyv/tinyllama\checkpoint-20 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 2.9247, 'grad_norm': 0.31791678071022034, 'learning_rate': 0.001974410524646926, 'epoch': 0.27}


Checkpoint destination directory bnalyv/tinyllama\checkpoint-30 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 2.9225, 'grad_norm': 0.318973571062088, 'learning_rate': 0.0019254212296427042, 'epoch': 0.4}


Checkpoint destination directory bnalyv/tinyllama\checkpoint-40 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 2.9696, 'grad_norm': 0.3652134835720062, 'learning_rate': 0.0018522168236559692, 'epoch': 0.53}


Checkpoint destination directory bnalyv/tinyllama\checkpoint-50 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 2.9225, 'grad_norm': 0.32852327823638916, 'learning_rate': 0.0017567128158176952, 'epoch': 0.67}


Checkpoint destination directory bnalyv/tinyllama\checkpoint-60 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 2.8776, 'grad_norm': 0.3380332291126251, 'learning_rate': 0.00164140821963114, 'epoch': 0.8}


Checkpoint destination directory bnalyv/tinyllama\checkpoint-70 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 2.9605, 'grad_norm': 0.3779241144657135, 'learning_rate': 0.001509320162328763, 'epoch': 0.93}


Checkpoint destination directory bnalyv/tinyllama\checkpoint-80 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 2.7922, 'grad_norm': 0.3565402626991272, 'learning_rate': 0.0013639049369634877, 'epoch': 1.07}


Checkpoint destination directory bnalyv/tinyllama\checkpoint-90 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 2.6236, 'grad_norm': 0.3221951425075531, 'learning_rate': 0.0012089675630312753, 'epoch': 1.2}


Checkpoint destination directory bnalyv/tinyllama\checkpoint-100 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 2.6333, 'grad_norm': 0.37463831901550293, 'learning_rate': 0.0010485622221144484, 'epoch': 1.33}


Checkpoint destination directory bnalyv/tinyllama\checkpoint-110 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 2.5849, 'grad_norm': 0.36563369631767273, 'learning_rate': 0.0008868861738047158, 'epoch': 1.47}


Checkpoint destination directory bnalyv/tinyllama\checkpoint-120 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 2.5607, 'grad_norm': 0.37835443019866943, 'learning_rate': 0.0007281699277636571, 'epoch': 1.6}


Checkpoint destination directory bnalyv/tinyllama\checkpoint-130 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 2.6775, 'grad_norm': 0.37425386905670166, 'learning_rate': 0.0005765665457425102, 'epoch': 1.73}


Checkpoint destination directory bnalyv/tinyllama\checkpoint-140 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 2.6229, 'grad_norm': 0.3600066006183624, 'learning_rate': 0.0004360429701490934, 'epoch': 1.87}


Checkpoint destination directory bnalyv/tinyllama\checkpoint-150 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 2.6042, 'grad_norm': 0.5924780964851379, 'learning_rate': 0.00031027622272189573, 'epoch': 2.0}


Checkpoint destination directory bnalyv/tinyllama\checkpoint-160 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 2.5181, 'grad_norm': 0.39385610818862915, 'learning_rate': 0.0002025571894372794, 'epoch': 2.13}


Checkpoint destination directory bnalyv/tinyllama\checkpoint-170 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 2.4489, 'grad_norm': 0.48114246129989624, 'learning_rate': 0.00011570450926997656, 'epoch': 2.27}


Checkpoint destination directory bnalyv/tinyllama\checkpoint-180 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 2.3336, 'grad_norm': 0.4597238600254059, 'learning_rate': 5.199082004372957e-05, 'epoch': 2.4}


Checkpoint destination directory bnalyv/tinyllama\checkpoint-190 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 2.2814, 'grad_norm': 0.4304306209087372, 'learning_rate': 1.3083291266109298e-05, 'epoch': 2.53}


Checkpoint destination directory bnalyv/tinyllama\checkpoint-200 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 2.384, 'grad_norm': 0.4029560387134552, 'learning_rate': 0.0, 'epoch': 2.67}
{'train_runtime': 460.198, 'train_samples_per_second': 2.608, 'train_steps_per_second': 0.435, 'train_loss': 2.6900500679016113, 'epoch': 2.67}


TrainOutput(global_step=200, training_loss=2.6900500679016113, metrics={'train_runtime': 460.198, 'train_samples_per_second': 2.608, 'train_steps_per_second': 0.435, 'train_loss': 2.6900500679016113, 'epoch': 2.67})

In [98]:
# Generating lyrics with fine-tuned model
text_output_ft = generate_lyrics(test_data[2100:2600], model)

print(text_output_ft.replace(",", "\n"))



unk or sober Ive got no use for words
Its all gone wrong and Im left with nothing but pain
And thats what Im feeling
 
Screaming into my face
My eyes are open wide shut
The blood is pouring down my nose
Theres something that needs killing
It wont let go until im dead
 
Blood everywhere
All over the place
Living inside your head
You cant escape the fact that life has been taken away
No matter how much you try
Your body can never get used to being alive
So long as there is breath
Then Ill take it like a maniac
 
Walking through the streets
Ain't gonna give up
Just because I am strong
That doesn't mean I have to die
 
Stupid boy who thought he was cool
He thinks his name is Rage
Rage
 
Damned if I dont kill him
Cause I know better than to trust myself
Into my mind I knew better than to believe
Now I feel like Im crazy
 
Painting my face red
With blood coming from my ears
Tears streaming
