# CS510 NLP Final Project

Authors - Will McIntosh & Ian Wasson

# Installs and Imports

## Packages

In [None]:
import locale


def getpreferredencoding(do_setlocale=True):
    """Report UTF-8 as the preferred encoding.

    The ``do_setlocale`` argument is accepted only so the signature matches
    the standard-library function this replaces; its value is ignored.
    """
    return "UTF-8"


# Swap the stdlib lookup for the stub above so that every later call to
# locale.getpreferredencoding() answers "UTF-8".
# NOTE(review): presumably a workaround for a hosted-notebook runtime that
# reports a non-UTF-8 locale and then rejects `!` shell commands -- confirm.
locale.getpreferredencoding = getpreferredencoding

In [None]:
%%capture
!pip install -q -U bitsandbytes==0.41.2.post2
!pip install -q -U einops==0.7.0
!pip install -q -U safetensors==0.4.0
!pip install -q -U torch==2.1.0+cu118
!pip install -q -U xformers==0.0.22.post7
!pip install -q -U datasets==2.14.6
!pip install -q -U transformers==4.35.0
!pip install -q -U peft==0.6.1
!pip install -q -U accelerate==0.24.1

## Libraries

In [None]:
%%capture
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import transformers
import torch
from torch.utils.data import Dataset as TorchDataset
from datasets import Dataset
from transformers import AutoTokenizer
from peft import LoraConfig, get_peft_model
from transformers import AutoTokenizer

# Load Model

In [None]:
# Release cached GPU memory that PyTorch's allocator is holding but not
# using, so the model load below starts with as much free VRAM as possible.
torch.cuda.empty_cache()

In [None]:
def get_falcon_model(model_id="tiiuae/falcon-7b"):
  """Load a 4-bit quantized causal LM and wrap it with a LoRA adapter.

  Args:
    model_id: Hugging Face Hub id (or local path) of the checkpoint to load.
      Defaults to the Falcon-7B base model. The LoRA ``target_modules``
      below name the ``query_key_value`` modules, so a checkpoint whose
      attention layers are named differently needs that list adjusted.

  Returns:
    A ``(model, model_id)`` tuple: the PEFT-wrapped model, and the id it was
    loaded from (callers reuse it to load the matching tokenizer).
  """

  # 4-bit NF4 quantization with double quantization; compute in bfloat16.
  bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
  )

  # Load the pretrained base model; device_map={"": 0} places the whole
  # model on device 0.
  model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": 0},
    trust_remote_code=False,
  )

  # PEFT (Parameter-Efficient Fine-Tuning) with LoRA (Low-Rank Adaptation):
  # only the adapter weights attached to the target modules are trained.
  config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
  )

  model = get_peft_model(model, config)
  # Cast floating-point parameters and buffers to float32.
  # NOTE(review): the packed 4-bit weights are presumably left untouched
  # because they are not stored in a floating-point dtype -- confirm.
  model = model.float()
  model.to('cuda')

  return model, model_id

## Get Falcon Model

In [None]:
#%%capture
# Build the quantized, LoRA-wrapped model once. `model_id` is kept so the
# matching tokenizer can be loaded from the same checkpoint later.
model, model_id = get_falcon_model()

Downloading config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/16.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

# Prepare Data

## Load Data

In [None]:
%%capture
import pandas as pd
import numpy as np
import sys, os # Importing data

In [None]:
%%capture

# downloads the .csv files from google drive only if isn't already in directory
path = "/content/spotify_millsongdata.csv"
if os.path.isfile(path) == False:
  !gdown --id 1wGtLywxyCq858JTVtizWHR5dtIf4Di8v

In [None]:
def select_only_desired_artist(artist, fdf):
  """Return one row per non-empty lyric line for a single artist.

  Args:
    artist: Value matched exactly against the ``artist`` column.
    fdf: DataFrame with ``artist``, ``title`` and ``lyrics`` columns, where
      each ``lyrics`` cell holds a whole song as one string. It is not
      modified.

  Returns:
    A new DataFrame with columns ``title`` and ``lyrics``. Every non-empty
    line of every matching song becomes its own row, paired with that
    song's title. The frame has both columns but no rows when nothing
    matches.
  """
  songs = fdf[fdf["artist"] == artist]
  new_df = {"title": [], "lyrics": []}

  # Read titles and lyrics from the frame that was passed in. Pairing the
  # two columns positionally avoids any dependence on a module-level
  # DataFrame or on index labels.
  for title, lyrics in zip(songs["title"], songs["lyrics"]):
    # splitlines() handles both "\n" and "\r\n"; filter(bool, ...) drops
    # zero-length lines (whitespace-only lines are kept as they are).
    for lyric in filter(bool, lyrics.splitlines()):
      new_df["title"].append(title)
      new_df["lyrics"].append(lyric)

  return pd.DataFrame(data=new_df)

# Load only the columns that are needed and give them the names the rest of
# the notebook expects.
df = (
    pd.read_csv(path, usecols=["artist", "song", "text"])
    .rename(columns={"song": "title", "text": "lyrics"})
)

# Artist whose songs form the training set. The name is reused later when
# printing generated songs and when naming the saved model directory.
artist = "Rihanna"
df = select_only_desired_artist(artist, df)

# Wrap the one-lyric-line-per-row DataFrame in a Hugging Face Dataset.
dataset = Dataset.from_pandas(df)

## Tokenize and Encode Dataset

In [None]:
def get_tokenized_dataset(dataset, model_id, partial_performance=True):
  """Build the training text for every row and tokenize it in one batch.

  Args:
    dataset: Hugging Face ``Dataset`` with ``title`` and ``lyrics`` columns.
    model_id: Checkpoint id whose tokenizer is loaded.
    partial_performance: Accepted for callers that pass it; this function
      does not read it.

  Returns:
    ``(tokenized_dataset, tokenizer)``: the tokenizer output for all rows
    (PyTorch tensors, padded, not truncated) and the tokenizer itself.
  """
  tokenizer = AutoTokenizer.from_pretrained(model_id)
  # Use the end-of-sequence token for padding.
  # NOTE(review): presumably needed because this tokenizer defines no pad
  # token of its own -- confirm.
  tokenizer.pad_token = tokenizer.eos_token

  def build_input_text(example):
    # One training string per lyric line: fixed artist label, then the song
    # title, then the line itself.
    # NOTE(review): the label is hard-coded and spelled "Rhianna"; the
    # generation prompt in generate_new_song uses the same spelling, so
    # change both together if this is corrected.
    pieces = (
        "Artist: Rhianna",
        ": song :",
        example['title'],
        "->",
        example['lyrics'],
    )
    return {'input_text': "".join(pieces)}

  with_text = dataset.map(build_input_text)

  # Tokenize every string at once; rows are padded to a common length and
  # never truncated.
  encoded = tokenizer(
      with_text['input_text'],
      truncation=False,
      padding=True,
      return_tensors='pt',
  )

  return encoded, tokenizer

In [None]:
# Tokenize the whole lyric-line dataset with the tokenizer that belongs to
# `model_id`. `partial_performance` is passed here, but the function does
# not read it.
tokenized_dataset, tokenizer = get_tokenized_dataset(dataset, model_id, partial_performance=False)

Map:   0%|          | 0/9365 [00:00<?, ? examples/s]

In [None]:
class TextDataset(TorchDataset):
    """Map-style dataset over tokenizer output for causal-LM fine-tuning.

    Each item holds one row of every field in ``encodings`` plus a
    ``labels`` entry that is a copy of that row's ``input_ids``.
    """

    def __init__(self, encodings):
        # Mapping of field name -> per-sample sequences (tensors or nested
        # lists); it must contain an "input_ids" entry.
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors independent of the source."""
        item = {}
        for key, val in self.encodings.items():
            row = val[idx]
            if isinstance(row, torch.Tensor):
                # torch.tensor(existing_tensor) emits a copy-construct
                # UserWarning on every access; clone().detach() makes the
                # same independent copy without the warning.
                item[key] = row.clone().detach()
            else:
                item[key] = torch.tensor(row)
        item["labels"] = item["input_ids"].clone()
        return item

    def __len__(self):
        """Return the number of samples (rows of ``input_ids``)."""
        return len(self.encodings["input_ids"])

In [None]:
# Wrap the tokenizer output in a PyTorch dataset so the Trainer can fetch
# one sample (with labels) at a time.
train_dataset_pytorch = TextDataset(tokenized_dataset)

# Example Before Fine Tuning

In [None]:
def generate_new_song(title, artist, model, tokenizer):
    """Sample a song continuation for ``title`` and print it.

    Args:
        title: Song title inserted into the prompt.
        artist: Name shown in the printed header only; the prompt itself
            uses a fixed artist string.
        model: Causal language model handed to the generation pipeline.
        tokenizer: Tokenizer paired with ``model``.

    Returns:
        None. The generated text is printed, not returned.
    """
    # Prompt the model continues from.
    # NOTE(review): the artist in the prompt is hard-coded and spelled
    # "Rhianna", the same spelling used in the training text built by
    # get_tokenized_dataset -- keep the two in sync if either changes.
    prompt = f"Title: {title}\nArtist: Rhianna\nLyrics:"

    # Text-generation pipeline around the already-loaded model.
    pipeline_options = dict(
        model=model,
        tokenizer=tokenizer,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    song_generator = pipeline("text-generation", **pipeline_options)

    # Sampling settings: one sequence, capped at 200 tokens.
    # NOTE(review): top_k=0 presumably switches top-k filtering off so that
    # sampling draws from the full distribution -- confirm in the
    # generation docs.
    sampling_options = dict(
        max_length=200,
        do_sample=True,
        top_k=0,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )
    sequences = song_generator(prompt, **sampling_options)

    # Header, then every returned sequence on its own line(s).
    print(f"In the Style of: {artist}")
    print("Generated Song:")
    for result in sequences:
        print(result['generated_text'])

In [None]:
# Baseline sample taken before any training step, for comparison with the
# output produced after fine-tuning.
generate_new_song("Take on Me", artist, model, tokenizer)

In the Style of: Rihanna
Generated Song:
Title: Take on Me
Lyrics:
Now the only thing we need is maybe someday
When the heroes of our lives will feel just like me (Just like me!) (Just like me)
And tomorrow I'm afraid we'll all be nothing again
Just trying to be players on our likable teams
Sélection des chansons du moment


# Training

In [None]:
from transformers import TrainingArguments, TrainerCallback, Trainer
from tqdm.auto import tqdm

# Trainer callback that draws a progress bar.
class ProgressCallback(TrainerCallback):
    """Show a tqdm bar that advances each time the Trainer ends a step."""

    def on_train_begin(self, args, state, control, **kwargs):
        """Create the bar, sized to the total number of training steps."""
        bar = tqdm(total=state.max_steps)
        bar.set_description("Training")
        self.progress_bar = bar

    def on_step_end(self, args, state, control, **kwargs):
        """Advance the bar by one step."""
        self.progress_bar.update()

    def on_train_end(self, args, state, control, **kwargs):
        """Close the bar once training has finished."""
        self.progress_bar.close()

# Free cached GPU memory before training starts.
torch.cuda.empty_cache()

# Training hyperparameters.
# NOTE(review): both num_train_epochs and max_steps are set. Per the
# TrainingArguments documentation a positive max_steps takes precedence, so
# the run is bounded by 100 optimizer steps rather than 100 epochs.
training_args = TrainingArguments(
    output_dir="outputs",
    num_train_epochs=100,
    max_steps=100,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    lr_scheduler_type='cosine',
    warmup_ratio=0.05,
    optim="paged_adamw_8bit",
    fp16=True,
    logging_strategy="steps",
    logging_steps=25,
)

# Causal-LM collation (mlm=False: no masked-language-model masking).
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Trainer wired up with the custom progress-bar callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_pytorch,
    data_collator=causal_lm_collator,
    callbacks=[ProgressCallback()],
)

# Disable the generation cache while training to silence the related
# warnings; it must be switched back on before running inference.
model.config.use_cache = False

# Cast every sub-module whose qualified name contains "norm" to float32
# (Module.to converts in place, so no re-assignment is needed).
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module.to(torch.float)

# Run the fine-tuning loop.
trainer.train()

  0%|          | 0/100 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss
25,1.477
50,1.3785
75,1.3546
100,1.3765


TrainOutput(global_step=100, training_loss=1.396648826599121, metrics={'train_runtime': 130.7498, 'train_samples_per_second': 12.237, 'train_steps_per_second': 0.765, 'total_flos': 2928236457984000.0, 'train_loss': 1.396648826599121, 'epoch': 0.17})

# Example After Fine Tuning

In [None]:
%%capture
# Turn the generation cache back on (it was disabled for training) and put
# the model into evaluation mode before sampling.
model.config.use_cache = True
model.eval()

In [None]:
# Same title as the pre-training example, now sampled after fine-tuning.
generate_new_song("Take on Me", artist, model, tokenizer)

In the Style of: Rihanna
Generated Song:
Title: Take on Me
Artist: Rhianna
Lyrics: 's crying his eyes out
And all of his dreams will come true
Shouldn't you be begging me
To be the one -- a-a in his life?
I hope he sees the light
Decide, oh, dilemma
Now in the sunshine of his sun
Shines the brightest where every bird can fly
Does she have a shotgun?
And it's just a waste of time, oh-oh, see, see, see, see, now, now we're moving on
It's time to tilt the balance again
Now are you in line?
So let me call on you and let me call you
And when we're dancing
We scatter diamonds to the wind
Oh-oh, see, see, see, see, see, now
We're taking on me...
Put your hands up!
Oh!


In [None]:
# Second post-training sample with a different title.
# NOTE(review): presumably a title that also occurs in the training data --
# verify against the filtered dataset.
generate_new_song("Umbrella", artist, model, tokenizer)

In the Style of: Rihanna
Generated Song:
Title: Umbrella
Artist: Rhianna
Lyrics:
Ha, yeah, uh,
Uh, baby, yeah, yo, girl, I owe you a good time, uh, oh (you know what I mean)
I keep it locked, tucked in tight so I'm feeling good
You keep me wet, no I won't fold, oh, I'm so bad, so damn naughty, uh (oh)
And you make me feel bad bad bad bad
And I can't feel my fingers, oh, yeah, oh, yeah, silly thing to think, right? (Silly thing to think)
So you make me then make me straight, make me not know what I'm doing
I just can't believe the rise in the roof then I blush, you know, what I'm thinking, no
And you make me feel sad sad sad sad sad, oh
You know the


In [None]:
# Third post-training sample with another title.
generate_new_song("YMCA", artist, model, tokenizer)

In the Style of: Rihanna
Generated Song:
Title: YMCA
Artist: Rhianna
Lyrics:
I'm all about loving you,
And all that I've been dreaming,
Of all the promises we made,
Now broken has left me filled, with so much to realize,
For why not be happy,
And pulling off the brakes,
When you're about to break through, oh boy,
So let me know,
And even though I hate to admit it,
I'll be right there with you,
So I can be,
Like be like (I want to),
The way you want me to,
I want it like that,
And I'm far far far're not afraid of falling in love with you,
I love you like crazy,
And there is nothing you wouldn't do,
Nothing that I would not do,
For you.
And why not be happy, baby,
And I can


In [None]:
# Save the fine-tuned model under a directory named after the artist.
# NOTE(review): `model` is the PEFT wrapper, so this presumably writes only
# the LoRA adapter weights and config, not the full base model -- confirm.
model.save_pretrained(f"/content/{artist}_model")