# Preliminaries

In [1]:
from google.colab import drive

drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
!pip install transformers==4.28.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m60.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.28.0)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m117.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.15.1 tokenizers-0.13.3 transfor

In [3]:
import re
import torch
import pandas as pd
from transformers import TrainingArguments, Trainer, GPTNeoForCausalLM, GPT2TokenizerFast
from tqdm import tqdm

In [4]:
MODELS_FOLDER = "/content/drive/MyDrive/NLP Finetuning/Finetuned Models/"

In [5]:
import random
import numpy as np

In [6]:
torch.cuda.empty_cache()

tokenizer = GPT2TokenizerFast.from_pretrained('EleutherAI/gpt-neo-125m', pad_token='<|pad|>')

torch.cuda.empty_cache()

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

Downloading (…)okenizer_config.json:   0%|          | 0.00/560 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
if torch.cuda.is_available():
  device = torch.device('cuda')

else:
  device = torch.device('cpu')

print(device)

cuda


In [8]:
!nvidia-smi

Wed Jun 21 08:23:28 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P8    12W /  70W |      3MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Load test datasets

In [9]:
rock_data = pd.read_csv("/content/drive/MyDrive/NLP Finetuning/Genre Datasets/en_Rock_Lyrics.csv")
pop_data = pd.read_csv("/content/drive/MyDrive/NLP Finetuning/Genre Datasets/en_Pop_Lyrics.csv")
indie_data = pd.read_csv("/content/drive/MyDrive/NLP Finetuning/Genre Datasets/en_Indie_Lyrics.csv")
hip_hop_data = pd.read_csv("/content/drive/MyDrive/NLP Finetuning/Genre Datasets/en_Hip Hop_Lyrics.csv")
rap_data = pd.read_csv("/content/drive/MyDrive/NLP Finetuning/Genre Datasets/en_Rap_Lyrics.csv")

In [10]:
def get_eval_data(data):

  data['length'] = data['Lyric'].apply(lambda x:len(str(x).split()))
  data = data[data['length'] <= 200]

  data = data.drop(['Unnamed: 0', 'Genres', 'length'], axis=1)

  return data.tail(2000)

In [11]:
rock_data = get_eval_data(rock_data)
pop_data = get_eval_data(pop_data)
indie_data = get_eval_data(indie_data)
hip_hop_data = get_eval_data(hip_hop_data)
rap_data = get_eval_data(rap_data)

# Load models of each genre

In [12]:
rock = GPTNeoForCausalLM.from_pretrained('/content/drive/MyDrive/NLP Finetuning/Finetuned Models/Rock Lyrics Generator')
pop = GPTNeoForCausalLM.from_pretrained('/content/drive/MyDrive/NLP Finetuning/Finetuned Models/Pop Lyrics Generator')
indie = GPTNeoForCausalLM.from_pretrained('/content/drive/MyDrive/NLP Finetuning/Finetuned Models/Indie Lyrics Generator')
hip_hop = GPTNeoForCausalLM.from_pretrained('/content/drive/MyDrive/NLP Finetuning/Finetuned Models/Hip Hop Lyrics Generator')
rap = GPTNeoForCausalLM.from_pretrained('/content/drive/MyDrive/NLP Finetuning/Finetuned Models/Rap Lyrics Generator')

# Perplexity score

In [17]:
def get_perplexity(model, eval_data):

  model.to('cuda')
  encodings = tokenizer('\n\n'.join(eval_data['Lyric']), return_tensors = 'pt')

  eval_max_length = max([len(re.findall('\w+', x)) for x in eval_data['Lyric']])
  stride = 32
  seq_len = encodings.input_ids.size(1)

  nlls = []
  prev_end_loc = 0
  for begin_loc in tqdm(range(0, seq_len, stride)):
      end_loc = min(begin_loc + eval_max_length, seq_len)
      trg_len = end_loc - prev_end_loc
      input_ids = encodings.input_ids[:, begin_loc:end_loc].to('cuda')
      target_ids = input_ids.clone()
      target_ids[:, :-trg_len] = -100

      with torch.no_grad():
          outputs = model(input_ids, labels=target_ids)

          neg_log_likelihood = outputs.loss

      nlls.append(neg_log_likelihood)

      prev_end_loc = end_loc
      if end_loc == seq_len:
          break

  ppl = torch.exp(torch.stack(nlls).mean())

  return ppl

In [19]:
print(f'Perplexity of rock lyrics generator model: { get_perplexity(rock, rock_data) }')

100%|█████████▉| 11689/11698 [06:30<00:00, 29.93it/s]


Perplexity of rock lyrics generator model: 52.25809860229492


In [20]:
print(f'Perplexity of pop lyrics generator model: { get_perplexity(pop, pop_data) }')

100%|█████████▉| 12950/12959 [07:33<00:00, 28.55it/s]


Perplexity of pop lyrics generator model: 376.931396484375


In [21]:
print(f'Perplexity of indie lyrics generator model: { get_perplexity(indie, indie_data) }')

100%|█████████▉| 11387/11394 [05:15<00:00, 36.06it/s]


Perplexity of indie lyrics generator model: 31.69938850402832


In [22]:
print(f'Perplexity of hip hop lyrics generator model: { get_perplexity(hip_hop, hip_hop_data) }')

100%|█████████▉| 10200/10207 [04:48<00:00, 35.29it/s]


Perplexity of hip hop lyrics generator model: 9.732035636901855


In [23]:
print(f'Perplexity of rap lyrics generator model: { get_perplexity(rap, rap_data) }')

100%|█████████▉| 11780/11788 [06:27<00:00, 30.40it/s]


Perplexity of rap lyrics generator model: 7705.46533203125
