## Import Libraries

# CS510 NLP Final Project

**Author** - Anamika Nayak

In [None]:
%%capture
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
import pandas as pd
import numpy as np
import sys, os # Importing data
import torch
from torch.utils.data import Dataset, DataLoader
!pip install transformers[torch]
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, Trainer, TrainingArguments
from transformers import TextDataset, DataCollatorForLanguageModeling
from tqdm import tqdm, trange
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

# Lift the pandas display limits so DataFrames are shown with all columns and all rows
# (None means "no limit" for both options).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; `path` is the project folder that later cells
# use to locate the lyrics CSV and to store the fine-tuned model.
drive.mount('/content/drive')
path = '/content/drive/MyDrive/NLPGroupProject'

Mounted at /content/drive


## Data

In [None]:
# downloads the .csv files from google drive only if it's not already in directory
csv_path = f"{path}/spotify_millsongdata.csv"
if os.path.isfile(csv_path) == False:
  !gdown --id 1wGtLywxyCq858JTVtizWHR5dtIf4Di8v

base_df = pd.read_csv(csv_path)
trng_set = base_df[['artist', 'text', 'song']]

# only select Rihanna
trng_set = trng_set[trng_set['artist'] == 'Rihanna']
print(f"Training list length - {len(trng_set)}")

Training list length - 143


In [None]:
def replace_newlines(input_string, replacement_symbol=";;"):
    """Encode every line break in ``input_string`` as ``replacement_symbol``.

    Handles Windows (``\\r\\n``), Unix (``\\n``) and bare carriage-return (``\\r``)
    line endings, so the result never contains a raw line break. This keeps each
    song on a single line when records are written one per line.

    Args:
        input_string: Text whose line breaks should be encoded.
        replacement_symbol: Marker substituted for each line break.

    Returns:
        The text with each line break replaced by ``replacement_symbol``.
    """
    # Normalise to '\n' first so a '\r\n' pair yields one marker, not two.
    normalized = input_string.replace('\r\n', '\n').replace('\r', '\n')
    return normalized.replace('\n', replacement_symbol)

In [None]:
# Drop rows with any missing field so every record has an artist, a title and lyrics.
trng_set = trng_set.dropna()

# Write one record per line in the form artist|title|lyrics, with lyric line breaks
# encoded by replace_newlines. The context manager closes the file even if a row
# fails, and the explicit UTF-8 encoding avoids depending on the platform default.
with open('Artists.txt', 'w', encoding='utf-8') as text_data:
  for _, item in trng_set.iterrows():
    lyrics = replace_newlines(item["text"])
    title = item["song"]
    artist = item["artist"]
    text_data.write(f"{artist}|{title}|{lyrics}\r\n")

In [None]:
def load_dataset(file_path, tokenizer, block_size = 128):
    """Create a ``TextDataset`` for the text file at ``file_path``.

    Args:
        file_path: Path of the training text file.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Block size forwarded to ``TextDataset``.

    Returns:
        The constructed ``TextDataset``.
    """
    return TextDataset(tokenizer=tokenizer, file_path=file_path, block_size=block_size)


def load_data_collator(tokenizer, mlm = False):
    """Create a ``DataCollatorForLanguageModeling`` for ``tokenizer``.

    Args:
        tokenizer: Tokenizer handed to the collator.
        mlm: Forwarded as the collator's ``mlm`` flag; off by default.

    Returns:
        The constructed data collator.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

In [None]:
def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
  """Fine-tune a pretrained GPT-2 checkpoint on a text file with the HF ``Trainer``.

  The tokenizer and the untrained model are written to ``output_dir`` before
  training starts; after training the model is saved through
  ``Trainer.save_model()``.

  Args:
      train_file_path: Text file used to build the training dataset.
      model_name: Name or path given to ``from_pretrained`` for the tokenizer
          and the model.
      output_dir: Directory for the tokenizer, checkpoints and the model.
      overwrite_output_dir: Forwarded to ``TrainingArguments``.
      per_device_train_batch_size: Forwarded to ``TrainingArguments``.
      num_train_epochs: Forwarded to ``TrainingArguments``.
      save_steps: Forwarded to ``TrainingArguments``.
  """
  gpt2_tokenizer = GPT2Tokenizer.from_pretrained(model_name)
  lm_dataset = load_dataset(train_file_path, gpt2_tokenizer)
  lm_collator = load_data_collator(gpt2_tokenizer)

  # The Trainer below is not given the tokenizer, so it is saved explicitly here.
  gpt2_tokenizer.save_pretrained(output_dir)

  lm_model = GPT2LMHeadModel.from_pretrained(model_name)
  lm_model.save_pretrained(output_dir)

  # Use the GPU when one is available and report which device was chosen.
  has_gpu = torch.cuda.is_available()
  device = torch.device("cuda" if has_gpu else "cpu")
  if has_gpu:
      print(f"Using GPU: {torch.cuda.get_device_name(0)}")
  else:
      print("Using CPU")
  lm_model.to(device)

  hf_trainer = Trainer(
      model=lm_model,
      args=TrainingArguments(
          output_dir=output_dir,
          overwrite_output_dir=overwrite_output_dir,
          per_device_train_batch_size=per_device_train_batch_size,
          num_train_epochs=num_train_epochs,
          save_steps=save_steps,
      ),
      data_collator=lm_collator,
      train_dataset=lm_dataset,
  )

  hf_trainer.train()
  hf_trainer.save_model()

In [None]:
# Training configuration passed to train(); adjust these values before starting a run.
# NOTE(review): the data cell writes 'Artists.txt' relative to the working directory,
# while this path is absolute; confirm both refer to the same file.
train_file_path = "/content/Artists.txt"
model_name = 'gpt2'
# output_dir is also read by generate_text as the model/tokenizer location.
output_dir = f'{path}/result'
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 3.0
save_steps = 500

In [None]:
# Run the fine-tuning with the parameters defined in the configuration cell.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Using GPU: Tesla T4


Step,Training Loss


## Inference

In [None]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

def load_model(model_path):
    """Return the ``GPT2LMHeadModel`` loaded from ``model_path`` via ``from_pretrained``."""
    return GPT2LMHeadModel.from_pretrained(model_path)


def load_tokenizer(tokenizer_path):
    """Return the ``GPT2Tokenizer`` loaded from ``tokenizer_path`` via ``from_pretrained``."""
    return GPT2Tokenizer.from_pretrained(tokenizer_path)


def generate_text(sequence, max_length, model=None, tokenizer=None, model_path=None):
    """Sample a continuation of ``sequence`` and format it as artist / title / lyrics.

    Args:
        sequence: Prompt of the form ``"artist|title"``. The text before the first
            ``|`` is used as the singer and the text after it as the song title.
            A prompt without ``|`` is accepted and yields an empty title.
        max_length: Passed to ``model.generate`` as ``max_length``.
        model: Optional preloaded model. When omitted it is loaded from
            ``model_path`` on every call, so pass it to avoid repeated loads.
        tokenizer: Optional preloaded tokenizer, loaded like ``model`` when omitted.
        model_path: Directory to load from when ``model`` or ``tokenizer`` is
            omitted. Defaults to the notebook-level ``output_dir``.

    Returns:
        A string with the singer, the song title and the generated lyrics, where
        each ``;;`` marker in the generated text has become a line break.
    """
    if model is None or tokenizer is None:
        # The global output_dir is only consulted when something must be loaded.
        if model_path is None:
            model_path = output_dir
        if model is None:
            model = load_model(model_path)
        if tokenizer is None:
            tokenizer = load_tokenizer(model_path)

    # Calling the tokenizer (rather than .encode) also yields the attention mask,
    # which generate() would otherwise have to infer.
    encoded = tokenizer(f'{sequence}', return_tensors='pt')
    generated = model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True).replace(';;','\r\n')
    # Everything up to the first line break is dropped. NOTE(review): presumably this
    # strips the echoed prompt, but it also discards any text generated on that line.
    lyrics = "\n".join(decoded.split('\r\n')[1:])

    # Tolerate a prompt with no '|' instead of raising IndexError after generation.
    prompt_parts = sequence.split('|')
    singer = prompt_parts[0]
    song_title = prompt_parts[1] if len(prompt_parts) > 1 else ''
    return f'{singer} \r\n\r\n{song_title} \r\n {lyrics}'

In [None]:
# Prompt in "artist | title" form and the max_length handed to generate_text.
sequence = "Rihanna | Take on me"
max_len = 250

In [None]:
# Sample lyrics for the current prompt and print the formatted result.
print(generate_text(sequence, max_len))

Rihanna  

 Take on me 
 I'm so cold, it's cold inside  
I gotta tell you how cold it is|Can't fight  
Baby just hold on tight  
If you're gonna give me no love  
Don't you  
Do what it takes to feel like an outsider  
It's crazy how you live with hate  
But you know it's alright  
  
(Chorus)  
I'm so cold, it's cold inside  
I gotta tell you how cold it is|Can't fight  
  
It's crazy how you live with hate  
But you know it's alright  
But you know it's alright  
  
(Baby just hold on tight)  
If you're gonna give me no love  
Don't you  
Do what it takes to feel like an outsider  
It's crazy how you live with hate  
But you know it's alright  
  
(Chorus)  
  
(Baby just hold on tight)  
I'm so cold, it's cold inside  
I gotta tell you how cold it is|Can't fight


In [None]:
# Same generation with a different song title in the prompt.
sequence = "Rihanna | Umbrella"
print(generate_text(sequence, max_len))

Rihanna  

 Umbrella 
 Ooh  
I'm feeling sick to my stomach  
(Ohhh)  
I thought it might be too much but you took a little longer than you were expecting  
I had no idea just how sick you were  
You were feeling and I wish we would've known  
You had been the one to tell me  
I don't hate you enough to love me, I know what you're thinking  
But you're a monster  
You're a monster  
(Ohh)  
  
So I'm gonna hold your hand and hold yours  
Cause you ain't no stranger to me so let's be nice  
Ain't nobody better to treat me like that  
Baby this won't be the last time  
I'm the only one to say goodbye  
(Ohhh)


Rihanna|My Love Is Your Brand|[Verse 2]  
Ain't nobody better to treat me like that  
Ain't nobody better to treat me like that  
(Ohhh)  
  
Ain't nobody better to treat me like


In [None]:
# Same generation with a different song title in the prompt.
sequence = "Rihanna | YMCA"
print(generate_text(sequence, max_len))

Rihanna  

 YMCA 
 I'm a fool


Rihanna|Boy I Don't Want You|Baby it's alright to be an idiot  
If you can't find it, give it to me  
That's what I call a child  
You can't be a fool, you can't be an idiot


Rihanna|Boy I Don't Want You|Hey y'all just got me feeling lost and lonely  
I'm feeling lost and lonely


Rihanna|Boy I Don't Want You|Baby it's alright to be an idiot  
If you can't find it, give it to me


Rihanna|I'm a Fool|Baby I don't want you boy I'm the one. I'm the one.  
I'm the one.  
I'm the one.  
I'm the one.  
I'm the one.


Rihanna|I'm a Fool|Baby I don't want you boy I'm the one. I'm the one.  
I'm the one.  
I'm


In [None]:
%%capture
# Refresh the package index, then install XeTeX and the recommended TeX fonts/LaTeX
# packages; the PDF export log further down shows nbconvert running xelatex.
!sudo apt-get update
!sudo apt-get install texlive-xetex texlive-fonts-recommended texlive-latex-recommended

In [None]:
# Export this notebook from the mounted Drive project folder to PDF.
!jupyter nbconvert --to pdf "/content/drive/MyDrive/NLPGroupProject/Fine-Tune-GPT2-HuggingFace.ipynb"

[NbConvertApp] Converting notebook /content/drive/MyDrive/NLPGroupProject/Fine-Tune-GPT2-HuggingFace.ipynb to pdf
[NbConvertApp] Writing 43128 bytes to notebook.tex
[NbConvertApp] Building PDF
[NbConvertApp] Running xelatex 3 times: ['xelatex', 'notebook.tex', '-quiet']
[NbConvertApp] Running bibtex 1 time: ['bibtex', 'notebook']
[NbConvertApp] PDF successfully created
[NbConvertApp] Writing 48999 bytes to /content/drive/MyDrive/NLPGroupProject/Fine-Tune-GPT2-HuggingFace.pdf
