### GPT-2 Lyrics Generation

In [1]:
import pandas as pd
from ast import literal_eval
import random
import re
from tqdm import tqdm

import tensorflow as tf
import gpt_2_simple as gpt2
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [2]:
###### Uncomment to download the different gpt2 sized models ######

# for model_name in ["124M","355M"]:
#     gpt2.download_gpt2(model_name=model_name)

#### Import the lyrics csv

In [83]:
# import the lyrics as dataframe
rock_lyrics_df = pd.read_csv('Data\df_rock_30k_lyrics.csv', converters={'words':literal_eval,'segments':literal_eval})
rock_lyrics_df.head()

Unnamed: 0.1,Unnamed: 0,track,artist,lyrics,words,segments
0,0,"Puerto Cabezas, NI",Lewis Del Mar,"Puerto Cabezas, NI Lyrics[Verse 1]\nHigh my bl...","[puerto, cabezas, ni, lyric, high, blood, heav...","[([Verse 1], High my blood, heavy like hot ra..."
1,1,The Way I Live - Main Explicit,Baby Boy Da Prince,The Way I Live Lyrics[Intro]\nD-Wizzle on the ...,"[way, live, lyric, dwizzle, track, dwizzle, tr...","[([Intro], D-Wizzle on the track, D-Wizzle o..."
2,2,MAFIA,Travis Scott,MAFIA Lyrics[Verse 1: Travis Scott]\nCustom th...,"[mafia, lyric, custom, thing, custom, wing, cu...","[([Verse 1: Travis Scott], Custom the things,..."
3,3,Am I Alright,Aly & AJ,"Am I Alright Lyrics[Verse 1]\nEvery day, every...","[alright, lyric, every, day, every, day, pulli...","[([Verse 1], Every day, every day I'm pulling..."
4,4,Mountain Sound,Of Monsters and Men,Mountain Sound Lyrics[Verse 1: Ragnar Þórhalls...,"[mountain, sound, lyric, heard, calling, dista...","[([Verse 1: Ragnar Þórhallsson], I heard them..."


In [84]:
rock_lyrics_df.shape

(21723, 6)

#### Lyrics text preprocsessing

Ruleset to simplify the tags into either intro, verse, chorus, or outro for each lyrical section.

In [85]:
def clean_segments_to_lyrics(segments):
    
    match_tags = {'cho': 'chorus',
                'chr': 'chorus',
                'hook': 'chorus',
                'refrain': 'chorus',
                'reprise': 'chorus',
                'out': 'outro',
                'ver': 'verse',
                'bridge': 'verse',
                'interlude': 'verse',
                'breakdown': 'verse',
                }
    tag_list = ['intro','verse','chorus','outro']
    tag_lyrics_list = []
    for tag, seg in segments:
        try:
            # regex for getting names in the tags
            clean_tag = re.match('(?:\[)([\-a-zA-Z\s]+)(?:\:)*', tag).group(1)
        except:
            clean_tag = tag
        clean_tag = clean_tag.strip().lower()

        # filter out instrumental tag lyrics which are usually empty
        if 'instrument' in clean_tag:
            continue

        # convert tags to their most similar category
        for tag_key, tag_value in match_tags.items():
            if tag_key in clean_tag:
                clean_tag = tag_value
                break
        
        if clean_tag in tag_list:
            tag_lyrics_list.append('[' + clean_tag + ']' + seg.strip() + '\n\n')
    return ''.join(tag_lyrics_list)

lyrics_df = rock_lyrics_df.copy()
lyrics_df['cleaned_lyrics'] = rock_lyrics_df['segments'].apply(clean_segments_to_lyrics)


Filter out songs that are too short or too long

In [111]:
filtered_lyrics_df = lyrics_df[lyrics_df['cleaned_lyrics'].apply(lambda x: 10 < len(x.split(' ')) < 1000)]

In [112]:
filtered_lyrics_df.shape

(15056, 7)

Remove unwanted texts and add necessary newline spaces within the text to maintain lyric readability

In [113]:
def add_feature_lyrics(lyric):
    clean_lyric = re.sub('(\])', r'\1\n', lyric)
    clean_lyric = re.sub('(\s)([^Ia-z])', r'\n\2', clean_lyric)
    clean_lyric = re.sub('(\s)(I[\'a-z]+\s)', r'\n\2', clean_lyric)
    clean_lyric = re.sub('(.)(\d*Embed)', r'\1 ', clean_lyric)
    clean_lyric = clean_lyric.lower()
    return clean_lyric

filtered_lyrics_df = pd.DataFrame(filtered_lyrics_df['cleaned_lyrics'].apply(add_feature_lyrics))


Example output for the cleaned lyrics:

In [194]:
print(filtered_lyrics_df.sample().cleaned_lyrics.iloc[0][:300])

[intro]
colours, colours, colours, colours

[verse]
right there, just between the height of
my friend, something left me thinking
stay young and focus on the fireworks
stand back and look a little higher

[chorus]
so many colours fill the sky
so many good things to come
if only they'd always caught 


In [28]:
# lyrics_df = lyrics_df.sample(1000)
filtered_lyrics_df['cleaned_lyrics'].to_csv('Data/rock_15k_lyrics_only.txt',index=False,header=False,sep=' ')
filtered_lyrics_df['cleaned_lyrics'].to_csv('Data/rock_15k_lyrics_only.csv',index=False)

#### GPT-2 Model

Load csv file with single column of lyrics 

In [195]:
dataset_path = 'Data/rock_15k_lyrics_only.csv'
pd.read_csv(dataset_path).head()

Unnamed: 0,cleaned_lyrics
0,"[verse]\nhigh my blood, heavy like hot rain\ni..."
1,"[intro]\nd-wizzle on the track,\n d-wizzle on ..."
2,"[verse]\ncustom the things, custom the wings i..."
3,"[verse]\nevery day, every day\ni'm pulling the..."
4,[verse]\ni heard them calling in the distance\...


Setting parameters for finetuning the GPT2 model

In [None]:
learning_rate = 1e-4
optimizer = 'adam'
batch_size = 1
model_name = "124M"
steps = 10000
sess = None

tf.compat.v1.reset_default_graph()
if not sess:
    sess = gpt2.start_tf_sess()
else:
    sess = gpt2.reset_session(sess)

run_name = '15k_' + 'lr' + str(learning_rate)
gpt2.finetune(sess,
            dataset_path,
            model_name=model_name,
            learning_rate=learning_rate,
            batch_size=batch_size,
            steps=steps,
            sample_every=10000,
            sample_length=300,
            save_every=2000,
            print_every=10000,
            restore_from='fresh',
            run_name=run_name)


#### Generate lyrics from fine-tuned gpt2_simple models

In [200]:
tf.compat.v1.reset_default_graph()
sess = gpt2.start_tf_sess()
gpt2.load_gpt2(sess, run_name='15k_lr0.0001')

Loading checkpoint checkpoint\15k_lr0.0001\model-10000
INFO:tensorflow:Restoring parameters from checkpoint\15k_lr0.0001\model-10000


In [257]:
lyrics_results = gpt2.generate(sess,
                            prefix="<|startoftext|>",
                            truncate="<|endoftext|>",
                            nsamples=5,
                            temperature=.85, # higher temperature the model gives more random text generations (default(0.7))
                            top_p=0.9, # cumulative probability of guesses
                            top_k=0, # top k guesses (default(0); 0 ~= disabled)
                            length=180, # number of tokens to generate (e.g. max: default(1023))
                            return_as_list=True,
                            include_prefix=True)

for i, lyric in enumerate(lyrics_results):
  print('-'*20 + f'Lyric {i}:' + '-'*20 + '\n')
  print(lyric)


[verse]
i see a new face in the emptiness 
i know you're out there somewhere watching over me
but i'm leaving here tonight
you know that it won't be long
'til you see my fire

[verse]
we're on the road to somewhere far away
it's always the first to last mile
i've been waiting for this moment
(i've been waiting for this moment)
you know that it won't be long
'til you see my fire

[verse]
i'm breaking free from myself
i'm breaking free
(i'm breaking free)
i'm breaking free

[chorus]
we're all just burning up
we're all just burning up

[chorus]
we're all just burning up
we're all just burning up


#### Evaluate lyrics generated using bleu

Replacing the newline character with spaces for both the reference lyrics and the generated lyrics, so words can be tokenized properly before being compared using bleu.

In [48]:
rock_15k_lyrics = pd.read_csv('Data/rock_15k_lyrics_only.csv')
rock_15k_lyrics = rock_15k_lyrics['cleaned_lyrics'].apply(lambda lyric: lyric.replace('\n', ' '))
rock_15k_lyrics_list = rock_15k_lyrics.to_list()

In [72]:
rock_15k_lyrics_list[0]

"[verse] high my blood, heavy like hot rain i'm drown in you colors mixing on a hot day, hot day  [chorus] in the streets without the street lights and with no power lines i am electric, i'm electrical i finally found the rest of me  [chorus] my dance hall is all bodies now and they're burning sugar sweet, so sweet and my old world is on fire now as i move into the heat, the heat  [verse] high my blood, fresh fish and sticky plantains i am bound to you by the mystery of my own name  [chorus] my dance hall is all bodies now and that burning sugars sweet, so sweet and my old world is on fire now as i move into the heat as i move into the heat, the heat  [chorus] (in the streets without the street lights and no power lines i am electric, i am electric)  in the streets without the street lights and no power lines i am electric, electric  [verse] in the streets that are my insides my father never lost his mind how come i never call you now? why do i never call you now? bottom of the ocean  

In [242]:
generated_lyrics = [lyric.replace('<|startoftext|>', '').replace('\n', ' ') for lyric in lyrics_results]
generated_lyrics

[" [verse] tell me where you've been take a picture, say it to my face and i'll send it back a hundred times yeah, i'm a liar but i still got that yeah, i'm a let down so don't say a word  [chorus] i'll wait, i'll wait i'll hold you down just like a run-on  [verse] i'm a playboy on a dirty track tied up in your slums i'm a big boss with a bad reputation i'll go anywhere but there well i'm a drop top in the middle of the ocean i'm a real pimp with a bad deal i'll go anywhere but there  [chorus] i'll wait, i'll wait i'll hold you down just like a run-on  [verse]",
 " [verse] i'm a samurai on a mission to save the world from itself i'm a ninja on a mission to save the world from itself i'm a ninja, i'm a ninja  [chorus] i'm a ninja, i'm a ninja  [chorus] i'll strike once, i'll strike twice i'll break twice, i'll break once more i'll strike once, i'll strike twice i'll break twice, i'll break twice more  [verse] i'm a ninja on a mission to save the world from itself i'm a ninja, i'm a ninj

In [254]:
test_size = 1000
smoothing = SmoothingFunction().method5

for i, gen_lyric in enumerate(generated_lyrics):

    avg_bleu = min_bleu = max_bleu = 0

    for _ in tqdm(range(test_size)):
        sample_song = random.choice(rock_15k_lyrics_list)
        sample_bleu = sentence_bleu(sample_song, gen_lyric, smoothing_function=smoothing)
        avg_bleu += sample_bleu
        max_bleu = max(max_bleu, sample_bleu)
    print(f'Lyric{i}')
    print(f'avg_bleu_score: {avg_bleu/test_size}')
    print(f'max_bleu_score: {max_bleu}')

100%|██████████| 100/100 [01:11<00:00,  1.41it/s]


Lyric0
avg_bleu_score: 0.0704217190043469
max_bleu_score: 0.07074701880140805


100%|██████████| 100/100 [00:27<00:00,  3.60it/s]


Lyric1
avg_bleu_score: 0.06969752359968084
max_bleu_score: 0.06985732794465223


100%|██████████| 100/100 [00:47<00:00,  2.10it/s]


Lyric2
avg_bleu_score: 0.06912513284879583
max_bleu_score: 0.06931380940735446


100%|██████████| 100/100 [00:29<00:00,  3.43it/s]


Lyric3
avg_bleu_score: 0.06866981994072772
max_bleu_score: 0.06868493629534578


100%|██████████| 100/100 [00:52<00:00,  1.89it/s]

Lyric4
avg_bleu_score: 0.06957827692305227
max_bleu_score: 0.06981353000387662



