# Notebook to create the lyrics dataset for the songs in the FMA music dataset using the LyricsGenius API

In [1]:
from dotenv import load_dotenv
import os
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import requests
import json
import pandas as pd
from lyricsgenius import Genius
from requests.exceptions import Timeout
from tqdm import tqdm
import time

In [2]:
#secret variables

load_dotenv()

SPOT_ID= os.getenv('SPOT_ID')
SPOT_SECRET= os.getenv('SPOT_SECRET')
LYRIC_TOKEN= os.getenv('LYRIC_TOKEN')


In [3]:
genius = Genius(LYRIC_TOKEN,skip_non_songs=True, excluded_terms=["(Remix)", "(Live)"], timeout=60,retries=2, verbose=False)


Loading songs metadata to get the name of the song and artist needed to find the lyrics using the API

In [4]:
tracksMetaData = pd.read_csv('tracksMetaData.csv')


In [5]:
tracksMetaData.head()

Unnamed: 0,id,trackName,genre,trackArtist
0,2,Food,Hip-Hop,AWOL
1,5,This World,Hip-Hop,AWOL
2,10,Freeway,Pop,Kurt Vile
3,140,Queen Of The Wires,Folk,Alec K. Redfearn & the Eyesores
4,141,Ohio,Folk,Alec K. Redfearn & the Eyesores


Functions to make the API calls and save the data in local
The functions ensure that we collect the data in batches with breaks in between since this is a time taking process where the server might sever the connection if kept alive continuously for longer durations

In [4]:
def lyricExtractor(seg1,seg2):

    data = pd.DataFrame()
    cnt = 0
    for index, row in tqdm(tracksMetaData[seg1:seg2].iterrows()):
        df = pd.DataFrame()
        res = genius.search_song(title=row['trackName'], artist=row['trackArtist'])
        df['id'] = [row['id']]
        df['track'] = row['trackName']
        df['trackArtist'] = row['trackArtist']
        df['genre'] = row['genre']
        if res != None:
            df['lyrics'] = [res.lyrics]
        else:
            df['lyrics'] = None
            cnt = cnt + 1 
        data = pd.concat([data,df])
    print('Number of missed lyrics ',cnt)
    return data

The saveFiles() function was used with the FMA Small dataset that has ~8k songs to be queried but was modified later to include breaks between batches when working with the medium dataset which is more than 3X the size of the small datasets at 25k songs. The modified function is given further down in the notebook and is named as saveFiles_medium() function

In [7]:
def saveFiles(startPart,endPart,segmentName):
    
    for i in range(startPart,endPart):
        print('Starting Part '+ str(i) +' from '+str(i*1000) +' to ' + str((i+1)*1000))
        data = lyricExtractor(i*1000,(i+1)*1000)
        data.to_csv('tracksLyricFeatures/tracksLyricFeatures_'+segmentName+'_part_'+str(i)+'.csv')
        print('Finished Part '+ str(i))
        print('Saved as '+ 'tracksLyricFeatures/tracksLyricFeatures_'+segmentName+'_part_'+str(i)+'.csv')

In [8]:
saveFiles(0,1,'segment01')

Starting Part 0 from 0 to 1000


1000it [22:31,  1.35s/it]


Number of missed lyrics  409
Finished Part 0
Saved as tracksLyricFeatures/tracksLyricFeatures_segment01_part_0.csv


In [8]:
saveFiles(1,4,'segment02')

Starting Part 1 from 1000 to 2000


1000it [17:07,  1.03s/it]


Number of missed lyrics  492
Finished Part 1
Saved as tracksLyricFeatures/tracksLyricFeatures_segment02_part_1.csv
Starting Part 2 from 2000 to 3000


1000it [19:40,  1.18s/it]


Number of missed lyrics  480
Finished Part 2
Saved as tracksLyricFeatures/tracksLyricFeatures_segment02_part_2.csv
Starting Part 3 from 3000 to 4000


1000it [20:06,  1.21s/it]


Number of missed lyrics  466
Finished Part 3
Saved as tracksLyricFeatures/tracksLyricFeatures_segment02_part_3.csv


In [9]:
saveFiles(4,8,'segment03')

Starting Part 4 from 4000 to 5000


1000it [19:30,  1.17s/it]


Number of missed lyrics  477
Finished Part 4
Saved as tracksLyricFeatures/tracksLyricFeatures_segment03_part_4.csv
Starting Part 5 from 5000 to 6000


1000it [20:27,  1.23s/it]


Number of missed lyrics  430
Finished Part 5
Saved as tracksLyricFeatures/tracksLyricFeatures_segment03_part_5.csv
Starting Part 6 from 6000 to 7000


1000it [19:09,  1.15s/it]


Number of missed lyrics  427
Finished Part 6
Saved as tracksLyricFeatures/tracksLyricFeatures_segment03_part_6.csv
Starting Part 7 from 7000 to 8000


1000it [18:34,  1.11s/it]


Number of missed lyrics  468
Finished Part 7
Saved as tracksLyricFeatures/tracksLyricFeatures_segment03_part_7.csv


Missed Lyrics - 3649/7999 = 45%

This is the version of the saveFiles() function above used with considerably bigger medium dataset

In [5]:
def saveFiles_medium(startPart,endPart,segmentName):
    
    for i in range(startPart,endPart):
        print('Starting Part '+ str(i) +' from '+str(i*1000) +' to ' + str((i+1)*1000))
        data = lyricExtractor(i*1000,(i+1)*1000)
        data.to_csv('tracksLyricFeaturesMedium/tracksLyricFeatures_'+segmentName+'_part_'+str(i)+'.csv')
        print('Finished Part '+ str(i))
        print('Saved as '+ 'tracksLyricFeaturesMedium/tracksLyricFeatures_'+segmentName+'_part_'+str(i)+'.csv')
        time.sleep(600)

In [7]:
tracksMetaData = pd.read_csv('tracksMetaDataMedium.csv')

In [8]:
tracksMetaData.head()

Unnamed: 0,id,trackName,genre,trackArtist
0,2,Food,Hip-Hop,AWOL
1,3,Electric Ave,Hip-Hop,AWOL
2,5,This World,Hip-Hop,AWOL
3,10,Freeway,Pop,Kurt Vile
4,134,Street Music,Hip-Hop,AWOL


In [9]:
tracksMetaData.shape

(25000, 4)

In [20]:
saveFiles_medium(0,1,'segment01')

Starting Part 0 from 0 to 1000


1000it [22:55,  1.38s/it]


Number of missed lyrics  305
Finished Part 0
Saved as tracksLyricFeaturesMedium/tracksLyricFeatures_segment01_part_0.csv


In [21]:
saveFiles_medium(1,10,'segment02')

Starting Part 1 from 1000 to 2000


1000it [57:49,  3.47s/it]


Number of missed lyrics  393
Finished Part 1
Saved as tracksLyricFeaturesMedium/tracksLyricFeatures_segment02_part_1.csv
Starting Part 2 from 2000 to 3000


1000it [20:20,  1.22s/it]


Number of missed lyrics  474
Finished Part 2
Saved as tracksLyricFeaturesMedium/tracksLyricFeatures_segment02_part_2.csv
Starting Part 3 from 3000 to 4000


1000it [19:29,  1.17s/it]


Number of missed lyrics  485
Finished Part 3
Saved as tracksLyricFeaturesMedium/tracksLyricFeatures_segment02_part_3.csv
Starting Part 4 from 4000 to 5000


1000it [18:05,  1.09s/it]


Number of missed lyrics  552
Finished Part 4
Saved as tracksLyricFeaturesMedium/tracksLyricFeatures_segment02_part_4.csv
Starting Part 5 from 5000 to 6000


1000it [20:04,  1.20s/it]


Number of missed lyrics  498
Finished Part 5
Saved as tracksLyricFeaturesMedium/tracksLyricFeatures_segment02_part_5.csv
Starting Part 6 from 6000 to 7000


1000it [17:52,  1.07s/it]


Number of missed lyrics  487
Finished Part 6
Saved as tracksLyricFeaturesMedium/tracksLyricFeatures_segment02_part_6.csv
Starting Part 7 from 7000 to 8000


1000it [19:33,  1.17s/it]


Number of missed lyrics  430
Finished Part 7
Saved as tracksLyricFeaturesMedium/tracksLyricFeatures_segment02_part_7.csv
Starting Part 8 from 8000 to 9000


1000it [19:14,  1.15s/it]


Number of missed lyrics  421
Finished Part 8
Saved as tracksLyricFeaturesMedium/tracksLyricFeatures_segment02_part_8.csv
Starting Part 9 from 9000 to 10000


1000it [17:15,  1.04s/it]


Number of missed lyrics  529
Finished Part 9
Saved as tracksLyricFeaturesMedium/tracksLyricFeatures_segment02_part_9.csv


In [22]:
saveFiles_medium(10,15,'segment03')

Starting Part 10 from 10000 to 11000


1000it [17:14,  1.03s/it]


Number of missed lyrics  487
Finished Part 10
Saved as tracksLyricFeaturesMedium/tracksLyricFeatures_segment03_part_10.csv
Starting Part 11 from 11000 to 12000


1000it [18:13,  1.09s/it]


Number of missed lyrics  465
Finished Part 11
Saved as tracksLyricFeaturesMedium/tracksLyricFeatures_segment03_part_11.csv
Starting Part 12 from 12000 to 13000


1000it [18:29,  1.11s/it]


Number of missed lyrics  487
Finished Part 12
Saved as tracksLyricFeaturesMedium/tracksLyricFeatures_segment03_part_12.csv
Starting Part 13 from 13000 to 14000


1000it [19:01,  1.14s/it]


Number of missed lyrics  470
Finished Part 13
Saved as tracksLyricFeaturesMedium/tracksLyricFeatures_segment03_part_13.csv
Starting Part 14 from 14000 to 15000


1000it [19:44,  1.18s/it]


Number of missed lyrics  433
Finished Part 14
Saved as tracksLyricFeaturesMedium/tracksLyricFeatures_segment03_part_14.csv


In [23]:
saveFiles_medium(15,20,'segment04')

Starting Part 15 from 15000 to 16000


1000it [17:18,  1.04s/it]


Number of missed lyrics  517
Finished Part 15
Saved as tracksLyricFeaturesMedium/tracksLyricFeatures_segment04_part_15.csv
Starting Part 16 from 16000 to 17000


1000it [19:58,  1.20s/it]


Number of missed lyrics  411
Finished Part 16
Saved as tracksLyricFeaturesMedium/tracksLyricFeatures_segment04_part_16.csv
Starting Part 17 from 17000 to 18000


1000it [18:59,  1.14s/it]


Number of missed lyrics  469
Finished Part 17
Saved as tracksLyricFeaturesMedium/tracksLyricFeatures_segment04_part_17.csv
Starting Part 18 from 18000 to 19000


1000it [21:47,  1.31s/it]


Number of missed lyrics  357
Finished Part 18
Saved as tracksLyricFeaturesMedium/tracksLyricFeatures_segment04_part_18.csv
Starting Part 19 from 19000 to 20000


1000it [20:32,  1.23s/it]


Number of missed lyrics  413
Finished Part 19
Saved as tracksLyricFeaturesMedium/tracksLyricFeatures_segment04_part_19.csv


In [24]:
saveFiles_medium(20,25,'segment05')

Starting Part 20 from 20000 to 21000


1000it [21:02,  1.26s/it]


Number of missed lyrics  392
Finished Part 20
Saved as tracksLyricFeaturesMedium/tracksLyricFeatures_segment05_part_20.csv
Starting Part 21 from 21000 to 22000


1000it [23:05,  1.39s/it]


Number of missed lyrics  472
Finished Part 21
Saved as tracksLyricFeaturesMedium/tracksLyricFeatures_segment05_part_21.csv
Starting Part 22 from 22000 to 23000


1000it [21:56,  1.32s/it]


Number of missed lyrics  508
Finished Part 22
Saved as tracksLyricFeaturesMedium/tracksLyricFeatures_segment05_part_22.csv
Starting Part 23 from 23000 to 24000


1000it [22:51,  1.37s/it]


Number of missed lyrics  421
Finished Part 23
Saved as tracksLyricFeaturesMedium/tracksLyricFeatures_segment05_part_23.csv


In [10]:
saveFiles_medium(24,25,'segment06')

Starting Part 24 from 24000 to 25000


1000it [20:55,  1.26s/it]


Number of missed lyrics  442
Finished Part 24
Saved as tracksLyricFeaturesMedium/tracksLyricFeatures_segment06_part_24.csv


Concatenating the batches for FMA Small datasets

In [3]:
datasets = os.listdir('./tracksLyricFeatures')

print(f"filenames : {datasets}")


filenames : ['tracksLyricFeatures_segment01_part_0.csv', 'tracksLyricFeatures_segment02_part_1.csv', 'tracksLyricFeatures_segment02_part_2.csv', 'tracksLyricFeatures_segment02_part_3.csv', 'tracksLyricFeatures_segment03_part_4.csv', 'tracksLyricFeatures_segment03_part_5.csv', 'tracksLyricFeatures_segment03_part_6.csv', 'tracksLyricFeatures_segment03_part_7.csv']


In [9]:

tracksLyricFeatures_complete = pd.DataFrame() 

for i in datasets:
    x = pd.read_csv('./tracksLyricFeatures/' + i)
    tracksLyricFeatures_complete = pd.concat([tracksLyricFeatures_complete,x])
    # pd.concat([broken_tracksAudioFeatures_complete,x])

In [13]:
tracksLyricFeatures_complete.head()

Unnamed: 0.1,Unnamed: 0,id,track,trackArtist,genre,lyrics
0,0,2,Food,AWOL,Hip-Hop,"The Waking Blind LyricsThe waking blind, embra..."
1,0,5,This World,AWOL,Hip-Hop,"E.T. Lyrics[Verse One: Mickey Factz]\nSee, I s..."
2,0,10,Freeway,Kurt Vile,Pop,"Freeway LyricsI got a freeway in mind, let go ..."
3,0,140,Queen Of The Wires,Alec K. Redfearn & the Eyesores,Folk,
4,0,141,Ohio,Alec K. Redfearn & the Eyesores,Folk,


We were not able to get lyrics for ~3.6k songs out of the 8k songs in the small dataset

In [14]:
tracksLyricFeatures_complete["lyrics"].isna().sum()

3649

In [15]:
tracksLyricFeatures_complete.to_csv('./tracksLyricFeatures/tracksLyricFeaturesComplete.csv', encoding='utf8')

Repeating the concatenation steps for the Medium datasets

In [16]:
datasets = os.listdir('./tracksLyricFeaturesMedium')

print(f"filenames : {datasets}")


filenames : ['tracksLyricFeatures_segment01_part_0.csv', 'tracksLyricFeatures_segment02_part_1.csv', 'tracksLyricFeatures_segment02_part_2.csv', 'tracksLyricFeatures_segment02_part_3.csv', 'tracksLyricFeatures_segment02_part_4.csv', 'tracksLyricFeatures_segment02_part_5.csv', 'tracksLyricFeatures_segment02_part_6.csv', 'tracksLyricFeatures_segment02_part_7.csv', 'tracksLyricFeatures_segment02_part_8.csv', 'tracksLyricFeatures_segment02_part_9.csv', 'tracksLyricFeatures_segment03_part_10.csv', 'tracksLyricFeatures_segment03_part_11.csv', 'tracksLyricFeatures_segment03_part_12.csv', 'tracksLyricFeatures_segment03_part_13.csv', 'tracksLyricFeatures_segment03_part_14.csv', 'tracksLyricFeatures_segment04_part_15.csv', 'tracksLyricFeatures_segment04_part_16.csv', 'tracksLyricFeatures_segment04_part_17.csv', 'tracksLyricFeatures_segment04_part_18.csv', 'tracksLyricFeatures_segment04_part_19.csv', 'tracksLyricFeatures_segment05_part_20.csv', 'tracksLyricFeatures_segment05_part_21.csv', 'tracks

In [18]:

tracksLyricFeatures_complete = pd.DataFrame() 

for i in datasets:
    x = pd.read_csv('./tracksLyricFeaturesMedium/' + i)
    tracksLyricFeatures_complete = pd.concat([tracksLyricFeatures_complete,x])
    # pd.concat([broken_tracksAudioFeatures_complete,x])

We were not able to get lyrics for ~11k songs out of the 25k songs in the medium dataset

In [20]:
tracksLyricFeatures_complete["lyrics"].isna().sum()

11318

In [21]:
tracksLyricFeatures_complete.to_csv('./tracksLyricFeaturesMedium/tracksLyricFeaturesComplete.csv', encoding='utf8')

Preview of the lyrics data

In [34]:
tracksLyricFeatures_complete["lyrics"]

array(["The Waking Blind LyricsThe waking blind, embraced the dark\nThey made us fight for the answers\nThey made us they made us fight fight fight for the answers\nThey made us fight\n\nTime is up the lines are drawn in sand\nIt's like a ritual cleaning\nFaith against them all\nSome kind of wicked pressure growing below\nBound to breach the surface\nGloves are off\nThe blinders tighten\nMisguided hands\nLeading the awoken\nFood for thought\nOnce your brothers now\nThe wretches the dregs the scum and the bastards\nEyes up\nGrab your pitchforks just before the dawn\nThey’re out of time and out of touch and out of luck\nWith the air that we breath and the ground beneath their feet\n\nA mask of virtue\nYou’ll hide behind as you feed off the lies\nBowing down to pressure with no resolve\nNever again\nCan we allow this fear to hold us\nFaith against them all\nDesigned deny and deprive\n\nThe waking blind, embraced the dark\nThey made us fight for the answers\n\nThe coming whipping winds\nTh

Comparison of the FMA Small and FMA Medium dataset

In [3]:
small_dataset = pd.read_csv('./tracksLyricFeaturesSmall/tracksLyricFeaturesComplete.csv')
small_dataset.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,track,trackArtist,genre,lyrics
0,0,0,2,Food,AWOL,Hip-Hop,"The Waking Blind LyricsThe waking blind, embra..."
1,1,0,5,This World,AWOL,Hip-Hop,"E.T. Lyrics[Verse One: Mickey Factz]\nSee, I s..."
2,2,0,10,Freeway,Kurt Vile,Pop,"Freeway LyricsI got a freeway in mind, let go ..."
3,3,0,140,Queen Of The Wires,Alec K. Redfearn & the Eyesores,Folk,
4,4,0,141,Ohio,Alec K. Redfearn & the Eyesores,Folk,


In [2]:
medium_dataset = pd.read_csv('./tracksLyricFeatures/tracksLyricFeaturesComplete.csv')
medium_dataset.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,track,trackArtist,genre,lyrics
0,0,0,2,Food,AWOL,Hip-Hop,"The Waking Blind LyricsThe waking blind, embra..."
1,1,0,3,Electric Ave,AWOL,Hip-Hop,Oversikt over norske artister Lyrics*&aute&rom...
2,2,0,5,This World,AWOL,Hip-Hop,"E.T. Lyrics[Verse One: Mickey Factz]\nSee, I s..."
3,3,0,10,Freeway,Kurt Vile,Pop,"Freeway LyricsI got a freeway in mind, let go ..."
4,4,0,134,Street Music,AWOL,Hip-Hop,"Goin’ Down Lyrics[Intro: DJ Drama, Fabolous & ..."


In [5]:
len(medium_dataset)
len(medium_dataset[medium_dataset["lyrics"].isna()])

11318

In [5]:
print(f'small dataset shape:{small_dataset.shape} medium dataset shape:{medium_dataset.shape}')

small dataset shape:(8000, 7) medium dataset shape:(25000, 7)


Given the greater volume of the data, we decided to go with the FMA Medium dataset for which we were able to get the lyrics for ~14k songs out of the 25k songs