In [1]:
import pandas as pd
import re

# read file containing English lyrics
df = pd.read_csv("song_lyrics_en.csv")

# keep only these columns
df = df[["id", "title", "tag", "artist", "year", "lyrics"]]
df.set_index("id", inplace=True)

df

Unnamed: 0_level_0,title,tag,artist,year,lyrics
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Killa Cam,rap,Cam'ron,2004,"[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki..."
3,Can I Live,rap,JAY-Z,1996,"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,..."
4,Forgive Me Father,rap,Fabolous,2003,Maybe cause I'm eatin\nAnd these bastards fien...
5,Down and Out,rap,Cam'ron,2004,[Produced by Kanye West and Brian Miller]\n\n[...
6,Fly In,rap,Lil Wayne,2005,"[Intro]\nSo they ask me\n""Young boy\nWhat you ..."
...,...,...,...,...,...
7882838,Everything Is Alright Now,pop,Chuck Bernard,2013,"Everything is alright now\nOh yes, baby\nEvery..."
7882840,White Lies,pop,ElementD,2019,[Verse 1]\nHalf truth and half you\nDidn't we ...
7882842,Ocean,pop,Effemar,2022,[Verse 1]\nDance for me now\nKeeping yourself ...
7882845,Raise Our Hands,pop,"Culture Code, Pag & Mylo",2016,[Verse 1]\nHere our purpose feels alive\nWe ar...


In [2]:
# check on population of songs by genre
df[["tag", "title"]].groupby("tag").count()

Unnamed: 0_level_0,title
tag,Unnamed: 1_level_1
country,86658
misc,140979
pop,1393543
rap,964549
rb,155076
rock,633289


In [3]:
# we only want to keep country, rap, r&b, and rock songs
df = df[df["tag"].str.contains("country|rap|rb|rock", regex=True)]

# show count of songs after dropping other genres
df[["tag", "title"]].groupby("tag").count()

Unnamed: 0_level_0,title
tag,Unnamed: 1_level_1
country,86658
rap,964549
rb,155076
rock,633289


In [4]:
# keep only songs that contain square bracket part of song markers
df = df[df["lyrics"].str.contains(r"\[.*\]", regex=True) == True]

# show count of songs after dropping songs that do not contain a part of song marker in the lyrics
df[["tag", "title"]].groupby("tag").count()

Unnamed: 0_level_0,title
tag,Unnamed: 1_level_1
country,46440
rap,672102
rb,99012
rock,255754


In [5]:
# examine the first part of song markers contained in the lyrics
pd.options.display.max_rows = None
pd.Series(df["lyrics"].str.extract(r"(\[.*\])", expand=False).unique())[0:500]

0                        [Chorus: Opera Steve & Cam'ron]
1                                [Produced by Irv Gotti]
2                                                 [Hook]
3              [Produced by Kanye West and Brian Miller]
4                                                [Intro]
5                                     [Intro: Lil Wayne]
6                                       [Intro: Pusha T]
7                                     [Verse 1: Cam'ron]
8                                              [Verse 1]
9                                       [Chorus: Jaheim]
10                              [Produced by DJ Premier]
11                              [Produced by Just Blaze]
12                                      [Intro: Birdman]
13                               [Produced by Timbaland]
14                                     [Chorus: Cam'ron]
15                      [Intro: Jay-Z, Kid Capri & Both]
16                                [Intro: Juelz Santana]
17                [Produced by 

In [6]:
# Standardize the markers for the parts of a song: Intro, Hook, Verse, Chorus, Refrain, Interlude, Bridge and Outro
df["lyrics"] = df["lyrics"].str.replace(r"\[.*Intro.*\]", "[INTRO]", case=False, regex=True)
df["lyrics"] = df["lyrics"].str.replace(r"\[.*Hook.*\]", "[HOOK]", case=False, regex=True)
df["lyrics"] = df["lyrics"].str.replace(r"\[.*Verse.*\]", "[VERSE]", case=False, regex=True)
df["lyrics"] = df["lyrics"].str.replace(r"\[.*Chorus.*\]", "[CHORUS]", case=False, regex=True)
df["lyrics"] = df["lyrics"].str.replace(r"\[.*Refrain.*\]", "[REFRAIN]", case=False, regex=True)
df["lyrics"] = df["lyrics"].str.replace(r"\[.*Interlude.*\]", "[INTERLUDE]", case=False, regex=True)
df["lyrics"] = df["lyrics"].str.replace(r"\[.*Bridge.*\]", "[BRIDGE]", case=False, regex=True)
df["lyrics"] = df["lyrics"].str.replace(r"\[.*Outro.*\]", "[OUTRO]", case=False, regex=True)

# Do some other clean up to make the lyrics easier to process
df["lyrics"] = df["lyrics"].str.replace(r"\[Produced by.*\]\s", "", case=False, regex=True)
df["lyrics"] = df["lyrics"].str.replace(r"[\n\r\f]{2,}", "\n", case=False, regex=True)
df["lyrics"] = df["lyrics"].str.replace(r"[ \t\v]{2,}", " ", case=False, regex=True)

In [7]:
# Keep songs where the lyrics contain no other bracketed words other than recognized part of song markers
df = df[df["lyrics"].str.contains(r"\[[^\]]*(?:(?<!INTRO)(?<!HOOK)(?<!VERSE)(?<!CHORUS)(?<!REFRAIN)(?<!INTERLUDE)(?<!BRIDGE)(?<!OUTRO))\][^\]]*", case=False, regex=True) == False]
df[["tag", "title"]].groupby("tag").count()

Unnamed: 0_level_0,title
tag,Unnamed: 1_level_1
country,37786
rap,477694
rb,77286
rock,179995


In [8]:
# Keep only songs that contain at least one of the part of song markers
df = df[df["lyrics"].str.contains(r"\[[^\]]*(?:(INTRO|HOOK|VERSE|CHORUS|REFRAIN|INTERLUDE|BRIDGE|OUTRO))[^\]]*\]", regex=True) == True]
df[["tag", "title"]].groupby("tag").count()

  df = df[df["lyrics"].str.contains(r"\[[^\]]*(?:(INTRO|HOOK|VERSE|CHORUS|REFRAIN|INTERLUDE|BRIDGE|OUTRO))[^\]]*\]", regex=True) == True]


Unnamed: 0_level_0,title
tag,Unnamed: 1_level_1
country,37786
rap,477509
rb,77279
rock,179966


In [9]:
df = df.groupby("tag", as_index=False).apply(lambda x: x.sample(25000, random_state=1868))
df[["tag", "title"]].groupby("tag").count()

Unnamed: 0_level_0,title
tag,Unnamed: 1_level_1
country,25000
rap,24999
rb,24999
rock,24998


In [10]:
for (i, lyric) in enumerate(df["lyrics"].iloc[0:3]):
    print(f"Song #{i+1:d}:")
    print("-------")
    print(lyric)
    print("\n\n")

Song #1:
-------
[VERSE]
Met her on a Tuesday
When I walked into her store
Looked into her sweet eyes
And I knew I wanted more
Than I'd come for
[CHORUS]
I called her up and I asked if she'd see me
The girl from the sex shop
She met me at the truck stop
And the rest is history
[CHORUS]
In my heart I know she's the one I've been hopin' for all my life
One day soon, even in Alberta
She could be my wife
[VERSE]
I've been kinda lookin'
But the prospects were a pain
Coachin' the conductor
And a woman gone insane
(What was her name?)
And then I met Tamulah, the one who stole my heart
She made me stay up late
She called me rifle bait
When we talked all night inside her car
[CHORUS]
You should know that she's the one I've been hopin' for all my life
One day soon, even in Alberta
She could be my wife
[CHORUS]
So I met Tamulah, my girl with sweaty hands
The girl from the sex shop
Who met me at the truck stop
Said she'd be my man
[CHORUS]
She's the one I've been hopin' for all my life
One day soo

In [11]:
df.to_csv("sampled_song_lyrics_en.csv")