In [1]:
import pandas as pd
import numpy as np
import re
from numpy.random import randint 
from gensim.parsing.preprocessing import remove_stopwords

## Data loading and initial cleaning

In [3]:
# df = pd.read_csv("data/song_lyrics.csv")
# df = df[df['language']=="en"]

In [None]:
df = df[df['language']=="en"]

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.dtypes

In [None]:
df = pd.read_csv("data/en_song_lyrics.csv")

In [None]:
# checking null values 
df.isna().sum()

In [None]:
# Dropping null values
df= df.dropna(how="any")

In [None]:
# Checking for duplicates
df.duplicated().sum()

In [None]:
df.shape

In [None]:
years = df['year'].unique()
years

In [None]:
# df.to_csv("data/clean_en_song_lyrics.csv")

# FURTHER DATA CLEANING AND FILTERING

### Creating "song_id" from title and artist name

In [2]:
df=pd.read_csv("data/clean_en_song_lyrics.csv")

In [3]:
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
0,0,0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1,en,en,en
1,1,1,Can I Live,rap,JAY-Z,1996,468624,{},"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...",3,en,en,en
2,2,2,Forgive Me Father,rap,Fabolous,2003,4743,{},Maybe cause I'm eatin\nAnd these bastards fien...,4,en,en,en
3,3,3,Down and Out,rap,Cam'ron,2004,144404,"{""Cam\\'ron"",""Kanye West"",""Syleena Johnson""}",[Produced by Kanye West and Brian Miller]\n\n[...,5,en,en,en
4,4,4,Fly In,rap,Lil Wayne,2005,78271,{},"[Intro]\nSo they ask me\n""Young boy\nWhat you ...",6,en,en,en
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3374066,3374193,5134847,Everything Is Alright Now,pop,Chuck Bernard,2013,2,{},"Everything is alright now\nOh yes, baby\nEvery...",7882838,en,en,en
3374067,3374194,5134849,White Lies,pop,ElementD,2019,1,"{""Harley Bird""}",[Verse 1]\nHalf truth and half you\nDidn't we ...,7882840,en,en,en
3374068,3374195,5134851,Ocean,pop,Effemar,2022,3,{},[Verse 1]\nDance for me now\nKeeping yourself ...,7882842,en,en,en
3374069,3374196,5134853,Raise Our Hands,pop,"Culture Code, Pag & Mylo",2016,3,"{Elex,""Culture Code / Pag & Mylo""}",[Verse 1]\nHere our purpose feels alive\nWe ar...,7882845,en,en,en


In [4]:
df=df.drop(columns=["Unnamed: 0", "Unnamed: 0.1"])

In [5]:
# Standardize strings in 'artist' and 'title' columns
df['artist_clean'] = df['artist'].str.strip().str.lower().str.replace('"', '').str.replace("'", "").str.replace(",", "").str.replace(".", "").str.replace("!", "").str.replace("?", "").str.replace("&", "and")
df['title_clean'] = df['title'].str.strip().str.lower().str.replace('"', '').str.replace("'", "").str.replace(",", "").str.replace(".", "").str.replace("!", "").str.replace("?", "").str.replace("&", "and")

# Combine 'artist' and 'title' to create 'song_id' column
df['song_id'] = (df['artist_clean'] + '_' + df['title_clean']).str.replace(' ', '_')


In [6]:
#Importing scraped weekly top 100 billboard songs
df_top = pd.read_csv("data/billboard_weekly_1958-2024.csv")

In [7]:
df_top.shape

(30045, 4)

In [8]:
df.shape

(3374071, 14)

In [9]:
# Standardize strings in 'artist' and 'title' columns
df_top['artist_clean'] = df_top['artist'].str.strip().str.lower().str.replace('"', '').str.replace("'", "").str.replace(",", "").str.replace(".", "").str.replace("!", "").str.replace("?", "").str.replace("&", "and")
df_top['title_clean'] = df_top['title'].str.strip().str.lower().str.replace('"', '').str.replace("'", "").str.replace(",", "").str.replace(".", "").str.replace("!", "").str.replace("?", "").str.replace("&", "and")

# Combine 'artist' and 'title' to create 'song_id' column
df_top['song_id'] = (df_top['artist_clean'] + '_' + df_top['title_clean']).str.replace(' ', '_')

In [10]:
df_filtered = df[df['song_id'].isin(df_top['song_id'])].reset_index(drop=True)
print(f"Percentage of songs matched:", round((len(df_filtered)/len(df_top)*100),2))

Percentage of songs matched: 60.23


In [11]:
# Find songs in df_top that are not present in df
missing_songs_top = df_top[~df_top['song_id'].isin(df['song_id'])]

print("Songs in df_top that are not present in df:")
print(missing_songs_top[['artist_clean', 'title_clean']])


Songs in df_top that are not present in df:
                                            artist_clean  \
0                                          groove holmes   
1                                          groove holmes   
3                                          pookie hudson   
13                                     weird al yankovic   
16                                           til tuesday   
...                                                  ...   
30040                            william and nicki minaj   
30041                    william featuring justin bieber   
30042   william featuring mick jagger and jennifer lopez   
30043                      william featuring miley cyrus   
30044  william featuring miley cyrus french montana w...   

                     title_clean  
0                          misty  
1               what now my love  
3                  i know i know  
13               white and nerdy  
16     (believed you were) lucky  
...                      

In [12]:
# Investigating missing songs: 

search_word = ""
artist = "william"

matching_titles = df_top[(df_top['artist'].str.contains(search_word, case=False))]
# df_top[(df_top['artist'].fillna('').str.lower() == artist.lower()) & 
                     
matching_titles

Unnamed: 0,artist,title,date,year,artist_clean,title_clean,song_id
0,"""Groove"" Holmes",Misty,1966-08-30,1966,groove holmes,misty,groove_holmes_misty
1,"""Groove"" Holmes",What Now My Love,1966-10-11,1966,groove holmes,what now my love,groove_holmes_what_now_my_love
2,"""Little"" Jimmy Dickens",May The Bird Of Paradise Fly Up Your Nose,1965-12-14,1965,little jimmy dickens,may the bird of paradise fly up your nose,little_jimmy_dickens_may_the_bird_of_paradise_...
3,"""Pookie"" Hudson",I Know I Know,1963-05-21,1963,pookie hudson,i know i know,pookie_hudson_i_know_i_know
4,"""Weird Al"" Yankovic",Amish Paradise,1996-07-07,1996,weird al yankovic,amish paradise,weird_al_yankovic_amish_paradise
...,...,...,...,...,...,...,...
30040,will.i.am & Nicki Minaj,Check It Out,2011-01-15,2011,william and nicki minaj,check it out,william_and_nicki_minaj_check_it_out
30041,will.i.am Featuring Justin Bieber,#thatPOWER,2013-07-20,2013,william featuring justin bieber,#thatpower,william_featuring_justin_bieber_#thatpower
30042,will.i.am Featuring Mick Jagger & Jennifer Lopez,T.H.E (The Hardest Ever),2012-02-18,2012,william featuring mick jagger and jennifer lopez,the (the hardest ever),william_featuring_mick_jagger_and_jennifer_lop...
30043,will.i.am Featuring Miley Cyrus,Fall Down,2013-05-04,2013,william featuring miley cyrus,fall down,william_featuring_miley_cyrus_fall_down


In [13]:
# Need to deal with 'featuring' and "and" - extract main artist

def extract_main_artist(artist):
    if 'and' in artist:
        return artist.split(' and ')[0].strip()
    elif 'featuring' in artist:
        return artist.split(' featuring ')[0].strip()
    elif 'feat' in artist:
        return artist.split(' feat ')[0].strip()
    elif 'feat.' in artist:
        return artist.split(' feat.')[0].strip()
    elif 'with' in artist:
        return artist.split(' with ')[0].strip()
    elif ',' in artist:
        return artist.split(',')[0].strip()
    else:
        return artist
    



In [14]:
# Apply the function to create 'main_artist' column in df_top DataFrame
df_top['main_artist'] = df_top['artist_clean'].apply(extract_main_artist)

# Apply the function to create 'main_artist' column in df DataFrame
df['main_artist'] = df['artist_clean'].apply(extract_main_artist)

In [15]:
# make new ids

df_top['song_id2'] = (df_top['main_artist'] + '_' + df_top['title_clean']).str.replace(' ', '_')
df['song_id2'] = (df['artist_clean'] + '_' + df['title_clean']).str.replace(' ', '_')

In [16]:
df_filtered2 = df[df['song_id2'].isin(df_top['song_id2'])].reset_index(drop=True)
print(f"Percentage of songs matched:", round((len(df_filtered2)/len(df_top)*100),2))

Percentage of songs matched: 63.12


In [17]:
df_filtered2.shape

(18964, 16)

In [18]:
#Investigate again:

# Find songs in df_top that are not present in df
missing_songs_top = df_top[~df_top['song_id2'].isin(df['song_id2'])]

print("Songs in df_top that are not present in df:")
print(missing_songs_top[['main_artist', 'title']])

Songs in df_top that are not present in df:
                                             main_artist  \
0                                          groove holmes   
1                                          groove holmes   
3                                          pookie hudson   
13                                     weird al yankovic   
15                                                  $not   
...                                                  ...   
30039                                            william   
30040                                            william   
30041                                            william   
30042                      william featuring mick jagger   
30044  william featuring miley cyrus french montana w...   

                          title  
0                         Misty  
1              What Now My Love  
3                 I Know I Know  
13                White & Nerdy  
15                         Doja  
...                         ...

In [19]:
# Apply the function to create 'main_artist' column in df_top DataFrame
df_top['main_artist'] = df_top['main_artist'].apply(extract_main_artist)

# Apply the function to create 'main_artist' column in df DataFrame
df['main_artist'] = df['main_artist'].apply(extract_main_artist)

In [20]:
df_top['song_id2'] = (df_top['main_artist'] + '_' + df_top['title_clean']).str.replace(' ', '_')
df['song_id2'] = (df['main_artist'] + '_' + df['title_clean']).str.replace(' ', '_')

In [21]:
df_filtered3 = df[df['song_id2'].isin(df_top['song_id2'])].reset_index(drop=True)
print(f"Percentage of songs matched:", round((len(df_filtered3)/len(df_top)*100),2))

Percentage of songs matched: 69.23


Improvement 68% -> 69.23%

In [22]:
df_filtered3.shape

(20801, 16)

In [23]:
df_filtered3.to_csv("data/songs_filtered.csv")

### Text preprocessing

In [24]:
def clean_text(text):

    text = text.replace('\n', ' ')
    text = re.sub(r'[,\.!?]', '', text)
    text = re.sub(r'\[.*?\]', ' ', text)
    text = re.sub(r'\w*\d\w*',' ', text)
    text = re.sub(r'[()]', ' ', text)
    text = text.lower()
    text = re.sub(r'\b(chorus|verse|intro)\b', '', text)
    return text

In [25]:
df_filtered3['lyrics_clean'] = df_filtered3['lyrics'].astype(str).apply(lambda x: clean_text(x))
df_filtered3

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language,artist_clean,title_clean,song_id,main_artist,song_id2,lyrics_clean
0,Mr. Carter,rap,Lil Wayne,2008,542488,{JAY-Z},[Produced by Infamous and Drew Correa]\n\n[Int...,126,en,en,en,lil wayne,mr carter,lil_wayne_mr_carter,lil wayne,lil_wayne_mr_carter,yo yo drew and inf this-this this right h...
1,Pop Bottles,rap,Birdman,2007,93210,"{""Lil Wayne""}","[Hook: Jadakiss, Lil Wayne & Birdman]\nStart w...",112,en,en,en,birdman,pop bottles,birdman_pop_bottles,birdman,birdman_pop_bottles,start with straight shots and then pop bottl...
2,Fireman,rap,Lil Wayne,2005,147351,{},"[Intro]\n(Weezy Baby)\nShh, the fireman comin'...",39,en,en,en,lil wayne,fireman,lil_wayne_fireman,lil wayne,lil_wayne_fireman,weezy baby shh the fireman comin' yeah yea...
3,Brooklyn Zoo,rap,Ol' Dirty Bastard,1995,183750,"{""Ol\\' Dirty Bastard""}",[Produced by True Master & Ol' Dirty Bastard]\...,47,en,en,en,ol dirty bastard,brooklyn zoo,ol_dirty_bastard_brooklyn_zoo,ol dirty bastard,ol_dirty_bastard_brooklyn_zoo,shit word i'll bust that nigga ass right ...
4,A Milli,rap,Lil Wayne,2008,1237174,{},[Intro]\nBangladesh\nYoung Money!\nYou dig?\nM...,59,en,en,en,lil wayne,a milli,lil_wayne_a_milli,lil wayne,lil_wayne_a_milli,bangladesh young money you dig mack i'm goin...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20796,For a While,pop,Mary Macgregor,1976,2,{},[Verse 1]\nI think I'll stay around here for a...,7866850,en,en,en,mary macgregor,for a while,mary_macgregor_for_a_while,mary macgregor,mary_macgregor_for_a_while,i think i'll stay around here for a while i ...
20797,Dancin Like Lovers,pop,Mary Macgregor,1980,1,{},[Verse 1]\nThe music's playing softly in the s...,7866892,en,en,en,mary macgregor,dancin like lovers,mary_macgregor_dancin_like_lovers,mary macgregor,mary_macgregor_dancin_like_lovers,the music's playing softly in the summer nig...
20798,Thump Shit,rap,42 Dugg & EST Gee,2022,821,{},"[Intro: 42 Dugg]\nFree them boys, we them (Me ...",7871619,en,en,en,42 dugg and est gee,thump shit,42_dugg_and_est_gee_thump_shit,42 dugg,42_dugg_thump_shit,free them boys we them me and spiff ayy fr...
20799,In my Head,rap,Lil Tjay & Lil XXEL,2022,18,"{""Lil Xxel""}",[Intro]\nShawty's like a melody in my head tha...,7877849,en,en,en,lil tjay and lil xxel,in my head,lil_tjay_and_lil_xxel_in_my_head,lil tjay,lil_tjay_in_my_head,shawty's like a melody in my head that i can...


In [26]:
# Removing stopwords

df_filtered3['lyrics_clean'] = df_filtered3['lyrics_clean'].astype(str).apply(lambda x: remove_stopwords(x))

## Checking matching 

In [27]:
df_top.shape

(30045, 9)

In [28]:
df_top3 = df_top.drop_duplicates(
    subset=['song_id2'],
    keep='first'
).reset_index(drop=True)

df_top3.shape


(30034, 9)

In [29]:
len(df_top)

30045

In [30]:
df_top2 = df_top[df_top['song_id2'].isin(df_filtered3['song_id2'])].reset_index(drop=True)


In [31]:
df_filtered3.shape

(20801, 17)

In [32]:
unique_top2 = df_top2['song_id2'].unique()
unique_df_f3 = df_filtered3['song_id2'].unique()

In [33]:
print(unique_top2.size)
print(unique_df_f3.size)

20549
20549


In [34]:
duplicate_count1 = df_top2['song_id2'].duplicated().sum()
duplicate_count2 = df_filtered3['song_id2'].duplicated().sum()

print(duplicate_count1)

print(duplicate_count2)

9
252


In [35]:
df_top2 = df_top2.drop_duplicates(
    subset=['song_id2'],
    keep='first'
).reset_index(drop=True)

In [36]:
df_filtered3 = df_filtered3.drop_duplicates(
    subset=['song_id2'],
    keep='first'
).reset_index(drop=True)



In [37]:
duplicate_count1 = df_top2['song_id2'].duplicated().sum()
duplicate_count2 = df_filtered3['song_id2'].duplicated().sum()

print(duplicate_count1)

print(duplicate_count2)

0
0


In [38]:
selected_columns = ['song_id2','artist','title','tag' ,'year', 'lyrics']

df_filtered3 = df_filtered3[selected_columns]

In [39]:
df_filtered3 = df_filtered3.rename(columns={"song_id2": "song_id"})


In [40]:
df_filtered3.to_csv("data/songs_lyrics_filtered.csv")

In [41]:
selected_columns2 = ['song_id','artist','title','date','year']
df_top2 = df_top2[selected_columns2]
df_top2 = df_top2.rename(columns={"song_id2": "song_id"})

In [42]:
df_top2.to_csv("data/popular_songs.csv")

In [43]:
df_top2

Unnamed: 0,song_id,artist,title,date,year
0,little_jimmy_dickens_may_the_bird_of_paradise_...,"""Little"" Jimmy Dickens",May The Bird Of Paradise Fly Up Your Nose,1965-12-14,1965
1,weird_al_yankovic_amish_paradise,"""Weird Al"" Yankovic",Amish Paradise,1996-07-07,1996
2,weird_al_yankovic_canadian_idiot,"""Weird Al"" Yankovic",Canadian Idiot,2006-11-04,2006
3,weird_al_yankovic_eat_it,"""Weird Al"" Yankovic",Eat It,1984-05-26,1984
4,weird_al_yankovic_fat,"""Weird Al"" Yankovic",Fat,1988-05-28,1988
...,...,...,...,...,...
20544,twenty_one_pilots_stressed_out,twenty one pilots,Stressed Out,2016-10-01,2016
20545,twenty_one_pilots_tear_in_my_heart,twenty one pilots,Tear In My Heart,2015-08-08,2015
20546,william_i_got_it_from_my_mama,will.i.am,I Got It From My Mama,2007-10-20,2007
20547,william_its_a_new_day,will.i.am,It's A New Day,2009-02-07,2009
