In [1]:
# Imports
import pandas as pd
import re

In [4]:
def cleanLyrics(text):
    '''
    Function -> cleanLyrics() Cleans the lyrics scraped from Genius API by prepping it for regular use.

    Steps, in order:
      1. Every bracketed tag such as [Chorus], [Verse X], [Person Singing]
         (together with the whitespace around it) is replaced by one newline.
      2. The text is split into one chunk per song on the "New Song" separator.
      3. In each chunk, everything up to and including the first "Lyrics"
         marker (the gibberish before the song starts) is dropped. A chunk
         that does not contain the marker is kept whole.
      4. Every "Embed" marker and the whitespace directly before it is removed.
         Digits directly in front of "Embed" are left in place.

    (Input): String of lyrics to clean
    (Output): A list of strings of songs, one entry per chunk produced by the
              split. Text that starts or ends with "New Song" therefore
              yields an empty-string entry at that position.
    '''
    header_marker = "Lyrics"

    # Removing brackets like [chorus], [verse] ...
    text = re.sub(r'\s*\[.*?\]\s*', '\n', text)

    songs = []
    for song in text.split("New Song"):
        # Removing gibberish before song lyrics start.
        # str.find returns -1 when the marker is absent; in that case keep the
        # whole chunk rather than slicing from an offset computed from -1.
        marker_pos = song.find(header_marker)
        if marker_pos == -1:
            lyrics = song
        else:
            lyrics = song[marker_pos + len(header_marker):]
        songs.append(re.sub(r'\s*Embed', '', lyrics))

    return songs


In [3]:
"""
Cleans the lyrics scraped by ScrapeLyrics.ipynb and writes a dataframe containing the
lyrics corresponding to each song/artist/genre/gender to lyricsDataset2.csv. The lyrics in the
Lyrics column are then ready for further cleaning by the user (i.e. they still include stopwords, capitals, punctuation, etc.).
"""
def main():
    """
    Build the lyrics dataset and write it to "lyricsDataset2.csv".

    Reads the song metadata from "songsOfYear.csv" (no header row is expected;
    column names are supplied here), reads "lyrics{year}.txt" for every tenth
    year from 2022 down to 1952, cleans each file with cleanLyrics(), and
    stores the cleaned lyrics in the "Lyrics" column. A "Year" column (stored
    as a string) is added as well.

    Raises:
        ValueError: if fewer cleaned lyrics than dataframe rows were collected.
    """
    # Rows are assumed to be laid out in blocks of this many songs per year,
    # in the same order as `years` below — TODO confirm against songsOfYear.csv.
    songs_per_year = 100

    df = pd.read_csv("songsOfYear.csv", names=["Billboard Hit Number","Songs","Artist","Genre","Gender","Lyrics"])

    years = list(range(2022, 1942, -10))
    songs = []
    for i, year in enumerate(years):
        with open(f'lyrics{year}.txt', 'r') as f:
            text = f.read()
        # Adding a "Year" column to df. .loc slices by label and includes both
        # end points, hence the "- 1" on the upper bound.
        first_row = i * songs_per_year
        last_row = (i + 1) * songs_per_year - 1
        df.loc[first_row:last_row, "Year"] = str(year)
        songs.extend(cleanLyrics(text))

    # Fail with a clear message instead of pandas' generic length-mismatch error.
    if len(songs) < len(df):
        raise ValueError(
            f"Only {len(songs)} cleaned lyrics were collected for {len(df)} rows in songsOfYear.csv"
        )

    # slice songs list to match length of df index
    df['Lyrics'] = songs[:len(df)]

    # Write final df to csv
    # NOTE(review): the index is written as well (to_csv default), so reading
    # the file back produces an extra unnamed column — consider index=False
    # once downstream readers are confirmed not to rely on it.
    df.to_csv("lyricsDataset2.csv")

# Run the pipeline only when this code executes as the top-level module
# (not when it is imported from elsewhere).
if __name__ == "__main__":
    main()

In [18]:
# Double checking all data is in there
# NOTE(review): this reads "lyricsDataset.csv", whereas main() writes
# "lyricsDataset2.csv" — confirm this is the file that is meant to be checked.
new = pd.read_csv("lyricsDataset.csv")
# Bare last expression so the notebook renders the dataframe.
new

Unnamed: 0.1,Unnamed: 0,Billboard Hit Number,Songs,Artist,Genre,Gender,Lyrics,Year
0,0,1,"""Heat Waves""",Glass Animals,Psychedelic pop,Group,"\n(Last night, all I think about is you)\n(Don...",2022
1,1,2,"""As It Was""",Harry Styles,Synth-pop,Male,Holdin' me back\nGravity's holdin' me back\nI...,2022
2,2,3,"""Stay""",The Kid Laroi and Justin Bieber,Pop rap,Male,\nI do the same thing I told you that I never ...,2022
3,3,4,"""Easy on Me""",Adele,Pop,Female,\nThere ain't no gold in this river\nThat I've...,2022
4,4,5,"""Shivers""",Ed Sheeran,Dance-pop,Male,\nI took an arrow to the heart\nI never kissed...,2022
...,...,...,...,...,...,...,...,...
725,725,26,"""I'm Yours""",Don Cornell,,Male,I'm yours\nHeart and soul I am yours\nCan't yo...,1952
726,726,27,"""I'll Walk Alone""",Don Cornell,"Popular music, Musical film",Male,I'll walk alone\nBecause to tell you the truth...,1952
727,727,28,"""Tell Me Why""",Eddie Fisher,,Male,"Tell me why, though I try to forget\nTell me w...",1952
728,728,29,"""Trying""",The Hilltoppers,Traditional pop,Group,I'm tryin' to forget you\nBut try as I may\nYo...,1952
