In [2]:
#As always, we import everything
import csv
import json
import os
import re
import urllib
import urllib.parse
import warnings
from urllib.request import urlopen, HTTPError

import numpy as np
import pandas as pd

import lyricfetcher
from langdetect import DetectorFactory
from langdetect import detect
from tqdm import tqdm

warnings.filterwarnings('ignore')

In [1]:
# Folder (relative to the notebook) that holds every input and output file.
data_dir = './data'

# README

In [23]:
# Genre annotations: tab-separated "track_id<TAB>genre" rows; lines starting with '#' are skipped.
df = pd.read_csv(os.path.join(data_dir, 'msd_tagtraum_cd2c.cls'), comment='#',
                 names=['track_id', 'genre'], sep='\t')
genre_dataset = df[['genre', 'track_id']].set_index('track_id')

# Year listing: fields are separated by the literal token '<SEP>'.
# A multi-character separator is only handled by the python parser, so the engine is
# requested explicitly instead of relying on pandas' silent fallback (and its ParserWarning).
year_dataset = pd.read_table(os.path.join(data_dir, 'tracks_per_year.txt'), delimiter='<SEP>',
                             names=('year', 'track_id', 'artist_name', 'title'),
                             index_col='track_id', engine='python')

In [24]:
# Inner join on the shared track_id index: only tracks that have both a
# year entry and a genre label survive.
year_artist_name_title_genre = year_dataset.merge(
    genre_dataset,
    how='inner',
    left_index=True,
    right_index=True,
)

In [26]:
# Persist the merged table inside data_dir. The previous plain concatenation lacked the
# path separator and produced './datayear_artist_name_title_genre.csv', which is not the
# file that the later "Load the MSD dataset" cell reads back.
year_artist_name_title_genre.to_csv(os.path.join(data_dir, 'year_artist_name_title_genre.csv'))

# Lyrics scraping code

In [9]:
def _clean_lyrics(raw_lyrics):
    '''Flatten lyrics to one line, strip [bracketed] annotations and remove commas.'''
    flattened = str(raw_lyrics).replace('\n', ' ')
    without_tags = re.sub(r'\[.*?\]', '', flattened)
    # Commas are removed so that the lyrics can be stored in a CSV cell.
    return without_tags.replace(',', '')


def _fetch_from_lyric_api(artist, title, timeout=30):
    '''Query the herokuapp lyric API and return the raw 'lyric' field of its JSON reply.'''
    # Percent-encode the whole path segment (not only spaces) so that names containing
    # '/', '?', '&', '#' or non-ASCII characters still produce a valid URL.
    url = ('http://lyric-api.herokuapp.com/api/find/'
           + urllib.parse.quote(artist, safe='') + '/'
           + urllib.parse.quote(title, safe='') + '.html')
    # The context manager closes the connection; the timeout keeps one dead request
    # from stalling the whole run.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        return json.load(response)['lyric']


def scrape_lyrics(artists_list, songs_list, min_words=3):
    '''
    Main scraping function.

    Input:
        artists_list, songs_list: parallel lists; entry i of each describes one song.
        min_words: minimum number of whitespace-separated words a cleaned result
                   must contain to be accepted (default 3).
    Output:
        (lyrics, lyrics_not_found)
        lyrics: one cleaned string per input song, in input order; "" when every
                source failed.
        lyrics_not_found: list of [artist, song] pairs for which every source failed.
    Raises:
        ValueError if either list is empty or the two lists differ in length.

    Sources are tried in this order until one returns usable lyrics:
    - metrolyrics
    - azlyrics
    - lyricswikia
    - the herokuapp online API (unknown source)

    A source that raises is treated as "no lyrics" and the next source is tried,
    so a single broken site no longer makes the whole song fail.
    '''
    if len(artists_list) == 0 or len(songs_list) == 0:
        raise ValueError('The provided artists list or songs list is empty')

    if len(artists_list) != len(songs_list):
        print("artist list has len: ", len(artists_list))
        print("songs list has len: ", len(songs_list))
        raise ValueError('The provided artists and songs lists have different lenghts')

    # Ordered fallbacks; each callable takes (artist, title) and returns raw lyrics.
    fetchers = [
        lambda artist, title, site=site: lyricfetcher.get_lyrics(site, artist, title)
        for site in ('metrolyrics', 'azlyrics', 'lyricswikia')
    ]
    fetchers.append(_fetch_from_lyric_api)

    lyrics = []
    lyrics_not_found = []

    for artist, title in tqdm(zip(artists_list, songs_list), total=len(artists_list)):
        found = ""
        for fetch in fetchers:
            try:
                candidate = _clean_lyrics(fetch(artist, title))
            except Exception:
                # Best effort: network errors, parsing errors or odd inputs simply mean
                # "this source has nothing". KeyboardInterrupt is deliberately NOT
                # caught, so a long scrape can still be stopped by hand.
                continue
            if len(candidate.split()) >= min_words:
                found = candidate
                break

        if not found:
            lyrics_not_found.append([artist, title])
        # Always append, so that lyrics[i] stays aligned with artists_list[i].
        lyrics.append(found)

    return lyrics, lyrics_not_found

## Create positive, feminist playlist dataframe

In [10]:
# Provide BELOW the full set of feminist tracks.
# The two lists are paired by position: positive_artists_to_scrape[i] performs
# positive_titles_to_scrape[i]. They must therefore stay the same length and in the same
# order; scrape_lyrics raises ValueError when the lengths differ.
#-----------------------------------------------------------------------------
positive_artists_to_scrape = ["Sugababes","Helen Reddy","Gwen Guthrie","Jennifer Lopez","Tinashe","Sheryl Crow","Lisa Stansfield",
                     "Ethel Merman", "Hole", "Geri Halliwell","Hole","Christina Aguilera","Shakira","Cardi B","Destiny's Child",
                    "Meredith Brooks","Lupe Fiasco","Little Mix","Iggy Azalea","Lady Gaga","Fifth Harmony","Kelis","Peaches",
                   "Ariana Grande","Tupac Shakur","Alexandra Burke","La Roux","Blondie","Christina Aguilera",
                    "Kelly Clarkson","All Saints","i5","Kelly Rowland", "Remy Ma","Demi Lovato","Janet Jackson",
                    "Tori Amos","TLC","Patti Smith","2Pac","Hole","Beyonce","Janelle Monae","Jessie J","Aretha Franklin",
                     "Mary J. Blige","Chris Janson","Babes in Toyland", "Janelle Monae", "Madonna","Christina Aguilera",
                     "Drake","Janet Jackson","Shane McAnally","Bomshel","Christina Aguilera",
                     "Beyonce","Queenadreena","Kacey Musgraves","Nina Simone","Lady Gaga","Lykke Li","Destiny's Child",
                     "Maddie & Tae","Alicia Keys","Cyndi Lauper","Mary Chapin Carpenter","Ariana Grande","Ashanti",
                     "Little Mix","Robyn","Lily Allen","Mary Chapin Carpenter", "Katy Perry",
                     "David Guetta", "Blu Cantrell", "Gwen Stefani", "Madonna", "Pussycat Dolls",
                     "Jordin Sparks","Kelly Clarkson","Kelly Clarkson","Amil","Christina Aguilera",
                     "Icona Pop", "Gloria Gaynor","Diana Ross","Chaka Khan", "Beyonce","Martina McBride",
                     "Destiny's Child","Beyonce","Whitney Houston","Little Mix","Madonna","No Doubt","2Pac",
                     "Christina Aguilera","TLC","Sonic Youth","Vera Blue","Lush","Kesha", "Sugarcubes","Idina Menzel","Miley Cyrus",
                    "Ciara","Alesha Dixon","Beyonce","Manic Street Preachers", "Little Mix","Jennifer Lopez","Nicki Minaj","K.Michelle",
                     "Hailee Steinfeld","Zara Larsson","Shania Twain","Cicely Hamilton","Selena Gomez","Beyonce",
                      "Kelly Clarkson","Hailee Steinfeld","Janet Jackson","Dua Lipa","Drake","Meghan Trainor",
                      "Lil B","Barbra Streisand","TLC","Franz Ferdinand","Salt-n-Pepa","Doubleclicks","Mariah Carey",
                      "X-Ray Spex","Tweet","Madonna","Brooke Candy","Katy Perry","Jodi Benson","Beyonce","Nervo",
                      "Taylor Swift","Loretta Lynn","Nirvana","Little Mix","L7","Maggie Lindemann","Keri Hilson",
                      "Britney Spears","Beyonce","Marina and the Diamonds","Janelle Monae","Janelle Monae","Javine",
                       "Demi Lovato","Bikini Kill","Sugababes","Aretha Franklin","Katy Perry","All Saints",
                    "Lauryn Hill","Beyonce","Little Mix","Spice Girls","Alessia Cara","The Gits","Mary Lambert",
                     "Green Day","Cyndi Lauper","Camila Cabello","Donna Summer","the Ramones","Lily Allen","Beyonce",
                     "Yoko Ono","Daya","Eliza Doolittle","Demi Lovato","Fifth Harmony","Miley Cyrus","P!nk","Madonna",
                     "Cher","Britney Spears","Kelly Clarkson","P!nk","Alicia Keys","Karyn White","Destiny's Child",
                     "Ava Max","P!nk","Queen Latifah","Fantasia","G.R.L","TLC","Sia","Natasha Bedingfield","Beyonce",
                     "Hole","Spice Girls","Nina Sublatti","Madonna","Gwen Stefani","The Pussycat Dolls","Jody Watley",
                     "The Pussycat Dolls","Vanessa Carlton","Baha Men","Selena Gomez","Robyn","Kelly Clarkson",
                     "Alessia Cara","Gwen Stefani","Kesha","John Lennon","Cher","Britney Spears","Alicia Keys",
                     "Amanda Lear","Fifth Harmony","Yoko Ono","Jax Jones","Lesley Gore","Alanis Morissette",
                     "coldplay"]


# Song titles, in the same order as the artists above.
positive_titles_to_scrape = ["About a Girl", "Ain't No Way to Treat a Lady","Ain't Nothin' Goin' On but the Rent","Ain't Your Mama",
                    "All Hands on Deck", "All I Wanna Do", "All Woman","Anything You Can Do (I Can Do Better)",
                    "Asking for It","Bag It Up","Be a Man","Beautiful","Beautiful Liar","Bickenhead",
                    "Bills Bills Bills","Bitch","Bitch Bad","Black Magic","Black Widow","Born This Way","Boss",
                    "Bossy","Boys Wanna Be Her","Break Free","Brenda's Got a Baby","Broken Heels","Bulletproof",
                    "Call Me","Can't Hold Us Down","Catch My Breath","Chick Fit",
                    "Cinderella","Commander","Conceited","Confident","Control","Cornflake Girl","Creep",
                    "Dancing Barefoot","Dear Mama","Dicknail","Diva","Django Jane","Do It like a Dude",
                    "Do Right Woman Do Right Man","Doubt","Drunk Girl","Dust Cake Boy","Electric Lady", 
                    "Express Yourself", "Fall in Line","Fancy","Feedback","Female","Fight Like a Girl",
                     "Fighter","Flawless","FM Doll","Follow Your Arrow","Four Women","G.U.Y.","Get Some","Girl",
                    "Girl in a Country Song","Girl on Fire","Girls Just Want to Have Fun","Girls with Guitars",
                     "God Is a Woman","Good Good","Hair","Handle Me","Hard out Here",
                     "He Thinks He'll Keep Her","Hey Hey Hey","Hey Mama", 
                     "Hit 'Em Up Style (Oops!)","Hollaback Girl","Human Nature","Hush Hush; Hush Hush",
                     "I Am Woman","I Do Not Hook Up",
                     "I Don't Think About You","I Got That","I Hate Boys",
                     "I Love It", "I Will Survive","I'm Coming Out","I'm Every Woman",
                     "If I Were a Boy","Independence Day","Independent Women",
                     "Irreplaceable",
                     "It's Not Right but It's Okay","Joan of Arc","Jump",
                     "Just a Girl","Keep Ya Head Up", "Keeps Gettin' Better", "Kick Your Game","Kool Thing",
                    "Lady Powers","Ladykillers","Learn to Let Go","Leash Called Love","Let It Go","Liberty Walk",
                    "Like a Boy","Lipstick","Listen","Little Baby Nothing","Little Me","Live It Up","Lookin Ass",
                     "Love 'Em All","Love Myself","Lush Life","Man! I Feel Like a Woman!","The March of the Women",
                     "Me & My Girls","Me Myself and I","Miss Independent","Most Girls",
                    "Nasty","New Rules (song)","Nice for What","No","No Black Person Is Ugly",
                     "No More Tears (Enough Is Enough)","No Scrubs","No You Girls","None of Your Business",
                     "Nothing to Prove","Obsessed", "Oh Bondage Up Yours!","Oops (Oh My)",
                     "Papa Don't Preach","Paper or Plastic","Part of Me","Part of Your World","Partition",
                     "People Grinnin'","Picture to Burn","The Pill","Polly","Power","Pretend We're Dead",
                     "Pretty Girl","Pretty Girl Rock","Pretty Girls","Pretty Hurts","Primadonna","Pynk",
                      "Q.U.E.E.N.","Real Things","Really Don't Care","Rebel Girl","Red Dress","Respect","Roar",
                    "Rock Steady","A Rose Is Still a Rose",
                     "Run the World (Girls)","Salute","Say You'll Be There","Scars to Your Beautiful","Second Skin",
                     "Secrets","She","She Bop","She Loves Control","She Works Hard for the Money","Sheena Is a Punk Rocker",
                     "Sheezus","Single Ladies (Put a Ring on It)","Sisters O Sisters","Sit Still Look Pretty",
                    "Skinny Genes","Skyscraper","Sledgehammer","SMS (Bangerz)","So What","Sorry","Strong Enough",
                    "Stronger", "Stronger (What Doesn't Kill You)","Stupid Girls","Superwoman","Superwoman","Survivor", 
                    "Sweet but Psycho","U + Ur Hand","U.N.I.T.Y.","Ugly","Ugly Heart","Unpretty","Unstoppable", 
                     "Unwritten","Upgrade U","Violet","Wannabe","Warrior","What It Feels Like for a Girl","What You Waiting For?",
                     "Whatcha Think About That","When a Man Loves a Woman","When I Grow Up",
                     "White Houses","Who Let the Dogs Out?","Who Says","Who's That Girl","Whole Lotta Woman",
                     "Wild Things","Wind It Up","Woman","Woman Is the Nigger of the World","Woman's World",
                     "Womanizer","A Woman's Worth","Women", "Worth It","Yang Yang","You Don't Know Me",
                    "You Don't Own Me","You Oughta Know", "paradise"]

#-----------------------------------------------------------------------------

In [11]:
# Scrape the lyrics of the feminist playlist.
# The list of songs that could not be found is kept under its own name: unpacking into
# `_` would throw it away and, in IPython, also stops `_` from tracking the last output.
positive_lyrics, positive_lyrics_not_found = scrape_lyrics(positive_artists_to_scrape, positive_titles_to_scrape)


100%|████████████████████████████████████████████████████████████████████████████████| 224/224 [00:49<00:00,  7.55it/s]


## Same with sexist songs

In [12]:
# Sexist playlist. The two lists are paired by position: sexist_artists_to_scrape[i]
# performs sexist_titles_to_scrape[i].
# Misspelt or whitespace-padded lookup keys were corrected because the scraper searches
# for these exact strings: "Slegehammer" -> "Sledgehammer",
# "Translyvania" -> "Transylvania", " Captain & Tennille" -> "Captain & Tennille",
# "Dirrty " -> "Dirrty", "Snoop Dog" -> "Snoop Dogg".
sexist_artists_to_scrape = [
    "Foreigner","Cyndi Lauper","Ray J","Adina Howard","Cassie","Tone-Loc",
    "George Michael","Donna Summer","Prince", "Johnny Gill",
    "Juvenile","Roberta Flack","Sylvia","Def Leppard","R. Kelly","Poison","Marvin Gaye","Donna Summer",
    "Labelle","Peter Gabriel","Missy Elliott","The Pointer Sisters","Heart","Usher","Karyn White",
    "Akon","R. Kelly","Bell Biv Devoe","Silk","Gregory Abbott","Starland Vocal Band","50 Cent",
    "Monica","Anita Ward","Nelly Furtado","Toni Braxton","Color Me Badd","Donna Summer","Lil Wayne",
    "The Rolling Stones","Rod Stewart","Exile","Madonna","Captain & Tennille","Donna Summer",
    "Marvin Gaye","Next","Boyz II Men","Rod Stewart","Olivia Newton-John","Lil Wayne","The Dead Exs",
    "Major Lazer","Young Money","Enrique Iglesias","Robin Thicke","Kiss","Britney Spears","Lil Wayne",
    "Dr. Dre","David Guetta","Nicki Minaj","Whitesnake","Anal Cunt","Chris Brown","Prince","Falco",
    "The Mentors","The Prodigy","Eminem","Eminem","Dierks Bentley","Nickelback","Young Money",
    "Van Halen","Robert Hazard","Demi Lovato","Bon Jovi","Alannah Myles","Prince","Manika",
    "Christina Aguilera","Rihanna","Maroon 5","Fetty Wap","Akon","Raelynn","Sir Mix A Lot",
    "Notorious BIG","Tyler Farr","Usher","Syd","The Citrus Cloud","Ro James","SiR","Miguel","Charlie Puth",
    "Miguel","Nao","Harry Styles","Lana Del Rey","Lana Del Rey","Shaggy","The Crystals",
    "One Direction","Beatles","The Weeknd","Alex Gaudino","Bob Sinclar","Vleger","Sasha Lopez",
    "David Guetta","Pitbull","Eric Prydz","Madonna","Fedde Le Grand","Black Eyed Peas","Eminem",
    "Frank Loesser","Jason Derulo","YG","Tyler The Creator","Dr. Dre","Outkast","Eminem",
    "Waka Flocka Flame","Lil Wayne","Ying Yang Twins","Common","Ghostface Killah","Eminem","Eminem",
    "Eminem","Eminem","Snoop Dogg","Eminem","Jeremih","Chris Brown","Timbaland","Milow",
    "Chris Brown","Akon","Akon","Pitbull","Pitbull","Rocko","Rocko","Asap Rocky","Benny Benassi",
    "erykah badu", "nina simone","mungo jerry"]


# Song titles, in the same order as the artists above.
sexist_titles_to_scrape = [
    "Urgent","She bop","Sexy Can I","Freak Like Me","Me & U",
    "Wild Thing","I Want Your Sex","Dim All The Lights","Raspberry Beret","Rub you the Right Way",
    "Slow Motion","Feel Like Makin' Love","Pillow Talk","Pour Some Sugar on Me","Bump N' Grind",
    "Unskinny Bop","Sexual Healing","Love To Love You Baby","Lady Marmalade","Sledgehammer",
    "Work It","Slow Hand","All I Want To Do Is Make Love To You","Love In This Club","Romantic",
    "Smack That","Ignition","Do Me!","Freak Me","Shake You Down","Afternoon Delight","Candy Shop",
    "The First Night","Ring My Bell","Promiscuous","You're Makin' Me High","I Wanna Sex You Up",
    "Bad Girls","Lollipop","Honky Tonk Women","Da Ya Think I'm Sexy","Kiss You All Over",
    "Like A Virgin","Do That To Me One More Time","Hot Stuff","Let's Get It On","Too Close",
    "I'll Make Love To You","Tonight's The Night","Physical","Love Me","Shut Up and Love Me","Bubble Butt",
    "Lookin' Ass","Tonight (I'm Fuckin' You)","Blurred Lines","I Just Wanna","I'm a Slave 4 U","How to Love",
    "Bitches Ain't Shit","Hey Mama","Anaconda","Slow An' Easy","Woman: Nature's Punching Bag",
    "Loyal","Cream","Jeanny","Golden Shower","Smack My Bitch Up","Superman","Love the Way You Lie",
    "Different for Girls","Something in Your Mouth","Every Girl","Drop Dead Legs","Girls Just Want to Have Fun",
    "Body Say","Let's Make It Baby","Black Velvet","Kiss","I Might Go Lesbian","Dirrty","S&M",
    "Animals","679","I Wanna Fuck You","God Made Girls","Baby Got Back","Nasty Girl","Redneck Crazy",
    "Love In This Club","Smile More","Lost Wolves","Burn Slow","Ooh Nah Nah","Pussy Is Mine",
    "Suffer","...goingtohell","Bad Blood","Feels Like","Love","Lust For Life","It Wasn't Me","He Hit Me",
    "What Makes You Beautiful","Run For Your Life","High For This","Destination Calabria","What I Want",
    "After Night In Ibiza","Beautiful Life","Play Hard","International Love","Call On Me",
    "Hung Up","Put Your Hands Up For Detroit","My Humps","Kill You","Baby It's Cold Outside",
    "Wiggle","Toot it and Boot it","Transylvania","Bitches Ain't Shit","Hootie Hoo","Shake That",
    "No Hands","Alphabet Bitches","Wait Til Ya See My Dick","Heidi Hoe","Wildflower","Ass Like That",
    "Guilty Conscience","Cleaning Out My Closet","The Warning","Step Yo Game Up","Without Me",
    "Birthday Sex","Wet The Bed","Carry Out","Ayo Technology","Ayo","Sexy Bitch","I Just Had Sex",
    "Hotel Room","Hey Baby","You Don't Even Know It","U.O.E.N.O.","Fuckin' Problems","Satisfaction", "hello",
    "I put a spell on you","in the summertime"]

# Fail right here, next to the data, if an edit ever puts the two lists out of step.
assert len(sexist_artists_to_scrape) == len(sexist_titles_to_scrape), \
    "sexist artists/titles lists must have the same length"

In [13]:
# Scrape the lyrics of the sexist playlist.
# The list of songs that could not be found is kept under its own name: unpacking into
# `_` would throw it away and, in IPython, also stops `_` from tracking the last output.
sexist_lyrics, sexist_lyrics_not_found = scrape_lyrics(sexist_artists_to_scrape, sexist_titles_to_scrape)



100%|████████████████████████████████████████████████████████████████████████████████| 149/149 [00:31<00:00,  5.47it/s]


# Finally: neutral ones

In [14]:
# Neutral playlist. The two lists are paired by position: neutral_artists_to_scrape[i]
# performs neutral_titles_to_scrape[i].
# Misspelt lookup keys were corrected because the scraper searches for these exact strings:
#   titles : "shine on your crazy diamond", "an other brick in the wall", "deat meat",
#            "let's grovve", "all along the watchower", "people make the workd go round",
#            "welcome to machine", "smoke on tha water", "preacheman"
#   artists: "davie bowie", "the bugges"
neutral_titles_to_scrape = [
    "these arms of mine", "she", "green garden", "hotline bling", "three little birds",
    "masterblaster", "inner city blues", "move on up", "use me", "papa was a rolling stone",
    "richman poorman", "Is It Because I'm Black", "feeling good", "weird fishes",
    "all i need", "no room for doubt", "close to you", "how deep is your love",
    "if i ain't got you", "better in tune with the infinite", "comfortably numb",
    "shine on you crazy diamond", "wish you were here", "stairway to heaven",
    "space oddity", "california dreamin'", "hotel california", "another brick in the wall",
    "lose yourself to dance", "don't stop 'til you get enough", "mr. tambourine man",
    "blackbird", "breathe" , "time", "school", "sultans of swing", "dreams",
    "brothers in arms", "the sound of silence", "a horse with no name", "riders on the storm",
    "the look", "bohemian rhapsody","stuck in the middle with you", "superstition",
    "blame it on the boogie", "september", "celebration", "virtual insanity",
    "englishman in new york", "message in a bottle", "wonderwall", "creep", "no surprises",
    "zombie", "cold little heart","wasting my young years", "three little birds", "overcome",
    "redbone", "It Runs Through Me", "i need a dollar", "crazy", "this world",
    "i won't go for more", "Didn't Cha Know", "never gonna catch me", "them changes",
    "show you the way", "lovely day", "i wish", "sir duke", "living for the city",
    "don't let me down", "better together", "let it be", "river", "deathless",
    "let it happen", "kids", "parachute", "life on mars", "changes", "friendly fire",
    "dead meat", "no one knows", "lonely boy", "clint eastwood", "feel good inc.",
    "bittersweet symphony", "come together", "don't worry be happy", "stand by me", "happy",
    "get lucky", "let's groove", "boogie wonderland", "i can't go for that (no can do)",
    "owner of a lonely heart", "the logical song", "lady d'arbanville", "unstoppable",
    "the lung", "jekyll", "breathing underwater", "walking on the moon", "change",
    "that's the spirit", "wild world", "father and son", "piano man", "hallelujah",
    "the curse", "taro", "breezeblocks", "in cold blood", "i like that", "tightrope",
    "come over", "fitzpleasure", "peace of mind", "reason", "video killed the radio star",
    "once in a lifetime", "the block", "afro blue", "i try", "you don't know my name",
    "disco yes", "i will survive", "the bare necessities", "what a wonderful world",
    "when the saints go marching in", "everybody need somebody", "hit the road jack",
    "born to be wild", "all along the watchtower", "piece of my heart",
    "son of a preacher man", "mercy", "lumberjack soul", "tum rakak", "my world", "shoes",
    "people make the world go round", "gold", "chicken in the corn","skin teeth",
    "welcome to the machine", "us and them", "brain damage", "have a cigar", "money",
    "learning to fly", "high hopes", "lost for words", "simple man", "free bird",
    "sweet home alabama", "smoke on the water", "lazy", "child in time", "perfect strangers",
    "riverside", "just so", "blue lights", "jammin", "so much trouble in the world",
    "satisfy my soul", "crazy baldhead", "pimper's paradise", "what's going on",
    "have a talk with god", "don't you worry 'bout a thing", "ordinary pain",
    "knocks me off my feet", "summer soft", "too young to die", "strange fruit",
    "cry me a river", "clocks", "viva la vida",
    "titanium", "losing my religion", "in the air tonight", "proud mary",
    "time of the season", "a whiter shade of pale", "money for nothing", "walk of life",
    "brothers in arms", "private investigations", "telegraph road", "preacherman",
    "same to you", "your heart is as black as night", "who will comfort me"]

# Performing artists, in the same order as the titles above.
neutral_artists_to_scrape = [
    "otis redding", "laura mvula", "laura mvula", "drake", "bob marley", "stevie wonder",
    "marvin gaye", "curtis mayfield", "bill withers", "the temptations","clinton fearon",
    "Tiken Jah Fakoly", "nina simone", "radiohead", "radiohead", "lianne la havas",
    "carpenters","bee gees","alicia keys","ibeyi","pink floyd","pink floyd","pink floyd",
    "led zeppelin","david bowie","the mamas and the papas", "eagles", "pink floyd","daft punk",
    "michael jackson","bob dylan", "the beatles", "pink floyd", "pink floyd", "supertramp",
    "dire straits", "fleetwood mac", "dire straits", "paul simon", "america", "the doors","metronomy",
    "queen","stealers wheel","stevie wonder", "the jackson five", "earth wind & fire", "kool & the gang",
    "jamiroquai", "sting", "the police", "oasis", "radiohead", "radiohead","the cranberries", "Michael Kiwanuka",
    "london grammar", "bob marley", "laura mvula", "childish gambino", "tom misch", "aloe blacc", "gnarls barkley",
    "selah sue", "selah sue","erykah badu", "flying lotus","thundercat", "thundercat", "bill withers",
    "stevie wonder", "stevie wonder", "stevie wonder", "the beatles", "jack johnson","the beatles",
    "ibeyi", "ibeyi", "tame impala", "mgmt", "sean lennon", "david bowie", "david bowie", "sean lennon",
    "sean lennon", "queens of the stone age", "the black keys", "gorillaz", "gorillaz", "the verve",
    "the beatles", "bobby mcferrin", "ben e king", "pharrell williams", "daft punk",
    "earth wind & fire", "earth wind & fire", "hall & oates", "yes","supertramp", "cat stevens",
    "lianne la havas", "hiatus kaiyote", "hiatus kaiyote", "hiatus kaiyote", "the police", "sandra nkake",
    "lady bazaar", "cat stevens", "cat stevens", "billy joel", "jeff buckley", "agnes obel", "alt j", "alt j",
    "alt j","janelle monae", "janelle monae", "the internet", "alt j", "selah sue", "selah sue",
    "the buggles", "talking heads","akua naru", "erykah badu", "macy gray", "alicia keys","tom misch",
    "gloria gaynor", "louis armstrong", "louis armstrong", "louis armstrong","blues brothers", "ray charles",
    "steppenwolf", "the jimi hendrix experience", "janis joplin", "dusty springfield", "duffy",
    "miles bonny", "deluxe","deluxe", "deluxe", "michael jackson", "andreya triana", "brushy one string",
    "brushy one string", "pink floyd","pink floyd","pink floyd", "pink floyd", "pink floyd", "pink floyd",
    "pink floyd", "pink floyd","lynyrd skynyrd", "lynyrd skynyrd", "lynyrd skynyrd", "deep purple", "deep purple",
    "deep purple", "deep purple", "agnes obel", "agnes obel", "jorja smith", "bob marley", "bob marley", "bob marley",
    "bob marley", "bob marley", "marvin gaye", "stevie wonder", "stevie wonder","stevie wonder", "stevie wonder",
    "stevie wonder", "jamiroquai", "billie holiday", "Ella Fitzgerald", "coldplay",
    "coldplay","david guetta", "REM", "phil collins", "tina turner", "Zombies",
    "procol harum", "dire straits" ,"dire straits", "dire straits", "dire straits", "dire straits",
    "melody gardot", "melody gardot", "melody gardot", "melody gardot"]

# Fail right here, next to the data, if an edit ever puts the two lists out of step.
assert len(neutral_titles_to_scrape) == len(neutral_artists_to_scrape), \
    "neutral titles/artists lists must have the same length"

In [15]:
# Scrape the lyrics of the neutral playlist.
# The list of songs that could not be found is kept under its own name: unpacking into
# `_` would throw it away and, in IPython, also stops `_` from tracking the last output.
neutral_lyrics, neutral_lyrics_not_found = scrape_lyrics(neutral_artists_to_scrape, neutral_titles_to_scrape)

100%|████████████████████████████████████████████████████████████████████████████████| 201/201 [00:55<00:00,  3.60it/s]


In [32]:
# Assemble the labelled training set and write it to CSV.
# Labels: 1 = positive/feminist playlist, -1 = sexist playlist, 0 = neutral playlist.

lyrics = [
    text.replace('\r', '')  # carriage returns create problems in the csv
    for text in positive_lyrics + sexist_lyrics + neutral_lyrics
]
label_lyrics = (
    [1] * len(positive_lyrics)
    + [-1] * len(sexist_lyrics)
    + [0] * len(neutral_lyrics)
)

lyrics_dataframe = pd.DataFrame({'Lyrics': lyrics, 'labels': label_lyrics})
# Songs whose scraping failed carry an empty string; keep only rows with actual lyrics.
lyrics_dataframe = lyrics_dataframe[lyrics_dataframe['Lyrics'] != '']
lyrics_dataframe.to_csv(data_dir + '/train_lyrics.csv')

In [21]:
# Reload the merged MSD table from disk, using the track_id column as the index.

year_artist_name_title_genre = pd.read_csv(
    data_dir + "/year_artist_name_title_genre.csv",
    index_col='track_id',
)

In [22]:
# Preview only the first rows; rendering the whole table bloats the saved notebook.
year_artist_name_title_genre.head(10)

Unnamed: 0_level_0,year,artist_name,title,genre
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TRSGHLU128F421DF83,1922,alberta hunter,don't pan me,Blues
TRRAHXQ128F42511FF,1922,barrington levy,looking my love,Reggae
TRFAFTK12903CC77B8,1922,barrington levy,warm and sunny day,Reggae
TRTRKSF12903CFEDD7,1924,vernon dalhart,wreck of the old 97,Country
TRQYSYF128F935F350,1925,bessie smith,careless love blues,Blues
TRZKAOZ128F4280C36,1925,papa charlie jackson,all i want is a spoonful,Blues
TRIOMVJ128F9309AD1,1925,papa charlie jackson,maxwell street blues,Blues
TRUEUUK128F92FD462,1925,papa charlie jackson,shake that thing,Blues
TRLRJYC128F930CB57,1926,"bertha ""chippie"" hill",trouble in mind,Blues
TRCAMMU128F429F53E,1926,blind blake,west coast blues,Blues


In [None]:
 '''
 Keep only the songs whose title is detected as English.
 The detect function is imperfect: it keeps some non-English titles and
 it drops some English titles.
 '''

def _title_is_english_or_undetectable(title):
    '''Return False only when langdetect positively reports a non-English language.'''
    try:
        return detect(title) == 'en'
    except Exception:
        # Same best-effort policy as before: a title langdetect cannot handle
        # (for instance a missing value) is kept rather than dropped.
        return True


# langdetect is randomised; a fixed seed makes the filter give the same result on every run.
DetectorFactory.seed = 0

# Classify every title once and filter in a single pass. Dropping rows one at a time
# inside iterrows() copied the whole frame for every removed row.
english_title_mask = year_artist_name_title_genre['title'].apply(_title_is_english_or_undetectable)
year_artist_name_title_genre = year_artist_name_title_genre[english_title_mask]

In [None]:
# Flatten the filtered MSD table into parallel lists for the scraper.
artists_to_scrape = list(year_artist_name_title_genre['artist_name'])
titles_to_scrape = list(year_artist_name_title_genre['title'])
# The frame is indexed by track_id, so the ids are read from the index.
# list(frame.reset_index()) returned the column names, not the ids.
track_ids_to_scrape = list(year_artist_name_title_genre.index)


# Scrape the whole MSD subset. The list of misses is kept under its own name:
# unpacking into `_` would discard it and clobber IPython's last-output variable.
lyrics, lyrics_not_found = scrape_lyrics(artists_to_scrape, titles_to_scrape)

# track_id is stored too, so that each lyric can be joined back to the MSD metadata.
final_lyrics_dataframe = pd.DataFrame({'Artists': artists_to_scrape,
                                       'Titles': titles_to_scrape,
                                       'Lyrics': lyrics,
                                       'track_id': track_ids_to_scrape})
final_lyrics_dataframe.to_csv(data_dir +'/msd_lyrics.csv')