In [1]:
import os, sys

dir_tree = '../Data/millionsongsubset_full/MillionSongSubset/'

for dir_path, dir_names, file_names in os.walk(dir_tree):
    for file_name in file_names:
        try:
            os.rename(os.path.join(dir_path, file_name), os.path.join(dir_tree, file_name))
        except OSError:
            print ("Could not move %s " % os.join(dir_path, file_name))

In [2]:
import os
import pandas as pd

def make_artist_table(base):

# Get file names

    files = [os.path.join(base,fn) for fn in os.listdir(base) if fn.endswith('.h5')]
    data = {'file':[], 'artist':[], 'title':[], 'year':[]}

    # Add artist and title data to dictionary
    for f in files:
        store = pd.HDFStore(f)
        #print(store.keys())
        year=store.root.musicbrainz.songs.cols.year[0]
        title = store.root.metadata.songs.cols.title[0]
        artist = store.root.metadata.songs.cols.artist_name[0]
        
        data['file'].append(os.path.basename(f))
        data['title'].append(title.decode("utf-8"))
        data['artist'].append(artist.decode("utf-8"))
        data['year'].append(year)
        store.close()
    
    # Convert dictionary to pandas DataFrame
    df = pd.DataFrame.from_dict(data, orient='columns')
    df = df[['file', 'artist', 'title','year']]
    return df

In [3]:
base = '../Data/millionsongsubset_full/MillionSongSubset/'
df = make_artist_table(base)
df.tail()

Unnamed: 0,file,artist,title,year
9996,TRBIJMU12903CF892B.h5,Moonspell,The Hanged Man,1998
9997,TRBIJNF128F14815A7.h5,Danny Williams,The Wonderful World Of The Young,1998
9998,TRBIJNK128F93093EC.h5,Winston Reedy,Sentimental Man,0
9999,TRBIJRN128F425F3DD.h5,"Myrick ""Freeze"" Guillory",Zydeco In D-Minor,0
10000,TRBIJYB128F14AE326.h5,Seventh Day Slumber,Shattered Life,2005


#### Now we add a new column for the lyrics to our DataFrame.

In [4]:
df['lyrics'] = pd.Series('', index=df.index)
df.tail()

Unnamed: 0,file,artist,title,year,lyrics
9996,TRBIJMU12903CF892B.h5,Moonspell,The Hanged Man,1998,
9997,TRBIJNF128F14815A7.h5,Danny Williams,The Wonderful World Of The Young,1998,
9998,TRBIJNK128F93093EC.h5,Winston Reedy,Sentimental Man,0,
9999,TRBIJRN128F425F3DD.h5,"Myrick ""Freeze"" Guillory",Zydeco In D-Minor,0,
10000,TRBIJYB128F14AE326.h5,Seventh Day Slumber,Shattered Life,2005,


 #### Now we webscrap the song lyrics from SongLyrics.com or LyricsMode.com based on the artist and title names in the pandas DataFrame.

In [5]:
import urllib.parse
import urllib, re
import urllib.request
import urllib3
from bs4 import BeautifulSoup
import requests
          
def songlyrics(artist, title):
    artist = urllib.parse.quote(artist.lower().replace(' ','-'))
    title = urllib.parse.quote(title.lower().replace(' ','-'))
    
    try:
        http = urllib3.PoolManager()
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'}
        url = 'http://www.songlyrics.com/%s/%s-lyrics/' % (artist,title)
        response = http.request('GET', url,headers)
    except:
        return 'Cannot connect to songlyrics.com.'
    
    soup = BeautifulSoup(response.data,"lxml")    
    
    lyrics = soup.findAll(attrs= {'id' : 'songLyricsDiv'})
    if not lyrics:
        return 'Lyrics not found.'
    else:
        if str(lyrics[0]).startswith("<p class='songLyricsV14 iComment-text' id='songLyricsDiv'></p>"):

            return 'Lyrics not found.'
        try:
            return re.sub('<[^<]+?>', '', ''.join(str(lyrics[0])))
        except:
            return 'Cannot parse the lyrics.'
        
        
def lyricsmode(artist, title):
    artist = urllib.parse.quote(artist.lower().replace(' ','-'))
    title = urllib.parse.quote(title.lower().replace(' ','-'))
    
    try:
        http = urllib3.PoolManager()
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'}
        url = 'http://www.lyricsmode.com/lyrics/%s/%s/%s.html' % (artist[0],artist, title)
        print(url)
        response = http.request('GET', url,headers)
        
    except:
        return 'Cannot connect to lyricsmode.com.'
    #lyricsmode places the lyrics in a span with an id of "lyrics"

    soup = BeautifulSoup(response.data,"lxml")    
    lyrics = soup.findAll(attrs= {'id' : 'lyrics_text'})
    if not lyrics:
        return 'Lyrics not found.'
    try:
        return re.sub('<[^<]+?>', '', ''.join(str(lyrics[0])))
    except:
        return 'Cannot parse the lyrics.'
    
    
def get_lyrics(artist, title):
    lyr = songlyrics(artist, title)
    if not lyr:
        lyr = lyricsmode(artist, title)
    return lyr

In [6]:
import pyprind
pbar = pyprind.ProgBar(df.shape[0])
for row_id in df.index:
    artist,title = df.loc[row_id]['artist'], df.loc[row_id]['title']
    lyr = songlyrics(artist,title)
    df.loc[row_id,'lyrics'] = lyr
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:52:44


In [7]:
print('downloaded Lyrics for %s songs' %sum(df.lyrics!=''))
df.head()

downloaded Lyrics for 10001 songs


Unnamed: 0,file,artist,title,year,lyrics
0,subset_msd_summary_file.h5,Mastodon,Deep Sea Creature,2001,"Knowing right, learning wrong\nWhat you're fee..."
1,TRAAAAW128F429D538.h5,Casual,I Didn't Mean To,0,Alright I might\nHave had a little glare when ...
2,TRAAABD128F429CF47.h5,The Box Tops,Soul Deep,1969,Darlin' I don't know much.\r\nI know I love yo...
3,TRAAADZ128F9348C2E.h5,Sonora Santanera,Amor De Cabaret,0,"Sorry, we have no Sonora santanera - Amor de c..."
4,TRAAAEF128F4273421.h5,Adam Ant,Something Girls,1982,Every girl is a something girl\nEvery girl is ...


#### WE now remove the titles from the data for which we could not find the lyrics

In [8]:
df=df[~df['lyrics'].str.contains("Sorry, we have no")] 
df=df[~df['lyrics'].str.contains("Lyrics not found.")]
df=df[~df['lyrics'].str.contains("We do not have the lyrics for")]
df=df[~df['lyrics'].str.contains("Cannot connect to lyricsmode.com.")]
df=df[~df['lyrics'].str.contains("Cannot connect to songlyrics.com.")]
df=df[~df['lyrics'].str.match("[Instrumental]")]
df=df[~df['lyrics'].str.match("(Instrumental)")]

In [9]:
df.to_csv('titles_with_lyrics.csv',index=False)

### Language Filter

Now, we remove all lyrics that are not in English. Basically, we say that if the song contains more English than non-English words (> 50%), then it is an English song. We use this relatively high cutoff-ratio of 0.5, since a songtext likely contains also names and other special words that are not part of a common English dictionary.

In [10]:
import nltk
#nltk.download()

def eng_ratio(text):
    ''' Returns the ratio of non-English to English words from a text '''
    diff=0
    english_vocab = set(w.lower() for w in nltk.corpus.words.words()) 
    text_vocab = set(w.lower() for w in text.split() if w.lower().isalpha())
    unusual = text_vocab.difference(english_vocab)
    if len(text_vocab)!=0:
        diff = len(unusual)/len(text_vocab)
    return diff    

### Remove all non-English lyrics

In [12]:
before = df.shape[0]
for row_id in df.index:
    text = df.loc[row_id]['lyrics']
    diff = eng_ratio(text)
    if diff >= 0.5:
        df = df[df.index != row_id]
after = df.shape[0]
rem = before - after
print('%s have been removed.' %rem)
print('%s songs remain in the dataset.' %after)

89 have been removed.
2023 songs remain in the dataset.


#### Backing up the Titles with English Lyrics

In [14]:
df.to_csv('df_lyrics_backup.csv', index=False)

#### Out of the 2024 songs with english lyrics we chose 650 songs which had the 'year' of release mentioned with them. Also,we manually labelled the mood for them as 'Happy' or 'Sad' based on the lyrics. This data is captured in Data.csv file. Further we randomly divide the data set into training and validation data sets for building our classification models. 

#### Training Dataset (550 songs): training.csv
#### Validation Dataset (100 songs): validation.csv

In [18]:
from random import shuffle
data=pd.read_csv('Data.csv')
indices= data.index.values.tolist()
shuffle(indices)
training=data.loc[indices[:550]]
validation=data.loc[indices[550:650]]
training.to_csv('training.csv',index=False)
validation.to_csv('validation.csv', index=False)