## Data collection and preprocessing



In [None]:
%matplotlib inline

# Standard imports
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os, sys

# Silence warnings (note: the filter below suppresses every warning, not only pandas ones)
import warnings
import requests
import json
warnings.filterwarnings("ignore")

### Extract data from the Million Song Subset, which is about 1.8 GB in size

In [None]:


# Flatten the dataset: move every file found under the nested sub-directories
# of the Million Song Subset up into the top-level dataset directory.
dir_tree = '../MillionSongSubset/'

for dir_path, dir_names, file_names in os.walk(dir_tree):
    for file_name in file_names:
        source = os.path.join(dir_path, file_name)
        target = os.path.join(dir_tree, file_name)
        try:
            os.rename(source, target)
        except OSError:
            # The path must be built with os.path.join; ``os.join`` does not
            # exist, so the original handler itself raised AttributeError.
            print("Could not move %s " % source)

### Build an artist table with file, artist, and title columns

In [None]:
def make_artist_table(base):
    """Build a table of file name, artist and title for the song files in *base*.

    Parameters
    ----------
    base : str
        Directory that directly contains the ``.h5`` song files. Files with
        any other extension are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per ``.h5`` file with the columns ``file`` (base name of the
        file), ``artist`` and ``title`` (both decoded from UTF-8 bytes), in
        that column order. The frame is empty when no ``.h5`` file is found.
    """
    # Get file names
    files = [os.path.join(base, fn) for fn in os.listdir(base) if fn.endswith('.h5')]
    data = {'file': [], 'artist': [], 'title': []}

    # Add artist and title data to dictionary
    for f in files:
        # Read-only access is enough here, and the context manager closes the
        # store even when one of the reads below raises.
        with pd.HDFStore(f, mode='r') as store:
            songs = store.root.metadata.songs.cols
            title = songs.title[0].decode("utf-8")
            artist = songs.artist_name[0].decode("utf-8")
        # Append only after every value was read, so the three lists can
        # never get out of step with each other.
        data['file'].append(os.path.basename(f))
        data['title'].append(title)
        data['artist'].append(artist)

    # Convert dictionary to pandas DataFrame with a fixed column order
    return pd.DataFrame(data, columns=['file', 'artist', 'title'])

In [None]:
# Directory holding the .h5 song files that make_artist_table() scans.
base = '../MillionSongSubset/'
# One row per song file with the columns file, artist and title.
df = make_artist_table(base)

# Show the last rows as a quick sanity check of the table.
df.tail()

### Add the lyrics column

In [None]:
# Every song starts out with an empty lyrics string; the scalar is broadcast
# to all rows of the table.
df['lyrics'] = ''
df.tail()

### Install the PyLyrics package, which is used to download lyrics from the website

In [None]:
#!pip install PyLyrics

In [None]:
# The star import brings the name ``PyLyrics`` used below into scope.
from PyLyrics import *
# Smoke test: fetch the lyrics of one known song (requires network access).
print(PyLyrics.getLyrics('justin bieber','Sorry')) # Print the lyrics directly

In [None]:
#!pip install pyprind

In [None]:
import pyprind

### Download the lyrics of each song using its artist and track name

In [None]:
# Fetch the lyrics of every song in the table and store them in the
# ``lyrics`` column; songs whose lookup fails keep their empty string.
i = 0  # number of songs for which lyrics were stored
print(df.shape[0])
# The bar must be bound to the name that is updated below. The original
# created ``tpbar`` but called ``pbar.update()``, and the resulting NameError
# was swallowed by the bare ``except``, so the bar never advanced.
pbar = pyprind.ProgBar(df.shape[0])
for row_id in df.index:
    try:
        lyr = PyLyrics.getLyrics(df.loc[row_id, 'artist'], df.loc[row_id, 'title'])
    except Exception:
        # Best effort: skip songs for which the lookup fails (for example
        # when no lyrics are available). Unlike a bare ``except`` this still
        # lets KeyboardInterrupt stop the long-running download.
        continue
    finally:
        # Advance the bar for every song, whether the lookup worked or not.
        pbar.update()
    df.loc[row_id, 'lyrics'] = lyr
    i += 1
    print(i,end="")


In [None]:
# Count the rows whose lyrics column is no longer the empty placeholder.
print('downloaded Lyrics for %s songs' %sum(df.lyrics!=''))
df.head()

In [None]:
# Persist the table (index included) so the download does not have to be repeated.
df.to_csv('df_lyr_backup.csv')


### Drop rows that have no lyrics

In [None]:
# Reload the lyrics backup.
# * index_col=0 restores the index that to_csv wrote as the first column,
#   instead of adding it as a spurious "Unnamed: 0" data column.
# * keep_default_na=False keeps empty lyrics as '' rather than NaN, so the
#   later comparison against '' still finds them. It also keeps strings such
#   as "NA" or "null" as text instead of turning them into NaN.
df = pd.read_csv('df_lyr_backup.csv', index_col=0, keep_default_na=False)

print(df.shape[0])
# Only the last expression of a notebook cell is displayed, so the preview
# comes after the print.
df.head()

In [None]:

# Keep only songs that actually have lyrics. After a CSV round trip pandas
# reads empty cells back as NaN, and NaN != '' is True, so both forms of
# "no lyrics" have to be excluded explicitly.
df = df[df.lyrics.notna() & (df.lyrics != '')]
print(df.shape[0])

### Remove songs that are not in English

In [None]:
import nltk
#nltk.download('words')
# Lower-cased NLTK word list, built lazily on first use and then reused.
_ENGLISH_VOCAB = None


def _english_vocab():
    """Return the lower-cased NLTK ``words`` corpus as a set, built only once."""
    global _ENGLISH_VOCAB
    if _ENGLISH_VOCAB is None:
        _ENGLISH_VOCAB = frozenset(w.lower() for w in nltk.corpus.words.words())
    return _ENGLISH_VOCAB


def eng_ratio(text, english_vocab=None):
    """Return the share of distinct alphabetic words in *text* that are not English.

    The text is split on whitespace, every token is lower-cased, and only
    purely alphabetic tokens are kept. The result is the number of distinct
    kept tokens missing from the English vocabulary divided by the number of
    distinct kept tokens, i.e. a value between 0.0 and 1.0.

    Parameters
    ----------
    text : str
        Text to analyse, e.g. the lyrics of one song.
    english_vocab : collection of str, optional
        Lower-cased reference vocabulary. Defaults to the NLTK ``words``
        corpus, which is loaded once and cached across calls.

    Returns
    -------
    float
        The non-English ratio. ``1.0`` is returned when *text* is not a
        string (for example NaN from a missing value) or contains no
        alphabetic word, because such input cannot be recognised as English.
    """
    if not isinstance(text, str):
        return 1.0
    if english_vocab is None:
        english_vocab = _english_vocab()

    lowered = (w.lower() for w in text.split())
    text_vocab = {w for w in lowered if w.isalpha()}
    if not text_vocab:
        # Avoid dividing by zero for empty or purely non-alphabetic text.
        return 1.0

    unusual = text_vocab.difference(english_vocab)
    return len(unusual) / len(text_vocab)

In [None]:
# Drop every song whose lyrics are at least half non-English words.
before = df.shape[0]
# Score each song once and filter in a single pass; re-slicing the frame for
# every dropped row copied the whole table each time.
non_english_ratio = df['lyrics'].map(eng_ratio)
df = df[non_english_ratio < 0.5]
after = df.shape[0]
rem = before - after
print('%s have been removed.' %rem)
print('%s songs remain in the dataset.' %after)
df.to_csv('df_lyr_xyz.csv')

### So far we have the songs with their lyrics, but each song still needs a mood label. Here I download the tags from Last.fm and classify each song as having a happy mood or a sad mood

In [None]:
# Reload the English-only songs. DataFrame.from_csv has been removed from
# pandas; read_csv with index_col=0 restores the index written by to_csv.
df = pd.read_csv('df_lyr_xyz.csv', index_col=0)

print(df.shape[0])
# Only the last expression of a notebook cell is displayed, so the preview
# comes after the print.
df.head()

In [None]:
def getSongTags(artist, track, api_key="0f6916aff634cb3e768baa9d5ee89341", timeout=30):
    """Return the names of the Last.fm top tags of one track.

    Parameters
    ----------
    artist : str
        Artist name as stored in the song table.
    track : str
        Track title as stored in the song table.
    api_key : str, optional
        Last.fm API key sent with the request.
        NOTE(review): the default key is hard-coded in the notebook; consider
        loading it from configuration instead of committing it.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises, so one
        stalled connection cannot hang the whole download loop.

    Returns
    -------
    list of str
        The ``name`` of every entry under ``toptags -> tag`` in the JSON
        response, or an empty list when the response has no such entries.
    """
    # Let requests build the query string so that characters such as '&',
    # '#', '+' or spaces in artist and track names are URL-encoded instead of
    # corrupting the query.
    params = {
        "method": "track.getTopTags",
        "api_key": api_key,
        "artist": artist,
        "track": track,
        "format": "json",
    }
    response = requests.get("http://ws.audioscrobbler.com/2.0/", params=params, timeout=timeout)
    results = response.json()

    tag_items = results.get('toptags', {}).get('tag', [])
    return [tag_item['name'] for tag_item in tag_items]

In [None]:
# Download the Last.fm tags of every song into the ``tags`` column.
df['tags'] = ''
pbar = pyprind.ProgBar(df.shape[0])
i=0
for row_id in df.index:
    print(i, end=" ")
    i+=1
    tags = getSongTags(df.loc[row_id, 'artist'], df.loc[row_id, 'title'])
    # Store the list in its string form (e.g. "['sad', 'rock']"). Assigning a
    # list to a single cell through .loc is treated as setting several values,
    # and the later cells parse exactly this string form, which is also what a
    # CSV round trip produces.
    df.at[row_id, 'tags'] = str(tags)
    pbar.update()
    


In [None]:
# Smoke test: fetch the Last.fm top tags of one track (requires network access).
getSongTags("The Weeknd","Call Out My Name")

In [None]:
# Inspect the tags stored for the row labelled 1.
df.loc[1,'tags']
# df.head()

In [None]:
# Drop the songs without any Last.fm tag. Comparing the string form with
# '[]' works both for an in-memory empty list and for the text read back
# from CSV. The original ``len(...) == 2`` test would also have removed songs
# holding a real list of exactly two tags, and dropping rows one at a time
# copied the frame on every removal.
df = df[df['tags'].astype(str) != '[]']


In [None]:
# Number of songs left after removing the untagged ones.
df.shape[0]

In [None]:
# Look up the release year of every song in its .h5 file and store it in
# the ``year`` column.
df['year'] = pd.Series('', index=df.index)

base = '../MillionSongSubset/'
files = [os.path.join(base,fn) for fn in os.listdir(base) if fn.endswith('.h5')]
for row_id in df.index:
    filepath = os.path.join(base, df.loc[row_id, 'file'])
    # Read-only access is enough, and the context manager closes each store;
    # the original left one open file handle per song.
    with pd.HDFStore(filepath, mode='r') as store:
        year = store.root.musicbrainz.songs.cols.year[0]
    print(year)
    # Write through a single indexer. The chained form
    # ``df.loc[row_id]['year'] = year`` assigns to a temporary row object and
    # leaves the frame unchanged.
    df.at[row_id, 'year'] = year

<img src="image.png">

In [None]:
# Save the table, including the year column and the index, to disk.
df.to_csv("dataset_year.csv")

In [None]:
# Display the year column to check the values that were stored.
df['year']

## Happy or Sad
### Group id	Tags	num. of tags	num. of songs
#### sad tags:

G15	sad, sadness, unhappy, melancholic, melancholy, feeling sad, mood: sad - slightly, sad song	8	1,178

G16	depressed, blue, dark, depressive, dreary, gloom, darkness, depress, depression, depressing, gloomy	11	471

G28	anger, angry, choleric, fury, outraged, rage, angry music	7	254

G17	grief, heartbreak, mournful, sorrow, sorry, doleful, heartache, heartbreaking, heartsick, lachrymose, mourning, plaintive, regret, sorrowful	14	183

#### happy tags:
G6	cheerful, cheer up, festive, jolly, jovial, merry, cheer, cheering, cheery, get happy, rejoice, songs that are cheerful, sunny	13	142

G5	happy, happiness, happy songs, happy music, glad, mood: happy	6	749

G2	upbeat, gleeful, high spirits, zest, enthusiastic, buoyancy, elation, mood: upbeat	8	543

G1	excitement, exciting, exhilarating, thrill, ardor, stimulating, thrilling, titillating	8	30
TOTAL		135	6,490

### This tag summary comes from the Last.fm website, where the tags were grouped into different categories. Here, I choose groups 15, 16, 28, and 17 as sad tags and groups 5, 6, 2, and 1 as happy tags

In [None]:
# Mood vocabularies, one string per tag group. Spaces are stripped from every
# tag ("cheer up" becomes "cheerup") so the tags can be matched against the
# space-stripped song tags later on.
_HAPPY_TAG_GROUPS = (
    # G6
    "cheerful, cheer up, festive, jolly, jovial, merry, cheer, cheering, "
    "cheery, get happy, rejoice, songs that are cheerful, sunny",
    # G5
    "happy, happiness, happy songs, happy music, glad, mood: happy",
    # G2
    "upbeat, gleeful, high spirits, zest, enthusiastic, buoyancy, elation, mood: upbeat",
    # G1
    "excitement, exciting, exhilarating, thrill, ardor, stimulating, thrilling, titillating",
)
happyTags = [
    tag.replace(" ", "")
    for group in _HAPPY_TAG_GROUPS
    for tag in group.split(",")
]

_SAD_TAG_GROUPS = (
    # G15
    "sad, sadness, unhappy, melancholic, melancholy, feeling sad, mood: sad - slightly, sad song",
    # G16
    "depressed, blue, dark, depressive, dreary, gloom, darkness, depress, "
    "depression, depressing, gloomy",
    # G28
    "anger, angry, choleric, fury, outraged, rage, angry music",
    # G17
    "grief, heartbreak, mournful, sorrow, sorry, doleful, heartache, heartbreaking, "
    "heartsick, lachrymose, mourning, plaintive, regret, sorrowful",
)
sagTags = [
    tag.replace(" ", "")
    for group in _SAD_TAG_GROUPS
    for tag in group.split(",")
]

In [None]:
# Display the happy tag vocabulary (spaces already stripped from each tag).
happyTags

In [None]:
# Display the sad tag vocabulary (spaces already stripped from each tag).
sagTags

### Based on the number of tags a song has from the sad group or the happy group, we can assign a mood value of 1 (happy) or 0 (sad) to the mood column

In [None]:
# Example: turn the string form of one tag list (row labelled 1) back into a
# list of tags by stripping brackets, quotes and spaces and splitting on commas.
abcd=df.loc[1,'tags'].replace("[","").replace("]","").replace("'","").replace(" ","").split(",")

In [None]:
# Show the parsed tags of that row.
print(abcd)

In [None]:
# Label every song as happy (1) or sad (0) from its Last.fm tags and remove
# the songs that carry no mood tag at all.
df['mood']=""
pbar = pyprind.ProgBar(df.shape[0])
# Deletes the characters [ ] ' and space in one pass; this is equivalent to
# the chain of four .replace() calls used to parse the tag string.
strip_chars = str.maketrans("", "", "[]' ")
sad_vocab = set(sagTags)
happy_vocab = set(happyTags)
untagged = []  # index labels of songs without any mood tag
for row_id in df.index:
    # str() makes the parsing work for an in-memory list as well as for the
    # text read back from CSV, and keeps a missing value from crashing it.
    tags = set(str(df.loc[row_id, 'tags']).translate(strip_chars).split(","))
    n_sad = len(tags & sad_vocab)
    n_happy = len(tags & happy_vocab)
    # NOTE(review): matching is case-sensitive, as before; confirm whether
    # differently capitalised Last.fm tags should count as well.
    if n_sad or n_happy:  # having mood tag
        # Ties count as happy, exactly as in the original comparison.
        df.loc[row_id, 'mood'] = 0 if n_sad > n_happy else 1
    else:
        untagged.append(row_id)
    pbar.update()
# Remove the songs without a mood tag in one step after the loop, instead of
# copying the frame for every dropped row while iterating over its index.
df = df.drop(untagged)

In [None]:
# Save the mood-labelled table (index included).
df.to_csv("mood.csv")

In [None]:
# Number of songs that received a mood label.
df.shape[0]

In [None]:
# Display the mood column (1 = happy, 0 = sad).
df['mood']

In [None]:
# Display the whole table.
df

In [None]:
# Look up the release year of every remaining song in its .h5 file and
# store it in the ``year`` column.
df['year'] = pd.Series('', index=df.index)

base = '../MillionSongSubset/'
files = [os.path.join(base,fn) for fn in os.listdir(base) if fn.endswith('.h5')]
for row_id in df.index:
    filepath = os.path.join(base, df.loc[row_id, 'file'])
    # Read-only access is enough, and the context manager closes each store;
    # the original left one open file handle per song.
    with pd.HDFStore(filepath, mode='r') as store:
        year = store.root.musicbrainz.songs.cols.year[0]
    # Write through a single indexer. The chained form
    # ``df.loc[row_id]['year'] = year`` assigns to a temporary row object and
    # leaves the frame unchanged.
    df.at[row_id, 'year'] = year

In [None]:
# Display the table after the year lookup.
df

In [None]:
# Write the dataset without the index column; at this point it still
# contains the tags column.
df.to_csv('dataset_lyrics.csv', index=False,encoding='utf-8')

In [None]:
# The raw tags are no longer needed once the mood column exists.
df = df.drop("tags",axis=1)

In [None]:
# Fraction of songs labelled happy (mood == 1). The denominator is taken
# from the table itself; the original divided by a hard-coded 166, which is
# only correct while the table has exactly that many rows.
(df['mood'] == 1).mean()


In [None]:
# Save the songs with a mood label to a CSV file for training and keyword
# extraction. This overwrites the 'dataset_lyrics.csv' written earlier, now
# without the tags column.
df.to_csv('dataset_lyrics.csv', index=False,encoding='utf-8')

In [None]:
# NOTE(review): 'Dataset_tags.csv' is not written by any cell shown in this
# notebook; confirm where this file comes from.
df = pd.read_csv('Dataset_tags.csv')

In [None]:
# NOTE(review): 'temp.csv' is not written by any cell shown in this notebook;
# this replaces df with the contents of that file.
df = pd.read_csv('temp.csv')