# Lyrics for billboard/MSD/extra songs

In [235]:
import requests
import re
import urllib
from bs4 import BeautifulSoup
import numpy as np
import pickle
import os
import pandas as pd
import time

def chunk_50(series): # requesting track info needs smaller size of requests
    for i in xrange(0,len(series),50):
        yield series[i:(i+50)]

def get_spotify_names(spotifyIDs):
    '''
    in: [ids,...]
    out: [(title,artist),...]
    '''
    names = []
    
    for chunked_spotifyIDs in chunk_50(spotifyIDs):
        url_head = 'https://api.spotify.com/v1/tracks/?ids='
        detail = ','.join(chunked_spotifyIDs)
        url = url_head+detail
        r = requests.get(url)
        track_lists = r.json()['tracks']

        for track in track_lists:
            title = track['name'].lower()
            artist = [i['name'].lower() for i in track['artists']] # only get first artist for now
            names.append((title,artist))
    return names

def get_lyric(title,artist):
    url_head = u'http://lyrics.wikia.com/wiki/'
    url = url_head+urllib.quote(artist.encode('utf-8'))+':'+urllib.quote(title.encode('utf-8')) #encoding in utf-8 to avoid quote() throwing key error. eg.chanté moore
    r = requests.get(url)
    soup = BeautifulSoup(r.text,'html.parser')
    lyricbox = soup.find('div',class_ = 'lyricbox')
    lyric = re.sub('<.?div.*?>','',str(lyricbox))
    lyric = re.sub('<br/>','.',lyric)
    return lyric


In [2]:
def find_lyric_multiple_artists(ids,spotify_names,lyric_dict):
    leftovers = []
    for i,(title,artists) in enumerate(spotify_names):
        if i % 100 == 0:
            print i
        uri = ids[i]
        j= 0
        while j <= len(artists)-1:
            artist = artists[j]
            lyric = get_lyric(title,artist)
            if lyric != 'None' and uri not in lyric_dict: # if found lyric add that to the lyrics list
                lyric_dict[uri] = lyric
                break
            j += 1
        # if not found lyric for this title,artists
        if lyric == 'None':
            leftovers.append(uri)
    return leftovers

In [3]:
def get_lyric2(title,artist): 
    # also look for redirect message
    url_head = u'http://lyrics.wikia.com/wiki/'
    url = url_head+urllib.quote(artist.encode('utf-8'))+':'+urllib.quote(title.encode('utf-8')) #encoding in utf-8 to avoid quote() throwing key error. eg.chanté moore
    r = requests.get(url)
    soup = BeautifulSoup(r.text,'html.parser')
    lyricbox = soup.find('div',class_ = 'lyricbox')
    
    if lyricbox: # if url has lyricbox
        lyric = re.sub('<.?div.*?>','',str(lyricbox))
        lyric = re.sub('<br/>','.',lyric)
        return lyric
    elif soup.find('div',class_ = 'redirectMsg'): # if url is redirect message page
        href = soup.find('div',class_ = 'redirectMsg').find('a')['href']
        url = u'http://lyrics.wikia.com/'+href
        r = requests.get(url)
        soup = BeautifulSoup(r.text,'html.parser')
        lyricbox = soup.find('div',class_ = 'lyricbox')
        lyric = re.sub('<.?div.*?>','',str(lyricbox))
        lyric = re.sub('<br/>','.',lyric)
        return lyric
    else:
        return 'None'
def find_lyric_multiple_artists2(ids,spotify_names,lyric_dict):
    leftovers = []
    for i,(title,artists) in enumerate(spotify_names):
        if i % 100 == 0:
            print i
        uri = ids[i]
        j= 0
        while j <= len(artists)-1:
            artist = artists[j]
            lyric = get_lyric2(title,artist)
            if lyric != 'None' and uri not in lyric_dict: # if found lyric add that to the lyrics list
                lyric_dict[uri] = lyric
                break
            j += 1
        # if not found lyric for this title,artists
        if lyric == 'None':
            leftovers.append(uri)
    return leftovers

In [4]:
def get_lyric3(title,artist): # href to url is changed, since some href has /wiki/... instead of wiki/...
    url_head = u'http://lyrics.wikia.com/wiki/'
    url = url_head+urllib.quote(artist.encode('utf-8'))+':'+urllib.quote(title.encode('utf-8')) #encoding in utf-8 to avoid quote() throwing key error. eg.chanté moore
    r = requests.get(url)
    soup = BeautifulSoup(r.text,'html.parser')
    lyricbox = soup.find('div',class_ = 'lyricbox')
    
    if lyricbox: # if url has lyricbox
        lyric = re.sub('<.?div.*?>','',str(lyricbox))
        lyric = re.sub('<br/>','.',lyric)
        return lyric
    elif soup.find('div',class_ = 'redirectMsg'): # if url is redirect message page
        href = soup.find('div',class_ = 'redirectMsg').find('a')['href']
        url = u'http://lyrics.wikia.com'+href
        r = requests.get(url)
        soup = BeautifulSoup(r.text,'html.parser')
        lyricbox = soup.find('div',class_ = 'lyricbox')
        lyric = re.sub('<.?div.*?>','',str(lyricbox))
        lyric = re.sub('<br/>','.',lyric)
        return lyric
    else:
        return 'None'

def find_lyric_multiple_artists3(ids,spotify_names,lyric_dict):
    leftovers = []
    for i,(title,artists) in enumerate(spotify_names):
        if i % 100 == 0:
            print i
        uri = ids[i]
        j= 0
        while j <= len(artists)-1:
            artist = artists[j]
            lyric = get_lyric3(title,artist)
            if lyric != 'None' and uri not in lyric_dict: # if found lyric add that to the lyrics list
                lyric_dict[uri] = lyric
                break
            j += 1
        # if not found lyric for this title,artists
        if lyric == 'None':
            leftovers.append(uri)
    return leftovers

Now just need to get spotify artist groups for both top10_spotify songs and MSD song ids

In [288]:
if not os.path.isfile('lyrics/All_names'):
    audio_data = pd.read_pickle('MSD_audio_features')
    spotifyIDs = audio_data.uri
    audio_data = pd.read_pickle('Spotify_audio_features')
    spotifyIDs = np.append(spotifyIDs,audio_data.uri)
    ids = map(lambda x:x.split(':')[-1],spotifyIDs)
    
    spotify_names = get_spotify_names(ids) # get title,artists info for all songs(spotify+MSD)
    spotifyNames = pd.DataFrame(spotify_names,columns = ['title','artists'])
    spotifyNames['uri'] = ids
    spotifyNames.to_pickle('lyrics/All_names')
else:
    spotifyNames = pd.read_pickle('lyrics/All_names')
    ids = spotifyNames['uri']
    spotify_names = zip(spotifyNames.title.values,spotifyNames.artists.values)

In [289]:
if not os.path.isfile('lyrics/lyrics.pickle'):
    lyrics = {}
    leftovers = find_lyric_multiply_artists(ids,spotify_names,lyrics)
    with open('lyrics/lyrics.pickle', 'wb') as handle:
        pickle.dump(lyrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
    leftover = pd.DataFrame(leftovers)
    leftover.to_pickle('lyrics/Left_over_songs(no_lyric)')
else:
    lyrics = pickle.load(open('lyrics/lyrics.pickle', 'r'))
    leftovers = pd.read_pickle('lyrics/Left_over_songs(no_lyric)')

In [290]:
print 'Covered {:{prec}} of songs'.format((len(ids)-len(leftovers))/float(len(ids)),prec='.2')

Covered 0.56 of songs


## We are barely covering 55% of the songs!
To get around that, I decided to get more songs from same artists that have produced hit songs on billboard. The benefit of this is that we can control the variation of different producing companies if we compare songs of same artist.

But first I need to get all lyrics for billboard songs...done

## Billboard song lyrics (covering 96% of 4264 songs)

In [292]:
top10 = pd.read_pickle('Billboard_data')
# get spotifyIDs for billboard songs
audio_data = pd.read_pickle('Spotify_audio_features')
spotifyIDs = map(lambda x:x.split(':')[-1],audio_data.uri.values)
# get ids for those who don't have lyrics
ids = [i for i in spotifyIDs if i not in lyrics]
spotify_names = get_spotify_names(ids) # get title,artists info for spotifyIDs
len(spotify_names) # 1111 songs to collect lyrics

458

In [294]:
if not os.path.isfile('lyrics/lyrics2.pickle'):
    # billboard songs lyrics collection, filter 1: re.sub(' -.*','',x)
    spotify_no_lyrics = pd.DataFrame(spotify_names,columns = ['title','artists'])
    spotify_no_lyrics['uri'] = ids
    spotify_no_lyrics.head(2)
    spotify_no_lyrics['trimmed_title'] = map(lambda x:re.sub(' -.*','',x),spotify_no_lyrics.title)
    names = zip(spotify_no_lyrics.trimmed_title.values,spotify_no_lyrics.artists)
    ids   = spotify_no_lyrics.uri
    leftovers = find_lyric_multiple_artists(ids,names,lyrics)
    # save lyrics after filter 1
    with open('lyrics/lyrics2.pickle', 'wb') as handle:
            pickle.dump(lyrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # collect leftover songs for filter 2
    leftover_names = get_spotify_names(leftovers)
    tmp = pd.DataFrame(leftover_names,columns = ['title','artists'])
    tmp['spotifyID'] = leftovers
    tmp.head()
    tmp.to_pickle('lyrics/spotify_leftover_filter1')
else:
    lyrics = pickle.load(open('lyrics/lyrics2.pickle', 'r'))
    tmp = pd.read_pickle('lyrics/spotify_leftover_filter1')
    no_lyrics = map(lambda x:x not in lyrics,tmp.spotifyID.values)
    tmp = tmp[no_lyrics] # 573 songs
    
    
# see how many we missed, how many we have lyrics for, and if the lyric is empty(if len(tmp)!=c)
c = 0
for uri in audio_data['uri']:
    ids = uri.split(':')[-1]
    if ids not in lyrics or (not lyrics[ids]):
        c += 1
print len(tmp),len(lyrics),c

if not os.path.isfile('lyrics/lyrics3.pickle'):
    # billboard songs lyrics collection, filter 2: use billboard data title names instead of spotify title names
    spotify_leftover2 = pd.merge(tmp,top10,how='inner',on=['spotifyID']).ix[:,:5]
    spotify_leftover2['title_y'] = map(lambda x:x.lower(),spotify_leftover2['title_y'].values)
    spotify_leftover2.drop(['date'],axis=1,inplace=True)
    spotify_leftover2.drop_duplicates(subset=['spotifyID'],keep='first',inplace=True)

    ids = spotify_leftover2.spotifyID.values
    names = zip(spotify_leftover2['title_y'].values,spotify_leftover2['artists'].values)
    
    leftovers = find_lyric_multiple_artists(ids,names,lyrics)
    
    with open('lyrics/lyrics3.pickle', 'wb') as handle:
        pickle.dump(lyrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # collect leftover songs for filter 2
    leftover_names = get_spotify_names(leftovers)
    tmp = pd.DataFrame(leftover_names,columns = ['title','artists'])
    tmp['spotifyID'] = leftovers
    tmp.head()
    tmp.to_pickle('lyrics/spotify_leftover_filter2')
else:
    lyrics = pickle.load(open('lyrics/lyrics3.pickle', 'r'))
    tmp = pd.read_pickle('lyrics/spotify_leftover_filter2')
    spotify_leftover2 = pd.merge(tmp,top10,how='inner',on=['spotifyID']).ix[:,:5]
    # 458 songs left
    
# see how many we missed, how many we have lyrics for, and if the lyric is empty(if len(tmp)!=c)
c = 0
for uri in audio_data['uri']:
    ids = uri.split(':')[-1]
    if ids not in lyrics or (not lyrics[ids]):
        c += 1
print len(tmp),len(lyrics),c

# billboard songs lyrics collection, filter 3: title.replace(' ','_'),artist.replace(' ','_')
if not os.path.isfile('lyrics/lyrics4.pickle'):
    # join table and remove duplicate rows
    spotify_leftover2 = pd.merge(tmp,top10,how='inner',on=['spotifyID']).ix[:,:5]
    spotify_leftover2['title_y'] = map(lambda x:x.lower().replace(' ','_'),spotify_leftover2['title_y'].values)
    spotify_leftover2.drop(['date'],axis=1,inplace=True)
    spotify_leftover2.drop_duplicates(subset=['spotifyID'],keep='first',inplace=True)

    ids = spotify_leftover2.spotifyID.values
    names = zip(spotify_leftover2['title_y'].values,spotify_leftover2['artists'].values)
    
    leftovers = find_lyric_multiple_artists2(ids,names,lyrics)
    
    with open('lyrics/lyrics4.pickle', 'wb') as handle:
        pickle.dump(lyrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # collect leftover songs for filter 3
    leftover_names = get_spotify_names(leftovers)
    tmp = pd.DataFrame(leftover_names,columns = ['title','artists'])
    tmp['spotifyID'] = leftovers
    
    tmp.to_pickle('lyrics/spotify_leftover_filter3')
else:
    lyrics = pickle.load(open('lyrics/lyrics4.pickle', 'r'))
    tmp = pd.read_pickle('lyrics/spotify_leftover_filter3')
    
c = 0
for uri in audio_data['uri']:
    ids = uri.split(':')[-1]
    if ids not in lyrics or (not lyrics[ids]):
        c += 1
print len(tmp),len(lyrics),c

if not os.path.isfile('lyrics/lyrics5.pickle'):
    # join table and remove duplicate rows
    spotify_leftover2 = pd.merge(tmp,top10,how='inner',on=['spotifyID']).ix[:,:6]
    spotify_leftover2['title_y'] = map(lambda x:x.lower().replace(' ','_'),spotify_leftover2['title_y'].values)
    spotify_leftover2.drop(['date'],axis=1,inplace=True)
    spotify_leftover2.drop_duplicates(subset=['spotifyID'],keep='first',inplace=True)

    ids = spotify_leftover2.spotifyID.values
    names = zip(spotify_leftover2['title_y'].values,spotify_leftover2['artists'].values)
    
    leftovers = find_lyric_multiple_artists3(ids,names,lyrics)
    
    with open('lyrics/lyrics5.pickle', 'wb') as handle:
        pickle.dump(lyrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    leftover_names = get_spotify_names(leftovers)
    tmp = pd.DataFrame(leftover_names,columns = ['title','artists'])
    tmp['spotifyID'] = leftovers
    
    tmp.to_pickle('lyrics/spotify_leftover_filter4')
else:
    lyrics = pickle.load(open('lyrics/lyrics5.pickle', 'r'))
    tmp = pd.read_pickle('lyrics/spotify_leftover_filter4')
    
c = 0
for uri in audio_data['uri']:
    ids = uri.split(':')[-1]
    if ids not in lyrics or (not lyrics[ids]):
        c += 1
print len(tmp),len(lyrics),c

if not os.path.isfile('lyrics/lyrics6.pickle'):
    # join table and remove duplicate rows
    spotify_leftover2 = pd.merge(tmp,top10,how='inner',on=['spotifyID']).ix[:,:6]
    spotify_leftover2['title_y'] = map(lambda x:x.replace(' ','_'),spotify_leftover2['title_y'].values)
    spotify_leftover2.drop(['date'],axis=1,inplace=True)
    spotify_leftover2.drop_duplicates(subset=['spotifyID'],keep='first',inplace=True)

    ids = spotify_leftover2.spotifyID.values
    names = zip(spotify_leftover2['title_y'].values,spotify_leftover2['artists'].values)
    
    leftovers = find_lyric_multiple_artists3(ids,names,lyrics)
    
    with open('lyrics/lyrics6.pickle', 'wb') as handle:
        pickle.dump(lyrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    leftover_names = get_spotify_names(leftovers)
    tmp = pd.DataFrame(leftover_names,columns = ['title','artists'])
    tmp['spotifyID'] = leftovers
    
    tmp.to_pickle('lyrics/spotify_leftover_filter5')
else:
    lyrics = pickle.load(open('lyrics/lyrics6.pickle', 'r'))
    tmp = pd.read_pickle('lyrics/spotify_leftover_filter5')
    
c = 0
for uri in audio_data['uri']:
    ids = uri.split(':')[-1]
    if ids not in lyrics or (not lyrics[ids]):
        c += 1
print len(tmp),len(lyrics),c

if not os.path.isfile('lyrics/lyrics7.pickle'):
    # join table and remove duplicate rows
    spotify_leftover2 = pd.merge(tmp,top10,how='inner',on=['spotifyID']).ix[:,:6]
    spotify_leftover2['title_y'] = map(lambda x:x.replace(' ','_'),spotify_leftover2['title_y'].values)
    spotify_leftover2.drop(['date'],axis=1,inplace=True)
    spotify_leftover2.drop_duplicates(subset=['spotifyID'],keep='first',inplace=True)

    ids = spotify_leftover2.spotifyID.values
    artists = map(lambda x:[x],spotify_leftover2['artist'].values)
    names = zip(spotify_leftover2['title_y'].values,artists)
    
    leftovers = find_lyric_multiple_artists3(ids,names,lyrics)
    
    with open('lyrics/lyrics7.pickle', 'wb') as handle:
        pickle.dump(lyrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    leftover_names = get_spotify_names(leftovers)
    tmp = pd.DataFrame(leftover_names,columns = ['title','artists'])
    tmp['spotifyID'] = leftovers
    
    tmp.to_pickle('lyrics/spotify_leftover_filter6')
else:
    lyrics = pickle.load(open('lyrics/lyrics7.pickle', 'r'))
    tmp = pd.read_pickle('lyrics/spotify_leftover_filter6')

c = 0
for uri in audio_data['uri']:
    ids = uri.split(':')[-1]
    if ids not in lyrics or (not lyrics[ids]):
        c += 1
print len(tmp),len(lyrics),c

573 5102 573
458 5217 458
349 5326 349
319 5356 319
310 5365 310
157 5518 157


In [303]:
print 'Covered: {:{prec}} Billboard songs'.format((len(pd.read_pickle('Spotify_audio_features'))-float(c))/len(pd.read_pickle('Spotify_audio_features')),prec='.2') # decide to ignore these 157 songs since they are small percentage

Covered: 0.96 Billboard songs


# Lyrics for MSD songs

In [5]:
lyrics = pickle.load(open('lyrics/lyrics7.pickle', 'r'))
# get spotifyIDs for MSD songs
audio_data = pd.read_pickle('MSD_audio_features')
spotifyIDs = map(lambda x:x.split(':')[-1],audio_data.uri.values)
# get ids for those who don't have lyrics
ids = [i for i in spotifyIDs if i not in lyrics]
spotify_names = get_spotify_names(ids) # get title,artists info for spotifyIDs
print len(spotify_names) # 2560 songs to collect lyrics
MSD = pd.read_pickle('MSD_tracks')

2560


In [20]:
# @overwrite get_lyric functions to avoid raising unicodeDecodeError
# weird I can't actually find error when I run it outside the function:
# À mon avis les bantous de la capitale

def get_lyric(title,artist):
    url_head = u'http://lyrics.wikia.com/wiki/'
    try:
        url = url_head+urllib.quote(artist.encode('utf-8'))+':'+urllib.quote(title.encode('utf-8')) #encoding in utf-8 to avoid quote() throwing key error. eg.chanté moore
    except UnicodeDecodeError as e:
        url = 'http://lyrics.wikia.com/wiki'
    r = requests.get(url)
    soup = BeautifulSoup(r.text,'html.parser')
    lyricbox = soup.find('div',class_ = 'lyricbox')
    lyric = re.sub('<.?div.*?>','',str(lyricbox))
    lyric = re.sub('<br/>','.',lyric)
    return lyric

def get_lyric2(title,artist):
    # also look for redirect message
    url_head = u'http://lyrics.wikia.com/wiki/'
    try:
        url = url_head+urllib.quote(artist.encode('utf-8'))+':'+urllib.quote(title.encode('utf-8')) #encoding in utf-8 to avoid quote() throwing key error. eg.chanté moore
    except UnicodeDecodeError as e:
        url = 'http://lyrics.wikia.com/wiki'
    r = requests.get(url)
    soup = BeautifulSoup(r.text,'html.parser')
    lyricbox = soup.find('div',class_ = 'lyricbox')

    if lyricbox: # if url has lyricbox
        lyric = re.sub('<.?div.*?>','',str(lyricbox))
        lyric = re.sub('<br/>','.',lyric)
        return lyric
    elif soup.find('div',class_ = 'redirectMsg'): # if url is redirect message page
        href = soup.find('div',class_ = 'redirectMsg').find('a')['href']
        url = u'http://lyrics.wikia.com/'+href
        r = requests.get(url)
        soup = BeautifulSoup(r.text,'html.parser')
        lyricbox = soup.find('div',class_ = 'lyricbox')
        lyric = re.sub('<.?div.*?>','',str(lyricbox))
        lyric = re.sub('<br/>','.',lyric)
        return lyric
    else:
        return 'None'
    
def get_lyric3(title,artist):
    # href to url is changed, since some href has /wiki/... instead of wiki/...
    url_head = u'http://lyrics.wikia.com/wiki/'
    try:
        url = url_head+urllib.quote(artist.encode('utf-8'))+':'+urllib.quote(title.encode('utf-8')) #encoding in utf-8 to avoid quote() throwing key error. eg.chanté moore
    except UnicodeDecodeError as e:
        print 't:_',title
        print 'a:_',artist
        url = 'http://lyrics.wikia.com/wiki'
    r = requests.get(url)
    soup = BeautifulSoup(r.text,'html.parser')
    lyricbox = soup.find('div',class_ = 'lyricbox')

    if lyricbox: # if url has lyricbox
        lyric = re.sub('<.?div.*?>','',str(lyricbox))
        lyric = re.sub('<br/>','.',lyric)
        return lyric
    elif soup.find('div',class_ = 'redirectMsg'): # if url is redirect message page
        href = soup.find('div',class_ = 'redirectMsg').find('a')['href']
        url = u'http://lyrics.wikia.com'+href
        r = requests.get(url)
        soup = BeautifulSoup(r.text,'html.parser')
        lyricbox = soup.find('div',class_ = 'lyricbox')
        lyric = re.sub('<.?div.*?>','',str(lyricbox))
        lyric = re.sub('<br/>','.',lyric)
        return lyric
    else:
        return 'None'

In [23]:
if not os.path.isfile('lyrics/lyrics8.pickle'):
    # MSD songs lyrics collection, filter 1: re.sub(' -.*','',x)
    spotify_no_lyrics = pd.DataFrame(spotify_names,columns = ['title','artists'])
    spotify_no_lyrics['uri'] = ids
    spotify_no_lyrics.head(2)
    spotify_no_lyrics['trimmed_title'] = map(lambda x:re.sub(' -.*','',x),spotify_no_lyrics.title)
    
    names = zip(spotify_no_lyrics.trimmed_title.values,spotify_no_lyrics.artists)
    ids   = spotify_no_lyrics.uri
    leftovers = find_lyric_multiple_artists(ids,names,lyrics)
    
    # save lyrics after filter 1
    with open('lyrics/lyrics8.pickle', 'wb') as handle:
            pickle.dump(lyrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # collect leftover songs for filter 2
    leftover_names = get_spotify_names(leftovers)
    tmp = pd.DataFrame(leftover_names,columns = ['title','artists'])
    tmp['uri'] = leftovers
    tmp.head()
    tmp.to_pickle('lyrics/MSD_leftover_filter1')
else:
    lyrics = pickle.load(open('lyrics/lyrics8.pickle', 'r'))
    tmp = pd.read_pickle('lyrics/MSD_leftover_filter1')

print 'Step 1 result:'
c = 0
for uri in audio_data['uri']:
    ID = uri.split(':')[-1]
    if ID not in lyrics or (not lyrics[ID]):
        c += 1
print len(tmp),len(lyrics),c
print

if not os.path.isfile('lyrics/lyrics9.pickle'):
    # billboard songs lyrics collection, filter 2: use billboard data title names instead of spotify title names
    spotify_leftover2 = pd.merge(tmp,Extra_tracks,how='inner',on=['uri'])
    spotify_leftover2['title_y'] = map(lambda x:x.lower(),spotify_leftover2['title_y'].values)
    spotify_leftover2.drop_duplicates(subset=['uri'],keep='first',inplace=True)

    ids = spotify_leftover2.uri.values
    names = zip(spotify_leftover2['title_y'].values,spotify_leftover2['artists'].values)
    
    leftovers = find_lyric_multiple_artists(ids,names,lyrics)
    
    with open('lyrics/lyrics9.pickle', 'wb') as handle:
        pickle.dump(lyrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # collect leftover songs for filter 2
    leftover_names = get_spotify_names(leftovers)
    tmp = pd.DataFrame(leftover_names,columns = ['title','artists'])
    tmp['uri'] = leftovers
    tmp.head()
    tmp.to_pickle('lyrics/MSD_leftover_filter2')
else:
    lyrics = pickle.load(open('lyrics/lyrics9.pickle', 'r'))
    tmp = pd.read_pickle('lyrics/MSD_leftover_filter2')
    
print 'Step 2 result:'
c = 0
for uri in audio_data['uri']:
    ID = uri.split(':')[-1]
    if ID not in lyrics or (not lyrics[ID]):
        c += 1
print len(tmp),len(lyrics),c
print

# billboard songs lyrics collection, filter 3: title.replace(' ','_'),artist.replace(' ','_')
if not os.path.isfile('lyrics/lyrics10.pickle'):
    # join table and remove duplicate rows
    spotify_leftover2 = pd.merge(tmp,MSD,how='inner',on=['uri'])
    spotify_leftover2['title_y'] = map(lambda x:x.lower().replace(' ','_'),spotify_leftover2['title_y'].values)
    spotify_leftover2.drop_duplicates(subset=['uri'],keep='first',inplace=True)

    ids = spotify_leftover2.uri.values
    names = zip(spotify_leftover2['title_y'].values,spotify_leftover2['artists'].values)
    
    leftovers = find_lyric_multiple_artists2(ids,names,lyrics)
    
    with open('lyrics/lyrics10.pickle', 'wb') as handle:
        pickle.dump(lyrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # collect leftover songs for filter 3
    leftover_names = get_spotify_names(leftovers)
    tmp = pd.DataFrame(leftover_names,columns = ['title','artists'])
    tmp['uri'] = leftovers
    
    tmp.to_pickle('lyrics/MSD_leftover_filter3')
else:
    lyrics = pickle.load(open('lyrics/lyrics10.pickle', 'r'))
    tmp = pd.read_pickle('lyrics/MSD_leftover_filter3')

        
print 'Step 3 result:'
c = 0
for uri in audio_data['uri']:
    ID = uri.split(':')[-1]
    if ID not in lyrics or (not lyrics[ID]):
        c += 1
print len(tmp),len(lyrics),c
print


if not os.path.isfile('lyrics/lyrics11.pickle'):
    # join table and remove duplicate rows
    spotify_leftover2 = pd.merge(tmp,MSD,how='inner',on=['uri'])
    spotify_leftover2['title_y'] = map(lambda x:x.lower().replace(' ','_'),spotify_leftover2['title_y'].values)
    spotify_leftover2.drop_duplicates(subset=['uri'],keep='first',inplace=True)

    ids = spotify_leftover2.uri.values
    names = zip(spotify_leftover2['title_y'].values,spotify_leftover2['artists'].values)
    
    leftovers = find_lyric_multiple_artists3(ids,names,lyrics)
    
    with open('lyrics/lyrics11.pickle', 'wb') as handle:
        pickle.dump(lyrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    leftover_names = get_spotify_names(leftovers)
    tmp = pd.DataFrame(leftover_names,columns = ['title','artists'])
    tmp['uri'] = leftovers
    
    tmp.to_pickle('lyrics/MSD_leftover_filter4')
else:
    lyrics = pickle.load(open('lyrics/lyrics11.pickle', 'r'))
    tmp = pd.read_pickle('lyrics/MSD_leftover_filter4')

print 'Step 4 result:'
c = 0
for uri in audio_data['uri']:
    ID = uri.split(':')[-1]
    if ID not in lyrics or (not lyrics[ID]):
        c += 1
print len(tmp),len(lyrics),c
print

if not os.path.isfile('lyrics/lyrics12.pickle'):
    # join table and remove duplicate rows
    spotify_leftover2 = pd.merge(tmp,MSD,how='inner',on=['uri']).ix[:,:6]
    spotify_leftover2['title_y'] = map(lambda x:x.replace(' ','_'),spotify_leftover2['title_y'].values)
    spotify_leftover2.drop_duplicates(subset=['uri'],keep='first',inplace=True)

    ids = spotify_leftover2.uri.values
    names = zip(spotify_leftover2['title_y'].values,spotify_leftover2['artists'].values)
    
    leftovers = find_lyric_multiple_artists3(ids,names,lyrics)
    
    with open('lyrics/lyrics12.pickle', 'wb') as handle:
        pickle.dump(lyrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    leftover_names = get_spotify_names(leftovers)
    tmp = pd.DataFrame(leftover_names,columns = ['title','artists'])
    tmp['uri'] = leftovers
    
    tmp.to_pickle('lyrics/MSD_leftover_filter5')
else:
    lyrics = pickle.load(open('lyrics/lyrics12.pickle', 'r'))
    tmp = pd.read_pickle('lyrics/MSD_leftover_filter5')

print 'Step 5 result:'
c = 0
for uri in audio_data['uri']:
    ID = uri.split(':')[-1]
    if ID not in lyrics or (not lyrics[ID]):
        c += 1
print len(tmp),len(lyrics),c
print

if not os.path.isfile('lyrics/lyrics13.pickle'):
    # join table and remove duplicate rows
    spotify_leftover2 = pd.merge(tmp,MSD,how='inner',on=['uri']).ix[:,:6]
    spotify_leftover2['title_y'] = map(lambda x:x.replace(' ','_'),spotify_leftover2['title_y'].values)
    spotify_leftover2.drop_duplicates(subset=['uri'],keep='first',inplace=True)

    ids = spotify_leftover2.uri.values
    artists = map(lambda x:[x],spotify_leftover2['artist'].values)
    names = zip(spotify_leftover2['title_y'].values,artists)
    
    leftovers = find_lyric_multiple_artists3(ids,names,lyrics)
    
    with open('lyrics/lyrics13.pickle', 'wb') as handle:
        pickle.dump(lyrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    leftover_names = get_spotify_names(leftovers)
    tmp = pd.DataFrame(leftover_names,columns = ['title','artists'])
    tmp['uri'] = leftovers
    
    tmp.to_pickle('lyrics/MSD_leftover_filter6')
else:
    lyrics = pickle.load(open('lyrics/lyrics13.pickle', 'r'))
    tmp = pd.read_pickle('lyrics/MSD_leftover_filter6')

print 'Step 6 result:'
c = 0
for uri in audio_data['uri']:
    ID = uri.split(':')[-1]
    if ID not in lyrics or (not lyrics[ID]):
        c += 1
print len(tmp),len(lyrics),c
print

Step 1 result:
2283 5789 2283

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
Step 2 result:
2237 5814 2258

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
Step 3 result:
2172 5879 2191

0
100
200
t:_ À_mon_avis
a:_ les bantous de la capitale
300
400
500
600
700
t:_ dang_pyar_da 
a:_ naseebo lal
800
900
1000
1100
1200
t:_ ek_pardesi_di 
a:_ naseebo lal
1300
1400
1500
1600
1700
1800
1900
2000
t:_ lovesick_-_obsession_in_¾_time
a:_ philippe sarde
2100
t:_ Èay!_que_pena
a:_ los chunguitos
Step 4 result:
2168 5883 2187

0
100
200
t:_ À_Mon_Avis
a:_ les bantous de la capitale
300
400
500
600
700
t:_ Dang_Pyar_Da 
a:_ naseebo lal
800
900
1000
1100
1200
t:_ Ek_Pardesi_Di 
a:_ naseebo lal
1300
1400
1500
1600
1700
1800
1900
2000
t:_ LOVESICK_-_Obsession_in_¾_Time
a:_ philippe sarde
2100
t:_ Èay!_Que_Pena
a:_ los chunguitos
Step 5 result:
2167 5884 2186

0
100
200
t:_ À_Mon_Avis
a:_ Le

In [28]:
uris = audio_data.uri
c = 0
for ID in map(lambda x:x.split(':')[-1],uris):
    if ID in lyrics and lyrics[ID]!= 'None':
        c += 1
print 'Covered {:{prec}} MSD songs.'.format(float(c)/len(uris),prec='.2')

Covered 0.46 MSD songs.


## Lyrics for same album/similar artist songs

In [243]:
lyrics = pickle.load(open('lyrics/lyrics7.pickle', 'r'))
# get spotifyIDs for songs
audio_data = pd.read_pickle('Same_album_track_audio_features')
spotifyIDs = map(lambda x:x.split(':')[-1],audio_data.uri.values)
# get ids for those who don't have lyrics
ids = [i for i in spotifyIDs if i not in lyrics]
spotify_names = get_spotify_names(ids) # get title,artists info for spotifyIDs
print len(spotify_names) # 3840 songs to collect lyrics


3840


In [244]:
if not os.path.isfile('Extra_tracks'):
    Extra_tracks = pd.DataFrame(spotify_names,columns=['title','artist'])
    Extra_tracks['uri']=ids
    Extra_tracks.to_pickle('Extra_tracks')
else:
    Extra_tracks = pd.read_pickle('Extra_tracks')

In [238]:
if not os.path.isfile('lyrics/lyrics_same8.pickle'):
    # songs lyrics collection, filter 1: re.sub(' -.*','',x)
    spotify_no_lyrics = pd.DataFrame(spotify_names,columns = ['title','artists'])
    spotify_no_lyrics['uri'] = ids
    
    spotify_no_lyrics['trimmed_title'] = map(lambda x:re.sub(' -.*','',x),spotify_no_lyrics.title)
    
    names = zip(spotify_no_lyrics.trimmed_title.values,spotify_no_lyrics.artists)
    ids   = spotify_no_lyrics.uri
    leftovers = find_lyric_multiple_artists(ids,names,lyrics)
    
    # save lyrics after filter 1
    with open('lyrics/lyrics_same8.pickle', 'wb') as handle:
            pickle.dump(lyrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # collect leftover songs for filter 2
    leftover_names = get_spotify_names(leftovers)
    tmp = pd.DataFrame(leftover_names,columns = ['title','artists'])
    tmp['uri'] = leftovers
    tmp.head()
    tmp.to_pickle('lyrics/same_leftover_filter1')
else:
    lyrics = pickle.load(open('lyrics/lyrics_same8.pickle', 'r'))
    tmp = pd.read_pickle('lyrics/same_leftover_filter1')

print 'Step 1 result:'
c = 0
for uri in audio_data['uri']:
    ID = uri.split(':')[-1]
    if ID not in lyrics or (not lyrics[ID]):
        c += 1
print len(tmp),len(lyrics),c
print

if not os.path.isfile('lyrics/lyrics_same9.pickle'):
    # billboard songs lyrics collection, filter 2: use billboard data title names instead of spotify title names
    spotify_leftover2 = pd.merge(tmp,Extra_tracks,how='inner',on=['uri'])
    spotify_leftover2['title_y'] = map(lambda x:x.lower(),spotify_leftover2['title_y'].values)
    spotify_leftover2.drop_duplicates(subset=['uri'],keep='first',inplace=True)

    ids = spotify_leftover2.uri.values
    names = zip(spotify_leftover2['title_y'].values,spotify_leftover2['artists'].values)
    
    leftovers = find_lyric_multiple_artists(ids,names,lyrics)
    
    with open('lyrics/lyrics_same9.pickle', 'wb') as handle:
        pickle.dump(lyrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # collect leftover songs for filter 2
    leftover_names = get_spotify_names(leftovers)
    tmp = pd.DataFrame(leftover_names,columns = ['title','artists'])
    tmp['uri'] = leftovers
    tmp.head()
    tmp.to_pickle('lyrics/same_leftover_filter2')
else:
    lyrics = pickle.load(open('lyrics/lyrics_same9.pickle', 'r'))
    tmp = pd.read_pickle('lyrics/same_leftover_filter2')
    
print 'Step 2 result:'
c = 0
for uri in audio_data['uri']:
    ID = uri.split(':')[-1]
    if ID not in lyrics or (not lyrics[ID]):
        c += 1
print len(tmp),len(lyrics),c
print

# billboard songs lyrics collection, filter 3: title.replace(' ','_'),artist.replace(' ','_')
if not os.path.isfile('lyrics/lyrics_same10.pickle'):
    # join table and remove duplicate rows
    spotify_leftover2 = pd.merge(tmp,Extra_tracks,how='inner',on=['uri'])
    spotify_leftover2['title_y'] = map(lambda x:x.lower().replace(' ','_'),spotify_leftover2['title_y'].values)
    spotify_leftover2.drop_duplicates(subset=['uri'],keep='first',inplace=True)

    ids = spotify_leftover2.uri.values
    names = zip(spotify_leftover2['title_y'].values,spotify_leftover2['artists'].values)
    
    leftovers = find_lyric_multiple_artists2(ids,names,lyrics)
    
    with open('lyrics/lyrics_same10.pickle', 'wb') as handle:
        pickle.dump(lyrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # collect leftover songs for filter 3
    leftover_names = get_spotify_names(leftovers)
    tmp = pd.DataFrame(leftover_names,columns = ['title','artists'])
    tmp['uri'] = leftovers
    
    tmp.to_pickle('lyrics/same_leftover_filter3')
else:
    lyrics = pickle.load(open('lyrics/lyrics_same10.pickle', 'r'))
    tmp = pd.read_pickle('lyrics/same_leftover_filter3')

        
print 'Step 3 result:'
c = 0
for uri in audio_data['uri']:
    ID = uri.split(':')[-1]
    if ID not in lyrics or (not lyrics[ID]):
        c += 1
print len(tmp),len(lyrics),c
print


if not os.path.isfile('lyrics/lyrics_same11.pickle'):
    # join table and remove duplicate rows
    spotify_leftover2 = pd.merge(tmp,Extra_tracks,how='inner',on=['uri'])
    spotify_leftover2['title_y'] = map(lambda x:x.lower().replace(' ','_'),spotify_leftover2['title_y'].values)
    spotify_leftover2.drop_duplicates(subset=['uri'],keep='first',inplace=True)

    ids = spotify_leftover2.uri.values
    names = zip(spotify_leftover2['title_y'].values,spotify_leftover2['artists'].values)
    
    leftovers = find_lyric_multiple_artists3(ids,names,lyrics)
    
    with open('lyrics/lyrics_same11.pickle', 'wb') as handle:
        pickle.dump(lyrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    leftover_names = get_spotify_names(leftovers)
    tmp = pd.DataFrame(leftover_names,columns = ['title','artists'])
    tmp['uri'] = leftovers
    
    tmp.to_pickle('lyrics/same_leftover_filter4')
else:
    lyrics = pickle.load(open('lyrics/lyrics_same11.pickle', 'r'))
    tmp = pd.read_pickle('lyrics/same_leftover_filter4')

print 'Step 4 result:'
c = 0
for uri in audio_data['uri']:
    ID = uri.split(':')[-1]
    if ID not in lyrics or (not lyrics[ID]):
        c += 1
print len(tmp),len(lyrics),c
print

if not os.path.isfile('lyrics/lyrics_same12.pickle'):
    # join table and remove duplicate rows
    spotify_leftover2 = pd.merge(tmp,Extra_tracks,how='inner',on=['uri']).ix[:,:6]
    spotify_leftover2['title_y'] = map(lambda x:x.replace(' ','_'),spotify_leftover2['title_y'].values)
    spotify_leftover2.drop_duplicates(subset=['uri'],keep='first',inplace=True)

    ids = spotify_leftover2.uri.values
    names = zip(spotify_leftover2['title_y'].values,spotify_leftover2['artists'].values)
    
    leftovers = find_lyric_multiple_artists3(ids,names,lyrics)
    
    with open('lyrics/lyrics_same12.pickle', 'wb') as handle:
        pickle.dump(lyrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    leftover_names = get_spotify_names(leftovers)
    tmp = pd.DataFrame(leftover_names,columns = ['title','artists'])
    tmp['uri'] = leftovers
    
    tmp.to_pickle('lyrics/same_leftover_filter5')
else:
    lyrics = pickle.load(open('lyrics/lyrics_same12.pickle', 'r'))
    tmp = pd.read_pickle('lyrics/same_leftover_filter5')

print 'Step 5 result:'
c = 0
for uri in audio_data['uri']:
    ID = uri.split(':')[-1]
    if ID not in lyrics or (not lyrics[ID]):
        c += 1
print len(tmp),len(lyrics),c
print

if not os.path.isfile('lyrics/lyrics_same13.pickle'):
    # join table and remove duplicate rows
    spotify_leftover2 = pd.merge(tmp,Extra_tracks,how='inner',on=['uri']).ix[:,:6]
    spotify_leftover2['title_y'] = map(lambda x:x.replace(' ','_'),spotify_leftover2['title_y'].values)
    spotify_leftover2.drop_duplicates(subset=['uri'],keep='first',inplace=True)

    ids = spotify_leftover2.uri.values
    names = zip(spotify_leftover2['title_y'].values,spotify_leftover2['artists'].values)
    
    leftovers = find_lyric_multiple_artists3(ids,names,lyrics)
    
    with open('lyrics/lyrics_same13.pickle', 'wb') as handle:
        pickle.dump(lyrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    leftover_names = get_spotify_names(leftovers)
    tmp = pd.DataFrame(leftover_names,columns = ['title','artists'])
    tmp['uri'] = leftovers
    
    tmp.to_pickle('lyrics/same_leftover_filter6')
else:
    lyrics = pickle.load(open('lyrics/lyrics_same13.pickle', 'r'))
    tmp = pd.read_pickle('lyrics/same_leftover_filter6')

print 'Step 6 result:'
c = 0
for uri in audio_data['uri']:
    ID = uri.split(':')[-1]
    if ID not in lyrics or (not lyrics[ID]):
        c += 1
print len(tmp),len(lyrics),c
print

Step 1 result:
946 8412 946

Step 2 result:
945 8413 945

Step 3 result:
827 8531 827

Step 4 result:
799 8559 799

Step 5 result:
799 8559 799

Step 6 result:
799 8559 799



In [239]:
def get_lyric_az(title,artist):
    t = re.sub(r'\W','',title.lower())
    a = re.sub(r'\W','',artist.lower())
    url = 'http://www.azlyrics.com/lyrics/'+a+'/'+t+'.html'
    r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36'})
    if r.status_code == 200:
        lyricbox = re.findall(r'<!-- Usage of .*?>.*<!-- MxM banner -->',re.sub('\n','',r.text))[0]
        lyrics = re.sub('<br>','.',lyricbox)
        lyrics = re.sub(r'<.*?>',' ',lyrics)
        lyrics = re.sub('\r','',lyrics)
        lyrics = unicode(lyrics).encode('ascii','ignore')
        return lyrics
    return None

def find_lyric_az(ids,spotify_names,lyric_dict):
    leftovers = []
    for i,(title,artists) in enumerate(spotify_names):
        if i % 50 == 0:
            print i
        uri = ids[i]
        j= 0
        while j <= len(artists)-1:
            artist = artists[j]
            lyric = get_lyric_az(title,artist)
            if lyric and uri not in lyric_dict: # if found lyric add that to the lyrics list
                lyric_dict[uri] = lyric
                break
            j += 1
        # if not found lyric for this title,artists
        if not lyric:
            leftovers.append(uri)
        time.sleep(0.2)
    return leftovers


In [286]:
if not os.path.isfile('lyrics/lyrics_final.pickle'):
    # join table and remove duplicate rows
    spotify_leftover2 = pd.merge(tmp,Extra_tracks,how='inner',on=['uri']).ix[:,:6]
    spotify_leftover2.drop_duplicates(subset=['uri'],keep='first',inplace=True)

    ids = spotify_leftover2.uri.values
    names = zip(spotify_leftover2['title_y'].values,spotify_leftover2['artists'].values)
    
    leftovers = find_lyric_az(ids,names,lyrics)
    
    with open('lyrics/lyrics_final.pickle', 'wb') as handle:
        pickle.dump(lyrics, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    leftover_names = get_spotify_names(leftovers)
    tmp = pd.DataFrame(leftover_names,columns = ['title','artists'])
    tmp['uri'] = leftovers
    
    tmp.to_pickle('lyrics/same_leftover_final')
else:
    lyrics = pickle.load(open('lyrics/lyrics_final.pickle', 'r'))
    tmp = pd.read_pickle('lyrics/same_leftover_final')

print 'Step 7 result:'
c = 0
for uri in audio_data['uri']:
    ID = uri.split(':')[-1]
    if ID not in lyrics or (not lyrics[ID]):
        c += 1
print len(tmp),len(lyrics),c
print

0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
Step 7 result:
746 8612 746

