# Retrieving song info + lyrics
## A code that builds a 16K songs dataset via SpotiPy and LyricsGenius
### The second part of the Lyrics Entropy project is available [here](https://carlosfg97.github.io/MusicLyricEntropy/MusicLyricsEntropy.html)
### and the main repo is [here](https://github.com/carlosfg97/MusicLyricEntropy)

In [1]:
#!pip install spotipy
# https://spotipy.readthedocs.io/en/2.19.0/
# https://medium.com/@maxtingle/getting-started-with-spotifys-api-spotipy-197c3dc6353b

In [2]:
import spotipy
import pandas as pd
from spotipy.oauth2 import SpotifyClientCredentials
import pickle
cid =  # add Spotify API client ID
secret =  # ADD secret token 
client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

In [4]:
#taken from https://github.com/MaxHilsdorf/introduction_to_spotipy/blob/master/introduction_to_spotipy.ipynb
def get_playlist_tracks(username,playlist_id):
    results = sp.user_playlist_tracks(username,playlist_id)
    tracks = results['items']
    while results['next']:
        results = sp.next(results)
        tracks.extend(results['items'])
    return tracks

def get_playlist_tracks(playlist_id):
    results = sp.playlist_tracks(playlist_id)
    tracks = results['items']
    while results['next']:
        results = sp.next(results)
        tracks.extend(results['items'])
    return tracks

### TRACK

In [7]:
playlist_ids = [ '532F1h299qKD894BlPfJJF', '7bVwBxsWk84iKeMPXa8oRs', '6dSeKJvN1MhLP1IEwBWFXg',
                '0f2CvetujnCTOxB6KCXhII', '37i9dQZF1DXcBWIGoYBM5M', '37i9dQZF1DX4bSrsRWE9cd',
                '37i9dQZF1DWU3bkMPOyjie', '37i9dQZF1DWY4xHQp97fN6', '37i9dQZF1DX4WYpdgoIcn6', 
                '37i9dQZF1DX5NsFgylu4qQ', '37i9dQZF1DX82pCGH5USnM', '37i9dQZF1DWYs83FtTMQFw',
                '37i9dQZF1DXcGnc6d1f20P','6dSeKJvN1MhLP1IEwBWFXg', '532F1h299qKD894BlPfJJF',
                '3UVNqcek0Hg8ne1ijJn9kc','6ZlFKcTzJVslgjudScTX4G','3NgkCmLvO7UrtMsVBEeFVU',
                '37i9dQZF1DX4UtSsGT1Sbe','37i9dQZF1DXbTxeAdrVG2l','3NVUQ2r0Ty7eWH2AwItUE4',
                '72LA3OR3WCoXu6ZC7opyz9','5UtamknFCkYdgfwYrCbquZ','23zeB07Gmr1X5xJLV69GI0',]

In [8]:
track_lists = []
for i in playlist_ids:
    track_list = get_playlist_tracks(i)
    print(sp.playlist(i)['name'], str(len(track_list)) + ' songs')
    #print(len(track_list))
    track_lists.extend(track_list)

West's Bests 63 songs
Olds 314 songs
🌀FLOW🌀 160 songs
Today's Top Hits 50 songs
Bliss 72 songs
Summer Rock Vibes 73 songs
Get Turnt 100 songs
Chill Hits 130 songs
Waves 78 songs
Lounge - Soft House 202 songs
Hot Rhythmic 90 songs
This Is J. Cole 48 songs
Olds 314 songs
Longest Playlist 50/60/70/80/90/00/10/20 5727 songs
Longest playlist everrrrrrrrrrrrrrr 1983 songs
Longest Playlist 3596 songs
70s Hits 305 songs
All Out 80s 150 songs
All Out 90s 150 songs
LONG ASS INDIE ETC PLAYLIST 433 songs
the longest rock playlist on spotify 9421 songs
a totally work appropriate upbeat indie playlist that's also over SEVENTEEN hours long 276 songs
excessively long chill songs playlist 2691 songs


In [10]:
tracks_ids = []
for i in track_lists:
    try:
        tracks_ids.append(i['track']['id'])
    except:
        continue   

tracks_ids = list(set(tracks_ids))

In [11]:
len(tracks_ids)

24036

In [12]:
def get_complete_track_info(track_id):
    
    # basic info
    r = sp.track(track_id)
    album_id = r['album']['id']
    artist_id = [i['id'] for i in r['artists']]
    duration_ms = r['duration_ms']
    explicit = r['explicit']
    url = r['href'] 
    song_id = r['id']
    song_name = r['name']
    song_popularity = r['popularity']
    song_type = r['type']
    song_release_date = r['album']['release_date']
    
    
    # track audio analysis attributes
    af = sp.audio_features([track_id])[0]
    track_danceability = af['danceability']
    track_energy = af['energy']
    track_key = af['key']
    track_loudness = af['loudness']
    track_mode = af['mode']
    track_speechiness = af['speechiness']
    track_acousticeess = af['acousticness']
    track_instrumentalness = af['instrumentalness']
    track_liveness = af['liveness']
    track_valence = af['valence']
    track_tempo = af['tempo']
    
    output = {
        'track_id' : track_id,
        'album_id' : album_id,
        'artist_id' : artist_id,
        'duration_ms' : duration_ms,
        'flg_explicit' : explicit,
        'url' : url,
        'track_name' : song_name,
        'track_popularity' : song_popularity,
        'track_type' : song_type,
        'track_danceability' : track_danceability,
        'track_energy' : track_energy,
        'track_key' : track_key,
        'track_loudness' : track_loudness,
        'track_mode' : track_mode,
        'track_speechiness' : track_speechiness,
        'track_acousticeess' : track_acousticeess,
        'track_instrumentalness' : track_instrumentalness,
        'track_liveness' : track_liveness,
        'track_valence' : track_valence,
        'track_tempo' : track_tempo   , 
        'track_release_date' : song_release_date
    }
    
    return output


In [13]:
partition_size = int(len(tracks_ids)/4)

In [14]:
len(tracks_ids)

24036

In [16]:
p1 = 0 + partition_size*1
p2 = 0 + partition_size*2
p3 = 0 + partition_size*3
#p4 = print(0 + partition_size*4)
print(p1,p2,p3)

6009 12018 18027


In [17]:

track_info = []
f = 0 
for i in tracks_ids[:p1]:
    try:
        track_info.append(get_complete_track_info(i))
    except:
        continue   
        
    f += 1
    if f in [1000,2000,4000,5000,6000]:
        print(f)

    

1000
2000
4000
5000
6000


In [18]:
df_tracks1 = pd.DataFrame(track_info)
df_tracks1.to_csv('df_tracks1.csv')

In [19]:
track_info = []
f=0

for i in tracks_ids[p1:p2]:
    try:
        track_info.append(get_complete_track_info(i))
    except:
        continue  
    
    f += 1
    if f in [1000,2000,3000,4000]:
        print(f)


    

1000
2000
3000
4000


In [20]:
df_tracks2 = pd.DataFrame(track_info)
df_tracks2.to_csv('df_tracks2.csv')

In [21]:
track_info = []
f = 0

for i in tracks_ids[p2:p3]:
    try:
        track_info.append(get_complete_track_info(i))
    except:
        continue   
    f += 1
    if f in [1000,2000,4000,5000]:
        print(f)

    

1000
2000
4000
5000


In [22]:
df_tracks3 = pd.DataFrame(track_info)
df_tracks3.to_csv('df_tracks3.csv')

In [23]:
track_info = []
f = 0
for i in tracks_ids[p3:]:
    try:
        track_info.append(get_complete_track_info(i))
    except:
        continue   
    f += 1
    if f in [1000,2000,4000,5000]:
        print(f)

    

1000
2000
4000
5000


In [24]:
df_tracks4 = pd.DataFrame(track_info)
df_tracks4.to_csv('df_tracks4.csv')

In [26]:
df1 = pd.read_csv('df_tracks1.csv')
df2 = pd.read_csv('df_tracks2.csv')
df3 = pd.read_csv('df_tracks3.csv')
df4 = pd.read_csv('df_tracks4.csv')

In [17]:
df_alltracks.artist_id = df_alltracks.artist_id.apply(lambda x: x[0])

In [None]:
import pickle
with open('df_alltracks.pickle', 'wb') as handle:
    pickle.dump(df_alltracks, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [6]:
with open('df_alltracks.pickle', 'rb') as handle:
    df_alltracks = pickle.load(handle)

### ARTIST

In [25]:
df_tracks = df_alltracks.copy()
artist_ids  = df_tracks.artist_id.drop_duplicates()
artist_ids = list(set(artist_ids))

In [27]:
def get_complete_artist_info(artist_id):
    
    # basic info
    a = sp.artist(artist_id)
    artist_followers =  a['followers']['total']
    artist_genres = a['genres']
    artist_name = a['name']
    artist_popularity = a['popularity']
    artist_url = a['href']
    
    
    output = {
        'artist_id' : artist_id,
        'artist_followers' : artist_followers,
        'artist_genres' : artist_genres,
        'artist_name' : artist_name,
        'artist_popularity' : artist_popularity,
        'artist_url' : artist_url
    }
    
    return output

In [28]:
artist_info = []
for i in artist_ids:
    artist_info.append(get_complete_artist_info(i))

In [29]:
df_artist = pd.DataFrame(artist_info)

In [30]:
df_artist = df_artist[['artist_id', 'artist_name',    'artist_genres'       , 'artist_followers', 'artist_popularity']]

In [31]:
df_artist.to_csv('artists.csv',index=False)

### TRACKS LYRICS (GENIUS API)

In [None]:
#!pip install lyricsgenius
# https://pypi.org/project/lyricsgenius/
import lyricsgenius
genius.verbose = False # Turn off status messages
genius.remove_section_headers = True 

In [None]:
token = ### insert personal token 
genius = lyricsgenius.Genius(token)

In [None]:
with open('df_alltracks.pickle', 'rb') as handle:
    df_alltracks = pickle.load(handle)

In [None]:
artists = pd.read_csv('artists.csv')

In [None]:
df_alltracks = df_alltracks[df_alltracks.artist_id.apply(lambda x: len(x)) == 1]
df_alltracks.artist_id = df_alltracks.artist_id.apply(lambda x: x[0])
print(df_alltracks.shape)
df_alltracks = df_alltracks.merge(artists ,  how='left', left_on = 'artist_id', right_on = 'artist_id')


In [None]:
lyrics_ls = []

f = 0
for song, artist_ in zip(df_alltracks.track_name[:] ,  df_alltracks.artist_name[:]):
    try:
        lyrics_ls.append(genius.search_song(title = song, artist = artist_ ).lyrics )
    except:
        lyrics_ls.append('')
        
    f += 1    
    if f in [333*i for i in range(1,15)]:
        print(f)
    if f == 4000:
        break
        
df_4k = df_alltracks[:4000].copy()
df_4k['lyrics'] = lyrics_ls
df_4k.to_csv('tracks4k.csv', index=None)

In [None]:
lyrics_ls_2 = []

f = 0
for song, artist_ in zip(df_alltracks.track_name[4000:8000] ,  df_alltracks.artist_name[4000:8000]):
    try:
        lyrics_ls.append(genius.search_song(title = song, artist = artist_ ).lyrics )
    except:
        lyrics_ls.append('')
        
    f += 1    
    if f in [333*i for i in range(1,15)]:
        print(f)
        
df_4k = df_alltracks[4000:8000].copy()
df_4k['lyrics'] = lyrics_ls[4000:8000]

df_4k.to_csv('tracks4_8k.csv', index=None)

In [None]:
lyrics_ls_3 = []

f = 0
for song, artist_ in zip(df_alltracks.track_name[8000:12000] ,  df_alltracks.artist_name[8000:12000]):
    try:
        lyrics_ls_3.append(genius.search_song(title = song, artist = artist_ ).lyrics )
    except:
        lyrics_ls_3.append('')
        
    f += 1    
    if f in [333*i for i in range(1,15)]:
        print(f)
        
df_4k = df_alltracks[8000:12000].copy()
df_4k['lyrics'] = lyrics_ls_3

df_4k.to_csv('tracks8_12k.csv', index=None)

In [None]:
lyrics_ls_4 = []

f = 0
for song, artist_ in zip(df_alltracks.track_name[12000:16000] ,  df_alltracks.artist_name[12000:16000]):
    try:
        lyrics_ls_4.append(genius.search_song(title = song, artist = artist_ ).lyrics )
    except:
        lyrics_ls_4.append('')
        
    f += 1    
    if f in [333*i for i in range(1,15)]:
        print(f)
        
df_4k = df_alltracks[12000:16000].copy()
df_4k['lyrics'] = lyrics_ls_4

df_4k.to_csv('tracks12_16k.csv', index=None)

In [None]:
lyrics_ls_5 = []

f = 0
for song, artist_ in zip(df_alltracks.track_name[16000:20000] ,  df_alltracks.artist_name[16000:20000]):
    try:
        lyrics_ls_5.append(genius.search_song(title = song, artist = artist_ ).lyrics )
    except:
        lyrics_ls_5.append('')
        
    f += 1    
    if f in [333*i for i in range(1,15)]:
        print(f)
        
df_4k = df_alltracks[16000:20000].copy()
df_4k['lyrics'] = lyrics_ls_5

df_4k.to_csv('tracks16_20k.csv', index=None)

In [27]:
df1 = pd.read_csv('tracks4k.csv')
df2 = pd.read_csv('tracks4_8k.csv')
df3 = pd.read_csv('tracks8_12k.csv')
df4 = pd.read_csv('tracks12_16k.csv')
df5 = pd.read_csv('tracks16_20k.csv')

In [28]:
df_all = pd.concat([df1,df2,df3,df4, df5])
df_all = df_all[ ~ df_all.lyrics.isna() ]


In [29]:
names = df_all.track_name.to_list()
words = []
for name in names:
    words.extend(str(name).split(' '))
    
pd.Series(words).value_counts().head(60)

-             1796
The           1544
You           1189
I              823
the            766
Me             754
Love           690
My             554
Remastered     540
of             495
Remaster       465
A              460
Of             442
In             438
It             390
On             360
To             346
to             336
Your           298
a              293
Version        281
Don't          266
All            256
in             256
Be             242
Is             239
Live           220
For            203
Like           196
Time           187
One            183
No             182
And            178
and            169
I'm            165
This           164
Down           162
2011           162
Man            161
What           158
Up             147
Out            146
Way            137
Back           137
Get            136
Go             135
Single         131
Song           130
on             130
Life           129
Heart          127
Never          127
Do          

In [30]:
df_all = df_all[ ~ df_all.track_name.astype(str).str.contains('Remaster')]
df_all = df_all[ ~ df_all.track_name.astype(str).str.contains('Version')].reset_index(drop=True)
df_all = df_all[ ~ df_all.track_name.astype(str).str.contains('Live')].reset_index(drop=True)

(16762, 26)

In [31]:
names = df_all.track_name.to_list()
words = []
for name in names:
    words.extend(str(name).split(' '))
    
pd.Series(words).value_counts().head(60)

The       1381
You       1064
I          737
Me         687
the        667
Love       618
My         496
of         438
A          412
-          405
Of         401
In         394
It         346
On         328
To         300
to         295
Your       267
a          263
All        234
Don't      234
Be         222
Is         208
in         205
For        190
Like       182
Time       167
No         167
One        162
And        154
I'm        150
This       147
Down       146
and        146
What       139
Man        138
Up         137
Out        130
Go         124
Heart      123
Back       123
Way        118
Song       117
Life       115
Get        115
on         114
Girl       114
Never      113
for        112
World      112
Do         111
We         108
It's       107
Rock       105
Day        104
Little     104
If         103
Know       102
&          102
That       102
Away       102
dtype: int64

In [32]:
df_all.to_csv('16k_songs_and_lyrics.csv', index=None)
df_all.shape

(16762, 26)

### The second part of the Lyrics Entropy project is available [here](https://carlosfg97.github.io/MusicLyricEntropy/MusicLyricsEntropy.html)

### ALBUM

In [33]:
album_id = list(set( df_tracks.album_id.to_list()))

In [34]:
def get_complete_album_info(album):

    ab = sp.album(album)
    album_artist =  ab['artists'][0]['id']
    album_name =ab['name']
    album_type = ab['album_type']
    album_release_date = ab['release_date']
    album_label = ab['label']
       
    output = {
        'album_id' : album,
        'album_artist' : album_artist,
        'album_name' : album_name,
        'album_type' : album_type,
        'album_release_date' : album_release_date #,
        #'album_label' : album_label
    }
    
    return output

In [35]:
album_info = []
for i in album_id:
    album_info.append(get_complete_album_info(i))

In [37]:
df_album = pd.DataFrame(album_info)

In [38]:
df_album.to_csv('albums.csv',index=False)

### PLAYLIST

In [54]:
playlist_ids 

['532F1h299qKD894BlPfJJF',
 '7bVwBxsWk84iKeMPXa8oRs',
 '6dSeKJvN1MhLP1IEwBWFXg',
 '0f2CvetujnCTOxB6KCXhII',
 '37i9dQZF1DXcBWIGoYBM5M',
 '37i9dQZF1DX4bSrsRWE9cd',
 '37i9dQZF1DWU3bkMPOyjie',
 '37i9dQZF1DWY4xHQp97fN6',
 '37i9dQZF1DX4WYpdgoIcn6',
 '37i9dQZF1DX5NsFgylu4qQ',
 '37i9dQZF1DX82pCGH5USnM',
 '37i9dQZF1DWYs83FtTMQFw']

In [55]:
def get_playlist_info(playlist_id):
    
    # basic info
    p = sp.playlist(playlist_id)
    flg_collaborative = p['collaborative']
    playlist_description =  p['description']
    playlist_followers =     p['followers']['total']
    playlist_url =    p['href']
    playlist_name =     p['name']
    playlist_owner =    p['owner']['id']
    flg_public =    p['public']

    
    output = {
        'playlist_id' : playlist_id,
        'flg_collaborative' : flg_collaborative,
        'playlist_description' : playlist_description,
        'playlist_followers' : playlist_followers,
        'playlist_url' : playlist_url,
        'playlist_name' : playlist_name,
        'playlist_owner' : playlist_owner,
        'flg_public' : flg_public,
        
    }
    
    return output

In [56]:
playlist_info = []

for i in playlist_ids:
    playlist_info.append(get_playlist_info(i))

In [57]:
df_playlist = pd.DataFrame(playlist_info)

In [59]:
df_playlist.to_clipboard(index=False)

In [63]:
def get_playlist_track_relation(playlist_id):
    
    pt = sp.playlist_items(playlist_id)['items']
    
    playlist_track_added_by = [i['added_by']['id'] for i in pt]
    playlist_track_added_at =  [i['added_at'] for i in pt]
    playlist_track_id = [i['track']['id'] for i in pt]
    
    output = {
        'playlist_id' : playlist_id,
        #'playlist_track_added_by' : playlist_track_added_by,
        #'playlist_track_added_at' : playlist_track_added_at,
        'track_id' : playlist_track_id   
    }
    
    return output

In [65]:
df_playlist_track  = pd.DataFrame(get_playlist_track_relation('37i9dQZF1DWYs83FtTMQFw'))[0:0]
for i in df_playlist.playlist_id:
    
    df_playlist_track = pd.concat([df_playlist_track, 
                    pd.DataFrame(get_playlist_track_relation(i))])

In [70]:
df_playlist_track = df_playlist_track.reset_index(drop=True)

In [71]:
df_playlist_track.to_clipboard(index = False)