### Libraries

In [1]:
import pandas as pd
import numpy as np

import json
import requests
import base64
import urllib.parse
from bs4 import BeautifulSoup

import unicodedata
import ast
import time
import datetime

import dotenv
import os

### Options

In [2]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option("display.max_columns", None)

In [3]:
# Load the Spotify credentials from the project's .env file into the environment.
# NOTE(review): the path is absolute and machine-specific; the notebook will not
# find the file on another machine — consider a path relative to the project root.
dotenv.load_dotenv(dotenv.find_dotenv("/home/b4/Documents/VisualCodeStudio/spotify_project/docs/.env"))

True

### Get Credentials

In [4]:
# Read the Spotify app credentials from the environment.
# os.getenv returns None when a variable is unset, so a missing .env entry is
# not reported here; it only shows up later when the token request fails.
client_id = os.getenv("spotify_id")
client_secret = os.getenv("spotify_secret")

In [5]:
# Build "<client_id>:<client_secret>" and Base64-encode it; the encoded value is
# sent in the "Basic" Authorization header of the token request.
client_creds = f"{client_id}:{client_secret}"
client_creds_b64 = base64.b64encode(client_creds.encode())  # bytes; decoded where the header is built

In [6]:
# Pieces of the client-credentials token request that get_auth() posts:
# endpoint, form body and Basic-auth header.
token_url = "https://accounts.spotify.com/api/token"
method = "POST"  # not referenced anywhere else in the visible notebook
token_data = {
    "grant_type": "client_credentials"
}
token_headers = {
    "Authorization": f"Basic {client_creds_b64.decode()}"
}

### Functions

#### Request

In [7]:
# Create item "artist:id" in dictionary
# Registry mapping artist name -> Spotify artist ID, filled in by get_artist_id().
artists_id = {}
def get_artist_id(artist):
    """Search Spotify for `artist` and register the best match in `artists_id`.

    Only the first search result is requested (`limit=1`). The entry is stored
    under the name Spotify returns, which may differ from the search text.

    Parameters
    ----------
    artist : str
        Free-text artist name to search for.

    Returns
    -------
    dict
        The module-level `artists_id` registry (name -> Spotify ID), updated.

    Raises
    ------
    LookupError
        If the search returns no artist.
    requests.HTTPError
        If Spotify answers with an error status (e.g. expired token).

    Notes
    -----
    Reads the module-level `headers` for authorization.
    """
    query = urllib.parse.quote(artist)
    url_search = f"https://api.spotify.com/v1/search?q={query}&type=artist&limit=1"

    response = requests.get(url_search, headers=headers)
    # Fail with the HTTP error itself instead of a confusing KeyError further down.
    response.raise_for_status()

    matches = response.json().get('artists', {}).get('items', [])
    if not matches:
        raise LookupError(f"No Spotify artist found for {artist!r}")

    best_match = matches[0]
    artists_id[best_match['name']] = best_match['id']
    return artists_id


In [8]:
# Request artist info from artists dictionary
def get_artist():
    """Fetch the Spotify artist object for every ID stored in `artists_id`.

    Returns
    -------
    pandas.DataFrame
        One row per artist, flattened with `pd.json_normalize`, with the
        `genres` lists converted to tuples, the `type` and `followers.href`
        columns removed and a fresh 0..n-1 index. An empty DataFrame is
        returned when `artists_id` is empty.

    Raises
    ------
    requests.HTTPError
        If Spotify answers with an error status.

    Notes
    -----
    Reads the module-level `artists_id` registry and `headers`.
    """
    artist_frames = []
    for artist_id in artists_id.values():
        # The URL must not carry any leading whitespace.
        url_artist = f"https://api.spotify.com/v1/artists/{artist_id}"
        response = requests.get(url_artist, headers=headers)
        response.raise_for_status()
        artist_frames.append(pd.json_normalize(response.json()))

    if not artist_frames:
        return pd.DataFrame()

    # Concatenate once instead of growing the frame inside the loop.
    artist_dataframe = pd.concat(artist_frames, ignore_index=True)

    # Tuples are hashable, unlike the lists the API returns.
    artist_dataframe['genres'] = [tuple(genres) for genres in artist_dataframe['genres']]
    artist_dataframe = artist_dataframe.drop(columns=['type', 'followers.href'], errors='ignore')

    return artist_dataframe


In [9]:
# Request albums from artists
def get_artist_albums(df):
    """Collect every album and single of the artists in `df`, with popularity.

    Parameters
    ----------
    df : pandas.DataFrame
        Artist table with at least the columns `id`, `name` and `popularity`
        (as produced by `get_artist`).

    Returns
    -------
    pandas.DataFrame
        One row per album with the columns `name`, `id`, `total_tracks`,
        `popularity`, `release_date`, `artist_name`, `artist_pop`,
        `album_type`, `external_urls`, `images`, `href`, `uri`, `artists`.
        An empty frame with those columns is returned when nothing is found.

    Raises
    ------
    requests.HTTPError
        If Spotify answers with an error status.

    Notes
    -----
    Reads the module-level `headers` for authorization.
    """
    album_columns = ['name', 'id', 'total_tracks', 'popularity', 'release_date',
                     'artist_name', 'artist_pop', 'album_type', 'external_urls',
                     'images', 'href', 'uri', 'artists']

    album_pages = []
    for artist_id, artist_name, artist_pop in zip(df['id'], df['name'], df['popularity']):
        url_album = f"https://api.spotify.com/v1/artists/{artist_id}/albums?limit=50&include_groups=album,single"

        # Pagination is followed inside the per-artist loop, so every artist
        # gets all of its pages and each page is tagged with the right artist.
        # The loop ends when the API reports no `next` page.
        while url_album:
            response = requests.get(url_album, headers=headers)
            response.raise_for_status()
            page = response.json()

            items = page.get('items') or []
            if items:
                df_page = pd.DataFrame(items)
                df_page['artist_name'] = artist_name
                df_page['artist_pop'] = artist_pop
                album_pages.append(df_page)

            url_album = page.get('next')

    if not album_pages:
        return pd.DataFrame(columns=album_columns)

    album_dataframe = pd.concat(album_pages, ignore_index=True)

    # The album list does not include popularity; it is read from the
    # individual album object.
    poplist = []
    for album_id in album_dataframe['id']:
        response = requests.get(f"https://api.spotify.com/v1/albums/{album_id}", headers=headers)
        response.raise_for_status()
        poplist.append(response.json()['popularity'])
    album_dataframe['popularity'] = poplist

    return album_dataframe[album_columns]


In [10]:
# Request tracks from albums
def get_album_tracks(df, headers, time_limit, now):
    """Collect the tracks of every album in `df` with audio features and popularity.

    Parameters
    ----------
    df : pandas.DataFrame
        Album table with at least the columns `id`, `artist_name`,
        `artist_pop`, `name` and `popularity` (as produced by
        `get_artist_albums`).
    headers : dict
        Authorization headers for the Spotify API.
    time_limit : datetime.datetime
        Moment at which the token behind `headers` expires.
    now : datetime.datetime
        Kept for backward compatibility; the current time is read from the
        clock whenever the token age is checked.

    Returns
    -------
    pandas.DataFrame
        One row per track: the track fields, the artist/album names and
        popularities, `track_pop`, and the audio-feature columns. An empty
        frame with the same columns is returned when no track is found.

    Raises
    ------
    requests.HTTPError
        If the album-tracks or track-popularity requests fail.

    Notes
    -----
    Calls `get_auth()` for a new token when fewer than 120 seconds remain
    before `time_limit`.
    """
    track_columns = ['track_number', 'name', 'id', 'duration_ms', 'track_pop', 'album_name', 'album_pop', 'artist_name',
                     'artist_pop', 'artists', 'href', 'external_urls', 'uri', 'analysis_url', 'preview_url',
                     'explicit', 'disc_number', 'available_markets', 'danceability', 'energy', 'key', 'loudness',
                     'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']
    refresh_margin = datetime.timedelta(seconds=120)

    def _fresh_auth(current_headers, current_limit):
        # Renew the token shortly before it expires. The check uses the real
        # clock: a timestamp captured before the loops would never advance.
        if datetime.datetime.now() > current_limit - refresh_margin:
            current_headers, current_limit, _ = get_auth()
        return current_headers, current_limit

    print("Preparing...", end='')

    # --- Tracks of every album (all pages) ---
    track_pages = []
    for album_id, art, artpop, alb, albpop in zip(df['id'], df['artist_name'], df['artist_pop'], df['name'], df['popularity']):
        url_tracks = f"https://api.spotify.com/v1/albums/{album_id}/tracks?limit=50"
        while url_tracks:
            headers, time_limit = _fresh_auth(headers, time_limit)
            response = requests.get(url_tracks, headers=headers)
            response.raise_for_status()
            page = response.json()

            items = page.get('items') or []
            if items:
                df_track_info = pd.DataFrame(items)
                df_track_info['artist_name'] = art
                df_track_info['artist_pop'] = artpop
                df_track_info['album_name'] = alb
                df_track_info['album_pop'] = albpop
                track_pages.append(df_track_info)

            # Albums longer than one page continue at `next`.
            url_tracks = page.get('next')

    print(f"\r<Albuns successfully collected.>")

    if not track_pages:
        print("\rDone!       ")
        return pd.DataFrame(columns=track_columns)

    track_dataframe = pd.concat(track_pages, ignore_index=True)
    total_tracks = len(track_dataframe)

    # --- Audio features, one request per track ---
    # No status check here on purpose: a failed lookup yields a row of
    # missing values, which keeps the rows aligned with `track_dataframe`.
    feature_frames = []
    for id_track in track_dataframe['id']:
        headers, time_limit = _fresh_auth(headers, time_limit)
        url_tracks_extra_info = f"https://api.spotify.com/v1/audio-features/{id_track}"
        result_tracks_extra = requests.get(url_tracks_extra_info, headers=headers)
        feature_frames.append(pd.json_normalize(result_tracks_extra.json()))

        print(f"\rCollecting tracks: {round(len(feature_frames)/total_tracks*100, 2)} %", end='')

    track_extra_dataframe = pd.concat(feature_frames, ignore_index=True)

    print('\033[A\r<Tracks successfully colected.>\n')
    print('Preparing...', end='')

    # --- Track popularity, one request per track ---
    poplist = []
    for id_track in track_dataframe['id']:
        headers, time_limit = _fresh_auth(headers, time_limit)
        url_track_extra = f"https://api.spotify.com/v1/tracks?ids={id_track}"
        result_track_extra = requests.get(url_track_extra, headers=headers)
        result_track_extra.raise_for_status()
        poplist.append(result_track_extra.json()['tracks'][0]['popularity'])

    track_dataframe['track_pop'] = poplist

    # --- Merge and order columns ---
    # Drop the feature columns that duplicate track columns before joining side by side.
    track_extra_dataframe = track_extra_dataframe.drop(columns=['type', 'id', 'uri', 'track_href', 'duration_ms'])
    track_dataframe = pd.concat([track_dataframe, track_extra_dataframe], axis=1)
    track_dataframe = track_dataframe[track_columns]
    print("\rDone!       ")
    return track_dataframe


#### Cleaning

In [11]:
# Drop Playbacks and repeated songs
def drop_playbacks_copy(df_tracks):
    """Remove playback tracks and repeated songs from `df_tracks`, in place.

    A row is dropped when "Playback" appears in its `album_name` or in its
    `name`. Afterwards rows sharing the same (`name`, `duration_ms`) pair are
    reduced to their first occurrence and the index is renumbered.

    Parameters
    ----------
    df_tracks : pandas.DataFrame
        Track table with the columns `album_name`, `name` and `duration_ms`.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `df_tracks` object.
    """
    # na=False: rows with a missing name/album are simply "not a playback";
    # without it a NaN in the mask cannot be used for selection.
    is_playback = (
        df_tracks['album_name'].str.contains("Playback", regex=False, na=False)
        | df_tracks['name'].str.contains("Playback", regex=False, na=False)
    )
    df_tracks.drop(index=df_tracks.index[is_playback.to_numpy(dtype=bool)], inplace=True)

    df_tracks.drop_duplicates(subset=['name', 'duration_ms'], inplace=True)
    df_tracks.reset_index(drop=True, inplace=True)

    return df_tracks


In [12]:
# Disjoint 'artists' column in featured artists column
def disjoint_artists(df_tracks):
    """Add an `artists_feat` column holding the artist names of each track.

    Each cell of `artists` is iterated and the `name` entry of every element
    is collected into a tuple. The column is added to `df_tracks` itself.

    Parameters
    ----------
    df_tracks : pandas.DataFrame
        Track table with an `artists` column of iterables of mappings that
        have a `name` key.

    Returns
    -------
    pandas.DataFrame
        The same `df_tracks` object, with `artists_feat` added.
    """
    df_tracks['artists_feat'] = [
        tuple(credit['name'] for credit in track_credits)
        for track_credits in df_tracks['artists']
    ]
    return df_tracks


In [13]:
# Create columns with duration in minutes
def convert_duration_min(df_tracks):
    """Add a `duration_min` column with each track length as "MM:SS".

    Fractions of a second are truncated. Minutes are not wrapped at 60, so a
    track of 1 h 02 min 27 s is reported as "62:27" rather than losing its
    hour. The column is added to `df_tracks` itself.

    Parameters
    ----------
    df_tracks : pandas.DataFrame
        Track table with a `duration_ms` column (milliseconds).

    Returns
    -------
    pandas.DataFrame
        The same `df_tracks` object, with `duration_min` added.
    """
    def _as_minutes(milliseconds):
        minutes, seconds = divmod(int(milliseconds) // 1000, 60)
        return f"{minutes:02d}:{seconds:02d}"

    df_tracks['duration_min'] = [_as_minutes(ms) for ms in df_tracks['duration_ms']]
    return df_tracks


In [14]:
# Reorder columns
def reorder_columns(df_tracks):
    """Return `df_tracks` restricted to the final track columns, in display order.

    Columns that are not listed (e.g. the raw `artists` column) are left out.
    A missing listed column raises `KeyError`.
    """
    identity = ['track_number', 'name', 'id', 'duration_min', 'track_pop']
    context = ['album_name', 'album_pop', 'artist_name', 'artist_pop', 'artists_feat']
    links = ['href', 'external_urls', 'uri', 'analysis_url', 'preview_url']
    attributes = ['explicit', 'disc_number', 'available_markets', 'duration_ms']
    audio_features = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
                      'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
                      'time_signature']

    final_order = identity + context + links + attributes + audio_features
    return df_tracks[final_order]


#### Pipeline

In [15]:
# Pipeline with all cleaning steps
def pipeline_clean(df_tracks):
    """Run the cleaning steps on the raw track table and return the result.

    Order: add `duration_min`, drop playbacks and repeated songs, add
    `artists_feat`, then select and order the final columns.
    """
    cleaning_steps = (
        convert_duration_min,
        drop_playbacks_copy,
        disjoint_artists,
        reorder_columns,
    )
    for step in cleaning_steps:
        df_tracks = step(df_tracks)

    return df_tracks


In [16]:
# Pipeline with all requesting data steps
# Update: Gift_box - wrap up all dataframes into one
def pipeline_request():
    """Authenticate, download artists, albums and tracks, clean, and wrap up.

    Returns the single-row frame built by `gift_box` from the cleaned tracks,
    the artists and the albums.
    """
    headers, time_limit, now = get_auth()

    artists = get_artist()
    albums = get_artist_albums(artists)
    raw_tracks = get_album_tracks(albums, headers, time_limit, now)
    clean_tracks = pipeline_clean(raw_tracks)

    return gift_box(clean_tracks, artists, albums)

#### Authorization Counter

In [17]:
# Provide access token and request headers
def get_auth():
    """Request a new access token and build the request headers for the API.

    Posts the module-level `token_data` and `token_headers` to `token_url`.

    Returns
    -------
    tuple
        `(headers, time_limit, now)`: the Bearer-token headers, the moment
        the token expires, and the moment it was obtained.

    Raises
    ------
    requests.HTTPError
        If the token endpoint answers with an error status (for example,
        wrong or missing credentials).
    """
    r = requests.post(token_url, data=token_data, headers=token_headers)
    # Report a rejected request as such, not as a KeyError on 'access_token'.
    r.raise_for_status()
    token_response_data = r.json()

    access_token = token_response_data['access_token']
    expires_in = token_response_data['expires_in']  # added to `now` as seconds below

    now = datetime.datetime.now()
    time_limit = now + datetime.timedelta(seconds=expires_in)
    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json",
    }
    return headers, time_limit, now

In [18]:
# Perform validation of auth
# def validation():
#     try:
#         if now > time_limit - datetime.timedelta(seconds=120):
#             headers, time_limit, now = get_auth()
#             print('chegou dois minutos antes e eu autorizei')

#         else:
#             headers, time_limit, now = get_auth()
#             print('autorizei pela primeira vez')
#     except:
#         pass

#     return headers, time_limit, now


#### Extra functions

In [19]:
# Wrap up dataframes into one
def gift_box(df_tracks, df_artists, df_albums):
    """Pack the three tables into a single-row DataFrame.

    Each table is converted with `DataFrame.to_dict()` and stored as one cell
    in the columns `tracks`, `artists` and `albums` (in that order).
    """
    return pd.DataFrame({
        'tracks': [df_tracks.to_dict()],
        'artists': [df_artists.to_dict()],
        'albums': [df_albums.to_dict()],
    })


In [20]:
# Search song name in dataframe
def search_song(song, dataframe):
    """Return the rows whose `name` contains `song`, ignoring case and accents.

    Both the search text and the names are reduced to plain ASCII (NFKD
    decomposition with non-ASCII characters discarded) before comparing.
    The search text is matched literally, so characters such as `(`, `+`
    or `?` need no escaping. Rows with a missing name never match.

    Parameters
    ----------
    song : str
        Text to look for.
    dataframe : pandas.DataFrame
        Table with a `name` column.

    Returns
    -------
    pandas.DataFrame
        The matching rows of `dataframe`.
    """
    search = unicodedata.normalize('NFKD', song).encode('ASCII', 'ignore').decode()

    ascii_names = (
        dataframe['name']
        .str.normalize('NFKD')
        .str.encode('ascii', errors='ignore')
        .str.decode('utf-8')
    )
    matches = ascii_names.str.contains(search, case=False, regex=False, na=False)
    return dataframe[matches.to_numpy(dtype=bool)]


In [21]:
# Drop selected albums (--Better visualized in Google Colab--)
def drop_album(dataframe, artist, name=None):
    """Interactively drop one or more albums of `artist` from `dataframe`.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Track table with the columns `artist_name`, `album_name` and `name`.
        Rows are removed in place.
    artist : str
        Artist whose albums are considered.
    name : None, str, tuple or list, optional
        * None: prints the artist's albums, asks which one to drop, prints
          its tracks and hands over to `drop_album_aux`.
        * tuple or list of album names: asks for confirmation ('y'/'n') and
          drops all of them.
        * str: prints the album's tracks and hands over to `drop_album_aux`.

    Returns
    -------
    None

    Notes
    -----
    `output` and `drop_album_aux` are not defined in the visible part of the
    notebook. NOTE(review): `output` is presumably `google.colab.output`, and
    `drop_album_aux` presumably asks for confirmation and performs the drop —
    confirm both are available before using the None/str branches.
    """
    df = dataframe[dataframe['artist_name'] == artist]

    if name is None:
        # Let the user pick from the artist's albums.
        print(*sorted(set(df['album_name'].unique())), sep="\n")
        name = input("Choose an album. ")
        output.clear()
        print(df[df['album_name'] == name]['name'])
        drop_album_aux(name)

    elif isinstance(name, (tuple, list)):
        # Several albums at once: confirm, then drop each one.
        while True:
            choice = input(f"Drop {len(name)} albums? ")
            if choice == 'y':
                for album in name:
                    dataframe.drop(index=df[df['album_name'] == album].index, inplace=True)
                print(f"{len(name)} Albums droped.")
                break
            elif choice == 'n':
                output.clear()
                print('No alterations.')
                break
            else:
                print("Yes: 'y' or No: 'n'")

    else:
        # A single album name.
        print(df[df['album_name'] == name]['name'])
        drop_album_aux(name)


### New Requests

In [22]:
# Module-level request headers: get_artist_id, get_artist and get_artist_albums
# read this global instead of taking headers as a parameter.
headers = get_auth()[0]

In [236]:
# Reset the artist name -> ID registry before collecting a new set of artists.
artists_id = {}

In [23]:
# Register a single artist; the cell output is the updated registry.
get_artist_id('AIELLO')

{'AIELLO': '5bxbPQo0VkFgZKemF0YKb4'}

In [24]:
# Download and clean everything for the registered artists; the result is the
# single-row frame built by gift_box() (columns: tracks, artists, albums).
spotify_dataframe = pipeline_request()

<Albuns successfully collected.>
<Tracks successfully colected.>

Done!       


#### Easy List

In [22]:
# Artists to collect, one search term per entry.
mylistfacil = [
    'Paulo Cesar Baruk',
    'Eli Soares',
    'Davi Sacer',
    'Nívea Soares',
    'Fernandinho',
    'Kleber Lucas',
    'Fernanda Brum',
    'Aline Barros',
    'Diante do Trono',
    'Gabriela Rocha',
    'Ministério Vineyard',
    'Renascer Praise',
    'Adhemar De Campos',
    'Vencedores por Cristo',
]

In [23]:
# Register every artist of the easy list in the `artists_id` registry
# (one search request per artist).
for i in mylistfacil:
    get_artist_id(i)

In [None]:
# NOTE(review): `tracks_dataframe` is not defined anywhere in the visible
# notebook and this cell was never executed; it fails with NameError on a
# fresh kernel. Presumably a leftover from an earlier version — confirm and remove.
tracks_dataframe[0]

### Save

In [None]:
# spotify_dataframe.to_csv("/home/b4/Documents/VisualCodeStudio/spotify_project/data/spotify_dataframe.csv")

In [None]:
# Reload the saved result. `read_csv` lives in pandas, so it is called through `pd`.
# NOTE(review): CSV stores the dict cells of tracks/artists/albums as text, so
# they presumably need to be parsed back (e.g. ast.literal_eval) before being
# passed to pd.DataFrame — confirm against a real saved file.
spotify_dataframe = pd.read_csv("/home/b4/Documents/VisualCodeStudio/spotify_project/data/spotify_dataframe.csv")

In [30]:
# Rebuild the tracks table from the dict stored in the first row of the 'tracks' column.
pd.DataFrame(spotify_dataframe['tracks'][0])

Unnamed: 0,track_number,name,id,duration_min,track_pop,album_name,album_pop,artist_name,artist_pop,artists_feat,href,external_urls,uri,analysis_url,preview_url,explicit,disc_number,available_markets,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,1,FINO ALL'ALBA (ti sento),2Qwof7tnDgOWzso1jAbTg4,02:27,43,MERIDIONALE,42,AIELLO,49,"(AIELLO,)",https://api.spotify.com/v1/tracks/2Qwof7tnDgOW...,{'spotify': 'https://open.spotify.com/track/2Q...,spotify:track:2Qwof7tnDgOWzso1jAbTg4,https://api.spotify.com/v1/audio-analysis/2Qwo...,https://p.scdn.co/mp3-preview/f232dbef5bc19bf5...,False,1,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",147500,0.626,0.727,2,-5.817,1,0.256,0.0974,0.0,0.106,0.804,120.125,4
1,2,INTRO (dove vanno a finire),69uQKfPj6QYPxk9O7IRdpv,02:19,27,MERIDIONALE,42,AIELLO,49,"(AIELLO,)",https://api.spotify.com/v1/tracks/69uQKfPj6QYP...,{'spotify': 'https://open.spotify.com/track/69...,spotify:track:69uQKfPj6QYPxk9O7IRdpv,https://api.spotify.com/v1/audio-analysis/69uQ...,https://p.scdn.co/mp3-preview/27a452d90cd1f287...,False,1,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",139946,0.444,0.303,0,-13.023,0,0.15,0.839,2e-06,0.327,0.323,64.591,4
2,3,FARFALLE,00gHqVmXYmsPmedN1Gv3sl,02:42,38,MERIDIONALE,42,AIELLO,49,"(AIELLO,)",https://api.spotify.com/v1/tracks/00gHqVmXYmsP...,{'spotify': 'https://open.spotify.com/track/00...,spotify:track:00gHqVmXYmsPmedN1Gv3sl,https://api.spotify.com/v1/audio-analysis/00gH...,https://p.scdn.co/mp3-preview/951d4877b7431e75...,False,1,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",162986,0.681,0.655,9,-6.989,0,0.0779,0.377,0.0,0.176,0.285,87.026,4
3,4,CHE CANZONE SIAMO,7lqoDGFRGv88ofjONJrOLj,03:04,39,MERIDIONALE,42,AIELLO,49,"(AIELLO,)",https://api.spotify.com/v1/tracks/7lqoDGFRGv88...,{'spotify': 'https://open.spotify.com/track/7l...,spotify:track:7lqoDGFRGv88ofjONJrOLj,https://api.spotify.com/v1/audio-analysis/7lqo...,https://p.scdn.co/mp3-preview/7fef11a17086fb2c...,False,1,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",184304,0.671,0.403,0,-6.327,1,0.0394,0.233,0.0,0.484,0.298,127.886,4
4,5,CERTE LUNE,2eLYcBcxTBtqiyejPGlIMa,02:44,37,MERIDIONALE,42,AIELLO,49,"(AIELLO,)",https://api.spotify.com/v1/tracks/2eLYcBcxTBtq...,{'spotify': 'https://open.spotify.com/track/2e...,spotify:track:2eLYcBcxTBtqiyejPGlIMa,https://api.spotify.com/v1/audio-analysis/2eLY...,https://p.scdn.co/mp3-preview/7780228daa2a36ca...,False,1,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",164520,0.524,0.502,7,-10.185,1,0.306,0.696,0.0,0.0987,0.35,160.084,4
5,6,PER LA PRIMA VOLTA,16RzOthatYy60zyRtpqO8T,03:18,33,MERIDIONALE,42,AIELLO,49,"(AIELLO,)",https://api.spotify.com/v1/tracks/16RzOthatYy6...,{'spotify': 'https://open.spotify.com/track/16...,spotify:track:16RzOthatYy60zyRtpqO8T,https://api.spotify.com/v1/audio-analysis/16Rz...,https://p.scdn.co/mp3-preview/9490db6f3c87e3d8...,True,1,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",198119,0.763,0.535,0,-9.143,1,0.142,0.264,0.000134,0.366,0.409,119.952,4
6,7,VIENIMI (a ballare),5eyiXwK0oAnfFaXbmDBRTH,02:52,46,MERIDIONALE,42,AIELLO,49,"(AIELLO,)",https://api.spotify.com/v1/tracks/5eyiXwK0oAnf...,{'spotify': 'https://open.spotify.com/track/5e...,spotify:track:5eyiXwK0oAnfFaXbmDBRTH,https://api.spotify.com/v1/audio-analysis/5eyi...,https://p.scdn.co/mp3-preview/5346587b54fd6cf3...,False,1,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",172568,0.697,0.888,5,-5.009,0,0.378,0.254,0.0,0.295,0.582,103.938,4
7,8,SCOMPOSTO,7eBYgNaeG57uWC12RoblXL,02:47,35,MERIDIONALE,42,AIELLO,49,"(AIELLO,)",https://api.spotify.com/v1/tracks/7eBYgNaeG57u...,{'spotify': 'https://open.spotify.com/track/7e...,spotify:track:7eBYgNaeG57uWC12RoblXL,https://api.spotify.com/v1/audio-analysis/7eBY...,https://p.scdn.co/mp3-preview/7dc56f6583784441...,False,1,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",167305,0.681,0.541,0,-6.395,1,0.121,0.421,0.0,0.0973,0.516,135.721,4
8,9,LA LA LAND,5XypIfWeZVEi3HTIeFKCos,02:40,24,MERIDIONALE,42,AIELLO,49,"(AIELLO,)",https://api.spotify.com/v1/tracks/5XypIfWeZVEi...,{'spotify': 'https://open.spotify.com/track/5X...,spotify:track:5XypIfWeZVEi3HTIeFKCos,https://api.spotify.com/v1/audio-analysis/5Xyp...,https://p.scdn.co/mp3-preview/565f5eccb8e31564...,False,1,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",160574,0.744,0.698,7,-4.641,1,0.13,0.275,0.0,0.117,0.347,110.059,4
9,10,DI TE NIENTE,2sqnDDR17FdVPm1amblmod,03:02,34,MERIDIONALE,42,AIELLO,49,"(AIELLO, SVM)",https://api.spotify.com/v1/tracks/2sqnDDR17FdV...,{'spotify': 'https://open.spotify.com/track/2s...,spotify:track:2sqnDDR17FdVPm1amblmod,https://api.spotify.com/v1/audio-analysis/2sqn...,https://p.scdn.co/mp3-preview/ff2027e4d679af61...,False,1,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",182334,0.617,0.54,0,-7.258,1,0.0907,0.239,0.0,0.0859,0.539,119.883,4
