# Spotify artists data collection

We extract the network with the Spotipy API library. The attribute definitions, the choice of the starting artists of the graph, and the extraction procedure itself are described in the comments that open the main blocks of this notebook.

In [3]:
#install required API spotify library
!pip install spotipy

Collecting spotipy
  Downloading spotipy-2.23.0-py3-none-any.whl (29 kB)
Collecting redis>=3.5.3 (from spotipy)
  Downloading redis-5.0.1-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.3/250.3 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: redis, spotipy
Successfully installed redis-5.0.1 spotipy-2.23.0


In [4]:
import html
import os
import pickle
import urllib.request

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import spotipy
from bs4 import BeautifulSoup
from spotipy.oauth2 import SpotifyClientCredentials

In [None]:
#in order to employ the API, a client_id and a client_secret code are needed
#these are private for each user, so they are not reported in the notebook:
#they are read from the SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET environment variables
#when a variable is not set the value falls back to the empty string

client_id = os.environ.get('SPOTIPY_CLIENT_ID', '')
client_secret = os.environ.get('SPOTIPY_CLIENT_SECRET', '')

In [None]:
#build the credentials manager from client_id / client_secret and create the API client `sp`
#`sp` is the object used by the following cells for every Spotify request
credmanager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=credmanager)

In [None]:
#we define a function that given a spotify artist object creates a dictionary with relevant information regarding the artist

def artist_features(spotify_search_result):
    """Build the attribute dictionary of one artist from a Spotify artist object.

    Parameters
    ----------
    spotify_search_result : dict
        One artist entry of a Spotify response. The keys read are 'name', 'id',
        'popularity', 'genres' and 'followers' (whose 'total' entry is used).

    Returns
    -------
    dict
        Keys 'artist_name', 'artist_id', 'artist_popularity', 'artist_first_genre'
        and 'artist_n_followers'. Despite its name, 'artist_first_genre' holds the
        whole genre list followed by the sentinel 'genre_not_available', so that
        element [0] exists even for an artist without genres.

    Notes
    -----
    A key that is missing *or* set to None is replaced by its default, so an
    incomplete artist object no longer raises. The sentinel is appended exactly
    once, also when 'genres' is missing.
    """
    def _value_or(key, default):
        #dict.get only covers a missing key; a key that is present with value None needs the explicit check
        value = spotify_search_result.get(key)
        return default if value is None else value

    genres = list(_value_or('genres', []))  #copy: the input list is never modified
    n_followers = _value_or('followers', {}).get('total')

    result = {
        'artist_name': _value_or('name', 'artist_name_not_available'),
        'artist_id': _value_or('id', 'artist_id_not_available'),
        'artist_popularity': _value_or('popularity', 0),
        'artist_first_genre': genres + ['genre_not_available'],
        'artist_n_followers': 0 if n_followers is None else n_followers,
    }
    return result

In [None]:
#example query: search the artists matching 'Drake' and keep the first item of the result
#(the [0] index raises IndexError when the search returns no item)
drake_search = sp.search('Drake', type='artist')['artists']['items'][0]

In [None]:
#convert the raw search item into the attribute dictionary that is later attached to the graph nodes, and display it
drake_features = artist_features(drake_search)
drake_features

{'artist_name': 'Drake',
 'artist_id': '3TVXtAsR1Inumwj472S9r4',
 'artist_popularity': 95,
 'artist_first_genre': ['canadian hip hop',
  'canadian pop',
  'hip hop',
  'pop rap',
  'rap',
  'genre_not_available'],
 'artist_n_followers': 81801944}

In [None]:
#artist_related_artists allows us to access the top related artists associated with a single artist_id
#the artist objects are listed under the 'artists' key of the response
sp.artist_related_artists(drake_features['artist_id'])

{'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/1RyvyyTE3xzB2ZywiAwp0i'},
   'followers': {'href': None, 'total': 15247690},
   'genres': ['atl hip hop', 'hip hop', 'rap', 'southern hip hop', 'trap'],
   'href': 'https://api.spotify.com/v1/artists/1RyvyyTE3xzB2ZywiAwp0i',
   'id': '1RyvyyTE3xzB2ZywiAwp0i',
   'images': [{'height': 640,
     'url': 'https://i.scdn.co/image/ab6761610000e5eb24e41f491b129093a6fee383',
     'width': 640},
    {'height': 320,
     'url': 'https://i.scdn.co/image/ab6761610000517424e41f491b129093a6fee383',
     'width': 320},
    {'height': 160,
     'url': 'https://i.scdn.co/image/ab6761610000f17824e41f491b129093a6fee383',
     'width': 160}],
   'name': 'Future',
   'popularity': 87,
   'type': 'artist',
   'uri': 'spotify:artist:1RyvyyTE3xzB2ZywiAwp0i'},
  {'external_urls': {'spotify': 'https://open.spotify.com/artist/1URnnhqYAYcrqrcwql10ft'},
   'followers': {'href': None, 'total': 15474157},
   'genres': ['atl hip hop', 'hip ho

In [None]:
#use https://kworb.net/spotify/artists.html as reference to find the top 3000 most streamed artists
#the with block closes the connection even when reading the page raises
with urllib.request.urlopen("https://kworb.net/spotify/artists.html") as fp:
    mybytes = fp.read()

#the raw bytes are decoded as UTF-8 into the HTML string parsed in the next cells
mystr = mybytes.decode("utf8")


In [None]:
def remove_bound(string):
  """Return the text that follows the opening tag of an HTML element.

  The argument is converted with ``str``; the piece between its first and its
  second '>' is taken and then cut at the first '<'. For ``<a href="x">Drake</a>``
  this is ``'Drake'``. The text is returned as it is written in the markup: no
  character reference is decoded here.

  Raises IndexError when the converted argument contains no '>'.
  """
  #maxsplit=2 is enough: only the piece between the first and the second '>' is needed
  after_opening_tag = str(string).split('>', 2)[1]
  #keep what precedes the first '<', i.e. the start of the next tag
  return after_opening_tag.split('<', 1)[0]

In [None]:
#we exploit BeautifulSoup in order to crawl the artists' names
#these will be used as a starting point for building the graph
#(BeautifulSoup is imported in the import cell at the top of the notebook)

#number of leading <a> elements that are discarded before the artist names start
#NOTE(review): presumably these are the navigation links of the page - confirm against the live page
N_LEADING_LINKS = 14

#the parser is named explicitly: without it BeautifulSoup uses whichever parser is installed
#and warns about it, so the same page could be parsed differently on another machine
soup = BeautifulSoup(mystr, 'html.parser')
columns = soup.find_all('th')
artists = soup.find_all('a')

#text of every table header cell
data_info = [remove_bound(col) for col in columns]

#text of every link, without the leading links that are not artists
artist_names = [remove_bound(artist) for artist in artists][N_LEADING_LINKS:]


In [None]:
#sanity check: number of crawled names and the first ten of them
(len(artist_names), artist_names[:10])

(3000,
 ['Drake',
  'Bad Bunny',
  'Taylor Swift',
  'The Weeknd',
  'Ed Sheeran',
  'Justin Bieber',
  'Eminem',
  'Ariana Grande',
  'J Balvin',
  'Travis Scott'])

In [None]:
#the crawled names are still written as HTML text (e.g. '&amp;' in place of '&')
#html.unescape decodes every character reference, not only '&amp;'
artists_name_list = [html.unescape(name) for name in artist_names]
print('There are', len(artists_name_list), 'artists in the initial list.')

There are 3000 artists in the initial list.


In [None]:
#we set a popularity_threshold for the artists to add to the graph in order to avoid too "noisy" elements (artists with very small popularity/relevance)
#an artist is added only when its 'artist_popularity' is greater than or equal to this value
popularity_threshold = 20

In [None]:
#empty undirected graph: the nodes will be the artists, the edges will link an artist to its related artists
G = nx.Graph()

In [None]:
#we add the main artists in an empty graph along with their attributes
for name in artists_name_list:
    items = sp.search(name, type='artist')['artists']['items']

    #a search without results would raise IndexError on items[0] and abort the whole loop:
    #the name is reported and skipped instead
    if not items:
        print('No Spotify artist found for', repr(name), '- skipped.')
        continue

    #the first search result is taken as the artist that matches the name
    search = items[0]
    curr_artist = artist_features(search)

    #nodes are keyed by the artist name
    #related_found=False marks a node whose related artists have not been fetched yet
    if curr_artist['artist_popularity'] >= popularity_threshold:
        G.add_node(curr_artist['artist_name'], **curr_artist, related_found=False)

In [None]:
#the while loop builds the final graph by repeated sweeps over its nodes:
#in each sweep, every node whose related artists have not been fetched yet is expanded,
#i.e. its related artists are requested with spotipy, added as nodes (if new) and linked to it.

#the process ends when a sweep adds no node, or when the graph has more than 60000 nodes at the end of a sweep
#(the size is checked only once per sweep, so the final graph can be larger than 60000 nodes)
#this avoids a too large graph with too many "less relevant" artists

stop = False

while not stop:
    size_before_sweep = len(G) # number of nodes at the beginning of the sweep

    for node in list(G): # snapshot of the nodes: G grows while we iterate
        node_attrs = G.nodes[node]
        if node_attrs['related_found']: # already expanded in a previous sweep
            continue

        related_raw = sp.artist_related_artists(node_attrs['artist_id'])['artists']
        relateds = [artist_features(r) for r in related_raw]
        node_attrs['related_found'] = True # the related artists of this node are now known

        for rdict in relateds:
            if rdict['artist_popularity'] < popularity_threshold: # not relevant enough: no node and no edge
                continue

            name = rdict['artist_name']
            if name not in G:
                G.add_node(name, **rdict, related_found=False) # new node: its own related artists are still unknown

            G.add_edge(node, name) # link the artist with its related artist

    # stopping condition to contain the extraction
    stop = len(G) == size_before_sweep or len(G) > 60000
    if stop:
        print('Done.')

In [None]:
#every node gets an extra attribute 'main_genre': the first element of its 'artist_first_genre' list
#G.nodes(data=True) yields each node together with its attribute dictionary, which is updated in place
for node, node_attrs in G.nodes(data=True):
  node_attrs['main_genre'] = node_attrs['artist_first_genre'][0]

In [None]:
# save graph object to file
# the with block flushes and closes the file, which a bare open() inside the call never did explicitly
with open('artists_graph_20.pickle', 'wb') as graph_file:
    pickle.dump(G, graph_file)

In [5]:
#G_raw keeps only the structure of G: its edges and their end nodes, without any attribute
#(a node of G that has no edge does not appear in G_raw)
G_raw = nx.Graph()
G_raw.add_edges_from(G.edges())

#export the attribute-free graph in GEXF format
nx.write_gexf(G_raw,"artists_graph.gexf")