In [1]:
import spotipy
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import pickle

In [2]:
client_id = '38913c78c0a14880b6bea949a078cd53'
client_secret = '2d09deca815d44d7b8feb4f924f1e9bc'

In [3]:
from spotipy.oauth2 import SpotifyClientCredentials

In [4]:
credmanager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(
    client_credentials_manager=credmanager,
    requests_timeout=20  # secondi
)

In [5]:
def artist_features(spotify_search_result):
    result = {
        'artist_name': spotify_search_result.get('name', 'artist_name_not_available'),
        'artist_id': spotify_search_result.get('id', 'artist_id_not_available'),
        'artist_popularity': spotify_search_result.get('popularity', 0),
        'artist_first_genre': (spotify_search_result.get('genres', ['genre_not_available']) + ['genre_not_available'])[0],
        'artist_n_followers': spotify_search_result.get('followers', {}).get('total', 0),
    }
    return result

In [6]:
drake_search = sp.search('Drake', type='artist')['artists']['items'][0]

In [7]:
drake_features = artist_features(drake_search)
drake_features

{'artist_name': 'Drake',
 'artist_id': '3TVXtAsR1Inumwj472S9r4',
 'artist_popularity': 99,
 'artist_first_genre': 'rap',
 'artist_n_followers': 101439447}

In [8]:
from bs4 import BeautifulSoup
import urllib.request

In [9]:
#use as reference https://kworb.net/spotify/artists.html to find the top 3000 top stremed artists
fp = urllib.request.urlopen("https://kworb.net/spotify/artists.html")
mybytes = fp.read()
mystr = mybytes.decode("utf8")
fp.close()


In [10]:
def remove_bound(string):
  string = str(string)
  string = string.split('>')[1]
  string = string.split('<')[0]
  return string

In [11]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(mystr, 'html.parser')
artists_html = soup.find_all('a')
artist_names = [remove_bound(a) for a in artists_html][14:]  # escludi intestazioni e link inutili
artist_names = [name.replace('&amp;', '&') for name in artist_names]  # decode HTML entity

print(f"Artisti trovati: {len(artist_names)}")
print("Primi 10:", artist_names[:10])

Artisti trovati: 3000
Primi 10: ['Drake', 'Taylor Swift', 'Bad Bunny', 'The Weeknd', 'Justin Bieber', 'Ariana Grande', 'Eminem', 'Ed Sheeran', 'Travis Scott', 'Kanye West']


In [12]:
(len(artist_names), artist_names[:10])

(3000,
 ['Drake',
  'Taylor Swift',
  'Bad Bunny',
  'The Weeknd',
  'Justin Bieber',
  'Ariana Grande',
  'Eminem',
  'Ed Sheeran',
  'Travis Scott',
  'Kanye West'])

In [13]:
artists_name_list = artist_names
print('There are', len(artists_name_list), 'artists in the initial list.')

There are 3000 artists in the initial list.


In [14]:
#we set a popularity_threshold for the artists to add in the graph in order to avoid too "noisy" elements (artists with very small population/relevance)
popularity_threshold = 20

In [15]:
G = nx.Graph()

In [16]:
from itertools import combinations

def add_weighted_edges_from_artists(G, artist_ids, pairwise=True, anchor=None):
    """
    Aggiunge/incrementa archi pesati in base alle co-presenze su UNA traccia.

    - pairwise=True  -> collega TUTTE le coppie (grafo di co-autorship "completo" per la traccia)
    - pairwise=False -> collega solo l'anchor con gli altri (stella centrata su 'anchor')

    anchor: id artista centrale (se pairwise=False). Se None e pairwise=False, non fa nulla.
    """
    # togli duplicati intra-traccia
    ids = list(dict.fromkeys(artist_ids))
    if len(ids) < 2:
        return

    if pairwise:
        pairs = combinations(sorted(ids), 2)
    else:
        if not anchor or anchor not in ids:
            return
        pairs = ((anchor, b) for b in ids if b != anchor)

    for u, v in pairs:
        if G.has_edge(u, v):
            G[u][v]['weight'] = G[u][v].get('weight', 1) + 1
        else:
            G.add_edge(u, v, weight=1)

In [17]:
from collections import Counter

def get_collaborators(artist_id, max_albums=5):
    """
    Ritorna un Counter: collab_id -> #collaborazioni con artist_id
    Conta una collaborazione ogni volta che artist_id e collab_id co-compaiono
    tra gli 'artists' della stessa traccia.
    """
    counts = Counter()
    seen_album_ids = set()
    got = 0

    # Considera album, singoli e (opzionale) apparizioni
    album_types = ["album", "single", "appears_on"]

    for a_type in album_types:
        offset = 0
        while True:
            res = sp.artist_albums(artist_id=artist_id, album_type=a_type, limit=50, offset=offset)
            items = res.get("items", [])
            if not items:
                break

            for album in items:
                aid = album.get("id")
                if not aid or aid in seen_album_ids:
                    continue
                seen_album_ids.add(aid)

                got += 1
                if max_albums is not None and got > max_albums:
                    break

                # scorro le tracce
                tr_off = 0
                while True:
                    tr_res = sp.album_tracks(album_id=aid, limit=50, offset=tr_off)
                    tracks = tr_res.get("items", [])
                    if not tracks:
                        break

                    for tr in tracks:
                        tr_artists = [a.get("id") for a in tr.get("artists", []) if a.get("id")]
                        if artist_id in tr_artists:
                            for cid in tr_artists:
                                if cid and cid != artist_id:
                                    counts[cid] += 1

                    tr_off += 50

            if (max_albums is not None and got > max_albums) or len(items) < 50:
                break
            offset += 50

    return counts


In [18]:
with open("grafo_parziale.pkl", "rb") as f:
    G = pickle.load(f)

In [19]:
start_index = 2751  # o il numero che ricordi (es. 620, 640…)

In [20]:
import time

In [None]:
# Continua dal punto in cui eri rimasto
for idx, name in enumerate(artists_name_list[start_index:], start=start_index):
    print(f"[{idx+1}/{len(artists_name_list)}] Analizzo artista: {name}")
    try:
        search = sp.search(q=name, type='artist', limit=1)['artists']['items']
        if not search:
            continue
        artist = search[0]
        if artist['popularity'] < popularity_threshold:
            continue

        artist_data = artist_features(artist)
        artist_id = artist_data['artist_id']

        if artist_id in G:
            continue  # Già elaborato

        G.add_node(artist_id, **artist_data)

        # Trova collaboratori: Counter(collab_id -> count)
        collab_counts = get_collaborators(artist_id)

        for collab_id, count in collab_counts.items():
            # crea nodo collab se manca (rispettando la soglia)
            if not G.has_node(collab_id):
                try:
                    collab_data = sp.artist(collab_id)
                    collab_feat = artist_features(collab_data)
                    if collab_feat['artist_popularity'] >= popularity_threshold:
                        G.add_node(collab_id, **collab_feat)
                    else:
                        continue  # sotto soglia → salto arco
                except:
                    continue

            # aggiorna/incrementa il peso dell'arco
            if G.has_edge(artist_id, collab_id):
                G[artist_id][collab_id]['weight'] = G[artist_id][collab_id].get('weight', 0) + int(count)
            else:
                G.add_edge(artist_id, collab_id, weight=int(count))

        # Salva ogni 10 artisti
        if idx % 10 == 0:
            with open("grafo_parziale.pkl", "wb") as f:
                pickle.dump(G, f)
            print(f"💾 Grafo salvato a {idx} nodi.")

        time.sleep(1)

    except spotipy.exceptions.SpotifyException as se:
        if se.http_status == 429:
            retry_after = int(se.headers.get("Retry-After", 60))
            print(f"⏳ Rate limit raggiunto. Attendo {retry_after} secondi...")
            time.sleep(retry_after + 5)
            continue
        else:
            print(f"Errore Spotify per {name}: {se}")
            continue

    except Exception as e:
        print(f"Errore generico per {name}: {e}")
        continue

# Salvataggio finale
with open("grafo_finale_pesato.pkl", "wb") as f:
    pickle.dump(G, f)

print("✅ Raccolta completata. Grafo salvato.")


[2392/3000] Analizzo artista: Damas Gratis
[2393/3000] Analizzo artista: Vicentico
[2394/3000] Analizzo artista: Mandy Moore
[2395/3000] Analizzo artista: Chris MC
[2396/3000] Analizzo artista: Bishop Briggs
[2397/3000] Analizzo artista: TREASURE
[2398/3000] Analizzo artista: Frank Miami
[2399/3000] Analizzo artista: Corinne Bailey Rae
[2400/3000] Analizzo artista: Tom Rosenthal
[2401/3000] Analizzo artista: SiR
[2402/3000] Analizzo artista: KeBlack
[2403/3000] Analizzo artista: Chet Baker
[2404/3000] Analizzo artista: R. D. Burman
[2405/3000] Analizzo artista: Szpaku
[2406/3000] Analizzo artista: Jeet Gannguli
[2407/3000] Analizzo artista: Olexesh
[2408/3000] Analizzo artista: Maelo Ruiz
[2409/3000] Analizzo artista: Jay Park
[2410/3000] Analizzo artista: Herencia de Patrones
[2411/3000] Analizzo artista: Dhruv
💾 Grafo salvato a 2410 nodi.
[2412/3000] Analizzo artista: Kailash Kher
[2413/3000] Analizzo artista: DJ Scheme
[2414/3000] Analizzo artista: SUGA
[2415/3000] Analizzo artista:



💾 Grafo salvato a 2710 nodi.
[2712/3000] Analizzo artista: Ari Lennox
[2713/3000] Analizzo artista: Danny Elfman




[2714/3000] Analizzo artista: El Trono de Mexico
[2715/3000] Analizzo artista: A Perfect Circle




[2716/3000] Analizzo artista: Hoodie Allen
[2717/3000] Analizzo artista: Foxes




[2718/3000] Analizzo artista: Tai Verdes




[2719/3000] Analizzo artista: Ben Harper
[2720/3000] Analizzo artista: Gurlez Akhtar
[2721/3000] Analizzo artista: Naresh Iyer
[2722/3000] Analizzo artista: Parov Stelar




[2723/3000] Analizzo artista: Sam Fischer
[2724/3000] Analizzo artista: BM




[2725/3000] Analizzo artista: Shaarib Toshi
[2726/3000] Analizzo artista: 4 Non Blondes




[2727/3000] Analizzo artista: Dj Guuga
[2728/3000] Analizzo artista: ACRAZE




[2729/3000] Analizzo artista: Dizzee Rascal
[2730/3000] Analizzo artista: Saint Motel
[2731/3000] Analizzo artista: Landon Cube




💾 Grafo salvato a 2730 nodi.
[2732/3000] Analizzo artista: The Cinematic Orchestra




[2733/3000] Analizzo artista: Teoman
[2734/3000] Analizzo artista: Dubdogz




[2735/3000] Analizzo artista: Elodie
[2736/3000] Analizzo artista: KATSEYE




[2737/3000] Analizzo artista: Rebelution




[2738/3000] Analizzo artista: Leiva
[2739/3000] Analizzo artista: Jazmine Sullivan




[2740/3000] Analizzo artista: Mitchell Tenpenny
[2741/3000] Analizzo artista: La T y La M
[2742/3000] Analizzo artista: St. Vincent
[2743/3000] Analizzo artista: Against The Current
[2744/3000] Analizzo artista: Vybz Kartel
[2745/3000] Analizzo artista: Young M.A




[2746/3000] Analizzo artista: Pia Mia
[2747/3000] Analizzo artista: José Alfredo Jimenez
[2748/3000] Analizzo artista: HIRAIDAI




[2749/3000] Analizzo artista: Laylow
[2750/3000] Analizzo artista: Serdar Ortaç
[2751/3000] Analizzo artista: Harrdy Sandhu




💾 Grafo salvato a 2750 nodi.
[2752/3000] Analizzo artista: FAST BOY
[2753/3000] Analizzo artista: Timbiriche




[2754/3000] Analizzo artista: Wolfine
[2755/3000] Analizzo artista: Rizky Febian




[2756/3000] Analizzo artista: Loredana
[2757/3000] Analizzo artista: Sawano Hiroyuki




[2758/3000] Analizzo artista: Veysel
[2759/3000] Analizzo artista: Alabama Shakes
[2760/3000] Analizzo artista: Marcos Witt
[2761/3000] Analizzo artista: Hades66
[2762/3000] Analizzo artista: Tash Sultana
[2763/3000] Analizzo artista: Paky
[2764/3000] Analizzo artista: Caloncho




[2765/3000] Analizzo artista: Chaka Khan
[2766/3000] Analizzo artista: Lil Jon & The East Side Boyz




[2767/3000] Analizzo artista: Glenn Fredly


