In [None]:
import networkx as nx
import pandas as pd
import numpy as np
from collections import defaultdict
from tqdm import trange
import pickle

with open("C:/Users/acast/Desktop/Progetto sna/data collection/grafo_finale.pickle", 'rb') as f:
    G = pickle.load(f)
print(G)  
print("N nodi:", G.number_of_nodes(), " | N archi:", G.number_of_edges())

import random
for n, d in random.sample(list(G.nodes(data=True)), 3):
    print(n, d)

Graph with 11083 nodes and 14497 edges
N nodi: 11083  | N archi: 14497
3rVSQrIoRn6F2AdRWxsKa8 {'artist_name': 'Number48', 'artist_id': '3rVSQrIoRn6F2AdRWxsKa8', 'artist_popularity': 21, 'artist_first_genre': 'hyperpop', 'artist_n_followers': 760}
66TrUkUZ3RM29dqeDQRgyA {'artist_name': 'Ella Eyre', 'artist_id': '66TrUkUZ3RM29dqeDQRgyA', 'artist_popularity': 68, 'artist_first_genre': 'genre_not_available', 'artist_n_followers': 865121}
0oOet2f43PA68X5RxKobEy {'artist_name': 'Shreya Ghoshal', 'artist_id': '0oOet2f43PA68X5RxKobEy', 'artist_popularity': 88, 'artist_first_genre': 'bollywood', 'artist_n_followers': 31666351}


In [None]:
def to_macro_genre(g: str) -> str:
    if not g or g == "genre_not_available":
        return "unknown"
    s = g.lower().strip()

    # HIP HOP / RAP
    if any(k in s for k in ["hip hop", "hip-hop", "rap", "drill", "grime", "phonk", "boom bap"]):
        return "hip hop / rap"
    if "trap" in s:
        if any(k in s for k in ["edm", "electro", "electron", "house", "bass"]):
            return "electronic / dance"
        if any(k in s for k in ["latin", "latino", "reggaeton"]):
            return "latin / urbano"
        return "hip hop / rap"

    # ELECTRONIC / DANCE
    if any(k in s for k in [
        "edm","electronic","electronica","house","techno","trance","dubstep","dnb","drum and bass","drum & bass",
        "electro","dancefloor","hardstyle","future bass","progressive house","deep house","big room","dance"
    ]):
        return "electronic / dance"

    # LATIN / URBANO
    if any(k in s for k in [
        "latin","latino","urbano","reggaeton","corrido","banda","mariachi","bachata","cumbia","salsa","merengue",
        "norteño","regional mexican"
    ]):
        return "latin / urbano"

    # R&B / SOUL
    if any(k in s for k in ["r&b", "rnb", "soul", "neo-soul", "neosoul"]):
        return "r&b / soul"

    # K/J-POP
    if "k-pop" in s or "kpop" in s or "j-pop" in s or "jpop" in s:
        return "k-pop / j-pop"

    # POP
    if "pop" in s:
        return "pop"

    # ROCK
    if any(k in s for k in ["rock", "punk", "emo", "grunge", "psych rock", "psychedelic rock"]):
        return "rock"

    # METAL
    if any(k in s for k in ["metal", "metalcore", "deathcore", "nu metal", "thrash", "black metal", "doom"]):
        return "metal"

    # INDIE / ALTERNATIVE
    if any(k in s for k in ["indie", "alternative", "shoegaze", "lo-fi", "lofi", "bedroom"]):
        return "indie / alternative"

    # AFRO
    if any(k in s for k in ["afrobeats", "afrobeat", "afropop", "amapiano", "naija", "bongo"]):
        return "afro"

    # REGGAE / DANCEHALL
    if any(k in s for k in ["reggae", "dancehall", "ragga"]):
        return "reggae / dancehall"

    # COUNTRY / AMERICANA
    if any(k in s for k in ["country", "americana", "bluegrass"]):
        return "country / americana"

    # JAZZ
    if "jazz" in s:
        return "jazz"

    # CLASSICAL
    if any(k in s for k in ["classical", "baroque", "orchestra", "symphony", "opera"]):
        return "classical"

    # FOLK / CANTAUTORE
    if any(k in s for k in ["folk", "cantautor", "singer-songwriter", "singer songwriter"]):
        return "folk / cantautore"

    # BLUES
    if "blues" in s:
        return "blues"

    # SOUNDTRACK / OST
    if any(k in s for k in ["soundtrack", "score", "ost", "film score", "movie score"]):
        return "soundtrack / ost"

    return "other"

for n, d in G.nodes(data=True):
    g = d.get("artist_first_genre", None)
    d["macro_genre"] = to_macro_genre(g)

macro_counts = pd.Series([d["macro_genre"] for _,d in G.nodes(data=True)]).value_counts()
macro_counts.to_frame("count")


Unnamed: 0,count
unknown,3861
other,1999
hip hop / rap,1552
electronic / dance,844
latin / urbano,650
pop,634
rock,345
classical,184
r&b / soul,182
country / americana,177


In [6]:
r = nx.attribute_assortativity_coefficient(G, "macro_genre")
print("Assortatività (macro_genre):", round(r, 4))


Assortatività (macro_genre): 0.3842


In [None]:
macro = {n: d.get("macro_genre", "unknown") for n,d in G.nodes(data=True)}
pairs = []
for u, v in G.edges():
    a, b = macro[u], macro[v]
    if a <= b: pairs.append((a,b))
    else:      pairs.append((b,a))

df_pairs = pd.DataFrame(pairs, columns=["g1","g2"])
mat = (df_pairs
       .value_counts()
       .rename("edges")
       .reset_index()
       .pivot(index="g1", columns="g2", values="edges")
       .fillna(0)
       .astype(int))

mat = mat.reindex(index=sorted(set(macro.values())), columns=sorted(set(macro.values()))).fillna(0).astype(int)
mat = mat + mat.T - np.diag(np.diag(mat.values))

display(mat)

mat.to_csv("matrix_macro_genre_edges.csv")
print("Salvata matrix_macro_genre_edges.csv")


g2,afro,blues,classical,country / americana,electronic / dance,folk / cantautore,hip hop / rap,indie / alternative,jazz,k-pop / j-pop,latin / urbano,metal,other,pop,r&b / soul,reggae / dancehall,rock,soundtrack / ost,unknown
g1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
afro,26,0,0,0,4,0,20,0,1,0,5,0,11,4,10,8,0,0,34
blues,0,1,0,2,3,0,0,0,2,0,0,1,0,0,1,0,6,0,6
classical,0,0,97,3,0,2,0,1,3,1,3,5,62,5,1,1,2,47,40
country / americana,0,2,3,124,2,5,3,1,3,0,4,1,43,3,0,1,24,1,100
electronic / dance,4,3,0,2,414,3,84,7,3,7,40,15,207,93,13,17,19,0,869
folk / cantautore,0,0,2,5,3,1,8,1,0,0,9,0,8,5,0,0,5,0,25
hip hop / rap,20,0,0,3,84,8,1360,55,4,25,238,20,369,147,132,11,24,3,1110
indie / alternative,0,0,1,1,7,1,55,39,0,2,8,1,40,30,1,1,5,0,87
jazz,1,2,3,3,3,0,4,0,13,0,1,1,35,4,4,0,2,2,29
k-pop / j-pop,0,0,1,0,7,0,25,2,0,50,0,0,31,6,3,0,2,1,96


Salvata matrix_macro_genre_edges.csv


In [None]:
use_nx_louvain = hasattr(nx.algorithms.community, "louvain_communities")

if use_nx_louvain:
    from networkx.algorithms.community import louvain_communities
    comms = louvain_communities(G, weight=None, seed=42)  
    from networkx.algorithms.community.quality import modularity
    Q = modularity(G, comms, weight=None)
else:
    import community as community_louvain
    part = community_louvain.best_partition(G)  
    comm_map = defaultdict(set)
    for n, c in part.items():
        comm_map[c].add(n)
    comms = list(comm_map.values())
    from networkx.algorithms.community.quality import modularity
    Q = modularity(G, comms, weight=None)

print("N community:", len(comms))
print("Modularità Q:", round(Q, 4))

node2comm = {}
for cid, C in enumerate(comms):
    for n in C:
        node2comm[n] = cid


N community: 592
Modularità Q: 0.8436


In [None]:
def participation_coefficient(G, node, node2comm):
    ki = G.degree(node)  
    if ki == 0: 
        return 0.0
    sums = defaultdict(int)
    for nbr in G[node]:
        sums[node2comm[nbr]] += 1
    return 1.0 - sum((w/ki)**2 for w in sums.values())

P = {n: participation_coefficient(G, n, node2comm) for n in G.nodes()}
dfP = (pd.DataFrame({
        "artist_id": list(P.keys()),
        "P": list(P.values()),
        "artist_name": [G.nodes[n].get("artist_name","") for n in G.nodes()],
        "macro_genre": [G.nodes[n].get("macro_genre","") for n in G.nodes()],
        "popularity": [G.nodes[n].get("artist_popularity", np.nan) for n in G.nodes()]
      })
      .sort_values("P", ascending=False))

display(dfP.head(20))  
dfP.to_csv("top_bridge_participation.csv", index=False)
print("Salvato top_bridge_participation.csv")


Unnamed: 0,artist_id,P,artist_name,macro_genre,popularity
71,6eUKZXaKkcviH0Ku9w2n3V,0.835556,Ed Sheeran,pop,92
1001,6GEykX11lQqp92UVOQQCC7,0.816327,DJ Premier,hip hop / rap,68
228,2ExGrw6XpbtUAJHTLtUXUD,0.8,Stefflon Don,electronic / dance,68
1453,0haZhu4fFKt0Ag94kZDiz2,0.8,Sofía Reyes,latin / urbano,67
1063,7xTKLpo7UCzXSnlH7fOIoM,0.8,Redman,hip hop / rap,67
33,7FNnA9vBm6EKceENgCGRMb,0.785714,Anitta,pop,82
197,1Cs0zKBU1kc0i8ypK3B9ai,0.785,David Guetta,electronic / dance,92
337,0IF46mUS8NXjgHabxk2MCM,0.777778,Kelis,unknown,66
2125,3oSJ7TBVCWMDMiYjXNiCKE,0.765432,Kane Brown,country / americana,76
1482,1Xylc3o4UrD53lo9CvFvVg,0.764444,Zara Larsson,unknown,79


Salvato top_bridge_participation.csv


In [None]:
def assortativity_macro(G):
    return nx.attribute_assortativity_coefficient(G, "macro_genre")

r_obs = assortativity_macro(G)

deg_seq = [d for _, d in G.degree()]
m = G.number_of_edges()

Rs = []
for _ in trange(50, desc="Null models"):
    CM = nx.configuration_model(deg_seq, seed=None)
    H = nx.Graph(CM)
    H.remove_edges_from(nx.selfloop_edges(H))
    for n in H.nodes():
        pass

def randomize_by_double_edge_swap(G, n_swaps_factor=5, seed=None):
    H = G.copy()
    n_swaps = G.number_of_edges() * n_swaps_factor
    try:
        nx.double_edge_swap(H, nswap=n_swaps, max_tries=n_swaps*10, seed=seed)
    except Exception as e:
        pass
    return H

Rs = []
for i in trange(50, desc="Null models (double-edge-swap)"):
    H = randomize_by_double_edge_swap(G, n_swaps_factor=5, seed=i)
    rH = assortativity_macro(H)
    Rs.append(rH)

Rs = np.array(Rs)
mu, sigma = Rs.mean(), Rs.std(ddof=1) if len(Rs) > 1 else (0, 1)
z = (r_obs - mu) / sigma if sigma > 0 else np.inf

print(f"Assortatività osservata r = {r_obs:.4f}")
print(f"Media null μ = {mu:.4f}  |  σ = {sigma:.4f}")
print(f"z-score = {z:.2f}")

pd.Series(Rs).to_csv("assortativita_null_distribution.csv", index=False)


Null models: 100%|██████████| 50/50 [00:09<00:00,  5.28it/s]
Null models (double-edge-swap): 100%|██████████| 50/50 [00:40<00:00,  1.24it/s]

Assortatività osservata r = 0.3842
Media null μ = -0.0018  |  σ = 0.0036
z-score = 106.53





In [11]:
print("=== RISULTATI CHIAVE (bozza) ===")
print("Assortatività per macro_genere:", round(r, 4))
print("Modularità (Louvain):", round(Q, 4))
print("Top 10 bridge (participation):")
print(dfP.head(10)[["artist_name","macro_genre","P","popularity"]].to_string(index=False))

print("\nBaseline assortatività:")
print(f"r_obs={r_obs:.4f} | mu_null={mu:.4f} | sigma_null={sigma:.4f} | z={z:.2f}")


=== RISULTATI CHIAVE (bozza) ===
Assortatività per macro_genere: 0.3842
Modularità (Louvain): 0.8436
Top 10 bridge (participation):
 artist_name         macro_genre        P  popularity
  Ed Sheeran                 pop 0.835556          92
  DJ Premier       hip hop / rap 0.816327          68
Stefflon Don  electronic / dance 0.800000          68
 Sofía Reyes      latin / urbano 0.800000          67
      Redman       hip hop / rap 0.800000          67
      Anitta                 pop 0.785714          82
David Guetta  electronic / dance 0.785000          92
       Kelis             unknown 0.777778          66
  Kane Brown country / americana 0.765432          76
Zara Larsson             unknown 0.764444          79

Baseline assortatività:
r_obs=0.3842 | mu_null=-0.0018 | sigma_null=0.0036 | z=106.53
