# Artist Collaboration Network Analysis (2017–2019)

In [None]:

import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import community as community_louvain
import numpy as np

# File paths and column names by year
files = {
    2017: '/content/global-artist_network-2017.csv',
    2018: '/content/global-artist_network-2018.csv',
    2019: '/content/global-artist_network-2019.csv'
}

# Store graphs, partitions, and degrees by year
graphs = {}
partitions = {}
degrees = {}

# Accepted spellings of the (source, target, weight) columns of an edge list
_EDGE_COLUMN_VARIANTS = (
    ['artist_1', 'artist_2', 'count'],
    ['artist1', 'artist2', 'count'],
)


def _read_edge_table(path):
    """Read an edge-list file, trying tab- and then whitespace-separated parsing.

    Reading with the wrong delimiter usually does not raise; it just produces
    a frame without the expected columns. The delimiter is therefore chosen
    by checking which parse yields one of the known column sets.

    Args:
        path: Location of the delimited text file.

    Returns:
        ``(df, edge_cols)``: the parsed frame (header whitespace stripped) and
        the three column names to use as source, target and weight.

    Raises:
        ValueError: if neither delimiter yields a known column set.
    """
    for sep in ('\t', r'\s+'):
        try:
            # quoting=3 (csv.QUOTE_NONE) matches the per-year cells of this notebook
            df = pd.read_csv(path, sep=sep, engine='python', quoting=3)
        except pd.errors.ParserError:
            continue
        df.columns = df.columns.str.strip()
        for cols in _EDGE_COLUMN_VARIANTS:
            if set(cols).issubset(df.columns):
                return df, cols
    raise ValueError(f'No recognised edge columns found in {path}')


for year, path in files.items():
    # 1. Read file (auto-detect delimiter and column naming)
    df, edge_cols = _read_edge_table(path)
    # .copy() so the 'count' column is rewritten on an independent frame
    edges = df[edge_cols].dropna().copy()
    edges['count'] = pd.to_numeric(edges['count'], errors='coerce').fillna(1).astype(int)
    # 2. Create graph (if an artist pair occurs in several rows, the last weight wins)
    G = nx.Graph()
    for artist_a, artist_b, n_collabs in zip(*(edges[col] for col in edge_cols)):
        G.add_edge(artist_a, artist_b, weight=n_collabs)
    # 3. Community detection (seeded so repeated runs give the same partition)
    partition = community_louvain.best_partition(G, weight='weight', random_state=42)
    deg = dict(G.degree(weight='weight'))
    graphs[year] = G
    partitions[year] = partition
    degrees[year] = deg

# Visualization: one panel per year (colour = community, size = weighted degree)
fig, axes = plt.subplots(1, 3, figsize=(24, 8))
cmap = plt.get_cmap('tab20')

for idx, year in enumerate(sorted(files)):
    ax = axes[idx]
    G, partition, deg = graphs[year], partitions[year], degrees[year]
    node_order = list(G.nodes())
    communities = [partition[node] for node in node_order]
    colors = [cmap(comm_id % 20) for comm_id in communities]
    node_sizes = [deg[node] * 5 for node in node_order]
    edge_widths = [attrs['weight'] * 0.2 for _, _, attrs in G.edges(data=True)]
    # Same seed for every year so the layout algorithm itself is reproducible
    pos = nx.spring_layout(G, k=0.15, seed=42)
    nx.draw_networkx_nodes(
        G, pos, node_color=colors, node_size=node_sizes, alpha=0.7, ax=ax
    )
    nx.draw_networkx_edges(G, pos, width=edge_widths, alpha=0.2, ax=ax)
    ax.set_title(f'{year} Artist Collaboration Network')
    ax.axis('off')

plt.suptitle('2017-2019 Artist Collaboration Network Comparison (Color=Community, Size=Degree)', fontsize=18)
# Leave the top 4% of the figure free for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()


In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import community as community_louvain
import re

# Load network data (2017)
df = pd.read_csv('/content/global-artist_network-2017.csv', sep='\t', engine='python', quoting=3)
df.columns = df.columns.str.strip()
# .copy() so the 'count' column is rewritten on an independent frame
edges = df[['artist_1', 'artist_2', 'count']].dropna().copy()
edges['count'] = pd.to_numeric(edges['count'], errors='coerce').fillna(1).astype(int)

# Undirected weighted graph; if an artist pair occurs in several rows, the last weight wins
G = nx.Graph()
for artist_a, artist_b, n_collabs in zip(edges['artist_1'], edges['artist_2'], edges['count']):
    G.add_edge(artist_a, artist_b, weight=n_collabs)

# Seeded so that re-running the cell reproduces the same communities
partition = community_louvain.best_partition(G, weight='weight', random_state=42)
nx.set_node_attributes(G, partition, 'community')
degrees = dict(G.degree(weight='weight'))

# Create a DataFrame for each node's community and degree
nodes_data = [
    {'artist': node, 'degree': degrees[node], 'community': partition[node]}
    for node in G.nodes()
]
nodes_df = pd.DataFrame(nodes_data, columns=['artist', 'degree', 'community'])

# Calculate the size of each community
community_sizes = nodes_df.groupby('community').size()

# Minimum community size for labeling
min_community_size = 10
large_communities = community_sizes[community_sizes >= min_community_size].index

# Select representative artists (top N by degree in large communities only).
# A stable descending sort followed by head(N) picks the same rows as a
# per-group nlargest(N) without relying on groupby.apply receiving the
# grouping column.
N = 3
rep_artist_df = (
    nodes_df[nodes_df['community'].isin(large_communities)]
    .sort_values('degree', ascending=False, kind='stable')
    .groupby('community')
    .head(N)
    .sort_values('community', kind='stable')
    .reset_index(drop=True)
)
rep_artist_set = set(rep_artist_df['artist'])

# Function to escape special characters in labels
def escape_label(label, usetex=None):
    """Return ``label`` as text that Matplotlib draws literally.

    Args:
        label: Any object; it is converted with ``str()``.
        usetex: ``True`` escapes every TeX special character (for
            ``text.usetex`` rendering). ``False`` escapes only ``$``, the one
            character that switches Matplotlib's default text rendering into
            mathtext. ``None`` (default) follows
            ``matplotlib.rcParams['text.usetex']`` and falls back to ``False``
            when Matplotlib cannot be imported.

    Returns:
        The escaped string.
    """
    text = str(label)
    if usetex is None:
        try:
            import matplotlib
            usetex = bool(matplotlib.rcParams['text.usetex'])
        except ImportError:
            usetex = False
    if usetex:
        # Escape every TeX special character.
        return re.sub(r'([$\\_^\{\}#&%~])', r'\\\1', text)
    # Without usetex a backslash placed before '&', '_', '#', ... would be
    # drawn verbatim in the label, so only '$' is escaped.
    return text.replace('$', r'\$')

# Visualization: colour = Louvain community, size = weighted degree
cmap = plt.get_cmap('tab20')
node_order = list(G.nodes())
communities = [partition[node] for node in node_order]
colors = [cmap(comm_id % 20) for comm_id in communities]
node_sizes = [degrees[node] * 5 for node in node_order]
edge_widths = [attrs['weight'] * 0.2 for _, _, attrs in G.edges(data=True)]
pos = nx.spring_layout(G, k=0.15, seed=42)

# Show labels only for representative artists (in large communities)
label_dict = {}
for node in node_order:
    if node in rep_artist_set:
        label_dict[node] = escape_label(node)

plt.figure(figsize=(12, 10))
nx.draw_networkx_nodes(G, pos, node_color=colors, node_size=node_sizes, alpha=0.7)
nx.draw_networkx_edges(G, pos, width=edge_widths, alpha=0.2)
nx.draw_networkx_labels(G, pos, labels=label_dict, font_size=10, font_weight='bold')
plt.title('2017 Artist Collaboration Network (Labels for Representative Artists in Large Communities)')
plt.axis('off')
plt.show()


In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import community as community_louvain
import re

# Load network data (2018)
df = pd.read_csv('/content/global-artist_network-2018.csv', sep='\t', engine='python', quoting=3)
df.columns = df.columns.str.strip()
# .copy() so the 'count' column is rewritten on an independent frame
edges = df[['artist_1', 'artist_2', 'count']].dropna().copy()
edges['count'] = pd.to_numeric(edges['count'], errors='coerce').fillna(1).astype(int)

# Undirected weighted graph; if an artist pair occurs in several rows, the last weight wins
G = nx.Graph()
for artist_a, artist_b, n_collabs in zip(edges['artist_1'], edges['artist_2'], edges['count']):
    G.add_edge(artist_a, artist_b, weight=n_collabs)

# Seeded so that re-running the cell reproduces the same communities
partition = community_louvain.best_partition(G, weight='weight', random_state=42)
nx.set_node_attributes(G, partition, 'community')
degrees = dict(G.degree(weight='weight'))

# Create a DataFrame for each node's community and degree
nodes_data = [
    {'artist': node, 'degree': degrees[node], 'community': partition[node]}
    for node in G.nodes()
]
nodes_df = pd.DataFrame(nodes_data, columns=['artist', 'degree', 'community'])

# Calculate the size of each community
community_sizes = nodes_df.groupby('community').size()

# Minimum community size for labeling
min_community_size = 10
large_communities = community_sizes[community_sizes >= min_community_size].index

# Select representative artists (top N by degree in large communities only).
# A stable descending sort followed by head(N) picks the same rows as a
# per-group nlargest(N) without relying on groupby.apply receiving the
# grouping column.
N = 3
rep_artist_df = (
    nodes_df[nodes_df['community'].isin(large_communities)]
    .sort_values('degree', ascending=False, kind='stable')
    .groupby('community')
    .head(N)
    .sort_values('community', kind='stable')
    .reset_index(drop=True)
)
rep_artist_set = set(rep_artist_df['artist'])

# Function to escape special characters in labels
def escape_label(label, usetex=None):
    """Return ``label`` as text that Matplotlib draws literally.

    Args:
        label: Any object; it is converted with ``str()``.
        usetex: ``True`` escapes every TeX special character (for
            ``text.usetex`` rendering). ``False`` escapes only ``$``, the one
            character that switches Matplotlib's default text rendering into
            mathtext. ``None`` (default) follows
            ``matplotlib.rcParams['text.usetex']`` and falls back to ``False``
            when Matplotlib cannot be imported.

    Returns:
        The escaped string.
    """
    text = str(label)
    if usetex is None:
        try:
            import matplotlib
            usetex = bool(matplotlib.rcParams['text.usetex'])
        except ImportError:
            usetex = False
    if usetex:
        # Escape every TeX special character.
        return re.sub(r'([$\\_^\{\}#&%~])', r'\\\1', text)
    # Without usetex a backslash placed before '&', '_', '#', ... would be
    # drawn verbatim in the label, so only '$' is escaped.
    return text.replace('$', r'\$')

# Visualization: colour = Louvain community, size = weighted degree
cmap = plt.get_cmap('tab20')
node_order = list(G.nodes())
communities = [partition[node] for node in node_order]
colors = [cmap(comm_id % 20) for comm_id in communities]
node_sizes = [degrees[node] * 5 for node in node_order]
edge_widths = [attrs['weight'] * 0.2 for _, _, attrs in G.edges(data=True)]
pos = nx.spring_layout(G, k=0.15, seed=42)

# Show labels only for representative artists (in large communities)
label_dict = {}
for node in node_order:
    if node in rep_artist_set:
        label_dict[node] = escape_label(node)

plt.figure(figsize=(12, 10))
nx.draw_networkx_nodes(G, pos, node_color=colors, node_size=node_sizes, alpha=0.7)
nx.draw_networkx_edges(G, pos, width=edge_widths, alpha=0.2)
nx.draw_networkx_labels(G, pos, labels=label_dict, font_size=10, font_weight='bold')
plt.title('2018 Artist Collaboration Network (Labels for Representative Artists in Large Communities)')
plt.axis('off')
plt.show()


In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import community as community_louvain
import re

# Load network data (2019)
df = pd.read_csv('/content/global-artist_network-2019.csv', sep='\t', engine='python', quoting=3)
df.columns = df.columns.str.strip()
# .copy() so the 'count' column is rewritten on an independent frame
edges = df[['artist_1', 'artist_2', 'count']].dropna().copy()
edges['count'] = pd.to_numeric(edges['count'], errors='coerce').fillna(1).astype(int)

# Undirected weighted graph; if an artist pair occurs in several rows, the last weight wins
G = nx.Graph()
for artist_a, artist_b, n_collabs in zip(edges['artist_1'], edges['artist_2'], edges['count']):
    G.add_edge(artist_a, artist_b, weight=n_collabs)

# Seeded so that re-running the cell reproduces the same communities
partition = community_louvain.best_partition(G, weight='weight', random_state=42)
nx.set_node_attributes(G, partition, 'community')
degrees = dict(G.degree(weight='weight'))

# Create a DataFrame for each node's community and degree
nodes_data = [
    {'artist': node, 'degree': degrees[node], 'community': partition[node]}
    for node in G.nodes()
]
nodes_df = pd.DataFrame(nodes_data, columns=['artist', 'degree', 'community'])

# Calculate the size of each community
community_sizes = nodes_df.groupby('community').size()

# Minimum community size for labeling
min_community_size = 10
large_communities = community_sizes[community_sizes >= min_community_size].index

# Select representative artists (top N by degree in large communities only).
# A stable descending sort followed by head(N) picks the same rows as a
# per-group nlargest(N) without relying on groupby.apply receiving the
# grouping column.
N = 3
rep_artist_df = (
    nodes_df[nodes_df['community'].isin(large_communities)]
    .sort_values('degree', ascending=False, kind='stable')
    .groupby('community')
    .head(N)
    .sort_values('community', kind='stable')
    .reset_index(drop=True)
)
rep_artist_set = set(rep_artist_df['artist'])

# Function to escape special characters in labels
def escape_label(label, usetex=None):
    """Return ``label`` as text that Matplotlib draws literally.

    Args:
        label: Any object; it is converted with ``str()``.
        usetex: ``True`` escapes every TeX special character (for
            ``text.usetex`` rendering). ``False`` escapes only ``$``, the one
            character that switches Matplotlib's default text rendering into
            mathtext. ``None`` (default) follows
            ``matplotlib.rcParams['text.usetex']`` and falls back to ``False``
            when Matplotlib cannot be imported.

    Returns:
        The escaped string.
    """
    text = str(label)
    if usetex is None:
        try:
            import matplotlib
            usetex = bool(matplotlib.rcParams['text.usetex'])
        except ImportError:
            usetex = False
    if usetex:
        # Escape every TeX special character.
        return re.sub(r'([$\\_^\{\}#&%~])', r'\\\1', text)
    # Without usetex a backslash placed before '&', '_', '#', ... would be
    # drawn verbatim in the label, so only '$' is escaped.
    return text.replace('$', r'\$')

# Visualization: colour = Louvain community, size = weighted degree
cmap = plt.get_cmap('tab20')
node_order = list(G.nodes())
communities = [partition[node] for node in node_order]
colors = [cmap(comm_id % 20) for comm_id in communities]
node_sizes = [degrees[node] * 5 for node in node_order]
edge_widths = [attrs['weight'] * 0.2 for _, _, attrs in G.edges(data=True)]
pos = nx.spring_layout(G, k=0.15, seed=42)

# Show labels only for representative artists (in large communities)
label_dict = {}
for node in node_order:
    if node in rep_artist_set:
        label_dict[node] = escape_label(node)

plt.figure(figsize=(12, 10))
nx.draw_networkx_nodes(G, pos, node_color=colors, node_size=node_sizes, alpha=0.7)
nx.draw_networkx_edges(G, pos, width=edge_widths, alpha=0.2)
nx.draw_networkx_labels(G, pos, labels=label_dict, font_size=10, font_weight='bold')
plt.title('2019 Artist Collaboration Network (Labels for Representative Artists in Large Communities)')
plt.axis('off')
plt.show()


## Visualization of Core Artist Network


In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import community as community_louvain
import re
import numpy as np

# --- Load network data and detect communities ---
df = pd.read_csv('global-artist_network-2017.csv', sep='\t', engine='python', quoting=3)
df.columns = df.columns.str.strip()
# .copy() so the 'count' column is rewritten on an independent frame
edges = df[['artist_1', 'artist_2', 'count']].dropna().copy()
edges['count'] = pd.to_numeric(edges['count'], errors='coerce').fillna(1).astype(int)

# Undirected weighted graph; if an artist pair occurs in several rows, the last weight wins
G = nx.Graph()
for artist_a, artist_b, n_collabs in zip(edges['artist_1'], edges['artist_2'], edges['count']):
    G.add_edge(artist_a, artist_b, weight=n_collabs)

# Seeded so that re-running the cell reproduces the same communities
partition = community_louvain.best_partition(G, weight='weight', random_state=42)
nx.set_node_attributes(G, partition, 'community')
degrees = dict(G.degree(weight='weight'))

# --- Select representative artist (top 1 by degree per community) ---
nodes_data = [
    {'artist': node, 'degree': degrees[node], 'community': partition[node]}
    for node in G.nodes()
]
nodes_df = pd.DataFrame(nodes_data, columns=['artist', 'degree', 'community'])

# Representative artist (highest weighted degree) per community.
# idxmax selects whole rows, so the 'community' column needed by set_index
# below is always present; on ties the first row wins.
rep_artist_df = nodes_df.loc[nodes_df.groupby('community')['degree'].idxmax()].reset_index(drop=True)
rep_artist_names = rep_artist_df.set_index('community')['artist'].to_dict()

# --- Create community-level graph ---
# NOTE: despite its name, community_degrees holds the NUMBER OF ARTISTS in
# each community (its size), keyed in first-seen order.
community_degrees = {}
for comm in nodes_df['community']:
    community_degrees[comm] = community_degrees.get(comm, 0) + 1

community_graph = nx.Graph()
community_graph.add_nodes_from(community_degrees)

# Sum the collaboration weights of all edges that cross a community boundary
for u, v, data in G.edges(data=True):
    comm_u, comm_v = partition[u], partition[v]
    if comm_u == comm_v:
        continue
    if community_graph.has_edge(comm_u, comm_v):
        community_graph[comm_u][comm_v]['weight'] += data['weight']
    else:
        community_graph.add_edge(comm_u, comm_v, weight=data['weight'])

# --- Extract only core communities (the 15 with the most member artists) ---
top_communities = sorted(community_degrees, key=community_degrees.get, reverse=True)[:15]
subgraph = community_graph.subgraph(top_communities).copy()

# --- Visualization (minimize node overlap, zoom in) ---
def escape_label(label, usetex=None):
    """Return ``label`` as text that Matplotlib draws literally.

    Args:
        label: Any object; it is converted with ``str()``.
        usetex: ``True`` escapes every TeX special character (for
            ``text.usetex`` rendering). ``False`` escapes only ``$``, the one
            character that switches Matplotlib's default text rendering into
            mathtext. ``None`` (default) follows
            ``matplotlib.rcParams['text.usetex']`` and falls back to ``False``
            when Matplotlib cannot be imported.

    Returns:
        The escaped string.
    """
    text = str(label)
    if usetex is None:
        try:
            import matplotlib
            usetex = bool(matplotlib.rcParams['text.usetex'])
        except ImportError:
            usetex = False
    if usetex:
        # Escape every TeX special character.
        return re.sub(r'([$\\_^\{\}#&%~])', r'\\\1', text)
    # Without usetex a backslash placed before '&', '_', '#', ... would be
    # drawn verbatim in the label, so only '$' is escaped.
    return text.replace('$', r'\$')

# One label per core community: its representative artist, else the community id
core_nodes = list(subgraph.nodes())
sub_labels = {}
for comm in core_nodes:
    fallback_name = str(comm)
    sub_labels[comm] = escape_label(rep_artist_names.get(comm, fallback_name))
node_sizes = [200 * community_degrees[comm] for comm in core_nodes]
edge_widths = [0.2 * attrs['weight'] for _, _, attrs in subgraph.edges(data=True)]

plt.figure(figsize=(10, 8))
# Increasing k spreads nodes further apart (minimizing overlap)
pos = nx.spring_layout(subgraph, seed=42, k=5)
nx.draw_networkx_nodes(subgraph, pos, node_color='skyblue', node_size=node_sizes)
nx.draw_networkx_edges(subgraph, pos, width=edge_widths, alpha=0.6)
# Labels are drawn by hand so they can sit slightly above each node
label_style = dict(fontsize=13, fontweight='bold', ha='center', va='bottom')
for comm, xy in pos.items():
    plt.text(xy[0], xy[1] + 0.07, sub_labels[comm], **label_style)
plt.title('2017 Core Community-Level Collaboration Network (Representative Artist Labels)')
plt.axis('off')
plt.show()


In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import community as community_louvain
import re
import numpy as np

# --- Load network data and detect communities ---
df = pd.read_csv('global-artist_network-2018.csv', sep='\t', engine='python', quoting=3)
df.columns = df.columns.str.strip()
# .copy() so the 'count' column is rewritten on an independent frame
edges = df[['artist_1', 'artist_2', 'count']].dropna().copy()
edges['count'] = pd.to_numeric(edges['count'], errors='coerce').fillna(1).astype(int)

# Undirected weighted graph; if an artist pair occurs in several rows, the last weight wins
G = nx.Graph()
for artist_a, artist_b, n_collabs in zip(edges['artist_1'], edges['artist_2'], edges['count']):
    G.add_edge(artist_a, artist_b, weight=n_collabs)

# Seeded so that re-running the cell reproduces the same communities
partition = community_louvain.best_partition(G, weight='weight', random_state=42)
nx.set_node_attributes(G, partition, 'community')
degrees = dict(G.degree(weight='weight'))

# --- Select representative artist (top 1 by degree per community) ---
nodes_data = [
    {'artist': node, 'degree': degrees[node], 'community': partition[node]}
    for node in G.nodes()
]
nodes_df = pd.DataFrame(nodes_data, columns=['artist', 'degree', 'community'])

# Representative artist (highest weighted degree) per community.
# idxmax selects whole rows, so the 'community' column needed by set_index
# below is always present; on ties the first row wins.
rep_artist_df = nodes_df.loc[nodes_df.groupby('community')['degree'].idxmax()].reset_index(drop=True)
rep_artist_names = rep_artist_df.set_index('community')['artist'].to_dict()

# --- Create community-level graph ---
# NOTE: despite its name, community_degrees holds the NUMBER OF ARTISTS in
# each community (its size), keyed in first-seen order.
community_degrees = {}
for comm in nodes_df['community']:
    community_degrees[comm] = community_degrees.get(comm, 0) + 1

community_graph = nx.Graph()
community_graph.add_nodes_from(community_degrees)

# Sum the collaboration weights of all edges that cross a community boundary
for u, v, data in G.edges(data=True):
    comm_u, comm_v = partition[u], partition[v]
    if comm_u == comm_v:
        continue
    if community_graph.has_edge(comm_u, comm_v):
        community_graph[comm_u][comm_v]['weight'] += data['weight']
    else:
        community_graph.add_edge(comm_u, comm_v, weight=data['weight'])

# --- Extract only core communities (the 15 with the most member artists) ---
top_communities = sorted(community_degrees, key=community_degrees.get, reverse=True)[:15]
subgraph = community_graph.subgraph(top_communities).copy()

# --- Visualization (minimize node overlap, zoom in) ---
def escape_label(label, usetex=None):
    """Return ``label`` as text that Matplotlib draws literally.

    Args:
        label: Any object; it is converted with ``str()``.
        usetex: ``True`` escapes every TeX special character (for
            ``text.usetex`` rendering). ``False`` escapes only ``$``, the one
            character that switches Matplotlib's default text rendering into
            mathtext. ``None`` (default) follows
            ``matplotlib.rcParams['text.usetex']`` and falls back to ``False``
            when Matplotlib cannot be imported.

    Returns:
        The escaped string.
    """
    text = str(label)
    if usetex is None:
        try:
            import matplotlib
            usetex = bool(matplotlib.rcParams['text.usetex'])
        except ImportError:
            usetex = False
    if usetex:
        # Escape every TeX special character.
        return re.sub(r'([$\\_^\{\}#&%~])', r'\\\1', text)
    # Without usetex a backslash placed before '&', '_', '#', ... would be
    # drawn verbatim in the label, so only '$' is escaped.
    return text.replace('$', r'\$')

# One label per core community: its representative artist, else the community id
core_nodes = list(subgraph.nodes())
sub_labels = {}
for comm in core_nodes:
    fallback_name = str(comm)
    sub_labels[comm] = escape_label(rep_artist_names.get(comm, fallback_name))
node_sizes = [200 * community_degrees[comm] for comm in core_nodes]
edge_widths = [0.2 * attrs['weight'] for _, _, attrs in subgraph.edges(data=True)]

plt.figure(figsize=(10, 8))
# Increasing k spreads nodes further apart (minimizing overlap)
pos = nx.spring_layout(subgraph, seed=42, k=5)
nx.draw_networkx_nodes(subgraph, pos, node_color='skyblue', node_size=node_sizes)
nx.draw_networkx_edges(subgraph, pos, width=edge_widths, alpha=0.6)
# Labels are drawn by hand so they can sit slightly above each node
label_style = dict(fontsize=13, fontweight='bold', ha='center', va='bottom')
for comm, xy in pos.items():
    plt.text(xy[0], xy[1] + 0.07, sub_labels[comm], **label_style)
plt.title('2018 Core Community-Level Collaboration Network (Representative Artist Labels)')
plt.axis('off')
plt.show()


In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import community as community_louvain
import re
import numpy as np

# --- Load network data and detect communities ---
df = pd.read_csv('global-artist_network-2019.csv', sep='\t', engine='python', quoting=3)
df.columns = df.columns.str.strip()
# .copy() so the 'count' column is rewritten on an independent frame
edges = df[['artist_1', 'artist_2', 'count']].dropna().copy()
edges['count'] = pd.to_numeric(edges['count'], errors='coerce').fillna(1).astype(int)

# Undirected weighted graph; if an artist pair occurs in several rows, the last weight wins
G = nx.Graph()
for artist_a, artist_b, n_collabs in zip(edges['artist_1'], edges['artist_2'], edges['count']):
    G.add_edge(artist_a, artist_b, weight=n_collabs)

# Seeded so that re-running the cell reproduces the same communities
partition = community_louvain.best_partition(G, weight='weight', random_state=42)
nx.set_node_attributes(G, partition, 'community')
degrees = dict(G.degree(weight='weight'))

# --- Select representative artist (top 1 by degree per community) ---
nodes_data = [
    {'artist': node, 'degree': degrees[node], 'community': partition[node]}
    for node in G.nodes()
]
nodes_df = pd.DataFrame(nodes_data, columns=['artist', 'degree', 'community'])

# Representative artist (highest weighted degree) per community.
# idxmax selects whole rows, so the 'community' column needed by set_index
# below is always present; on ties the first row wins.
rep_artist_df = nodes_df.loc[nodes_df.groupby('community')['degree'].idxmax()].reset_index(drop=True)
rep_artist_names = rep_artist_df.set_index('community')['artist'].to_dict()

# --- Create community-level graph ---
# NOTE: despite its name, community_degrees holds the NUMBER OF ARTISTS in
# each community (its size), keyed in first-seen order.
community_degrees = {}
for comm in nodes_df['community']:
    community_degrees[comm] = community_degrees.get(comm, 0) + 1

community_graph = nx.Graph()
community_graph.add_nodes_from(community_degrees)

# Sum the collaboration weights of all edges that cross a community boundary
for u, v, data in G.edges(data=True):
    comm_u, comm_v = partition[u], partition[v]
    if comm_u == comm_v:
        continue
    if community_graph.has_edge(comm_u, comm_v):
        community_graph[comm_u][comm_v]['weight'] += data['weight']
    else:
        community_graph.add_edge(comm_u, comm_v, weight=data['weight'])

# --- Extract only core communities (the 15 with the most member artists) ---
top_communities = sorted(community_degrees, key=community_degrees.get, reverse=True)[:15]
subgraph = community_graph.subgraph(top_communities).copy()

# --- Visualization (minimize node overlap, zoom in) ---
def escape_label(label, usetex=None):
    """Return ``label`` as text that Matplotlib draws literally.

    Args:
        label: Any object; it is converted with ``str()``.
        usetex: ``True`` escapes every TeX special character (for
            ``text.usetex`` rendering). ``False`` escapes only ``$``, the one
            character that switches Matplotlib's default text rendering into
            mathtext. ``None`` (default) follows
            ``matplotlib.rcParams['text.usetex']`` and falls back to ``False``
            when Matplotlib cannot be imported.

    Returns:
        The escaped string.
    """
    text = str(label)
    if usetex is None:
        try:
            import matplotlib
            usetex = bool(matplotlib.rcParams['text.usetex'])
        except ImportError:
            usetex = False
    if usetex:
        # Escape every TeX special character.
        return re.sub(r'([$\\_^\{\}#&%~])', r'\\\1', text)
    # Without usetex a backslash placed before '&', '_', '#', ... would be
    # drawn verbatim in the label, so only '$' is escaped.
    return text.replace('$', r'\$')

# One label per core community: its representative artist, else the community id
core_nodes = list(subgraph.nodes())
sub_labels = {}
for comm in core_nodes:
    fallback_name = str(comm)
    sub_labels[comm] = escape_label(rep_artist_names.get(comm, fallback_name))
node_sizes = [200 * community_degrees[comm] for comm in core_nodes]
edge_widths = [0.2 * attrs['weight'] for _, _, attrs in subgraph.edges(data=True)]

plt.figure(figsize=(10, 8))
# Increasing k spreads nodes further apart (minimizing overlap)
pos = nx.spring_layout(subgraph, seed=42, k=5)
nx.draw_networkx_nodes(subgraph, pos, node_color='skyblue', node_size=node_sizes)
nx.draw_networkx_edges(subgraph, pos, width=edge_widths, alpha=0.6)
# Labels are drawn by hand so they can sit slightly above each node
label_style = dict(fontsize=13, fontweight='bold', ha='center', va='bottom')
for comm, xy in pos.items():
    plt.text(xy[0], xy[1] + 0.07, sub_labels[comm], **label_style)
plt.title('2019 Core Community-Level Collaboration Network (Representative Artist Labels)')
plt.axis('off')
plt.show()


### Genre Distribution Aggregation for Top 15 Communities


In [None]:

import pandas as pd
import networkx as nx
import community as community_louvain
import matplotlib.pyplot as plt
from collections import Counter

# 1. Load artist information
artists_info = pd.read_csv('/content/spotify_artists_info_complete.csv', sep='\t', encoding='utf-8')
artists_info = artists_info.drop_duplicates(subset=['name'], keep='first')
artist_genre_map = artists_info.set_index('name')['genres'].to_dict()

# 2. Build network and assign communities
df = pd.read_csv('/content/global-artist_network-2017.csv', sep='\t', engine='python', quoting=3)
df.columns = df.columns.str.strip()
# .copy() so the 'count' column is rewritten on an independent frame
edges = df[['artist_1', 'artist_2', 'count']].dropna().copy()
edges['count'] = pd.to_numeric(edges['count'], errors='coerce').fillna(1).astype(int)
# Undirected weighted graph; if an artist pair occurs in several rows, the last weight wins
G = nx.Graph()
for artist_a, artist_b, n_collabs in zip(edges['artist_1'], edges['artist_2'], edges['count']):
    G.add_edge(artist_a, artist_b, weight=n_collabs)
# Seeded so that re-running the cell reproduces the same communities
partition = community_louvain.best_partition(G, weight='weight', random_state=42)
nx.set_node_attributes(G, partition, 'community')
degrees = dict(G.degree(weight='weight'))

# 3. Calculate total degree (collaboration strength) per community in one pass
community_degree = {}
for node, comm in partition.items():
    community_degree[comm] = community_degree.get(comm, 0) + degrees[node]

# 4. Select top 15 communities (by total degree; ties broken by community id)
top_15_communities = sorted(community_degree, key=lambda c: (-community_degree[c], c))[:15]

# 5. Select representative artist (top 1 by degree) and community size for each community
nodes_data = [
    {'artist': node, 'degree': degrees[node], 'community': partition[node]}
    for node in G.nodes()
]
nodes_df = pd.DataFrame(nodes_data, columns=['artist', 'degree', 'community'])
top_nodes_df = nodes_df[nodes_df['community'].isin(top_15_communities)]
# idxmax selects whole rows, so the 'community' column needed by set_index
# below is always present; on ties the first row wins.
rep_artist_df = top_nodes_df.loc[top_nodes_df.groupby('community')['degree'].idxmax()].reset_index(drop=True)
rep_artist_names = rep_artist_df.set_index('community')['artist'].to_dict()
community_sizes = nodes_df['community'].value_counts().to_dict()

# 6. Aggregate genre distribution for each community (top 15 only)
top_15_set = set(top_15_communities)
community_genres = {}
for node, comm in partition.items():
    if comm not in top_15_set:
        continue
    raw_genres = artist_genre_map.get(node)
    # Artists absent from the info table (None) or with an empty field (NaN)
    # contribute no genres instead of being counted as '' or 'nan'.
    if not isinstance(raw_genres, str):
        continue
    for piece in raw_genres.split(','):
        # Assumes a comma-separated field, possibly written as a Python list
        # literal (brackets/quotes) -- TODO confirm against the CSV.
        genre = piece.strip().strip("[]'\"").strip()
        if genre:
            community_genres.setdefault(comm, []).append(genre)

cmap = plt.get_cmap('tab20')
for comm in top_15_communities:
    genre_counts = Counter(community_genres.get(comm, []))
    if not genre_counts:
        print(f'Community {comm} has no genre data.')
        continue
    top_genres = genre_counts.most_common(10)
    labels, counts = zip(*top_genres)
    rep_artist = rep_artist_names.get(comm, f'Community {comm}')
    comm_size = community_sizes.get(comm, 0)
    plt.figure(figsize=(8, 4))
    plt.bar(labels, counts, color=cmap(comm % 20))
    plt.title(f"Community {comm} ({rep_artist}, size={comm_size}) - Top 10 Genres (2017)")
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()


In [None]:

import pandas as pd
import networkx as nx
import community as community_louvain
import matplotlib.pyplot as plt
from collections import Counter

# 1. Load artist information
artists_info = pd.read_csv('/content/spotify_artists_info_complete.csv', sep='\t', encoding='utf-8')
artists_info = artists_info.drop_duplicates(subset=['name'], keep='first')
artist_genre_map = artists_info.set_index('name')['genres'].to_dict()

# 2. Build network and assign communities
df = pd.read_csv('/content/global-artist_network-2018.csv', sep='\t', engine='python', quoting=3)
df.columns = df.columns.str.strip()
# .copy() so the 'count' column is rewritten on an independent frame
edges = df[['artist_1', 'artist_2', 'count']].dropna().copy()
edges['count'] = pd.to_numeric(edges['count'], errors='coerce').fillna(1).astype(int)
# Undirected weighted graph; if an artist pair occurs in several rows, the last weight wins
G = nx.Graph()
for artist_a, artist_b, n_collabs in zip(edges['artist_1'], edges['artist_2'], edges['count']):
    G.add_edge(artist_a, artist_b, weight=n_collabs)
# Seeded so that re-running the cell reproduces the same communities
partition = community_louvain.best_partition(G, weight='weight', random_state=42)
nx.set_node_attributes(G, partition, 'community')
degrees = dict(G.degree(weight='weight'))

# 3. Calculate total degree (collaboration strength) per community in one pass
community_degree = {}
for node, comm in partition.items():
    community_degree[comm] = community_degree.get(comm, 0) + degrees[node]

# 4. Select top 15 communities (by total degree; ties broken by community id)
top_15_communities = sorted(community_degree, key=lambda c: (-community_degree[c], c))[:15]

# 5. Select representative artist (top 1 by degree) and community size for each community
nodes_data = [
    {'artist': node, 'degree': degrees[node], 'community': partition[node]}
    for node in G.nodes()
]
nodes_df = pd.DataFrame(nodes_data, columns=['artist', 'degree', 'community'])
top_nodes_df = nodes_df[nodes_df['community'].isin(top_15_communities)]
# idxmax selects whole rows, so the 'community' column needed by set_index
# below is always present; on ties the first row wins.
rep_artist_df = top_nodes_df.loc[top_nodes_df.groupby('community')['degree'].idxmax()].reset_index(drop=True)
rep_artist_names = rep_artist_df.set_index('community')['artist'].to_dict()
community_sizes = nodes_df['community'].value_counts().to_dict()

# 6. Aggregate genre distribution for each community (top 15 only)
top_15_set = set(top_15_communities)
community_genres = {}
for node, comm in partition.items():
    if comm not in top_15_set:
        continue
    raw_genres = artist_genre_map.get(node)
    # Artists absent from the info table (None) or with an empty field (NaN)
    # contribute no genres instead of being counted as '' or 'nan'.
    if not isinstance(raw_genres, str):
        continue
    for piece in raw_genres.split(','):
        # Assumes a comma-separated field, possibly written as a Python list
        # literal (brackets/quotes) -- TODO confirm against the CSV.
        genre = piece.strip().strip("[]'\"").strip()
        if genre:
            community_genres.setdefault(comm, []).append(genre)

cmap = plt.get_cmap('tab20')
for comm in top_15_communities:
    genre_counts = Counter(community_genres.get(comm, []))
    if not genre_counts:
        print(f'Community {comm} has no genre data.')
        continue
    top_genres = genre_counts.most_common(10)
    labels, counts = zip(*top_genres)
    rep_artist = rep_artist_names.get(comm, f'Community {comm}')
    comm_size = community_sizes.get(comm, 0)
    plt.figure(figsize=(8, 4))
    plt.bar(labels, counts, color=cmap(comm % 20))
    plt.title(f"Community {comm} ({rep_artist}, size={comm_size}) - Top 10 Genres (2018)")
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()


In [None]:

import pandas as pd
import networkx as nx
import community as community_louvain
import matplotlib.pyplot as plt
from collections import Counter

# 1. Load artist information
artists_info = pd.read_csv('/content/spotify_artists_info_complete.csv', sep='\t', encoding='utf-8')
artists_info = artists_info.drop_duplicates(subset=['name'], keep='first')
artist_genre_map = artists_info.set_index('name')['genres'].to_dict()

# 2. Build network and assign communities
df = pd.read_csv('/content/global-artist_network-2019.csv', sep='\t', engine='python', quoting=3)
df.columns = df.columns.str.strip()
# .copy() so the 'count' column is rewritten on an independent frame
edges = df[['artist_1', 'artist_2', 'count']].dropna().copy()
edges['count'] = pd.to_numeric(edges['count'], errors='coerce').fillna(1).astype(int)
# Undirected weighted graph; if an artist pair occurs in several rows, the last weight wins
G = nx.Graph()
for artist_a, artist_b, n_collabs in zip(edges['artist_1'], edges['artist_2'], edges['count']):
    G.add_edge(artist_a, artist_b, weight=n_collabs)
# Seeded so that re-running the cell reproduces the same communities
partition = community_louvain.best_partition(G, weight='weight', random_state=42)
nx.set_node_attributes(G, partition, 'community')
degrees = dict(G.degree(weight='weight'))

# 3. Calculate total degree (collaboration strength) per community in one pass
community_degree = {}
for node, comm in partition.items():
    community_degree[comm] = community_degree.get(comm, 0) + degrees[node]

# 4. Select top 15 communities (by total degree; ties broken by community id)
top_15_communities = sorted(community_degree, key=lambda c: (-community_degree[c], c))[:15]

# 5. Select representative artist (top 1 by degree) and community size for each community
nodes_data = [
    {'artist': node, 'degree': degrees[node], 'community': partition[node]}
    for node in G.nodes()
]
nodes_df = pd.DataFrame(nodes_data, columns=['artist', 'degree', 'community'])
top_nodes_df = nodes_df[nodes_df['community'].isin(top_15_communities)]
# idxmax selects whole rows, so the 'community' column needed by set_index
# below is always present; on ties the first row wins.
rep_artist_df = top_nodes_df.loc[top_nodes_df.groupby('community')['degree'].idxmax()].reset_index(drop=True)
rep_artist_names = rep_artist_df.set_index('community')['artist'].to_dict()
community_sizes = nodes_df['community'].value_counts().to_dict()

# 6. Aggregate genre distribution for each community (top 15 only)
top_15_set = set(top_15_communities)
community_genres = {}
for node, comm in partition.items():
    if comm not in top_15_set:
        continue
    raw_genres = artist_genre_map.get(node)
    # Artists absent from the info table (None) or with an empty field (NaN)
    # contribute no genres instead of being counted as '' or 'nan'.
    if not isinstance(raw_genres, str):
        continue
    for piece in raw_genres.split(','):
        # Assumes a comma-separated field, possibly written as a Python list
        # literal (brackets/quotes) -- TODO confirm against the CSV.
        genre = piece.strip().strip("[]'\"").strip()
        if genre:
            community_genres.setdefault(comm, []).append(genre)

cmap = plt.get_cmap('tab20')
for comm in top_15_communities:
    genre_counts = Counter(community_genres.get(comm, []))
    if not genre_counts:
        print(f'Community {comm} has no genre data.')
        continue
    top_genres = genre_counts.most_common(10)
    labels, counts = zip(*top_genres)
    rep_artist = rep_artist_names.get(comm, f'Community {comm}')
    comm_size = community_sizes.get(comm, 0)
    plt.figure(figsize=(8, 4))
    plt.bar(labels, counts, color=cmap(comm % 20))
    plt.title(f"Community {comm} ({rep_artist}, size={comm_size}) - Top 10 Genres (2019)")
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()


# Genre Network Analysis


In [None]:
import pandas as pd
import networkx as nx
from collections import Counter

# 1. Load files
artist_network = pd.read_csv('/content/global-artist_network-2017.csv', sep='\t', engine='python', quoting=3)
# Normalise header whitespace, as the other cells of this notebook do
artist_network.columns = artist_network.columns.str.strip()
artists_info = pd.read_csv('/content/spotify_artists_info_complete.csv', sep='\t', encoding='utf-8')
artists_info = artists_info.drop_duplicates(subset=['name'], keep='first')
artist_genre_map = artists_info.set_index('name')['genres'].to_dict()


def _split_genres(raw):
    """Return the genre names contained in one raw ``genres`` value.

    Strings are split on commas and each piece is stripped of surrounding
    whitespace, brackets and quotes (this assumes a comma-separated field,
    possibly written as a Python list literal -- TODO confirm against the
    CSV). Lists/tuples are passed through without empty entries. Anything
    else (None for an unknown artist, NaN for an empty field) yields ``[]``.
    """
    if isinstance(raw, (list, tuple)):
        return [g for g in raw if g]
    if not isinstance(raw, str):
        return []
    pieces = (piece.strip().strip("[]'\"").strip() for piece in raw.split(','))
    return [g for g in pieces if g]


# 2. Build genre network
# Rows with a missing artist or count are dropped; unparseable counts become 1
edge_rows = artist_network[['artist_1', 'artist_2', 'count']].dropna()
edge_weights = pd.to_numeric(edge_rows['count'], errors='coerce').fillna(1).astype(int)

genre_G = nx.Graph()
for a1, a2, count in zip(edge_rows['artist_1'], edge_rows['artist_2'], edge_weights):
    genres1 = _split_genres(artist_genre_map.get(a1))
    genres2 = _split_genres(artist_genre_map.get(a2))
    # Create all genre pairs
    for g1 in genres1:
        for g2 in genres2:
            # Undirected network: sort alphabetically so each pair has one canonical order
            first, second = sorted((g1, g2))
            if genre_G.has_edge(first, second):
                genre_G[first][second]['weight'] += int(count)
            else:
                genre_G.add_edge(first, second, weight=int(count))

print(f"Number of genre nodes: {genre_G.number_of_nodes()}")
print(f"Number of genre edges: {genre_G.number_of_edges()}")

# Print top genre collaboration pairs
top_edges = sorted(genre_G.edges(data=True), key=lambda x: x[2]['weight'], reverse=True)[:15]
print("\nTop 15 genre-genre collaborations:")
for g1, g2, data in top_edges:
    print(f"{g1} - {g2}: {data['weight']}")


In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# 1. Data loading
artist_network = pd.read_csv('/content/global-artist_network-2017.csv', sep='\t', engine='python', quoting=3)
# Normalise header whitespace, as the other cells of this notebook do
artist_network.columns = artist_network.columns.str.strip()
artists_info = pd.read_csv('/content/spotify_artists_info_complete.csv', sep='\t', encoding='utf-8')
artists_info = artists_info.drop_duplicates(subset=['name'], keep='first')
artist_genre_map = artists_info.set_index('name')['genres'].to_dict()


def _split_genres(raw):
    """Return the genre names contained in one raw ``genres`` value.

    Strings are split on commas and each piece is stripped of surrounding
    whitespace, brackets and quotes (this assumes a comma-separated field,
    possibly written as a Python list literal -- TODO confirm against the
    CSV). Lists/tuples are passed through without empty entries. Anything
    else (None for an unknown artist, NaN for an empty field) yields ``[]``.
    """
    if isinstance(raw, (list, tuple)):
        return [g for g in raw if g]
    if not isinstance(raw, str):
        return []
    pieces = (piece.strip().strip("[]'\"").strip() for piece in raw.split(','))
    return [g for g in pieces if g]


# 2. Build genre network
# Rows with a missing artist or count are dropped; unparseable counts become 1
edge_rows = artist_network[['artist_1', 'artist_2', 'count']].dropna()
edge_weights = pd.to_numeric(edge_rows['count'], errors='coerce').fillna(1).astype(int)

genre_G = nx.Graph()
for a1, a2, count in zip(edge_rows['artist_1'], edge_rows['artist_2'], edge_weights):
    genres1 = _split_genres(artist_genre_map.get(a1))
    genres2 = _split_genres(artist_genre_map.get(a2))
    # Add edges for each genre pair
    for g1 in genres1:
        for g2 in genres2:
            # Sort so each undirected pair has one canonical order
            first, second = sorted((g1, g2))
            if genre_G.has_edge(first, second):
                genre_G[first][second]['weight'] += int(count)
            else:
                genre_G.add_edge(first, second, weight=int(count))

# 3. Visualize the network (top collaboration genres only)
# Since there are many nodes, show only top 100 edges by weight
ranked_edges = sorted(genre_G.edges(data=True), key=lambda edge: edge[2]['weight'], reverse=True)
edges_to_draw = ranked_edges[:100]
subG = nx.Graph()
for g1, g2, data in edges_to_draw:
    subG.add_edge(g1, g2, weight=data['weight'])

plt.figure(figsize=(15, 12))
pos = nx.spring_layout(subG, k=0.8, seed=42)
# Line width is proportional to the accumulated collaboration weight
edge_widths = [0.05 * attrs['weight'] for _, _, attrs in subG.edges(data=True)]
nx.draw_networkx_nodes(subG, pos, node_color='skyblue', node_size=600, alpha=0.8)
nx.draw_networkx_edges(subG, pos, width=edge_widths, alpha=0.5)
nx.draw_networkx_labels(subG, pos, font_size=10, font_weight='bold')
plt.title('Genre Collaboration Network (Top 100 Edges by Weight, 2017)')
plt.axis('off')
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# 1. Data loading
artist_network = pd.read_csv('/content/global-artist_network-2018.csv', sep='\t', engine='python', quoting=3)
artists_info = pd.read_csv('/content/spotify_artists_info_complete.csv', sep='\t', encoding='utf-8')
artists_info = artists_info.drop_duplicates(subset=['name'], keep='first')
artist_genre_map = artists_info.set_index('name')['genres'].to_dict()


def _parse_genres(raw):
    """Split a raw genre field such as "['pop', 'dance pop']" into clean genre names.

    Non-string values (e.g. NaN read from a blank CSV field) yield an empty
    list instead of breaking the loops that iterate over the result.
    """
    if not isinstance(raw, str):
        return []
    # Strip list brackets and either kind of quote from each comma-separated piece.
    cleaned = (piece.strip(" \t\r\n'\"[]") for piece in raw.split(','))
    return [genre for genre in cleaned if genre]


# 2. Build genre network
# Each collaboration adds its count to every (genre of artist_1, genre of artist_2) pair.
genre_G = nx.Graph()
# Coerce counts the way the later cells of this notebook do (non-numeric -> 1),
# so a blank count cannot abort the build part-way through.
counts = pd.to_numeric(artist_network['count'], errors='coerce').fillna(1).astype(int)
for a1, a2, count in zip(artist_network['artist_1'], artist_network['artist_2'], counts):
    genres1 = _parse_genres(artist_genre_map.get(a1, ''))
    genres2 = _parse_genres(artist_genre_map.get(a2, ''))
    for g1 in genres1:
        for g2 in genres2:
            # Sorted endpoints keep node insertion order independent of artist order.
            u, v = sorted((g1, g2))
            if genre_G.has_edge(u, v):
                genre_G[u][v]['weight'] += count
            else:
                genre_G.add_edge(u, v, weight=count)

# 3. Visualize the network (top collaboration genres only)
# Since there are many nodes, show only top 100 edges by weight
edges_to_draw = sorted(genre_G.edges(data=True), key=lambda x: x[2]['weight'], reverse=True)[:100]
subG = nx.Graph()
subG.add_weighted_edges_from((g1, g2, data['weight']) for g1, g2, data in edges_to_draw)

plt.figure(figsize=(15, 12))
pos = nx.spring_layout(subG, k=0.8, seed=42)  # fixed seed keeps the layout reproducible
edge_widths = [subG[u][v]['weight']*0.05 for u, v in subG.edges()]
nx.draw_networkx_nodes(subG, pos, node_color='skyblue', node_size=600, alpha=0.8)
nx.draw_networkx_edges(subG, pos, width=edge_widths, alpha=0.5)
nx.draw_networkx_labels(subG, pos, font_size=10, font_weight='bold')
plt.title('Genre Collaboration Network (Top 100 Edges by Weight, 2018)')
plt.axis('off')
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# 1. Data loading
artist_network = pd.read_csv('/content/global-artist_network-2019.csv', sep='\t', engine='python', quoting=3)
artists_info = pd.read_csv('/content/spotify_artists_info_complete.csv', sep='\t', encoding='utf-8')
artists_info = artists_info.drop_duplicates(subset=['name'], keep='first')
artist_genre_map = artists_info.set_index('name')['genres'].to_dict()


def _parse_genres(raw):
    """Split a raw genre field such as "['pop', 'dance pop']" into clean genre names.

    Non-string values (e.g. NaN read from a blank CSV field) yield an empty
    list instead of breaking the loops that iterate over the result.
    """
    if not isinstance(raw, str):
        return []
    # Strip list brackets and either kind of quote from each comma-separated piece.
    cleaned = (piece.strip(" \t\r\n'\"[]") for piece in raw.split(','))
    return [genre for genre in cleaned if genre]


# 2. Build genre network
# Each collaboration adds its count to every (genre of artist_1, genre of artist_2) pair.
genre_G = nx.Graph()
# Coerce counts the way the later cells of this notebook do (non-numeric -> 1),
# so a blank count cannot abort the build part-way through.
counts = pd.to_numeric(artist_network['count'], errors='coerce').fillna(1).astype(int)
for a1, a2, count in zip(artist_network['artist_1'], artist_network['artist_2'], counts):
    genres1 = _parse_genres(artist_genre_map.get(a1, ''))
    genres2 = _parse_genres(artist_genre_map.get(a2, ''))
    for g1 in genres1:
        for g2 in genres2:
            # Sorted endpoints keep node insertion order independent of artist order.
            u, v = sorted((g1, g2))
            if genre_G.has_edge(u, v):
                genre_G[u][v]['weight'] += count
            else:
                genre_G.add_edge(u, v, weight=count)

# 3. Visualize the network (top collaboration genres only)
# Since there are many nodes, show only top 100 edges by weight
edges_to_draw = sorted(genre_G.edges(data=True), key=lambda x: x[2]['weight'], reverse=True)[:100]
subG = nx.Graph()
subG.add_weighted_edges_from((g1, g2, data['weight']) for g1, g2, data in edges_to_draw)

plt.figure(figsize=(15, 12))
pos = nx.spring_layout(subG, k=0.8, seed=42)  # fixed seed keeps the layout reproducible
edge_widths = [subG[u][v]['weight']*0.05 for u, v in subG.edges()]
nx.draw_networkx_nodes(subG, pos, node_color='skyblue', node_size=600, alpha=0.8)
nx.draw_networkx_edges(subG, pos, width=edge_widths, alpha=0.5)
nx.draw_networkx_labels(subG, pos, font_size=10, font_weight='bold')
plt.title('Genre Collaboration Network (Top 100 Edges by Weight, 2019)')
plt.axis('off')
plt.tight_layout()
plt.show()


In [None]:
# Genre network analysis and visualization for 2017-2019

import pandas as pd
import networkx as nx
import community as community_louvain
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np

# Yearly collaboration edge lists, keyed by year
files = dict(zip(
    (2017, 2018, 2019),
    (
        '/content/global-artist_network-2017.csv',
        '/content/global-artist_network-2018.csv',
        '/content/global-artist_network-2019.csv',
    ),
))

# Artist metadata: keep the first row per artist name, then map name -> raw genre field
artists_info = pd.read_csv(
    '/content/spotify_artists_info_complete.csv', sep='\t', encoding='utf-8'
).drop_duplicates(subset=['name'], keep='first')
artist_genre_map = artists_info.set_index('name')['genres'].to_dict()

def build_genre_network(network_file, genre_map=None):
    """Build a weighted genre-to-genre graph from an artist collaboration file.

    For every collaboration row, the row's count is added to the edge between
    each genre of ``artist_1`` and each genre of ``artist_2``.

    Args:
        network_file: path to a delimited file with ``artist_1``, ``artist_2``
            and ``count`` columns (the delimiter is sniffed by pandas).
        genre_map: optional mapping artist name -> raw genre field; defaults to
            the notebook-level ``artist_genre_map``.

    Returns:
        An undirected ``nx.Graph`` whose edges carry an integer ``weight``.
    """
    if genre_map is None:
        genre_map = artist_genre_map

    def parse_genres(raw):
        # Non-string values (e.g. NaN from a blank genres field) mean "no genres";
        # previously they reached the loops below and raised TypeError.
        if not isinstance(raw, str):
            return []
        cleaned = (piece.strip(" \t\r\n'\"[]") for piece in raw.split(','))
        return [genre for genre in cleaned if genre]

    # Load network data
    df = pd.read_csv(network_file, sep=None, engine='python')
    if 'artist_1' not in df.columns:
        df.columns = [c.strip() for c in df.columns]
    edges = df[['artist_1', 'artist_2', 'count']].dropna()
    # Non-numeric counts fall back to 1; kept in a separate Series so the
    # filtered frame is never written to (avoids chained-assignment warnings).
    counts = pd.to_numeric(edges['count'], errors='coerce').fillna(1).astype(int)

    # Build genre network
    G = nx.Graph()
    for a1, a2, count in zip(edges['artist_1'], edges['artist_2'], counts):
        genres1 = parse_genres(genre_map.get(a1, ''))
        genres2 = parse_genres(genre_map.get(a2, ''))
        for g1 in genres1:
            for g2 in genres2:
                u, v = sorted((g1, g2))
                if G.has_edge(u, v):
                    G[u][v]['weight'] += count
                else:
                    G.add_edge(u, v, weight=count)
    return G

def analyze_and_visualize_genre_network(G, year):
    """Detect genre communities in ``G``, plot its heaviest edges and print bridge genres.

    Args:
        G: weighted undirected genre graph (nodes are genre names, edges carry
            a ``weight`` attribute), as returned by ``build_genre_network``.
        year: label used in the printed header and the plot title.

    Side effects:
        Stores each node's Louvain community id in the ``community`` node
        attribute, prints per-community summaries and a top-10 bridge table,
        and shows one matplotlib figure. Returns ``None``.
    """
    # Community detection
    partition = community_louvain.best_partition(G, weight='weight')
    nx.set_node_attributes(G, partition, 'community')
    print(f"\n=== {year} Genre Network Analysis ===")

    # Community-wise top genres and entropy.
    # Each genre is weighted by its strength (weighted degree). Counting node
    # names alone gives every genre a count of 1, which makes the entropy
    # trivially log2(#genres) and the "top genres" arbitrary.
    strength = dict(G.degree(weight='weight'))
    comm_to_genres = {}
    for node, comm in partition.items():
        comm_to_genres.setdefault(comm, []).append(node)
    for comm, genres in comm_to_genres.items():
        genre_counts = Counter({g: strength[g] for g in genres})
        positive = np.array([w for w in genre_counts.values() if w > 0], dtype=float)
        # Zero-strength genres are dropped so log2 never sees 0.
        probs = positive / positive.sum() if positive.size else np.array([1.0])
        entropy = -np.sum(probs * np.log2(probs))
        top_genres = genre_counts.most_common(5)
        print(f"Community {comm}: #genres={len(genres)}, entropy={entropy:.2f}, top genres={top_genres}")

    # Network visualization (top 100 edges)
    top_edges = sorted(G.edges(data=True), key=lambda x: x[2]['weight'], reverse=True)[:100]
    subG = nx.Graph()
    subG.add_weighted_edges_from((g1, g2, data['weight']) for g1, g2, data in top_edges)
    cmap = plt.get_cmap('tab20')
    # tab20 has 20 colours, so community ids wrap around modulo 20.
    node_colors = [cmap(partition[n] % 20) for n in subG.nodes()]
    node_sizes = [subG.degree(n, weight='weight')*10 for n in subG.nodes()]
    pos = nx.spring_layout(subG, k=1.2, seed=42)
    plt.figure(figsize=(14, 11))
    nx.draw_networkx_nodes(subG, pos, node_color=node_colors, node_size=node_sizes, alpha=0.85)
    nx.draw_networkx_edges(subG, pos, width=[subG[u][v]['weight']*0.1 for u, v in subG.edges()], alpha=0.5)
    nx.draw_networkx_labels(subG, pos, font_size=9, font_weight='bold')
    plt.title(f"{year} Genre Collaboration Network (Top 100 Edges)")
    plt.axis('off')
    plt.show()

    # Centrality and participation coefficient
    deg = nx.degree_centrality(G)
    # NOTE(review): networkx treats `weight` as a distance when computing shortest
    # paths, so heavy collaboration edges count as "long" here - confirm intended.
    btw = nx.betweenness_centrality(G, weight='weight')

    def participation_coefficient(G, partition):
        """Return node -> 1 - sum over communities of (links into community / links)^2."""
        pc = {}
        for node in G.nodes():
            links_per_comm = Counter(partition[nbr] for nbr in G.neighbors(node))
            # Normalise by the neighbour count rather than G.degree(node): degree
            # counts a self-loop twice while neighbors() yields it once, which
            # left the community shares summing to less than 1.
            total = sum(links_per_comm.values())
            if total == 0:
                pc[node] = 0
                continue
            pc[node] = 1 - sum((k / total) ** 2 for k in links_per_comm.values())
        return pc

    pc = participation_coefficient(G, partition)

    # Hub/bridge genres
    nodes = list(G.nodes())
    bridge_df = pd.DataFrame({
        'genre': nodes,
        'community': [partition[n] for n in nodes],
        'degree_centrality': [deg[n] for n in nodes],
        'participation_coeff': [pc[n] for n in nodes],
        'betweenness': [btw[n] for n in nodes]
    })
    print("\nHub/Bridge genres (top 10 by participation_coeff, betweenness):")
    bridges = bridge_df.sort_values(['participation_coeff', 'betweenness'], ascending=False).head(10)
    print(bridges[['genre', 'community', 'degree_centrality', 'participation_coeff', 'betweenness']])

# Build the genre graph of every configured year and analyse it in turn
for year in files:
    path = files[year]
    Gg = build_genre_network(path)
    analyze_and_visualize_genre_network(Gg, year)


# Artists who played a central (hub/bridge) role in both artist and genre collaboration networks


In [None]:
import pandas as pd
import networkx as nx
import community as community_louvain
from collections import Counter

# 1. Artist metadata: first row per artist name, mapped name -> raw genre field
artists_info = pd.read_csv(
    '/content/spotify_artists_info_complete.csv', sep='\t', encoding='utf-8'
).drop_duplicates(subset=['name'], keep='first')
artist_genre_map = artists_info.set_index('name')['genres'].to_dict()

# 2. Artist collaboration network: one weighted edge per (artist_1, artist_2) row
edges = pd.read_csv('/content/global-artist_network-2017.csv', sep='\t', engine='python', quoting=3)
edges = edges[['artist_1', 'artist_2', 'count']].dropna()
edges['count'] = pd.to_numeric(edges['count'], errors='coerce').fillna(1).astype(int)
G = nx.Graph()
# Rows are (artist_1, artist_2, count) tuples; a repeated pair overwrites the weight.
G.add_weighted_edges_from(edges.itertuples(index=False, name=None))

# 3. Louvain communities of the artist network (input to the participation coefficient)
partition = community_louvain.best_partition(G, weight='weight')
def participation_coefficient(G, partition):
    """Return each node's participation coefficient, 1 - sum_c (k_c / k)^2.

    ``k_c`` is the number of the node's neighbours in community ``c`` and ``k``
    the total number of neighbours.

    Args:
        G: graph exposing ``nodes()`` and ``neighbors(node)`` (e.g. ``nx.Graph``).
        partition: mapping node -> community id for every node that appears
            as a neighbour.

    Returns:
        dict mapping node -> coefficient; 0 for nodes without neighbours.
    """
    pc = {}
    for node in G.nodes():
        links_per_comm = Counter(partition[nbr] for nbr in G.neighbors(node))
        # Normalise by the neighbour count rather than G.degree(node): degree
        # counts a self-loop twice while neighbors() yields it once, so the
        # community shares did not sum to 1 on graphs with self-loops.
        total = sum(links_per_comm.values())
        if total == 0:
            pc[node] = 0
            continue
        pc[node] = 1 - sum((k / total) ** 2 for k in links_per_comm.values())
    return pc
pc_artist = participation_coefficient(G, partition)
# NOTE(review): networkx treats `weight` as a distance in betweenness, so heavy
# collaboration edges count as "long" paths here - confirm this is intended.
bc_artist = nx.betweenness_centrality(G, weight='weight')


def _parse_genres(raw):
    """Split a raw genre field such as "['pop', 'dance pop']" into clean genre names.

    Non-string values (e.g. NaN read from a blank CSV field) yield an empty
    list instead of breaking the loops that iterate over the result.
    """
    if not isinstance(raw, str):
        return []
    cleaned = (piece.strip(" \t\r\n'\"[]") for piece in raw.split(','))
    return [genre for genre in cleaned if genre]


# 4. Build genre collaboration network
# Each collaboration adds its count to every (genre of artist_1, genre of artist_2) pair.
genre_G = nx.Graph()
for a1, a2, count in zip(edges['artist_1'], edges['artist_2'], edges['count']):
    genres1 = _parse_genres(artist_genre_map.get(a1, ''))
    genres2 = _parse_genres(artist_genre_map.get(a2, ''))
    for g1 in genres1:
        for g2 in genres2:
            u, v = sorted((g1, g2))
            if genre_G.has_edge(u, v):
                genre_G[u][v]['weight'] += int(count)
            else:
                genre_G.add_edge(u, v, weight=int(count))

# 5. Calculate participation coefficient and betweenness for genre network
partition_genre = community_louvain.best_partition(genre_G, weight='weight')
pc_genre = participation_coefficient(genre_G, partition_genre)
bc_genre = nx.betweenness_centrality(genre_G, weight='weight')

# 6. Select hub/bridge genres (top 10% by participation coefficient or betweenness)
pc_thr = pd.Series(pc_genre).quantile(0.9)
bc_thr = pd.Series(bc_genre).quantile(0.9)
hub_bridge_genres = {g for g in genre_G.nodes() if pc_genre[g] > pc_thr or bc_genre[g] > bc_thr}

# 7. Identify central artists
# - Artists in the top 10% for participation or betweenness in the artist network
pc_thr_a = pd.Series(pc_artist).quantile(0.9)
bc_thr_a = pd.Series(bc_artist).quantile(0.9)
hub_bridge_artists = [a for a in G.nodes() if pc_artist[a] > pc_thr_a or bc_artist[a] > bc_thr_a]

# - Among these, artists with at least one genre in the hub/bridge genre set
central_artists_2017 = [
    a for a in hub_bridge_artists
    if any(g in hub_bridge_genres for g in _parse_genres(artist_genre_map.get(a, '')))
]

# 8. Output results
print("Artists who played a central (hub/bridge) role in both artist and genre collaboration networks (2017):")
for a in central_artists_2017:
    print(f"- {a} | genres: {artist_genre_map.get(a, '')}")


In [None]:
import pandas as pd
import networkx as nx
import community as community_louvain
from collections import Counter

# 1. Artist metadata: first row per artist name, mapped name -> raw genre field
artists_info = pd.read_csv(
    '/content/spotify_artists_info_complete.csv', sep='\t', encoding='utf-8'
).drop_duplicates(subset=['name'], keep='first')
artist_genre_map = artists_info.set_index('name')['genres'].to_dict()

# 2. Artist collaboration network: one weighted edge per (artist_1, artist_2) row
edges = pd.read_csv('/content/global-artist_network-2018.csv', sep='\t', engine='python', quoting=3)
edges = edges[['artist_1', 'artist_2', 'count']].dropna()
edges['count'] = pd.to_numeric(edges['count'], errors='coerce').fillna(1).astype(int)
G = nx.Graph()
# Rows are (artist_1, artist_2, count) tuples; a repeated pair overwrites the weight.
G.add_weighted_edges_from(edges.itertuples(index=False, name=None))

# 3. Louvain communities of the artist network (input to the participation coefficient)
partition = community_louvain.best_partition(G, weight='weight')
def participation_coefficient(G, partition):
    """Return each node's participation coefficient, 1 - sum_c (k_c / k)^2.

    ``k_c`` is the number of the node's neighbours in community ``c`` and ``k``
    the total number of neighbours.

    Args:
        G: graph exposing ``nodes()`` and ``neighbors(node)`` (e.g. ``nx.Graph``).
        partition: mapping node -> community id for every node that appears
            as a neighbour.

    Returns:
        dict mapping node -> coefficient; 0 for nodes without neighbours.
    """
    pc = {}
    for node in G.nodes():
        links_per_comm = Counter(partition[nbr] for nbr in G.neighbors(node))
        # Normalise by the neighbour count rather than G.degree(node): degree
        # counts a self-loop twice while neighbors() yields it once, so the
        # community shares did not sum to 1 on graphs with self-loops.
        total = sum(links_per_comm.values())
        if total == 0:
            pc[node] = 0
            continue
        pc[node] = 1 - sum((k / total) ** 2 for k in links_per_comm.values())
    return pc
pc_artist = participation_coefficient(G, partition)
# NOTE(review): networkx treats `weight` as a distance in betweenness, so heavy
# collaboration edges count as "long" paths here - confirm this is intended.
bc_artist = nx.betweenness_centrality(G, weight='weight')


def _parse_genres(raw):
    """Split a raw genre field such as "['pop', 'dance pop']" into clean genre names.

    Non-string values (e.g. NaN read from a blank CSV field) yield an empty
    list instead of breaking the loops that iterate over the result.
    """
    if not isinstance(raw, str):
        return []
    cleaned = (piece.strip(" \t\r\n'\"[]") for piece in raw.split(','))
    return [genre for genre in cleaned if genre]


# 4. Build genre collaboration network
# Each collaboration adds its count to every (genre of artist_1, genre of artist_2) pair.
genre_G = nx.Graph()
for a1, a2, count in zip(edges['artist_1'], edges['artist_2'], edges['count']):
    genres1 = _parse_genres(artist_genre_map.get(a1, ''))
    genres2 = _parse_genres(artist_genre_map.get(a2, ''))
    for g1 in genres1:
        for g2 in genres2:
            u, v = sorted((g1, g2))
            if genre_G.has_edge(u, v):
                genre_G[u][v]['weight'] += int(count)
            else:
                genre_G.add_edge(u, v, weight=int(count))

# 5. Calculate participation coefficient and betweenness for genre network
partition_genre = community_louvain.best_partition(genre_G, weight='weight')
pc_genre = participation_coefficient(genre_G, partition_genre)
bc_genre = nx.betweenness_centrality(genre_G, weight='weight')

# 6. Select hub/bridge genres (top 10% by participation coefficient or betweenness)
pc_thr = pd.Series(pc_genre).quantile(0.9)
bc_thr = pd.Series(bc_genre).quantile(0.9)
hub_bridge_genres = {g for g in genre_G.nodes() if pc_genre[g] > pc_thr or bc_genre[g] > bc_thr}

# 7. Identify central artists
# - Artists in the top 10% for participation or betweenness in the artist network
pc_thr_a = pd.Series(pc_artist).quantile(0.9)
bc_thr_a = pd.Series(bc_artist).quantile(0.9)
hub_bridge_artists = [a for a in G.nodes() if pc_artist[a] > pc_thr_a or bc_artist[a] > bc_thr_a]

# - Among these, artists with at least one genre in the hub/bridge genre set
central_artists_2018 = [
    a for a in hub_bridge_artists
    if any(g in hub_bridge_genres for g in _parse_genres(artist_genre_map.get(a, '')))
]

# 8. Output results
print("Artists who played a central (hub/bridge) role in both artist and genre collaboration networks (2018):")
for a in central_artists_2018:
    print(f"- {a} | genres: {artist_genre_map.get(a, '')}")


In [None]:
import pandas as pd
import networkx as nx
import community as community_louvain
from collections import Counter

# 1. Artist metadata: first row per artist name, mapped name -> raw genre field
artists_info = pd.read_csv(
    '/content/spotify_artists_info_complete.csv', sep='\t', encoding='utf-8'
).drop_duplicates(subset=['name'], keep='first')
artist_genre_map = artists_info.set_index('name')['genres'].to_dict()

# 2. Artist collaboration network: one weighted edge per (artist_1, artist_2) row
edges = pd.read_csv('/content/global-artist_network-2019.csv', sep='\t', engine='python', quoting=3)
edges = edges[['artist_1', 'artist_2', 'count']].dropna()
edges['count'] = pd.to_numeric(edges['count'], errors='coerce').fillna(1).astype(int)
G = nx.Graph()
# Rows are (artist_1, artist_2, count) tuples; a repeated pair overwrites the weight.
G.add_weighted_edges_from(edges.itertuples(index=False, name=None))

# 3. Louvain communities of the artist network (input to the participation coefficient)
partition = community_louvain.best_partition(G, weight='weight')
def participation_coefficient(G, partition):
    """Return each node's participation coefficient, 1 - sum_c (k_c / k)^2.

    ``k_c`` is the number of the node's neighbours in community ``c`` and ``k``
    the total number of neighbours.

    Args:
        G: graph exposing ``nodes()`` and ``neighbors(node)`` (e.g. ``nx.Graph``).
        partition: mapping node -> community id for every node that appears
            as a neighbour.

    Returns:
        dict mapping node -> coefficient; 0 for nodes without neighbours.
    """
    pc = {}
    for node in G.nodes():
        links_per_comm = Counter(partition[nbr] for nbr in G.neighbors(node))
        # Normalise by the neighbour count rather than G.degree(node): degree
        # counts a self-loop twice while neighbors() yields it once, so the
        # community shares did not sum to 1 on graphs with self-loops.
        total = sum(links_per_comm.values())
        if total == 0:
            pc[node] = 0
            continue
        pc[node] = 1 - sum((k / total) ** 2 for k in links_per_comm.values())
    return pc
pc_artist = participation_coefficient(G, partition)
# NOTE(review): networkx treats `weight` as a distance in betweenness, so heavy
# collaboration edges count as "long" paths here - confirm this is intended.
bc_artist = nx.betweenness_centrality(G, weight='weight')


def _parse_genres(raw):
    """Split a raw genre field such as "['pop', 'dance pop']" into clean genre names.

    Non-string values (e.g. NaN read from a blank CSV field) yield an empty
    list instead of breaking the loops that iterate over the result.
    """
    if not isinstance(raw, str):
        return []
    cleaned = (piece.strip(" \t\r\n'\"[]") for piece in raw.split(','))
    return [genre for genre in cleaned if genre]


# 4. Build genre collaboration network
# Each collaboration adds its count to every (genre of artist_1, genre of artist_2) pair.
genre_G = nx.Graph()
for a1, a2, count in zip(edges['artist_1'], edges['artist_2'], edges['count']):
    genres1 = _parse_genres(artist_genre_map.get(a1, ''))
    genres2 = _parse_genres(artist_genre_map.get(a2, ''))
    for g1 in genres1:
        for g2 in genres2:
            u, v = sorted((g1, g2))
            if genre_G.has_edge(u, v):
                genre_G[u][v]['weight'] += int(count)
            else:
                genre_G.add_edge(u, v, weight=int(count))

# 5. Calculate participation coefficient and betweenness for genre network
partition_genre = community_louvain.best_partition(genre_G, weight='weight')
pc_genre = participation_coefficient(genre_G, partition_genre)
bc_genre = nx.betweenness_centrality(genre_G, weight='weight')

# 6. Select hub/bridge genres (top 10% by participation coefficient or betweenness)
pc_thr = pd.Series(pc_genre).quantile(0.9)
bc_thr = pd.Series(bc_genre).quantile(0.9)
hub_bridge_genres = {g for g in genre_G.nodes() if pc_genre[g] > pc_thr or bc_genre[g] > bc_thr}

# 7. Identify central artists
# - Artists in the top 10% for participation or betweenness in the artist network
pc_thr_a = pd.Series(pc_artist).quantile(0.9)
bc_thr_a = pd.Series(bc_artist).quantile(0.9)
hub_bridge_artists = [a for a in G.nodes() if pc_artist[a] > pc_thr_a or bc_artist[a] > bc_thr_a]

# - Among these, artists with at least one genre in the hub/bridge genre set
central_artists_2019 = [
    a for a in hub_bridge_artists
    if any(g in hub_bridge_genres for g in _parse_genres(artist_genre_map.get(a, '')))
]

# 8. Output results
print("Artists who played a central (hub/bridge) role in both artist and genre collaboration networks (2019):")
for a in central_artists_2019:
    print(f"- {a} | genres: {artist_genre_map.get(a, '')}")


In [None]:
# Turn each year's central-artist collection (computed by the per-year cells) into a set
central_artists_2017 = set(central_artists_2017)
central_artists_2018 = set(central_artists_2018)
central_artists_2019 = set(central_artists_2019)

# Keep only the artists present in every one of the three yearly sets
central_all_years = set.intersection(
    central_artists_2017, central_artists_2018, central_artists_2019
)

# Alphabetical list for stable display and for reuse by later cells
central_all_years_list = sorted(central_all_years)

print("Artists who played a central (bridge/hub) role in all of 2017, 2018, and 2019:")
for artist in central_all_years_list:
    print(f"- {artist}")


In [None]:
import pandas as pd

# Load artist info
artists_info = pd.read_csv('spotify_artists_info_complete.csv', sep='\t', encoding='utf-8')
artists_info = artists_info.drop_duplicates(subset=['name'], keep='first')
info_map = artists_info.set_index('name').to_dict('index')

# Assume central_all_years_list is already defined as a list of artist names
# Example:
# central_all_years_list = ['Anitta', 'Bebe Rexha', 'Camila Cabello', ...]

# Columns of the summary table, in display order
summary_columns = ['artist', 'genres', 'followers', 'popularity']

# Collect info for each central artist
central_artists_data = []
for artist in central_all_years_list:
    info = info_map.get(artist, {})
    central_artists_data.append({
        'artist': artist,
        'genres': info.get('genres', ''),
        # Missing numeric fields stay None (NaN in the frame) rather than '',
        # so sorting never has to compare strings with numbers.
        'followers': info.get('followers'),
        'popularity': info.get('popularity'),
    })

# Create DataFrame and sort by popularity.
# Passing `columns` keeps the frame well-formed (and sortable) even when the
# artist list is empty; without it an empty list yields a frame with no columns.
central_artists_df = pd.DataFrame(central_artists_data, columns=summary_columns)
central_artists_df = central_artists_df.sort_values(by='popularity', ascending=False).reset_index(drop=True)

# Display the DataFrame
from IPython.display import display
display(central_artists_df[summary_columns])


## Emerging Genre Community Detection + Central Artists


In [None]:
import pandas as pd
import networkx as nx
import community as community_louvain
from collections import Counter

# Network file paths by year
files = {
    2017: 'global-artist_network-2017.csv',
    2018: 'global-artist_network-2018.csv',
    2019: 'global-artist_network-2019.csv'
}

# Load artist information
artists_info = pd.read_csv('spotify_artists_info_complete.csv', sep='\t', encoding='utf-8')
artists_info = artists_info.drop_duplicates(subset=['name'], keep='first')
artist_genre_map = artists_info.set_index('name')['genres'].to_dict()


def _load_artist_graph(path):
    """Read one yearly edge list and return it as a weighted artist graph."""
    df = pd.read_csv(path, sep='\t', engine='python', quoting=3)
    df.columns = df.columns.str.strip()
    edges = df[['artist_1', 'artist_2', 'count']].dropna()
    # Non-numeric counts fall back to 1.
    counts = pd.to_numeric(edges['count'], errors='coerce').fillna(1).astype(int)
    G = nx.Graph()
    for a1, a2, count in zip(edges['artist_1'], edges['artist_2'], counts):
        G.add_edge(a1, a2, weight=count)
    return G


# 1. Aggregate degree by community for each year
# The graph, partition and weighted degrees of every year are kept so that
# get_community_detail() looks up the SAME partition the table below is built
# from. Louvain is randomised, so re-running it would relabel the communities.
year_partitions = {}
year_degrees = {}
community_degree_time = {}
for year, path in files.items():
    G = _load_artist_graph(path)
    partition = community_louvain.best_partition(G, weight='weight')
    degrees = dict(G.degree(weight='weight'))
    # Single pass over the nodes instead of re-scanning the partition per community.
    comm_degree = {}
    for node, comm in partition.items():
        comm_degree[comm] = comm_degree.get(comm, 0) + degrees[node]
    year_partitions[year] = partition
    year_degrees[year] = degrees
    community_degree_time[year] = comm_degree

# 2. Integrate community indices and convert to DataFrame
# NOTE(review): community ids are assigned independently for each year's graph,
# so the same id in two years need not denote the same group of artists -
# confirm that comparing rows across years is meaningful.
all_communities = set()
for comms in community_degree_time.values():
    all_communities.update(comms.keys())
all_communities = sorted(all_communities)

degree_data = {}
for year in files.keys():
    degree_data[year] = [community_degree_time[year].get(comm, 0) for comm in all_communities]
degree_df = pd.DataFrame(degree_data, index=all_communities)

# 3. Add columns for year-over-year change
degree_df['diff_18_17'] = degree_df[2018] - degree_df[2017]
degree_df['diff_19_18'] = degree_df[2019] - degree_df[2018]

# Relaxed condition: 2017 degree ≤ 100, and (2018-2017 > 200) or (2019-2018 > 200)
emerging = degree_df[(degree_df[2017] <= 100) & ((degree_df['diff_18_17'] > 200) | (degree_df['diff_19_18'] > 200))]
emerging = emerging.sort_values(['diff_18_17', 'diff_19_18'], ascending=False)

print("Emerging genre communities:")
print(emerging[[2017, 2018, 2019, 'diff_18_17', 'diff_19_18']])

# 4. Function to find top genres and central artist for emerging communities
def get_community_detail(year, comm):
    """Return the five most common genres and the highest-degree artist of a community.

    Args:
        year: a key of ``files``; selects the partition computed in step 1.
        comm: community id within that year's partition.

    Returns:
        ``(top_genres, rep_artist)`` where ``top_genres`` is a list of
        ``(genre, artist_count)`` pairs and ``rep_artist`` is the member with
        the largest weighted degree, or ``None`` if the community has no members.
    """
    partition = year_partitions[year]
    degrees = year_degrees[year]
    comm_nodes = [n for n, c in partition.items() if c == comm]
    genres = []
    for n in comm_nodes:
        g = artist_genre_map.get(n)
        if isinstance(g, str):
            # Strip brackets/quotes only at the ends of each piece so apostrophes
            # inside a genre name are preserved.
            pieces = (x.strip(" \t\r\n'\"[]") for x in g.split(','))
            genres.extend(x for x in pieces if x)
    top_genres = Counter(genres).most_common(5)
    rep_artist = max(comm_nodes, key=lambda n: degrees.get(n, 0)) if comm_nodes else None
    return top_genres, rep_artist

print("\nEmerging communities (2019) - Top genres and central artist:")
for comm in emerging.index:
    top_genres, rep_artist = get_community_detail(2019, comm)
    print(f"- Community {comm}:")
    print(f"  Top 5 genres: {top_genres}")
    print(f"  Central artist: {rep_artist}")
    if rep_artist:
        genres = artist_genre_map.get(rep_artist, '')
        print(f"    (Genres: {genres})")
