# Import Required Libraries
Import the necessary libraries, including sqlite3, random, networkx, pandas, community_louvain, and plotly.

In [60]:
import sqlite3
import random
import networkx as nx
import pandas as pd
import community as community_louvain
import plotly.graph_objects as go

# Load Data from Database
Load data from the SQLite database into a pandas DataFrame using a SQL query. The query selects pairs of artists who have collaborated on tracks, along with the count of their collaborations. The data is filtered based on the popularity of the artists and tracks.

In [61]:
# Load data from the SQLite database into a pandas DataFrame using a SQL query
def load(conn, filters = {"artist_popularity": 50, "track_popularity": 10}):
    query = """
        SELECT 
            a1.name AS artist_1,
            a2.name AS artist_2,
            COUNT(*) AS collaboration_count
        FROM TrackArtist ta1
        JOIN TrackArtist ta2 ON ta1.track_id = ta2.track_id AND ta1.artist_id < ta2.artist_id
        JOIN Artist a1 ON ta1.artist_id = a1.id
        JOIN Artist a2 ON ta2.artist_id = a2.id
        JOIN Track ON ta1.track_id = Track.id
        WHERE 
            a1.name IS NOT NULL 
            AND a2.name IS NOT NULL
            AND Track.popularity > ?
            AND a1.popularity > ?
        GROUP BY artist_1, artist_2
        ORDER BY collaboration_count DESC
    """
    data_frame = pd.read_sql_query(query, conn, params=(filters["track_popularity"], filters["artist_popularity"]))
    return data_frame

# Establish a connection to the SQLite database
conn = sqlite3.connect("../db/spotify.sqlite")

# Define the filters to apply to the data
filters = {
    "artist_popularity": 70, # The minimum popularity of an artist
    "track_popularity": 40 # The minimum popularity of a track
}

# Load the data into a DataFrame
data_frame = load(conn, filters)

# Display the first few rows of the DataFrame
data_frame.head()

Unnamed: 0,artist_1,artist_2,collaboration_count
0,Metro Boomin,Future,33
1,Dimitri Vegas,Dimitri Vegas & Like Mike,22
2,Keinemusik,Adam Port,20
3,Adam Port,&ME,19
4,Keinemusik,&ME,18


# Show tracks
List the first 10 tracks of the artist pair with the highest collaboration count.

In [62]:
# Display the first 10 track names of the top artist pair

# Define the specific artist pair
artist_1 = data_frame.iloc[0]["artist_1"]
artist_2 = data_frame.iloc[0]["artist_2"]

# SQL query to fetch the top 10 tracks of the specific artist pair
query = """
SELECT DISTINCT
    Track.name as track_name,
    Track.id as track_id
FROM Track
JOIN TrackArtist ON Track.id = TrackArtist.track_id
JOIN Artist ON TrackArtist.artist_id = Artist.id
WHERE Artist.name IN (?, ?)
AND Track.popularity > ?
"""

# Execute the query and load the data into a DataFrame
top_tracks_df = pd.read_sql_query(query, conn, params=(artist_1, artist_2, filters["track_popularity"]))

# Display the top 10 tracks
top_tracks_df.head(10)

Unnamed: 0,track_name,track_id
0,All I Know,0NWqNXBJTpXbkI5rPWNy3p
1,My Darlin' (feat. Future),7hJ1fEr1nf2e4qsE4bfmcc
2,Cold (feat. Future),2NlTOhsAamXOaZciOXbITb
3,Thrusting (feat. Swae Lee & Future),4ZrutH1rzpA4v3Bg9nma55
4,Turn On The Lights again.. (feat. Future & Fre...,4ptnQ0kQnN1U1Ig8TSslj6
5,"Fine Whine (feat. Joe Fox, Future & M.I.A.)",2WxzLbJfPA2vJGxQiKd7Jq
6,On Time (with John Legend),0YFqKxV9uNu6LUeYkLOKRS
7,Superhero (Heroes & Villains) [with Future & C...,0vjeOZ3Ft5jvAi9SBFJm1j
8,Too Many Nights (feat. Don Toliver & with Future),2Hh3ETdQKrmSI3QS0hme7g
9,Raindrops (Insane) [with Travis Scott],1pacwLXyRO47ka0v6LTIiY


# Create Collaboration Graph
Create a NetworkX graph from the DataFrame, filtering edges based on a minimum number of collaborations.

In [63]:
# Create NetworkX graph from the DataFrame, filtering edges based on a minimum number of collaborations
def create_graph(data_frame: pd.DataFrame, min_collaborations: int = 3) -> nx.Graph:
    # Create NetworkX graph
    G = nx.Graph()
    
    # Add edges to the graph
    for _, row in data_frame.iterrows():
        artist_1, artist_2 = row["artist_1"], row["artist_2"]
        weight = row["collaboration_count"]
        G.add_edge(artist_1, artist_2, weight=weight)

    # Filter out low-weight edges
    filtered_edges = [(u, v) for u, v, d in G.edges(data=True) if d["weight"] > min_collaborations]
    G_filtered = G.edge_subgraph(filtered_edges).copy()

    return G_filtered

# Set the minimum number of collaborations
min_collaborations = 4

# Create the filtered graph
G_filtered = create_graph(data_frame, min_collaborations)

# Display the number of nodes and edges in the filtered graph
print(f"Number of nodes: {G_filtered.number_of_nodes()}")
print(f"Number of edges: {G_filtered.number_of_edges()}")

Number of nodes: 115
Number of edges: 121


# Calculate Centrality Measures
Calculate degree centrality and betweenness centrality for the nodes in the graph.

In [64]:
# Calculate degree centrality
degree_centrality = nx.degree_centrality(G_filtered)

# Calculate betweenness centrality
betweenness_centrality = nx.betweenness_centrality(G_filtered)

# Display the top 5 nodes by degree centrality
top_degree_centrality = sorted(degree_centrality.items(), key=lambda x: x[1], reverse=True)[:5]
print("\nTop 5 nodes by degree centrality:")
for node, centrality in top_degree_centrality:
    print(f"{node}: {centrality}")

# Display the top 5 nodes by betweenness centrality
top_betweenness_centrality = sorted(betweenness_centrality.items(), key=lambda x: x[1], reverse=True)[:5]
print("\nTop 5 nodes by betweenness centrality:")
for node, centrality in top_betweenness_centrality:
    print(f"{node}: {centrality}")


Top 5 nodes by degree centrality:
The Weeknd: 0.10526315789473684
A$AP Rocky: 0.09649122807017543
Metro Boomin: 0.06140350877192982
Justin Bieber: 0.06140350877192982
Future: 0.05263157894736842

Top 5 nodes by betweenness centrality:
The Weeknd: 0.1583863789266677
David Guetta: 0.08678776587486416
Nicki Minaj: 0.08655488278217668
Metro Boomin: 0.08259587020648967
Drake: 0.06963204471355379


# Detect Communities
Use the Louvain method to detect communities within the graph and print the number of detected communities.

In [65]:
# Use the Louvain method to detect communities within the graph
partition = community_louvain.best_partition(G_filtered)

# Print the number of detected communities
num_communities = len(set(partition.values()))
print(f"Detected {num_communities} artist communities.")

Detected 24 artist communities.


# Visualize Collaboration Network
Visualize the collaboration network using Plotly, with nodes colored by community.

In [66]:
# Compute positions for NetworkX graph with increased spacing
pos = nx.spring_layout(G_filtered, seed=42, k=0.3, iterations=100)

# Extract node positions
node_x = [pos[node][0] for node in G_filtered.nodes()]
node_y = [pos[node][1] for node in G_filtered.nodes()]

# Create figure
fig = go.Figure()

# Add edges (collaborations)
edge_x, edge_y = [], []
for edge in G_filtered.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

fig.add_trace(go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines'
))

# Add nodes (artists, colored by community)
community_colors = {community: f"#{random.randint(0, 0xFFFFFF):06x}" for community in set(partition.values())}
node_colors = [community_colors[partition[node]] for node in G_filtered.nodes()]
fig.add_trace(go.Scatter(
    x=node_x, y=node_y, mode="markers",
    marker=dict(color=node_colors, size=10),
    text=[f"{node}" for node in G_filtered.nodes()],
    hoverinfo="text"
))

fig.update_layout(
    showlegend=False,
    hovermode="closest", 
    title="🎵 Artist Collaboration Network",
    width=1200,  # Set the width of the figure
    height=800   # Set the height of the figure
)
fig.show()