# **Music collaborations and lyric similarities**

## **1. Data acquisition, network and corpus creation**

**DO NOT RUN THIS PART UNLESS YOU HAVE 2.5 HOURS TO LOSE**

*The network and lyrics corpus will be downloaded from git in a later part*

### **1.1 Network creation**

The goal of the project is to uncover if a correlation exists between collaboration between singers and lyrical similarities in their songs.
We start by selecting the artists. For that, we use the hot100 csv file of the Billboard Hot 100 & more kaggle dataset
to retrieve all the singers having spend more than 5 weeks on the Billboard Hot 100 chart since 2010. They will constitute our artists basis.
We then query the data for 300 songs (or less if they have less) per singer to look for featuring that include another artist from our list.
Finally, we create the network with the singers being nodes and the collaborations being links.
We cut out the isolated nodes and keep only the largest component as it contains the vast majority of the artists.

In [7]:
import pandas as pd
import re
import math
import musicbrainzngs as mb
import json
import os
import time
from tqdm import tqdm
import networkx as nx
from networkx.readwrite import json_graph
import matplotlib.pyplot as plt
import requests
import unicodedata
import random

#### **1.1.1 Get the artists**

In [4]:
# Load Billboard dataset
df = pd.read_csv("hot100.csv")

# Convert date column if needed
df['Date'] = pd.to_datetime(df['Date'])

# Filter to 2010–2024
df = df[(df['Date'].dt.year >= 2000) & (df['Date'].dt.year <= 2022)]


  df = pd.read_csv("hot100.csv")


In [5]:
def split_artists(artist_str):
    if pd.isna(artist_str):
        return []

    s = artist_str.strip()

    # 1. First split on pipes
    parts = re.split(r'\s*\|\s*', s)

    final = []
    for p in parts:

        # 2. Replace all common separators with commas
        cleaned = re.sub(
            r'''(?ix)
            \bfeat\.?\b       |
            \bfeaturing\b     |
            \bwith\b          |
            \band\b           |
            \bx\b             |
            \bor\b            |
            &                 |
            \+
            ''',
            ',',
            p
        )

        # 3. Split on commas
        subparts = [c.strip() for c in cleaned.split(',')]

        # 4. Remove leading punctuation and empty strings
        subparts = [re.sub(r'^[^\w]+', '', a) for a in subparts]
        subparts = [a for a in subparts if a.strip()]  # remove empty/whitespace-only

        final.extend(subparts)

    return final


In [6]:
df['artist_list'] = df['Artist'].apply(split_artists)
#print(df[['Artist', 'artist_list']].head(20))

flat_data = []
for _, row in df.iterrows():
    date = row['Date']  # replace with your actual date column
    for artist in row['artist_list']:
        flat_data.append({'artist': artist, 'date': date})

flat_df = pd.DataFrame(flat_data)

# Convert date column to datetime if not already
flat_df['date'] = pd.to_datetime(flat_df['date'])

# Group by artist
artist_stats = flat_df.groupby('artist').agg(
    occurrences=('date', 'count'),
    first_appearance=('date', 'min'),
    last_appearance=('date', 'max')
).reset_index()

# Optional: filter by popularity
seed_artists = artist_stats[artist_stats['occurrences'] >= 5]

artists = seed_artists['artist'].tolist()

print(seed_artists.head(20))


print("Number of seed artists:", len(seed_artists))

                 artist  occurrences first_appearance last_appearance
2                   112          132       2000-07-26      2005-07-13
3              2 Chainz          505       2012-04-18      2022-02-02
4             2 Pistols           20       2008-02-27      2008-07-09
5             21 Savage          463       2016-09-28      2022-12-28
6              24kGoldn           53       2019-11-20      2021-08-11
9                  2Pac           78       2000-02-23      2009-02-18
10         3 Doors Down          268       2000-04-05      2011-02-16
11   30 Seconds To Mars           28       2006-08-02      2007-03-28
12                  311           20       2004-04-28      2004-09-08
13             347aidan            5       2022-06-08      2022-07-06
14                  3LW           66       2000-11-01      2002-10-30
15                3OH!3          101       2008-11-12      2012-01-18
16  4*TOWN (From Disney            5       2022-03-30      2022-04-27
17              42 D

#### **1.1.2 Get the links and create the network**

In [8]:
######### Get and verify MBIDs #########

# ---------------------------
# 1. MusicBrainz setup
# ---------------------------
mb.set_useragent(
    "MusicNLPProject",
    "1.0",
    "your_email@example.com"
)

def get_mbid(name):
    """Query MusicBrainz for the artist MBID."""
    try:
        result = mb.search_artists(artist=name, limit=1)
        if result["artist-list"]:
            return result["artist-list"][0]["id"]
    except Exception as e:
        # If MusicBrainz rate-limits you or another error occurs
        print(f"Error for {name}: {e}")
    return None


# ---------------------------
# 2. Load or create cache
# ---------------------------
CACHE_FILE = "mbid_cache.json"

if os.path.exists(CACHE_FILE):
    with open(CACHE_FILE, "r") as f:
        mbid_map = json.load(f)
else:
    mbid_map = {}

print(f"Loaded {len(mbid_map)} cached entries.")


# ---------------------------
# 3. Query missing artists
# ---------------------------
to_query = [a for a in artists if a not in mbid_map]

print(f"Need to query {len(to_query)} artists.")


for artist in tqdm(to_query):
    mbid_map[artist] = get_mbid(artist)

    # Save cache every time — safe against crashes
    with open(CACHE_FILE, "w") as f:
        json.dump(mbid_map, f, indent=2)

    time.sleep(0.5)  # MusicBrainz polite rate limit


# ---------------------------
# 4. Final filtered list
# ---------------------------
seed_artists_with_mbid = [
    artist for artist, mbid in mbid_map.items() if mbid is not None
]

print(f"\nFinal artists with MBID: {len(seed_artists_with_mbid)}")


Loaded 13 cached entries.
Need to query 2173 artists.


100%|██████████| 2173/2173 [45:59<00:00,  1.27s/it]


Final artists with MBID: 2181





In [11]:
print(f"\nFinal artists with MBID: {len(seed_artists_with_mbid)}")


Final artists with MBID: 4


In [12]:
########### Get collaboration links and build graph #########

# ---------------------------
# 1. Setup MusicBrainz
# ---------------------------
mb.set_useragent("MusicNLPProject", "1.0", "your_email@example.com")

# ---------------------------
# 2. Limit seed artists
# ---------------------------
MAX_RECORDINGS = 300

# ---------------------------
# 3. Load / create MBID cache
# ---------------------------
MBID_CACHE_FILE = "mbid_cache.json"
if os.path.exists(MBID_CACHE_FILE):
    with open(MBID_CACHE_FILE, "r") as f:
        mbid_map = json.load(f)
else:
    mbid_map = {}

# ---------------------------
# 4. Lookup MBIDs for seed artists
# ---------------------------
def get_mbid(name):
    try:
        result = mb.search_artists(artist=name, limit=1)
        if result["artist-list"]:
            return result["artist-list"][0]["id"]
    except Exception as e:
        print(f"Error fetching MBID for {name}: {e}")
    return None

to_query = [a for a in seed_artists if a not in mbid_map]
for artist in tqdm(to_query, desc="Fetching MBIDs"):
    mbid_map[artist] = get_mbid(artist)
    with open(MBID_CACHE_FILE, "w") as f:
        json.dump(mbid_map, f, indent=2)
    time.sleep(1)  # polite rate limiting

# Keep only artists with MBIDs
seed_artists_with_mbid = [a for a in artists if mbid_map.get(a)]
mbid_to_name = {v: k for k, v in mbid_map.items() if v is not None}

# ---------------------------
# 5. Load / create collaborator cache
# ---------------------------
COLLAB_CACHE_FILE = "collaborators_cache.json"
if os.path.exists(COLLAB_CACHE_FILE):
    with open(COLLAB_CACHE_FILE, "r") as f:
        collaborator_cache = json.load(f)
else:
    collaborator_cache = {}

# ---------------------------
# 6. Fetch collaborators function
# ---------------------------
def get_collaborators(artist_mbid, max_recordings=300):
    if artist_mbid in collaborator_cache:
        return collaborator_cache[artist_mbid]

    collaborators = set()
    try:
        limit = 100
        offset = 0
        total_fetched = 0

        while total_fetched < max_recordings:
            result = mb.browse_recordings(
                artist=artist_mbid,
                includes=["artist-credits"],
                limit=limit,
                offset=offset
            )

            recordings = result.get("recording-list", [])
            if not recordings:
                break

            for rec in recordings:
                credits = rec.get("artist-credit", [])
                track_artists = [
                    ac["artist"]["name"]
                    for ac in credits
                    if "artist" in ac
                ]
                for name in track_artists:
                    if name != mbid_to_name.get(artist_mbid, None):
                        collaborators.add(name)

            fetched = len(recordings)
            total_fetched += fetched
            offset += fetched
            if fetched < limit:
                break

    except Exception as e:
        print(f"Error fetching collaborators for {artist_mbid}: {e}")

    collaborator_cache[artist_mbid] = list(collaborators)
    with open(COLLAB_CACHE_FILE, "w") as f:
        json.dump(collaborator_cache, f, indent=2)

    return list(collaborators)

# ---------------------------
# 7. Build the seed-only collaboration graph
# ---------------------------
G = nx.DiGraph()
G.add_nodes_from(seed_artists_with_mbid)
seed_artist_set = set(seed_artists_with_mbid)

for artist in tqdm(seed_artists_with_mbid, desc="Building seed-only graph"):
    mbid = mbid_map[artist]
    collabs = get_collaborators(mbid, max_recordings=MAX_RECORDINGS)

    # Only keep collaborators that are in the seed list
    collabs_in_seed = [c for c in collabs if c in seed_artist_set]

    for c in collabs_in_seed:
        G.add_edge(artist, c)

    time.sleep(0.5)  # polite rate limiting


Fetching MBIDs: 0it [00:00, ?it/s]
Building seed-only graph:  45%|████▌     | 989/2181 [40:04<41:55,  2.11s/it]

Error fetching collaborators for 8b0f05ce-354e-4121-9e0b-8b4732ea844f: string indices must be integers, not 'str'


Building seed-only graph:  69%|██████▉   | 1513/2181 [1:00:21<16:31,  1.48s/it]

Error fetching collaborators for 186e216a-2f8a-41a1-935f-8e30c018a8fe: string indices must be integers, not 'str'


Building seed-only graph:  71%|███████   | 1542/2181 [1:01:25<15:53,  1.49s/it]

Error fetching collaborators for 411d0b59-1f96-430d-acaf-39261e5057a1: string indices must be integers, not 'str'


Building seed-only graph: 100%|██████████| 2181/2181 [1:23:49<00:00,  2.31s/it]


In [15]:
######### Clean the graph and analyze components #########

# Remove isolated nodes
G.remove_nodes_from(list(nx.isolates(G)))
print("Number of nodes after removing isolated nodes:", G.number_of_nodes())

#
# H=G.copy()
# G=G.to_undirected()

# Get all connected components
components = list(nx.connected_components(G))
print("Number of connected components:", len(components))

# Find the largest component
largest_cc = max(components, key=len)

# Create a subgraph with only the largest component
G_largest = G.subgraph(largest_cc).copy()

print("Number of artists in largest component:", G_largest.number_of_nodes())
print("Number of collaborations in largest component:", G_largest.number_of_edges())

# Remove all self-loops
G_largest.remove_edges_from(nx.selfloop_edges(G_largest))

print("Number of edges after removing self-loops:", G_largest.number_of_edges())


Number of nodes after removing isolated nodes: 1928
Number of connected components: 11
Number of artists in largest component: 1902
Number of collaborations in largest component: 20897
Number of edges after removing self-loops: 20855


In [16]:
# GraphML
#nx.write_graphml(G_largest, "music_collaboration_seed_largest.graphml")

# CSV edge list
#nx.write_edgelist(G_largest, "music_collab_seed_largest_edges.csv", delimiter=",")

# JSON (node-link format)

data = json_graph.node_link_data(G_largest)
with open("collaborations_network2.json", "w") as f:
    json.dump(data, f, indent=2)

print("Largest component graph saved!")


Largest component graph saved!


The default value will be `edges="edges" in NetworkX 3.6.


  nx.node_link_data(G, edges="links") to preserve current behavior, or
  nx.node_link_data(G, edges="edges") for forward compatibility.


### **1.2 Corpus preparation**

For each artist that is part of our graph, we want to constitute a corpus of lyrics. For that we get the Genius Song Lyrics dataset from kaggle and keep only the songs belonging to the singers in the network. We then create a folder where we store the concatenated lyrics for each artist in a text file.

In [3]:
# Get list of artists in the largest component
artists = list(G_largest.nodes)

# Get the dataframe of lyrics
lyrics_df = pd.read_csv("song_lyrics.csv")


#### **1.2.1 Name cleaning**

We noticed that some of the names in the network are not written the same way as in the lyrics dataset. Notably, names with accentuated letters are not handled in the same fashion. Let's take Beyoncé as an example: in the network, her name is written Beyonce while in the dataset, her name is Beyonc. However, we noticed that the dataset kept the name with the accent in the feature column. We are going the use that to match those names. We also normalise names for example to avoid mismatches on capitalized letters.

In [131]:
def remove_accented_letters(text):
    """
    Remove any base letter that had a combining accent in the NFD decomposition.
    'Beyoncé' -> 'Beyonc'
    'Arcángel' -> 'Arcngel'
    """
    if not isinstance(text, str):
        return text
    # NFD: letters + combining marks
    norm = unicodedata.normalize("NFD", text)
    out_chars = []
    i = 0
    while i < len(norm):
        ch = norm[i]
        # If this character is a letter (not combining), look ahead for combining marks.
        if unicodedata.combining(ch):
            # skip stray combining marks (shouldn't happen often)
            i += 1
            continue

        # collect base char
        base = ch
        # collect following combining marks
        j = i + 1
        combining_seq = []
        while j < len(norm) and unicodedata.combining(norm[j]):
            combining_seq.append(norm[j])
            j += 1

        # If there were combining marks, that means base had accents:
        if combining_seq:
            # drop the base (i.e. remove accented letter entirely)
            pass
        else:
            out_chars.append(base)
        i = j
    return "".join(out_chars)



In [132]:
def deaccent_keep_letters(text):
    """
    Removes accents but keeps letters.
    'Beyoncé' -> 'Beyonce'
    'Arcángel' -> 'Arcangel'
    """
    if not isinstance(text, str):
        return text
    return ''.join(
        c for c in unicodedata.normalize('NFD', text)
        if not unicodedata.combining(c)
    )


In [133]:
def feature_matches_artist(artist_name, feat_name):
    """
    artist_name = Kaggle dirty name ('Beyonc', 'Arcngel')
    feat_name   = Feature name with accents ('Beyoncé', 'Arcángel')
    """
    artist_clean = artist_name.lower().strip()
    feat_removed = remove_accented_letters(feat_name).lower().strip()
    return artist_clean == feat_removed


In [134]:
def parse_features(raw):
    if pd.isna(raw):
        return []
    
    s = raw.strip()

    # Remove outer { }
    if s.startswith("{") and s.endswith("}"):
        s = s[1:-1]

    if s == "":
        return []

    # Split on commas that are NOT inside quotes
    parts = re.split(r',(?=(?:[^"]*"[^"]*")*[^"]*$)', s)

    cleaned = []
    for p in parts:
        p = p.strip()

        # Remove surrounding quotes if present
        if p.startswith('"') and p.endswith('"'):
            p = p[1:-1]

        # Fix escaped quotes like Cam\'ron
        p = p.replace("\\'", "'").strip()

        p = p.replace("\\\'", "'").strip()

        if p:
            cleaned.append(p)

    return cleaned


In [135]:
lyrics_df["features_parsed"] = lyrics_df["features"].apply(parse_features)

lyrics_df["feat_count"] = lyrics_df["features_parsed"].apply(len)
df_onefeat = lyrics_df[lyrics_df["feat_count"] == 1]

df_onefeat["single_feat"] = df_onefeat["features_parsed"].str[0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_onefeat["single_feat"] = df_onefeat["features_parsed"].str[0]


In [136]:
df_onefeat["feat_deaccent"] = df_onefeat["single_feat"].apply(remove_accented_letters)
df_onefeat["new_name"] = df_onefeat["single_feat"].apply(deaccent_keep_letters)

def matches(a, b):
    a = a.lower()
    b = b.lower()
    return a == b

df_onefeat["match"] = df_onefeat.apply(
    lambda row: matches(row["artist"], row["feat_deaccent"]),
    axis=1
)
df_matches = df_onefeat[df_onefeat["match"]]

cleaned_names = (
    df_matches.groupby("artist")["new_name"]
    .first()
    .to_dict()
)
print(cleaned_names)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_onefeat["feat_deaccent"] = df_onefeat["single_feat"].apply(remove_accented_letters)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_onefeat["new_name"] = df_onefeat["single_feat"].apply(deaccent_keep_letters)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_onefeat["match"] = df_onefeat.apply



In [137]:
lyrics_df['cleaned_name'] = lyrics_df['artist'].map(cleaned_names).fillna(lyrics_df['artist'])
lyrics_df["cleaned_name_lower"] = lyrics_df["cleaned_name"].astype(str).str.lower()

#### **1.2.2 Creation of the lyrics corpus**

We clean and concatenate the lyrics for each artist in the network to have a dictionary where the key is the artist name and the lyrics are the values. We then store this dictionary in json files. There are still 214 singers that couldn't be matched with lyrics (we will remove them later). 

In [144]:
df_artists = lyrics_df[lyrics_df['cleaned_name_lower'].isin([a.lower() for a in artists])]

df_artists['network_name']= df_artists['cleaned_name_lower'].apply(
    lambda x: next((a for a in artists if a.lower() == x), x)
)

df_artists.drop(columns=['cleaned_name_lower', 'feat_count', 'cleaned_name', 'language_cld3', 'language_ft'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_artists['network_name']= df_artists['cleaned_name_lower'].apply(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_artists.drop(columns=['cleaned_name_lower', 'feat_count', 'cleaned_name', 'language_cld3', 'language_ft'], inplace=True)


In [145]:
df_artists.to_csv("lyrics_top100_artists.csv", index=False)

In [149]:
missing_artists = set(artists) - set(df_artists['network_name'].unique())
print("Number of missing artists from lyrics dataset:", len(missing_artists))
print("Percentage missing:", round(len(missing_artists)/len(artists)*100, 2), '%')

Number of missing artists from lyrics dataset: 214
Percentage missing: 11.25 %


In [3]:
df_artists=pd.read_csv("lyrics_top100_artists.csv")

In [4]:
def clean_lyrics(text):
    if not isinstance(text, str):
        return ""
    
    # Remove [Intro], [Verse: Artist], [Chorus], etc.
    text = re.sub(r"\[.*?\]", " ", text)

    # Remove escaped sequences
    text = text.replace("\\n", " ")   # escaped newline
    text = text.replace("\n", " ")    # actual newline
    text = text.replace("\\t", " ")   # escaped tab
    text = text.replace("\t", " ")    # actual tab
    
    # Remove escaped quotes
    text = text.replace('\\"', '"')   # remove \"
    text = text.replace("\\'", "'")   # remove \'
    
    # Remove any remaining double-backslashes (\\)
    text = text.replace("\\\\", " ")
    
    # Collapse multiple spaces
    text = re.sub(r"\s+", " ", text)

    return text.strip()


In [5]:
lyrics_by_node = {}

for node in df_artists['network_name'].unique():
    sub = df_artists[df_artists['network_name'] == node]
    
    # Apply cleaning to each song
    cleaned_texts = sub['lyrics'].fillna("").apply(clean_lyrics)
    
    # Concatenate all songs for the artist
    full_text = " ".join(cleaned_texts)
    
    lyrics_by_node[node] = full_text


In [8]:
def split_corpus(corpus_dict, n_parts=3, prefix="lyrics_part"):
    artists = list(corpus_dict.keys())
    random.shuffle(artists)
    total = len(artists)
    chunk_size = math.ceil(total / n_parts)

    for i in range(n_parts):
        part_artists = artists[i * chunk_size : (i + 1) * chunk_size]
        part_dict = {a: corpus_dict[a] for a in part_artists}

        filename = f"{prefix}{i+1}.json"
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(part_dict, f, ensure_ascii=False, indent=2)

        print(f"Saved {filename} ({len(part_dict)} artists)")

split_corpus(lyrics_by_node, n_parts=4)

Saved lyrics_part1.json (422 artists)
Saved lyrics_part2.json (422 artists)
Saved lyrics_part3.json (422 artists)
Saved lyrics_part4.json (422 artists)


## **2. Network analysis**

In [None]:
url = "https://github.com/agathedAC/music_collaborations/raw/main/collaborations_network2.json"

response = requests.get(url)
data = response.json()
G_largest = json_graph.node_link_graph(data)

print("Loaded graph with", G.number_of_nodes(), "nodes and", G.number_of_edges(), "edges")

## **3. Lyrics analysis**

In [None]:
def load_multi_json(urls):
    merged = {}
    for url in urls:
        data = requests.get(url).json()
        merged.update(data)
    return merged

urls = [
    "https://github.com/agathedAC/music_collaborations/raw/main/lyrics_part1.json",
    "https://github.com/agathedAC/music_collaborations/raw/main/lyrics_part2.json",
    "https://github.com/agathedAC/music_collaborations/raw/main/lyrics_part3.json",
    "https://github.com/agathedAC/music_collaborations/raw/main/lyrics_part4.json"
]

lyrics_corpus = load_multi_json(urls)

print("Loaded", len(lyrics_corpus), "artists")

Loaded 1688 artists
