# Artist-Artist relationships

By Alejandro Fernández Sánchez

## Setting up the connection

In [1]:
# Just in case you're the host and it's not already started
!service postgresql start

In [2]:
# Imports
import psycopg2
import pandas as pd
from sqlalchemy import create_engine
import os
from dotenv import load_dotenv
import json
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
load_dotenv()

True

In [3]:
DB_NAME = os.getenv("DB_NAME")
DB_HOST = os.getenv("DB_HOST")
DB_USER = os.getenv("DB_USER")
DB_PASS = os.getenv("DB_PASS")
DB_PORT = os.getenv("DB_PORT")

In [4]:
# Establishing a connection via postgre's python driver
conn = psycopg2.connect(
    database=DB_NAME,
    host=DB_HOST,
    user=DB_USER,
    password=DB_PASS,
    port=DB_PORT
)
conn

<connection object at 0x77deeb101440; dsn: 'user=musicbrainz password=xxx dbname=musicbrainz_db host=localhost port=5432', closed: 0>

In [5]:
cursor = conn.cursor()  # Helps with querying without memory allocation
cursor

<cursor object at 0x77deeb0376a0; closed: 0>

In [6]:
# Helper function
def query_with_cursor(c, q, column_names=False, head=False):
    conn.rollback()  # This is needed if a previous query fails
    c.execute(q)
    if column_names:
        print([col[0] for col in c.description])
    count = 0
    for r in c:
        print(r)
        count += 1
        if head and count == 10:
            break

In [7]:
# Used for saving results to pandas dataframes
engine_url = f"postgresql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(engine_url, pool_size=10, max_overflow=0)
engine

Engine(postgresql://musicbrainz:***@localhost:5432/musicbrainz_db)

## Types of artists relationships

How many are there?

In [8]:
query_with_cursor(
    cursor,
    "SELECT COUNT(*) FROM l_artist_artist"
)

(711661,)


Seems like a fairly big number, let's check how they relate to each other.

In [9]:
query =\
"""
SELECT id, name, description, long_link_phrase
FROM link_type
WHERE entity_type0 = 'artist'
  AND entity_type1 = 'artist'
ORDER BY id
"""
pd.read_sql_query(query, engine)

Unnamed: 0,id,name,description,long_link_phrase
0,102,collaboration,"This is used to specify that an <a href=""/doc/...",collaborated {minor:minorly} {additional:addit...
1,103,member of band,This indicates a person is a member of a group.,is/was {additional:an|a} {additional} {origina...
2,104,supporting musician,Indicates an artist doing long-time instrument...,is/was a supporting artist for
3,105,instrumental supporting musician,Indicates a musician doing long-time instrumen...,does/did {instrument} support for
4,106,musical relationships,,musical relationship
5,107,vocal supporting musician,Indicates a musician doing long-time vocal sup...,does/did {vocal:%|vocals} support for
6,108,is person,This links an artist's performance name (a sta...,performs as
7,109,parent,Indicates a parent-child relationship.,is the {step}parent of
8,110,sibling,This links two siblings (brothers or sisters).,has {half:half-}{step}sibling
9,111,married,This links artists who were married.,is/was married to


It seems like we have 22 possible relationships. They are all important, but there are three that differ from the rest.

Ids 1079 and 108. As I understand them, we should only have one entity of the same artist in the final CSVs. I'm going to store all occurrences of an artist in a list and stay with the most used instance.

Id 292. This relationship links a voice actor with their character.

## Artist dataset

We now have all the information needed to tackle the task of generating an artist dataset. Using relationships 108, 292, 1079 we'll create a list of known ids and names for each artist and store them following the JSONL convention.

The first subtask is to find pairs of entities that represents the same artist.

In [10]:
query =\
f"""
SELECT a0.id AS a0_id, a0.name AS a0_name, a1.id AS a1_id, a1.name AS a1_name
FROM l_artist_artist laa
JOIN artist a0 ON a0.id = laa.entity0
JOIN artist a1 ON a1.id = laa.entity1
WHERE laa.link IN (
    SELECT id
    FROM link
    WHERE link_type IN (1079, 108, 292)
)
"""
pairs = pd.read_sql_query(query, engine, dtype=str)
pairs.drop_duplicates(inplace=True)
pairs

Unnamed: 0,a0_id,a0_name,a1_id,a1_name
0,510355,Tom Salta,510353,Atlas Plug
1,515380,Sara Nicholas,512604,DJ Ginger Snapp
2,1816108,Alex Bilowitz,1303285,Alex Bilo
3,285421,Edward Upton,839935,BBII
4,567288,Charles Hilton Jr.,567286,CJ
...,...,...,...,...
75587,2069733,Luca Tommasini,2850597,Asylum Connection
75588,2069733,Luca Tommasini,2309086,Shadow Echo Canyon
75589,359596,Jaron Martinez,127171,Jaron Inc.
75590,2329220,Ruslan Troknyuk,2346401,Prog Pink Massive


We now separate each artist in their own list of entities (id and name).

In [11]:
# I've iterated though some algorithms that I came up with and this is the fastest one (that works)
# This algorithm groups all the different (same) artists in a list
seen_dict = dict()
artists_lists = list()
last_idx = -1
for _, row in pairs.iterrows():
    artist0 = {"id": row["a0_id"], "name": row["a0_name"]}
    artist1 = {"id": row["a1_id"], "name": row["a1_name"]}
    if artist0["id"] in seen_dict:
        if artist1["id"] in seen_dict:
            continue
        artist0_idx = seen_dict[artist0["id"]]
        artists_lists[artist0_idx].append(artist1)
        seen_dict[artist1["id"]] = artist0_idx
    elif artist1["id"] in seen_dict:
        artist1_idx = seen_dict[artist1["id"]]
        artists_lists[artist1_idx].append(artist0)
        seen_dict[artist0["id"]] = artist1_idx
    else:
        last_idx += 1
        artists_lists.append([artist0, artist1])
        seen_dict[artist0["id"]] = last_idx
        seen_dict[artist1["id"]] = last_idx
artists_lists[:5]

[[{'id': '510355', 'name': 'Tom Salta'},
  {'id': '510353', 'name': 'Atlas Plug'}],
 [{'id': '515380', 'name': 'Sara Nicholas'},
  {'id': '512604', 'name': 'DJ Ginger Snapp'}],
 [{'id': '1816108', 'name': 'Alex Bilowitz'},
  {'id': '1303285', 'name': 'Alex Bilo'}],
 [{'id': '285421', 'name': 'Edward Upton'},
  {'id': '839935', 'name': 'BBII'},
  {'id': '167634', 'name': 'EDMX'},
  {'id': '119129', 'name': 'Bass Potato'},
  {'id': '329054', 'name': 'David Michael Cross'},
  {'id': '64070', 'name': 'DMX Krew'},
  {'id': '105617', 'name': 'Computor Rockers'},
  {'id': '329053', 'name': 'Ed DMX'},
  {'id': '204935', 'name': 'Michael Knight'},
  {'id': '690681', 'name': '101 Force'},
  {'id': '1043503', 'name': 'Asylum Seekers'}],
 [{'id': '567288', 'name': 'Charles Hilton Jr.'},
  {'id': '567286', 'name': 'CJ'}]]

We now need to find which artist instance is the most common. For that we're going to be using the `tracks_no_va.csv` dataset.

In [12]:
tracks = pd.read_csv("../data/tracks_no_va.csv", dtype=str)
tracks.fillna("", inplace=True)
tracks

Unnamed: 0,name,date,year,month,artist_count,a0_id,a0_name,tags,a1_id,a1_name,a2_id,a2_name,a3_id,a3_name,a4_id,a4_name
0,*~ƒint_vœr!~*,201612,2016,12,1,2808021,Julius Androide,,,,,,,,,
1,roots of resonance,202410,2024,10,1,1567845,Kebun,,,,,,,,,
2,roots of resistance,202404,2024,4,1,414004,Darkness,,,,,,,,,
3,roots of renewal lie in native soil,201802,2018,2,1,1998753,:Dieux Des Cimetières:,"75, 148438, 1349, 47463, 77158, 2063, 1421, 16...",,,,,,,,
4,roots of regret,201104,2011,4,1,999272,Krisu,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24317061,half a dozen mc's,201113,2011,13,5,1154205,Ricko,,860519,Sus,860517,Messiahbolical,336309,Pyrelli,162646,Funky DL
24317062,half awake,201911,2019,11,5,2061657,Niaque,,1574533,Philipp Brämswig,1257597,David Helm,1169590,Stefan Karl Schmid,1167410,Fabian Arends
24317063,half dead,201208,2012,8,5,981802,Stu Bangas,235,791534,Vanderslice,721231,Roc Marciano,121498,Apathy,56162,Planet Asia
24317064,hallelujah,202110,2021,10,5,2249779,Colm R. McGuinness,,1546928,Thomas Sanders,1490678,Dan Vasc,1230974,Jonathan Young,1090885,Caleb Hyles


In [13]:
len(tracks)

24317066

In [14]:
id_columns = [f"a{i}_id" for i in range(5)]
artist_freqs = (tracks[id_columns].melt().groupby(by=["value"]).count()).to_dict(index="value")["variable"]

for i, (k, v) in enumerate(artist_freqs.items()):
    print(f"{k}: {v}")
    if i == 25:
        break

: 93797954
10: 48
1000: 148
100000: 1
1000000: 2
1000001: 1
1000002: 1
1000003: 3
1000004: 2
1000005: 1
1000006: 1
1000007: 5
1000008: 2
1000010: 3
1000011: 1
1000012: 1
1000013: 1
1000014: 1
1000015: 1
1000016: 1
1000017: 3
1000018: 1
1000019: 2
100002: 5
1000020: 3
1000021: 3


We are now ready to sort the lists so that the most common instance is the first element of the list.

In [15]:
artist_freqs.get("418540")

457

In [16]:
# First element of the list will be the "main" instance of the artist
artists_lists = list(map(
    lambda artist_list: sorted(artist_list, key=lambda artist: artist_freqs.get(artist["id"], 0), reverse=True),
    artists_lists
))
artists_lists[:5]

[[{'id': '510355', 'name': 'Tom Salta'},
  {'id': '510353', 'name': 'Atlas Plug'}],
 [{'id': '515380', 'name': 'Sara Nicholas'},
  {'id': '512604', 'name': 'DJ Ginger Snapp'}],
 [{'id': '1816108', 'name': 'Alex Bilowitz'},
  {'id': '1303285', 'name': 'Alex Bilo'}],
 [{'id': '64070', 'name': 'DMX Krew'},
  {'id': '329053', 'name': 'Ed DMX'},
  {'id': '105617', 'name': 'Computor Rockers'},
  {'id': '167634', 'name': 'EDMX'},
  {'id': '329054', 'name': 'David Michael Cross'},
  {'id': '204935', 'name': 'Michael Knight'},
  {'id': '690681', 'name': '101 Force'},
  {'id': '839935', 'name': 'BBII'},
  {'id': '1043503', 'name': 'Asylum Seekers'},
  {'id': '119129', 'name': 'Bass Potato'},
  {'id': '285421', 'name': 'Edward Upton'}],
 [{'id': '567286', 'name': 'CJ'},
  {'id': '567288', 'name': 'Charles Hilton Jr.'}]]

So far we've only handled the "multiple instances" artists. The following block of code adds the "single instance" artists to the list. It doesn't matter if we do this after the sort, these lists are going to have only one instance of the artist after all.

In [17]:
# Has to be a better way to do this
# WARNING: several minutes execution time
seen_set = set(seen_dict.keys())
seen_set.add("")
for row_index in range(len(tracks)):
    artists_in_row = [
        {
            "id": tracks.loc[row_index, f"a{i}_id"],
            "name": tracks.loc[row_index, f"a{i}_name"],
        }
        for i in range(5) if tracks.loc[row_index, f"a{i}_id"] not in seen_set
    ]
    if len(artists_in_row) > 0:
        artists_lists.extend([[artist] for artist in artists_in_row])
        seen_set.update(artist["id"] for artist in artists_in_row)

artists_lists[-5:]

[[{'id': '2327895', 'name': 'Sugandha Lad'}],
 [{'id': '2327894', 'name': 'Keya Dutta'}],
 [{'id': '2327893', 'name': 'Sujata Patvi'}],
 [{'id': '2327892', 'name': 'Madhvi Shrivastav'}],
 [{'id': '860517', 'name': 'Messiahbolical'}]]

At this point we have a list of lists, the following will transform what we have into a list of dictionaries, ready to be serialized.

In [18]:
artists = list()
for artists_list in artists_lists:
    current_artist = dict()
    current_artist["main_id"] = artists_list[0]["id"]
    current_artist["known_ids"] = [artist["id"] for artist in artists_list]
    current_artist["known_names"] = [artist["name"] for artist in artists_list]
    artists.append(current_artist)
len(artists)

1488795

In [19]:
artists[:5]

[{'main_id': '510355',
  'known_ids': ['510355', '510353'],
  'known_names': ['Tom Salta', 'Atlas Plug']},
 {'main_id': '515380',
  'known_ids': ['515380', '512604'],
  'known_names': ['Sara Nicholas', 'DJ Ginger Snapp']},
 {'main_id': '1816108',
  'known_ids': ['1816108', '1303285'],
  'known_names': ['Alex Bilowitz', 'Alex Bilo']},
 {'main_id': '64070',
  'known_ids': ['64070',
   '329053',
   '105617',
   '167634',
   '329054',
   '204935',
   '690681',
   '839935',
   '1043503',
   '119129',
   '285421'],
  'known_names': ['DMX Krew',
   'Ed DMX',
   'Computor Rockers',
   'EDMX',
   'David Michael Cross',
   'Michael Knight',
   '101 Force',
   'BBII',
   'Asylum Seekers',
   'Bass Potato',
   'Edward Upton']},
 {'main_id': '567286',
  'known_ids': ['567286', '567288'],
  'known_names': ['CJ', 'Charles Hilton Jr.']}]

In [20]:
artists[-5:]

[{'main_id': '2327895',
  'known_ids': ['2327895'],
  'known_names': ['Sugandha Lad']},
 {'main_id': '2327894',
  'known_ids': ['2327894'],
  'known_names': ['Keya Dutta']},
 {'main_id': '2327893',
  'known_ids': ['2327893'],
  'known_names': ['Sujata Patvi']},
 {'main_id': '2327892',
  'known_ids': ['2327892'],
  'known_names': ['Madhvi Shrivastav']},
 {'main_id': '860517',
  'known_ids': ['860517'],
  'known_names': ['Messiahbolical']}]

The only thing left is to actually serialize what we have.

In [21]:
jsons = [json.dumps(artist, ensure_ascii=False) for artist in artists]
unique_jsons = list(set(jsons))

with open("../data/artists.jsonl", "w", encoding="utf-8") as out_file:
    for unique_json in unique_jsons:
        out_file.write(unique_json + "\n")

In [22]:
!wc -l ../data/artists.jsonl

1488795 ../data/artists.jsonl


We can now retrieve the data every time we want.

In [23]:
with open("../data/artists.jsonl", "r", encoding="utf-8") as in_file:
    artist_data = [json.loads(line) for line in in_file]
print(artist_data[:5])

[{'main_id': '1930932', 'known_ids': ['1930932'], 'known_names': ['Chad Eby']}, {'main_id': '675121', 'known_ids': ['675121'], 'known_names': ['Gandy']}, {'main_id': '2005087', 'known_ids': ['2005087'], 'known_names': ['ilonqueen']}, {'main_id': '496172', 'known_ids': ['496172'], 'known_names': ['Marcio Montarroyos']}, {'main_id': '614797', 'known_ids': ['614797', '805624', '615443'], 'known_names': ['Matthias Keller', 'Solomatt', 'Kemal Wegda']}]


## Relationships dataset

The next step is to save the relationships between the artists in a CSV file. Let's start collecting the different relationships.

In [24]:
link_types = pd.read_sql_query("SELECT DISTINCT id FROM link_type  WHERE entity_type0 = 'artist' AND entity_type1 = 'artist'", engine)
relationships = pd.DataFrame({
    'id0': [],
    'name0': [],
    'id1': [],
    'name1': [],
    'relationship_type': [],
})
for link_type in filter(lambda lt: lt not in (108, 292, 1079), link_types.id):
    query =\
f"""
SELECT a0.id AS id0, a0.name AS name0, a1.id AS id1, a1.name AS name1, {link_type} AS relationship_type
FROM l_artist_artist laa
JOIN artist a0 ON a0.id = laa.entity0
JOIN artist a1 ON a1.id = laa.entity1
WHERE laa.link IN (
    SELECT id
    FROM link
    WHERE link_type = {link_type}
);
"""
    result = pd.read_sql_query(query, engine, dtype=str)
    if result.empty:
        continue
    relationships = pd.concat([relationships, result])
del result
relationships.drop_duplicates(inplace=True)
relationships

Unnamed: 0,id0,name0,id1,name1,relationship_type
0,448102,Xoel López,248824,Lovely Luna,102
1,359330,Miley Cyrus,686291,Helping Haiti,102
2,129154,Jay-J,472106,Jay-J & Macari,102
3,267439,Andrew Macari,472106,Jay-J & Macari,102
4,212204,James Blunt,686291,Helping Haiti,102
...,...,...,...,...,...
645,2815354,Johann Sebastian Mastropiero,2327,Johann Sebastian Bach,973
646,2815356,Johann Sebastian Masana,2327,Johann Sebastian Bach,973
647,2815356,Johann Sebastian Masana,1506534,Gerardo Masana,973
648,2815376,I Musicisti,388265,I Musici,973


Now we filter the relationships so that we don't have artists that don't concern us.

In [25]:
mask = relationships[["id0", "id1"]].isin(artist_freqs.keys()).all(axis=1)
filtered_relationships = relationships[mask]
filtered_relationships

Unnamed: 0,id0,name0,id1,name1,relationship_type
0,448102,Xoel López,248824,Lovely Luna,102
1,359330,Miley Cyrus,686291,Helping Haiti,102
2,129154,Jay-J,472106,Jay-J & Macari,102
3,267439,Andrew Macari,472106,Jay-J & Macari,102
4,212204,James Blunt,686291,Helping Haiti,102
...,...,...,...,...,...
635,2762905,Joe Tex and His X Class Mates,5188,Joe Tex,973
638,1281154,Lachlan Bryan & The Wildes,1016410,Lachlan Bryan,973
642,2775215,Felysrator,1191913,Feryquitous,973
643,405197,Dananananaykroyd,342839,Dan Aykroyd,973


Now, we have the relationships, but we've made an artist dataset that will help in the task of replacing the non-important id with the main id for each artist. For this task a dictionary will be created.

In [26]:
changes_dict = dict()
for artist in artist_data:
    if len(artist["known_ids"]) > 1:
        for known_id in artist["known_ids"]:
            if known_id != artist["main_id"]:
                changes_dict[known_id] = artist["main_id"]
len(changes_dict)

74629

These are the relationships that we need to modify.

In [27]:
mask = filtered_relationships[["id0", "id1"]].isin(changes_dict.keys()).any(axis=1)
filtered_relationships.loc[mask]

Unnamed: 0,id0,name0,id1,name1,relationship_type
1,359330,Miley Cyrus,686291,Helping Haiti,102
36,116653,David Harrow,79824,Planet 4 Folk Quartet,102
43,268120,Can Oral,272765,Twizzler,102
75,150262,Mike Fabulous,249659,Fly My Pretties,102
84,459193,Solale,456266,Lindstrøm & Solale,102
...,...,...,...,...,...
575,2280996,MACINTOSH IACON,923360,Macintosh Plus,973
576,2281030,JUICE PLUS,923360,Macintosh Plus,973
579,2707509,homura for android,1558612,暁美ほむら,973
597,2629185,月見英子,2316189,月見英子,973


In [28]:
filtered_relationships.loc[mask, ["id0", "id1"]] = filtered_relationships.loc[mask, ["id0", "id1"]].replace(changes_dict)

filtered_relationships.loc[mask]

Unnamed: 0,id0,name0,id1,name1,relationship_type
1,418540,Miley Cyrus,686291,Helping Haiti,102
36,8151,David Harrow,79824,Planet 4 Folk Quartet,102
43,42684,Can Oral,272765,Twizzler,102
75,782296,Mike Fabulous,249659,Fly My Pretties,102
84,459186,Solale,456266,Lindstrøm & Solale,102
...,...,...,...,...,...
575,2280996,MACINTOSH IACON,860871,Macintosh Plus,973
576,2281030,JUICE PLUS,860871,Macintosh Plus,973
579,2707509,homura for android,353368,暁美ほむら,973
597,1577273,月見英子,1423112,月見英子,973


In [29]:
changes_dict["359330"]

'418540'

We can make sure that there are no cyclic references this way (hoping for a False return):

In [30]:
filtered_relationships[["id0", "id1"]].isin(changes_dict.keys()).any(axis=1).any()

False

Now we can finally save our relationships CSV.

In [31]:
filtered_relationships.to_csv("../data/relationships.csv", index=False)

In [32]:
!wc -l ../data/relationships.csv

213663 ../data/relationships.csv


## Tags

We now can get the tags from the database.

In [33]:
query = """
SELECT artist, STRING_AGG(tag::VARCHAR, ', ') as tags
FROM artist_tag
GROUP BY artist;
"""
tags = pd.read_sql_query(query, engine, dtype=str)
tags = tags[tags["artist"].isin([artist["id"] for artists_list in artists_lists for artist in artists_list])]
tags

Unnamed: 0,artist,tags
1,4,"1, 7, 11, 12, 20, 57, 58, 171, 237, 280, 402, ..."
2,6,"11, 71, 92, 171, 237, 349, 1055, 1072, 1391"
3,7,"98, 121, 379, 72115"
4,9,"10, 11, 12, 58, 77, 559, 709, 1282, 1302, 1498..."
5,10,"111, 1661"
...,...,...
210994,2858077,"19, 1670, 1758, 252782"
210995,2858085,77
210996,2858087,166
210997,2858096,"235, 40255, 56565"


In [34]:
mask = tags["artist"].isin(changes_dict.keys())
tags[mask]

Unnamed: 0,artist,tags
3,7,"98, 121, 379, 72115"
294,574,"10, 11, 206, 545"
983,2189,77
993,2214,66
1025,2305,"675, 1090, 5916"
...,...,...
210878,2855368,"172, 1259, 3663, 7363"
210880,2855373,"172, 1259, 7363"
210931,2856354,"133, 761, 1519, 147653"
210952,2856649,237367


In [35]:
tags.loc[mask, "artist"] = tags.loc[mask, "artist"].map(changes_dict)

In [36]:
tags[mask]

Unnamed: 0,artist,tags
3,21685,"98, 121, 379, 72115"
294,9916,"10, 11, 206, 545"
983,1054,77
993,273166,66
1025,306892,"675, 1090, 5916"
...,...,...
210878,2855369,"172, 1259, 3663, 7363"
210880,2855369,"172, 1259, 7363"
210931,2856358,"133, 761, 1519, 147653"
210952,2415290,237367


We can now save the results for the future.

In [37]:
tags.to_csv("../data/artist_tags.csv", index=False)

In [38]:
!wc -l ../data/artist_tags.csv

183415 ../data/artist_tags.csv


In [39]:
# This cell helps with memory issues coming up with the tracks situation
# Dask is great, don't get me wrong, but I'm more comfortable with this
del tags
del artist_freqs
del artists_lists
del pairs
del seen_dict
del jsons
del unique_json
del filtered_relationships
del relationships
del artist_data
del artists
del seen_set
del tracks

## Tracks dataset with main IDs

Now that we're here, why not do the same with the tracks dataset, which we have already in memory.

In [40]:
def replace_helper(df, cs, cd):
    mask = df[cs].isin(cd.keys()).any(axis=1)
    for c in cs:
        df.loc[mask, c] = df.loc[mask, c].map(cd)
    return df

with ProgressBar():
    tracks = dd.read_csv("../data/tracks_no_va.csv", dtype=str, na_values="")
    tracks = tracks.map_partitions(replace_helper, id_columns, changes_dict)
    tracks.to_csv("../data/tracks_no_va_merged.csv", index=False, single_file=True)

del tracks

[########################################] | 100% Completed | 148.09 s


In [41]:
!wc -l ../data/tracks_no_va_merged.csv

24317067 ../data/tracks_no_va_merged.csv


## Cleanup

In [42]:
engine.dispose()
conn.close()

In [43]:
!service postgresql stop

[0;1;31mFailed to stop postgresql.service: Connection timed out[0m
[0;1;31mSee system logs and 'systemctl status postgresql.service' for details.[0m
