# Genres and MusicBrainz

By Alejandro Fernández Sánchez

## Setting up the connection

In [1]:
# Start the local PostgreSQL service, in case this notebook is run on the
# database host and the server is not already running.
!service postgresql start

In [2]:
# Imports
import os
from urllib.parse import quote

import pandas as pd
import psycopg2
from dotenv import load_dotenv
from sqlalchemy import create_engine
load_dotenv()

True

In [3]:
# Connection settings are read from the environment; a variable that is not
# set resolves to None.
DB_NAME, DB_HOST, DB_USER, DB_PASS, DB_PORT = (
    os.getenv(variable)
    for variable in ("DB_NAME", "DB_HOST", "DB_USER", "DB_PASS", "DB_PORT")
)

In [4]:
# Open a connection through psycopg2, PostgreSQL's Python driver
connection_settings = {
    "database": DB_NAME,
    "host": DB_HOST,
    "user": DB_USER,
    "password": DB_PASS,
    "port": DB_PORT,
}
conn = psycopg2.connect(**connection_settings)
conn

<connection object at 0x7f68de8fe980; dsn: 'user=musicbrainz password=xxx dbname=musicbrainz_db host=localhost port=5432', closed: 0>

In [5]:
# Cursor used for the ad-hoc queries run through query_with_cursor.
# NOTE(review): this is a default (unnamed) psycopg2 cursor. Per the psycopg2
# documentation, such client-side cursors transfer the whole result set on
# execute(), so it does not avoid memory allocation; a named (server-side)
# cursor would be needed to stream rows — confirm whether that matters here.
cursor = conn.cursor()
cursor

<cursor object at 0x7f68de816e30; closed: 0>

In [6]:
# Helper function
def query_with_cursor(c, q, column_names=False, head=False):
    """Execute query `q` on cursor `c` and print the resulting rows.

    Parameters
    ----------
    c : DB-API cursor (e.g. a psycopg2 cursor)
        Cursor the query is executed on. It must expose the connection that
        created it through its ``connection`` attribute.
    q : str
        SQL statement to execute.
    column_names : bool, default False
        When true, print the list of result column names before the rows.
    head : bool, default False
        When true, stop after printing the first 10 rows.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    # A failed statement leaves the transaction aborted, so roll back first.
    # The rollback goes to the connection that owns *this* cursor rather than
    # to a module-level `conn`, so the helper works with any cursor passed in.
    c.connection.rollback()
    c.execute(q)
    if column_names:
        print([col[0] for col in c.description])
    for count, r in enumerate(c, start=1):
        print(r)
        if head and count == 10:
            break

In [7]:
# SQLAlchemy engine, used for loading query results into pandas DataFrames.
# The user and password are percent-encoded so that special characters in them
# (such as "@", ":" or "/") cannot corrupt the connection URL.
engine_url = (
    f"postgresql://{quote(DB_USER, safe='')}:{quote(DB_PASS, safe='')}"
    f"@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)
engine = create_engine(engine_url)
engine

Engine(postgresql://musicbrainz:***@localhost:5432/musicbrainz_db)

## Artists relationships

How many are there?

In [8]:
# Count every row of the artist-to-artist link table
query_with_cursor(cursor, "SELECT COUNT(*) FROM l_artist_artist")

(671957,)


That seems like a fairly big number. Let's check how the artists relate to each other.

In [9]:
# Link types whose two endpoints are both artists, ordered by id
query = """
SELECT id, name, description, long_link_phrase
FROM link_type
WHERE entity_type0 = 'artist'
  AND entity_type1 = 'artist'
ORDER BY id
"""
pd.read_sql_query(sql=query, con=engine)

Unnamed: 0,id,name,description,long_link_phrase
0,102,collaboration,"This is used to specify that an <a href=""/doc/...",collaborated {minor:minorly} {additional:addit...
1,103,member of band,This indicates a person is a member of a group.,is/was {additional:an|a} {additional} {origina...
2,104,supporting musician,Indicates an artist doing long-time instrument...,is/was a supporting artist for
3,105,instrumental supporting musician,Indicates a musician doing long-time instrumen...,does/did {instrument} support for
4,106,musical relationships,,musical relationship
5,107,vocal supporting musician,Indicates a musician doing long-time vocal sup...,does/did {vocal:%|vocals} support for
6,108,is person,This links an artist's performance name (a sta...,performs as
7,109,parent,Indicates a parent-child relationship.,is the {step}parent of
8,110,sibling,This links two siblings (brothers or sisters).,has {half:half-}{step}sibling
9,111,married,This links artists who were married.,is/was married to


It seems like we have 22 possible relationships. They are all important, but there are three that differ from the rest.

Ids 1079 and 108. As I understand them, we should only have one entity of the same artist in the final CSVs. I'm going to store all occurrences of an artist in a list and keep the most used instance.

Id 292. This relationship links a voice actor with their character. We want to store the voice actor.

First let's collect the data.

In [10]:
# Link types 108, 292 and 1079 are not stored as relationships; they are
# handled separately in later cells.
SPECIAL_LINK_TYPES = (108, 292, 1079)
RELATIONSHIP_COLUMNS = ["id0", "name0", "id1", "name1", "relationship_type"]

link_types = pd.read_sql_query("SELECT DISTINCT id FROM link_type  WHERE entity_type0 = 'artist' AND entity_type1 = 'artist'", engine)

# One frame per link type, concatenated once at the end. This avoids
# re-copying the accumulated frame on every iteration and avoids seeding the
# concatenation with an empty, untyped frame.
relationship_frames = []
for link_type in link_types.id:
    if link_type in SPECIAL_LINK_TYPES:
        continue
    query =\
f"""
SELECT a0.id AS id0, a0.name AS name0, a1.id AS id1, a1.name AS name1, {link_type} AS relationship_type
FROM l_artist_artist laa
JOIN artist a0 ON a0.id = laa.entity0
JOIN artist a1 ON a1.id = laa.entity1
WHERE laa.link IN (
    SELECT id
    FROM link
    WHERE link_type = {link_type}
);
"""
    result = pd.read_sql_query(query, engine)
    if not result.empty:
        relationship_frames.append(result)

if relationship_frames:
    # No ignore_index: every frame keeps its own 0..n-1 row labels.
    relationships = pd.concat(relationship_frames)
else:
    # Nothing was fetched: keep the expected columns so later cells still run.
    relationships = pd.DataFrame(columns=RELATIONSHIP_COLUMNS)
del relationship_frames

# Ids are stored as strings and the relationship type as an integer.
relationships = (
    relationships
    .astype({"id0": int, "id1": int, "relationship_type": int})
    .astype({"id0": str, "id1": str})
    .drop_duplicates()
)
relationships

Unnamed: 0,id0,name0,id1,name1,relationship_type
0,448102,Xoel López,248824,Lovely Luna,102
1,359330,Miley Cyrus,686291,Helping Haiti,102
2,129154,Jay-J,472106,Jay-J & Macari,102
3,267439,Andrew Macari,472106,Jay-J & Macari,102
4,212204,James Blunt,686291,Helping Haiti,102
...,...,...,...,...,...
563,2719063,S. E. Nováček so svým orchestrem,2719061,Sláva Eman Nováček,973
564,242,The Chemical Brothers,1468,The Dust Brothers,973
565,1237428,Curb Cobain,236309,Kurt Cobain,973
566,1625587,Chor und Orchester Mantovani,210790,Mantovani,973


In [11]:
# Display only: the result is not assigned, so `relationships` is unchanged.
# Duplicates were already dropped when `relationships` was built.
relationships.drop_duplicates()

Unnamed: 0,id0,name0,id1,name1,relationship_type
0,448102,Xoel López,248824,Lovely Luna,102
1,359330,Miley Cyrus,686291,Helping Haiti,102
2,129154,Jay-J,472106,Jay-J & Macari,102
3,267439,Andrew Macari,472106,Jay-J & Macari,102
4,212204,James Blunt,686291,Helping Haiti,102
...,...,...,...,...,...
563,2719063,S. E. Nováček so svým orchestrem,2719061,Sláva Eman Nováček,973
564,242,The Chemical Brothers,1468,The Dust Brothers,973
565,1237428,Curb Cobain,236309,Kurt Cobain,973
566,1625587,Chor und Orchester Mantovani,210790,Mantovani,973


Now we retrieve the changes that are needed for ids 1079 and 108.

In [12]:
# Pairs of artist ids joined by a link of type 1079 or 108
query = """
SELECT a0.id AS artist0_id, a1.id AS artist1_id
FROM l_artist_artist laa
JOIN artist a0 ON a0.id = laa.entity0
JOIN artist a1 ON a1.id = laa.entity1
WHERE laa.link IN (
    SELECT id
    FROM link
    WHERE link_type = 1079
    OR link_type = 108
)
"""
# Ids are normalised to integers first and then stored as strings
changes = (
    pd.read_sql_query(query, engine)
    .astype({"artist0_id": int, "artist1_id": int})
    .astype({"artist0_id": str, "artist1_id": str})
)
changes

Unnamed: 0,artist0_id,artist1_id
0,805193,182397
1,510355,510353
2,515380,512604
3,366859,134438
4,299983,488532
...,...,...
62558,2733152,1586316
62559,2733152,1923699
62560,2733152,2493633
62561,107069,2516613


In [13]:
# Groups all the different instances of the same artist into one list per artist.
# Every row of `changes` links two ids that must end up in the same list. A row
# whose two ids already sit in *different* lists merges those lists, so the
# grouping does not depend on the order in which the rows arrive.
seen_dict = {}     # artist id -> index of its list inside changes_list
changes_list = []  # one list of artist ids per artist
for artist0, artist1 in zip(changes["artist0_id"], changes["artist1_id"]):
    if artist0 == artist1:
        # A self-link carries no grouping information.
        continue
    artist0_idx = seen_dict.get(artist0)
    artist1_idx = seen_dict.get(artist1)
    if artist0_idx is None and artist1_idx is None:
        # Neither id is known yet: open a new group.
        seen_dict[artist0] = seen_dict[artist1] = len(changes_list)
        changes_list.append([artist0, artist1])
    elif artist1_idx is None:
        changes_list[artist0_idx].append(artist1)
        seen_dict[artist1] = artist0_idx
    elif artist0_idx is None:
        changes_list[artist1_idx].append(artist0)
        seen_dict[artist0] = artist1_idx
    elif artist0_idx != artist1_idx:
        # Both ids are known but in different groups: fold the later group into
        # the earlier one and re-point its members.
        keep_idx, drop_idx = sorted((artist0_idx, artist1_idx))
        moved = changes_list[drop_idx]
        changes_list[keep_idx].extend(moved)
        for artist in moved:
            seen_dict[artist] = keep_idx
        # An empty placeholder keeps every other group index valid for now.
        changes_list[drop_idx] = []

# Remove the placeholders left behind by merges and rebuild the lookup so that
# it matches the compacted list.
changes_list = [artist_list for artist_list in changes_list if artist_list]
seen_dict = {
    artist: idx
    for idx, artist_list in enumerate(changes_list)
    for artist in artist_list
}
last_idx = len(changes_list) - 1

In [14]:
# Total number of grouped ids, followed by the number of groups
print(sum(len(artist_list) for artist_list in changes_list))
print(len(changes_list))
changes_list[:3]

103569
41509


[['805193', '182397'], ['510355', '510353'], ['515380', '512604']]

Now we need to extract how many times each artist appears.

In [15]:
# Artist-id columns of the release CSVs: releases-<i>.csv contributes its
# first i artist columns (a0_id … a<i-1>_id).
col_names = [f"a{i}_id" for i in range(5)]
release_frames = [
    pd.read_csv(f"releases-{i}.csv", usecols=col_names[:i], dtype=str)
    for i in range(1, 6)
]
# Concatenate once instead of growing a frame inside a loop. reindex pins the
# column order, and fillna returns a new frame (artist columns a file does not
# have become "").
releases = pd.concat(release_frames).reindex(columns=col_names).fillna("")
del release_frames
releases.head(5)

Unnamed: 0,a0_id,a1_id,a2_id,a3_id,a4_id
0,119635,,,,
1,491638,,,,
2,674029,,,,
3,834659,,,,
4,872941,,,,


In [16]:
# Last five rows of the concatenated releases frame
releases.tail(5)

Unnamed: 0,a0_id,a1_id,a2_id,a3_id,a4_id
9000,620646,473903,373323,238532,23874
9001,1376930,1149230,933101,727181,174389
9002,2098528,2098506,1377684,433418,333305
9003,1550756,1422828,1292976,1260458,1215394
9004,1205355,1205354,1205352,1129217,1033122


In [17]:
# Number of times each artist id appears in any artist column of `releases`
# (dict: id string -> count). Blank cells are the "" padding of releases with
# fewer artists, not ids, so they are left out of the counts.
artist_ids = releases.melt()["value"]
artist_freqs = artist_ids[artist_ids != ""].value_counts().to_dict()

**Side note**: I've noticed that there's a "Various Artists" artist with ID 1. We should maybe take care of it once we're past this (I'm thinking of removing it from the CSVs and just lowering the number of artists involved by one for each song).

The next step is to order each list of different (same) artists so that the first element of the list is the artist instance that will remain after all changes have taken place. From that list, a dictionary holding the same information is created, but in a format that will allow pandas to do the replacing we seek.

In [18]:
# Order every group by release frequency, highest first, so that its first
# element is the "main" instance of the artist
changes_list = [
    sorted(
        artist_list,
        key=lambda artist: artist_freqs.get(str(artist), 0),
        reverse=True,
    )
    for artist_list in changes_list
]

In [19]:
# Peek at the first five groups
changes_list[:5]

[['805193', '182397'],
 ['510355', '510353'],
 ['515380', '512604'],
 ['258876',
  '366859',
  '134438',
  '690598',
  '408293',
  '411300',
  '159973',
  '131031',
  '121353',
  '165609'],
 ['299983', '488532']]

In [20]:
# Replacement table:
# Key    | Value
# Artist | Main artist (first element of the artist's group)
# Only ids that actually occur in the releases get an entry.
changes_dict = {}
for artist_list in changes_list:
    main_artist_id = str(artist_list[0])
    changes_dict.update(
        (artist, main_artist_id)
        for artist in artist_list[1:]
        if artist_freqs.get(artist, 0) > 0
    )

In [21]:
# Number of artist ids that will be replaced by their main instance
len(changes_dict)

15127

Now, for the relationship with id 292, the key will be the fictional character and the value will be the voice actor.

In [22]:
# Pairs joined by a link of type 292: entity1 is selected as the fictional
# character and entity0 as the voice actor
query = """
SELECT a1.id AS fictional_character, a0.id AS voice_actor
FROM l_artist_artist laa
JOIN artist a0 ON a0.id = laa.entity0
JOIN artist a1 ON a1.id = laa.entity1
WHERE laa.link IN (
    SELECT id
    FROM link
    WHERE link_type = 292
)
"""
# Ids are normalised to integers first and then stored as strings
changes = (
    pd.read_sql_query(query, engine)
    .astype({"fictional_character": int, "voice_actor": int})
    .astype({"fictional_character": str, "voice_actor": str})
)
changes

Unnamed: 0,fictional_character,voice_actor
0,735344,496417
1,738237,564124
2,701492,732019
3,2679153,2679152
4,2679144,373593
...,...,...
8989,2729442,2353282
8990,2729443,1432171
8991,2729445,2729444
8992,2730943,564124


In [23]:
# Map every fictional character to its voice actor; when a character appears
# in several rows, the last row wins
changes_dict.update(zip(changes["fictional_character"], changes["voice_actor"]))
len(changes_dict)

23594

We only care about artists we have already stored.

In [24]:
# Keep only the relationships in which at least one side is an artist that we
# have stored. Only the id columns are compared: testing every column would
# also keep rows whose artist *name* happens to equal a stored id.
id_columns = ["id0", "id1"]
relationships = relationships[
    relationships[id_columns].isin(list(artist_freqs)).any(axis=1)
]
len(relationships)

455563

These are the relationships that we need to modify.

In [25]:
# Rows in which either artist id has a replacement. Only the id columns are
# tested, so a name that happens to equal a replaced id does not select a row.
relationships.loc[relationships[["id0", "id1"]].isin(list(changes_dict)).any(axis=1)]

Unnamed: 0,id0,name0,id1,name1,relationship_type
22,701492,Queen,699073,Queen & Elizabeth,102
79,116653,David Harrow,598952,The Justice League of Zion,102
93,116653,David Harrow,79824,Planet 4 Folk Quartet,102
116,426487,The Count of Monte Cristal,541571,The Count & Sinden,102
126,184688,Frank Tovey,195090,Mkultra,102
...,...,...,...,...,...
513,2150344,t e l e g o o f テレビ へま,1186724,t e l e p a t h テレパシー能力者,973
546,2239546,DJ SpongeBob,43102,SpongeBob SquarePants,973
555,1004547,SpongeBOZZ,43102,SpongeBob SquarePants,973
556,2705719,GUNRINGER-Y,1603935,Gunslinger-R,973


In [26]:
# Replace ids in the id columns only, so that an artist name which happens to
# equal a replaced id is left untouched.
id_columns = ["id0", "id1"]
mask = relationships[id_columns].isin(list(changes_dict)).any(axis=1)
# The values are assigned positionally (to_numpy) because the row labels of
# `relationships` repeat across link types and cannot be used for alignment.
relationships.loc[mask, id_columns] = (
    relationships.loc[mask, id_columns].replace(changes_dict).to_numpy()
)
# NOTE(review): name0/name1 are not updated, so a replaced id keeps the name of
# the instance it replaced — confirm that this is acceptable downstream.

relationships.loc[mask]

Unnamed: 0,id0,name0,id1,name1,relationship_type
22,732019,Queen,699073,Queen & Elizabeth,102
79,8151,David Harrow,598952,The Justice League of Zion,102
93,8151,David Harrow,79824,Planet 4 Folk Quartet,102
116,344890,The Count of Monte Cristal,541571,The Count & Sinden,102
126,46990,Frank Tovey,195090,Mkultra,102
...,...,...,...,...,...
513,2150344,t e l e g o o f テレビ へま,1282472,t e l e p a t h テレパシー能力者,973
546,2239546,DJ SpongeBob,462543,SpongeBob SquarePants,973
555,1700445,SpongeBOZZ,462543,SpongeBob SquarePants,973
556,2595181,GUNRINGER-Y,334571,Gunslinger-R,973


In [27]:
# Spot-check a single replacement
changes_dict["701492"]

'732019'

Now we can finally save our relationships CSV.

In [28]:
# Write the relationships to relationships.csv in the working directory.
# NOTE(review): the DataFrame index is written as the first, unnamed column.
# It is the per-query row number kept by pd.concat and repeats across link
# types — consider index=False if downstream readers do not need it.
relationships.to_csv("relationships.csv")

In [29]:
# Line count of the exported file (includes the header row)
!wc -l relationships.csv

455564 relationships.csv


## Cleanup

In [30]:
# Release every database handle before the server is stopped: first the
# psycopg2 cursor and connection, then the SQLAlchemy engine's connection pool.
cursor.close()
conn.close()
engine.dispose()

In [31]:
# Stop the local PostgreSQL service now that all connections are released
!service postgresql stop