# Genres and MusicBrainz

By Alejandro Fernández Sánchez

## Setting up the connection

In [31]:
# Just in case you're the host and it's not already started
!service postgresql start

In [40]:
# Imports
import psycopg2
import pandas as pd
from sqlalchemy import create_engine
import os
from dotenv import load_dotenv
import threading
import numpy as np
load_dotenv()

True

In [3]:
DB_NAME = os.getenv("DB_NAME")
DB_HOST = os.getenv("DB_HOST")
DB_USER = os.getenv("DB_USER")
DB_PASS = os.getenv("DB_PASS")
DB_PORT = os.getenv("DB_PORT")

In [4]:
# Establishing a connection via postgre's python driver
conn = psycopg2.connect(
    database=DB_NAME,
    host=DB_HOST,
    user=DB_USER,
    password=DB_PASS,
    port=DB_PORT
)
conn

<connection object at 0x7fac20052980; dsn: 'user=musicbrainz password=xxx dbname=musicbrainz_db host=localhost port=5432', closed: 0>

In [5]:
cursor = conn.cursor()  # Helps with querying without memory allocation
cursor

<cursor object at 0x7fac1ff6f010; closed: 0>

In [6]:
# Helper function
def query_with_cursor(c, q, column_names=False, head=False):
    conn.rollback()  # This is needed if a previous query fails
    c.execute(q)
    if column_names:
        print([col[0] for col in c.description])
    count = 0
    for r in c:
        print(r)
        count += 1
        if head and count == 10:
            break

In [7]:
# Used for saving results to pandas dataframes
engine_url = f"postgresql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(engine_url)
engine

Engine(postgresql://musicbrainz:***@localhost:5432/musicbrainz_db)

## Artists relationships

How many are there?

In [8]:
query_with_cursor(
    cursor,
    "SELECT COUNT(*) FROM l_artist_artist"
)

(671957,)


Seems like a fairly big number, let's check how they relate to each other.

In [9]:
query =\
"""
SELECT id, name, description, long_link_phrase
FROM link_type
WHERE entity_type0 = 'artist'
  AND entity_type1 = 'artist'
ORDER BY id
"""
pd.read_sql_query(query, engine)

Unnamed: 0,id,name,description,long_link_phrase
0,102,collaboration,"This is used to specify that an <a href=""/doc/...",collaborated {minor:minorly} {additional:addit...
1,103,member of band,This indicates a person is a member of a group.,is/was {additional:an|a} {additional} {origina...
2,104,supporting musician,Indicates an artist doing long-time instrument...,is/was a supporting artist for
3,105,instrumental supporting musician,Indicates a musician doing long-time instrumen...,does/did {instrument} support for
4,106,musical relationships,,musical relationship
5,107,vocal supporting musician,Indicates a musician doing long-time vocal sup...,does/did {vocal:%|vocals} support for
6,108,is person,This links an artist's performance name (a sta...,performs as
7,109,parent,Indicates a parent-child relationship.,is the {step}parent of
8,110,sibling,This links two siblings (brothers or sisters).,has {half:half-}{step}sibling
9,111,married,This links artists who were married.,is/was married to


It seems like we have 22 possible relationships. They are all important, but there are two that differ from the rest.

Ids 1079 and 108. As I understand them, we should only have one entity of the same artist in the final CSVs. I'm going to store all occurrences of an artist in a list and stay with the most used instance.

First let's collect the data.

In [10]:
link_types = pd.read_sql_query("SELECT DISTINCT id FROM link_type  WHERE entity_type0 = 'artist' AND entity_type1 = 'artist'", engine)
relationships = pd.DataFrame({
    'id0': [],
    'name0': [],
    'id1': [],
    'name1': [],
    'relationship_type': [],
})
for link_type in (i for i in link_types["id"] if i not in (108, 1079)):
    query =\
f"""
SELECT a0.id AS id0, a0.name AS name0, a1.id AS id1, a1.name AS name1, {link_type} AS relationship_type
FROM l_artist_artist laa
JOIN artist a0 ON a0.id = laa.entity0
JOIN artist a1 ON a1.id = laa.entity1
WHERE laa.link IN (
    SELECT id
    FROM link
    WHERE link_type = {link_type}
);
"""
    result = pd.read_sql_query(query, engine)
    if result.empty:
        continue
    relationships = pd.concat([relationships, result])
del result
relationships["id0"] = relationships["id0"].astype(int).astype(str)
relationships["id1"] = relationships["id1"].astype(int).astype(str)
relationships

Unnamed: 0,id0,name0,id1,name1,relationship_type
0,448102,Xoel López,248824,Lovely Luna,102.0
1,359330,Miley Cyrus,686291,Helping Haiti,102.0
2,129154,Jay-J,472106,Jay-J & Macari,102.0
3,267439,Andrew Macari,472106,Jay-J & Macari,102.0
4,212204,James Blunt,686291,Helping Haiti,102.0
...,...,...,...,...,...
563,1237428,Curb Cobain,236309,Kurt Cobain,973.0
564,242,The Chemical Brothers,1468,The Dust Brothers,973.0
565,1625587,Chor und Orchester Mantovani,210790,Mantovani,973.0
566,2731567,JELEE,2723888,橘ののか,973.0


Now we retrieve the changes that are needed.

In [11]:
query =\
f"""
SELECT a0.id AS artist0_id, a0.name AS artist0_name, a1.id AS artist1_id, a1.name AS artist1_name
FROM l_artist_artist laa
JOIN artist a0 ON a0.id = laa.entity0
JOIN artist a1 ON a1.id = laa.entity1
WHERE laa.link IN (
    SELECT id
    FROM link
    WHERE link_type = 1079
    OR link_type = 108
)
"""
changes = pd.read_sql_query(query, engine)
changes["artist0_id"] = changes["artist0_id"].astype(int).astype(str)
changes["artist1_id"] = changes["artist1_id"].astype(int).astype(str)
changes

Unnamed: 0,artist0_id,artist0_name,artist1_id,artist1_name
0,805193,Péter Takács,182397,Deto
1,366859,Tobias Lützenkirchen,134438,LXR
2,510355,Tom Salta,510353,Atlas Plug
3,299983,Henri Sorvali,488532,The Sieg Heil Man
4,305283,Maureen Walsh,493190,Maureen
...,...,...,...,...
62558,2733152,Evan Kahlenberg,1586316,halberd
62559,2733152,Evan Kahlenberg,1923699,polearm
62560,2733152,Evan Kahlenberg,2493633,lilpolearm
62561,107069,Chris Cowie,2516613,Q


In [12]:
# I've iterated though some algorithms that I came up with and this is the fastest one (that works)
# This algorithm groups all the different (same) artists in a list
seen_dict = {}
changes_list = []
last_idx = -1
for _, row in changes.iterrows():
    artist0 = {"id": row["artist0_id"], "name": row["artist0_name"]}
    artist1 = {"id": row["artist1_id"], "name": row["artist1_name"]}
    if artist0["id"] in seen_dict:
        if artist1["id"] in seen_dict:
            continue
        artist0_idx = seen_dict[artist0["id"]]
        changes_list[artist0_idx].append(artist1)
        seen_dict[artist1["id"]] = artist0_idx
    elif artist1["id"] in seen_dict:
        artist1_idx = seen_dict[artist1["id"]]
        changes_list[artist1_idx].append(artist0)
        seen_dict[artist0["id"]] = artist1_idx
    else:
        last_idx += 1
        changes_list.append([artist0, artist1])
        seen_dict[artist0["id"]] = last_idx
        seen_dict[artist1["id"]] = last_idx

In [13]:
print(len([artist for artist_list in changes_list for artist in artist_list]))
print(len(changes_list))
changes_list[:3]

103569
41508


[[{'id': '805193', 'name': 'Péter Takács'}, {'id': '182397', 'name': 'Deto'}],
 [{'id': '366859', 'name': 'Tobias Lützenkirchen'},
  {'id': '134438', 'name': 'LXR'},
  {'id': '131031', 'name': 'Karosa'},
  {'id': '121353', 'name': 'Richthoven'},
  {'id': '258876', 'name': 'Lützenkirchen'},
  {'id': '408293', 'name': 'L.Y.T.Z.'},
  {'id': '411300', 'name': 'Lu Tracks'},
  {'id': '165609', 'name': '7-7-0'},
  {'id': '159973', 'name': 'Toby Lee Connor'},
  {'id': '690598', 'name': 'Paratopic'}],
 [{'id': '510355', 'name': 'Tom Salta'},
  {'id': '510353', 'name': 'Atlas Plug'}]]

Now we need to extract how many times each artist appears.

In [14]:
col_names = [f"a{i}_id" for i in range(5)]
releases = pd.DataFrame(columns=col_names, dtype=str)
for i in range(1, 6):
    path = f"releases-{i}.csv"
    releases = pd.concat([releases, pd.read_csv(path, usecols=[f"a{j}_id" for j in range(i)], dtype=str)])
releases.fillna("", inplace=True)
releases.head(5)

Unnamed: 0,a0_id,a1_id,a2_id,a3_id,a4_id
0,119635,,,,
1,491638,,,,
2,674029,,,,
3,834659,,,,
4,872941,,,,


In [15]:
releases.tail(5)

Unnamed: 0,a0_id,a1_id,a2_id,a3_id,a4_id
9000,620646,473903,373323,238532,23874
9001,1376930,1149230,933101,727181,174389
9002,2098528,2098506,1377684,433418,333305
9003,1550756,1422828,1292976,1260458,1215394
9004,1205355,1205354,1205352,1129217,1033122


In [16]:
artist_freqs = (releases.melt().groupby(by=["value"]).count()).to_dict(index="value")["variable"]

**Side note**: I've noticed that there's a "Various Artists" artist with ID 1. We should maybe take care of it once we're passed this (I'm thinking of removing it from the CSVs and just lower the number of artists involved by one for each song).

In [17]:
# First element of the list will be the "main" instance of the artist
changes_list = list(map(
    lambda artist_list: sorted(artist_list, key=lambda artist: artist_freqs.get(str(artist["id"]), 0), reverse=True),
    changes_list
))

In [23]:
len(changes_list)

41508

In [18]:
# Here I create a dict that follows the following:
# Key | Value
# Artist | Main artist
changes_dict = dict()
for artist_list in changes_list:
    main_artist_id = str(artist_list[0]["id"])
    for artist in artist_list[1:]:
        if artist_freqs.get(artist["id"], 0) > 0: 
            changes_dict[artist["id"]] = main_artist_id

In [19]:
len(changes_dict)

15126

In [37]:
# Before doing nothing we need to establish relationships only with
# the artists that we care about
relationships = relationships[relationships.isin(artist_freqs.keys()).any(axis=1)]
len(relationships)

494617

In [52]:
import concurrent.futures

def aux_func(original_chunk):
    return original_chunk.replace(changes_dict)

original_chunks = np.array_split(relationships.iloc[:100000], 12)
with concurrent.futures.ProcessPoolExecutor() as executor:
    modified_chunks = list(executor.map(aux_func, original_chunks))

relationships_ = pd.concat(modified_chunks)
len(relationships_)

  return bound(*args, **kwds)


100000

## Cleanup

In [53]:
engine.dispose()

In [54]:
!service postgresql stop