# Genres and MusicBrainz

By Alejandro Fernández Sánchez

## Setting up the connection

In [1]:
# Just in case you're the host and it's not already started
!service postgresql start

In [32]:
# Imports
import psycopg2
import pandas as pd
from sqlalchemy import create_engine
from functools import reduce
import os
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
DB_NAME = os.getenv("DB_NAME")
DB_HOST = os.getenv("DB_HOST")
DB_USER = os.getenv("DB_USER")
DB_PASS = os.getenv("DB_PASS")
DB_PORT = os.getenv("DB_PORT")

In [4]:
# Establishing a connection via postgre's python driver
conn = psycopg2.connect(
    database=DB_NAME,
    host=DB_HOST,
    user=DB_USER,
    password=DB_PASS,
    port=DB_PORT
)
conn

<connection object at 0x7f5d63597d80; dsn: 'user=musicbrainz password=xxx dbname=musicbrainz_db host=localhost port=5432', closed: 0>

In [5]:
cursor = conn.cursor()  # Helps with querying without memory allocation
cursor

<cursor object at 0x7f5d6352c310; closed: 0>

In [6]:
# Helper function
def query_with_cursor(c, q, column_names=False, head=False):
    conn.rollback()  # This is needed if a previous query fails
    c.execute(q)
    if column_names:
        print([col[0] for col in c.description])
    count = 0
    for r in c:
        print(r)
        count += 1
        if head and count == 10:
            break

In [7]:
# Used for saving results to pandas dataframes
engine_url = f"postgresql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(engine_url)
engine

Engine(postgresql://musicbrainz:***@localhost:5432/musicbrainz_db)

## Artists relationships

How many are there?

In [8]:
query_with_cursor(
    cursor,
    "SELECT COUNT(*) FROM l_artist_artist"
)

(665510,)


Seems like a fairly big number, let's check how they relate to each other.

In [9]:
query =\
"""
SELECT id, name, description, long_link_phrase
FROM link_type
WHERE entity_type0 = 'artist'
  AND entity_type1 = 'artist'
ORDER BY id
"""
pd.read_sql_query(query, engine)

Unnamed: 0,id,name,description,long_link_phrase
0,102,collaboration,"This is used to specify that an <a href=""/doc/...",collaborated {minor:minorly} {additional:addit...
1,103,member of band,This indicates a person is a member of a group.,is/was {additional:an|a} {additional} {origina...
2,104,supporting musician,Indicates an artist doing long-time instrument...,is/was a supporting artist for
3,105,instrumental supporting musician,Indicates a musician doing long-time instrumen...,does/did {instrument} support for
4,106,musical relationships,,musical relationship
5,107,vocal supporting musician,Indicates a musician doing long-time vocal sup...,does/did {vocal:%|vocals} support for
6,108,is person,This links an artist's performance name (a sta...,performs as
7,109,parent,Indicates a parent-child relationship.,is the {step}parent of
8,110,sibling,This links two siblings (brothers or sisters).,has {half:half-}{step}sibling
9,111,married,This links artists who were married.,is/was married to


It seems like we have 22 possible relationships. They are all important, but there are two that differ from the rest.

Ids 1079 and 108. As I understand them, we should only have one entity of the same artist in the final CSVs. I'm going to store all occurrences of an artist in a list and stay with the most used instance.

First let's collect the data.

In [10]:
link_types = pd.read_sql_query("SELECT DISTINCT id FROM link_type  WHERE entity_type0 = 'artist' AND entity_type1 = 'artist'", engine)
relationships = pd.DataFrame({
    'id0': [],
    'name0': [],
    'id1': [],
    'name1': [],
    'relationship_type': [],
})
for link_type in (i for i in link_types["id"] if i not in (108, 1079)):
    query =\
f"""
SELECT a0.id AS id0, a0.name AS name0, a1.id AS id1, a1.name AS name1, {link_type} AS relationship_type
FROM l_artist_artist laa
JOIN artist a0 ON a0.id = laa.entity0
JOIN artist a1 ON a1.id = laa.entity1
WHERE laa.link IN (
    SELECT id
    FROM link
    WHERE link_type = {link_type}
);
"""
    result = pd.read_sql_query(query, engine)
    if result.empty:
        continue
    relationships = pd.concat([relationships, result])
del result
relationships

Unnamed: 0,id0,name0,id1,name1,relationship_type
0,448102.0,Xoel López,248824.0,Lovely Luna,102.0
1,77944.0,Michael Bublé,686291.0,Helping Haiti,102.0
2,391119.0,HHH,1931190.0,HHH×MM×ST,102.0
3,359330.0,Miley Cyrus,686291.0,Helping Haiti,102.0
4,665008.0,Joe McElderry,686291.0,Helping Haiti,102.0
...,...,...,...,...,...
556,1004547.0,SpongeBOZZ,43102.0,SpongeBob SquarePants,973.0
557,2685255.0,Trio Messiaen,10371.0,Olivier Messiaen,973.0
558,2705719.0,GUNRINGER-Y,1603935.0,Gunslinger-R,973.0
559,2707509.0,homura for android,1558612.0,暁美ほむら,973.0


Now we retrieve the changes that are needed.

In [11]:
query =\
f"""
SELECT a0.id AS artist0_id, a0.name AS artist0_name, a1.id AS artist1_id, a1.name AS artist1_name
FROM l_artist_artist laa
JOIN artist a0 ON a0.id = laa.entity0
JOIN artist a1 ON a1.id = laa.entity1
WHERE laa.link IN (
    SELECT id
    FROM link
    WHERE link_type = 1079
    OR link_type = 108
)
"""
changes = pd.read_sql_query(query, engine)
changes

Unnamed: 0,artist0_id,artist0_name,artist1_id,artist1_name
0,510355,Tom Salta,510353,Atlas Plug
1,515380,Sara Nicholas,512604,DJ Ginger Snapp
2,1816108,Alex Bilowitz,1303285,Alex Bilo
3,805193,Péter Takács,182397,Deto
4,472038,Hendrik Admiraal,310099,Ferox
...,...,...,...,...
62059,2090235,Mowty Mahlyka,565327,Dark Angel
62060,311014,Adrian Edmondson,221375,Vyvyan
62061,2712993,Paul Garraway,2712994,Soliheen
62062,2712555,Vadim Sprikut,2684903,Drowned


In [49]:
# I've iterated though some algorithms that I came up with and this is the fastest one (that works)
seen_dict = {}
changes_list = []
last_idx = -1
for _, row in changes.iterrows():
    artist0 = (row["artist0_id"], row["artist0_name"])
    artist1 = (row["artist1_id"], row["artist1_name"])
    if artist0 in seen_dict:
        if artist1 in seen_dict:
            continue
        artist0_idx = seen_dict[artist0]
        changes_list[artist0_idx].append(artist1)
        seen_dict[artist1] = artist0_idx
    elif artist1 in seen_dict:
        artist1_idx = seen_dict[artist1]
        changes_list[artist1_idx].append(artist0)
        seen_dict[artist0] = artist1_idx
    else:
        last_idx += 1
        changes_list.append([artist0, artist1])
        seen_dict[artist0] = last_idx
        seen_dict[artist1] = last_idx

Now, for each list of (same) artists, we need to extract how many times it appears.

In [111]:
foo = !grep ",745888," releases-*.csv | wc -l
int(foo[0])

46

In [139]:
def artist_occurrences(artist_id):
    result = !grep ",{artist_id}," releases-*.csv | wc -l
    return int(result[0])

In [110]:
artist_occurrences((745888, "AJR"))

46

In [143]:
test = list(map(
    lambda s: list(map(
        lambda artist: (artist[0], artist[1], artist_occurrences(artist[0])),
        s
    )),
    changes_list[:10])
)
test

[[(510355, 'Tom Salta', 14), (510353, 'Atlas Plug', 4)],
 [(515380, 'Sara Nicholas', 0), (512604, 'DJ Ginger Snapp', 0)],
 [(1816108, 'Alex Bilowitz', 0), (1303285, 'Alex Bilo', 1)],
 [(805193, 'Péter Takács', 0), (182397, 'Deto', 0)],
 [(472038, 'Hendrik Admiraal', 0), (310099, 'Ferox', 1)],
 [(285421, 'Edward Upton', 0),
  (839935, 'BBII', 2),
  (105617, 'Computor Rockers', 6),
  (167634, 'EDMX', 5),
  (119129, 'Bass Potato', 0),
  (329053, 'Ed DMX', 1),
  (64070, 'DMX Krew', 88),
  (329054, 'David Michael Cross', 1),
  (204935, 'Michael Knight', 1),
  (690681, '101 Force', 1),
  (1043503, 'Asylum Seekers', 1)],
 [(475823, 'Nurmad Jusat', 0), (30986, 'Nuron', 8), (174009, 'Fugue', 4)],
 [(567288, 'Charles Hilton Jr.', 0), (567286, 'CJ', 0)],
 [(366859, 'Tobias Lützenkirchen', 4),
  (134438, 'LXR', 3),
  (131031, 'Karosa', 0),
  (121353, 'Richthoven', 0),
  (258876, 'Lützenkirchen', 28),
  (408293, 'L.Y.T.Z.', 1),
  (411300, 'Lu Tracks', 1),
  (165609, '7-7-0', 0),
  (159973, 'Toby Le

In [ ]:
 # NOTE: try if dicts are faster (they must be)

## Cleanup

In [145]:
engine.dispose()

In [146]:
!service postgresql stop