# Genres and MusicBrainz

By Alejandro Fernández Sánchez

## Setting up the connection

In [1]:
# Just in case you're the host and it's not already started
!service postgresql start

In [2]:
# Imports
import psycopg2
import pandas as pd
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
DB_NAME = os.getenv("DB_NAME")
DB_HOST = os.getenv("DB_HOST")
DB_USER = os.getenv("DB_USER")
DB_PASS = os.getenv("DB_PASS")
DB_PORT = os.getenv("DB_PORT")

In [5]:
# Establishing a connection via postgre's python driver
conn = psycopg2.connect(
    database=DB_NAME,
    host=DB_HOST,
    user=DB_USER,
    password=DB_PASS,
    port=DB_PORT
)
conn

<connection object at 0x7fe245f6c7c0; dsn: 'user=musicbrainz password=xxx dbname=musicbrainz_db host=localhost port=5432', closed: 0>

In [6]:
cursor = conn.cursor()  # Helps with querying without memory allocation
cursor

<cursor object at 0x7fe2461e7e20; closed: 0>

In [7]:
# Helper function
def query_with_cursor(c, q, column_names=False, head=False):
    conn.rollback()  # This is needed if a previous query fails
    c.execute(q)
    if column_names:
        print([col[0] for col in c.description])
    count = 0
    for r in c:
        print(r)
        count += 1
        if head and count == 10:
            break

In [4]:
# Used for saving results to pandas dataframes
engine_url = f"postgresql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(engine_url)
engine

Engine(postgresql://musicbrainz:***@localhost:5432/musicbrainz_db)

## Artists relationships

How many are there?

In [8]:
query_with_cursor(
    cursor,
    "SELECT COUNT(*) FROM l_artist_artist"
)

(665510,)


Seems like a fairly big number, let's check how they relate to each other.

In [12]:
query =\
"""
SELECT id, name, description, long_link_phrase
FROM link_type
WHERE entity_type0 = 'artist'
  AND entity_type1 = 'artist'
ORDER BY id
"""
pd.read_sql_query(query, engine)

Unnamed: 0,id,name,description,long_link_phrase
0,102,collaboration,"This is used to specify that an <a href=""/doc/...",collaborated {minor:minorly} {additional:addit...
1,103,member of band,This indicates a person is a member of a group.,is/was {additional:an|a} {additional} {origina...
2,104,supporting musician,Indicates an artist doing long-time instrument...,is/was a supporting artist for
3,105,instrumental supporting musician,Indicates a musician doing long-time instrumen...,does/did {instrument} support for
4,106,musical relationships,,musical relationship
5,107,vocal supporting musician,Indicates a musician doing long-time vocal sup...,does/did {vocal:%|vocals} support for
6,108,is person,This links an artist's performance name (a sta...,performs as
7,109,parent,Indicates a parent-child relationship.,is the {step}parent of
8,110,sibling,This links two siblings (brothers or sisters).,has {half:half-}{step}sibling
9,111,married,This links artists who were married.,is/was married to


It seems like we have 22 possible relationships. They are all important, but there are two that differ from the rest.

Ids 1079 and 108. As I understand them, we should only have one of the two entities in the final CSVs. I think we should keep the second entity in the first case and the first entity in the second case. Let me know your thoughts on this.

First let's collect the data.

In [28]:
link_types = pd.read_sql_query("SELECT DISTINCT id FROM link_type  WHERE entity_type0 = 'artist' AND entity_type1 = 'artist'", engine)
relationships = pd.DataFrame({
    'id0': [],
    'name0': [],
    'id1': [],
    'name1': [],
    'relationship_type': [],
})
for link_type in (i for i in link_types["id"] if i not in (108, 1079)):
    query =\
f"""
SELECT a0.id AS id0, a0.name AS name0, a1.id AS id1, a1.name AS name1, {link_type} AS relationship_type
FROM l_artist_artist laa
JOIN artist a0 ON a0.id = laa.entity0
JOIN artist a1 ON a1.id = laa.entity1
WHERE laa.link IN (
    SELECT id
    FROM link
    WHERE link_type = {link_type}
);
"""
    result = pd.read_sql_query(query, engine)
    if result.empty:
        continue
    relationships = pd.concat([relationships, result])
del result
relationships

Unnamed: 0,id0,name0,id1,name1,relationship_type
0,594467.0,Trugoy the Dove,871206.0,First Serve,102.0
1,1156171.0,CANDY GO!GO!,2653059.0,RE:IDOLオールスターズ,102.0
2,1180005.0,dela,2653059.0,RE:IDOLオールスターズ,102.0
3,2653053.0,えびすばし☆プリンセス,2653059.0,RE:IDOLオールスターズ,102.0
4,1154669.0,Feam,2653059.0,RE:IDOLオールスターズ,102.0
...,...,...,...,...,...
556,2647573.0,Bobby Byrne's Dixielanders,364225.0,Bobby Byrne,973.0
557,2647579.0,The Gigi Gryce Orchestra,105085.0,Gigi Gryce,973.0
558,2648591.0,Uzur,1214053.0,Princess Charlotte of Württemberg,973.0
559,2647582.0,The Reinhold Svensson Trio,617987.0,Reinhold Svensson,973.0


Now we retrieve the changes that are needed.

In [29]:
query =\
f"""
SELECT a0.id AS from_id, a0.name AS from_name, a1.id AS to_id, a1.name AS to_name
FROM l_artist_artist laa
JOIN artist a0 ON a0.id = laa.entity0
JOIN artist a1 ON a1.id = laa.entity1
WHERE laa.link IN (
    SELECT id
    FROM link
    WHERE link_type = 1079
)

UNION

SELECT a1.id AS from_id, a1.name AS from_name, a0.id AS to_id, a0.name AS to_name
FROM l_artist_artist laa
JOIN artist a0 ON a0.id = laa.entity0
JOIN artist a1 ON a1.id = laa.entity1
WHERE laa.link IN (
    SELECT id
    FROM link
    WHERE link_type = 108
)
"""
changes = pd.read_sql_query(query, engine)
changes

Unnamed: 0,from_id,from_name,to_id,to_name
0,47866,Little Willie John,411404,William Edward John
1,1230127,Walshy Fire,1301899,Leighton Paul Walsh
2,283685,Olga+Jozef,291743,Dalibor Kŕč
3,23976,Ozark Henry,250768,Piet Goddaer
4,2444591,Seven Sins,43345,Susperia
...,...,...,...,...
62030,779888,Porn on Vinyl,1433177,Aidan Wall
62031,1555796,GOFISH,1555798,寺井ショウタ
62032,1380034,Vylet Pony,1925424,Zelda Trixie Lulamoon
62033,2301842,Marco elsewhere,2703935,Niklas Marco Shahly


Last detail: if an artist have changed names more than once, we want to **TODO**

Now we make the changes:

In [31]:
# TODO

def apply_changes(row):
    if (row['id0'], row['name0']) in zip(changes['from_id'], changes['from_name']):
        new_id = changes.loc[(changes['from_id'] == row['id0']) & (changes['from_name'] == row['name0']), 'to_id']
        new_name = changes.loc[(changes['from_id'] == row['id0']) & (changes['from_name'] == row['name0']), 'to_name']
    if (row['id1'], row['name1']) in zip(changes['from_id'], changes['from_name']):
    
    return row

Pandas(Index=0, from_id=47866, from_name='Little Willie John', to_id=411404, to_name='William Edward John')


## Cleanup

In [9]:
engine.dispose()

In [10]:
!service postgresql stop