# Artist Merge

By Alejandro Fernández Sánchez

## Setting up the connection

In [1]:
!service postgresql start

In [2]:
# Imports
import pandas as pd
from sqlalchemy import create_engine
import os
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
DB_NAME = os.getenv("DB_NAME")
DB_HOST = os.getenv("DB_HOST")
DB_USER = os.getenv("DB_USER")
DB_PASS = os.getenv("DB_PASS")
DB_PORT = os.getenv("DB_PORT")

In [4]:
# Used for saving results to pandas dataframes
engine_url = f"postgresql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(engine_url)
engine

Engine(postgresql://musicbrainz:***@localhost:5432/musicbrainz_db)

## Artist merge

Explanations are brief because it's mostly a copy-paste from `artist_artist.ipynb`.

In [5]:
query =\
f"""
SELECT a0.id AS artist0_id, a1.id AS artist1_id
FROM l_artist_artist laa
JOIN artist a0 ON a0.id = laa.entity0
JOIN artist a1 ON a1.id = laa.entity1
WHERE laa.link IN (
    SELECT id
    FROM link
    WHERE link_type = 1079
    OR link_type = 108
)
"""
changes = pd.read_sql_query(query, engine, dtype=str)
changes

Unnamed: 0,artist0_id,artist1_id
0,805193,182397
1,510355,510353
2,472038,310099
3,366859,134438
4,475823,30986
...,...,...
62558,2155180,2732786
62559,2733152,2493633
62560,107069,2516613
62561,2732796,2732788


In [6]:
# I've iterated though some algorithms that I came up with and this is the fastest one (that works)
# This algorithm groups all the different (same) artists in a list
seen_dict = {}
changes_list = []
last_idx = -1
for _, row in changes.iterrows():
    artist0 = row["artist0_id"]
    artist1 = row["artist1_id"]
    if artist0 in seen_dict:
        if artist1 in seen_dict:
            continue
        artist0_idx = seen_dict[artist0]
        changes_list[artist0_idx].append(artist1)
        seen_dict[artist1] = artist0_idx
    elif artist1 in seen_dict:
        artist1_idx = seen_dict[artist1]
        changes_list[artist1_idx].append(artist0)
        seen_dict[artist0] = artist1_idx
    else:
        last_idx += 1
        changes_list.append([artist0, artist1])
        seen_dict[artist0] = last_idx
        seen_dict[artist1] = last_idx

In [7]:
releases = pd.read_csv("releases_no_va.csv", dtype=str)
releases

Unnamed: 0,name,date,artist_credit,artist_count,a0_id,a0_name,a1_id,a1_name,a2_id,a2_name,a3_id,a3_name,a4_id,a4_name
0,!,2020-08-06,119635,1,119635,Kevin Drumm,,,,,,,,
1,Sabr,2009-02-16,2094632,1,1450753,Shahram Solati,,,,,,,,
2,Sabr Aur Shukr,2023-09-08,351688,1,351688,Shekhar Ravjiani,,,,,,,,
3,Sabra Shatila 1982,2019-12-28,1288322,1,1096452,Geography of Hell,,,,,,,,
4,Sabrana Djela 1976. - 1987.,2019-03-28,414860,1,414860,Paraf,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2841551,Glorious Percussion / In tempus praesens,2011-10-28,1085468,5,947108,Luzerner Sinfonieorchester,947105,Glorious Percussion,538910,Vadim Gluzman,420382,Jonathan Nott,153732,София Асгатовна Губайдулина
2841552,Glow of Benares,2017-11-24,2962164,5,1496607,Abhijit Banerjee,1058331,Randers Kammerorkester,494877,Kala Ramnath,310652,Aarhus Jazz Orchestra,23800,Lars Møller
2841553,Glowing Up,2021-06-11,3222555,5,2290023,TUSO,2290022,Tudor,2118382,Milwin,1886127,Discrete,1184231,Sofia Karlberg
2841554,Gloria and Other Choral Music,1988-12-28,2062383,5,551005,Donna Deam,476955,City of London Sinfonia,129845,The Cambridge Singers,36433,John Rutter,30462,Francis Poulenc


In [8]:
cols = [f"a{i}_id" for i in range(5)]
artist_freqs = (releases[cols].melt().groupby(by=["value"]).count()).to_dict(index="value")["variable"]
artist_freqs

{'10': 6,
 '1000': 44,
 '1000007': 1,
 '1000008': 2,
 '1000017': 1,
 '100002': 1,
 '1000025': 1,
 '100003': 1,
 '100004': 4,
 '1000049': 1,
 '100005': 2,
 '1000058': 1,
 '1000060': 3,
 '1000067': 7,
 '1000068': 2,
 '1000073': 1,
 '1000077': 1,
 '1000078': 1,
 '1000079': 1,
 '1000080': 16,
 '1000081': 3,
 '1000082': 3,
 '1000083': 1,
 '1000085': 2,
 '100009': 1,
 '1000095': 1,
 '1000096': 2,
 '1000107': 1,
 '1000112': 1,
 '100012': 15,
 '1000120': 2,
 '1000121': 1,
 '1000130': 3,
 '1000131': 1,
 '1000135': 12,
 '1000138': 6,
 '100014': 1,
 '1000143': 3,
 '1000144': 1,
 '1000146': 1,
 '1000147': 2,
 '1000148': 1,
 '1000150': 1,
 '1000151': 1,
 '1000154': 1,
 '1000155': 2,
 '1000156': 1,
 '1000157': 1,
 '1000158': 3,
 '1000159': 2,
 '1000161': 4,
 '1000164': 1,
 '1000167': 1,
 '1000168': 3,
 '100017': 3,
 '1000170': 1,
 '1000171': 11,
 '100018': 2,
 '1000182': 1,
 '1000187': 1,
 '1000188': 4,
 '1000196': 3,
 '1000199': 2,
 '100020': 4,
 '1000200': 1,
 '1000202': 5,
 '1000204': 4,
 '100020

In [9]:
# First element of the list will be the "main" instance of the artist
changes_list = list(map(
    lambda artist_list: sorted(artist_list, key=lambda artist: artist_freqs.get(str(artist), 0), reverse=True),
    changes_list
))
changes_list[:5]

[['805193', '182397'],
 ['510355', '510353'],
 ['310099', '472038'],
 ['258876',
  '366859',
  '134438',
  '690598',
  '411300',
  '408293',
  '159973',
  '131031',
  '121353',
  '165609'],
 ['30986', '174009', '475823']]

In [10]:
# Here I create a dict that follows the following:
# Key | Value
# Artist | Main artist
changes_dict = dict()
for artist_list in changes_list:
    main_artist_id = str(artist_list[0])
    for artist in artist_list[1:]:
        if artist_freqs.get(artist, 0) > 0: 
            changes_dict[artist] = main_artist_id

In [11]:
query =\
f"""
SELECT a1.id AS fictional_character, a0.id AS voice_actor
FROM l_artist_artist laa
JOIN artist a0 ON a0.id = laa.entity0
JOIN artist a1 ON a1.id = laa.entity1
WHERE laa.link IN (
    SELECT id
    FROM link
    WHERE link_type = 292
)
"""
changes_292 = pd.read_sql_query(query, engine, dtype=str)
for _, fictional_character, voice_actor in changes_292.itertuples():
    changes_dict[fictional_character] = voice_actor
len(changes_dict)

23755

In [12]:
mask = releases[cols].isin(changes_dict.keys()).any(axis=1)
releases.loc[mask, cols]

Unnamed: 0,a0_id,a1_id,a2_id,a3_id,a4_id
457,2630975,,,,
813,604380,,,,
900,1875555,,,,
902,2119905,,,,
962,1400830,,,,
...,...,...,...,...,...
2841284,2641987,2641986,2641985,2641984,1679364
2841297,2104904,2104903,2041481,1810635,1141182
2841349,741223,535356,352163,284443,180306
2841525,1181250,688686,469344,333071,137868


In [13]:
# WARNING: ~5m exec time
releases.loc[mask, cols] = releases.loc[mask, cols].replace(changes_dict)

releases.loc[mask, cols]

Unnamed: 0,a0_id,a1_id,a2_id,a3_id,a4_id
457,1245245,,,,
813,284452,,,,
900,1875579,,,,
902,741269,,,,
962,531216,,,,
...,...,...,...,...,...
2841284,2641987,1679364,2641984,2641984,1679364
2841297,1141182,1810635,2041481,1810635,1141182
2841349,741223,203282,158068,190601,180306
2841525,779144,688686,469344,333071,137868


In [14]:
releases.to_csv("releases_no_va_merged.csv", index=False)

In [15]:
!wc -l releases_no_va_merged.csv

2841557 releases_no_va_merged.csv


## Cleanup

In [16]:
engine.dispose()

In [17]:
!service postgresql stop