# Artist-Artist relationships

By Alejandro Fernández Sánchez

## Setting up the connection

In [1]:
# Just in case you're the host and it's not already started
!service postgresql start

In [2]:
# Imports
from typing import Any
import psycopg2
import pandas as pd
from sqlalchemy import create_engine
from concurrent.futures import ProcessPoolExecutor
import os
from dotenv import load_dotenv
import json
load_dotenv()

True

In [3]:
DB_NAME = os.getenv("DB_NAME")
DB_HOST = os.getenv("DB_HOST")
DB_USER = os.getenv("DB_USER")
DB_PASS = os.getenv("DB_PASS")
DB_PORT = os.getenv("DB_PORT")

In [4]:
# Establishing a connection via postgre's python driver
conn = psycopg2.connect(
    database=DB_NAME,
    host=DB_HOST,
    user=DB_USER,
    password=DB_PASS,
    port=DB_PORT
)
conn

<connection object at 0x7f80eaeff740; dsn: 'user=musicbrainz password=xxx dbname=musicbrainz_db host=localhost port=5432', closed: 0>

In [5]:
cursor = conn.cursor()  # Helps with querying without memory allocation
cursor

<cursor object at 0x7f80eac945e0; closed: 0>

In [6]:
# Helper function
def query_with_cursor(c, q, column_names=False, head=False):
    conn.rollback()  # This is needed if a previous query fails
    c.execute(q)
    if column_names:
        print([col[0] for col in c.description])
    count = 0
    for r in c:
        print(r)
        count += 1
        if head and count == 10:
            break

In [7]:
# Used for saving results to pandas dataframes
engine_url = f"postgresql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(engine_url, pool_size=10, max_overflow=0)
engine

Engine(postgresql://musicbrainz:***@localhost:5432/musicbrainz_db)

## Types of artists relationships

How many are there?

In [8]:
query_with_cursor(
    cursor,
    "SELECT COUNT(*) FROM l_artist_artist"
)

(696360,)


Seems like a fairly big number, let's check how they relate to each other.

In [9]:
query =\
"""
SELECT id, name, description, long_link_phrase
FROM link_type
WHERE entity_type0 = 'artist'
  AND entity_type1 = 'artist'
ORDER BY id
"""
pd.read_sql_query(query, engine)

Unnamed: 0,id,name,description,long_link_phrase
0,102,collaboration,"This is used to specify that an <a href=""/doc/...",collaborated {minor:minorly} {additional:addit...
1,103,member of band,This indicates a person is a member of a group.,is/was {additional:an|a} {additional} {origina...
2,104,supporting musician,Indicates an artist doing long-time instrument...,is/was a supporting artist for
3,105,instrumental supporting musician,Indicates a musician doing long-time instrumen...,does/did {instrument} support for
4,106,musical relationships,,musical relationship
5,107,vocal supporting musician,Indicates a musician doing long-time vocal sup...,does/did {vocal:%|vocals} support for
6,108,is person,This links an artist's performance name (a sta...,performs as
7,109,parent,Indicates a parent-child relationship.,is the {step}parent of
8,110,sibling,This links two siblings (brothers or sisters).,has {half:half-}{step}sibling
9,111,married,This links artists who were married.,is/was married to


It seems like we have 22 possible relationships. They are all important, but there are three that differ from the rest.

Ids 1079 and 108. As I understand them, we should only have one entity of the same artist in the final CSVs. I'm going to store all occurrences of an artist in a list and stay with the most used instance.

Id 292. This relationship links a voice actor with their character.

## Artist dataset

We now have all the information needed to tackle the task of generating an artist dataset. Using relationships 108, 292, 1079 we'll create a list of known ids and names for each artist and store them following the JSONL convention.

The first subtask is to find pairs of entities that represents the same artist.

In [10]:
query =\
f"""
SELECT a0.id AS a0_id, a0.name AS a0_name, a1.id AS a1_id, a1.name AS a1_name
FROM l_artist_artist laa
JOIN artist a0 ON a0.id = laa.entity0
JOIN artist a1 ON a1.id = laa.entity1
WHERE laa.link IN (
    SELECT id
    FROM link
    WHERE link_type IN (1079, 108, 292)
)
"""
pairs = pd.read_sql_query(query, engine, dtype=str)
pairs.drop_duplicates(inplace=True)
pairs

Unnamed: 0,a0_id,a0_name,a1_id,a1_name
0,805193,Péter Takács,182397,Deto
1,366859,Tobias Lützenkirchen,134438,LXR
2,299983,Henri Sorvali,488532,The Sieg Heil Man
3,305283,Maureen Walsh,493190,Maureen
4,44387,Vincent de Moor,501299,Outline
...,...,...,...,...
74086,196112,Slave Raider,2809141,Chainsaw
74087,1772197,Kaosic,2809271,kaosicwips
74088,2102836,Kenneyon,2809286,KenKen
74089,2809328,David Meredith,1553632,Gudsforladt


We now separate each artist in their own list of entities (id and name).

In [11]:
# I've iterated though some algorithms that I came up with and this is the fastest one (that works)
# This algorithm groups all the different (same) artists in a list
seen_dict = dict()
artists_lists = list()
last_idx = -1
for _, row in pairs.iterrows():
    artist0 = {"id": row["a0_id"], "name": row["a0_name"]}
    artist1 = {"id": row["a1_id"], "name": row["a1_name"]}
    if artist0["id"] in seen_dict:
        if artist1["id"] in seen_dict:
            continue
        artist0_idx = seen_dict[artist0["id"]]
        artists_lists[artist0_idx].append(artist1)
        seen_dict[artist1["id"]] = artist0_idx
    elif artist1["id"] in seen_dict:
        artist1_idx = seen_dict[artist1["id"]]
        artists_lists[artist1_idx].append(artist0)
        seen_dict[artist0["id"]] = artist1_idx
    else:
        last_idx += 1
        artists_lists.append([artist0, artist1])
        seen_dict[artist0["id"]] = last_idx
        seen_dict[artist1["id"]] = last_idx
artists_lists[:5]

[[{'id': '805193', 'name': 'Péter Takács'}, {'id': '182397', 'name': 'Deto'}],
 [{'id': '366859', 'name': 'Tobias Lützenkirchen'},
  {'id': '134438', 'name': 'LXR'},
  {'id': '131031', 'name': 'Karosa'},
  {'id': '121353', 'name': 'Richthoven'},
  {'id': '411300', 'name': 'Lu Tracks'},
  {'id': '258876', 'name': 'Lützenkirchen'},
  {'id': '408293', 'name': 'L.Y.T.Z.'},
  {'id': '165609', 'name': '7-7-0'},
  {'id': '159973', 'name': 'Toby Lee Connor'},
  {'id': '690598', 'name': 'Paratopic'}],
 [{'id': '299983', 'name': 'Henri Sorvali'},
  {'id': '488532', 'name': 'The Sieg Heil Man'}],
 [{'id': '305283', 'name': 'Maureen Walsh'},
  {'id': '493190', 'name': 'Maureen'}],
 [{'id': '44387', 'name': 'Vincent de Moor'},
  {'id': '501299', 'name': 'Outline'},
  {'id': '383291', 'name': 'Flashbang'},
  {'id': '444700', 'name': 'Emerald'},
  {'id': '847529', 'name': 'Extract'},
  {'id': '154886', 'name': 'Sidewalk'},
  {'id': '102837', 'name': 'Fix to Fax'},
  {'id': '60141', 'name': 'VDM'}]]

We now need to find which artist instance is the most common. For that we're going to be using the `releases_no_va.csv` dataset.

In [12]:
releases = pd.read_csv("releases_no_va.csv", dtype=str)
releases.fillna("", inplace=True)
releases

Unnamed: 0,name,date,artist_credit,artist_count,a0_id,a0_name,tags,a1_id,a1_name,a2_id,a2_name,a3_id,a3_name,a4_id,a4_name
0,!,2020-08-06,119635,1,119635,Kevin Drumm,"159660, 148804, 1769, 166",,,,,,,,
1,Sad Boy Song #39,2023-04-21,1384884,1,1167315,Smokey Brights,,,,,,,,,
2,Sad Boy Songs,2019-06-28,2222601,1,1667572,Captain Johnny Sausage,,,,,,,,,
3,Sad Boy Stomp,2018-06-01,2760005,1,2010087,King Strang,"6844, 2485, 46031, 1416, 127, 71, 75",,,,,,,,
4,Sad Boy Summer,2021-08-27,2775906,1,2020450,Negative 25,"1100, 1211, 1091, 20",,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2959864,Glowing Up,2021-06-11,3222555,5,2290023,TUSO,,2290022,Tudor,2118382,Milwin,1886127,Discrete,1184231,Sofia Karlberg
2959865,Glowing Up (stripped),2021-08-20,3222555,5,2290023,TUSO,,2290022,Tudor,2118382,Milwin,1886127,Discrete,1184231,Sofia Karlberg
2959866,Glück,2015-04-22,2490708,5,1530048,Christian Wolfarth,,1114541,Enrico Malatesta,809427,Ingar Zach,430419,Michael Vorfeld,416657,Burkhard Beins
2959867,Go From My Window: Music for the Virginal,1994-09-27,2814928,5,464137,Colin Tilney,,291593,John Bull,124235,Orlando Gibbons,103870,Jan Pieterszoon Sweelinck,12435,William Byrd


In [13]:
id_columns = [f"a{i}_id" for i in range(5)]
artist_freqs = (releases[id_columns].melt().groupby(by=["value"]).count()).to_dict(index="value")["variable"]
artist_freqs

{'': 11358942,
 '10': 6,
 '1000': 45,
 '1000007': 1,
 '1000008': 2,
 '1000017': 1,
 '100002': 1,
 '1000025': 1,
 '100003': 1,
 '100004': 4,
 '1000049': 1,
 '100005': 2,
 '1000058': 1,
 '1000060': 3,
 '1000067': 7,
 '1000068': 2,
 '1000073': 1,
 '1000077': 1,
 '1000078': 1,
 '1000079': 1,
 '1000080': 17,
 '1000081': 3,
 '1000082': 3,
 '1000083': 1,
 '1000085': 2,
 '100009': 1,
 '1000095': 1,
 '1000096': 2,
 '1000107': 1,
 '1000112': 1,
 '100012': 15,
 '1000120': 2,
 '1000121': 1,
 '1000130': 3,
 '1000131': 1,
 '1000135': 12,
 '1000138': 10,
 '100014': 1,
 '1000143': 3,
 '1000144': 1,
 '1000146': 1,
 '1000147': 2,
 '1000148': 1,
 '1000150': 1,
 '1000151': 1,
 '1000154': 2,
 '1000155': 2,
 '1000156': 1,
 '1000157': 1,
 '1000158': 3,
 '1000159': 2,
 '1000161': 4,
 '1000164': 1,
 '1000167': 1,
 '1000168': 3,
 '100017': 3,
 '1000170': 1,
 '1000171': 11,
 '100018': 2,
 '1000182': 1,
 '1000187': 1,
 '1000188': 5,
 '1000196': 4,
 '1000199': 2,
 '100020': 4,
 '1000200': 1,
 '1000202': 5,
 '10002

We are now ready to sort the lists so that the most common instance is the first element of the list.

In [14]:
# First element of the list will be the "main" instance of the artist
artists_lists = list(map(
    lambda artist_list: sorted(artist_list, key=lambda artist: artist_freqs.get(artist["id"], 0), reverse=True),
    artists_lists
))
artists_lists[:5]

[[{'id': '805193', 'name': 'Péter Takács'}, {'id': '182397', 'name': 'Deto'}],
 [{'id': '258876', 'name': 'Lützenkirchen'},
  {'id': '366859', 'name': 'Tobias Lützenkirchen'},
  {'id': '134438', 'name': 'LXR'},
  {'id': '690598', 'name': 'Paratopic'},
  {'id': '411300', 'name': 'Lu Tracks'},
  {'id': '408293', 'name': 'L.Y.T.Z.'},
  {'id': '159973', 'name': 'Toby Lee Connor'},
  {'id': '131031', 'name': 'Karosa'},
  {'id': '121353', 'name': 'Richthoven'},
  {'id': '165609', 'name': '7-7-0'}],
 [{'id': '299983', 'name': 'Henri Sorvali'},
  {'id': '488532', 'name': 'The Sieg Heil Man'}],
 [{'id': '305283', 'name': 'Maureen Walsh'},
  {'id': '493190', 'name': 'Maureen'}],
 [{'id': '44387', 'name': 'Vincent de Moor'},
  {'id': '444700', 'name': 'Emerald'},
  {'id': '501299', 'name': 'Outline'},
  {'id': '383291', 'name': 'Flashbang'},
  {'id': '847529', 'name': 'Extract'},
  {'id': '154886', 'name': 'Sidewalk'},
  {'id': '102837', 'name': 'Fix to Fax'},
  {'id': '60141', 'name': 'VDM'}]]

So far we've only handled the "multiple instances" artists. The following block of code adds the "single instance" artists to the list. It doesn't matter if we do this after the sort, these lists are going to have only one instance of the artist after all.

In [15]:
# Has to be a better way to do this
# WARNING: ~3m execution time
for _, row in releases.iterrows():
    artists_in_row = (
        {
            "id": row[f"a{i}_id"],
            "name": row[f"a{i}_name"],
        }
        for i in range(5) if row[f"a{i}_id"] != ""
    )
    for artist in artists_in_row:
        if artist["id"] not in seen_dict:
            artists_lists.append([artist])
artists_lists[-5:]

[[{'id': '103870', 'name': 'Jan Pieterszoon Sweelinck'}],
 [{'id': '12435', 'name': 'William Byrd'}],
 [{'id': '1205355', 'name': 'Loota'}],
 [{'id': '1205354', 'name': 'JayAllDay'}],
 [{'id': '1033122', 'name': 'Okasian'}]]

At this point we have a list of lists, the following will transform what we have into a list of dictionaries, ready to be serialized.

In [16]:
artists = list()
for artists_list in artists_lists:
    current_artist = dict()
    current_artist["main_id"] = artists_list[0]["id"]
    current_artist["known_ids"] = [artist["id"] for artist in artists_list]
    current_artist["known_names"] = [artist["name"] for artist in artists_list]
    artists.append(current_artist)
artists[:5]

[{'main_id': '805193',
  'known_ids': ['805193', '182397'],
  'known_names': ['Péter Takács', 'Deto']},
 {'main_id': '258876',
  'known_ids': ['258876',
   '366859',
   '134438',
   '690598',
   '411300',
   '408293',
   '159973',
   '131031',
   '121353',
   '165609'],
  'known_names': ['Lützenkirchen',
   'Tobias Lützenkirchen',
   'LXR',
   'Paratopic',
   'Lu Tracks',
   'L.Y.T.Z.',
   'Toby Lee Connor',
   'Karosa',
   'Richthoven',
   '7-7-0']},
 {'main_id': '299983',
  'known_ids': ['299983', '488532'],
  'known_names': ['Henri Sorvali', 'The Sieg Heil Man']},
 {'main_id': '305283',
  'known_ids': ['305283', '493190'],
  'known_names': ['Maureen Walsh', 'Maureen']},
 {'main_id': '44387',
  'known_ids': ['44387',
   '444700',
   '501299',
   '383291',
   '847529',
   '154886',
   '102837',
   '60141'],
  'known_names': ['Vincent de Moor',
   'Emerald',
   'Outline',
   'Flashbang',
   'Extract',
   'Sidewalk',
   'Fix to Fax',
   'VDM']}]

In [17]:
artists[-5:]

[{'main_id': '103870',
  'known_ids': ['103870'],
  'known_names': ['Jan Pieterszoon Sweelinck']},
 {'main_id': '12435', 'known_ids': ['12435'], 'known_names': ['William Byrd']},
 {'main_id': '1205355', 'known_ids': ['1205355'], 'known_names': ['Loota']},
 {'main_id': '1205354',
  'known_ids': ['1205354'],
  'known_names': ['JayAllDay']},
 {'main_id': '1033122', 'known_ids': ['1033122'], 'known_names': ['Okasian']}]

The only thing left is to actually serialize what we have.

In [18]:
jsons = [json.dumps(artist, ensure_ascii=False) for artist in artists]
unique_jsons = list(set(jsons))

with open("artists.jsonl", "w", encoding="utf-8") as out_file:
    for unique_json in unique_jsons:
        out_file.write(unique_json + "\n")

In [19]:
!wc -l artists.jsonl

810337 artists.jsonl


We can now retrieve the data every time we want.

In [20]:
with open("artists.jsonl", "r", encoding="utf-8") as in_file:
    artist_data = [json.loads(line) for line in in_file]
print(artist_data[:5])

[{'main_id': '2561682', 'known_ids': ['2561682'], 'known_names': ['Denise Boyette']}, {'main_id': '1248371', 'known_ids': ['1248371'], 'known_names': ['Angel Flo']}, {'main_id': '681413', 'known_ids': ['681413'], 'known_names': ['Victoria Baillie']}, {'main_id': '1740637', 'known_ids': ['1740637'], 'known_names': ['Ludger Billerbeck']}, {'main_id': '1359505', 'known_ids': ['1359505'], 'known_names': ['Alex Lacamoire']}]


## Relationships dataset

The next step is to save the relationships between the artists in a CSV file. Let's start collecting the different relationships.

In [21]:
link_types = pd.read_sql_query("SELECT DISTINCT id FROM link_type  WHERE entity_type0 = 'artist' AND entity_type1 = 'artist'", engine)
relationships = pd.DataFrame({
    'id0': [],
    'name0': [],
    'id1': [],
    'name1': [],
    'relationship_type': [],
})
for link_type in filter(lambda lt: lt not in (108, 292, 1079), link_types.id):
    query =\
f"""
SELECT a0.id AS id0, a0.name AS name0, a1.id AS id1, a1.name AS name1, {link_type} AS relationship_type
FROM l_artist_artist laa
JOIN artist a0 ON a0.id = laa.entity0
JOIN artist a1 ON a1.id = laa.entity1
WHERE laa.link IN (
    SELECT id
    FROM link
    WHERE link_type = {link_type}
);
"""
    result = pd.read_sql_query(query, engine, dtype=str)
    if result.empty:
        continue
    relationships = pd.concat([relationships, result])
del result
relationships.drop_duplicates(inplace=True)
relationships

Unnamed: 0,id0,name0,id1,name1,relationship_type
0,448102,Xoel López,248824,Lovely Luna,102
1,359330,Miley Cyrus,686291,Helping Haiti,102
2,129154,Jay-J,472106,Jay-J & Macari,102
3,267439,Andrew Macari,472106,Jay-J & Macari,102
4,212204,James Blunt,686291,Helping Haiti,102
...,...,...,...,...,...
627,2770439,Frank Ferrar y Su Orquesta,137590,Waldo de los Ríos,973
628,1944261,Maâlem Hamid El Kasri et son Groupe,2774376,Hamid Faraji,973
629,2781260,Friends of Jerry,30866,Jerry Garcia,973
630,405197,Dananananaykroyd,342839,Dan Aykroyd,973


Now we filter the relationships so that we don't have artists that don't concern us.

In [22]:
mask = relationships[["id0", "id1"]].isin(artist_freqs.keys()).all(axis=1)
filtered_relationships = relationships[mask]
filtered_relationships

Unnamed: 0,id0,name0,id1,name1,relationship_type
0,448102,Xoel López,248824,Lovely Luna,102
1,359330,Miley Cyrus,686291,Helping Haiti,102
2,129154,Jay-J,472106,Jay-J & Macari,102
3,267439,Andrew Macari,472106,Jay-J & Macari,102
4,212204,James Blunt,686291,Helping Haiti,102
...,...,...,...,...,...
612,2754335,My Little Romance,48734,My Chemical Romance,973
615,2733328,The Erogerigegege,217284,The Gerogerigegege,973
617,2672523,Big Creek Slim and the Cockroaches,1217031,Big Creek Slim,973
618,760760,Ludwig Göransson,1021,Ludwig van Beethoven,973


Now, we have the relationships, but we've made an artist dataset that will help in the task of replacing the non-important id with the main id for each artist. For this task a dictionary will be created.

In [23]:
changes_dict = dict()
for artist in artist_data:
    if len(artist["known_ids"]) > 1:
        for known_id in artist["known_ids"]:
            if known_id != artist["main_id"]:
                changes_dict[known_id] = artist["main_id"]
len(changes_dict)

73158

These are the relationships that we need to modify.

In [24]:
mask = filtered_relationships[["id0", "id1"]].isin(changes_dict.keys()).any(axis=1)
filtered_relationships

Unnamed: 0,id0,name0,id1,name1,relationship_type
0,448102,Xoel López,248824,Lovely Luna,102
1,359330,Miley Cyrus,686291,Helping Haiti,102
2,129154,Jay-J,472106,Jay-J & Macari,102
3,267439,Andrew Macari,472106,Jay-J & Macari,102
4,212204,James Blunt,686291,Helping Haiti,102
...,...,...,...,...,...
612,2754335,My Little Romance,48734,My Chemical Romance,973
615,2733328,The Erogerigegege,217284,The Gerogerigegege,973
617,2672523,Big Creek Slim and the Cockroaches,1217031,Big Creek Slim,973
618,760760,Ludwig Göransson,1021,Ludwig van Beethoven,973


In [25]:
filtered_relationships.loc[mask, ["id0", "id1"]] = filtered_relationships.loc[mask, ["id0", "id1"]].replace(changes_dict)

filtered_relationships.loc[mask]

Unnamed: 0,id0,name0,id1,name1,relationship_type
107,83356,Ulrich Pöppelbaum,262073,Tom Borijn,102
114,1206864,John Mitchell,426382,Blind Ego,102
155,288057,Gecko,501793,Heckmann & Gecko,102
183,33137,Dirk Serries,248031,Continuum,102
201,571193,Zzino,198811,Zzino vs. Accelerator,102
...,...,...,...,...,...
561,2281032,Flac Plus,860871,Macintosh Plus,973
565,2281012,MACINTOPOESH POELUS,860871,Macintosh Plus,973
567,2280996,MACINTOSH IACON,860871,Macintosh Plus,973
568,2281030,JUICE PLUS,860871,Macintosh Plus,973


In [26]:
changes_dict["426487"]

'344890'

We can make sure that there are no cyclic references this way (hoping for a False return):

In [27]:
filtered_relationships[["id0", "id1"]].isin(changes_dict.keys()).any(axis=1).any()

False

Now we can finally save our relationships CSV.

In [28]:
filtered_relationships.to_csv("relationships.csv", index=False)

In [29]:
!wc -l relationships.csv

130758 relationships.csv


## Tags

We now can get the tags from the database.

In [30]:
query = """
SELECT artist, STRING_AGG(tag::VARCHAR, ', ') as tags
FROM artist_tag
GROUP BY artist;
"""
tags = pd.read_sql_query(query, engine, dtype=str)
tags = tags[tags["artist"].isin([artist["id"] for artists_list in artists_lists for artist in artists_list])]
tags

Unnamed: 0,artist,tags
1,4,"1, 7, 11, 12, 20, 57, 58, 171, 237, 280, 402, ..."
2,6,"11, 71, 92, 171, 237, 349, 1055, 1072, 1391"
3,7,"98, 121, 379, 72115"
4,9,"10, 11, 12, 58, 77, 559, 709, 1282, 1302, 1498..."
5,10,"111, 1661"
...,...,...
206997,2809052,1127
206998,2809066,"205, 3590, 7112"
207002,2809245,11
207008,2809333,"11, 19, 186, 634, 1670, 1758, 252782, 253935, ..."


In [31]:
mask = tags["artist"].isin(changes_dict.keys())
tags[mask]

Unnamed: 0,artist,tags
3,7,"98, 121, 379, 72115"
294,574,"10, 11, 206, 545"
894,2022,10
981,2189,77
991,2214,66
...,...,...
206943,2806403,"33, 744"
206977,2808038,"12, 49, 235, 662, 36080, 39754, 49120, 105557"
206978,2808039,"12, 49, 235, 662, 36080, 39754, 49120, 105557"
206979,2808040,"12, 49, 235, 662, 36080, 39754, 49120, 105557"


In [32]:
tags.loc[mask, "artist"] = tags.loc[mask, "artist"].replace(changes_dict)

We can now save the results for the future.

In [33]:
tags.to_csv("artist_tags.csv", index=False)

## Releases dataset with main IDs

Now that we're here, why not do the same with the releases dataset, which we have already in memory.

In [34]:
mask = releases[id_columns].isin(changes_dict.keys()).any(axis=1)
releases.loc[mask, id_columns]

Unnamed: 0,a0_id,a1_id,a2_id,a3_id,a4_id
79,2241953,,,,
200,2158767,,,,
338,599547,,,,
353,131234,,,,
368,2340195,,,,
...,...,...,...,...,...
2959589,2641987,2641986,2641985,2641984,1679364
2959594,2104904,2104903,2041481,1810635,1141182
2959650,741223,535356,352163,284443,180306
2959836,1181250,688686,469344,333071,137868


In [35]:
# WARNING: ~10m execution time
releases.loc[mask, id_columns] = releases.loc[mask, id_columns].replace(changes_dict)

releases.loc[mask, id_columns]

Unnamed: 0,a0_id,a1_id,a2_id,a3_id,a4_id
79,2402119,,,,
200,2156584,,,,
338,504252,,,,
353,356208,,,,
368,118269,,,,
...,...,...,...,...,...
2959589,2641987,1679364,2641984,2641984,1679364
2959594,1141182,1810635,2041481,1810635,1141182
2959650,741223,203282,158068,190601,180306
2959836,779144,688686,469344,333071,137868


In [36]:
releases[id_columns].isin(changes_dict.keys()).any(axis=1).any()

False

In [37]:
releases.to_csv("releases_no_va_merged.csv", index=False)

In [38]:
!wc -l releases_no_va_merged.csv

2959870 releases_no_va_merged.csv


## Cleanup

In [39]:
engine.dispose()
conn.close()

In [40]:
!service postgresql stop