In [20]:
# We only were able to extract a few tiktok accounts with wikidata
!wc -l ../data/tiktok_musicbrainz_all.jsonl

26902 ../data/tiktok_musicbrainz_all.jsonl


In [21]:
# The idea is to complement those with the mapping available in MB
# Let's get a driver for our DB

from sqlalchemy import create_engine, Engine, text
from dotenv import load_dotenv
import pandas as pd
import os

def get_engine() -> Engine:
    DB_NAME = os.getenv("DB_NAME")
    DB_HOST = os.getenv("DB_HOST")
    DB_USER = os.getenv("DB_USER")
    DB_PASS = os.getenv("DB_PASS")
    DB_PORT = os.getenv("DB_PORT")

    # .env validation
    assert DB_NAME is not None and \
        DB_HOST is not None and \
        DB_PORT is not None and \
        DB_USER is not None and \
        DB_PASS is not None, \
        "INVALID .env"

    engine_url = f"postgresql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
    engine = create_engine(engine_url, pool_size=10, max_overflow=0)
    return engine

load_dotenv()
engine = get_engine()
engine

Engine(postgresql://musicbrainz:***@localhost:5432/musicbrainz_db)

In [22]:
# Now, we study which link_types are available for tiktok accounts in MB

query = text("""
    SELECT *
    FROM link_type
    WHERE id IN (
        SELECT DISTINCT link_type
        FROM link
        WHERE id IN (
            SELECT DISTINCT lau.link
            FROM l_artist_url lau
            WHERE lau.entity1 IN (
                SELECT DISTINCT u.id
                FROM url u
                WHERE u.url ~ 'tiktok.com/@'
            )
        )
    )
    ;
""")

with engine.connect() as conn:
    df = pd.read_sql_query(query, conn)

df

Unnamed: 0,id,parent,child_order,gid,entity_type0,entity_type1,name,description,link_phrase,reverse_link_phrase,long_link_phrase,last_updated,is_deprecated,has_dates,entity0_cardinality,entity1_cardinality
0,185,841.0,1,35b3a50f-bf0e-4309-a3b4-58eeed8cee6a,artist,url,online community,This relationship type links an artist to thei...,online communities,online community page for,has an online community page at,2015-02-04 08:45:34.222372+00:00,False,True,0,0
1,303,192.0,0,d86c9450-b6d0-4760-a275-e7547495b48b,artist,url,video channel,"This links an artist to a channel, playlist, o...",video channel,video channel for,has an official video channel at,2015-02-04 12:11:54.724651+00:00,False,True,0,0
2,172,171.0,1,f484f897-81cc-406e-96f9-cd799a04ee24,artist,url,fanpage,This links an artist to a fan-created website.,fan pages,fan page for,has a fan page at,2014-01-19 07:09:01.584869+00:00,False,True,0,0
3,183,,0,fe33d22f-c3b0-4d68-bd53-a856badf2b15,artist,url,official homepage,Indicates the official homepage for an artist.,official homepages,official homepage for,has an official homepage at,2014-04-03 14:48:22.372135+00:00,False,True,0,0
4,199,841.0,2,eb535226-f8ca-499d-9b18-6a144df4ae6f,artist,url,blog,This relationship type is used to link an arti...,blogs,blog of,has a blog at,2015-02-04 12:11:40.752151+00:00,False,True,0,0
5,192,841.0,0,99429741-f3f6-484b-84f8-23af51991770,artist,url,social network,A social network page is an artist's own profi...,social networking,social networking page for,has a social networking page at,2023-10-23 19:01:27.104885+00:00,False,True,0,0


In [23]:
# We have 6 link_types.
# Out of those, 185 and 303 are not official accounts, so we'll skip those.
link_types = [172, 183, 199, 192]

In [24]:
# Now it's time to gather the data
# In total, we're expecting this many rows
# Hopefully they will add some value to the mini-musynergy
query = text(f"""
    SELECT count(u.id)
    FROM url u
    WHERE 1=1
    AND u.url ~ 'tiktok.com/@'
    AND u.id IN (
        SELECT lau.entity1
        FROM l_artist_url lau
            WHERE lau.link IN (
            SELECT id
            FROM link
            WHERE link_type IN {tuple(link_types)}
        )
    )
    ;
""")

with engine.connect() as conn:
    df = pd.read_sql_query(query, conn)

df

Unnamed: 0,count
0,9994


In [25]:
# Now we extract the data we're looking for
query = text(f"""
    SELECT
        lau.entity0 AS artist_id,
        split_part(regexp_replace(u.url, '^.*@', ''), '[/?]', 1) AS tiktok_username
    FROM l_artist_url lau
    JOIN (
        SELECT
            u.id,
            u.url
        FROM url u
        WHERE u.url ~ 'tiktok.com/@'
    ) u ON u.id = lau.entity1
    WHERE lau.link IN (
        SELECT id
        FROM link
        WHERE link_type IN {tuple(link_types)}
    )
    ;
""")

with engine.connect() as conn:
    df = pd.read_sql_query(query, conn)

df.drop_duplicates(inplace=True)

df

Unnamed: 0,artist_id,tiktok_username
0,1223298,cosmosheldrake
1,2752145,shallowsky.band
2,2585491,gmjoonofficial
3,2585497,nappimoon
4,2585587,yahritza
...,...,...
10022,2585223,wearefromtohoku
10023,2585245,questship
10024,2585297,yakinikutabe_houdai
10025,2585365,electric_heartbeat


In [26]:
# Now we save the CSV
df.to_csv(index=False, path_or_buf="../data/mb_tiktok_mapping.csv")

!head ../data/mb_tiktok_mapping.csv

artist_id,tiktok_username
1223298,cosmosheldrake
2752145,shallowsky.band
2585491,gmjoonofficial
2585497,nappimoon
2585587,yahritza
2688404,jayvounter
1843053,Noisecream_official
2578273,tiot_now
2585981,spacedbandofficial


In [27]:
# And we close the driver
engine.dispose()