# Tags

By Alejandro Fernández Sánchez

## Setting up the connection

In [1]:
# Just in case you're the host and it's not already started
!service postgresql start

In [2]:
# Imports
import pandas as pd
from sqlalchemy import create_engine
import os
from dotenv import load_dotenv
import json
load_dotenv()

True

In [3]:
DB_NAME = os.getenv("DB_NAME")
DB_HOST = os.getenv("DB_HOST")
DB_USER = os.getenv("DB_USER")
DB_PASS = os.getenv("DB_PASS")
DB_PORT = os.getenv("DB_PORT")

In [4]:
# Used for saving results to pandas dataframes
engine_url = f"postgresql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(engine_url)
engine

Engine(postgresql://musicbrainz:***@localhost:5432/musicbrainz_db)

## Tags

We'll start by getting the ID of all the tags that we need. For that I'll import the data that we have in out current dataset regarding tags.

In [5]:
artists_tags = pd.read_csv("artist_tags.csv", dtype=str)["tags"].dropna(axis=0)
artists_tags.tolist()

['1, 7, 11, 12, 20, 57, 58, 171, 237, 280, 402, 559, 1391, 1431, 1498, 1753, 1942, 4667, 32091, 33762, 92312, 119407, 255911',
 '11, 71, 92, 171, 237, 349, 1055, 1072, 1391',
 '98, 121, 379, 72115',
 '10, 11, 12, 58, 77, 559, 709, 1282, 1302, 1498, 1516, 1600, 71973',
 '111, 1661',
 '71, 111, 1053, 6443',
 '20, 111, 304, 1181, 8363',
 '14, 72, 1600, 4754',
 '111, 47664',
 '7, 75, 111, 127, 186, 304, 343, 712, 719, 1044, 1045, 1046, 1047, 1048, 1049, 1091, 1120, 4663, 7264, 26046, 29056, 33505, 40695, 41014, 53766, 70272, 252516, 253611',
 '7, 19, 29, 171, 237, 782, 987, 1044, 1045, 1046, 1056, 1063, 1091, 1391, 4663, 31341, 49112, 49197, 252805',
 '11, 171, 237, 628, 1391, 4663, 34450, 49112, 101582, 115617, 121083',
 '7, 20, 171, 175, 237, 271, 783, 1149, 1275, 1391, 45785',
 '7, 71, 75, 111, 127, 166, 186, 343, 712, 721, 1063, 1142, 1245, 1255, 4663, 40695, 41911, 42172, 47664, 80788',
 '7, 20, 284, 662, 1012, 1275, 4663, 49109, 255911',
 '7, 20, 133, 339, 743, 1057, 1058, 1081, 4663

In [6]:
releases_tags = pd.read_csv("releases_no_va_merged_id.csv", dtype=str)["tags"].dropna(axis=0)
releases_tags.tolist()

['159660, 148804, 1769, 166',
 '6844, 2485, 46031, 1416, 127, 71, 75',
 '1100, 1211, 1091, 20',
 '2341, 6517, 175335, 55, 58, 1275, 4833, 16, 166, 49, 537',
 '4720, 5451, 7, 3371, 88, 33',
 '54052, 197, 166, 43435, 177286, 5922, 47463, 26742, 2417, 67645, 68694, 5168',
 '24638, 75737, 47463, 27020, 26005, 1091, 11',
 '523',
 '235',
 '7, 248, 564',
 '166, 1754',
 '7, 11, 19, 32, 187, 192, 782, 32, 173858, 156077, 65070, 192, 187, 7, 11, 19, 166, 5168, 798, 782, 564, 192, 7, 11, 19, 32, 187, 782',
 '7, 71, 516, 517, 560, 2145',
 '2807, 75, 508, 166, 7',
 '166, 153267, 95596, 2680, 2105',
 '210354, 46052, 2261, 560',
 '34450, 69106, 46085, 166, 1883',
 '77, 58036, 11',
 '199830, 42561, 486, 206, 62',
 '71, 7, 11, 3771, 686, 590, 77, 3977',
 '29, 1105',
 '20, 2377, 284, 537, 782, 1068, 564, 662',
 '58, 69106, 2498, 303, 166, 11',
 '255, 19, 564, 537, 284',
 '2719, 11, 445, 7630, 5003, 4190, 441, 662',
 '1164, 7055, 7',
 '11, 25, 498, 1259, 1401, 155656, 216531',
 '235, 49120, 662, 36080, 3

We'll now get a list of the tags IDs without repetitions.

In [7]:
tags_set = set(tag for tag_list in artists_tags for tag in tag_list.split(", ")).union(tag for tag_list in releases_tags for tag in tag_list.split(", "))
len(tags_set)

171425

It's now the time to get the tag information.

In [8]:
query = f"""
    SELECT id, name
    FROM tag
    WHERE id IN ({", ".join(tags_set)});
"""
tags_df = pd.read_sql_query(query, engine_url)
tags_df

Unnamed: 0,id,name
0,250930,italopop
1,195101,evening
2,246528,champ 700
3,244904,darkness and light
4,246465,lam phaen
...,...,...
171420,235915,school party
171421,235916,school disco
171422,235923,3 pills
171423,235927,3am in tokyo


I'll just do some tests before serializing it.

In [9]:
tags_df[tags_df["name"].str.contains("rock")]

Unnamed: 0,id,name
159,59364,neo-rockabilly
184,246424,sophisticated rock
276,254259,heart land rock
296,254483,rock bard classical records
337,254276,garacge rock
...,...,...
171236,231732,campfire rock
171264,232076,gronk rock
171265,232078,rock | hard rock
171278,232400,fast punk rock


In [10]:
tags_df[tags_df["name"].str.contains("pop")]

Unnamed: 0,id,name
0,250930,italopop
7,246457,scream-pop
43,104457,austro pop
65,246534,proto-hyperpop
87,246458,scream pop
...,...,...
171006,227750,pop fusion
171091,229242,pop and roll
171159,230401,jkpop
171205,231080,video pop


In [11]:
tags_df[tags_df["name"].str.contains("metal")]

Unnamed: 0,id,name
53,166443,darkmetal
68,246592,swiss metalcore
98,265437,vegan metalcore
154,1414,symphonic black metal
174,41873,atmospheric doom metal
...,...,...
171163,230560,experimental noise metal
171305,233052,post-numetal
171317,233222,sovmetal
171318,233223,stoner black metal


In [12]:
tags_df[tags_df["name"].str.contains("pop-rock")]

Unnamed: 0,id,name
9488,159,pop-rock
44614,65260,adult alternative pop-rock
51517,78606,pop-rock motown soul r&b 80's guitar synth man...
60041,91603,greek pop-rock
63811,96369,rock/progressive/pop-rock
65402,97548,rock/progressive pop-rock
75997,113603,german/synthie/pop-rock
77171,115268,chanson/pop-rock
78075,126887,electro-pop-rock
78502,116992,progressive pop-rock


In [13]:
tags_df.to_csv("tags.csv", index=False)

## Cleanup

In [14]:
engine.dispose()

In [15]:
!service postgresql stop