# Tags

By Alejandro Fernández Sánchez

## Setting up the connection

In [1]:
# Start the local PostgreSQL service. Only needed when this notebook runs on
# the machine hosting the database and the service is not already running.
!service postgresql start

In [2]:
# Imports
import pandas as pd
from sqlalchemy import create_engine
import os
from dotenv import load_dotenv
import json
load_dotenv()

True

In [3]:
# Database connection settings, read from the environment (load_dotenv() has
# already been called in the imports cell). A variable that is not set is
# returned as None by os.getenv.
DB_NAME, DB_HOST, DB_USER, DB_PASS, DB_PORT = (
    os.getenv(variable)
    for variable in ("DB_NAME", "DB_HOST", "DB_USER", "DB_PASS", "DB_PORT")
)

In [4]:
# Connection URL and SQLAlchemy engine, used for saving query results to
# pandas dataframes. The URL is assembled from the DB_* settings read above.
engine_url = (
    f"postgresql://{DB_USER}:{DB_PASS}"
    f"@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)
engine = create_engine(engine_url)
engine

Engine(postgresql://musicbrainz:***@localhost:5432/musicbrainz_db)

## Tags

We'll start by getting the IDs of all the tags that we need. For that, I'll import the tag data that we have in our current dataset.

In [5]:
# Read the artists file with every column as a string, keep only the "tags"
# column and drop the rows where it is missing. Then preview ten entries.
artists_tags = (
    pd.read_csv("../data/artist_tags.csv", dtype=str)
    .loc[:, "tags"]
    .dropna()
)
artists_tags.head(10).tolist()

['1, 7, 11, 12, 20, 57, 58, 171, 237, 280, 402, 559, 1391, 1431, 1498, 1753, 1942, 4667, 32091, 33762, 92312, 119407, 255911',
 '11, 71, 92, 171, 237, 349, 1055, 1072, 1391',
 '98, 121, 379, 72115',
 '10, 11, 12, 58, 77, 559, 709, 1282, 1302, 1498, 1516, 1600, 71973',
 '111, 1661',
 '71, 111, 1053, 6443',
 '20, 111, 304, 1181, 8363',
 '14, 72, 1600, 4754',
 '111, 47664',
 '7, 71, 75, 111, 127, 186, 304, 343, 712, 719, 987, 1044, 1045, 1046, 1047, 1048, 1049, 1091, 1120, 1131, 1276, 4663, 7264, 25577, 26046, 29056, 33505, 40695, 41014, 53766, 70272, 252516, 253611']

In [6]:
# Read the tracks file with every column as a string, keep only the "tags"
# column and drop the rows where it is missing. Then preview ten entries.
tracks_tags = (
    pd.read_csv("../data/tracks_no_va_merged_id.csv", dtype=str)
    .loc[:, "tags"]
    .dropna()
)
tracks_tags.head(10).tolist()

['267',
 '54784, 54785, 27606, 38432, 27603',
 '609, 267',
 '1403, 267',
 '267',
 '5290',
 '267',
 '178, 669, 155151, 11, 2796, 151, 4190, 7630, 2232, 1881, 1886, 2484, 6686',
 '267, 1294, 402, 1377, 2797, 1403, 5948',
 '267, 303, 2465, 211445, 7, 88, 173818, 102413']

We'll now build the set of tag IDs, without repetitions.

In [7]:
# Each entry of both series is a ", "-separated string of tag IDs: split every
# entry and collect the individual IDs (still strings) into a single set.
tags_set = {
    tag
    for tag_series in (artists_tags, tracks_tags)
    for tag_list in tag_series
    for tag in tag_list.split(", ")
}
len(tags_set)

176941

Now it's time to get the tag information (ID and name) from the database.

In [8]:
# Convert the IDs to integers before interpolating them into the SQL text:
# a malformed value coming from the CSV files is rejected here with a
# ValueError instead of being sent to the database. Sorting makes the
# generated query text deterministic between runs.
tag_ids = sorted({int(tag) for tag in tags_set})
if not tag_ids:
    # "IN ()" is not valid SQL, so fail early with a clear message.
    raise ValueError("No tag IDs were collected; nothing to query.")

query = f"""
    SELECT id, name
    FROM tag
    WHERE id IN ({", ".join(map(str, tag_ids))});
"""
# Query through the engine created earlier rather than the raw URL string:
# given a URL, pandas builds its own separate engine, so the notebook's engine
# would never be used and the engine.dispose() in the cleanup section would
# release nothing.
tags_df = pd.read_sql_query(query, engine)
tags_df

Unnamed: 0,id,name
0,250930,italopop
1,246528,champ 700
2,244904,darkness and light
3,246465,lam phaen
4,246541,hard 2 breathe
...,...,...
176936,78805,retro electro
176937,6747,horns
176938,43,heavy metal
176939,2552,hair metal


I'll just run a few sanity checks before serializing it.

In [9]:
# Sanity check: show the tags whose name contains "rock".
tags_df.loc[tags_df["name"].str.contains("rock")]

Unnamed: 0,id,name
149,59364,neo-rockabilly
172,246424,sophisticated rock
271,254259,heart land rock
290,254483,rock bard classical records
337,254276,garacge rock
...,...,...
176848,6504,jrock
176863,1057,college rock
176881,4916,alternative-rock
176894,41035,dance-rock


In [10]:
# Sanity check: show the tags whose name contains "pop".
tags_df.loc[tags_df["name"].str.contains("pop")]

Unnamed: 0,id,name
0,250930,italopop
6,246457,scream-pop
30,246534,proto-hyperpop
55,104457,austro pop
85,246458,scream pop
...,...,...
176832,159,pop-rock
176843,3178,synth pop
176867,31591,pop-rap
176873,49197,pop reggae


In [11]:
# Sanity check: show the tags whose name contains "metal".
tags_df.loc[tags_df["name"].str.contains("metal")]

Unnamed: 0,id,name
69,246592,swiss metalcore
70,166443,darkmetal
95,265437,vegan metalcore
144,1414,symphonic black metal
173,41873,atmospheric doom metal
...,...,...
176928,95386,blackened heavy metal
176931,92,metal
176938,43,heavy metal
176939,2552,hair metal


In [12]:
# Sanity check: show the tags whose name contains "pop-rock".
tags_df.loc[tags_df["name"].str.contains("pop-rock")]

Unnamed: 0,id,name
729,246771,crappy pop-rock
46349,65260,adult alternative pop-rock
53490,78606,pop-rock motown soul r&b 80's guitar synth man...
62457,91603,greek pop-rock
66277,96369,rock/progressive/pop-rock
67912,97548,rock/progressive pop-rock
78803,113603,german/synthie/pop-rock
80069,115268,chanson/pop-rock
80879,126887,electro-pop-rock
81312,116992,progressive pop-rock


In [13]:
# Number of rows in the tags table at this point.
tags_df.shape[0]

176941

In [14]:
# Keep only the first row for each tag name. The result is rebound to the
# name instead of using inplace=True, so the frame object displayed by the
# earlier cells is not mutated behind the reader's back.
tags_df = tags_df.drop_duplicates(subset="name", keep="first")

In [15]:
# Number of rows in the tags table at this point.
tags_df.shape[0]

176941

Finally we save it.

In [16]:
# Write the tags table to CSV; index=False leaves the row index out of the file.
tags_df.to_csv(path_or_buf="../data/tags.csv", index=False)

## Cleanup

In [17]:
# Dispose of the SQLAlchemy engine created in the setup section.
engine.dispose()

In [18]:
# Stop the local PostgreSQL service (counterpart of the start command at the
# top of the notebook).
!service postgresql stop