# Tags

By Alejandro Fernández Sánchez

## Setting up the connection

In [1]:
# Just in case you're the host and it's not already started
!service postgresql start

In [2]:
# Imports
import pandas as pd
from sqlalchemy import create_engine
import os
from dotenv import load_dotenv
import json
load_dotenv()

True

In [3]:
DB_NAME = os.getenv("DB_NAME")
DB_HOST = os.getenv("DB_HOST")
DB_USER = os.getenv("DB_USER")
DB_PASS = os.getenv("DB_PASS")
DB_PORT = os.getenv("DB_PORT")

In [4]:
# Used for saving results to pandas dataframes
engine_url = f"postgresql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(engine_url)
engine

Engine(postgresql://musicbrainz:***@localhost:5432/musicbrainz_db)

## Tags

We'll start by getting the ID of all the tags that we need. For that I'll import the data that we have in out current dataset regarding tags.

In [5]:
artists_tags = pd.read_csv("../data/artist_tags.csv", dtype=str)["tags"].dropna(axis=0)
artists_tags.tolist()

['1, 7, 11, 12, 20, 57, 58, 171, 237, 280, 402, 559, 1391, 1431, 1498, 1753, 1942, 4667, 32091, 33762, 92312, 119407, 255911',
 '11, 71, 92, 171, 237, 349, 1055, 1072, 1391',
 '98, 121, 379, 72115',
 '10, 11, 12, 58, 77, 559, 709, 1282, 1302, 1498, 1516, 1600, 71973',
 '111, 1661',
 '71, 111, 1053, 6443',
 '20, 111, 304, 1181, 8363',
 '14, 72, 1600, 4754',
 '111, 47664',
 '7, 71, 75, 111, 127, 186, 304, 343, 712, 719, 987, 1044, 1045, 1046, 1047, 1048, 1049, 1091, 1120, 1131, 1276, 4663, 7264, 25577, 26046, 29056, 33505, 40695, 41014, 53766, 70272, 252516, 253611',
 '7, 19, 29, 171, 237, 782, 987, 1044, 1045, 1046, 1056, 1063, 1091, 1391, 4663, 31341, 49112, 49197, 252805',
 '11, 171, 237, 628, 1391, 4663, 34450, 49112, 101582, 115617, 121083',
 '7, 20, 171, 175, 237, 271, 783, 1149, 1275, 1391, 45785',
 '7, 71, 75, 111, 127, 166, 186, 343, 712, 721, 1063, 1142, 1245, 1255, 4663, 40695, 41911, 42172, 47664, 80788',
 '7, 20, 284, 662, 1012, 1275, 4663, 49109, 255911',
 '7, 20, 133, 339,

In [None]:
tracks_tags = pd.read_csv("../data/tracks_no_va_merged_id.csv", dtype=str)["tags"].dropna(axis=0)
tracks_tags.tolist()[:10]

We'll now get a list of the tags IDs without repetitions.

In [None]:
tags_set = set(tag for tag_list in artists_tags for tag in tag_list.split(", ")).union(tag for tag_list in tracks_tags for tag in tag_list.split(", "))
len(tags_set)

It's now the time to get the tag information.

In [None]:
query = f"""
    SELECT id, name
    FROM tag
    WHERE id IN ({", ".join(tags_set)});
"""
tags_df = pd.read_sql_query(query, engine_url)
tags_df

I'll just do some tests before serializing it.

In [None]:
tags_df[tags_df["name"].str.contains("rock")]

In [None]:
tags_df[tags_df["name"].str.contains("pop")]

In [None]:
tags_df[tags_df["name"].str.contains("metal")]

In [None]:
tags_df[tags_df["name"].str.contains("pop-rock")]

In [None]:
len(tags_df)

In [None]:
tags_df.drop_duplicates(subset="name", keep="first", inplace=True)

In [None]:
len(tags_df)

Finally we save it.

In [None]:
tags_df.to_csv("../data/tags.csv", index=False)

## Cleanup

In [None]:
engine.dispose()

In [None]:
!service postgresql stop