In [19]:
import random
import json
from collections import Counter
from itertools import chain
import unidecode
from pathlib import Path
import sys
from tqdm import tqdm
import pandas as pd
import re
import os

In [9]:
# Agrega la ra√≠z del proyecto al path
root = Path(__file__).resolve().parent.parent if '__file__' in globals() else Path.cwd().parent
sys.path.append(str(root))

# Ahora s√≠ puedes importar
from utils.tag_equivalences import unify_tag

In [10]:
data_path = Path("../data/albums_full.json")

In [11]:
# Cargar JSON
with open(data_path, "r", encoding="utf-8") as f:
    albums = json.load(f)

print(f"‚úÖ JSON cargado correctamente. Total de √°lbumes: {len(albums):,}")

‚úÖ JSON cargado correctamente. Total de √°lbumes: 1,884


In [12]:
for a in albums[:3]:
    print(a.get("artist"), "-", a.get("title"))
    print("Tags:", a.get("tags"))
    print()

{'name': 'ostatni kontynent', 'url': 'https://ostatnikontynent.bandcamp.com', 'location': 'Wroc≈Çaw, Poland', 'description': 'wroc≈Çaw-based emo two piece\n\ngabry≈õ - git/vox\n gwen - drums\n\npizza emo nerds 4 ever', 'imageUrl': 'https://f4.bcbits.com/img/0041279214_28.jpg', 'label': None} - demo
Tags: ['alternative', 'emo', 'lofi emo', 'midwest emo', 'screamo', 'skramz', 'Wroc≈Çaw']

{'name': 'sadness, abriction', 'url': 'https://sadnessmusic.bandcamp.com', 'location': None, 'description': 'patreon.com/user?u=2974746', 'imageUrl': 'https://f4.bcbits.com/img/0031872051_28.jpg', 'label': None} - that lasts forever
Tags: ['blackgaze', 'metal', 'emo', 'post-rock', 'shoegaze']

{'name': 'Thought Bubble', 'url': 'https://bubble.bandcamp.com', 'location': 'England, UK', 'description': 'Electronica, analogue percussion and a voice.\n\nNo set style or genre. The music comes out in whatever form it wants.\n\nSometimes we invite friends to add elements.\n\n____________\n\n\n A couple of us wer

In [15]:
df = pd.DataFrame([
    {
        "artist": a.get("artist", ""),
        "album": a.get("title", ""),
        "tags": "; ".join(a.get("tags", [])),
    }
    for a in albums
])

print("‚úÖ DataFrame creado correctamente")
df.head(10)

‚úÖ DataFrame creado correctamente


Unnamed: 0,artist,album,tags
0,"{'name': 'ostatni kontynent', 'url': 'https://...",demo,alternative; emo; lofi emo; midwest emo; screa...
1,"{'name': 'sadness, abriction', 'url': 'https:/...",that lasts forever,blackgaze; metal; emo; post-rock; shoegaze
2,"{'name': 'Thought Bubble', 'url': 'https://bub...",A Made Up World EP,alternative; soundtrack music; ambient electro...
3,"{'name': 'Caratacus', 'url': 'https://caratacu...",The Celtic Sessions Vol. 1,celtic; rock; soundtrack; soundtrack music; ce...
4,"{'name': 'Windir', 'url': 'https://windirband....",1184,metal; black metal; metal; viking metal; Norway
5,"{'name': 'MASTER BOOT RECORD', 'url': 'https:/...",HARDWAREZ,alternative; black metal; death metal; metal; ...
6,"{'name': 'THORN', 'url': 'https://sligm.bandca...",COALESCENCE - [2020-2025 Thorn Archives],death metal; metal; death; deathgrind; grind; ...
7,"{'name': 'After', 'url': 'https://after95.band...",After EP,2000s; pop; downtempo; dream pop; drempop; fru...
8,"{'name': 'Makaya McCraven', 'url': 'https://in...",Techno Logic (featuring Theon Cross & Ben LaMa...,berlin; diddley bow; electronic; improvisation...
9,"{'name': 'Wormhole World', 'url': 'https://wor...",A Wormhole Xmas 2025,experimental; ambient; asmr; avant-garde; comp...


In [17]:
all_tags = list(chain.from_iterable(a["tags"] for a in albums))
tag_counts = Counter(all_tags)

df_tags = pd.DataFrame(tag_counts.items(), columns=["tag", "count"])\
    .sort_values("count", ascending=False)

print(f"üîç Total de tags √∫nicos: {len(df_tags):,}")
df_tags.head(20)

üîç Total de tags √∫nicos: 4,123


Unnamed: 0,tag,count
35,electronic,648
8,metal,349
68,ambient,312
36,experimental,280
0,alternative,265
17,rock,249
288,punk,150
26,black metal,146
59,jazz,145
276,techno,136


In [24]:
from utils.tag_constants import INVALID_TAGS, GEO_TAGS

In [29]:
invalid_set = set(tag.lower().strip() for tag in INVALID_TAGS)
geo_set = set(tag.lower().strip() for tag in GEO_TAGS)

removed_tags = []

for album in tqdm(albums, desc="üßπ Normalizando tags"):
    tags = album.get("tags", [])

    clean_tags = []
    for t in tags:
        t_lower = t.lower().strip()

        # Si el tag es inv√°lido o geogr√°fico ‚Üí registrarlo como eliminado
        if t_lower in invalid_set or t_lower in geo_set:
            removed_tags.append(t_lower)
            continue

        # Si es v√°lido ‚Üí guardarlo limpio
        clean_tags.append(t_lower)

    # Guardar las nuevas tags limpias en el √°lbum
    album["clean_tags"] = clean_tags

üßπ Normalizando tags: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1884/1884 [00:00<00:00, 417127.78it/s]


In [30]:
counter_removed = Counter(removed_tags)
df_removed = pd.DataFrame(counter_removed.most_common(), columns=["tag", "count"])
df_removed.to_csv("../data/removed_tags_report.csv", index=False, encoding="utf-8")



In [31]:
output_path = Path("../data/albums_clean.json")

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(albums, f, ensure_ascii=False, indent=2)

print(f"üíæ Archivo guardado: {output_path}")

üíæ Archivo guardado: ..\data\albums_clean.json
