In [None]:
import json
from google import genai
from google.genai import types
from dotenv import load_dotenv
import os

# Build one metadata record (name, genres, description) per artist by prompting
# Gemini, then write all records to dataset/artist_metadata.json.
load_dotenv()

# NOTE(review): the genre cell further down reads "GEMINI_API" instead of
# "GEN_TOKEN" -- confirm which variable the .env file actually defines.
API_KEY = os.getenv("GEN_TOKEN")

client = genai.Client(api_key=API_KEY)

artists = [
"Taylor Swift", "Ed Sheeran", "Adele", "Billie Eilish", "Ariana Grande",
"Harry Styles", "Olivia Rodrigo", "Katy Perry", "Lady Gaga", "Rihanna",
"Hozier", "Sia", "Sam Smith", "Chappell Roan", "Halsey", "Billy Joel",
"Maroon 5", "Britney Spears", "Michael Jackson", "Madonna",
"Prince", "Whitney Houston", "George Michael", "Cher", "Elton John",
"Kendrick Lamar", "Drake", "Kanye West", "Travis Scott",
"Cardi B", "Nicki Minaj", "Tyler, The Creator", "Eminem",
"The Beatles", "Queen", "Led Zeppelin", "Pink Floyd", "The Rolling Stones",
"Nirvana", "Radiohead", "Coldplay", "The Killers", "Imagine Dragons",
"Linkin Park", "Green Day", "My Chemical Romance", "Fall Out Boy",
"Paramore", "The Smiths", "The Cure", "Depeche Mode",
"Joy Division", "New Order", "David Bowie", "Bruce Springsteen",
"ACDC", "Metallica", "Elliott Smith", "Panic! At The Disco", "Pulp",
"Boygenius", "Alex G", "Sufjan Stevens", "Big Thief", "Adrianne Lenker",
"Twenty One Pilots", "Placebo",
"Frank Ocean", "The Weeknd", "SZA", "Stevie Wonder", "Aretha Franklin",
"Bill Withers", "Ray Charles", "Etta James", "Diana Ross",
"Bob Dylan", "Leonard Cohen", "Joni Mitchell", "Simon & Garfunkel", "Carole King",
"Fleetwood Mac", "ABBA"]

genres = ['pop', 'rock', 'alternative', 'rap', 'soul']

# Maps artist name -> metadata record; every artist gets an entry, even on failure.
metadata = {}

for artist in artists:
    try:
        prompt = (
            f"Provide a short description of the artist '{artist}' including \n"
            f"the topics they explore in their music, styles they use, and "
            f"the genres they are known for (from this list: {', '.join(genres)}). "
            "Return only JSON with fields: name, genres (list), description."
        )

        response = client.models.generate_content(
            model="gemini-2.5-flash",
            contents=prompt,
            config=types.GenerateContentConfig(
                thinking_config=types.ThinkingConfig(thinking_budget=0)
            )
        )

        artist_data = response.text.strip()

        # Remove a surrounding Markdown code fence (```json ... ```) only when
        # one is really there. Dropping the first and last lines
        # unconditionally would corrupt unfenced JSON.
        response_lines = artist_data.split("\n")
        if (
            len(response_lines) >= 2
            and response_lines[0].startswith("```")
            and response_lines[-1].strip() == "```"
        ):
            artist_data = "\n".join(response_lines[1:-1]).strip()

        try:
            artist_json = json.loads(artist_data)
        except json.JSONDecodeError:
            artist_json = None

        # Fall back to a well-formed record when the reply is not valid JSON
        # or is valid JSON but not an object (e.g. a list or a bare string),
        # keeping the raw text as the description so nothing is lost.
        if not isinstance(artist_json, dict):
            artist_json = {
                "name": artist,
                "genres": [],
                "description": artist_data
            }

        metadata[artist] = artist_json

        print(f"Processed {artist}")

    except Exception as e:
        # Best effort: one failed request must not abort the whole batch.
        print(f"Error processing {artist}: {e}")
        metadata[artist] = {"name": artist, "genres": [], "description": ""}


# Create the output directory first so the collected results are not lost to a
# FileNotFoundError after all the API calls have already been made.
os.makedirs("dataset", exist_ok=True)
with open("dataset/artist_metadata.json", "w", encoding="utf-8") as f:
    json.dump(metadata, f, ensure_ascii=False, indent=2)

print(f"Metadata about {len(artists)} artists saved to dataset/artist_metadata.json")

Processed Diana Ross
Processed Bob Dylan
Processed Leonard Cohen
Processed Joni Mitchell
Processed Simon & Garfunkel
Processed Carole King
Processed Fleetwood Mac
Processed ABBA
Metadata about 8 artists saved to dataset/artist_metadata.json


In [10]:
import json
from google import genai
from google.genai import types
from dotenv import load_dotenv
import os

# Build one metadata record (genre, description) per genre by prompting Gemini,
# then write all records to dataset/genre_metadata.json.
load_dotenv()

# NOTE(review): the artist cell above reads "GEN_TOKEN" instead of
# "GEMINI_API" -- confirm which variable the .env file actually defines.
API_KEY = os.getenv("GEMINI_API")
client = genai.Client(api_key=API_KEY)

genres = ['pop', 'rock', 'alternative', 'rap', 'soul']

# Maps genre name -> metadata record; every genre gets an entry, even on failure.
metadata = {}

for genre in genres:
    try:
        # A space was added before "Return only JSON" so the two sentences no
        # longer run together in the prompt.
        prompt = (
            f"Provide a short description of the genre '{genre}' including \n"
            f"the topics that are common for this genre, styles of writing, mood this genre gives, and themes it explores. "
            "Return only JSON with fields: genre, description."
        )

        response = client.models.generate_content(
            model="gemini-2.5-flash",
            contents=prompt,
            config=types.GenerateContentConfig(
                thinking_config=types.ThinkingConfig(thinking_budget=0)
            )
        )

        genre_data = response.text.strip()

        # Remove a surrounding Markdown code fence (```json ... ```) only when
        # one is really there. Dropping the first and last lines
        # unconditionally would corrupt unfenced JSON.
        response_lines = genre_data.split("\n")
        if (
            len(response_lines) >= 2
            and response_lines[0].startswith("```")
            and response_lines[-1].strip() == "```"
        ):
            genre_data = "\n".join(response_lines[1:-1]).strip()

        try:
            genre_json = json.loads(genre_data)
        except json.JSONDecodeError:
            genre_json = None

        # Fall back to a well-formed record when the reply is not valid JSON
        # or is valid JSON but not an object (e.g. a list or a bare string),
        # keeping the raw text as the description so nothing is lost.
        if not isinstance(genre_json, dict):
            genre_json = {
                "genre": genre,
                "description": genre_data
            }

        metadata[genre] = genre_json

        print(f"Processed {genre}")

    except Exception as e:
        # Best effort: one failed request must not abort the whole batch.
        print(f"Error processing {genre}: {e}")
        metadata[genre] = {"genre": genre, "description": ""}

# Create the output directory first so the collected results are not lost to a
# FileNotFoundError after all the API calls have already been made.
os.makedirs("dataset", exist_ok=True)
with open("dataset/genre_metadata.json", "w", encoding="utf-8") as f:
    json.dump(metadata, f, ensure_ascii=False, indent=2)

print(f"Metadata about {len(genres)} genres saved to dataset/genre_metadata.json")

Processed pop
Processed rock
Processed alternative
Processed rap
Processed soul
Metadata about 5 genres saved to dataset/genre_metadata.json


In [23]:
# Attach the artist and genre descriptions to every song record, then write the
# enriched records back over dataset/cleaned_songs_dataset.json.
def _description_for(records, key):
    """Return records[key]['description'], or '' when any part is missing.

    Covers an unknown key, a record that is not a dict, and a record without
    a 'description' field, so one incomplete entry cannot abort the merge.
    """
    record = records.get(key)
    if isinstance(record, dict):
        return record.get('description', '')
    return ''


with open('dataset/cleaned_songs_dataset.json', 'r', encoding='utf-8') as f:
    songs = json.load(f)

with open('dataset/artist_metadata.json', 'r', encoding='utf-8') as f:
    artists = json.load(f)

with open('dataset/genre_metadata.json', 'r', encoding='utf-8') as f:
    genres = json.load(f)

for song in songs:
    # song.get(...) tolerates songs lacking an 'artist' or 'genre' key; the
    # lookup then simply yields an empty description.
    song['artist_metadata'] = _description_for(artists, song.get('artist'))
    song['genre_metadata'] = _description_for(genres, song.get('genre'))

# NOTE: this overwrites the input file; re-running the cell recomputes the same
# two fields from the metadata files.
with open('dataset/cleaned_songs_dataset.json', 'w', encoding='utf-8') as f:
    json.dump(songs, f, ensure_ascii=False, indent=2)