In [7]:
import lyricsgenius
import json
import time
from typing import List, Dict
from dotenv import load_dotenv
import os

In [33]:
class LyricsCollector:
    """Collects song lyrics and basic metadata through a ``lyricsgenius.Genius`` client.

    The client is created in ``__init__`` and tuned through its attributes
    (``remove_section_headers``, ``skip_non_songs``, ``excluded_terms``,
    ``timeout``, ``retries``). The helpers below fetch an artist's songs as
    plain dicts and persist / reload them as UTF-8 JSON.
    """

    def __init__(self, api_key: str):
        """Create the Genius client with ``api_key`` and apply the collection settings."""
        self.genius = lyricsgenius.Genius(api_key)
        self.genius.remove_section_headers = True
        self.genius.skip_non_songs = True
        self.genius.excluded_terms = ["Remix", "Live", "Demo"]
        self.genius.timeout = 15
        self.genius.retries = 3

    @staticmethod
    def _extract_release_date(song):
        """Return the album's ``release_date_for_display`` for ``song``, or ``None``.

        ``song.album`` may be missing, falsy, a dict, or an object exposing the
        value as an attribute; every case that lacks the value yields ``None``.
        """
        album = getattr(song, 'album', None)
        if not album:
            return None
        if isinstance(album, dict):
            return album.get('release_date_for_display')
        return getattr(album, 'release_date_for_display', None)

    def get_artist_songs(self, artist_name: str, max_songs: int = 50) -> List[Dict]:
        """Fetch up to ``max_songs`` songs for ``artist_name``, sorted by popularity.

        Args:
            artist_name: Name passed to ``Genius.search_artist``; it is also stored
                verbatim as the ``'artist'`` value of every returned record.
            max_songs: Upper bound forwarded to ``Genius.search_artist``.

        Returns:
            A list of dicts with the keys ``artist``, ``title``, ``lyrics``,
            ``url`` and ``release_date``. The list is empty when the artist is
            not found or when any exception is raised while fetching.
        """
        print(f"Fetching songs for {artist_name}...")

        try:
            artist = self.genius.search_artist(
                artist_name,
                max_songs=max_songs,
                sort="popularity"
            )

            if artist is None:
                print(f"Artist {artist_name} not found")
                return []

            songs_data = []
            for song in artist.songs:
                # The release date is resolved per song. It must never carry over
                # from a previous iteration or be read before it is assigned.
                songs_data.append({
                    'artist': artist_name,
                    'title': song.title,
                    'lyrics': song.lyrics,
                    'url': song.url,
                    'release_date': self._extract_release_date(song)
                })

            print(f"Retrieved {len(songs_data)} songs for {artist_name}")
            return songs_data

        except Exception as e:
            # Deliberate best-effort: one failing artist must not abort a bulk run.
            print(f"Error fetching songs for {artist_name}: {e}")
            return []

    def save_songs_to_file(self, songs_data: List[Dict], filename: str):
        """Write ``songs_data`` to ``filename`` as indented UTF-8 JSON.

        The parent directory of ``filename`` is created when it does not exist.
        """
        parent_dir = os.path.dirname(filename)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(songs_data, f, indent=2, ensure_ascii=False)
        print(f"Saved {len(songs_data)} songs to {filename}")

    def load_songs_from_file(self, filename: str) -> List[Dict]:
        """Load and return the JSON content of ``filename`` (read as UTF-8)."""
        with open(filename, 'r', encoding='utf-8') as f:
            return json.load(f)

In [46]:
# Artists to collect lyrics for. The genre groups are concatenated in order
# into the single flat `artists` list.
artists = (
    # Pop & Mainstream
    [
        "Taylor Swift", "Ed Sheeran", "Adele", "Billie Eilish", "Ariana Grande",
        "Harry Styles", "Olivia Rodrigo", "Katy Perry", "Lady Gaga", "Rihanna",
        "Hozier", "Sia", "Sam Smith", "Chappell Roan", "Halsey",
        "Billy Joel", "Maroon 5", "Britney Spears", "Michael Jackson", "Madonna",
        "Prince", "Whitney Houston", "George Michael", "Cher", "Elton John",
    ]
    # Hip-Hop & Rap
    + [
        "Kendrick Lamar", "Drake", "Kanye West", "Travis Scott",
        "Cardi B", "Nicki Minaj", "Tyler, The Creator", "Eminem",
    ]
    # Rock & Alternative
    + [
        "The Beatles", "Queen", "Led Zeppelin", "Pink Floyd", "The Rolling Stones",
        "Nirvana", "Radiohead", "Coldplay", "The Killers", "Imagine Dragons",
        "Linkin Park", "Green Day", "My Chemical Romance", "Fall Out Boy",
        "Paramore", "The Smiths", "The Cure", "Depeche Mode",
        "Joy Division", "New Order", "David Bowie", "Bruce Springsteen",
        "ACDC", "Metallica", "Elliott Smith", "Panic! At The Disco", "Pulp",
        "Boygenius", "Alex G", "Sufjan Stevens", "Big Thief", "Adrianne Lenker",
        "Twenty One Pilots", "Placebo",
    ]
    # R&B & Soul
    + [
        "Frank Ocean", "The Weeknd", "SZA", "Stevie Wonder", "Aretha Franklin",
        "Bill Withers", "Ray Charles", "Etta James", "Diana Ross",
    ]
    # Legendary & Classic
    + [
        "Bob Dylan", "Leonard Cohen", "Joni Mitchell", "Simon & Garfunkel", "Carole King",
        "Fleetwood Mac", "ABBA",
    ]
)

print(f'Total nuber of musicians: {len(artists)}')

Total nuber of musicians: 83


In [45]:
def create_dataset(max_songs: int = 25, delay: float = 2, output_dir: str = "dataset/individual"):
    """Fetch songs for every entry in the module-level ``artists`` list and save them.

    The Genius token is read from the ``GEN_TOKEN`` environment variable after
    ``load_dotenv()`` has run. Each artist with at least one retrieved song is
    written to ``<output_dir>/<artist lower-cased, spaces -> underscores>.json``.

    Args:
        max_songs: Maximum number of songs requested per artist.
        delay: Seconds to sleep after each artist, between consecutive requests.
        output_dir: Directory for the per-artist JSON files; created if missing.

    Returns:
        The list of every song record collected across all artists.
    """
    load_dotenv()

    API_KEY = os.getenv("GEN_TOKEN")
    collector = LyricsCollector(API_KEY)

    # Create the target directory up front so the first save cannot fail on a
    # fresh checkout after the network work has already been done.
    os.makedirs(output_dir, exist_ok=True)

    all_songs = []

    for artist in artists:
        songs = collector.get_artist_songs(artist, max_songs=max_songs)
        all_songs.extend(songs)

        if songs:
            filename = f"{output_dir}/{artist.lower().replace(' ', '_')}.json"
            collector.save_songs_to_file(songs, filename)

        time.sleep(delay)

    # Previously the accumulated list was discarded; return it so callers can
    # use the in-memory result without re-reading the files.
    return all_songs

In [53]:
import json
import glob

# Merge every per-artist JSON file into one combined dataset file.
input_dir = "dataset/individual/*.json"
output_file = "dataset/songs_dataset.json"

# glob returns matches in arbitrary order; sort them so the combined dataset
# has the same record order on every run. Glob once and reuse the list so the
# reported file count matches the files actually read.
input_files = sorted(glob.glob(input_dir))

all_songs = []
for filename in input_files:
    with open(filename, 'r', encoding='utf-8') as f:
        all_songs.extend(json.load(f))

with open(output_file, 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII characters readable, consistent with
    # how the per-artist files and the cleaned dataset are written.
    json.dump(all_songs, f, indent=2, ensure_ascii=False)

print(f"Combined {len(all_songs)} songs from {len(input_files)} files")

Combined 2075 songs from 83 files


In [1]:
import re
import json

# Artist names grouped by the genre label assigned during cleaning.
# Names are matched against each song's 'artist' value exactly.

# -> genre 'pop'
pop = [
    "Taylor Swift", "Ed Sheeran", "Adele", "Billie Eilish",
    "Ariana Grande", "Harry Styles", "Olivia Rodrigo", "Katy Perry",
    "Lady Gaga", "Rihanna", "Hozier", "Sia",
    "Sam Smith", "Chappell Roan", "Halsey", "Billy Joel",
    "Maroon 5", "Britney Spears", "Michael Jackson", "Madonna",
    "Prince", "Whitney Houston", "George Michael", "Cher",
    "Elton John", "ABBA",
]

# -> genre 'rock'
rock = [
    "The Beatles", "Queen", "Led Zeppelin", "Pink Floyd",
    "The Rolling Stones", "Nirvana", "Linkin Park", "Green Day",
    "My Chemical Romance", "Fall Out Boy", "Paramore", "The Cure",
    "Depeche Mode", "Joy Division", "New Order", "David Bowie",
    "Bruce Springsteen", "ACDC", "Metallica", "Fleetwood Mac",
]

# -> genre 'rap'
rap = [
    "Kendrick Lamar", "Drake", "Kanye West", "Travis Scott",
    "Cardi B", "Nicki Minaj", "Tyler, The Creator", "Eminem",
]

# -> genre 'soul'
soul = [
    "Frank Ocean", "The Weeknd", "SZA",
    "Stevie Wonder", "Aretha Franklin", "Bill Withers",
    "Ray Charles", "Etta James", "Diana Ross",
]

# -> genre 'alternative'
alt = [
    "Radiohead", "Coldplay", "The Killers", "Imagine Dragons",
    "Boygenius", "Alex G", "Sufjan Stevens", "Big Thief",
    "Adrianne Lenker", "Elliott Smith", "Twenty One Pilots", "Placebo",
    "The Smiths", "Panic! At The Disco", "Pulp", "Bob Dylan",
    "Leonard Cohen", "Joni Mitchell", "Simon & Garfunkel", "Carole King",
]

def clean_brackets(text: str) -> str:
    """Strip bracketed annotations from lyrics text and tidy the whitespace.

    The substitutions run strictly in the order listed:
      1. drop ``*...*``, ``{...}`` and ``[...]`` spans (non-greedy, may cross lines);
      2. drop empty ``()`` pairs;
      3. join a newline that directly follows ``(`` or directly precedes ``)``;
      4. collapse any whitespace run containing two or more newlines into
         exactly one blank line.

    Returns the result with leading and trailing whitespace removed.
    """
    # (pattern, replacement, flags)
    substitutions = (
        (r'\*.*?\*', '', re.DOTALL),
        (r'\{.*?\}', '', re.DOTALL),
        (r'\[.*?\]', '', re.DOTALL),
        (r'\(\)', '', 0),
        (r'\(\n', '(', 0),
        (r'\n\)', ')', 0),
        (r'\n\s*\n', '\n\n', 0),
    )

    cleaned = text
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

    return cleaned.strip()

def _year_from_release_date(release_date):
    """Return the year in ``release_date`` as an int, or ``None`` if none can be read."""
    if not release_date:
        return None

    date_text = str(release_date)
    try:
        # Usual case: the last space-separated token is the year.
        return int(date_text.split(" ")[-1])
    except ValueError:
        # Fall back to the first 4-digit run rather than aborting the whole pass.
        match = re.search(r'\d{4}', date_text)
        return int(match.group()) if match else None


def clean_entire_dataset(input_file: str, output_file: str = "dataset/cleaned_songs_dataset.json"):
    """Clean every song record in ``input_file`` and write the result to ``output_file``.

    Each record is shallow-copied and then normalized so that all output
    records share one schema:
      * ``genre``  - label looked up from the module-level ``pop`` / ``rock`` /
        ``rap`` / ``soul`` / ``alt`` lists by exact artist name; ``None`` when the
        artist is in none of them (an existing ``genre`` value is then kept).
      * ``year``   - int parsed from ``release_date``; ``None`` when the date is
        missing or unparseable (an existing ``year`` value is then kept).
      * ``release_date`` and ``url`` are always removed.
      * ``lyrics`` - list of stanzas: the text is passed through
        ``clean_brackets`` and split on blank lines; an empty list when there
        are no lyrics or nothing is left after cleaning.

    Args:
        input_file: Path to a UTF-8 JSON file holding a list of song dicts.
        output_file: Path of the UTF-8 JSON file to write.

    Returns:
        The list of cleaned song dicts that was written.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        songs = json.load(f)

    print(f"Loaded {len(songs)} songs for cleaning")

    # Build the artist -> genre lookup once. setdefault keeps the first label an
    # artist is given, i.e. the same precedence as checking pop, rock, rap,
    # soul, alt in that order.
    genre_by_artist = {}
    for label, members in (('pop', pop), ('rock', rock), ('rap', rap),
                           ('soul', soul), ('alternative', alt)):
        for name in members:
            genre_by_artist.setdefault(name, label)

    cleaned_songs = []

    for song in songs:
        cleaned_song = song.copy()

        artist = cleaned_song.get('artist')
        genre = genre_by_artist.get(artist) if isinstance(artist, str) else None
        if genre is not None:
            cleaned_song['genre'] = genre
        else:
            cleaned_song.setdefault('genre', None)

        # pop() tolerates records without the key; the raw date never reaches the output.
        year = _year_from_release_date(cleaned_song.pop('release_date', None))
        if year is not None:
            cleaned_song['year'] = year
        else:
            cleaned_song.setdefault('year', None)

        cleaned_song.pop('url', None)

        raw_lyrics = cleaned_song.get('lyrics')
        if raw_lyrics:
            # clean_brackets already collapses blank lines and strips the text,
            # so its output can be split into stanzas directly.
            stanzas = clean_brackets(raw_lyrics).split("\n\n")
            cleaned_song['lyrics'] = [stanza for stanza in stanzas if stanza.strip()]
        else:
            cleaned_song['lyrics'] = []

        cleaned_songs.append(cleaned_song)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(cleaned_songs, f, indent=2, ensure_ascii=False)

    print(f"Saved {len(cleaned_songs)} cleaned songs to {output_file}")
    return cleaned_songs

# Clean the combined dataset; the result is written to the function's default output path.
cleaned_songs = clean_entire_dataset(input_file="dataset/songs_dataset.json")

Loaded 2075 songs for cleaning
Saved 2075 cleaned songs to dataset/cleaned_songs_dataset.json
