In [5]:
import json
import time
import requests
from bs4 import BeautifulSoup
import lyricsgenius
import os
import sqlite3

In [2]:
# Genius API access token.
# It is read from the environment instead of being hard-coded so that the
# credential is never stored in (or shared with) the notebook file.
# NOTE(review): a token that was previously committed in this cell should be
# treated as leaked and regenerated in the Genius API client settings.
CLIENT_ACCESS_TOKEN = os.environ.get('GENIUS_ACCESS_TOKEN')
if not CLIENT_ACCESS_TOKEN:
    raise RuntimeError(
        "Set the GENIUS_ACCESS_TOKEN environment variable to your Genius API "
        "client access token before running this notebook."
    )

# Initialise the Genius API client used by the helper functions below.
genius = lyricsgenius.Genius(CLIENT_ACCESS_TOKEN, timeout=20, sleep_time=0.2, retries=3)

def get_all_artists(timeout=20):
    """Scrape artist names from the Genius artists index, one page per letter a-z.

    Args:
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout a stalled connection would block the notebook
            indefinitely.

    Returns:
        list[str]: The text of every matching link, in page order. Names are
        collected by two selectors (links with the
        ``artists_index_list-artist_name`` class and all links inside
        ``ul.artists_index_list``); the result is not deduplicated.

    Raises:
        requests.exceptions.RequestException: If a request times out, fails,
            or returns an HTTP error status. Failing loudly prevents an error
            page from being parsed as an (empty) artist list and then saved
            as if it were a complete result.
    """
    artist_names = []
    base_artist_url = "https://genius.com/artists-index/{}"

    for letter in 'abcdefghijklmnopqrstuvwxyz':
        url = base_artist_url.format(letter)
        response = requests.get(url, timeout=timeout)
        # Stop on 4xx/5xx instead of silently scraping an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Links that carry the dedicated artist-name class.
        for a in soup.find_all('a', class_='artists_index_list-artist_name'):
            artist_names.append(a.get_text())

        # Every link inside the plain index lists.
        for ul in soup.find_all('ul', class_='artists_index_list'):
            for a in ul.find_all('a'):
                artist_names.append(a.get_text())

    return artist_names

def get_artist_id(artist_name):
    """Look up the Genius ID of the artist whose name equals `artist_name`.

    The search is run through the module-level `genius` client. Only the hits
    of the first result section are inspected, and names are compared
    case-insensitively.

    Returns:
        The ID of the first exactly-matching hit, or None when the response
        has no sections or no hit matches.
    """
    search_results = genius.search_artists(artist_name)

    # Nothing to inspect when the response carries no result sections.
    if 'sections' not in search_results or len(search_results['sections']) == 0:
        return None

    first_section_hits = search_results['sections'][0]['hits']
    matching_ids = (
        hit['result']['id']
        for hit in first_section_hits
        if hit['result']['name'].lower() == artist_name.lower()
    )
    return next(matching_ids, None)

def get_artist_songs(artist_id, max_pages=1):
    """Collect song IDs for an artist via the module-level `genius` client.

    Songs are requested 50 per page, sorted by title.

    Args:
        artist_id: Genius artist ID.
        max_pages: Maximum number of result pages to fetch. The default of 1
            fetches only the first page (at most 50 songs). Pass ``None`` to
            keep following the response's ``next_page`` value until it is
            empty, i.e. to fetch the artist's complete song list.

    Returns:
        list: The ``id`` of every song on the fetched pages, in response order.
    """
    song_ids = []
    page = 1
    pages_fetched = 0

    # `page` becomes falsy once the API reports no further page.
    while page and (max_pages is None or pages_fetched < max_pages):
        response = genius.artist_songs(artist_id, per_page=50, sort='title', page=page)
        song_ids.extend(song['id'] for song in response['songs'])
        page = response.get('next_page')
        pages_fetched += 1

    return song_ids


def contains_problematic_chars(text):
    """Return True if `text` contains any of the characters treated as problematic.

    The blocked characters are the zero-width space (U+200B) and the
    lowercase German letters U+00E4, U+00F6, U+00FC and U+00DF.
    """
    for blocked_char in ("\u200b", "\u00e4", "\u00f6", "\u00fc", "\u00df"):
        if blocked_char in text:
            return True
    return False


def get_song_details(song_id, max_retries=5, retry_delay=2):
    """Fetch metadata and lyrics for one song via the module-level `genius` client.

    Args:
        song_id: Genius song ID.
        max_retries: How many times a request timeout is tolerated before
            giving up on the song.
        retry_delay: Seconds to wait between attempts after a timeout.

    Returns:
        dict | None: A dictionary with the keys ``song_id``, ``title``,
        ``artist``, ``release_date``, ``lyrics``, ``album`` and
        ``popularity``. Returns None when the title or artist contains a
        problematic character, when any non-timeout error occurs, or when
        every attempt timed out.

        Missing *or empty* optional fields fall back to placeholders:
        ``release_date`` -> "Unknown", ``album`` -> "Single",
        ``popularity`` -> "Unknown", ``lyrics`` -> "Lyrics not available".
    """
    attempts = 0
    while attempts < max_retries:
        try:
            song = genius.song(song_id)['song']

            # Required fields: a missing one is an error and is reported below.
            title = song['title']
            artist = song['primary_artist']['name']

            # Optional fields. `.get(...)` covers both an absent key and a
            # present-but-null value, so one incomplete field no longer
            # discards the whole song or leaks a null into the result.
            release_date = song.get('release_date') or "Unknown"
            album_info = song.get('album')
            album = album_info['name'] if album_info else "Single"
            popularity = (song.get('stats') or {}).get('pageviews', "Unknown")

            # Skip songs whose title or artist contains a blocked character;
            # this happens before the (extra) lyrics request is made.
            if contains_problematic_chars(title) or contains_problematic_chars(artist):
                print(f"Skipping song ID {song_id} due to problematic characters in title or artist.")
                return None

            # Fetch the lyrics and substitute a placeholder for empty results.
            lyrics = genius.lyrics(song_id)
            if not lyrics or lyrics.strip() == "":
                lyrics = "Lyrics not available"

            return {
                'song_id': song_id,
                'title': title,
                'artist': artist,
                'release_date': release_date,
                'lyrics': lyrics,
                'album': album,
                'popularity': popularity
            }
        except requests.exceptions.Timeout:
            attempts += 1
            print(f"Timeout occurred. Retrying... ({attempts}/{max_retries})")
            # No point in waiting when there is no attempt left.
            if attempts < max_retries:
                time.sleep(retry_delay)
        except Exception as e:
            # Deliberate best effort: report the problem and move on to the next song.
            print(f"Error getting details for song ID {song_id}: {e}")
            return None

    # Every attempt timed out; say so instead of returning silently.
    print(f"Giving up on song ID {song_id} after {max_retries} timeouts.")
    return None



def save_checkpoint(data, filename):
    """Serialize `data` as JSON to `filename` without ever leaving a partial file.

    The JSON is first written to ``<filename>.tmp`` and then moved over the
    target with ``os.replace``. If the write is interrupted (for example by a
    KeyboardInterrupt while a long scraping loop is running) or `data` cannot
    be serialized, the previous checkpoint stays intact instead of being
    truncated.

    Args:
        data: Any JSON-serializable object.
        filename: Path of the checkpoint file to create or replace.

    Raises:
        TypeError: If `data` is not JSON-serializable.
        OSError: If the file cannot be written or replaced.
    """
    tmp_filename = f"{filename}.tmp"
    try:
        with open(tmp_filename, 'w') as f:
            json.dump(data, f)
            # Push the bytes to disk before the temp file replaces the target.
            f.flush()
            os.fsync(f.fileno())
        os.replace(tmp_filename, filename)
    except BaseException:
        # BaseException so that KeyboardInterrupt also cleans up the temp file.
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)
        raise

def load_checkpoint(filename):
    """Return the JSON content of `filename`, or None if that path does not exist."""
    if not os.path.exists(filename):
        return None

    with open(filename, 'r') as checkpoint_file:
        return json.loads(checkpoint_file.read())

In [24]:
# Step 1: get the artist names, reusing the saved checkpoint when there is one
artist_names_checkpoint = load_checkpoint('artist_names.json')
if artist_names_checkpoint is not None:
    artist_names = artist_names_checkpoint
else:
    # No checkpoint yet: scrape the names once and persist them for later runs.
    artist_names = get_all_artists()
    save_checkpoint(artist_names, 'artist_names.json')

# Step 2: resolve artist IDs and collect their song IDs, resuming from the checkpoint if present
song_ids_list_checkpoint = load_checkpoint('song_ids_list.json')
if song_ids_list_checkpoint is not None:
    song_ids_list = song_ids_list_checkpoint['song_ids']
    processed_artists = song_ids_list_checkpoint['processed_artists']
else:
    song_ids_list, processed_artists = [], 0

# Continue with the first artist that has not been processed yet; `index`
# stays aligned with the position in the full `artist_names` list.
for index, artist_name in enumerate(artist_names[processed_artists:], start=processed_artists):
    artist_id = get_artist_id(artist_name)
    if artist_id:
        song_ids = get_artist_songs(artist_id)
        song_ids_list += song_ids

    # Save a checkpoint after every artist so an interrupted run can resume here.
    save_checkpoint(
        {'song_ids': song_ids_list, 'processed_artists': index + 1},
        'song_ids_list.json',
    )


KeyboardInterrupt: 

In [3]:
# Load the existing JSON file with the song IDs and drop duplicate IDs.
# NOTE(review): the resume logic of the following cell indexes into this list,
# so it relies on this cell producing the same order on every run — confirm
# that still holds if 'song_ids_list.json' grows between runs.
filename = 'song_ids_list.json'
data = load_checkpoint(filename)
unique_song_ids = [*set(data['song_ids'])]

In [4]:
# Load the song-details checkpoint, or start from scratch when there is none
details_checkpoint = load_checkpoint('song_details_checkpoint.json')
if details_checkpoint is not None:
    song_details_list = details_checkpoint['song_details']
    processed_songs = details_checkpoint['processed_songs']
else:
    song_details_list, processed_songs = [], 0

# Fetch the details of every song that has not been processed yet
for index, song_id in enumerate(unique_song_ids[processed_songs:], start=processed_songs):
    details = get_song_details(song_id)
    if details:
        song_details_list += [details]

    # Save a checkpoint after every song (skipped songs still count as processed)
    save_checkpoint(
        {'song_details': song_details_list, 'processed_songs': index + 1},
        'song_details_checkpoint.json',
    )


Skipping song ID 3201397 due to problematic characters in title or artist.
Skipping song ID 10541437 due to problematic characters in title or artist.
Skipping song ID 8444353 due to problematic characters in title or artist.
Skipping song ID 5298630 due to problematic characters in title or artist.
Skipping song ID 6347259 due to problematic characters in title or artist.
Skipping song ID 1104393 due to problematic characters in title or artist.
Skipping song ID 9493004 due to problematic characters in title or artist.
Skipping song ID 3201576 due to problematic characters in title or artist.
Skipping song ID 5298764 due to problematic characters in title or artist.
Skipping song ID 9493110 due to problematic characters in title or artist.
Skipping song ID 8444573 due to problematic characters in title or artist.
Skipping song ID 5298893 due to problematic characters in title or artist.
Skipping song ID 4250322 due to problematic characters in title or artist.
Skipping song ID 1054177

KeyboardInterrupt: 

In [6]:
def load_json(filename):
    """Read `filename` and return its parsed JSON content."""
    with open(filename, 'r') as json_file:
        raw_text = json_file.read()
    return json.loads(raw_text)

def create_database(db_name):
    """Open the SQLite database `db_name` and make sure the `songs` table exists.

    Returns:
        The open `sqlite3` connection; closing it is left to the caller.
    """
    create_songs_table = '''
        CREATE TABLE IF NOT EXISTS songs (
            song_id INTEGER PRIMARY KEY,
            title TEXT,
            artist TEXT,
            release_date TEXT,
            lyrics TEXT,
            album TEXT,
            popularity INTEGER
        )
    '''
    connection = sqlite3.connect(db_name)
    connection.cursor().execute(create_songs_table)
    connection.commit()
    return connection

def insert_song_details(conn, song_details):
    """Insert one song record into the `songs` table and commit.

    `song_details` must provide the keys `song_id`, `title`, `artist`,
    `release_date`, `lyrics`, `album` and `popularity`. Because the statement
    is `INSERT OR IGNORE`, a row whose `song_id` already exists is left
    unchanged.
    """
    column_keys = ('song_id', 'title', 'artist', 'release_date', 'lyrics', 'album', 'popularity')

    insert_cursor = conn.cursor()
    row_values = tuple(song_details[key] for key in column_keys)
    insert_cursor.execute('''
        INSERT OR IGNORE INTO songs (song_id, title, artist, release_date, lyrics, album, popularity)
        VALUES (?, ?, ?, ?, ?, ?, ?)
    ''', row_values)
    conn.commit()


In [7]:
# Load the scraped song details from the JSON checkpoint
data = load_json('song_details_checkpoint.json')
song_details_list = data['song_details']

# Create (or open) the SQLite database
conn = create_database('songs.db')

# Store every record; the connection is closed even if one insert fails, so a
# bad record does not leave an open database handle behind in the kernel.
try:
    for song_details in song_details_list:
        insert_song_details(conn, song_details)
finally:
    conn.close()

In [9]:
def delete_problematic_songs(db_name):
    """Delete every row of `songs` whose lyrics contain a problematic character.

    The characters checked are the zero-width space (U+200B) and the
    lowercase German letters U+00E4, U+00F6, U+00FC and U+00DF. Rows with
    NULL lyrics are not matched by the LIKE pattern and are kept.

    All deletions are committed together. The connection is closed even when
    a statement fails, in which case nothing is committed.

    Args:
        db_name: Path of the SQLite database file.

    Raises:
        sqlite3.Error: If the database cannot be modified (for example when
            the `songs` table does not exist).
    """
    # Characters or patterns treated as problematic
    problematic_chars = ["\u200b", "\u00e4", "\u00f6", "\u00fc", "\u00df"]

    conn = sqlite3.connect(db_name)
    try:
        cursor = conn.cursor()
        for char in problematic_chars:
            # Parameterized substring match: lyrics containing `char` anywhere.
            cursor.execute('''
                DELETE FROM songs WHERE lyrics LIKE ?
            ''', ('%' + char + '%',))
        conn.commit()
    finally:
        # Always release the database handle, including on errors.
        conn.close()

# Clean up the data in the SQLite database: remove every song whose lyrics
# contain one of the problematic characters
delete_problematic_songs('songs.db')
