In [None]:
import requests
from bs4 import BeautifulSoup, UnicodeDammit
import time
import re
import os
import json
from unidecode import unidecode
from html import unescape
from typing import Optional
from swaglyrics import __version__, unsupported_txt, backend_url, api_timeout, genius_timeout

The code of this notebook is borrowed from the **SwagLyrics-For-Spotify** project.
https://github.com/SwagLyrics/SwagLyrics-For-Spotify

Specifically it is taken from here:
https://github.com/SwagLyrics/SwagLyrics-For-Spotify/blob/master/swaglyrics/cli.py

In [None]:
# Load the artist records from network.json. JSON text is UTF-8, so decode it explicitly
# instead of relying on the platform's default encoding (artist names may be non-ASCII).
with open("../../data/network.json", encoding="utf-8") as f:
    artists = json.load(f)

In [None]:
# Title noise to strip: a (...) or [...] group that begins with feat / ft / From "<words>"
# (the latter adds support for Bollywood songs), or everything from '- ' onwards. Case-insensitive.
brc = re.compile(r'([(\[](feat|ft|From "[^"]*")[^)\]]*[)\]]|- .*)', re.IGNORECASE)

# Runs of characters that are NOT a space, a hyphen or an ASCII letter/digit.
aln = re.compile(r'[^ \-a-zA-Z0-9]+')

# Separators: a hyphen with optional spaces around it, or a run of one or more spaces.
spc = re.compile(' *- *| +')

# A '(with <artists>)' group, including any spaces right before it; group 1 captures the artists.
wth = re.compile(r'(?: *\(with )([^)]+)\)')

# Any single character OUTSIDE the Latin tables (basic, supplement, extended A, extended B and
# extended additional), i.e. this pattern matches the non-Latin characters.
nlt = re.compile(r'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF]')

In [None]:
def stripper(song: str, artist: str) -> str:
    """
    Build the URL path used to format a Genius lyrics URL for the given song and artist.

    The song title loses bracketed feat/ft/From "..." groups and anything after '- '. A
    '(with ...)' group is moved from the title onto the artist. The artist and title are then
    joined, special characters are dropped or transliterated, and spaces become hyphens.

    Eg.
    >>> stripper('Paradise City', 'Guns n’ Roses')
    'Guns-n-Roses-Paradise-City'

    which formats the url to https://genius.com/Guns-n-Roses-Paradise-City-lyrics

    :param song: currently playing song
    :param artist: song artist
    :return: formatted url path
    """
    title = brc.sub('', song).strip()  # drop feat/ft/From groups and text after '- '
    performer = artist

    with_match = wth.search(title)  # supporting artists given as "(with ...)", if any
    if with_match is not None:
        title = title.replace(with_match.group(), '')  # take the "(with ...)" group out of the title
        guests = with_match.group(1)
        # several guests already contain '&' (later turned into 'and'); a single guest needs an explicit 'and'
        joiner = '-' if '&' in guests else '-and-'
        performer = f'{performer}{joiner}{guests}'

    path = f'{performer}-{title}'.replace('&', 'and')
    for separator in '/!_':  # treated as word separators to support more songs
        path = path.replace(separator, ' ')
    for slashed_o in 'Øø':  # dropped outright, before unidecode gets to transliterate them
        path = path.replace(slashed_o, '')

    path = nlt.sub('', path)  # remove non-latin characters before unidecode
    path = unidecode(path)  # convert accents and other diacritics
    path = aln.sub('', path)  # remove punctuation and other leftover characters
    return spc.sub('-', path.strip())  # spaces and spaced hyphens collapse to a single '-'

In [None]:
def get_lyrics(song: str, artist: str) -> Optional[str]:
    """
    Get lyrics from Genius given the song and artist.

    The URL path is generated locally with stripper(). If Genius answers that URL with an HTTP
    error status, the backend's /stripper endpoint is asked for a path and the page is fetched
    once more with it.

    :param song: currently playing song
    :param artist: song artist
    :return: song lyrics, or None if no usable url path could be determined
    :raises requests.exceptions.RequestException: request failures other than the HTTP error status
        of the first Genius request (e.g. timeouts, connection errors) are not handled here
    """
    url_data = stripper(song, artist)  # generate url path using stripper()
    if url_data.startswith('-') or url_data.endswith('-'):
        return None  # url path had either song in non-latin, artist in non-latin, or both
    url = f'https://genius.com/{url_data}-lyrics'  # format the url with the url path
    try:
        page = requests.get(url, timeout=genius_timeout)
        page.raise_for_status()
    except requests.exceptions.HTTPError:
        # the locally generated path did not resolve; ask the backend for one
        url_data = requests.get(f'{backend_url}/stripper', data={
            'song': song,
            'artist': artist}, timeout=api_timeout).text
        if not url_data:
            return None
        url = f'https://genius.com/{url_data}-lyrics'
        page = requests.get(url, timeout=genius_timeout)

    html = BeautifulSoup(page.text, "html.parser")
    lyrics_path = html.find("div", class_="lyrics")  # finding div on Genius containing the lyrics
    if lyrics_path:
        return UnicodeDammit(lyrics_path.get_text().strip()).unicode_markup

    # hotfix: fall back to the divs whose class starts with "Lyrics__Container"
    containers = html.find_all("div", class_=re.compile("^Lyrics__Container"))
    lyrics_data = [
        # turn <br/> into newlines, then strip every remaining tag from the container markup
        UnicodeDammit(re.sub("<.*?>", "", str(container).replace("<br/>", "\n"))).unicode_markup
        for container in containers
    ]
    # html.unescape() works on strings, so join first and unescape the text; calling it on the
    # list (as before) left entities such as &amp; untouched.
    return unescape("\n".join(lyrics_data))

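The cell below loops over the `artists` list loaded above. Each artist is a dictionary, for example ```{'name': 'Justin Bieber', 'id': '1uNFoZAHBGtllmzznpCI3s'}``` (the loop also reads its `isNode` and `top5` fields). For every node artist whose output file does not exist yet, it fetches the uncleaned lyrics of the artist's top 5 tracks and saves them to `../../data/genius/<id>.json`.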

In [None]:
# save the lyrics: one JSON file per node artist, holding the lyrics of its top5 tracks
output_dir = "../../data/genius/"
os.makedirs(output_dir, exist_ok=True)  # a fresh run may not have the output folder yet

for artist in artists:
    if not artist["isNode"]:
        continue
    out_path = output_dir + artist["id"] + ".json"
    if os.path.exists(out_path):
        continue  # already saved by an earlier run, so the cell can be re-run to resume

    print(artist["name"])
    lyrics = []
    for track in artist["top5"]:
        result = get_lyrics(track["track"], track["artist"])  # may be None when no url path could be built
        item = {
            "id": track["id"],
            "lyrics": result
        }
        lyrics.append(item)

    # the file is written only after all tracks were fetched, so an interrupted artist is retried next run
    with open(out_path, 'w') as outfile:
        json.dump(lyrics, outfile)
    time.sleep(0.4)  # pause after each artist before moving on to the next one

onnie Earl
Ronnie James Dio
Ronnie Milsap
Roo Panes
Rooney
Roots Manuva
Rory Gallagher
Rosanne Cash
Roscoe Dash
Rose Royce
Rose Tattoo
Rosie Gaines
Rossa
Rouge
Rowdy Rebel
Roxette
Roxy Music
Roy Brown
Roy Davis Jr.
Roy Orbison
Roy Woods
Royal Bliss
Royal Blood
Royal Republic
RuPaul
Ruben Studdard
Rucka Rucka Ali
Rudy La Scala
Ruff Ryders
Rufus
Rufus Wainwright
Run River North
Run The Jewels
Runaway June
Rune RK
Rune Rudberg
Running Wild
Runtown
Rush
Rusko
Russ
Russell Dickerson
Russian Circles
Russian Red
Russkaja
Rusted Root
Rvssian
Rx Bandits
Ry Cooder
Ryan Adams
Ryan Bingham
Ryan Hurd
Ryn Weaver
Rytmus
Ryuichi Sakamoto
Róisín Murphy
Röyksopp
S Club 7
SALES
SBTRKT
SCH
SHINee
SLANDER
SNAILS
SOJA
SR-71
STRFKR
SVDDEN DEATH
SWMRS
SWV
SZA
Saba
Sacha Distel
Sadat X
Sade
Sadistik
SafetySuit
Saga
Sage Francis
SahBabii
Saigon
Saiko
Saint Asonia
Saint Etienne
Saint Motel
Saint Raymond
Sakanaction
Saliva
Salt-N-Pepa
Sam & Dave
Sam Bush
Sam Cooke
Sam Fender
Sam Hunt
Sam Palladio
Sam Phillips
Sam