In [1]:
import os
import sys
from pathlib import Path

GENIUS_API_TOKEN = os.environ.get("GENIUS_API_TOKEN")

In [2]:
# Make HTTP requests
import requests
# Scrape data from an HTML document
from bs4 import BeautifulSoup
# I/O
import os
# Search and manipulate strings
import re

1. Get a list of Genius.com URL’s for a specified number of songs for an artist


In [3]:
# Get artist object from Genius API
def request_artist_info(artist_name, page):
    base_url = 'https://api.genius.com'
    headers = {'Authorization': 'Bearer ' + GENIUS_API_TOKEN}
    search_url = base_url + '/search?per_page=10&page=' + str(page)
    data = {'q': artist_name}
    response = requests.get(search_url, data=data, headers=headers)
    return response
# Get Genius.com song url's from artist object
def request_song_url(artist_name, song_cap):
    page = 1
    songs = []

    while True:
        response = request_artist_info(artist_name, page)
        json = response.json()
        # Collect up to song_cap song objects from artist
        song_info = []
        for hit in json['response']['hits']:
            if artist_name.lower() in hit['result']['primary_artist']['name'].lower():
                song_info.append(hit)

        # Collect song URL's from song objects
        for song in song_info:
            if (len(songs) < song_cap):
                url = song['result']['url']
                songs.append(url)

        if (len(songs) == song_cap):
            break
        else:
            page += 1

    print('Found {} songs by {}'.format(len(songs), artist_name))
    return songs

# DEMO
request_song_url('Lana Del Rey', 2)

Found 2 songs by Lana Del Rey


['https://genius.com/Lana-del-rey-young-and-beautiful-lyrics',
 'https://genius.com/Lana-del-rey-summertime-sadness-lyrics']

2. Fetch lyrics from the URLs
* [Fix for the html parsing](https://stackoverflow.com/questions/68027296/scraping-song-lyrics-with-beautifulsoup)

In [33]:
# Now next task is to fetch lyrics from the url's we got from the above function
# We will use the BeautifulSoup library to parse the HTML data we scrapped from the website
# We will use the requests library to make HTTP requests to the website

# Scrape lyrics from a Genius.com song URL
page = requests.get('https://genius.com/Lana-del-rey-ride-lyrics')
html = BeautifulSoup(page.text, 'html.parser')

# prettify the parsed html
# print(html.prettify())

# Scrape the song lyrics from the HTML
lyrics = html.select("div[class*=Lyrics__Container]") 
lyrics = re.sub('<[^>]*>', '', str(lyrics[0])) # Remove html tags
lyrics = re.sub(' +', ' ', lyrics) # Remove duplicate newlines
lyrics = lyrics.strip() # Remove whitespace at the beginning and end

# Remove identifiers like chorus, verse, etc
lyrics = re.sub(r'[\(\[].*?[\)\]]', '', lyrics)

print(lyrics)


Ooh-ooh-ooh-ooh, ooh-ooh-oohOoh-ooh-ooh-ooh, ooh-ooh-oohI been out on that open roadBut you can be my full-time daddy, white and goldSinging blues has been gettin' oldBut you can be my full-time baby, hot or coldDon't break me down I been traveling too long I been tryin' too hard With one pretty song I hear the birds on the summer breeze, I drive fastI am alone at midnightBeen trying hard not to get into trouble, but II've got a war in my mindSo, I just ride, just rideI just ride, I just ride


In [35]:
def scrape_song_lyrics(url):
    page = requests.get(url)
    html = BeautifulSoup(page.text, 'html.parser')
    lyrics = html.select("div[class*=Lyrics__Container]")
    # Extract text from the Tag objects
    lyrics = [tag.get_text() for tag in lyrics]
    # Convert list of lyrics to a single string
    lyrics = '\n'.join(lyrics)
    # Remove identifiers like chorus, verse, etc
    lyrics = re.sub(r'[\(\[].*?[\)\]]', '', lyrics)
    # Remove empty lines
    lyrics = os.linesep.join([s for s in lyrics.splitlines() if s])         
    return lyrics

# DEMO
scrape_song_lyrics('https://genius.com/Lana-del-rey-ride-lyrics')

"Ooh-ooh-ooh-ooh, ooh-ooh-oohOoh-ooh-ooh-ooh, ooh-ooh-oohI been out on that open roadBut you can be my full-time daddy, white and goldSinging blues has been gettin' oldBut you can be my full-time baby, hot or coldDon't break me down I been traveling too long I been tryin' too hard With one pretty song I hear the birds on the summer breeze, I drive fastI am alone at midnightBeen trying hard not to get into trouble, but II've got a war in my mindSo, I just ride, just rideI just ride, I just ride\r\nDying young and playing hardThat's the way my father made his life and artDrink all day and we talk 'til darkThat's the way the road dogs do it, light 'til darkDon't leave me now Don't say goodbye Don't turn around Leave me high and dry I hear the birds on the summer breeze, I drive fastI am alone at midnightBeen trying hard not to get into trouble, but II've got a war in my mindI just ride, just rideI just ride, I just rideI'm tired of feeling like I'm fucking crazyI'm tired of driving 'til I

3. Loop through all URL’s and write lyrics to one file

In [37]:
def write_lyrics_to_file(artist_name, song_count):
    os.makedirs('lyrics', exist_ok=True)  # Create 'lyrics' directory if it doesn't exist
    f = open('lyrics/' + artist_name.lower() + '.txt', 'wb')
    urls = request_song_url(artist_name, song_count)
    for url in urls:
        lyrics = scrape_song_lyrics(url)
        f.write(lyrics.encode("utf8"))
    f.close()
    num_lines = sum(1 for line in open('lyrics/' + artist_name.lower() + '.txt', 'rb'))
    print('Wrote {} lines to file from {} songs'.format(num_lines, song_count))

# DEMO
write_lyrics_to_file('Lana Del Rey', 2)

Found 2 songs by Lana Del Rey
Wrote 4 lines to file from 2 songs


### Next steps
- [ ] Add a function to get the artist name from the URL
- [ ] Add a function to get the song name from the URL
- [ ] Add a function to get the song lyrics from the URL
- [ ] Add a function to get the song release date from the URL
- [ ] Add a function to get the song album name from the URL
- [ ] Add a function to get the song writer from the URL
- [ ] Add a function to get the song producer from the URL
- [ ] Add a function to get the song featured artists from the URL
- [ ] Add a function to get the song samples from the URL
- [ ] Add a function to get the song interpolates from the URL
- [ ] Add a function to get the song description from the URL
- [ ] Add a function to get the song album art from the URL

# References
- [Genius API](https://docs.genius.com/)
- [Genius API Python Wrapper](www.github.com/johnwmillr/LyricsGenius)
- [How to Scrape Song Lyrics: A Gentle Tutorial](https://medium.com/analytics-vidhyahow-to-scrape-song-lyrics-a-gentle-python-tutorial-5b1d4ab35)

### Future directions
- Rap song writing recurrent neural network trained on Kanye West’s entire discography
    Same thing but Taylor Swift
    Same thing but Young Thug

- Lyric generation trained on data from two or more artists (Kanye + TSwift, Young Thug + Lana Del Rey, Travis Scott + Beach Boys…go wild my children)

- Lyric generation + Audio generation = AI generated song?