In [192]:
import os
import re
from collections import Counter

import pandas as pd
import requests
from bs4 import BeautifulSoup
from lyricsgenius import Genius

In [193]:
# Read the Genius API token from the environment instead of committing it to the notebook.
# Export GENIUS_ACCESS_TOKEN before starting the kernel; a missing variable raises KeyError here.
# NOTE(review): a token that was previously stored in this cell should be treated as leaked and rotated.
genius = Genius(os.environ['GENIUS_ACCESS_TOKEN'])

In [194]:
# Labels of the bracketed section headers recognised in lyrics text, e.g. "[Chorus]"
# or "[Intro: Redfoo]". "Verse" must be followed by a space and a digit.
_SONG_PART_LABELS = (
    "Intro",
    "Verse [0-9]",
    "Refrain",
    "Pre-Chorus",
    "Chorus",
    "Post-Chorus",
    "Hooks",
    "Riffs/Basslines",
    "Scratches",
    "Bridge",
    "Interlude",
    "Break",
    "Skit",
    "Collision",
    "Instrumental or Solo",
    "Ad lib",
    "Segue",
    "Outro",
)

# One alternative per label: a literal "[", the label, then a lazy match up to the next "]".
song_parts_regex = "|".join(r"\[" + label + r".*?\]" for label in _SONG_PART_LABELS)

In [195]:
def _split_lyrics_by_part(lyrics_text, pattern):
    """Split raw lyrics text into (song part tag, lowercased lyrics) tuples."""
    tag_matches = list(re.finditer(pattern, lyrics_text))
    if not tag_matches:
        return []

    # A section's lyrics run from the end of its own tag to the start of the
    # next tag; the last section runs to the end of the text.
    section_ends = [match.start() for match in tag_matches[1:]] + [len(lyrics_text)]

    return [
        (match.group(), lyrics_text[match.end():section_end].lower().strip())
        for match, section_end in zip(tag_matches, section_ends)
    ]


def scrape_lyrics(url, timeout=10):
    """
    Download a Genius lyrics page and split its lyrics by song part.

    input:
        url (str): url for a Genius lyrics page
        timeout (float): seconds to wait for the HTTP response before
            requests raises a timeout error
    output (tuple list): list of (song part tag, lyrics) tuples, in page
        order, with the lyrics lowercased and stripped. Returns [] when url
        is not a string longer than 3 characters or when no song part tag is
        found in the page. Any text that appears before the first song part
        tag is not included.
    """
    if not isinstance(url, str) or len(url) <= 3:
        return []

    page = requests.get(url, timeout=timeout)
    html = BeautifulSoup(page.text, 'html.parser')
    lyrics_containers = html.find_all('div', attrs={'data-lyrics-container': 'true'})
    lyrics_text = ' '.join(
        container.get_text(separator=' ') for container in lyrics_containers
    )

    return _split_lyrics_by_part(lyrics_text, song_parts_regex)


def extract_lyrics_text(lyrics):
    """
    input (tuple list): (song part, lyrics) tuples, e.g. from scrape_lyrics
    output (str or None): the lyrics of every tuple joined with single
        spaces, or None when the input is empty or otherwise falsy
    """
    if not lyrics:
        return None

    # Keep only the lyrics (second element) of each tuple and drop the tags.
    return ' '.join(song_part[1] for song_part in lyrics)

In [196]:
def _search_song_url(song, artist, max_retries):
    """Return the Genius url for a song, or None if the search finds nothing or keeps failing."""
    for _ in range(max_retries):
        try:
            result = genius.search_song(song, artist)
        except requests.exceptions.RequestException:
            # Transient network/API failure (e.g. a timeout): try again.
            continue
        # search_song returns None when there is no match; retrying would not help.
        return result.url if result is not None else None
    return None


def create_df_billboard(limit=5, charts_path='data/charts.csv', max_retries=5):
    """
    Build a DataFrame of lyrics for the songs that occur most often in the charts file.

    input:
        limit (int): number of most frequent (song, artist) pairs to keep
        charts_path (str): CSV file with at least 'song' and 'artist' columns
        max_retries (int): maximum search attempts per song when the request
            to Genius fails
    output (DataFrame): one row per selected song with the columns
        'song', 'artist', 'url', 'lyrics' (list of (song part, lyrics) tuples)
        and 'lyrics_text' (the lyrics joined into one string). A song whose
        search finds nothing, or fails max_retries times, gets url None and
        therefore empty lyrics.
    """
    df_charts = pd.read_csv(charts_path)

    # Count how many chart rows each (song, artist) pair has and keep the most
    # frequent pairs; ties keep the order in which pairs first appear in the file.
    pair_counts = Counter(map(tuple, df_charts[['song', 'artist']].to_numpy()))
    top_pairs = [pair for pair, _ in pair_counts.most_common()[:limit]]

    # Built with comprehensions so an empty selection yields empty columns
    # instead of failing to unpack.
    songs = [song for song, _ in top_pairs]
    artists = [artist for _, artist in top_pairs]

    urls = [_search_song_url(song, artist, max_retries) for song, artist in top_pairs]

    lyrics = [scrape_lyrics(url) for url in urls]
    lyrics_text = [extract_lyrics_text(song_lyrics) for song_lyrics in lyrics]

    df_billboard = pd.DataFrame({
        'song': songs,
        'artist': artists,
        'url': urls,
        'lyrics': lyrics,
        'lyrics_text': lyrics_text
    })

    return df_billboard
    

In [197]:
# Build the lyrics table for the 10 most frequent (song, artist) pairs in the charts data.
df_billboard = create_df_billboard(limit=10)

Searching for "Blinding Lights" by The Weeknd...
Searching for "Blinding Lights" by The Weeknd...
Searching for "Blinding Lights" by The Weeknd...
Done.
Searching for "Radioactive" by Imagine Dragons...
Done.
Searching for "Sail" by AWOLNATION...
Done.
Searching for "I'm Yours" by Jason Mraz...
Done.
Searching for "How Do I Live" by LeAnn Rimes...
Done.
Searching for "Counting Stars" by OneRepublic...
Done.
Searching for "Party Rock Anthem" by LMFAO Featuring Lauren Bennett & GoonRock...
Done.
Searching for "Rolling In The Deep" by Adele...
Done.
Searching for "Foolish Games/You Were Meant For Me" by Jewel...
Done.
Searching for "Before He Cheats" by Carrie Underwood...
Done.


In [198]:
# Show the assembled frame as the cell's rich output.
# NOTE(review): in the recorded output, row 8 points at an unrelated Genius page and has
# empty lyrics; it looks like search results are not checked against the requested title.
df_billboard

Unnamed: 0,song,artist,url,lyrics,lyrics_text
0,Blinding Lights,The Weeknd,https://genius.com/The-weeknd-blinding-lights-...,"[([Intro], yeah), ([Verse 1], i've been tryna ...",yeah i've been tryna call i've been on my own ...
1,Radioactive,Imagine Dragons,https://genius.com/Imagine-dragons-radioactive...,"[([Intro], whoa, oh-oh whoa, oh-oh whoa, oh-oh...","whoa, oh-oh whoa, oh-oh whoa, oh-oh whoa i'm w..."
2,Sail,AWOLNATION,https://genius.com/Awolnation-sail-lyrics,"[([Intro], sail), ([Verse 1], this is how i sh...",sail this is how i show my love i made it in m...
3,I'm Yours,Jason Mraz,https://genius.com/Jason-mraz-im-yours-lyrics,"[([Verse 1], well you done done me in; you bet...",well you done done me in; you bet i felt it i ...
4,How Do I Live,LeAnn Rimes,https://genius.com/Leann-rimes-how-do-i-live-l...,"[([Verse 1], how do i get through one night wi...",how do i get through one night without you? if...
5,Counting Stars,OneRepublic,https://genius.com/Onerepublic-counting-stars-...,"[([Intro], lately, i've been, i've been losin'...","lately, i've been, i've been losin' sleep drea..."
6,Party Rock Anthem,LMFAO Featuring Lauren Bennett & GoonRock,https://genius.com/Lmfao-party-rock-anthem-lyrics,"[([Intro: Redfoo], party rock yeah woo! let's ...",party rock yeah woo! let's go! party rock is i...
7,Rolling In The Deep,Adele,https://genius.com/Adele-rolling-in-the-deep-l...,"[([Verse 1], there's a fire starting in my hea...",there's a fire starting in my heart reaching a...
8,Foolish Games/You Were Meant For Me,Jewel,https://genius.com/Ray-bradbury-burning-bright...,[],
9,Before He Cheats,Carrie Underwood,https://genius.com/Carrie-underwood-before-he-...,"[([Verse 1], right now, he's probably slow dan...","right now, he's probably slow dancing with a b..."
