In [2]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import datetime

from lyricsgenius import Genius
from collections import Counter

In [3]:
genius = Genius('in424DYZRH1oa--Iyzce_Ue6uLYNqt1Q6-Yq20PUSYl-PjvMNuuaX0YBiwcSFt9_')

In [4]:
song_parts_regex = r"\[Intro.*?\]|\[Verse [0-9].*?\]|\[Refrain.*?\]|\[Pre-Chorus.*?\]|\[Chorus.*?\]|\[Post-Chorus.*?\]|\[Hooks.*?\]|\[Riffs/Basslines.*?\]|\[Scratches.*?\]|\[Bridge.*?\]|\[Interlude.*?\]|\[Break.*?\]|\[Skit.*?\]|\[Collision.*?\]|\[Instrumental or Solo.*?\]|\[Ad lib.*?\]|\[Segue.*?\]|\[Outro.*?\]"

In [5]:
def scrape_lyrics(url):
    """
    input (str): url for a Genius lyrics page
    output (tuple list): returns list of song parts and lyrics tuples
    """

    if type(url) != str or len(url) <= 3:
        return []

    page = requests.get(url)
    html = BeautifulSoup(page.text, 'html.parser')
    lyrics_result_set = html.find_all('div', attrs={'data-lyrics-container': 'true'})
    lyrics = [lyrics_result.get_text(separator=' ') for lyrics_result in lyrics_result_set]
    lyrics = ' '.join(lyrics)
   
    try:
        parts_iter = re.finditer(song_parts_regex, lyrics)

        song_parts = []
        song_part_spans = []
        for part_object in parts_iter:
            song_parts.append(part_object.group())
            song_part_spans.append(part_object.span())

        lyric_spans = []
        # # If there are lyrics before the first song part, add that range of indices
        # try:
        #     print(song_part_spans[0][0])
        # except:
        #     print("An exception occurred", song_part_spans)
        # first_song_part_start_index = song_part_spans[0][0]
        # if first_song_part_start_index != 0:
        #     song_parts.insert(0, '[Start]')
        #     lyric_spans.append((0, first_song_part_start_index))

        for index in range(len(song_part_spans) - 1):
            start_lyric_span = song_part_spans[index][1]
            end_lyric_span = song_part_spans[index + 1][0]
            lyric_spans.append((start_lyric_span, end_lyric_span))

        # Add the lyrics after the final song part
        lyric_spans.append((song_part_spans[-1][1], len(lyrics)))

        # Get lyrics without song part tags
        span_to_lyrics = lambda span: lyrics[span[0]:span[1]].lower()
        lyrics_without_song_parts = [span_to_lyrics(span).strip() for span in lyric_spans]

        # Create lyrics list
        lyrics = list(zip(song_parts, lyrics_without_song_parts))

        return lyrics
        
    except:
        return []


def extract_lyrics_text(lyrics):
        if lyrics:
            return ' '.join(list(zip(*lyrics))[1])

In [6]:
df_charts = pd.read_csv('data/charts.csv')
df_charts.head()

Unnamed: 0,date,rank,song,artist,last-week,peak-rank,weeks-on-board
0,2021-11-06,1,Easy On Me,Adele,1.0,1,3
1,2021-11-06,2,Stay,The Kid LAROI & Justin Bieber,2.0,1,16
2,2021-11-06,3,Industry Baby,Lil Nas X & Jack Harlow,3.0,1,14
3,2021-11-06,4,Fancy Like,Walker Hayes,4.0,3,19
4,2021-11-06,5,Bad Habits,Ed Sheeran,5.0,2,18


In [7]:
def create_df_billboard(limit=5):
    df_charts = pd.read_csv('data/charts.csv')
    song_artist_array = df_charts[['song', 'artist']].to_numpy()
    # print(song_artist_array[:limit])
    # print([song_info for (song_info, _) in Counter(map(tuple, song_artist_array)).most_common()[:limit]])
    songs, artists = list(zip(*[song_info for (song_info, _) in Counter(map(tuple, song_artist_array)).most_common()[:limit]]))

    urls = []
    peak_ranks = []
    weeks_on_boards = []
    last_weeks = []
    ranks = []
    for song, artist in zip(songs, artists):
        start = datetime.datetime.now()
        stop = start + datetime.timedelta(seconds = 10)
        while True:
            if datetime.datetime.now() > stop:
                urls.append(None)
                break
            try:
                urls.append(genius.search_song(song, artist).url)
                break
            except:
                pass
        match_rows = df_charts.loc[(df_charts['song'] == song) & (df_charts['artist'] == artist)]
        peak_rank = {}
        weeks_on_board = {}
        last_week = {}
        rank = {}
        for index, row in match_rows.iterrows():
            rank[row['date']] = row['rank']
            peak_rank[row['date']] = row['peak-rank']
            weeks_on_board[row['date']] = row['weeks-on-board']
            last_week[row['date']] = row['last-week']
        ranks.append(rank)
        peak_ranks.append(peak_rank)
        weeks_on_boards.append(weeks_on_board)
        last_weeks.append(last_week)

    lyrics = list(map(scrape_lyrics, urls))
    lyrics_text = list(map(extract_lyrics_text, lyrics))

    df_billboard = pd.DataFrame({
        'song': songs,
        'artist': artists,
        'url': urls,
        'lyrics': lyrics,
        'lyrics_text': lyrics_text,
        'rank': ranks,
        'peak_rank': peak_ranks,
        'weeks_on_board': weeks_on_boards,
        'last_week': last_weeks
    })

    return df_billboard
    

In [8]:
df_billboard = create_df_billboard(10000)

Searching for "Blinding Lights" by The Weeknd...
Done.
Searching for "Radioactive" by Imagine Dragons...
Done.
Searching for "Sail" by AWOLNATION...
Done.
Searching for "I'm Yours" by Jason Mraz...
Searching for "I'm Yours" by Jason Mraz...
Searching for "How Do I Live" by LeAnn Rimes...
Done.
Searching for "Counting Stars" by OneRepublic...
Done.
Searching for "Party Rock Anthem" by LMFAO Featuring Lauren Bennett & GoonRock...
Done.
Searching for "Rolling In The Deep" by Adele...
Done.
Searching for "Foolish Games/You Were Meant For Me" by Jewel...
Done.
Searching for "Before He Cheats" by Carrie Underwood...
Done.
Searching for "I Hope" by Gabby Barrett Featuring Charlie Puth...
Done.
Searching for "Ho Hey" by The Lumineers...
Done.
Searching for "You And Me" by Lifehouse...
Done.
Searching for "Circles" by Post Malone...
Done.
Searching for "Demons" by Imagine Dragons...
Done.
Searching for "Need You Now" by Lady Antebellum...
Done.
Searching for "Macarena (Bayside Boys Mix)" by Los

ConnectionError: ('Connection aborted.', TimeoutError(60, 'Operation timed out'))

In [9]:
df_billboard

NameError: name 'df_billboard' is not defined