# **Scraping Lyrics**
<img src="https://www.liderlogo.es/wp-content/uploads/2022/12/pasted-image-0-4-1024x576.png" alt="Texto alternativo" width="180" height="100">



In [None]:
import sys
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy.util as util
import requests
from bs4 import BeautifulSoup
import lyricsgenius as lg
from unidecode import unidecode
from joblib import Parallel, delayed
import re
from collections import Counter
import ast
import pandas as pd
import numpy as np
import time
from tabulate import tabulate
from tqdm import tqdm
import random

# **1. Define Functions**

In [56]:
def format_genius_url(artists, song_name):
    '''
    Build the expected Genius lyrics URL for a song.

    The artist and song names are transliterated to ASCII, lower-cased and
    stripped of punctuation, then combined as
    ``https://genius.com/<artist>-<song>-lyrics``.

    Parameters
    ----------
    artists : str
        Artist credit. Only the part before the first '/' is kept, and
        everything from "featuring" / "ft." onwards is dropped.
    song_name : str
        Song title. Only the part before the first " / " is kept, and a
        "(feat. ...)" group is removed.

    Returns
    -------
    str
        The candidate URL. Nothing here checks that the page exists.
    '''
    artist_clean = unidecode(artists).lower().strip()
    artist_clean = artist_clean.replace('&', 'and')
    # If the artist name contains '/', take only the part before it.
    artist_clean = artist_clean.split('/')[0].strip()
    # Drop featured artists BEFORE the dots are removed: once "ft." has
    # become "ft" the pattern below can no longer match it.
    artist_clean = re.sub(r"\s*\b(featuring|ft\.)\s.*", "", artist_clean)
    artist_clean = artist_clean.replace(".", "")
    artist_clean = artist_clean.replace("'", "")
    artist_clean = re.sub(r"[^a-zA-Z0-9\s-]", "", artist_clean)

    # Same normalisation order as the artist (transliterate, then lower-case).
    song_clean = unidecode(song_name).lower().strip()
    song_clean = song_clean.replace("&", "and")
    song_clean = song_clean.replace("'", "")
    song_clean = song_clean.split(" / ")[0].strip()
    song_clean = song_clean.replace("/", " ")
    # Remove "(feat. Artist)" groups.
    # NOTE(review): the capitalised alternatives (Live|Remix|Version) cannot
    # match because the title is lower-cased above, so those groups are kept
    # by the next substitution. Left unchanged on purpose - confirm whether
    # e.g. "(remix)" is meant to stay in the slug.
    song_clean = re.sub(r"\s*\((feat\.[^\)]+|Live|Remix|Version)[^\)]+\)\s*", "", song_clean)
    # Keep the content of any remaining parentheses, dropping the brackets.
    song_clean = re.sub(r"\s*\(\s*([^\)]+)\s*\)", r" \1", song_clean)
    song_clean = re.sub(r"[^a-zA-Z0-9\s-]", "", song_clean)

    # Whitespace runs become single hyphens.
    artist_clean = re.sub(r"\s+", "-", artist_clean)
    song_clean = re.sub(r"\s+", "-", song_clean)

    link = f'https://genius.com/{artist_clean}-{song_clean}-lyrics'

    # Collapse repeated hyphens left over from removed characters.
    link_clean = re.sub(r"-{2,}", "-", link)

    return link_clean

def get_song_lyrics(artist, song_name):
    '''
    Download the Genius page guessed for a song and return its lyrics text.

    The URL is built with ``format_genius_url`` and printed together with
    the song and artist before the request is made.

    Parameters
    ----------
    artist : str
        Artist name as passed to ``format_genius_url``.
    song_name : str
        Song title as passed to ``format_genius_url``.

    Returns
    -------
    str or None
        The lyrics text; the string "No Lyrics" when the page loads but no
        non-empty lyrics container is found; None when the request fails
        or the response status is not 200.
    '''
    song_url = format_genius_url(artist, song_name)

    print('\nSong: {}\nArtist: {}'.format(song_name, artist))
    print('song_url: ', song_url)

    try:
        # A timeout keeps one stalled connection from blocking a worker forever.
        response = requests.get(song_url, timeout=10)
    except requests.RequestException as exc:
        # Treat network failures like a missing page so the batch keeps going.
        print('Request failed for {}: {}'.format(song_url, exc))
        return None

    if response.status_code != 200:  # The webpage does not exist
        return None

    soup = BeautifulSoup(response.content, 'lxml')

    # NOTE(review): only the first non-empty container is returned; if a page
    # splits its lyrics across several containers the rest is ignored -
    # confirm this is intended.
    for tag in soup.select('div[data-lyrics-container="true"], div[class^="Lyrics__Container"], .Lyrics__Root'):
        # Unwrap <i> tags and merge adjacent text nodes so that italic
        # fragments do not end up on separate lines.
        for i in tag.select('i'):
            i.unwrap()
        tag.smooth()
        lyrics = tag.get_text(strip=True, separator='\n')
        if lyrics:
            return lyrics

    return "No Lyrics"

def get_lyrics_parallel(row):
    '''
    Row-wise wrapper: return the lyrics and the guessed Genius URL.

    Reads ``row['artist']`` and ``row['title']`` and returns a two-element
    ``pd.Series`` holding ``[lyrics, song_url]``.
    '''
    artist, title = row['artist'], row['title']
    return pd.Series([
        get_song_lyrics(artist, title),
        format_genius_url(artist, title),
    ])

def get_lyrics_parallel_balanced(row):
    '''
    Sleep for a random 3 to 7 seconds, then delegate to ``get_lyrics_parallel``.

    The pause spaces out the requests issued by the parallel workers.
    '''
    delay_bounds = (3, 7)  # (minimum, maximum) pause in seconds
    time.sleep(random.uniform(*delay_bounds))
    return get_lyrics_parallel(row)

In [63]:
def genius_api_info(row):
    '''
    Look a song up through the Genius API and return its lyrics and URL.

    Used as a fallback for rows whose lyrics are still missing: the API
    search takes the title and the artist instead of a guessed URL.

    Parameters
    ----------
    row : mapping
        Must provide ``'title'``, ``'artist'`` and ``'song_url'``.

    Returns
    -------
    pd.Series
        Keys ``"lyrics"`` and ``"song_url"``. When nothing is found, or the
        search raises, ``lyrics`` is None and the row's existing
        ``song_url`` is kept.
    '''
    import os

    # Prefer a token supplied through the environment.
    # NOTE(review): the literal fallback is kept only so existing runs keep
    # working; a token committed to source should be rotated and removed.
    genius_api_key = os.environ.get(
        "GENIUS_ACCESS_TOKEN",
        "8uWLYoKNMdAgpIarv32OGJJfCU6D2QE1GsADuMWVYljmsLl1lGmzh1IkLC_c0J38",
    )
    genius = lg.Genius(genius_api_key, timeout=10)

    try:
        song = genius.search_song(title=row['title'], artist=row['artist'])
    except Exception as e:
        # Best-effort lookup: report the failure and fall through to the
        # "not found" result instead of aborting the whole apply().
        print(f"Error with {row['title']} by {row['artist']}: {e}")
        song = None

    if song:
        return pd.Series({
            "lyrics": song.lyrics or None,
            "song_url": song.url or None
        })

    return pd.Series({
        "lyrics": None,
        "song_url": row['song_url']
    })

In [147]:
def _song_info_label(credits_tag, label_regex):
    '''Return the first credits label <div> after ``credits_tag`` whose text matches ``label_regex`` (case-insensitive), or None.'''
    return credits_tag.find_next(
        'div',
        {'class': re.compile(r'SongInfo__Label-sc-4162678b-4\s+\w+')},
        string=re.compile(label_regex, re.IGNORECASE),
    )


def _unique_texts(container, tag_name):
    '''Return the de-duplicated, stripped texts of every ``tag_name`` element in ``container``; None when there is no container.'''
    if container is None:
        return None
    return list({node.get_text(strip=True) for node in container.find_all(tag_name)})


def get_song_info(song_url):
    '''
    Scrape extra metadata about a song from its Genius page.

    Uses requests together with BeautifulSoup.

    Parameters
    ----------
    song_url : str
        URL of the Genius song page.

    Returns
    -------
    tuple
        ``(producers, writers, label, released_on, tags)``. ``producers``,
        ``writers``, ``label`` and ``tags`` are lists of unique strings (in
        no particular order), ``released_on`` is a string. Any element is
        None when the matching section is not found, and all five are None
        when the request itself fails.
    '''
    producers = None
    writers = None
    label = None
    released_on = None
    tags = None

    try:
        # A timeout keeps one stalled connection from blocking a worker forever.
        response = requests.get(song_url, timeout=10)
    except requests.RequestException as exc:
        print(f"Request failed for {song_url}: {exc}")
        return producers, writers, label, released_on, tags

    soup = BeautifulSoup(response.content, 'lxml')

    credits_tag = soup.find('div', {'class': re.compile(r'SongInfo__Title-sc-4162678b-1\s+\w')})

    if credits_tag:
        # Each credit is a label <div>; its values sit in the next sibling.
        producers_tag = _song_info_label(credits_tag, r'Producer')
        if producers_tag:
            producers = _unique_texts(producers_tag.find_next_sibling(), 'span')

        writers_tag = _song_info_label(credits_tag, r'Writer')
        if writers_tag:
            writers = _unique_texts(writers_tag.find_next_sibling(), 'span')

        label_tag = _song_info_label(credits_tag, r'Label')
        if label_tag:
            label = _unique_texts(label_tag.find_next_sibling(), 'span')

        released_on_tag = _song_info_label(credits_tag, r'Released\s+(in|on)')
        if released_on_tag:
            value_tag = released_on_tag.find_next_sibling('div')
            if value_tag is not None:
                released_on = value_tag.get_text(strip=True)

    tags_tag = soup.find('div', {'class': re.compile(r'SongTags__Title-sc-b55131f0-0\s+\w')}, string=re.compile(r'Tags', re.IGNORECASE))

    if tags_tag:
        tags = _unique_texts(tags_tag.find_next_sibling('div'), 'a')

    return producers, writers, label, released_on, tags

def get_info_parallel(row):
    '''
    Row-wise wrapper around ``get_song_info``.

    Reads ``row['song_url']`` and returns a five-element ``pd.Series``:
    producers, writers, label, released_on, tags.
    '''
    song_info = get_song_info(row['song_url'])
    return pd.Series(list(song_info))

def get_info_parallel_balanced(row):
    '''
    Sleep for a random 3 to 7 seconds, then delegate to ``get_info_parallel``.

    The pause spaces out the requests issued by the parallel workers.
    '''
    delay_bounds = (3, 7)  # (minimum, maximum) pause in seconds
    time.sleep(random.uniform(*delay_bounds))
    return get_info_parallel(row)

In [None]:
def get_only_lyrics(row):
    '''
    Download the page at ``row['song_url']`` and return its lyrics text.

    Parameters
    ----------
    row : mapping
        Must provide ``'song_url'``.

    Returns
    -------
    str or None
        The lyrics text; the string "No Lyrics" when the page loads but no
        non-empty lyrics container is found; None when the request fails
        or the response status is not 200.
    '''
    song_url = row['song_url']

    try:
        # A timeout keeps one stalled connection from blocking the run forever.
        response = requests.get(song_url, timeout=10)
    except requests.RequestException as exc:
        # Treat network failures like a missing page so the apply() keeps going.
        print('Request failed for {}: {}'.format(song_url, exc))
        return None

    if response.status_code != 200:  # The webpage does not exist
        return None

    soup = BeautifulSoup(response.content, 'lxml')

    # NOTE(review): only the first non-empty container is returned; if a page
    # splits its lyrics across several containers the rest is ignored -
    # confirm this is intended.
    for tag in soup.select('div[data-lyrics-container="true"], div[class^="Lyrics__Container"], .Lyrics__Root'):
        # Unwrap <i> tags and merge adjacent text nodes so that italic
        # fragments do not end up on separate lines.
        for i in tag.select('i'):
            i.unwrap()
        tag.smooth()
        lyrics = tag.get_text(strip=True, separator='\n')
        if lyrics:
            return lyrics

    return "No Lyrics"

In [259]:
def get_song_release_date(title, artist=None):
    '''
    Get the date when the song was published, using the Spotify API.

    Parameters
    ----------
    title : str
        Song title, used as a ``track:`` filter in the search query.
    artist : str, optional
        When given as a non-empty string, an ``artist:`` filter is added
        so that a different song with the same title is less likely to be
        picked. Omitting it keeps the original title-only search.

    Returns
    -------
    str or None
        The ``release_date`` of the album of the first search hit
        (format: YYYY-MM-DD, YYYY-MM or YYYY), or None when the search
        returns no track.
    '''
    import os

    # Prefer credentials supplied through the environment.
    # NOTE(review): the literal fallbacks are kept only so existing runs keep
    # working; credentials committed to source should be rotated and removed.
    spotipy_client_id = os.environ.get("SPOTIPY_CLIENT_ID", "548c02068e724526baf7d90cddc281dc")
    spotipy_client_secret = os.environ.get("SPOTIPY_CLIENT_SECRET", "426aab0a7efe4f0e8787d0ac2f568bc1")

    sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=spotipy_client_id, client_secret=spotipy_client_secret))

    query = f"track:{title}"
    if isinstance(artist, str) and artist:
        query += f" artist:{artist}"
    results = sp.search(q=query, type="track", limit=1)

    items = results["tracks"]["items"]
    if not items:
        return None

    # The release date is read from the album of the best-matching track.
    album = sp.album(items[0]["album"]["id"])
    return album["release_date"]  # Format: YYYY-MM-DD or YYYY-MM or YYYY

# **2. Getting the Data**

In [None]:
# Load the Billboard Hot 100 year-end data.
df = pd.read_csv('billboard_hot_100_2004_2024.csv')

# Split into year ranges that are scraped one after another. Each slice is an
# explicit copy because new columns are assigned to it later; assigning into
# a plain boolean-mask slice triggers pandas' SettingWithCopyWarning.
df_2004_2005 = df[(df['year'] >= 2004) & (df['year'] <= 2005)].copy()
df_2006_2010 = df[(df['year'] >= 2006) & (df['year'] <= 2010)].copy()
df_2011_2015 = df[(df['year'] >= 2011) & (df['year'] <= 2015)].copy()
df_2016_2020 = df[(df['year'] >= 2016) & (df['year'] <= 2020)].copy()
df_2021_2024 = df[(df['year'] >= 2021) & (df['year'] <= 2024)].copy()

In [None]:
# Number of parallel workers used for scraping.
N_JOBS = 6

list_range = [df_2004_2005, df_2006_2010, df_2011_2015, df_2016_2020, df_2021_2024]

# Scrape lyrics and the guessed Genius URL for every row of every year range.
for df_anio in list_range:
    lyric_jobs = (
        delayed(get_lyrics_parallel_balanced)(row)
        for _, row in tqdm(df_anio.iterrows(), total=len(df_anio))
    )
    df_anio[['lyrics', 'song_url']] = Parallel(n_jobs=N_JOBS)(lyric_jobs)

In [None]:
# Retry rows whose lyrics are still missing through the Genius API search.
for df_anio in list_range:
    missing_lyrics = df_anio['lyrics'].isnull()
    df_anio_null = df_anio[missing_lyrics]
    df_anio.loc[missing_lyrics, ["lyrics", "song_url"]] = df_anio_null.apply(genius_api_info, axis=1)

In [None]:
# Scrape the additional song information for every row of every year range.
for df_anio in list_range:
    info_jobs = (
        delayed(get_info_parallel_balanced)(row)
        for _, row in tqdm(df_anio.iterrows(), total=len(df_anio))
    )
    df_anio[['producers', 'writers', 'label', 'released_on', 'tags']] = Parallel(n_jobs=N_JOBS)(info_jobs)

In [167]:
# Stack the year ranges back into a single frame with a fresh 0..n-1 index.
df_combined = pd.concat(objs=list_range, ignore_index=True)

In [None]:
# Share of missing values per column, rounded to 4 decimals, shown as one row.
df_combined.isnull().mean().round(4).to_frame().T

Unnamed: 0,0
year,0.0
ranking,0.0
title,0.0
artist,0.0
lyrics,0.0024
song_url,0.0
producers,0.0124
writers,0.0091
label,0.2433
released_on,0.0033


In [169]:
# Show every column when displaying wide frames.
pd.set_option('display.max_column', None)

# Non-null count of each column per year, with one column per year.
counts_by_year = df_combined.groupby("year").apply(lambda group: group.count())
counts_by_year.T

  df_combined.groupby("year").apply(lambda x: x.count()).transpose()


year,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
year,100,100,100,100,98,100,100,100,99,99,100,99,98,100,100,100,100,100,100,100,99
ranking,100,100,100,100,98,100,100,100,99,99,100,99,98,100,100,100,100,100,100,100,99
title,100,100,100,100,98,100,100,100,99,99,100,99,98,100,100,100,100,100,100,100,99
artist,100,100,100,100,98,100,100,100,99,99,100,99,98,100,100,100,100,100,100,100,99
lyrics,100,100,100,100,98,100,100,100,99,99,99,99,97,100,100,100,99,100,100,99,98
song_url,100,100,100,100,98,100,100,100,99,99,100,99,98,100,100,100,100,100,100,100,99
producers,99,100,100,99,98,100,100,100,98,99,99,99,97,100,99,98,99,97,97,92,96
writers,100,100,99,99,98,100,100,100,98,99,99,99,97,100,100,98,99,97,99,95,97
label,64,74,70,67,67,67,72,71,66,75,77,75,80,91,95,85,89,83,77,69,69
released_on,100,100,100,100,98,100,100,100,99,99,99,98,96,100,100,100,99,100,100,99,98


In [170]:
# Persist the scraped data so the next section can start from the file.
lyrics_csv_out = "billboard_hot_100_2004_2024_with_lyrics.csv"
df_combined.to_csv(lyrics_csv_out, index=False, encoding="utf-8")

# **3. More Details about Songs**

There are five songs for which we could not find the lyrics. To avoid losing this information, we manually correct the title, artist, and Genius URL of those rows and scrape them again.

In [353]:
# Reload the scraped data saved at the end of the previous section.
lyrics_csv_in = 'billboard_hot_100_2004_2024_with_lyrics.csv'
df_combined = pd.read_csv(lyrics_csv_in)

In [354]:
# Manually verified values for the rows whose lyrics could not be scraped.
# Key: title as it appears in the data.
# Value: (corrected title, corrected artist, Genius URL).
corrections = {
    "Hot Boy": (
        "Hot Nigga",
        "Bobby Shmurda",
        "https://genius.com/Bobby-shmurda-hot-nigga-lyrics",
    ),
    "Sucker for Pain": (
        "Sucker for Pain",
        "Lil Wayne, Wiz Khalifa, Imagine Dragons, Logic, Ty Dolla Sign",
        "https://genius.com/Lil-wayne-wiz-khalifa-imagine-dragons-logic-and-ty-dolla-sign-sucker-for-pain-lyrics",
    ),
    "Pussy Fairy (OTW)": (
        "P*Y Fairy (OTW)",
        "Jhené Aiko",
        "https://genius.com/Jhene-aiko-p-y-fairy-otw-lyrics",
    ),
    "Barbie World": (
        "Barbie World",
        "Ice Spice & Nicki Minaj",
        "https://genius.com/Ice-spice-and-nicki-minaj-barbie-world-lyrics",
    ),
    "Carnival": (
        "Carnival",
        "Kanye West, Ty Dolla Sign, Rich the Kid",
        "https://genius.com/Kanye-west-and-ty-dolla-sign-and-rich-the-kid-carnival-lyrics",
    ),
    "Lean On": (
        "Lean On",
        "Major Lazer",
        "https://genius.com/Major-lazer-lean-on-lyrics",
    ),
}


def correct_row(row):
    '''
    Overwrite title, artist and song_url of ``row`` when its title has an
    entry in ``corrections``; return the (possibly modified) row.
    '''
    replacement = corrections.get(row["title"])
    if replacement is not None:
        row["title"], row["artist"], row["song_url"] = replacement
    return row

# Apply the manual corrections row by row.
df_combined = df_combined.apply(correct_row, axis=1)

# Rows to scrape again: those still without lyrics, plus 'Lean On'.
needs_refetch = (df_combined['lyrics'].isnull()) | (df_combined['title']=='Lean On')
df_combined_null = df_combined[needs_refetch]

# Obtain the additional song information for those rows ...
refetch_jobs = (
    delayed(get_info_parallel_balanced)(row)
    for _, row in tqdm(df_combined_null.iterrows(), total=len(df_combined_null))
)
df_combined.loc[needs_refetch, ['producers', 'writers', 'label', 'released_on', 'tags']] = Parallel(n_jobs=N_JOBS)(refetch_jobs)

# ... and their lyrics, using the corrected URLs.
df_combined.loc[needs_refetch, ["lyrics"]] = df_combined_null.apply(get_only_lyrics, axis=1)


[A
100%|██████████| 7/7 [00:00<00:00, 54.11it/s]


In this section, we identify the month in which each song was released. When the release date is missing from the Genius webpage, we retrieve it from the Spotify API instead.

In [355]:
months = r"\b(January|February|March|April|May|June|July|August|September|October|November|December)\b"


def _first_month_name(value):
    '''Return the first English month name found in ``str(value)``, or None.'''
    found = re.search(months, str(value))
    return found.group(1) if found else None


# Month name taken from the Genius release text (None when there is none).
df_combined['month'] = df_combined['released_on'].apply(_first_month_name)

In [357]:
# Month name -> month number (1-12).
month_names = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]
month_map = {name: number for number, name in enumerate(month_names, start=1)}

# Nullable integer dtype so rows without a month stay missing.
df_combined["month"] = df_combined["month"].map(month_map).astype("Int64")

In [358]:
# Rows whose month could not be parsed from the Genius release text.
mask = df_combined["month"].isna()

# Fall back to the Spotify release date for those rows.
df_combined.loc[mask, "released_on"] = df_combined.loc[mask, "title"].map(get_song_release_date)

# Take the second "-"-separated part of the date as the month; it is missing
# when the value has no second part (e.g. a year-only date or no result).
month_text = df_combined.loc[mask, "released_on"].astype(str).str.split("-").str[1]

# Convert explicitly: the "month" column is nullable Int64, while the split
# yields strings such as "03" (or NaN), so do not rely on implicit coercion.
df_combined.loc[mask, "month"] = pd.to_numeric(month_text, errors="coerce").astype("Int64")

Finally, to extract more useful information about each song, we create one new column per genre, focusing on the most popular genres. We also add the column `usa`, a dummy variable that indicates whether the song's tags include **USA**.

In [None]:
def _parse_tags(value):
    '''Return the tag list for one cell: parse a string, keep a list as is, otherwise return [].'''
    if isinstance(value, str):
        # Tags read back from the CSV are the text form of a Python list.
        return ast.literal_eval(value)
    if isinstance(value, list):
        # Rows scraped again in this session already hold real lists;
        # mapping them to [] would silently drop their tags.
        return value
    return []


df_combined["tags"] = df_combined["tags"].apply(_parse_tags)

# Show the ten most frequent tags to guide the manual selection below.
all_tags = [tag for sublist in df_combined["tags"] for tag in sublist]
top_genres = [genre for genre, _ in Counter(all_tags).most_common(10)]
print(top_genres)

# Hand-picked subset that becomes one dummy column per genre.
top_genres = ['In English', 'Pop', 'Rap', 'R&B', 'Electro-Pop','Rock']

for genre in top_genres:
    df_combined[genre] = df_combined["tags"].apply(lambda x: 1 if genre in x else 0)

# Dummy variable: 1 when any tag equals "usa", ignoring case.
df_combined["usa"] = df_combined["tags"].apply(lambda x: 1 if any(tag.lower() == "usa" for tag in x) else 0)

['In English', 'Pop', 'USA', 'Rap', 'R&B', 'Adult Contemporary', 'Dance-Pop', 'Pop Rap', 'Electro-Pop', 'Rock']


In [363]:
# Order the songs by year, then chart position, then release month.
df_combined = df_combined.sort_values(by=["year", "ranking", "month"])

# "<year>-<month>" key, with the month left-padded to two characters.
year_text = df_combined["year"].astype(str)
month_padded = df_combined["month"].astype(str).str.zfill(2)
df_combined["year_month"] = year_text + "-" + month_padded

df_combined = df_combined.reset_index(drop=True)

In [364]:
# Preview the first five rows as a psql-style text table.
preview_table = tabulate(df_combined.head(5), headers='keys', tablefmt='psql')
print(preview_table)

+----+--------+-----------+--------------------+--------------------------------------+--------------------------------------------------------------------------------------+---------------------------------------------------------+-----------------------------------------+-------------------------------------------------------------------------------------------------------------------+--------------------------------------+--------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+--------------+-------+-------+-------+---------------+--------+-------+--------------+
|    |   year |   ranking

In [366]:
# Share of missing values per column, shown as a single row.
df_combined.isna().mean().to_frame().T

Unnamed: 0,year,ranking,title,artist,lyrics,song_url,producers,writers,label,released_on,tags,month,In English,Pop,Rap,R&B,Electro-Pop,Rock,usa,year_month
0,0.0,0.0,0.0,0.0,0.0,0.0,0.010038,0.006692,0.239962,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [367]:
# Save the final, enriched dataset.
total_csv_out = "billboard_hot_100_2004_2024_total.csv"
df_combined.to_csv(total_csv_out, index=False, encoding="utf-8")