# **Spotify Scraping**
<img src="https://www.liderlogo.es/wp-content/uploads/2022/12/pasted-image-0-4-1024x576.png" alt="Texto alternativo" width="180" height="100">



In [None]:
import sys
import spotipy
import spotipy.util as util
import requests
from bs4 import BeautifulSoup
import lyricsgenius as lg
from unidecode import unidecode
from joblib import Parallel, delayed
import re
import pandas as pd
import time
from tabulate import tabulate
from tqdm import tqdm
import random

In [None]:
def format_genius_url(artists, song_name):
    """Build the Genius lyrics URL guessed from an artist credit and a title.

    Both values are transliterated to ASCII with ``unidecode``, lower-cased,
    stripped of punctuation and turned into dash-separated slugs, giving
    ``https://genius.com/<artist>-<song>-lyrics``. The page is not requested
    here, so the returned URL is only a candidate.

    Args:
        artists: Artist credit as one string. Only the part before the first
            ``/`` is kept, and a trailing ``featuring ...`` / ``feat. ...`` /
            ``ft. ...`` guest credit is dropped.
        song_name: Song title. Only the part before the first `` / `` is
            kept. A parenthetical starting with ``feat.``, ``live``,
            ``remix`` or ``version`` is removed; any other parenthetical
            keeps its text without the brackets.

    Returns:
        The candidate URL as a string.
    """
    artist_clean = unidecode(artists).lower().strip()
    artist_clean = artist_clean.replace('&', 'and')
    # If the artist name contains '/', take only the part before it.
    artist_clean = artist_clean.split('/')[0].strip()
    # Guest credits must be removed while the dot of "ft." / "feat." is still
    # present; stripping dots first made the abbreviated forms unmatchable.
    artist_clean = re.sub(r"\s*\b(?:featuring|feat\.|ft\.)\s.*", "", artist_clean)
    artist_clean = artist_clean.replace(".", "")
    artist_clean = artist_clean.replace("'", "")
    # strip() avoids a slug that starts with a dash when the name began with
    # a character that was just removed.
    artist_clean = re.sub(r"[^a-zA-Z0-9\s-]", "", artist_clean).strip()

    # Transliterate before lower-casing, as for the artist, so capitals
    # introduced by the transliteration are lowered too.
    song_clean = unidecode(song_name).lower().strip()
    song_clean = song_clean.replace("&", "and")
    song_clean = song_clean.replace("'", "")
    song_clean = song_clean.split(" / ")[0].strip()
    song_clean = song_clean.replace("/", " ")
    # Remove (feat. Artist), (Live), (Remix ...), (Version ...). The title is
    # already lower case, so the keywords are lower case as well; the word
    # boundary keeps titles such as "(lively ...)" intact.
    song_clean = re.sub(
        r"\s*\((?:feat\.[^\)]+|(?:live|remix|version)\b[^\)]*)\)\s*",
        " ",
        song_clean,
    )
    # Any other parenthetical is kept, minus the brackets.
    song_clean = re.sub(r"\s*\(\s*([^\)]+)\s*\)", r" \1", song_clean)
    song_clean = re.sub(r"[^a-zA-Z0-9\s-]", "", song_clean).strip()

    artist_clean = re.sub(r"\s+", "-", artist_clean)
    song_clean = re.sub(r"\s+", "-", song_clean)

    link = f'https://genius.com/{artist_clean}-{song_clean}-lyrics'

    # Collapse runs of dashes left behind by removed characters.
    return re.sub(r"-{2,}", "-", link)

def get_song_lyrics(artist, song_name):
    """Download the lyrics of a song from its guessed Genius page.

    The URL comes from ``format_genius_url``; the song and the URL are
    printed before the request is made.

    Args:
        artist: Artist credit, forwarded to ``format_genius_url``.
        song_name: Song title, forwarded to ``format_genius_url``.

    Returns:
        ``None`` when the request fails or the response status is not 200,
        the literal string ``"No Lyrics"`` when the page loads but none of
        the selected containers holds text, otherwise the text of the first
        non-empty container with text nodes separated by newlines.
    """
    song_url = format_genius_url(artist, song_name)

    print('\nSong: {}\nArtist: {}'.format(song_name, artist))
    print('song_url: ', song_url)

    try:
        # Without a timeout a stalled connection would block the worker
        # indefinitely.
        response = requests.get(song_url, timeout=10)
    except requests.RequestException as exc:
        # Treat a network failure like a missing page so that one bad
        # request does not abort a whole batch.
        print('request failed: ', exc)
        return None

    if response.status_code != 200:  # If the webpage doesn't exist
        return None

    soup = BeautifulSoup(response.content, 'lxml')

    # NOTE(review): only the first non-empty container is returned; if a page
    # spreads its lyrics over several containers the rest is dropped - confirm
    # against a live page.
    for tag in soup.select('div[data-lyrics-container="true"], div[class^="Lyrics__Container"], .Lyrics__Root'):
        # Unwrap <i> tags and merge adjacent strings so italic fragments do
        # not become separate lines in the extracted text.
        for italic in tag.select('i'):
            italic.unwrap()
        tag.smooth()
        lyrics = tag.get_text(strip=True, separator='\n')
        if lyrics:
            return lyrics

    return "No Lyrics"

def get_lyrics_parallel(row):
    """Return ``[lyrics, song_url]`` as a Series for one chart row.

    ``row`` must provide the ``'artist'`` and ``'title'`` keys. The lyrics
    come from ``get_song_lyrics`` and the URL from ``format_genius_url``.
    """
    scraped_lyrics = get_song_lyrics(row['artist'], row['title'])
    genius_url = format_genius_url(row['artist'], row['title'])
    result = [scraped_lyrics, genius_url]
    return pd.Series(result)

def get_lyrics_parallel_balanced(row):
    """Wait a random 3 to 7 seconds, then delegate to ``get_lyrics_parallel``."""
    MIN_DELAY, MAX_DELAY = 3, 7
    pause_seconds = random.uniform(MIN_DELAY, MAX_DELAY)
    time.sleep(pause_seconds)
    return get_lyrics_parallel(row)

In [133]:
def _unique_texts(container, child_name):
    """Return the distinct stripped texts of ``child_name`` tags in ``container``.

    Duplicates are dropped while first-seen order is kept. ``None`` is
    returned when ``container`` is ``None``.
    """
    if container is None:
        return None
    texts = (child.get_text(strip=True) for child in container.find_all(child_name))
    return list(dict.fromkeys(texts))


def get_song_info(song_url):
    """Scrape the credits and tags shown on a Genius song page.

    The lookups rely on hard-coded, generated-looking CSS class names
    (``SongInfo-sc-4162678b-*``, ``SongTags-sc-b55131f0-0``), so every field
    is simply ``None`` when the page markup does not contain them.

    Args:
        song_url: URL of the Genius song page.

    Returns:
        A 5-tuple ``(producers, writers, label, released_on, tags)``.
        ``producers``, ``writers``, ``label`` and ``tags`` are lists of
        distinct strings in page order, ``released_on`` is a string. A field
        is ``None`` when it is not found, and all five are ``None`` when the
        request itself fails.
    """
    try:
        # Without a timeout a stalled connection would block the worker
        # indefinitely.
        response = requests.get(song_url, timeout=10)
    except requests.RequestException as exc:
        # One unreachable page should not abort the whole batch.
        print(f"Error fetching {song_url}: {exc}")
        return None, None, None, None, None

    soup = BeautifulSoup(response.content, 'lxml')

    producers = writers = label = released_on = tags = None

    credits_tag = soup.find('div', {'class': 'SongInfo-sc-4162678b-1 gEkcUA'})

    if credits_tag is not None:
        heading_class = re.compile(r'SongInfo-sc-4162678b-4\s+\w+')

        def find_heading(pattern):
            # First heading <div> after the credits block whose text matches.
            return credits_tag.find_next(
                'div',
                {'class': heading_class},
                string=re.compile(pattern, re.IGNORECASE),
            )

        def credit_names(pattern):
            # Names are the <span> texts in the element right after the heading.
            heading = find_heading(pattern)
            if heading is None:
                return None
            return _unique_texts(heading.find_next_sibling(), 'span')

        producers = credit_names(r'Producer')
        writers = credit_names(r'Writer')
        label = credit_names(r'Label')

        released_heading = find_heading(r'Released\s+(in|on)')
        if released_heading is not None:
            released_value = released_heading.find_next_sibling('div')
            if released_value is not None:
                released_on = released_value.get_text(strip=True)

    tags_tag = soup.find('div', {'class': 'SongTags-sc-b55131f0-0 eUsmSA'}, string=re.compile(r'Tags', re.IGNORECASE))
    if tags_tag is not None:
        tags = _unique_texts(tags_tag.find_next_sibling('div'), 'a')

    return producers, writers, label, released_on, tags

def get_info_parallel(row):
    """Return the five ``get_song_info`` fields for ``row['song_url']`` as a Series.

    The Series holds, in order: producers, writers, label, released_on, tags.
    """
    song_details = get_song_info(row['song_url'])
    producers, writers, label, released_on, tags = song_details
    fields = [producers, writers, label, released_on, tags]
    return pd.Series(fields)

def get_info_parallel_balanced(row):
    """Wait a random 3 to 7 seconds, then delegate to ``get_info_parallel``."""
    MIN_DELAY, MAX_DELAY = 3, 7
    pause_seconds = random.uniform(MIN_DELAY, MAX_DELAY)
    time.sleep(pause_seconds)
    return get_info_parallel(row)

In [136]:
def get_song_info(row):
    """Look a song up through the Genius API and return its lyrics and URL.

    NOTE(review): this definition reuses the name of the scraping function
    ``get_song_info(song_url)`` defined earlier in the file. Whichever of the
    two was executed last is the one ``get_info_parallel`` ends up calling.

    Args:
        row: Mapping-like row with ``'title'``, ``'artist'`` and
            ``'song_url'`` keys.

    Returns:
        A Series with the keys ``"lyrics"`` and ``"song_url"``. When the
        search finds a song, both come from the API result (``None`` for an
        empty value). When nothing is found or the lookup raises, ``lyrics``
        is ``None`` and ``song_url`` is the row's existing ``'song_url'``.
    """
    import os

    # Prefer a token supplied through the environment so the credential does
    # not have to live in the source. The literal is kept only as a
    # backward-compatible fallback.
    # NOTE(review): this token is committed in plain text; rotate it and drop
    # the fallback.
    genius_api_key = (
        os.environ.get("GENIUS_ACCESS_TOKEN")
        or "8uWLYoKNMdAgpIarv32OGJJfCU6D2QE1GsADuMWVYljmsLl1lGmzh1IkLC_c0J38"
    )
    genius = lg.Genius(genius_api_key, timeout=10)

    try:
        song = genius.search_song(title=row['title'], artist=row['artist'])
        if song:
            return pd.Series({
                "lyrics": song.lyrics if song.lyrics else None,
                "song_url": song.url if song.url else None
            })
    except Exception as e:
        # Best-effort lookup: report the failure and fall through to the
        # "not found" result instead of aborting the whole apply().
        print(f"Error with {row['title']} by {row['artist']}: {e}")

    # Shared result for "no match" and "lookup failed".
    return pd.Series({
        "lyrics": None,
        "song_url": row['song_url']
    })

In [None]:
# Load the chart data; the file is expected to provide a numeric 'year' column.
df = pd.read_csv('billboard_hot_1000_1981_2022.csv')

# Split the chart into periods (bounds inclusive). Each period is an
# independent copy because new columns are assigned to these frames later on;
# without .copy() those assignments would target slices of ``df`` and trigger
# pandas' SettingWithCopyWarning.
df_1981_1985 = df[(df['year'] >= 1981) & (df['year'] <= 1985)].copy()
df_1986_1990 = df[(df['year'] >= 1986) & (df['year'] <= 1990)].copy()
df_1991_1995 = df[(df['year'] >= 1991) & (df['year'] <= 1995)].copy()
df_1996_2000 = df[(df['year'] >= 1996) & (df['year'] <= 2000)].copy()
df_2001_2005 = df[(df['year'] >= 2001) & (df['year'] <= 2005)].copy()
df_2006_2010 = df[(df['year'] >= 2006) & (df['year'] <= 2010)].copy()
df_2011_2015 = df[(df['year'] >= 2011) & (df['year'] <= 2015)].copy()
df_2016_2020 = df[(df['year'] >= 2016) & (df['year'] <= 2020)].copy()
df_2021_2022 = df[(df['year'] >= 2021) & (df['year'] <= 2022)].copy()

In [None]:
# Number of joblib workers used for the parallel scraping.
N_JOBS = 6

# The per-period frames, oldest first; they are updated in place.
list_range = [
    df_1981_1985,
    df_1986_1990,
    df_1991_1995,
    df_1996_2000,
    df_2001_2005,
    df_2006_2010,
    df_2011_2015,
    df_2016_2020,
    df_2021_2022,
]

# Scrape lyrics and the guessed Genius URL for every row of every period.
for df_anio in list_range:
    lyric_jobs = (delayed(get_lyrics_parallel)(row) for _, row in df_anio.iterrows())
    df_anio[['lyrics', 'song_url']] = Parallel(n_jobs=N_JOBS)(lyric_jobs)

In [None]:
# Second pass: rows whose lyrics are still null are retried with
# get_song_info, which supplies both the lyrics and the song URL.
for df_anio in list_range:
    lyrics_missing = df_anio['lyrics'].isnull()
    df_anio_null = df_anio[lyrics_missing]
    retried = df_anio_null.apply(get_song_info, axis=1)
    df_anio.loc[lyrics_missing, ["lyrics", "song_url"]] = retried

In [None]:
# Scrape the credit fields for every row, using the throttled worker.
for df_anio in list_range:
    info_columns = ['producers', 'writers', 'label', 'released_on', 'tags']
    info_jobs = (delayed(get_info_parallel_balanced)(row) for _, row in df_anio.iterrows())
    df_anio[info_columns] = Parallel(n_jobs=N_JOBS)(info_jobs)

In [None]:
# Stack the per-period frames into one table; the original row labels are
# discarded in favour of a new consecutive index.
df_combined = pd.concat(objs=list_range, ignore_index=True)

In [150]:
# Share of null values per column, rounded to four decimals.
df_combined.isnull().mean().round(4).to_frame()

Unnamed: 0,0
year,0.0
ranking,0.0
title,0.0
artist,0.0
lyrics,0.0067
song_url,0.0
producers,0.0189
writers,0.0175
label,0.3831
released_on,0.017


In [None]:
# Show every column when a frame is displayed. The full option name is used:
# abbreviated names only work while they match exactly one option.
pd.set_option('display.max_columns', None)

# Non-null count of every column per year, transposed so years run across.
df_combined.groupby("year").apply(lambda x: x.count()).transpose()

  df_combined.groupby("year").apply(lambda x: x.count()).transpose()


year,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
year,99,100,100,100,99,100,100,100,100,100,99,100,98,99,99,100,99,99,99,99,100,100,100,100,100,100,100,98,100,100,100,99,99,100,99,98,100,100,100,100,100,100
ranking,99,100,100,100,99,100,100,100,100,100,99,100,98,99,99,100,99,99,99,99,100,100,100,100,100,100,100,98,100,100,100,99,99,100,99,98,100,100,100,100,100,100
title,99,100,100,100,99,100,100,100,100,100,99,100,98,99,99,100,99,99,99,99,100,100,100,100,100,100,100,98,100,100,100,99,99,100,99,98,100,100,100,100,100,100
artist,99,100,100,100,99,100,100,100,100,100,99,100,98,99,99,100,99,99,99,99,100,100,100,100,100,100,100,98,100,100,100,99,99,100,99,98,100,100,100,100,100,100
lyrics,99,98,99,98,98,100,99,99,100,99,99,100,97,96,98,100,96,97,98,98,98,98,100,100,100,100,100,98,100,100,100,99,99,99,99,97,100,100,100,99,100,100
song_url,99,100,100,100,99,100,100,100,100,100,99,100,98,99,99,100,99,99,99,99,100,100,100,100,100,100,100,98,100,100,100,99,99,100,99,98,100,100,100,100,100,100
producers,94,96,97,98,98,100,98,97,100,95,97,97,92,96,98,95,95,94,96,96,98,98,100,99,100,100,99,98,100,100,100,98,99,99,99,97,100,99,98,99,97,97
writers,95,96,98,98,98,100,99,97,99,95,96,99,93,95,97,98,94,94,96,96,98,98,99,100,100,99,99,98,100,100,100,98,99,99,99,97,100,100,98,99,97,99
label,28,41,41,46,50,41,38,38,43,40,36,42,54,61,60,58,57,56,59,48,59,76,63,64,74,70,67,67,67,72,71,66,75,77,75,80,91,95,85,89,83,77
released_on,92,96,98,98,98,99,99,97,97,97,93,99,95,96,97,97,94,95,97,94,98,98,99,100,100,100,100,98,100,100,100,99,99,99,98,96,100,100,100,99,100,100


In [None]:
# Write the combined table to disk as UTF-8 CSV, without the index column.
df_combined.to_csv(
    "billboard_hot_100_1981_2022_with_lyrics_vf.csv",
    encoding="utf-8",
    index=False,
)