# **Spotify Scraping**
<img src="https://www.liderlogo.es/wp-content/uploads/2022/12/pasted-image-0-4-1024x576.png" alt="Texto alternativo" width="180" height="100">



In [60]:
import sys
import spotipy
import spotipy.util as util
import requests
from bs4 import BeautifulSoup
from unidecode import unidecode
from joblib import Parallel, delayed
import re
import pandas as pd
import time
from tabulate import tabulate
from tqdm import tqdm
import random

## Scrapping code

In [61]:
# Fake headers
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
]

def get_random_headers():
    return {
        'User-Agent': random.choice(USER_AGENTS)
    }

def safe_request(url, retries=3, timeout=10):
    for attempt in range(retries):
        try:
            headers = get_random_headers()
            response = requests.get(url, headers=headers, timeout=timeout)
            if response.status_code == 200:
                return response
            elif response.status_code == 429:  # Rate limited
                wait_time = random.uniform(10, 20)
                print(f"Rate limit hit. Sleeping for {wait_time:.2f} seconds...")
                time.sleep(wait_time)
        except requests.RequestException as e:
            print(f"Request error: {e}. Retrying...")
            time.sleep(random.uniform(2, 4))
    print(f"Failed to retrieve URL after {retries} attempts: {url}")
    return None

def format_genius_url(artists, song_name):
    artist_clean = unidecode(artists).lower().strip()
    artist_clean = artist_clean.replace('&', 'and')
    artist_clean = artist_clean.replace(".", "")
    artist_clean = artist_clean.replace("'", "")
    artist_clean = artist_clean.split('/')[0].strip()
    artist_clean = re.sub(r"\s*(featuring|ft\.)\s.*", "", artist_clean)
    artist_clean = re.sub(r"[^a-zA-Z0-9\s-]", "", artist_clean)

    song_clean = unidecode(song_name.lower().strip())    
    song_clean = song_clean.replace("&", "and") 
    song_clean = song_clean.replace("'", "")
    song_clean = song_clean.split(" / ")[0].strip() 
    song_clean = song_clean.replace("/", " ")
    song_clean = re.sub(r"\s*\((feat\.[^\)]+|Live|Remix|Version)[^\)]+\)\s*", "", song_clean)
    song_clean = re.sub(r"\s*\(\s*([^\)]+)\s*\)", r" \1", song_clean)
    song_clean = re.sub(r"[^a-zA-Z0-9\s-]", "", song_clean)

    artist_clean = re.sub(r"\s+", "-", artist_clean)
    song_clean = re.sub(r"\s+", "-", song_clean)

    link = f'https://genius.com/{artist_clean}-{song_clean}-lyrics'
    link_clean = re.sub(r"-{2,}", "-", link)
    return link_clean

def get_song_lyrics_and_info(artist, song_name):
    song_url = format_genius_url(artist, song_name)
    print(f'\nFetching: {song_url}')

    # Add random delay before each request
    time.sleep(random.uniform(2, 5))

    response = safe_request(song_url)
    if not response:
        return None, None, None, None, None, None

    soup = BeautifulSoup(response.content, 'lxml')

    # Extract lyrics
    lyrics = None
    for tag in soup.select('div[data-lyrics-container="true"], div[class^="Lyrics__Container"], .Lyrics__Root'):
        for i in tag.select('i'):
            i.unwrap()
        tag.smooth()
        lyrics = tag.get_text(strip=True, separator='\n')
        if lyrics:
            break

    # Extract credits
    producers = writers = label = released_on = tags = None

    credits_tag = soup.find('div', {'class': 'SongInfo-sc-4162678b-1 gEkcUA'})
    if credits_tag:
        producers_tag = credits_tag.find_next('div', {'class': re.compile(r'SongInfo-sc-4162678b-4\s+\w+')}, string=re.compile(r'Producer', re.IGNORECASE))
        if producers_tag:
            producers = list(set([span.get_text(strip=True) for span in producers_tag.find_next_sibling().find_all('span')]))

        writers_tag = credits_tag.find_next('div', {'class': re.compile(r'SongInfo-sc-4162678b-4\s+\w+')}, text=re.compile(r'Writer', re.IGNORECASE))
        if writers_tag:
            writers = list(set([span.get_text(strip=True) for span in writers_tag.find_next_sibling().find_all('span')]))

        label_tag = credits_tag.find_next('div', {'class': re.compile(r'SongInfo-sc-4162678b-4\s+\w+')}, text=re.compile(r'Label', re.IGNORECASE))
        if label_tag:
            label = list(set([span.get_text(strip=True) for span in label_tag.find_next_sibling().find_all('span')]))

        released_on_tag = credits_tag.find_next('div', {'class': re.compile(r'SongInfo-sc-4162678b-4\s+\w+')}, text=re.compile(r'Released\s+(in|on)', re.IGNORECASE))
        if released_on_tag:
            released_on = released_on_tag.find_next_sibling('div').get_text(strip=True)

    tags_tag = soup.find('div', {'class': 'SongTags-sc-b55131f0-0 eUsmSA'}, string=re.compile(r'Tags', re.IGNORECASE))
    if tags_tag:
        tags = list(set([a.get_text(strip=True) for a in tags_tag.find_next_sibling('div').find_all('a')]))

    return lyrics, producers, writers, label, released_on, tags

def get_lyrics_and_info_parallel(row):
    lyrics, producers, writers, label, released_on, tags = get_song_lyrics_and_info(row['artist'], row['title'])
    song_url = format_genius_url(row['artist'], row['title'])
    return pd.Series([lyrics, producers, writers, label, released_on, tags, song_url])


In [None]:
# df = pd.read_csv('billboard_hot_100_1981_2022.csv')

# results = []
# for _, row in tqdm(df.iterrows(), total=len(df)):
#     results.append(get_lyrics_and_info_parallel(row))
#     time.sleep(random.uniform(3, 6))  # throttle between songs

# df[['lyrics', 'producers', 'writers', 'label', 'released_on', 'tags', 'song_url']] = pd.DataFrame(results, index=df.index)


In [63]:
# Adjust this for safety vs. speed
N_JOBS = 6  # safer than 10
MIN_DELAY = 4   # seconds
MAX_DELAY = 9   # seconds

def get_lyrics_and_info_parallel_balanced(row):
    # Add randomized delay per worker
    time.sleep(random.uniform(MIN_DELAY, MAX_DELAY))
    return get_lyrics_and_info_parallel(row)

# Load your dataset
df = pd.read_csv('billboard_hot_100_1981_2022.csv')

# Apply in parallel with safer throttling
results = Parallel(n_jobs=N_JOBS)(
    delayed(get_lyrics_and_info_parallel_balanced)(row) for _, row in tqdm(df.iterrows(), total=len(df))
)

# Assign results back to the dataframe
df[['lyrics', 
    'producers', 
    'writers', 
    'label', 
    'released_on', 
    'tags', 
    'song_url']] = pd.DataFrame(results, index=df.index)

100%|██████████| 4182/4182 [2:15:33<00:00,  1.94s/it]  


In [64]:
# Percentage of missing values per column
missing_percent = df.isnull().mean() * 100

# Display the result sorted from highest to lowest
print(missing_percent.sort_values(ascending=False))


label          39.526542
released_on     4.375897
producers       4.304161
writers         4.232425
lyrics          3.586801
tags            3.443329
year            0.000000
ranking         0.000000
title           0.000000
artist          0.000000
song_url        0.000000
dtype: float64


In [66]:
df.to_csv('billboard_hot_100_1981_2022_with_lyrics.csv', index=False)