# **Spotify Scraping**
<img src="https://www.liderlogo.es/wp-content/uploads/2022/12/pasted-image-0-4-1024x576.png" alt="Texto alternativo" width="180" height="100">



In [3]:
import sys
import spotipy
import spotipy.util as util
import requests
from bs4 import BeautifulSoup
from unidecode import unidecode
from joblib import Parallel, delayed
import re
import pandas as pd
import time
from tabulate import tabulate

In [4]:
def format_genius_url(artists, song_name):
    
    artist_clean = unidecode(artists).lower().strip()
    artist_clean = artist_clean.replace('&', 'and')
    artist_clean = artist_clean.replace(".", "")
    artist_clean = artist_clean.replace("'", "")
    artist_clean = artist_clean.split('/')[0].strip() #  # If artist name contains '/', take only the part before it
    artist_clean = re.sub(r"\s*(featuring|ft\.)\s.*", "", artist_clean)
    artist_clean = re.sub(r"[^a-zA-Z0-9\s-]", "", artist_clean)


    song_clean = unidecode(song_name.lower().strip())    
    song_clean = song_clean.replace("&", "and") 
    song_clean = song_clean.replace("'", "")
    song_clean = song_clean.split(" / ")[0].strip() 
    song_clean = song_clean.replace("/", " ")
    song_clean = re.sub(r"\s*\((feat\.[^\)]+|Live|Remix|Version)[^\)]+\)\s*", "", song_clean)  # Remove (feat. Artist), (Live), etc.
    song_clean = re.sub(r"\s*\(\s*([^\)]+)\s*\)", r" \1", song_clean)
    #song_clean = re.sub(r"\s*-\s*.*", "", song_clean)  # Remove "- from Movie Name or Other versions"
    song_clean = re.sub(r"[^a-zA-Z0-9\s-]", "", song_clean)

    artist_clean = re.sub(r"\s+", "-", artist_clean)
    song_clean = re.sub(r"\s+", "-", song_clean)
    
    link = f'https://genius.com/{artist_clean}-{song_clean}-lyrics'

    link_clean = re.sub(r"-{2,}", "-", link)

    return link_clean

def get_song_lyrics_and_info(artist, song_name):
    song_url = format_genius_url(artist, song_name)

    print('\nSong: {}\nArtist: {}'.format(song_name, artist))
    print('song_url: ', song_url)
    
    soup = BeautifulSoup(requests.get(song_url).content, 'lxml')
    
    lyrics = None
    for tag in soup.select('div[data-lyrics-container="true"], div[class^="Lyrics__Container"], .Lyrics__Root'):
        for i in tag.select('i'):
            i.unwrap()
        tag.smooth()
        lyrics = tag.get_text(strip=True, separator='\n')
        if lyrics:
            break

    # Extract additional information
    producers = None
    writers = None
    label = None
    released_on = None
    tags = None

    credits_tag = soup.find('div', {'class': 'SongInfo-sc-4162678b-1 gEkcUA'})

    if credits_tag:
        producers_tag = credits_tag.find_next('div', {'class': re.compile(r'SongInfo-sc-4162678b-4\s+\w+')}, string=re.compile(r'Producer', re.IGNORECASE))
        if producers_tag:                                      
            producers = list(set([span.get_text(strip=True) for span in producers_tag.find_next_sibling().find_all('span')]))

        writers_tag = credits_tag.find_next('div', {'class': re.compile(r'SongInfo-sc-4162678b-4\s+\w+')}, text=re.compile(r'Writer', re.IGNORECASE))
        if writers_tag:
            writers = list(set([span.get_text(strip=True) for span in writers_tag.find_next_sibling().find_all('span')]))
        
        label_tag = credits_tag.find_next('div', {'class': re.compile(r'SongInfo-sc-4162678b-4\s+\w+')}, text=re.compile(r'Label', re.IGNORECASE))
        if label_tag:
            label = list(set([span.get_text(strip=True) for span in label_tag.find_next_sibling().find_all('span')]))

        released_on_tag = credits_tag.find_next('div', {'class': re.compile(r'SongInfo-sc-4162678b-4\s+\w+')}, text=re.compile(r'Released\s+(in|on)', re.IGNORECASE))
        if released_on_tag:
            released_on = released_on_tag.find_next_sibling('div').get_text(strip=True) if released_on_tag else None

    tags_tag = soup.find('div', {'class': 'SongTags-sc-b55131f0-0 eUsmSA'}, string=re.compile(r'Tags', re.IGNORECASE))
    if tags_tag:
        tags = list(set([a.get_text(strip=True) for a in tags_tag.find_next_sibling('div').find_all('a')])) if tags_tag else None

    return lyrics, producers, writers, label, released_on, tags

def get_lyrics_and_info_parallel(row):
    lyrics, producers, writers, label, released_on, tags = get_song_lyrics_and_info(row['artist'], row['title'])
    song_url = format_genius_url(row['artist'], row['title'])
    return pd.Series([lyrics, producers, writers, label, released_on, tags, song_url])

In [7]:
# How Do U Want It / California Love,2Pac featuring K-Ci and JoJo / 2Pac featuring Dr. Dre and Roger Troutman
# 867-5309/Jenny,Tommy Tutone
# Baby Got Back,
# Baby-Baby-Baby,TLC
# Crank That (Soulja Boy),Soulja Boy Tell 'Em

singer = 'Jay-Z featuring Alicia Keys'
song = 'Empire State of Mind'
df = pd.DataFrame([{'artist': singer, 'title': song}])

df[['lyrics', 'producers', 'writers', 'label', 'released_on', 'tags', 'song_url']] = df.apply(get_lyrics_and_info_parallel, axis=1)


Song: Empire State of Mind
Artist: Jay-Z featuring Alicia Keys
song_url:  https://genius.com/jay-z-empire-state-of-mind-lyrics


  writers_tag = credits_tag.find_next('div', {'class': re.compile(r'SongInfo-sc-4162678b-4\s+\w+')}, text=re.compile(r'Writer', re.IGNORECASE))
  label_tag = credits_tag.find_next('div', {'class': re.compile(r'SongInfo-sc-4162678b-4\s+\w+')}, text=re.compile(r'Label', re.IGNORECASE))
  released_on_tag = credits_tag.find_next('div', {'class': re.compile(r'SongInfo-sc-4162678b-4\s+\w+')}, text=re.compile(r'Released\s+(in|on)', re.IGNORECASE))


In [8]:
df = pd.read_csv('billboard_hot_100_1981_2022.csv')
N_JOBS = 10

df[['lyrics', 
    'producers', 
    'writers', 
    'label', 
    'released_on', 
    'tags', 
    'song_url']] = Parallel(n_jobs=N_JOBS)(delayed(get_lyrics_and_info_parallel)(row) for _, row in df.iterrows())

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4182 entries, 0 to 4181
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   year         4182 non-null   int64 
 1   ranking      4182 non-null   int64 
 2   title        4182 non-null   object
 3   artist       4182 non-null   object
 4   lyrics       674 non-null    object
 5   producers    668 non-null    object
 6   writers      671 non-null    object
 7   label        288 non-null    object
 8   released_on  666 non-null    object
 9   tags         677 non-null    object
 10  song_url     4182 non-null   object
dtypes: int64(2), object(9)
memory usage: 359.5+ KB


In [19]:
df.apply(lambda x: x.isnull().mean()).to_frame().transpose()

Unnamed: 0,year,ranking,title,artist,lyrics,producers,writers,label,released_on,tags,song_url
0,0.0,0.0,0.0,0.0,0.838833,0.840268,0.83955,0.931133,0.840746,0.838116,0.0


In [16]:
pd.set_option("display.max_colwidth", None)

df[df['lyrics'].isnull()].head()

Unnamed: 0,year,ranking,title,artist,lyrics,producers,writers,label,released_on,tags,song_url
17,1981,18,Just the Two of Us,"Grover Washington, Jr. & Bill Withers",,,,,,,https://genius.com/grover-washington-jr-and-bill-withers-just-the-two-of-us-lyrics
27,1981,29,Guilty,Barbra Streisand & Barry Gibb,,,,,,,https://genius.com/barbra-streisand-and-barry-gibb-guilty-lyrics
45,1981,47,The Breakup Song (They Don't Write 'Em),The Greg Kihn Band,,,,,,,https://genius.com/the-greg-kihn-band-the-breakup-song-they-dont-write-em-lyrics
53,1981,55,What Are We Doin' in Love,Dottie West & Kenny Rogers,,,,,,,https://genius.com/dottie-west-and-kenny-rogers-what-are-we-doin-in-love-lyrics
61,1981,63,Ain't Even Done with the Night,John Cougar,,,,,,,https://genius.com/john-cougar-aint-even-done-with-the-night-lyrics


In [13]:
df.to_csv("lyrics_output.csv", index=False)

In [12]:
print(tabulate(df.head(), headers='keys', tablefmt='psql'))

+----+--------+-----------+---------------------------+----------------------------+------------------------------------------------------------------+---------------------------------------------+-------------------------------------+--------------------+--------------------+------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------+
|    |   year |   ranking | title                     | artist                     | lyrics                                                           | producers                                   | writers                             | label              | released_on        | tags                                                                                                                                     | song_url                                                            |
|----+--------+-----------

In [None]:
import lyricsgenius as lg
genius_api_key = "8uWLYoKNMdAgpIarv32OGJJfCU6D2QE1GsADuMWVYljmsLl1lGmzh1IkLC_c0J38"

genius = lg.Genius(genius_api_key)

song = genius.search_song(title='Beat Box 2 / Beat Box 3')
lyrics = song.lyrics
print(lyrics)

In [None]:
N_JOBS = 8
data['lyrics_v2'] = Parallel(n_jobs=N_JOBS)(delayed(get_lyrics_parallel)(row) for _, row in data.iterrows())