In [8]:
from bs4 import BeautifulSoup
import re
import requests
from thefuzz import fuzz
import pandas as pd
import time

# urls from genius api

In [9]:
def are_strings_similar(str1, str2, threshold=70):
    similarity_score = fuzz.ratio(str1, str2)
    return similarity_score >= threshold

In [13]:
# Replace 'YOUR_API_KEY' with your actual Genius API key
api_key = 'EOF7HOB8HPEY4xJvOSAxjNvmlsjp9jrVU20FphTY101MCi6PmBePEU8s-3KirNd5'

# Define the base URL for the Genius API
base_url = 'https://api.genius.com/'

# Define the endpoint for searching a song by its title and artist
search_endpoint = 'search'

#function with song title and artist name as parameters and returns the lyrics url
def get_lyrics_url(song_title, artist_name):
    #only take the first artist
    artist_names = artist_name.split(",")

    # Create a headers dictionary with the 'Authorization' field containing your API key
    headers = {
        'Authorization': f'Bearer {api_key}'
    }

    # Create a dictionary with query parameters for the search request
    params = {
        'q': f'{song_title} {artist_names[0]}'
    }

    # Make the GET request to search for the song
    response = requests.get(f'{base_url}{search_endpoint}', headers=headers, params=params)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        data = response.json()
        # Extract relevant information from the response
        song_hits = data['response']['hits']
        #iterate through the song hits and find the song with the correct artist
        for hit in song_hits:
            #check if hit type is song
            if hit['type'] == 'song':
                song_info = hit['result']
                artist = song_info['primary_artist']['name']
                if are_strings_similar(song_info['title'].lower(), song_title):
                    #check if one of artist_names is similar to artist
                    for artist_name in artist_names:
                        if are_strings_similar(artist.lower(), artist_name):
                            #check lyrics is available
                            if song_info['lyrics_state'] == 'complete':
                                #get lyrics url
                                return song_info['url']
                            else:
                                #return lyrics state
                                return f'lyrics state: {song_info["lyrics_state"]}'
        #if no song is found, return None
        return None
    else:
        time.sleep(60)
        return(f'Error: {response.status_code}')

In [14]:
#TEST
get_lyrics_url('africa', 'toto')

'https://genius.com/Toto-africa-lyrics'

In [15]:
#load first genius dataframe 
df = pd.read_csv('no_lyrics.csv')
df.shape

(8865, 37)

In [16]:
#get the urls for every song
for index, row in df.iterrows():
    try:
        df.at[index, 'lyrics_url'] = get_lyrics_url(row['clear_name'], row['artists'])
    except:
        df.at[index, 'lyrics_url'] = 'Error'

In [17]:
#how many songs have a lyrics url
df_with_url = df[df['lyrics_url'].str.contains('genius.com', na=False)]
df_with_url.shape

(910, 37)

In [18]:
#make dataframe with songs without lyrics url
df_without = df[~df['uri'].isin(df_with_url['uri'])]
df_without.shape

(7955, 37)

now restart getting the urls. this time outcomment the artist in the search query (marked with comment in get_lyrics_url)

In [19]:
def url_info(genius_df):    
    #count the number of None in gerius_df
    null_cnt = genius_df[genius_df['lyrics_url'].isnull()].shape[0]
    #count the number of "Error: 503" in gerius_df
    error_cnt = genius_df[genius_df['lyrics_url'] == 'Error: 503'].shape[0]
    #count the number of "Error" in gerius_df
    error_cnt2 = genius_df[genius_df['lyrics_url'] == 'Error'].shape[0]
    #count the number of "lyrics state: unreleased" in gerius_df
    unreleased_cnt = genius_df[genius_df['lyrics_url'] == 'lyrics state: unreleased'].shape[0]
    #count the number of nan in gerius_df
    nan_cnt = genius_df[genius_df['lyrics_url'] == 'nan'].shape[0]
    #print all counts
    print(f'null_cnt: {null_cnt}')
    print(f'error_cnt: {error_cnt}')
    print(f'error_cnt2: {error_cnt2}')
    print(f'unreleased_cnt: {unreleased_cnt}')
    print(f'nan_cnt: {nan_cnt}')

In [20]:
url_info(df_without)

null_cnt: 7893
error_cnt: 0
error_cnt2: 0
unreleased_cnt: 62
nan_cnt: 0


> ⚠️ Don't forget to store the resulting dataframes


# webscrape lyrics from genius urls

In [21]:
def extract_lyrics_from_genius_html(html):
    try:
        # Parse the HTML content
        soup = BeautifulSoup(html.replace('<br/>', '\n'), "html.parser")
        # Find the element that contains the lyrics
        divs = soup.find_all("div", class_=re.compile("^lyrics$|Lyrics__Container"))
        if divs is None or len(divs) == 0:
            return False, None
        lyrics = "\n".join([div.get_text() for div in divs])
        # Remove [Verse], [Bridge], etc.
        lyrics = re.sub(r'(\[.*?\])*', '', lyrics)
        lyrics = re.sub('\n{2}', '\n', lyrics)  # Gaps between verses
        return True, lyrics.strip("\n")
    except Exception as e:
        return False, "An error occurred: " + str(e)

In [22]:
def get_html_from_url(url):
    try:
        # Send an HTTP GET request to the URL
        response = requests.get(url)

        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Get the HTML content
            html = response.text
            return True, html
        else:
            print("HTTP GET request failed with status code: " + str(response.status_code))
            return False, f"HTTP GET request failed with status code: {response.status_code}"

    except Exception as e:
        return False, "An error occurred: " + str(e)


In [23]:
#TEST
url = "https://genius.com/Ac-dc-back-in-black-lyrics"
ok, html = get_html_from_url(url)
ok, lyrics = extract_lyrics_from_genius_html(html)
print(lyrics)

Back in black, I hit the sack
I've been too long, I'm glad to be back
Yes, I'm let loose from the noose
That's kept me hanging about
I'm just looking at the sky 'cause it's getting me high
Forget the hearse, 'cause I'll never die
I got nine lives, cat's eyes
Abusing every one of them and running wild

'Cause I'm back
Yes, I'm back
Well, I'm back
Yes, I'm back
Well, I'm back, back
Well, I'm back in black
Yes, I'm back in black

Back in the back of a Cadillac
Number one with a bullet, I'm a power pack
Yes, I'm in a bang with a gang
They've got to catch me if they want me to hang
'Cause I'm back on the track and I'm beatin' the flack
Nobody's gonna get me on another rap
So, look at me now, I'm just making my play
Don't try to push your luck, just get out of my way

'Cause I'm back
Yes, I'm back
Well, I'm back
Yes, I'm back
Well, I'm back, back
Well, I'm back in black
Yes, I'm back in black


Well, I'm back
Yes, I'm back
Well, I'm back
Yes, I'm back
Well, I'm back, back
Well, I'm back in b

In [28]:
def get_lyrics_from_genius(df, output_path='lyrics.csv'):
    #create a new column for the lyrics
    df['lyrics'] = ''
    
    #reset index
    df.reset_index(drop=True, inplace=True)

    df_length = len(df)
    #5% of the length
    step = df_length // 100

    error_cnt = 0
    step_cnt = 0
    
    #iterate through the dataframe
    for index, row in df.iterrows():

        #print the progress
        if index % step == 0:
            print(f"progress: {index} / {df_length}")
            # if error_cnt > 0 and error_cnt == step_cnt:
            #     print("too many errors, stop")
            #     break
            # else:
            #     error_cnt = 0
            #     step_cnt = 0
        #get the url
        url = row['lyrics_url']
        #get the html
        ok, html = get_html_from_url(url)

        if ok:
            #extract the lyrics
            ok, lyrics = extract_lyrics_from_genius_html(html)
            if not ok:
                error_cnt += 1
                print(lyrics)
        else:
            error_cnt += 1
            print(html)
        
        #update the dataframe
        df.at[index, 'lyrics'] = lyrics
        
        step_cnt += 1
        
        #wait for 1 second
        #time.sleep(3)
    #save the dataframe
    df.to_csv(output_path, index=False)

In [29]:
# scrape the lyrics
get_lyrics_from_genius(df_with_url, 'lyrics.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lyrics'] = ''


progress: 0 / 910
None
None
None
progress: 9 / 910
None
None
None
progress: 18 / 910
None
None
None
progress: 27 / 910
None
None
None
None
None
progress: 36 / 910
None
None
None
None
progress: 45 / 910
None
None
None
progress: 54 / 910
None
None
None
None
None
progress: 63 / 910
None
None
None
None
None
None
None
progress: 72 / 910
None
None
None
None
None
None
progress: 81 / 910
None
None
None
None
None
None
None
None
progress: 90 / 910
None
None
None
None
None
progress: 99 / 910
None
None
None
None
None
None
progress: 108 / 910
None
None
None
None
None
None
progress: 117 / 910
None
None
None
None
None
None
None
None
progress: 126 / 910
None
None
None
None
None
None
None
None
None
progress: 135 / 910
None
None
None
None
None
None
None
progress: 144 / 910
None
None
None
None
None
progress: 153 / 910
None
None
None
None
None
progress: 162 / 910
None
None
None
None
None
None
None
progress: 171 / 910
None
None
None
None
None
None
None
progress: 180 / 910
None
None
None
None
None
None
None

In [46]:
df = pd.read_csv('lyrics.csv')

In [47]:
#count the number of songs with lyrics
df[~df['lyrics'].notnull()].shape

(735, 37)

In [50]:
df.sort_values(by=['popularity'], inplace=True)

In [51]:
df.head(10)

Unnamed: 0,isrc,genres,name,artists,album,release_date,release_date_precision,uri,spotify_id,chart_power,...,Listeners,Playcount,Tags,clear_name,lyrics_url,lyrics,chart_peak,chart_weeks,rank,chart_rank
909,DEE860701147,"['german', 'pop']",it's all in the game,nena,It's All in the Game,1985-08-10,day,spotify:track:2DjOVzUbgLQ0NUi4VF1lT8,2DjOVzUbgLQ0NUi4VF1lT8,,...,309,1707,"['80s', 'new wave', 'Neue Deutsche Welle', '19...",it's all in the game,https://genius.com/Nena-its-all-in-the-game-ly...,There was an old man\nWho was robbing himself ...,0,0,0.000118,0
889,GBEEL0600125,"['industrial', 'rock']",rattlesnake insurance,"foetus,the foetus all nude revue",Sink,1989,year,spotify:track:3svNLYIQgvEfMw572uTPtv,3svNLYIQgvEfMw572uTPtv,,...,1854,5573,[],rattlesnake insurance,https://genius.com/Foetus-rattlesnake-insuranc...,,0,0,0.000944,0
888,GBEEL0600124,"['industrial', 'rock']",sick minutes,"foetus,foetus uber frisco",Sink,1989,year,spotify:track:17GmAhgpBjDHrhSQVf61px,17GmAhgpBjDHrhSQVf61px,,...,2043,6511,"['seen live', '80s', 'experimental', 'industri...",sick minutes,https://genius.com/Foetus-sick-minutes-lyrics,,0,0,0.001063,0
887,QM4TW1576255,"['electro', 'electronic', 'industrial']",icepick method,hunting lodge,Will,1983,year,spotify:track:3h4BTnJ7k2vKyL8FLqyYUr,3h4BTnJ7k2vKyL8FLqyYUr,,...,2213,4873,[],icepick method,https://genius.com/Hunting-lodge-icepick-metho...,,0,0,0.001073,0
886,USRC10602214,"['dance', 'rock']",shut your face,polyrock,Polyrock,1980,year,spotify:track:2fNLJ7S3tchQcagTdRN4vg,2fNLJ7S3tchQcagTdRN4vg,,...,2197,5869,"['new wave', 'post-punk', 'philip glass-punkro...",shut your face,https://genius.com/Polyrock-shut-your-face-lyrics,,0,0,0.001105,0
908,GBAYE1801177,"['blues', 'british', 'metal', 'psych-rock', 'r...",watch out!,atomic rooster,Atomic Rooster,1980-01-01,day,spotify:track:4qSkje27kD8Vu7uYAhg9Rb,4qSkje27kD8Vu7uYAhg9Rb,,...,610,1379,[],watch out!,https://genius.com/Atomic-rooster-watch-out-ly...,,0,0,0.000237,0
885,GB4971500225,"['industrial', 'rock']",shut,"foetus,the foetus all nude revue",Sink,1989,year,spotify:track:0usGeQ8EHUH0kJcXLaH66Q,0usGeQ8EHUH0kJcXLaH66Q,,...,2090,7151,[],shut,https://genius.com/Foetus-shut-lyrics,,0,0,0.001108,0
890,USMC17024885,"['rock', 'soul']",cantaloupe island,el chicano,Viva El Chicano! (Their Very Best),1988-01-01,day,spotify:track:2DEyIFKO7SW351whjJz8Pv,2DEyIFKO7SW351whjJz8Pv,,...,1795,4816,"['zapfunk', 'rare groove']",cantaloupe island,https://genius.com/El-chicano-cantaloupe-islan...,,0,0,0.000888,0
884,QMPKX1672969,"['ambient', 'industrial', 'punk', 'punk-rock',...",come visit the big bigot,severed heads,(Come Visit) the Big Bigot,1986-09-01,day,spotify:track:3XsiMrV8V8y1J2BE54yh9m,3XsiMrV8V8y1J2BE54yh9m,,...,2272,6868,"['post-punk', 'synth', 'Hell Yeah', 'songs by ...",come visit the big bigot,https://genius.com/Severed-heads-come-visit-th...,,0,0,0.001177,0
881,GBAJH0400621,"['alt-rock', 'alternative', 'british', 'rock']",auto destruction,barry adamson,Moss Side Story,1989-03-06,day,spotify:track:4A2UUj7JtnzYnJemvigjpB,4A2UUj7JtnzYnJemvigjpB,,...,3605,8264,"['dark', 'pop', '80s', 'experimental', 'cinema...",auto destruction,https://genius.com/Barry-adamson-autodestructi...,,0,0,0.001813,0


In [34]:
df2 = pd.read_csv('final_dataset.csv')
df2.shape

(45210, 36)

In [40]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45210 entries, 0 to 45209
Data columns (total 36 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   isrc                    45210 non-null  object 
 1   genres                  45210 non-null  object 
 2   name                    45210 non-null  object 
 3   artists                 45210 non-null  object 
 4   album                   45210 non-null  object 
 5   release_date            45210 non-null  object 
 6   release_date_precision  45210 non-null  object 
 7   uri                     45210 non-null  object 
 8   spotify_id              45210 non-null  object 
 9   popularity              45210 non-null  int64  
 10  danceability            45210 non-null  float64
 11  energy                  45210 non-null  float64
 12  key                     45210 non-null  int64  
 13  loudness                45210 non-null  float64
 14  mode                    45210 non-null

In [41]:
#drop chart_power column
df.drop(columns=['chart_power'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['chart_power'], inplace=True)


In [42]:
#concatenate the two dataframes
df3 = pd.concat([df2, df], axis=0)
df3.shape

(45385, 36)

In [43]:
#delete duplicates
df3.drop_duplicates(subset=['uri'], inplace=True)

In [44]:
df3.shape

(45385, 36)

In [45]:
df3.to_csv('final_dataset.csv', index=False)