In [1]:
# imports
import pandas as pd
import os
import requests
import urllib
import re
from bs4 import BeautifulSoup, UnicodeDammit
from html import unescape
from typing import Optional

In [2]:
# Read the artists table from its pickle on Drive.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
artists_df = pd.read_pickle(r"/content/drive/MyDrive/SocialGraphs/Spotify_data/artists_dataset.pickle")

# Read the preprocessed Spotify table the same way.
spotify_df = pd.read_pickle(r'/content/drive/MyDrive/SocialGraphs/spotify_dataset_preprocessed.pickle')

# Attach to every row the number of rows that share its `tracks` value.
track_occurrences = spotify_df.groupby(['tracks'])['tracks'].transform('count')
spotify_df["tracks_freq"] = track_occurrences
spotify_df.head()

Unnamed: 0,user_id,artistname,trackname,tracks,genre,tracks_freq
0,9cc0cfd4d7d7885102480dd99e7a90d6,(elvis costello),(The Angels Wanna Wear My) Red Shoes,"((The Angels Wanna Wear My) Red Shoes, (elvis ...","[art rock, folk, folk rock, mellow gold, new w...",60
1,9cc0cfd4d7d7885102480dd99e7a90d6,(elvis costello),"(What's So Funny 'Bout) Peace, Love And Unders...","((What's So Funny 'Bout) Peace, Love And Under...","[art rock, folk, folk rock, mellow gold, new w...",68
3,9cc0cfd4d7d7885102480dd99e7a90d6,(elvis costello),Accidents Will Happen,"(Accidents Will Happen, (elvis costello))","[art rock, folk, folk rock, mellow gold, new w...",68
4,9cc0cfd4d7d7885102480dd99e7a90d6,(elvis costello),Alison,"(Alison, (elvis costello))","[art rock, folk, folk rock, mellow gold, new w...",164
5,9cc0cfd4d7d7885102480dd99e7a90d6,(lissie),All Be Okay,"(All Be Okay, (lissie))",[],5


In [None]:
# Unroll the artist collections so that every (track, artist) pair gets its own
# row, then keep a single row per pair.
spotify_df_exploded = (
    spotify_df
    .explode('artistname')
    .drop_duplicates(["tracks", "artistname"])
)

# Build, in one pass over the exploded table, each artist's list of
# (track, frequency) pairs ordered by descending frequency. Grouping once
# replaces re-filtering the whole table for every artist.
tracks_by_artist = {
    name: sorted(
        zip(group['tracks'], group['tracks_freq']),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for name, group in spotify_df_exploded.groupby('artistname', sort=False)
}

# Write the results as real columns: rows yielded by DataFrame.iterrows() may be
# copies, so assigning to them is not guaranteed to update artists_df.
# Artists without any track get an empty list.
artists_df["tracks"] = [tracks_by_artist.get(name, []) for name in artists_df.artistname]
# Keep (up to) the 5 most frequent tracks of each artist.
artists_df["top_tracks"] = [tracks[:5] for tracks in artists_df["tracks"]]

artists_df.head()

Create a tracks dataset in which each track has its associated lyrics:

In [182]:
# A track is kept only if it occurs more than this many times in the Spotify dataset.
MIN_TRACK_FREQ = 50

# Unique (track, frequency) pairs gathered from every artist's top tracks.
tracks_set = {track for tracks in artists_df.top_tracks for track in tracks}
data = {'track': list(tracks_set),
        'lyrics': [None] * len(tracks_set)}

# Create the tracks dataset.
tracks_df = pd.DataFrame(data)
# Each entry of `track` is a (track, frequency) pair: split it into two columns.
tracks_df[['track', 'trackfreq']] = pd.DataFrame(tracks_df['track'].tolist(), index=tracks_df.index)
# The remaining `track` value is itself a pair: expose its parts as trackname / artistname.
tracks_df[['trackname', 'artistname']] = pd.DataFrame(tracks_df['track'].tolist(), index=tracks_df.index)

# Keep only the tracks whose frequency in the Spotify dataset exceeds MIN_TRACK_FREQ.
# .copy() detaches the result from the unfiltered frame so that later column
# assignments do not raise SettingWithCopyWarning.
tracks_df = tracks_df[tracks_df.trackfreq > MIN_TRACK_FREQ].copy()
tracks_df.head()

Unnamed: 0,track,lyrics,trackfreq,trackname,artistname
2,"(Me Enamora, (juanes))",,58,Me Enamora,(juanes)
4,"(Bump n' Grind, (r. kelly))",,201,Bump n' Grind,(r. kelly)
5,"(Ecstasy, (atb))",,76,Ecstasy,(atb)
10,"(Pro Nails (Rusko Remix), (kid sister))",,53,Pro Nails (Rusko Remix),(kid sister)
13,"(Money On My Mind, (sam smith))",,573,Money On My Mind,(sam smith)


In [204]:
# Display the filtered tracks dataset (bare last expression -> rich DataFrame output).
tracks_df

Unnamed: 0,track,lyrics,trackfreq,trackname,artistname
2,"(Me Enamora, (juanes))",,58,Me Enamora,(juanes)
4,"(Bump n' Grind, (r. kelly))",,201,Bump n' Grind,(r. kelly)
5,"(Ecstasy, (atb))",,76,Ecstasy,(atb)
10,"(Pro Nails (Rusko Remix), (kid sister))",,53,Pro Nails (Rusko Remix),(kid sister)
13,"(Money On My Mind, (sam smith))",,573,Money On My Mind,(sam smith)
...,...,...,...,...,...
23145,"(Ode To Oi, (tjr))",,97,Ode To Oi,(tjr)
23155,"(That'll Be The Day, (buddy holly))",,113,That'll Be The Day,(buddy holly)
23159,"(Behind Blue Eyes, (limp bizkit))",,292,Behind Blue Eyes,(limp bizkit)
23170,"(Bom Bom - Radio Edit, (sam))",,116,Bom Bom - Radio Edit,(sam)


In [6]:
# assign each track the genre
def get_genre(artists, artists_table=None):
    """Collect the genres of every listed artist into one flat list.

    Args:
        artists: collection of artist names to look up in the `artistname` column.
        artists_table: DataFrame with `artistname` and `genres` columns.
            Defaults to the notebook-level `artists_df`.

    Returns:
        A list concatenating the `genres` entries of all matching rows, in row
        order. Duplicates are kept. Rows whose `genres` is None or NaN
        contribute nothing.
    """
    table = artists_df if artists_table is None else artists_table
    genres = []
    for g in list(table[table.artistname.isin(artists)].genres):
        # Missing genres may be stored as None or as a float NaN; extending a
        # list with NaN would raise TypeError, so skip both.
        if g is None or (isinstance(g, float) and pd.isna(g)):
            continue
        genres.extend(g)
    return genres

# One genre list per row, computed from that row's artist names.
tracks_df["genre"] = [get_genre(artists) for artists in tracks_df.artistname]

In [156]:
# Persist tracks_df to Drive as a pickle.
# NOTE(review): this file name ends in `.pickle_lyrics`, unlike the
# `tracks_dataset_lyrics.pickle` written at the end of the notebook - confirm it is intended.
tracks_df.to_pickle(r'/content/drive/MyDrive/SocialGraphs/tracks_dataset.pickle_lyrics')

# Get lyrics

In [64]:
# Genius API access token, taken from the GENIUS_ACCESS_TOKEN environment
# variable when it is set.
# SECURITY: the fallback below is a credential committed in the notebook. It is
# kept only so existing runs keep working; revoke it, set GENIUS_ACCESS_TOKEN
# instead, and delete the literal.
client_access_token = os.environ.get(
    "GENIUS_ACCESS_TOKEN",
    "xiw8qk2KQVbosuUax1XxElfo4HBwSuRMqfN6QfC5wOruHg9JptmG70Zn-8-sI5E-",
)

def get_lyrics_url(trackname, artists):
    """Search Genius for a track and return the URL of its lyrics page.

    Args:
        trackname: title of the track.
        artists: collection of artist names credited on the track.

    Returns:
        The `url` of the first search hit whose title equals `trackname`
        (case-insensitively) and whose `artist_names` contains at least one of
        `artists`, or None when no hit qualifies.

    Raises:
        requests.RequestException: on network failure, timeout, or a non-2xx
            response (e.g. an invalid token or rate limiting).
    """
    search_term = trackname + " " + ", ".join(artists)

    # Use HTTPS and send the token in a header so it never travels in clear
    # text or ends up in URLs; `params` takes care of the URL encoding.
    response = requests.get(
        "https://api.genius.com/search",
        params={"q": search_term},
        headers={"Authorization": f"Bearer {client_access_token}"},
        timeout=10,
    )
    # Fail loudly on HTTP errors instead of tripping over a missing JSON key.
    response.raise_for_status()
    hits = response.json().get("response", {}).get("hits", [])

    wanted_title = trackname.lower()
    for item in hits:
        result = item['result']
        # The title has to match exactly (ignoring case) ...
        if result['title'].lower() != wanted_title:
            continue
        # ... and at least one of our artists has to be credited on the hit.
        credited = result["artist_names"].lower()
        if any(artist.lower() in credited for artist in artists):
            # Return the first qualifying hit rather than the last one.
            return result['url']
    return None

In [71]:
def get_lyrics(url_data) -> Optional[str]:
    """Download a Genius lyrics page and extract the lyrics as plain text.

    Args:
        url_data: URL of the lyrics page, or None when no page was found.

    Returns:
        "" when `url_data` is None. Otherwise the text of the `div.lyrics`
        element when the page has one, else the text of all divs whose class
        starts with `Lyrics__Container`, joined by newlines, with `<br/>`
        turned into newlines, tags stripped and HTML entities decoded.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    if url_data is None:
        return ""
    # A timeout keeps one stalled request from hanging a long scraping run.
    page = requests.get(url_data, timeout=10)

    soup = BeautifulSoup(page.text, "html.parser")
    legacy_block = soup.find("div", class_="lyrics")  # single div holding the lyrics
    if legacy_block:
        return UnicodeDammit(legacy_block.get_text().strip()).unicode_markup

    # Otherwise the lyrics are spread over several "Lyrics__Container..." divs.
    fragments = []
    for container in soup.find_all("div", class_=re.compile("^Lyrics__Container")):
        markup = str(container).replace("<br/>", "\n")  # keep the line breaks
        fragments.append(UnicodeDammit(re.sub("<.*?>", "", markup)).unicode_markup)

    # Decode entities on the joined *string*: calling unescape() on the list
    # itself returns it unchanged, leaving entities such as &amp; in the lyrics.
    return unescape("\n".join(fragments))

In [None]:
# Look up and download the lyrics of every track, storing them in the `lyrics`
# column; the progress counter is printed every 10 rows.
for done, (idx, entry) in enumerate(tracks_df.iterrows()):
    if done % 10 == 0:
        os.system('clear')
        print(done, len(tracks_df))
    lyrics_url = get_lyrics_url(entry.trackname, entry.artistname)
    tracks_df.at[idx, 'lyrics'] = get_lyrics(lyrics_url)

In [None]:
# Remove section markers such as [Intro], [Verse 1] or [Chorus].
# The negated character class matches one bracket pair at a time: it cannot run
# past a closing "]" into the next marker, and it has no nested quantifiers, so
# an unclosed "[" cannot trigger catastrophic backtracking.
SECTION_TAG_RE = re.compile(r"\[[^\[\]]*\]")
# Non-string entries (e.g. lyrics that were never fetched) are left untouched.
tracks_df['lyrics'] = tracks_df['lyrics'].apply(
    lambda x: SECTION_TAG_RE.sub("", x) if isinstance(x, str) else x
)

In [13]:
# Persist tracks_df to Drive as a pickle.
tracks_df.to_pickle(r'/content/drive/MyDrive/SocialGraphs/tracks_dataset_lyrics.pickle')