In [3]:
import os
import re

import pandas as pd
import requests
import spotipy
from bs4 import BeautifulSoup
from spotipy.oauth2 import SpotifyClientCredentials

In [4]:
# Spotify API credentials are read from the environment so that no secret is
# stored in the notebook. Export SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET
# before starting the kernel; a missing variable fails fast with a KeyError.
# NOTE(review): the id/secret pair that was previously hardcoded in this cell
# should be treated as leaked and rotated in the Spotify developer dashboard.
cid = os.environ['SPOTIPY_CLIENT_ID']
secret = os.environ['SPOTIPY_CLIENT_SECRET']

# `sp` is the shared client used by every Spotify helper defined below.
client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)


In [5]:
# This code is mainly adapted from the following article about Spotify and Genius,
# with some slight modifications and improvements:
# https://medium.com/swlh/how-to-leverage-spotify-api-genius-lyrics-for-data-science-tasks-in-python-c36cdfb55cf3

#insert the URI as a string into the function
def get_album_tracks(uri_info):
    """Fetch all tracks of a Spotify album as a DataFrame.

    Parameters
    ----------
    uri_info : str
        Album identifier handed unchanged to ``sp.album_tracks`` (the notebook
        passes ``spotify:album:<id>`` URIs).

    Returns
    -------
    pandas.DataFrame
        One row per track, in the order returned by the API, with the columns
        ``uri``, ``track``, ``duration_ms``, ``explicit`` and ``track_number``.
        The frame is empty (but still has these columns) when the album has
        no tracks.
    """
    columns = ['uri', 'track', 'duration_ms', 'explicit', 'track_number']

    rows = []
    page = sp.album_tracks(uri_info, limit=50, offset=0, market='US')
    while page:
        for item in page['items']:
            rows.append({
                'uri': item['uri'],
                'track': item['name'],
                'duration_ms': item['duration_ms'],
                'explicit': item['explicit'],
                'track_number': item['track_number'],
            })
        # A single request returns at most `limit` (50) tracks, so keep
        # following the paging link until the API reports no further page.
        page = sp.next(page) if page.get('next') else None

    return pd.DataFrame(rows, columns=columns)


#insert output dataframe from the get_album_tracks function
def get_track_info(df):
    """Look up Spotify audio features for every track in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``uri`` column, e.g. the output of ``get_album_tracks``.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``df['uri']`` (same order, default integer
        index) with the columns ``danceability``, ``energy``, ``key``,
        ``loudness``, ``speechiness``, ``acousticness``, ``instrumentalness``,
        ``liveness``, ``valence`` and ``tempo``. A track for which the API
        returns no features yields a row of missing values instead of
        raising, so the result stays row-aligned with ``df``.
    """
    feature_names = [
        'danceability', 'energy', 'key', 'loudness', 'speechiness',
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    ]
    # Request features in batches instead of one HTTP call per track.
    # NOTE(review): 100 is assumed to be the per-request id limit of the
    # audio-features endpoint - confirm against the Spotify Web API docs.
    batch_size = 100

    uris = list(df['uri'])
    rows = []
    for start in range(0, len(uris), batch_size):
        batch = uris[start:start + batch_size]
        features = list(sp.audio_features(tracks=batch) or [])
        # Defensive padding: always emit exactly one row per requested track
        # so that merge_frames' index-based merge pairs the right rows.
        features += [None] * (len(batch) - len(features))
        for track_features in features[:len(batch)]:
            track_features = track_features or {}
            rows.append({name: track_features.get(name) for name in feature_names})

    return pd.DataFrame(rows, columns=feature_names)


def merge_frames(df1, df2):
    """Join two frames column-wise, pairing rows that share an index label.

    Both frames are matched on their index; ``DataFrame.merge`` defaults to an
    inner join, so a label present in only one frame is dropped from the
    result.
    """
    return df1.merge(right=df2, left_index=True, right_index=True)


#function to scrape lyrics from genius
def scrape_lyrics(artistname, songname, timeout=10):
    """Download the lyrics of one song from its Genius page.

    The page URL is built as
    ``https://genius.com/<artist>-<song>-lyrics`` with spaces in both names
    replaced by hyphens.

    Parameters
    ----------
    artistname : str
        Artist name used in the URL.
    songname : str
        Song title used in the URL.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up (default 10).

    Returns
    -------
    str or None
        The text of every ``div`` whose class contains ``Lyrics__Container``,
        joined with newlines. ``None`` when the request fails, the response
        status is not OK, or the page contains no lyric text, so callers can
        drop all failures uniformly with ``dropna``.
    """
    artist_slug = artistname.replace(' ', '-')
    song_slug = songname.replace(' ', '-')
    url = 'https://genius.com/' + artist_slug + '-' + song_slug + '-' + 'lyrics'

    try:
        # Without a timeout a stalled connection would block the whole
        # scraping run indefinitely.
        page = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Treat network errors like a missing page instead of aborting.
        return None
    if not page.ok:
        return None

    html = BeautifulSoup(page.text, 'html.parser')
    # Turn <br> tags into real newlines so line breaks survive get_text().
    for br in html.find_all("br"):
        br.replace_with("\n")

    containers = html.find_all("div", class_=re.compile("Lyrics__Container"))
    # Separate containers with a newline so the last word of one block is not
    # glued to the first word of the next.
    text = "\n".join(container.get_text() for container in containers)
    return text if text.strip() else None


#function to attach lyrics onto data frame
#artist_name should be inserted as a string
def lyrics_onto_frame(df1, artist_name):
    """Add a ``lyrics`` column to ``df1`` by scraping each track's lyrics.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame with a ``track`` column holding song titles. It is modified in
        place.
    artist_name : str
        Artist name passed to ``scrape_lyrics`` for every track.

    Returns
    -------
    pandas.DataFrame
        The same ``df1`` object, now with a ``lyrics`` column whose values are
        whatever ``scrape_lyrics`` returned for each row (``None`` when no
        lyrics were found).
    """
    # Assign the whole column positionally: this works for any index and
    # always creates the column, even when the frame has no rows.
    df1['lyrics'] = [scrape_lyrics(artist_name, title) for title in df1['track']]
    return df1

In [20]:
# Hip-hop corpus: each (album URI, artist name) pair is fetched from Spotify;
# the artist name is carried along because lyrics_onto_frame needs it later.
hiphop_frames = [
    (get_album_tracks(album_uri), artist)
    for album_uri, artist in [
        ("spotify:album:5MS3MvWHJ3lOZPLiMxzOU6", "drake"),
        ("spotify:album:4dsMe3EBC8xURaxMhyorgf", "nle choppa"),
        ("spotify:album:0FYvMdfTfYJxnJnKs1wDb0", "lil baby"),
        ("spotify:album:3oDobVNZ3U9BpxOdokk0fL", "dababy"),
        ("spotify:album:26z5llzd194mcCZHADWd6k", "don toliver"),
        ("spotify:album:0OQjYkxlKHsQwYLJziIQrI", "superstar pride"),
        ("spotify:album:1GG6U2SSJPHO6XsFiBzxYv", "tyler the creator"),
        ("spotify:album:1ep4OEfNOhvcY85STfEtKy", "jack harlow"),
        ("spotify:album:2Sl8X3Uu2N4B2pVa9y5U29", "destroy lonely"),
        ("spotify:album:4tFqnaGkCBUjraLUw07Q67", "rae sremmurd"),
        ("spotify:album:25Uddgldy3slnChqKqHsIM", "yeat"),
        ("spotify:album:1v3fDc2dJvSEKOd3hOSjAH", "lil yachty"),
        ("spotify:album:5ujM8ZczJbszMvGoWeXmvW", "g herbo"),
    ]
]
# Attach the audio features to every album's track list.
hiphop_merged = [
    (merge_frames(tracks, get_track_info(tracks)), artist)
    for tracks, artist in hiphop_frames
]
# Scrape lyrics for every track of every album.
hiphop_lyrics = [lyrics_onto_frame(merged, artist) for merged, artist in hiphop_merged]

In [37]:
# Pop corpus: each (album URI, artist name) pair is fetched from Spotify;
# the artist name is carried along because lyrics_onto_frame needs it later.
pop_frames = [
    (get_album_tracks(album_uri), artist)
    for album_uri, artist in [
        ("spotify:album:2WFFcvzM0CgLaSq4MSkyZk", "ed sheeran"),
        ("spotify:album:1nrVofqDRs7cpWXJ49qTnP", "sza"),
        ("spotify:album:3U8n8LzBx2o9gYXvvNq4uH", "raye"),
        ("spotify:album:151w1FgRZfnKZA9FEcg9Z3", "taylor swift"),
        ("spotify:album:69AaAkdktFGnk9POmHENkT", "jvke"),
        ("spotify:album:68L5xVV9wydotfDXEik7eD", "lizzy mcalpine"),
        ("spotify:album:6FJxoadUE4JNVwWHghBwnb", "beyonce"),
        ("spotify:album:5kDmlA2g9Y1YCbNo2Ufxlz", "sabrina carpenter"),
        ("spotify:album:4LVa9bljQRvLYpWr8qyaXs", "meghan trainor"),
        ("spotify:album:3HHNR44YbP7XogMVwzbodx", "post malone"),
    ]
]
# Attach the audio features to every album's track list.
pop_merged = [
    (merge_frames(tracks, get_track_info(tracks)), artist)
    for tracks, artist in pop_frames
]
# Scrape lyrics for every track of every album.
pop_lyrics = [lyrics_onto_frame(merged, artist) for merged, artist in pop_merged]

In [None]:
# Stack the per-album frames, drop rows whose lyrics are missing, then
# persist the result as a pickle and show it.
hiphop_data = pd.concat(hiphop_lyrics).dropna(subset=['lyrics'])
hiphop_data.to_pickle('./hiphop_df')
print(hiphop_data)

In [None]:
# Stack the per-album frames, drop rows whose lyrics are missing, then
# persist the result as a pickle and show it.
pop_data = pd.concat(pop_lyrics).dropna(subset=['lyrics'])
pop_data.to_pickle('./pop_df')
print(pop_data)