In [None]:
!pip install langdetect

In [8]:
# imports
import pandas as pd
import os
import requests
import urllib
import re
from bs4 import BeautifulSoup, UnicodeDammit
from html import unescape
from typing import Optional

import numpy as np
import matplotlib.pyplot as plt

from langdetect import detect, DetectorFactory
import nltk
from nltk import FreqDist
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')

tokenizer = RegexpTokenizer(r'\w+')

stop_words = set(stopwords.words('english'))
wnl = nltk.WordNetLemmatizer() # The WordNet lemmatizer removes affixes if the resulting word is in its dictionary.

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# load artists dataset
artists_df = pd.read_pickle(r"/content/drive/MyDrive/SocialGraphs/Spotify_data/artists_dataset.pickle")

# load spotify dataset 
spotify_df = pd.read_pickle(r'/content/drive/MyDrive/SocialGraphs/spotify_dataset_processed.pickle')

In [None]:
# unroll the artist lists in order to associate each song with the corresponding artist
spotify_df_exploded = spotify_df.explode('artistname')
spotify_df_exploded = spotify_df_exploded.drop_duplicates(["tracks", "artistname"])

# associate each song to the corresponding artist
artists_df["tracks"] = ""
artists_df["top_tracks"] = ""

i = 0
for _, artist in artists_df.iterrows():
    if i % 100 == 0:
        print(i, "/", len(artists_df))
    i+=1
    temp_df = spotify_df_exploded[spotify_df_exploded.artistname == artist.artistname]
    artist.tracks = sorted(temp_df[['tracks', 'tracks_freq']].apply(tuple, axis=1), key=lambda t: t[1], reverse=True)
    # get top 5 tracks from each artist
    artist.top_tracks = artist.tracks[0:(5 if len(artist.tracks) > 5 else len(artist.tracks))]

artists_df.head()

Create a tracks dataset, where each tracks will have the associated lyrics:

In [None]:
tracks_set = set([track for tracks in artists_df.top_tracks for track in tracks])
data = {'track': list(tracks_set),
        'lyrics': [None] * len(tracks_set)}

# create tracks dataset
tracks_df = pd.DataFrame(data)
tracks_df[['track', 'trackfreq']] = pd.DataFrame(tracks_df['track'].tolist(), index=tracks_df.index)
tracks_df[['trackname', 'artistname']] = pd.DataFrame(tracks_df['track'].tolist(), index=tracks_df.index)


# filter tracks which have a frequency less that 50 in the spotify datase
tracks_df = tracks_df[tracks_df.trackfreq > 50]
tracks_df.head()

Unnamed: 0,track,lyrics,trackfreq,trackname,artistname
2,"(Me Enamora, (juanes))",,58,Me Enamora,(juanes)
4,"(Bump n' Grind, (r. kelly))",,201,Bump n' Grind,(r. kelly)
5,"(Ecstasy, (atb))",,76,Ecstasy,(atb)
10,"(Pro Nails (Rusko Remix), (kid sister))",,53,Pro Nails (Rusko Remix),(kid sister)
13,"(Money On My Mind, (sam smith))",,573,Money On My Mind,(sam smith)


In [None]:
# assign each track the genre
def get_genre(artists):
    genres = []    
    for g in list(artists_df[artists_df.artistname.isin(artists)].genres):
        if g is not None:
            genres += g
    return genres

tracks_df["genre"] = list(map(get_genre, tracks_df.artistname))

In [None]:
tracks_df.to_pickle(r'/content/drive/MyDrive/SocialGraphs/tracks_dataset.pickle_lyrics')

# Get lyrics

In [3]:
tracks_df =  pd.read_pickle(r'/content/drive/MyDrive/SocialGraphs/Spotify_data/tracks_dataset_lyrics.pickle')

In [None]:
tracks_df.head()

Unnamed: 0,track,lyrics,trackfreq,trackname,artistname,genre
3,"[Good People, [jack johnson]]","[Verse 1]\nOh you win, it's your show now\nSo ...",204,Good People,[jack johnson],[neo mellow]
4,"[Get Lucky - Radio Edit, [daft punk]]",,1462,Get Lucky - Radio Edit,[daft punk],"[electro, filter house]"
7,"[Habits (Stay High), [tove lo]]","[Intro]\nOh-oh, oh-oh, oh-oh\n\n[Verse 1]\nI e...",419,Habits (Stay High),[tove lo],"[dance pop, electropop, metropopolis, pop, swe..."
8,"[The Time (Dirty Bit), [the black eyed peas]]",,392,The Time (Dirty Bit),[the black eyed peas],[]
11,"[Killing Me Softly With His Song, [roberta fla...",[Chorus]\nStrumming my pain with his fingers\n...,186,Killing Me Softly With His Song,[roberta flack],"[adult standards, classic soul, disco, funk, m..."


In [None]:
client_access_token = "xiw8qk2KQVbosuUax1XxElfo4HBwSuRMqfN6QfC5wOruHg9JptmG70Zn-8-sI5E-"

def get_lyrics_url(trackname, artists):
    artistname = ", ".join(artists)
    search_term = trackname + " " + artistname
    genius_search_url = f"http://api.genius.com/search?q={urllib.parse.quote(search_term)}&access_token={client_access_token}"

    response = requests.get(genius_search_url)
    json_data = response.json()
    lyrics_link = None
    for item in json_data["response"]["hits"]:
    # check if it is a match with the artist name
        if any(artist in item['result']["artist_names"].lower() for artist in artists):
            # check if it is a match with the track's title
            if item['result']['title'].lower() == trackname.lower():
                # Print the artist and title of each result
                lyrics_link = item['result']['url']
    return lyrics_link

In [None]:
def get_lyrics(url_data) -> Optional[str]:
    if url_data is None:
        return ""
    page = requests.get(url_data)

    html = BeautifulSoup(page.text, "html.parser")
    lyrics_path = html.find("div", class_="lyrics")  # finding div on Genius containing the lyrics
    if lyrics_path:
        lyrics = UnicodeDammit(lyrics_path.get_text().strip()).unicode_markup
    else:
        lyrics_path = html.find_all("div", class_=re.compile("^Lyrics__Container"))
        lyrics_data = []
        for x in lyrics_path:
            lyrics_data.append(UnicodeDammit(re.sub("<.*?>", "", str(x).replace("<br/>", "\n"))).unicode_markup)

        lyrics = "\n".join(unescape(lyrics_data))  # also convert escaped characters to symbols
    return lyrics

In [None]:
i = 0
for idx, entry in tracks_df.iterrows():
    if i % 10 == 0:
        os.system('clear')
        print(i , len(tracks_df))
    i += 1
    tracks_df.at[idx,'lyrics'] = get_lyrics(get_lyrics_url(entry.trackname, entry.artistname))

In [4]:
# drop tracks without lyrics
tracks_df = tracks_df.dropna()

In [12]:
# drop tracks without lyrics in english
DetectorFactory.seed = 0
tracks_df = tracks_df[list(map(lambda l: detect(l) == 'en', tracks_df.lyrics))]

In [None]:
# remove construction like [Intro], [Verse 1], [Chorus]
tracks_df['lyrics'] = tracks_df['lyrics'].apply(lambda x: re.sub(r"\[(\w*|\W*)*\]", "", x))

In [15]:
tracks_df.shape

(3137, 8)

In [16]:
tracks_df.to_pickle(r'/content/drive/MyDrive/SocialGraphs/Spotify_data/tracks_dataset_lyrics.pickle')

Calculate sentiment score:

In [None]:
# load labMT dataset
data_labMT = pd.read_csv(r'./labMT.txt', sep="\t")
data_labMT.replace("--", None, inplace=True)
data_labMT.head()

Unnamed: 0,word,happiness_rank,happiness_average,happiness_standard_deviation,twitter_rank,google_rank,nyt_rank,lyrics_rank
0,laughter,1,8.5,0.9313,3600,--,--,1728
1,happiness,2,8.44,0.9723,1853,2458,--,1230
2,love,3,8.42,1.1082,25,317,328,23
3,happy,4,8.3,0.9949,65,1372,1313,375
4,laughed,5,8.26,1.1572,3334,3542,1313,2332


In [None]:
def get_tokens(document, stopwords=False, lemmatize=False):
    # 1. and 2. Tokenize the document into individual strings
    # and Remove all punctuation from your list of tokens
    tokens = tokenizer.tokenize(document)
    
    # 3. set evarything to lower case
    tokens = [w.lower() for w in tokens]
    
    if stopwords is True:
        # remove stop words
        tokens = [w for w in tokens if w not in stop_words]
    if lemmatize is True:
        # lemmatize the words
        tokens = [wnl.lemmatize(w) for w in tokens]
    
    return tokens

Tokenize the lyrics:

In [None]:
# remove construction like [Intro], [Verse 1], [Chorus]
tracks_df['lyrics'] = tracks_df['lyrics'].apply(lambda x: re.sub(r"\[([^]]+)\]", "", x))

In [None]:
tracks_df['tokens'] = tracks_df['lyrics'].apply(get_tokens)

Compute sentiment for each song:

In [None]:
def sentiment(tokens):
    """
        returns the average sentiment value of the tokens
    """
    sentiment = data_labMT[data_labMT.word.isin(tokens)].happiness_average.mean()
    return sentiment

In [None]:
tracks_df['sentiment'] = tracks_df["tokens"].apply(sentiment)