# Data preparation
The purpose of this notebook is to prepare the data and gather information from it to be used for analysis.


In [None]:
import json
import os
import time

import networkx as nx
import nltk
import numpy as np
import pandas as pd
import spotipy
import spotipy as sp
from nltk import word_tokenize
from spotipy.oauth2 import SpotifyClientCredentials

## Sentiment
The purpose of this section is to calculate the sentiment of every song and to assign an average sentiment to every artist.
To do this, the dictionary `artist_song_id`, which was built while constructing and filtering the network (`network_artists.ipynb`), is loaded.
The dictionary maps every artist id in the network to the song ids of that artist's remaining songs.

In [None]:
# Read the artist network from disk
G = nx.read_graphml("../../data/graphs/G_final.graphml")

# Read the artist id -> song ids dictionary from its JSON file
with open("../../data/other_files/artist_song_id.json") as artist_song_file:
    artist_song_id = json.loads(artist_song_file.read())

To calculate the sentiment the wordlist from LabMT is downloaded and loaded:

In [None]:
# Column labels for the LabMT word list (the file is read without a header row)
labmt_columns = [
    "word", "happiness_rank", "happiness_average", "happiness_standard_deviation",
    "twitter_rank", "google_rank", "nyt_rank", "lyrics_rank",
]

df = pd.read_csv('../../data/sentiment/Data_Set_S1.txt', sep="\t", header=None)
df.columns = labmt_columns
df.head()

Unnamed: 0,word,happiness_rank,happiness_average,happiness_standard_deviation,twitter_rank,google_rank,nyt_rank,lyrics_rank
0,laughter,1,8.5,0.9313,3600,--,--,1728
1,happiness,2,8.44,0.9723,1853,2458,--,1230
2,love,3,8.42,1.1082,25,317,328,23
3,happy,4,8.3,0.9949,65,1372,1313,375
4,laughed,5,8.26,1.1572,3334,3542,--,2332


The song lyrics, however, still need a bit of preprocessing.
Each song lyric is therefore tokenized, lowercased, stripped of punctuation and other non-alphanumeric tokens, and lemmatized.

In [None]:
# These downloads are needed to run the next code block:
# `text_preprocess` tokenizes with `word_tokenize` and lemmatizes with
# `WordNetLemmatizer` (presumably backed by the 'punkt' and 'wordnet' packages).
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
def text_preprocess(text_file):
    """Turn a raw text string into a list of cleaned, lemmatized tokens.

    The text is lowercased and tokenized, tokens that are not purely
    alphanumeric (punctuation and other signs) are dropped, and every
    remaining token is lemmatized with the WordNet lemmatizer.

    Parameters
    ----------
    text_file : str
        The raw text to process.

    Returns
    -------
    list of str
        The lemmatized tokens, in their original order.
    """
    lemmatizer = nltk.WordNetLemmatizer()
    tokens = word_tokenize(text_file.lower())
    return [lemmatizer.lemmatize(token) for token in tokens if token.isalnum()]

We can now calculate and store the sentiment for every song.

In [None]:
# Compute the average LabMT happiness score of every song in the network
song_sentiment = {}
for artist_id in G.nodes():
    for song in artist_song_id[artist_id]:
        lyrics_path = "../../data/genius/all_song_lyrics_cleaned/" + song + ".txt"
        # The context manager closes each lyrics file as soon as it has been read,
        # instead of leaving one open handle per song for the garbage collector.
        with open(lyrics_path, "r") as lyrics_file:
            text_file = lyrics_file.read()
        temp_word = text_preprocess(text_file)  # Preprocess the text file
        # Inner join: only tokens that occur in the LabMT word list are kept
        temp_df = pd.DataFrame(data={'word': temp_word}).merge(df, on='word')
        # Mean happiness of the matched tokens (NaN when no token matched)
        song_sentiment[song] = np.mean(temp_df.happiness_average)

## Save the dictionary
with open("../../data/other_files/song_sentiment.json", "w") as json_file:
    json.dump(song_sentiment, json_file)

Finally the average sentiment for every artist in the network is calculated and added as an attribute to the network and the network is saved as ``G_final_sentiment.graphml``.

In [None]:
# Attach the mean sentiment of an artist's songs to the artist's node
for artist_id in G.nodes():
    # Sentiment of every song belonging to this artist
    sentiment_temp = [song_sentiment[song] for song in artist_song_id[artist_id]]
    # Go through the public node view rather than the private `_node` dict
    G.nodes[artist_id]['sentiment'] = np.mean(sentiment_temp)

# Save the graph
nx.write_graphml(G, "../../data/graphs/G_final_sentiment.graphml", encoding='utf-8')

# Dataframes
To make analysis easier, two separate dataframes containing all information about the songs and the artists are constructed.
The year of each artist and song is also added, as this is available from the Spotify API.
A dataframe for the songs is first constructed.

**Dataframe for songs**


In [None]:
# Set the Spotify developer id and secret (read from environment variables).
# The client is built from `spotipy.Spotify` rather than `sp.Spotify`: this cell
# rebinds `sp` from the module alias to the client object, so constructing it
# through `sp` would raise an AttributeError when the cell is run a second time.
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(
    client_id=os.environ["SP_CLIENT_ANDREAS"],
    client_secret=os.environ["SP_SECRET_ANDREAS"],
))

In [None]:
## Retrieve the release date of every song from Spotify
song_year = {}      # song id -> release date string as returned by Spotify
failed_songs = []   # song ids whose lookup raised an error

for song_id in song_sentiment:
    try:
        track = sp.track(song_id)  # Call spotify API
        release_date = track['album']['release_date']  # Get the date
    except Exception:
        # Best effort: remember the song and carry on with the next one.
        # `Exception` (not a bare `except`) lets KeyboardInterrupt stop the loop.
        failed_songs.append(song_id)
    else:
        song_year[song_id] = release_date
    # Short pause between requests. It sits outside the `try` so that an error
    # raised by the pause itself can never be counted as a failed lookup.
    time.sleep(0.05)

print("Number of failed songs:", len(failed_songs))

# Save the file
with open("../../data/other_files/song_year.json", "w") as json_file:
    json.dump(song_year, json_file)

Number of failed songs: 13691


In [None]:
## Map every song id to the id of the artist it is listed under.
## If a song is listed under several artists, the artist visited last wins.
song_artist = {
    song: artist_id
    for artist_id in G.nodes()
    for song in artist_song_id[artist_id]
}

In [None]:
# Construct dataframe for song information.
# The rows are collected in a list and the frame is built once: `DataFrame.append`
# no longer exists in pandas 2.x and copied the whole frame on every call.
song_columns = ['song_id', 'artist', 'artist_id', 'genre', 'sentiment', 'year']
song_rows = []

for song_id, release_date in song_year.items():
    artist_id = song_artist[song_id]
    node_data = G.nodes[artist_id]  # public node view instead of the private `_node`
    song_rows.append({
        'song_id': song_id,
        'artist': node_data['name'],
        'artist_id': artist_id,
        'genre': node_data['genre'],
        'sentiment': song_sentiment[song_id],
        'year': release_date[0:4],  # first four characters of the release date string
    })

# `columns=` fixes the column order and keeps the columns even when there are no rows
df_song_info = pd.DataFrame(song_rows, columns=song_columns)

df_song_info.to_pickle("../../data/other_files/df_song_info.pkl")
df_song_info.head()

Unnamed: 0,song_id,artist,artist_id,genre,sentiment,year
0,5HQEmiV2lKnSO6qa2fsR7x,10cc,6i6WlGzQtXtz7GcC5H5st5,Rock,5.36713,1975
1,1LOZMYF5s8qhW7Rv4w2gun,10cc,6i6WlGzQtXtz7GcC5H5st5,Rock,5.734272,1978
2,6KEWtSOGKpIXGw6l1uJgsR,10cc,6i6WlGzQtXtz7GcC5H5st5,Rock,5.650507,1977
3,1QQgSUKCG8GakzMOwi4lFS,10cc,6i6WlGzQtXtz7GcC5H5st5,Rock,5.270608,1973
4,4E2gdBRKC12MJWFUOkH0UN,10cc,6i6WlGzQtXtz7GcC5H5st5,Rock,5.734272,2002


**Dataframe for artists**  
To construct the dataframe with information on artist level we need to assign a year to each artist.
However, as one artist can have songs which are from several different years we will calculate a mean for the release year for all songs and assign this year to the artist. 
Initially, a dictionary containing the average year for each artist id is therefore constructed.

In [None]:
# Calculate the year of each artist as the rounded mean release year of their songs
artist_year = {}
for artist_id, song_ids in artist_song_id.items():
    # Only songs with a known release date can contribute; songs whose Spotify
    # lookup failed are absent from `song_year` and would otherwise raise KeyError.
    years = [int(song_year[song_id][0:4]) for song_id in song_ids if song_id in song_year]
    if not years:
        # No dated song for this artist: the mean would be NaN and cannot be rounded
        continue
    artist_year[artist_id] = round(np.mean(years))


The number of people the artist has collaborated with is also added to the dataframe.
This number equals the number of edges connected to the artist in the network, i.e. the degree of the artist node.

In [None]:
# Get degree of every artist in the network
degrees = G.degree()

# Construct dataframe for artist information.
# The rows are collected in a list and the frame is built once: `DataFrame.append`
# no longer exists in pandas 2.x and copied the whole frame on every call.
artist_columns = ['artist', 'artist_id', 'genre', 'sentiment', 'year', 'degree']
artist_rows = []

for artist_id, year in artist_year.items():
    node_data = G.nodes[artist_id]  # public node view instead of the private `_node`
    artist_rows.append({
        'artist': node_data['name'],
        'artist_id': artist_id,
        'genre': node_data['genre'],
        'sentiment': node_data['sentiment'],
        'year': year,
        'degree': degrees[artist_id],
    })

# `columns=` fixes the column order and keeps the columns even when there are no rows
df_artist_info = pd.DataFrame(artist_rows, columns=artist_columns)

df_artist_info.to_pickle("../../data/other_files/df_artist_info.pkl")
df_artist_info.head()

Unnamed: 0,artist,artist_id,genre,sentiment,year,degree
0,10cc,6i6WlGzQtXtz7GcC5H5st5,Rock,5.551358,1981,2
1,Paul McCartney,4STHEaNw4mPZ2tzheohgXB,Rock,5.711872,1986,21
2,Rakim,3PyWEKLWI0vHPmoNrIX0QE,Hip-Hop,5.373357,2002,20
3,12th Planet,3V1h3kAdiVDBiwlY2i6dJz,Pop,5.593255,2017,6
4,Skrillex,5he5w2lnU9x7JFhnwcekXX,Pop,5.371399,2014,48


## Wordclouds

The purpose of this section is to prepare the text so it is ready to be analyzed by wordclouds.
To do this a document for each genre is created consisting of all the lyrics within the genre.


In [None]:
## Get the distinct genres present in the network.
## Node attributes are read through the public `G.nodes` view, not the private `_node` dict.
genres = {G.nodes[nod]['genre'] for nod in G.nodes()}

In [None]:
# Collect the lyric pieces of each genre in a list; they are joined once at the end,
# which avoids re-copying an ever-growing string for every artist.
genre_parts = {genre: [] for genre in genres}

# Retrieve the text
for artist_id in G.nodes():
    genre = G.nodes[artist_id]['genre']  # Get genre of node
    parts = genre_parts[genre]
    for song in artist_song_id[artist_id]:
        # The context manager closes each lyrics file as soon as it has been read
        with open("../../data/genius/all_song_lyrics_cleaned/" + song + ".txt", "r") as lyrics_file:
            parts.append(lyrics_file.read() + ' ')  # every song is followed by a space
    parts.append(' ')  # every artist's block of songs is followed by one more space

# Dictionary with the full text of each genre
genre_text = {genre: ''.join(parts) for genre, parts in genre_parts.items()}

The function `text_preprocess` is used again to clean the text and the dictionary `genre_text_clean` containing the tokenized words for each genre is saved.

In [None]:
# Run the combined lyrics of every genre through `text_preprocess`
genre_text_clean = {genre: text_preprocess(genre_text[genre]) for genre in genres}

## Write the genre -> token list dictionary to disk
with open("../../data/other_files/genre_text_clean.json", "w") as clean_text_file:
    json.dump(genre_text_clean, clean_text_file)