In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
%matplotlib inline

In [None]:
# Load the lyrics dataset; the CSV uses ';' as its field separator.
df = pd.read_csv('data/drexler_complete_lyrics.csv', sep=';')
# Preview ten random songs. The fixed seed keeps the preview identical
# across "Restart & Run All" executions.
df.sample(10, random_state=42)

### Let's start with wordcount
##### Following: http://sigdelta.com/blog/text-analysis-in-pandas/

In [None]:
# Tokenise every song: trim surrounding whitespace, then split on runs of
# non-word characters or underscores. The pattern is a raw string so that
# `\W` reaches the regex engine untouched instead of being parsed as an
# (invalid) Python string escape, which newer interpreters warn about.
df['words'] = df.lyrics.str.strip().str.split(r'[\W_]+')

In [None]:
# Flatten the per-song token lists into one long table with a single
# `word` column: one row per token occurrence, in song order.
rows = [token for song_tokens in df['words'] for token in song_tokens]

words = pd.DataFrame(rows, columns=['word'])
words.head()

In [None]:
# Drop zero-length tokens and fold everything to lower case.
# `.assign` builds a new frame instead of writing a column into the
# filtered slice, which is what triggers pandas' SettingWithCopyWarning.
words = (
    words[words.word.str.len() > 0]
    .assign(word=lambda frame: frame.word.str.lower())
)
words.head()

In [None]:
# Number of occurrences of each word across the whole corpus (all songs
# pooled together), indexed by word.
# The column is named directly through `to_frame`: on pandas >= 2.0
# `value_counts()` no longer names its result after the source column, so
# renaming a 'word' column afterwards would silently do nothing there.
counts = words.word.value_counts().to_frame(name='n_w')
counts

## So, from this we learn two important facts
 - The most used words in Jorge Drexler's songs are 'de', 'que', 'la', 'el' and 'y'.
 - We need to exclude stopwords.

In [None]:
# Spanish stopword list from NLTK.
# The corpus reader is imported under an alias so that the name `stopwords`
# only ever refers to one kind of object: the plain list of words that the
# following cells iterate over.
from nltk.corpus import stopwords as nltk_stopwords
stopwords = nltk_stopwords.words('spanish')

In [None]:
# Remove every stopword from the frequency table in one vectorised pass
# (the previous loop rebuilt the whole frame once per stopword).
# Stopwords that never occur in the lyrics are simply not matched.
counts = counts[~counts.index.isin(stopwords)]

In [None]:
# Show the first 20 rows of the word-frequency table.
counts.head(20)

### WTF is that 'na' topping the list?

In [None]:
# Titles of every song whose token list contains the exact token 'na'.
songs = [
    song_title
    for song_title, song_tokens in zip(df['title'], df['words'])
    if 'na' in song_tokens
]

songs

### Now it's pretty obvious, just listen to the song.

In [None]:
from IPython.display import HTML

# Markup for an embedded YouTube player; rendered inline as the cell output.
youtube_embed = '<iframe width="560" height="315" src="https://www.youtube.com/embed/YFBe7hLUPKo?rel=0&amp;controls=0&amp;showinfo=0" frameborder="0" allowfullscreen></iframe>'
HTML(youtube_embed)


In [None]:
# Discard the filler syllable 'na' from the frequency table.
# errors='ignore' makes the cell safe to re-run: once 'na' is gone a plain
# drop would raise KeyError.
counts = counts.drop('na', errors='ignore')

In [None]:
# Show the first 20 rows of the word-frequency table again, now without 'na'.
counts.head(20)

In [None]:
# Tally how often each of the most frequent words appears per album and per
# year. Songs filed under the pseudo-album 'Others' are skipped entirely
# (they count towards neither the album nor the year totals).
words_by_album = {}
words_by_year = {}
top_five_words = counts.head(5).index
tracked_words = set(top_five_words)

for album, year, song_tokens in zip(df['album'], df['year'], df['words']):
    if album == 'Others':
        continue
    # Every album/year starts with a zero for each tracked word, in the same
    # order as `top_five_words`, so downstream plots get a stable bar order.
    album_tally = words_by_album.setdefault(album, dict.fromkeys(top_five_words, 0))
    year_tally = words_by_year.setdefault(year, dict.fromkeys(top_five_words, 0))
    for a_word in song_tokens:
        # The frequency table was built from lower-cased tokens, while the
        # per-song token lists keep their original case; normalise before
        # comparing so capitalised occurrences are counted too.
        token = a_word.lower()
        if token in tracked_words:
            album_tally[token] += 1
            year_tally[token] += 1

In [None]:
# Display the five index labels taken from the head of `counts`.
top_five_words

In [None]:
# Dimensions of the subplot grid used for the per-album bar charts.
ROWS, COLS = 4, 3

# Highest count any tracked word reaches in any single album (never below 0).
max_appearance = max(
    [0] + [
        occurrences
        for album_tally in words_by_album.values()
        for occurrences in album_tally.values()
    ]
)

# Album titles and, at the same position in the second list, their years.
albums_ordered_by_year = [
    'La Luz Que Sabe Robar',
    'Radar',
    'Vaivén',
    'Llueve',
    'Frontera',
    'Sea',
    'Eco',
    '12 Segundos de Oscuridad',
    'Amar la trama',
    'Bailar en la cueva',
    'Salvavidas de hielo',
]
years_for_albums = [1992, 1994, 1996, 1997, 1999, 2001, 2004, 2006, 2010, 2014, 2017]

# Plot colour associated with each tracked word.
colormap = dict([
    ('mar', 'xkcd:royal blue'),
    ('tiempo', 'xkcd:mustard'),
    ('luna', 'grey'),
    ('noche', 'black'),
    ('corazón', 'xkcd:crimson'),
])

def autolabel(rects, x, y):
    """
    Write each bar's height, as an integer, just above the bar.

    rects -- iterable of bar objects exposing get_height/get_x/get_width
    x, y  -- position of the target subplot inside the global `axes` grid
             (`axes` is looked up when the function is called)

    Labels are centred horizontally on the bar and anchored by their
    bottom edge at 1.05 times the bar height.
    """
    for bar in rects:
        panel = axes[x, y]
        bar_height = bar.get_height()
        label_x = bar.get_x() + bar.get_width()/2.
        label_text = '%d' % int(bar_height)
        panel.text(label_x, 1.05*bar_height, label_text,
                   ha='center', va='bottom')


In [None]:
# One bar chart per album, laid out row by row on a ROWS x COLS grid.
fig, axes = plt.subplots(nrows=ROWS, ncols=COLS, figsize=(20,30))

# Pair titles with years without consuming the source lists: popping from
# them left both lists empty, so the cell could only be run once.
albums_with_years = list(zip(albums_ordered_by_year, years_for_albums))

for slot in range(ROWS * COLS):
    row, col = divmod(slot, COLS)
    ax = axes[row, col]
    if slot >= len(albums_with_years):
        # More panels than albums: hide the leftover axes instead of
        # showing an empty frame.
        ax.set_visible(False)
        continue
    an_album_title, an_album_year = albums_with_years[slot]
    an_album = words_by_album[an_album_title]
    bar_words = list(an_album.keys())
    rects = ax.bar(range(len(an_album)), list(an_album.values()), align='center')
    ax.set_xticks(range(len(an_album)))
    ax.set_xticklabels(bar_words)
    ax.yaxis.set_major_locator(MaxNLocator(integer=True))
    # Same y range on every panel so bar heights are comparable across albums.
    ax.set_ylim([0,max_appearance + 3])
    ax.set_title("%s - %d \n Term frequency per album" % (an_album_title, an_album_year))
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    # Colour each bar by the word it represents (via `colormap`) rather than
    # by its position, so colours stay correct whatever the word order is.
    # Words without a colormap entry keep matplotlib's default colour.
    for rect, bar_word in zip(rects, bar_words):
        if bar_word in colormap:
            rect.set_color(colormap[bar_word])
    autolabel(rects, row, col)

plt.subplots_adjust(hspace = 0.3)
plt.show()

## Interesting insights:
- He loved talking about the sea ('mar') in the '90s, but grew bored of it as time went by.
- After a stretch of albums in which he rarely revisits his favorite terms, he comes back to monothematic writing.
- Bailar en la cueva is super focused on the moon ('luna') and the night ('noche').
- Salvavidas de hielo is about the passage of time ('tiempo'), a theme he picks up again from Frontera, and, once more, the night ('noche').

In [None]:
# Turn the {year: {word: count}} mapping into a table with one row per year
# and one column per word.
# - The isinstance guard makes the cell safe to re-run: once the name holds a
#   DataFrame, converting again would transpose it back.
# - sort_index() puts the years in chronological order; the dict keeps them
#   in first-seen order, which would make a line plot over the index zig-zag.
if isinstance(words_by_year, dict):
    words_by_year = pd.DataFrame(words_by_year).transpose().sort_index()

In [None]:
# Every distinct count that occurs anywhere in the per-year table,
# de-duplicated across columns.
values_for_y_ticks = list({
    tick_value
    for column_name in words_by_year.columns
    for tick_value in words_by_year[column_name].unique()
})

In [None]:
# One line per tracked word: its count for every year in the table.
fig, ax = plt.subplots(figsize=(30,15))
for top_word in list(words_by_year.columns):
    ax.plot(
        words_by_year.index,
        words_by_year[top_word],
        '-',
        color=colormap[top_word],
        linewidth=5,
        marker='o',
        markersize=12,
        # Without a label the legend has no entries, and the legend styling
        # loops below would have nothing to act on.
        label=top_word
    )
leg = ax.legend()
# get the individual lines inside legend and set line width
for line in leg.get_lines():
    line.set_linewidth(5)
# get label texts inside legend and set font size
for text in leg.get_texts():
    text.set_fontsize(20)
# One x tick per year and one y tick per count value present in the data.
plt.xticks(words_by_year.index)
plt.yticks(values_for_y_ticks)
ax.spines["top"].set_visible(False)
plt.tick_params(axis='both', labelright=True, labelsize=20)
ax.set_title("Term frequency per year", fontdict={'fontsize': 25})
plt.show()

## Let's polish this

Future lines of work:

- http://www.everydayanalytics.ca/2013/06/radiohead-lyrics-data-visualization-and-content-analysis.html
- https://www.promptcloud.com/blog/data-visualization-text-mining-taylor-swift-song-lyrics
- https://medium.com/@krisshaffer/exploring-musical-data-with-r-an-introduction-to-computational-music-analysis-2216d061fed6 (idea: get chords through the Spotify API?)
- 

Expanding dataset?

In [82]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from IPython.display import clear_output

In [None]:
artist_name = 'Jorge Drexler'
# API credentials must not live in the notebook. Called without arguments,
# SpotifyClientCredentials reads them from the SPOTIPY_CLIENT_ID and
# SPOTIPY_CLIENT_SECRET environment variables; set both before starting
# the kernel.
# NOTE(review): the client id/secret that used to be hard-coded in this cell
# have been exposed and should be revoked in the Spotify developer dashboard.
client_credentials_manager = SpotifyClientCredentials()
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [None]:
# Take the first artist returned by the search as the artist.
artist_hits = sp.search(q='artist:' + artist_name, type='artist')['artists']['items']
artist_id = artist_hits[0]['id']

# Map album name -> album id; if two entries share a name, the later one wins.
# NOTE(review): only the 'items' of a single response are read — confirm
# whether the artist has more albums than one page returns.
album_items = sp.artist_albums(artist_id, album_type='album')['items']
albums_ids = {album['name']: album['id'] for album in album_items}

In [None]:
# Exclude these two releases from the analysis.
# pop(..., None) instead of `del` keeps the cell re-runnable and tolerates a
# release that the API did not return.
for excluded_album in ('Cara B', 'La Edad Del Cielo'):
    albums_ids.pop(excluded_album, None)

In [None]:
# Display the remaining album-name -> album-id mapping.
albums_ids

In [77]:
# Fetch the track listing of every album and stack them into one table,
# tagging each row with the album it came from.
# The per-album frames are collected first and concatenated once; growing a
# DataFrame with concat inside the loop re-copies all earlier rows each time.
album_frames = [
    pd.DataFrame(sp.album_tracks(an_album_id)['items']).assign(album=an_album_title)
    for an_album_title, an_album_id in albums_ids.items()
]
# Row labels are intentionally left as they come from each album's frame
# (they repeat across albums), matching the previous behaviour.
# pd.concat raises on an empty list, so fall back to an empty frame.
spotify_raw_df = pd.concat(album_frames) if album_frames else pd.DataFrame()

In [80]:
# Drop the track fields that are not used in the analysis.
# A single drop with errors='ignore' keeps the cell re-runnable: deleting a
# column that is already gone would raise KeyError.
UNUSED_TRACK_COLUMNS = [
    'artists',
    'available_markets',
    'explicit',
    'external_urls',
    'href',
    'is_local',
    'preview_url',
    'type',
]
spotify_raw_df = spotify_raw_df.drop(UNUSED_TRACK_COLUMNS, axis=1, errors='ignore')

In [81]:
# The table was assembled by concatenating one frame per album, so its row
# labels repeat. reset_index() replaces them with a fresh 0..n-1 range and
# keeps the old labels in a new `index` column.
# NOTE(review): running this cell a second time will not give the same
# result, because an `index` column already exists by then.
spotify_raw_df = spotify_raw_df.reset_index()

In [83]:
# Audio-feature fields copied verbatim into same-named columns, in this order.
AUDIO_FEATURE_KEYS = [
    'acousticness',
    'danceability',
    'energy',
    'key',
    'tempo',
    'time_signature',
    'valence',
    'mode',
]

# Enrich every track with its duration (from the audio analysis) and its
# audio features. This makes two API requests per track.
for index, row in spotify_raw_df.iterrows():
    # Poor man's progress indicator: print the row number, then clear it at
    # the end of the iteration so only the latest one stays visible.
    print(index)
    audio_analysis = sp.audio_analysis(row['id'])
    audio_features = sp.audio_features(row['id'])[0]
    spotify_raw_df.at[index, 'duration_s'] = audio_analysis['track']['duration']
    # The features list holds None for a track Spotify has no features for;
    # leave that row's feature columns unset instead of crashing mid-loop.
    if audio_features is not None:
        for feature_key in AUDIO_FEATURE_KEYS:
            spotify_raw_df.at[index, feature_key] = audio_features[feature_key]
    clear_output(wait=True)

125


In [84]:
# Display the track table with the duration and audio-feature columns added.
spotify_raw_df

Unnamed: 0,index,disc_number,duration_ms,id,name,track_number,uri,album,duration_s,acousticness,danceability,energy,key,tempo,time_signature,valence,mode
0,0,1,231653,7pBoi7yWCPzn3UjeMsGKg6,Movimiento,1,spotify:track:7pBoi7yWCPzn3UjeMsGKg6,Salvavidas de hielo,231.65333,0.9000,0.784,0.457000,2.0,125.021,4.0,0.5460,1.0
1,1,1,184963,7zSBOSD4w5LGv1bIblkVFp,Telefonía,2,spotify:track:7zSBOSD4w5LGv1bIblkVFp,Salvavidas de hielo,184.96354,0.8290,0.915,0.592000,6.0,126.060,4.0,0.8410,0.0
2,2,1,207986,2gjB9GgSZFlj0YwItEACpQ,Silencio,3,spotify:track:2gjB9GgSZFlj0YwItEACpQ,Salvavidas de hielo,207.98667,0.2630,0.887,0.568000,4.0,127.923,4.0,0.1270,0.0
3,3,1,182453,3cApCJqmVwE23cxYUADZpx,Pongamos que hablo de Martínez,4,spotify:track:3cApCJqmVwE23cxYUADZpx,Salvavidas de hielo,182.45333,0.8930,0.849,0.458000,9.0,119.950,4.0,0.4610,1.0
4,4,1,261066,0R1P6xtRxAQRS0UMo4wkgg,Estalactitas,5,spotify:track:0R1P6xtRxAQRS0UMo4wkgg,Salvavidas de hielo,261.06667,0.7970,0.772,0.630000,0.0,136.019,4.0,0.9090,1.0
5,5,1,183880,41gpbNDcsSh4r8YgVMbQqh,Asilo (feat. Mon Laferte),6,spotify:track:41gpbNDcsSh4r8YgVMbQqh,Salvavidas de hielo,183.88000,0.8080,0.502,0.239000,0.0,127.484,5.0,0.3670,1.0
6,6,1,218040,2mJYTwrs3lMTJ0Z2TXnbZZ,Abracadabras (feat. Julieta Venegas),7,spotify:track:2mJYTwrs3lMTJ0Z2TXnbZZ,Salvavidas de hielo,218.04000,0.8160,0.763,0.532000,7.0,96.116,4.0,0.4200,1.0
7,7,1,292293,7lc9tI9eS9ULnOdKTsBQ92,Mandato,8,spotify:track:7lc9tI9eS9ULnOdKTsBQ92,Salvavidas de hielo,292.29333,0.5740,0.776,0.607000,0.0,100.967,4.0,0.9370,1.0
8,8,1,217413,2RiFT9ZJhR8jcBaz5Si7DC,Despedir a los glaciares,9,spotify:track:2RiFT9ZJhR8jcBaz5Si7DC,Salvavidas de hielo,217.41333,0.5570,0.428,0.576000,7.0,112.595,4.0,0.1430,1.0
9,9,1,218520,5787WaGd5DQvL14A6SJ8cG,Quimera,10,spotify:track:5787WaGd5DQvL14A6SJ8cG,Salvavidas de hielo,218.52000,0.8940,0.831,0.486000,11.0,87.023,4.0,0.4270,0.0
