

## Libraries and Set File Directory


In [62]:
import pandas as pd
import numpy as np
from sklearn.neighbors import BallTree
import os
import sys
from IPython.display import display, HTML
from IPython.display import clear_output
import matplotlib.pyplot as plt
from ipywidgets.widgets import Button, Label, IntSlider, Output, IntRangeSlider, Text
from ipywidgets.widgets import Layout, HBox, VBox


# Spotify API client handle. It starts out as None and is only instantiated by
# the login cell further down while it is still None.
api = None

In [63]:
# Libraries for interactive data exploration

import ipywidgets as widgets
from ipywidgets import interact, interact_manual
import random
import seaborn as sns; sns.set()
##import cufflinks as cf
##from plotly.offline import plot

In [64]:
# Change this directory to the location of the shared Spotify Data-X Project on Google Drive
#sharedDirectory = '/content/drive/My Drive/Colab Notebooks/Spotify Data-X Project'
sharedDirectory = '/Users/Alessandro/GitHub Repos/Spotify-Data-X-Project'

# Work from the project root so that relative paths such as "Data/..." resolve.
os.chdir(sharedDirectory)

# Make the project's Utils folder importable. The membership check keeps
# sys.path free of duplicate entries when this cell is executed more than once.
utilsDirectory = sharedDirectory + "/Utils/"
if utilsDirectory not in sys.path:
    sys.path.append(utilsDirectory)
import searchAPI

# Interactive Data Exploration/Personalization

### Data Preparation

In [65]:
# Sample 100 rows of the liked-tracks file for faster drawing.
# skiprows takes 0-based line numbers, so range(1, n) covers lines 1..n-1 and the
# header (line 0) is always kept.
# NOTE(review): n is hard-coded and is presumably the row count of
# Ellek-Liked-Tracks.csv - confirm it against the actual file length.
n = 2743
sample_size = 100
skip_values = sorted(random.sample(range(1,n), n-sample_size))

# Build the data paths from sharedDirectory so that changing that single setting
# is enough to relocate the project.
dataDirectory = os.path.join(sharedDirectory, 'Data')
tracks = pd.read_csv(os.path.join(dataDirectory, 'Ellek-Liked-Tracks.csv'), skiprows = skip_values)
catalog = pd.read_csv(os.path.join(dataDirectory, 'spotifyCatalog.csv'))

In [66]:
# Remove the 'Unnamed: 0' column (presumably an index column saved into the CSV).
# errors='ignore' keeps the cell re-runnable: once the column is gone, a second
# run is a no-op instead of raising KeyError.
tracks = tracks.drop(columns='Unnamed: 0', errors='ignore')

In [67]:
# Replace the raw export column names with shorter, more readable labels.
tracks.rename(
    mapper={
        # artist / track identity
        'artist.id': 'Artist_ID',
        'artist.name': 'Artist_Name',
        'track.name': 'Track_Name',
        'addedAt': 'Added_Date',
        'artist.genres': 'Genres',
        # popularity and content flag
        'artist.popularity': 'Artist_Popularity',
        'track.popularity': 'Track_Popularity',
        'track.explicit': 'Dirty',
        # structural track attributes
        'trackAttributes.duration_ms': 'Duration(ms)',
        'trackAttributes.tempo': 'Tempo',
        'trackAttributes.mode': 'Mode',
        'trackAttributes.key': 'Key',
        'trackAttributes.time_signature': 'Time_Signature',
        'trackAttributes.loudness': 'Loudness',
        # audio feature scores
        'trackAttributes.acousticness': 'Acousticness',
        'trackAttributes.danceability': 'Danceability',
        'trackAttributes.energy': 'Energy',
        'trackAttributes.instrumentalness': 'Instrumentalness',
        'trackAttributes.liveness': 'Liveness',
        'trackAttributes.speechiness': 'Speechiness',
    },
    axis='columns',
    inplace=True,
)

In [68]:
# Preview the first rows of the sampled tracks with their renamed columns.
tracks.head()

Unnamed: 0,track.id,Artist_ID,Artist_Name,Track_Name,Added_Date,Track_Popularity,Artist_Popularity,Genres,Dirty,Duration(ms),...,Mode,Key,Time_Signature,Loudness,Acousticness,Danceability,Energy,Instrumentalness,Liveness,Speechiness
0,0gRiVUZhN7AErXjiJnCqPX,0qEcf3SFlpRcb3lK3f2GZI,Grand Funk Railroad,Walk Like A Man (You Can Call Me Your Man) - R...,2019-11-21T22:20:33Z,23,60,"country rock,roots rock,symphonic rock,hard ro...",False,245200,...,1,2,4,-4.153,0.000927,0.554,0.901,0.0555,0.216,0.0631
1,2JZHwLYnvwGqRUwHi2240w,09s6bLEw45wioK9ytf3nsA,Westerman,Kathy's Song,2019-11-05T05:18:48Z,30,51,uk alternative pop,False,255467,...,1,8,4,-9.914,0.296,0.67,0.402,0.511,0.114,0.054
2,7FwBtcecmlpc1sLySPXeGE,3eqjTLE0HfPfh78zjh6TqT,Bruce Springsteen,Dancing In the Dark,2019-10-31T01:38:55Z,76,81,"rock,mellow gold,permanent wave,classic rock,h...",False,241307,...,0,1,4,-5.64,0.0115,0.527,0.942,0.0,0.188,0.0366
3,5JDFvZQCRDct3gswcJzOCH,6IFXsrXBpwbIqtOUOiAa3p,Rick Springfield,Kristina,2019-08-01T21:18:30Z,22,65,"pop rock,new wave pop,yacht rock,dance rock,al...",False,182827,...,1,4,4,-10.639,0.129,0.444,0.671,1e-06,0.171,0.0503
4,4LBJyUs3B4ovaIImY5w5gW,3kbBWco9PZ5eSQsNScwG6U,Built To Spill,Velvet Waltz,2019-07-21T16:56:04Z,30,57,"noise pop,stomp and holler,anti-folk,freak fol...",False,513427,...,0,7,4,-7.028,2.4e-05,0.19,0.791,0.827,0.11,0.0284


In [69]:
# Store the label-like and discrete fields as pandas categoricals; the count-plot
# cell builds its dropdown from the columns of dtype 'category'.
tracks = tracks.astype(
    {
        column: 'category'
        for column in (
            'Track_Popularity',
            'Artist_Popularity',
            'Artist_Name',
            'Track_Name',
            'Genres',
            'Dirty',
            'Mode',
            'Time_Signature',
            'Key',
        )
    },
    copy=False,
)

In [70]:
# Summarise column dtypes and non-null counts of the tracks frame.
tracks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 21 columns):
track.id             100 non-null object
Artist_ID            100 non-null object
Artist_Name          100 non-null category
Track_Name           100 non-null category
Added_Date           100 non-null object
Track_Popularity     100 non-null category
Artist_Popularity    100 non-null category
Genres               99 non-null category
Dirty                100 non-null category
Duration(ms)         100 non-null int64
Tempo                100 non-null float64
Mode                 100 non-null category
Key                  100 non-null category
Time_Signature       100 non-null category
Loudness             100 non-null float64
Acousticness         100 non-null float64
Danceability         100 non-null float64
Energy               100 non-null float64
Instrumentalness     100 non-null float64
Liveness             100 non-null float64
Speechiness          100 non-null float64
dtypes: cat

In [71]:
## Time Series Exploration

## Parse the Added_Date strings into datetime values
tracks['Added_Date'] = pd.to_datetime(tracks['Added_Date'])

## Derive hour, day of month, month, year, weekday and day-of-year columns
## from Added_Date and attach them to the tracks dataframe
tracks = tracks.assign(**{
    column: getattr(tracks['Added_Date'].dt, field)
    for column, field in (
        ('Hour', 'hour'),
        ('Day', 'day'),
        ('Month', 'month'),
        ('Year', 'year'),
        ('Day of Week', 'dayofweek'),
        ('Day of Year', 'dayofyear'),
    )
})

## Show newly added time series columns
tracks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 27 columns):
track.id             100 non-null object
Artist_ID            100 non-null object
Artist_Name          100 non-null category
Track_Name           100 non-null category
Added_Date           100 non-null datetime64[ns, UTC]
Track_Popularity     100 non-null category
Artist_Popularity    100 non-null category
Genres               99 non-null category
Dirty                100 non-null category
Duration(ms)         100 non-null int64
Tempo                100 non-null float64
Mode                 100 non-null category
Key                  100 non-null category
Time_Signature       100 non-null category
Loudness             100 non-null float64
Acousticness         100 non-null float64
Danceability         100 non-null float64
Energy               100 non-null float64
Instrumentalness     100 non-null float64
Liveness             100 non-null float64
Speechiness          100 non-null float6

### Interactive Plotting/Visualization

In [72]:
## Interactive count plot for one categorical variable (with Seaborn)

@interact
def count_plot(x=list(tracks.select_dtypes('category').columns),
                ):
    """Draw a count plot of one categorical column of ``tracks``.

    x: name of the column to count. The default is the list of all columns of
       dtype 'category', which ``interact`` presents as a dropdown.
    """
    plt.figure(figsize=(20,5))
    sns.countplot(x=x, data=tracks, palette='hls')
    # Columns such as track or artist names have many long labels, so rotate and
    # shrink the tick labels to keep them legible.
    plt.xticks(
        rotation=50,
        horizontalalignment='right',
        fontweight='light',
        fontsize=8
    )
    # Render the figure as part of the callback so every dropdown change redraws
    # it (this call used to sit outside the function body).
    plt.show()

interactive(children=(Dropdown(description='x', options=('Artist_Name', 'Track_Name', 'Track_Popularity', 'Art…

In [73]:
## Interactive scatterplot of two selectable variables (with Seaborn)

@interact
def scatter_plot(x=[*tracks.select_dtypes('float').columns, "Added_Date"],
                 y=[*tracks.select_dtypes('float').columns][1:]):
    """Scatter one column of ``tracks`` against another.

    x: any float column or "Added_Date".
    y: any float column except the first one.
    Both list defaults are presented as dropdowns by ``interact``.
    """
    sns.scatterplot(x=x, y=y, data=tracks)

interactive(children=(Dropdown(description='x', options=('Tempo', 'Loudness', 'Acousticness', 'Danceability', …

In [74]:
## Interactive time series exploratory analysis (with Seaborn)
@interact
def time_plot(x=['Hour', 'Day', 'Month', 'Year',
                 'Day of Week', 'Day of Year'],
              y=[*tracks.select_dtypes('float').columns][1:]):
    """Line plot of an audio feature against a calendar component of Added_Date.

    x: one of the derived time columns of ``tracks``.
    y: any float column except the first one.
    Both list defaults are presented as dropdowns by ``interact``.
    """
    sns.lineplot(x=x, y=y, data=tracks)

interactive(children=(Dropdown(description='x', options=('Hour', 'Day', 'Month', 'Year', 'Day of Week', 'Day o…

# Find Similar Songs

In [75]:
# Load the Spotify catalog that is searched for similar songs. The relative path
# resolves against the project directory selected with os.chdir earlier.
# NOTE(review): the same file appears to be loaded as `catalog` in the data
# preparation cell - confirm whether both copies are needed.
spotify = pd.read_csv("Data/spotifyCatalog.csv")
spotify.head()

Unnamed: 0.1,Unnamed: 0,album_id,album_name,artist_ids,artist_names,duration_ms,id,name,us_available,year,...,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,genreArrStr
0,354801,50hRYWK49z2kBf6cMxHmZe,ジムノペディ サティ作品集 II,"['459INk8vcC0ebEef82WjIK', '6ltV1dxtdNmFBvpZz2...","['Erik Satie', '白石光隆']",68268,6VKX9Dheh0qLmjfK2Npq4z,ばら十字教団の最初の思想,True,2012,...,874,0,107,-30.795,1000,57,139.804,4,188,['classical']
1,553689,51WBTsiitqhIE9DkHNvZqR,Necessary Roughness,['5CxwOBCoGSvWxfDgNoa1Nv'],['The Lady Of Rage'],237026,3ptH0tXq7HEaIWxyn7vkOy,Get With Da Wickedness,True,2001,...,0,11,322,-5.39,0,357,88.183,4,802,"['pop', 'rap', 'hip', 'funk']"
2,290293,57CBQ2KFwHoY3vJGHgofCo,Retreat,['4qDGDPGMIJuIvPfUGe0Ngg'],['Cutty Ranks'],230866,0SPxr6sGUeRlTHndjOryKT,Me Fit,True,1991,...,0,6,39,-17.489,0,98,172.787,4,967,['dance']
3,321077,6HT1eWnFxuB1apcSuOzpd2,Puccini: Boheme (La),"['0OzxPXyowUEQ532c9AmHUR', '7oPmR7kujiCfv7EjD1...","['Giacomo Puccini', 'Stanislav Beňačka', 'Carm...",115066,5q9ClErLj0ZHXiAkUU0Rsh,"La boheme: Act II: Signorina Mimi (Marcello, M...",True,1990,...,0,9,667,-19.135,1000,49,125.942,3,202,"['classical', 'folk']"
4,560873,2RmEcwZVXG5Z1rHED7MYhs,Kompilation,['51qSeH9HimuYMMQ7qbWGrk'],['Jürgen Paape'],357921,1T3UB1f8rkB72u6GeTRKrp,Fruity Loops #2,True,2011,...,775,10,81,-12.442,1000,48,123.996,4,354,"['electr', 'house']"


In [76]:
# All column labels of the catalog frame.
spotifyCols = spotify.columns
# Numeric features that form the coordinates used for the nearest-neighbour search.
dataCols = ['year', 'acousticness', 'danceability', 'energy', 'instrumentalness',
       'liveness', 'loudness', 'speechiness', 'tempo','valence']
# Columns shown to the user for every recommended song.
outputCols = ["name", "artist_names", "year", "genreArrStr", "id"]
# Feature frame in half precision.
# NOTE(review): float16 carries roughly three significant decimal digits, so the
# later x20 scaling of `year` may round coarsely - confirm this is acceptable.
X = spotify[dataCols].astype(np.float16)

In [77]:
def preProcessTrack(track):
    """Return a copy of ``track`` with every feature multiplied by its weight.

    The weights decide how strongly each feature influences the distance used by
    the nearest-neighbour search: ``year`` is scaled by 20, ``loudness`` and
    ``tempo`` by 10, and every other feature keeps its value (weight 1).

    Parameters
    ----------
    track : pandas.Series or pandas.DataFrame
        One track (Series indexed by feature name) or many tracks (DataFrame with
        one column per feature). All ten features must be present.

    Returns
    -------
    Same type as ``track``
        A scaled copy; the object passed in is left unmodified, so calling the
        function twice on the same input cannot scale it twice.

    Raises
    ------
    KeyError
        If one of the expected features is missing.
    """
    featureWeights = {
        "year": 20,
        "acousticness": 1,
        "danceability": 1,
        "energy": 1,
        "instrumentalness": 1,
        "liveness": 1,
        "loudness": 10,
        "speechiness": 1,
        "tempo": 10,
        "valence": 1,
    }
    scaled = track.copy()
    for feature, weight in featureWeights.items():
        # Weight-1 features are still accessed so that a missing one raises KeyError.
        scaled[feature] = scaled[feature] * weight
    return scaled

# Scale a freshly sliced copy of the catalog features rather than feeding X back
# into itself: after the first run X is a NumPy array, so `preProcessTrack(X)`
# would fail when this cell is executed again.
X = preProcessTrack(spotify[dataCols].astype(np.float16)).values

In [78]:
# Build a ball tree over the scaled catalog features; it is queried later for the
# catalog rows closest to a selected track.
tree = BallTree(X, leaf_size=10)

## Logging into Spotify

After running the cell below, open the printed URL and authorize Spotify access. Upon doing that, you will be redirected to what looks like a 404 error page (that is expected!).

Then copy the URL of the page you were redirected to and paste it where prompted below to authenticate the Spotify login.

In [80]:
# Create the Spotify client only once; re-running this cell keeps the existing
# client instead of creating a new one.
if api is None:
    api = searchAPI.SpotifyAPI()

In [81]:
def filter(series, yearRange=None):
    """Tell whether a song's release year lies inside the allowed range.

    NOTE: this name shadows Python's built-in ``filter`` for the rest of the
    notebook; it is kept because other cells call it under this name.

    Parameters
    ----------
    series : mapping or pandas.Series
        A song record with a ``'year'`` entry.
    yearRange : sequence of two numbers, optional
        ``(first_year, last_year)``, both inclusive. When omitted, the current
        value of the ``slider1`` year-range widget is used.

    Returns
    -------
    bool
        True when ``first_year <= year <= last_year``, otherwise False.
    """
    series = pd.Series(series)
    if yearRange is None:
        yearRange = slider1.value
    earliest, latest = yearRange[0], yearRange[1]
    return bool(earliest <= series['year'] <= latest)
  

In [82]:
# Controls for the recommendation step: allowed release years and playlist length.
slider1 = IntRangeSlider(value=[1985,2020], min=1900, max=2020, step = 1, description='Year', layout=Layout(width='490px'))
slider2 = IntSlider(value = 15, min = 1, max = 100, step = 1, description="Songs", layout = Layout(width = '490px'))

# `button_style` is the Button trait that selects a predefined colour scheme.
# `type` is not a Button trait, so passing type='success' never styled the buttons.
button1 = Button(description='Search Song', button_style='success')
button2 = Button(description='Create Playlist', button_style='success')
button3 = Button(description='New Song')

# Free-text inputs: the song to search for, and the row id of the chosen result.
text1 = Text(value='', placeholder='e.g. Wrecking Ball', description='Song Name:')
text2 = Text(value='', placeholder='e.g. 0', description='ID of song:')

# box: search form. box2: controls displayed after a search has been run.
box = VBox(children=(text1, button1))
box2 = HBox(children=(text2, slider1, slider2, button3, button2))


def on_button_clicked1(but):
    """'Search Song' handler: run the search and show the matches."""
    findNearestTracks()
button1.on_click(on_button_clicked1)


def on_button_clicked2(but):
    """'Create Playlist' handler: show the songs nearest to the chosen match."""
    findNearest()
button2.on_click(on_button_clicked2)


def on_button_clicked3(but):
    """'New Song' handler: clear the output and show the search form again."""
    clear_output(wait=False)
    refresh()
button3.on_click(on_button_clicked3)


In [91]:
def findNearestTracks():
    """Search Spotify for the song name typed into ``text1`` and list the matches.

    Requests up to 10 results from ``api.search`` and displays their track name,
    artist name and year, followed by the ``box2`` controls used to pick one of
    the results and request similar songs.
    """
    searchQuery = text1.value
    queryResults = api.search(searchQuery, limit=10)
    display(queryResults[["track.name", "artist.name", "year"]])
    display(box2)

def findNearest():
    """Display the catalog songs closest to the search result chosen by the user.

    Reads the widgets for its input: ``text1`` (search text), ``text2`` (row label
    of the chosen search result) and ``slider2`` (number of songs wanted). The
    search is repeated, the chosen row is scaled with ``preProcessTrack``, and the
    ball tree is queried for its nearest catalog rows. Rows rejected by
    ``filter`` are skipped, and the closest ``slider2.value`` survivors are shown
    in order of increasing distance, followed by the 'New Song' button.

    Returns
    -------
    The selected row of the search results.

    Raises
    ------
    ValueError
        If ``text2`` does not contain an integer.
    """
    songs = slider2.value
    searchQuery = text1.value
    trackSelection = text2.value
    queryResults = api.search(searchQuery, limit=10)
    track = queryResults.loc[int(trackSelection), :]
    trackData = preProcessTrack(track.loc[dataCols].astype(np.float16))
    queryPoint = trackData.values.reshape(1, -1)

    # Ask the tree for a batch of neighbours and double the batch until enough of
    # them pass the filter. k is capped at the catalog size because the tree
    # cannot return more neighbours than it holds; the cap also guarantees that
    # the loop terminates when too few songs match.
    catalogSize = len(spotify)
    k = min(max(songs, 1) * 2, catalogSize)
    while True:
        _, idx = tree.query(queryPoint, k=k)
        candidates = spotify.iloc[idx[0]][outputCols]
        keep = np.array([filter(row) for _, row in candidates.iterrows()], dtype=bool)
        # Select all rows at once instead of growing a frame row by row
        # (DataFrame.append is no longer available in pandas 2.x).
        answer = candidates[keep].head(songs)
        if len(answer) >= songs or k >= catalogSize:
            break
        k = min(k * 2, catalogSize)

    display(answer)
    display(button3)
    return track
def refresh():
    """Wipe the current cell output and show the search form (``box``) again."""
    clear_output()
    display(box)

In [92]:
# Show the search form (song-name field and 'Search Song' button) to start.
display(box)

VBox(children=(Text(value='Hot N Cold', description='Song Name:', placeholder='e.g. Wrecking Ball'), Button(de…