# Generate Data

In [None]:
pip install spotipy --upgrade

In [None]:
import os

import numpy as np
import pandas as pd
import spotipy
import spotipy.oauth2 as oauth2
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth

In [None]:
# Spotify API credentials are read from the environment instead of being stored
# in the notebook. Export SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET before
# starting the kernel; a missing variable raises KeyError here.
# NOTE(review): the client id/secret that used to be hard-coded in this cell
# should be treated as leaked and rotated in the Spotify developer dashboard.
CLIENT_ID     = os.environ['SPOTIPY_CLIENT_ID']
CLIENT_SECRET = os.environ['SPOTIPY_CLIENT_SECRET']
REDIRECT_URI  = os.environ.get('SPOTIPY_REDIRECT_URI', 'https://localhost')

# Only used by the commented-out SpotifyOAuth (user login) flow below.
scope = "playlist-read-private"

# Client-credentials flow: app-level authentication, no user login.
credentials = oauth2.SpotifyClientCredentials(
        client_id=CLIENT_ID,
        client_secret=CLIENT_SECRET)


spotify = spotipy.Spotify(auth_manager=credentials)

#credentials = SpotifyOAuth(client_id=CLIENT_ID, client_secret=CLIENT_SECRET, redirect_uri=REDIRECT_URI, scope=scope)
#spotify = spotipy.Spotify(auth_manager=credentials)


In [None]:
#gets track ids from playlist id
def get_playlist_track_ids(playlist_id):
    """Collect the name, first artist and id of every track in a playlist.

    Uses the module-level ``spotify`` client. All result pages are followed,
    so playlists longer than a single response page are read completely.
    Entries without a usable track (``track`` is missing/None, or the track
    has no ``id``) are skipped so the three returned lists stay aligned and
    contain only ids that can be passed on to other API calls.

    Parameters:
        playlist_id: Spotify playlist id passed to ``spotify.playlist_tracks``.

    Returns:
        (tracks, artists, track_ids): three lists of equal length holding the
        track names, the name of each track's first listed artist, and the
        track ids.
    """
    tracks = []
    artists = []
    track_ids = []

    page = spotify.playlist_tracks(playlist_id)
    while page:
        for item in page['items']:
            track = item.get('track')
            # Skip entries that carry no track object or no track id.
            if not track or not track.get('id'):
                continue
            tracks.append(track['name']) # song name
            artists.append(track['artists'][0]['name']) # artist name
            track_ids.append(track['id']) # track_id

        # Follow the paging link until the last page has been consumed.
        page = spotify.next(page) if page.get('next') else None

    return tracks, artists, track_ids


#get ids based on artist name and track
def get_track_ids(artists, track):
    """Resolve (artist, title) pairs to Spotify track ids.

    For each position in ``artists`` a single-result track search is issued
    through the module-level ``spotify`` client, combining the artist name
    with the title at the same position in ``track``. The id of the first
    hit is kept.

    Parameters:
        artists: sequence of artist names.
        track: sequence of track titles, indexed in parallel with ``artists``.

    Returns:
        List of track ids, one per entry of ``artists``.

    Raises:
        IndexError: if a search returns no items, or ``track`` is shorter
            than ``artists``.
    """
    found_ids = []
    for position, artist_name in enumerate(artists):
        query = 'artist:' + artist_name + ' track:' + track[position]
        response = spotify.search(q=query, type="track", limit=1)
        found_ids.append(response['tracks']['items'][0]['id'])
    return found_ids


# clean data
# remove non-quantitative (metadata) fields
def get_track_features(track_ids):
    """Fetch Spotify audio features for the given tracks as a DataFrame.

    Ids are sent to ``spotify.audio_features`` in batches instead of one
    request per track. Tracks for which the API returns no feature object
    (``None``) are left out, and falsy ids are ignored.

    Parameters:
        track_ids: iterable of Spotify track ids.

    Returns:
        pandas.DataFrame with an ``id`` column first, followed by one column
        per remaining audio-feature field. The feature values are passed
        through a single NumPy array, as before, so they share one dtype.
        When no track yields features, an empty frame with only the ``id``
        column is returned.
    """
    # Descriptive fields of the audio-features object that are not features.
    dropped_keys = ('type', 'id', 'uri', 'track_href', 'analysis_url')
    # Maximum number of ids spotipy documents for one audio_features call.
    batch_size = 100

    requested_ids = [track_id for track_id in track_ids if track_id]
    kept_ids = []
    rows = []
    columns = None

    for start in range(0, len(requested_ids), batch_size):
        batch = requested_ids[start:start + batch_size]
        batch_features = spotify.audio_features(batch) or []

        # Results come back in request order; pair each with the id asked for.
        for track_id, features in zip(batch, batch_features):
            if features is None:
                continue
            numeric = {key: value for key, value in features.items()
                       if key not in dropped_keys}
            if columns is None:
                # Column order is fixed by the first usable response.
                columns = list(numeric.keys())
            kept_ids.append(track_id)
            rows.append([numeric[column] for column in columns])

    if not rows:
        return pd.DataFrame(columns=['id'])

    df = pd.DataFrame(np.array(rows), columns=columns)
    df.insert(0, 'id', np.array(kept_ids))
    #TODO: append column for artist name

    return df

### DO NOT RUN THESE CELLS:

The cells below will generate our data from two playlists created by Spotify:

1. Viral 50 - daily hits from 70 countries (+ global playlist)
2. Top 50 - weekly hits from 52 countries (+ global playlist)

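As these playlists are updated periodically, re-running the cells below will produce a different, more up-to-date sample and overwrite the saved CSV files, so only run them when you intend to regenerate the dataset. Duplicate tracks are removed, as there is considerable overlap between countries.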

In [None]:
def _collect_playlist_features(query_prefix, countries):
    """Return one audio-feature DataFrame per country playlist.

    For every name in ``countries`` the top playlist-search hit for
    ``query_prefix + name`` is taken, its tracks are listed with
    ``get_playlist_track_ids`` and their features fetched with
    ``get_track_features``. Countries whose search returns no playlist are
    reported and skipped instead of aborting the whole run.
    """
    frames = []
    for country in countries:
        res = spotify.search(query_prefix + country, type="playlist", limit=1)
        matches = [playlist for playlist in res['playlists']['items'] if playlist]
        if not matches:
            print('No playlist found for: ' + query_prefix + country)
            continue
        tracks, artists, track_ids = get_playlist_track_ids(matches[0]['id'])
        frames.append(get_track_features(track_ids))
    return frames


countries_daily = ['USA', 'Global', 'United Kingdom', 'France', 'Russia', 'Mexico', 'Spain', 'Philippines', 'Colombia', 
             'South Korea', 'Netherlands', 'Norway', 'Ukraine', 'Canada', 'Sweden', 'Finland', 'Denmark', 'Poland', 
             'Turkey', 'Australia', 'Taiwan', 'Dominican Republic', 'Vietnam', 'Japan', 'Indonesia', 'Chile', 'Italy', 
             'Argentina', 'Israel', 'Germany', 'Belgium', 'Hong Kong', 'Brazil', 'India', 'Ireland', 'Greece', 'Peru',
             'Austria', 'Iceland', 'Bulgaria', 'Lithuania', 'Bolivia', 'Romania', 'Latvia', 'Ecuador', 'Morocco', 'Malaysia',
             'Estonia', 'Thailand', 'Portugal', 'Hungary', 'Paraguay', 'Egypt', 'Cyprus', 'Honduras', 'Guatemala', 'Costa Rica',
             'Singapore', 'South Africa', 'Saudi Arabia', 'Uruguay', 'Andorra', 'Czech Republic', 'New Zealand', 'Switzerland',
             'Nicaragua', 'UAE', 'El Salvador', 'Slovakia', 'Panama', 'Luxembourg']

# array of dataframes (1 entry per playlist/country)
# (the loop previously iterated over an undefined name `countries`)
playlist_df_daily = _collect_playlist_features('viral 50 - ', countries_daily)


daily_songs = pd.concat(playlist_df_daily, ignore_index=True)
daily_songs = daily_songs.drop_duplicates(subset='id', keep='first', ignore_index=True)
daily_songs.to_csv('Viral_50_Playlist_Data.csv')
print('Viral 50 (daily) data generated')


# NOTE(review): the entry after 'Denmark' used to be just 'Republic'; given the
# alphabetical ordering it is taken to be 'Dominican Republic' — confirm.
countries_weekly = ['Global', 'Argentina', 'Australia', 'Austria', 'Belgium', 'Brazil', 'Canada', 'Chile', 'Colombia', 
                    'Czech Republic', 'Denmark', 'Dominican Republic', 'Ecuador', 'Egypt', 'Finland', 'Germany', 'Guatemala', 
                    'Hong Kong', 'Hungary', 'India', 'Indonesia', 'Ireland', 'Israel', 'Italy', 'Japan', 'Malaysia', 
                    'Mexico', 'Morocco', 'Netherlands', 'New Zealand', 'Norway', 'Panama', 'Peru', 'Philippines', 'Poland',
                    'Romania', 'Russia', 'Saudi Arabia', 'Singapore', 'Slovakia', 'South Africa', 'South Korea', 'Spain', 
                    'Sweden', 'Switzerland', 'Taiwan', 'Thailand', 'Turkey', 'Ukraine', 'UAE', 'United Kingdom', 'USA', 
                    'Vietnam']

# array of dataframes (1 entry per playlist/country)
playlist_df_weekly = _collect_playlist_features('Top Songs - ', countries_weekly)


weekly_songs = pd.concat(playlist_df_weekly, ignore_index=True)
weekly_songs = weekly_songs.drop_duplicates(subset='id', keep='first', ignore_index=True)
weekly_songs.to_csv('Top_Songs_Playlist_Data.csv')
print('Top Songs (weekly) data generated')


# Merge both samples; a track present in both keeps its first (daily) row.
combined_df = pd.concat([daily_songs, weekly_songs], ignore_index=True)
combined_df = combined_df.drop_duplicates(subset='id', keep='first', ignore_index=True)

print(combined_df.shape)
combined_df.to_csv('Total_Data.csv')

### Load sample

In [5]:
# Load the saved dataset; the first CSV column is used as the row index.
sample = pd.read_csv('Total_Data.csv', index_col=0)
sample

Unnamed: 0,id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,4ZtFanR9U6ndgddUvNcjcG,0.563,0.664,9.0,-5.044,1.0,0.1540,0.33500,0.000000,0.0849,0.6880,166.928,178147.0,4.0
1,3VqeTFIvhxu3DIe4eZVzGq,0.759,0.459,8.0,-5.187,1.0,0.0948,0.00323,0.000000,0.0906,0.6950,109.997,164442.0,4.0
2,6HU7h9RYOaPRFeh0R3UeAr,0.442,0.612,2.0,-7.222,1.0,0.1120,0.58400,0.000006,0.3700,0.1780,180.917,215507.0,4.0
3,67BtfxlNbhBmCDR2L2l8qd,0.610,0.508,8.0,-6.682,0.0,0.1520,0.29700,0.000000,0.3840,0.7580,178.818,137876.0,4.0
4,748mdHapucXQri7IAO8yFK,0.762,0.701,8.0,-3.541,1.0,0.0286,0.23500,0.000158,0.1230,0.7420,110.968,208867.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1761,4dF0mDPTlpjThf1mJyK83B,0.447,0.263,5.0,-13.754,0.0,0.0342,0.82800,0.004900,0.1180,0.0721,147.161,208937.0,4.0
1762,5BUl0EBl4tOmPtT9IN0UUP,0.623,0.346,2.0,-9.881,0.0,0.0682,0.89600,0.000000,0.1160,0.1850,84.621,169186.0,5.0
1763,11ZulcYY4lowvcQm4oe3VJ,0.964,0.398,11.0,-8.981,0.0,0.0795,0.00151,0.000039,0.1010,0.5630,105.012,178405.0,4.0
1764,3HqQpSTLHJRKDhulVacRFb,0.761,0.821,8.0,-5.238,1.0,0.2020,0.18500,0.000000,0.0692,0.3880,108.130,125879.0,4.0


# Implement PCA

In [93]:
import scipy.io as sio
import matplotlib
import matplotlib.pyplot as plt
from numpy.matlib import repmat
from sklearn.preprocessing import normalize

%matplotlib inline

In [94]:
def eigsort(V, eigvals):
    """Order eigenvalues from largest to smallest and reorder eigenvectors to match.

    Parameters:
        V: 2-D array whose columns are eigenvectors; column ``k`` belongs to
            ``eigvals[k]``.
        eigvals: 1-D sequence of eigenvalues.

    Returns:
        (Vsort, Dsort): ``Vsort`` is an M x M float array holding the columns
        of ``V`` in descending-eigenvalue order; ``Dsort`` is the diagonal
        matrix of the eigenvalues in that same order (M = number of
        eigenvalues).
    """
    # Positions of the eigenvalues, largest first.
    descending = np.argsort(eigvals)[::-1]
    lambd = np.asarray(eigvals)[descending]
    Dsort = np.diag(lambd)

    # Copy the reordered eigenvector columns into a zero-initialised float
    # array of size M x M.
    count = np.size(lambd)
    Vsort = np.zeros((count, count))
    Vsort[:, :] = V[:, descending]
    return Vsort, Dsort


# normc(M) normalizes the columns of M to a length of 1.
def normc(Mat):
    """Scale every column of ``Mat`` to unit L2 norm via ``normalize(..., axis=0)``."""
    unit_columns = normalize(Mat, axis=0, norm='l2')
    return unit_columns


def PCA_(data):
    """Principal component analysis of a samples-by-features array.

    Parameters:
        data: 2-D numeric array; the mean is taken over axis 0, so rows are
            treated as observations and columns as variables.

    Returns:
        (U, Vsort, Dsort, Z):
        U     -- ``Z @ Vsort`` with every column rescaled to unit length.
        Vsort -- eigenvectors of ``Z.T @ Z`` as columns, largest eigenvalue first.
        Dsort -- diagonal matrix of the matching eigenvalues. ``Z.T @ Z`` is
                 not divided by the number of rows, so these are eigenvalues
                 of the scatter matrix rather than variances.
        Z     -- the column-centred data.

    NOTE(review): columns are centred but not rescaled, so variables with a
    much larger numeric range dominate the leading components.
    """
    # Centre every column on its mean.
    Z = data - np.mean(data, 0)
    C = np.dot(Z.T, Z)

    # C is symmetric, so use eigh: it is intended for symmetric matrices and
    # returns real eigenvalues, whereas eig may return complex results.
    D, V = np.linalg.eigh(C)
    Vsort, Dsort = eigsort(V, D)

    U = np.dot(Z, Vsort)
    U = normc(U)

    return U, Vsort, Dsort, Z

# Run PCA

In [95]:
# Use the DataFrame loaded from Total_Data.csv (`sample`); the previously
# referenced `data_combined_df` is not defined anywhere in this notebook.
# The track id is an identifier, not a feature, so it is dropped before PCA.
pca_df = sample.drop(columns='id')
data_pca = pca_df.to_numpy()
U, Vsort, Dsort, Z = PCA_(data_pca)

In [80]:
# First 13 diagonal entries of Dsort, i.e. the sorted eigenvalues, as a list.
eigenvalues = [Dsort[i, i] for i in range(13)]

eigenvalues

[5821336641117.754,
 1464033.1408334444,
 22515.57750737075,
 14907.95403630556,
 425.20137469129094,
 139.7019234835391,
 102.35534478538538,
 83.4214421733707,
 33.15281395760269,
 25.710307677087545,
 23.654365475479047,
 17.834268948733808,
 14.694951062927375]

In [81]:
# coefficients of the centred data with respect to the columns of U
c = U.T @ Z

# rebuild the data in the original feature columns using only the first 4
# columns of U (and matching rows of c), then add the column means back
proj_data = U[:, 0:4] @ c[0:4] + data_pca.mean(axis=0)

proj_data.shape

(1766, 13)

In [82]:
import plotly.express as px

In [83]:
# pairwise scatter plots of every original feature column (diagonal panels hidden)
features = pca_df.columns
fig = (
    px.scatter_matrix(data_frame=pca_df, dimensions=features)
    .update_traces(diagonal_visible=False)
)
fig.show()

In [89]:
# visualizes all the principal components
# `labels` must be a dict keyed by column name. For array input the columns
# are addressed as "0", "1", ..., and each column of U is a component, not an
# original feature, so it is labelled "PC k".
pc_labels = {str(i): f"PC {i + 1}" for i in range(U.shape[1])}
fig = px.scatter_matrix(
    U,
    labels=pc_labels,
    dimensions=range(13),
)
fig.update_traces(diagonal_visible=False)
fig.show()

In [90]:
# visualizes first four principal components
# `labels` must be a dict keyed by column name. For array input the columns
# are addressed as "0", "1", ..., and each column of U is a component, not an
# original feature, so it is labelled "PC k".
pc_labels = {str(i): f"PC {i + 1}" for i in range(U.shape[1])}
fig = px.scatter_matrix(
    U,
    labels=pc_labels,
    dimensions=range(4),
)
fig.update_traces(diagonal_visible=False)
fig.show()

In [116]:
### THOUGHT: you can view this graph from a certain angle, which will represent the principal components above
# NOTE(review): these are the first four columns of proj_data (which has 13
# columns), not columns of U — confirm this is the intended set of axes.
# NOTE(review): `c` here rebinds the name used earlier for np.dot(U.T, Z).
a =  proj_data[:,0]
b =  proj_data[:,1]
c =  proj_data[:,2]
cc = proj_data[:,3]

# x/y/z come from the first three columns; the fourth sets the marker colour.
fig = px.scatter_3d(x=a, y=b, z=c, color=cc)
fig.show()

In [88]:
### keeping this to cross reference PCA solution w sklearn package, DELETE LATER



# visualizes all the principal components
# from sklearn.decomposition import PCA

# pca = PCA()
# components = pca.fit_transform(pca_df[features])

# labels = {
#     str(i): f"PC {i+1} ({var:.1f}%)"
#     for i, var in enumerate(pca.explained_variance_ratio_ * 100)
# }
# fig = px.scatter_matrix(
#     components,
#     labels=labels,
#     dimensions=range(4),
# )
# fig.update_traces(diagonal_visible=False)
# fig.show()

# Implement K means

# Run K means

# Run Mixture of Gaussians

In [108]:
from sklearn.mixture import GaussianMixture

# Fit a 4-component Gaussian mixture to proj_data and assign each row to a
# component. random_state is fixed so the fit, and therefore the labels, are
# reproducible across kernel restarts.
gmm = GaussianMixture(n_components=4, random_state=0).fit(proj_data)
labels = gmm.predict(proj_data)

# EDA (look at clusters)

In [None]:
#TODO

# junk cells

In [None]:
# Scratch check: resolve three artist/title pairs to track ids and fetch their features.
artists = ['tom misch', 'calvin harris', 'justin bieber']
tracks = ['movie', 'feel so close', 'off my face']

track_ids = get_track_ids(artists, tracks)

# Not the last expression of the cell, so the returned DataFrame is not displayed.
get_track_features(track_ids)

track_ids[0]

In [None]:
# Scratch check: search one track, fetch its audio features and strip the metadata fields.
artist = 'calvin harris'
track = 'feel so close'
res = spotify.search(q='artist:' + artist + ' track:' + track, type="track", limit=1)

# id of the first search hit
track_id = res['tracks']['items'][0]['id']

features = spotify.audio_features(track_id)

print(features)

# drop the descriptive fields, leaving only the feature values
for key in ('type', 'id', 'uri', 'track_href', 'analysis_url'):
    features[0].pop(key)

print(features[0].values())
print(list(features[0].keys()))

In [None]:
# Scratch check of the response structure returned for one playlist id.
res = spotify.playlist_tracks('37i9dQZEVXbLiRSasKsNU9')

# Only the last of these expressions is displayed as the cell output.
res['items'][0]['track']['name'] # song name
res['items'][0]['track']['artists'][0]['name'] # artist name
res['items'][0]['track']['id'] # track_id

I predict that liveness won't help
Features may contribute and correlate strongly together. 
PCA might be able to pick out these features that correlate strongly together


pipeline:
(spotify) playlists -> tracks -> track ids -> features

1. sample data

In [None]:
def PCA(data):
    """Draft PCA that centres each row of ``data`` (mean taken along axis 1).

    The notebook's "Run PCA" cell calls ``PCA_``, not this function.

    NOTE(review): with ``data`` of shape (m, n), ``Z`` is (m, n) and ``C`` is
    (m, m), so ``Vsort`` is (m, m) and ``np.matmul(Z, Vsort)`` below only
    conforms when m == n. Confirm the intended product before using this.

    Returns:
        (U, Vsort, Dsort): column-normalised ``Z @ Vsort``, the eigenvectors
        of ``C`` sorted by descending eigenvalue, and the diagonal matrix of
        those eigenvalues.
    """
    
    m,n = data.shape

    # Subtract each row's mean from that row.
    mean = np.mean(data, 1)
    Z = data - np.tile(mean, (n,1)).T
    # m x m matrix of row-by-row products, divided by the number of columns.
    C = np.matmul(Z,Z.T)/n
    
    D, V = np.linalg.eig(C)
    Vsort, Dsort = eigsort(V,D)
    
    U = np.matmul(Z,Vsort)
    U = normc(U)
    
    return U, Vsort, Dsort