In [None]:
import numpy as np
import matplotlib.pyplot as plt

import utilityfunctions as uf

# Clustering the Spotify data

In [None]:
# Let's define a distance metric; which one is this??
def distance(a, b):
    """Return the Euclidean (L2) distance between ``a`` and ``b``.

    The difference vector is dotted with its own transpose and the square
    root of that value is returned. Both arguments must support elementwise
    subtraction and expose ``.T`` (NumPy arrays do). For 1-D inputs the
    result is a scalar.
    """
    delta = a - b
    # Inner product of the difference with itself = sum of squared differences
    squared_length = np.dot(delta.T, delta)
    return np.sqrt(squared_length)

In [None]:
# Let's define a function to calculate the distance from a single data point to each centroid
def get_distances(item, centroids):
    """Return a list of distances from ``item`` to every centroid.

    The list has one entry per element of ``centroids``, in the same order,
    each computed with ``distance``.
    """
    result = []
    for center in centroids:
        result.append(distance(item, center))
    return result

# Let's define a function to update cluster assignments given a set of centroids
def update_clusters(data, centroids):
    """Assign every item in ``data`` to its nearest centroid.

    Returns a list with one entry per item: the index (into ``centroids``)
    of the centroid with the smallest distance. ``np.argmin`` returns the
    first minimum, so ties go to the lowest centroid index.
    """
    assignments = []
    for point in data:
        nearest = np.argmin(get_distances(point, centroids))
        assignments.append(nearest)
    return assignments

In [None]:
# Let's define a function to get all the data points assigned to a cluster
def get_points_in_cluster(data, clusters, clusterid):
    """Return the items of ``data`` whose cluster label equals ``clusterid``.

    ``clusters[j]`` is the label of ``data[j]``. The result is a list that
    keeps the original order of the matching items; it is empty when no
    label matches.
    """
    members = []
    for position, label in enumerate(clusters):
        if label == clusterid:
            members.append(data[position])
    return members
   
# Let's define a function to update the centroids
def update_centroids(data, k, clusters, previous_centroids=None):
    """Recompute each centroid as the mean of the points assigned to it.

    Parameters
    ----------
    data : array-like
        One row per data point.
    k : int
        Number of clusters; cluster ids are ``0 .. k-1``.
    clusters : sequence of int
        ``clusters[i]`` is the cluster id assigned to ``data[i]``.
    previous_centroids : array-like, optional
        Centroids from the previous iteration. When given, a cluster that
        currently has no points keeps its previous centroid.

    Returns
    -------
    numpy.ndarray
        Array with ``k`` rows, row ``j`` being the centroid of cluster ``j``.

    Raises
    ------
    ValueError
        If a cluster has no points and ``previous_centroids`` was not given.
        (The mean of an empty selection is NaN, which would otherwise
        corrupt the centroid array.)
    """
    points = np.asarray(data)
    labels = np.asarray(clusters)

    new_centroids = []
    for j in range(k):
        members = points[labels == j]
        if len(members) > 0:
            new_centroids.append(members.mean(axis=0))
        elif previous_centroids is not None:
            # Empty cluster: leave its centroid where it was
            new_centroids.append(np.asarray(previous_centroids[j]))
        else:
            raise ValueError(
                f"cluster {j} has no points assigned; pass previous_centroids "
                "to keep its old centroid"
            )
    return np.array(new_centroids)

In [None]:
# Let's define a function to measure the inertia
def inertia(data, centroids, clusters):
    """Return the mean squared distance from each point to its centroid.

    For every index ``i`` the squared ``distance`` between ``data[i]`` and
    ``centroids[clusters[i]]`` is accumulated; the total is then divided by
    ``len(data)``, so the value is normalised by the number of points.
    """
    total = sum(
        distance(data[idx], centroids[clusters[idx]]) ** 2
        for idx in range(len(data))
    )
    return total / len(data)

## Let's try it on the Spotify data!

In [None]:
import pandas as pd
import plotly.express as px

columns = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'duration_ms', 'time_signature', 'chorus_hit', 'sections', 'popularity',
]

# Read the 16 CSV columns at zero-based positions 2..17 as floats, skipping the header row
# (presumably these are the columns named in `columns` above; verify against the CSV)
data = np.array(
    np.genfromtxt(
        'data/spotify_dataset.csv',
        delimiter=',',
        skip_header=1,
        usecols=tuple(range(2, 18)),
        dtype=float,
        encoding='utf-8',
    )
)
# And just the first 500 rows
data = data[:500]
print(uf.getShapeType(data))
print(uf.getSummaryStatistics(data))

# Why am I removing the labels?
# (uf.split presumably separates column 0 from the remaining features; confirm in utilityfunctions)
data, y = uf.split(data, 0)

# For the purposes of visualization, I'm going to use PCA to reduce this dataset to 3 dimensions
data = uf.preprocess(data, zscore=True)
eigenvals, eigenvecs = uf.pca_with_plots(data)

In [None]:
import plotly.graph_objects as go
from plotly.validators.scatter.marker import SymbolValidator

def project_and_plot(data, centroids, eigenvecs, project=True, labels=None):
    """Show a 3-D scatter plot of the data points and the centroids.

    Parameters
    ----------
    data : numpy.ndarray
        One row per data point.
    centroids : numpy.ndarray
        One row per centroid, in the same space as ``data``.
    eigenvecs : numpy.ndarray
        Matrix whose first three columns are used as the projection basis
        when ``project`` is true; ignored otherwise.
    project : bool, default True
        If true, plot ``data @ eigenvecs[:, :3]`` (and the centroids
        likewise). If false, plot the first three columns of ``data`` and
        ``centroids`` as they are.
    labels : sequence of int, optional
        Cluster id of each row of ``data``, used to colour the points.
        When omitted, the notebook-level variable ``clusters`` is used, so
        existing calls keep working; passing it explicitly avoids depending
        on hidden notebook state.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    if labels is None:
        # Backward compatibility: earlier callers rely on the global `clusters`
        labels = clusters

    if project:
        v = eigenvecs[:, :3]
        projected = data @ v
        projected_centroids = centroids @ v
    else:
        projected = data
        projected_centroids = centroids

    # Append the labels as a fourth column so each point carries its colour value
    projected_with_labels = np.hstack((projected, np.array([labels]).T))

    fig = go.Figure()
    fig.add_trace(
        go.Scatter3d(
            x=projected_with_labels[:, 0],
            y=projected_with_labels[:, 1],
            z=projected_with_labels[:, 2],
            mode='markers',
            marker=dict(size=8, color=projected_with_labels[:, 3]),
        )
    )
    fig.add_trace(
        go.Scatter3d(
            x=projected_centroids[:, 0],
            y=projected_centroids[:, 1],
            z=projected_centroids[:, 2],
            mode='markers',
        )
    )
    fig.show()

### Six clusters, random initialization

Why six?

In [None]:
k = 6

# Draw k distinct row indices at random; those rows become the initial centroids
centroidids = np.random.choice(np.arange(len(data)), size=k, replace=False)
centroids = data[centroidids]

# First assignment of every point to its nearest centroid, then plot it
clusters = update_clusters(data, centroids)
project_and_plot(data, centroids, eigenvecs)

In [None]:
# Let's loop over updating the centroids and plotting
while True:
    # Each line of input runs one more k-means iteration; typing 'stop' ends the loop
    if input() == 'stop':
        break
    centroids = update_centroids(data, k, clusters)
    clusters = update_clusters(data, centroids)
    print(inertia(data, centroids, clusters))
    project_and_plot(data, centroids, eigenvecs)

### In PCA space

If your features are heterogeneous (for example, they have very different ranges), then you will *need* to normalize before running k-means, or the variables with the largest ranges will dominate the distance calculation and therefore the clustering.

In [None]:
# Project the data onto the first three columns of eigenvecs (from uf.pca_with_plots)
v = eigenvecs[:, :3]
projected = data @ v

# Let's pick k projected points at random as the initial centroids
centroidids = np.random.choice(np.arange(len(projected)), size=k, replace=False)
centroids = projected[centroidids]

# The data is already projected, so plot it as-is (project=False)
clusters = update_clusters(projected, centroids)
project_and_plot(projected, centroids, eigenvecs, project=False)

In [None]:
# Let's loop over updating the centroids and plotting
while True:
    # Each line of input runs one more k-means iteration; typing 'stop' ends the loop
    if input() == 'stop':
        break
    centroids = update_centroids(projected, k, clusters)
    clusters = update_clusters(projected, centroids)

    print(inertia(projected, centroids, clusters))
    project_and_plot(projected, centroids, eigenvecs, project=False)

Of course, we want to know how good our clusters are. How can we figure that out without labels? Well, here we do have labels, but each one is a number from 0 to 1 rather than a category. Maybe we could "bin" the labels? Or maybe there are clusters in this data that have nothing to do with danceability!

## Resources
* For a list of lots of clustering algorithms, see https://scikit-learn.org/stable/modules/clustering.html