# SPOTIFY CLUSTERRING

**IMPORT LIBRARIES**

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.cluster import KMeans 
from scipy.stats import zscore

In [2]:
df = pd.read_csv('spotify.csv')

**LOADING THE DATA**

In [4]:
df.head()

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,2oCs0DGTsRO98Gh5ZSl2Cx,I Don't Care (with Justin Bieber) [Loud Luxury...,2019-06-14,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,6,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036,194754
1,0r7CVbZTWZgbTCYdfa2P31,Memories - Dillon Francis Remix,Maroon 5,67,63rPSO264uRjW1X5E6cWv6,Memories (Dillon Francis Remix),2019-12-13,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,11,-4.969,1,0.0373,0.0724,0.00421,0.357,0.693,99.972,162600
2,1z1Hg7Vb0AhHDiEmnDE79l,All the Time - Don Diablo Remix,Zara Larsson,70,1HoSmj2eLcsrR0vE9gThr4,All the Time (Don Diablo Remix),2019-07-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,-3.432,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,176616
3,75FpbthrwQmzHlBJLuGdC7,Call You Mine - Keanu Silva Remix,The Chainsmokers,60,1nqYsOef1yKKuGOVchbsk6,Call You Mine - The Remixes,2019-07-19,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,7,-3.778,1,0.102,0.0287,9e-06,0.204,0.277,121.956,169093
4,1e8PAfcKUYoKkxPhrHqw4x,Someone You Loved - Future Humans Remix,Lewis Capaldi,69,7m7vv9wlQ4i0LFuJiE2zsQ,Someone You Loved (Future Humans Remix),2019-03-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,-4.672,1,0.0359,0.0803,0.0,0.0833,0.725,123.976,189052


In [5]:
df.shape

(32833, 23)

In [6]:
df.dtypes

track_id                     object
track_name                   object
track_artist                 object
track_popularity              int64
track_album_id               object
track_album_name             object
track_album_release_date     object
playlist_name                object
playlist_id                  object
playlist_genre               object
playlist_subgenre            object
danceability                float64
energy                      float64
key                           int64
loudness                    float64
mode                          int64
speechiness                 float64
acousticness                float64
instrumentalness            float64
liveness                    float64
valence                     float64
tempo                       float64
duration_ms                   int64
dtype: object

In [7]:
df.isnull().sum()

track_id                    0
track_name                  5
track_artist                5
track_popularity            0
track_album_id              0
track_album_name            5
track_album_release_date    0
playlist_name               0
playlist_id                 0
playlist_genre              0
playlist_subgenre           0
danceability                0
energy                      0
key                         0
loudness                    0
mode                        0
speechiness                 0
acousticness                0
instrumentalness            0
liveness                    0
valence                     0
tempo                       0
duration_ms                 0
dtype: int64

In [8]:
df = df.dropna()

In [9]:
df.isnull().sum()

track_id                    0
track_name                  0
track_artist                0
track_popularity            0
track_album_id              0
track_album_name            0
track_album_release_date    0
playlist_name               0
playlist_id                 0
playlist_genre              0
playlist_subgenre           0
danceability                0
energy                      0
key                         0
loudness                    0
mode                        0
speechiness                 0
acousticness                0
instrumentalness            0
liveness                    0
valence                     0
tempo                       0
duration_ms                 0
dtype: int64

In [10]:
df.dtypes

track_id                     object
track_name                   object
track_artist                 object
track_popularity              int64
track_album_id               object
track_album_name             object
track_album_release_date     object
playlist_name                object
playlist_id                  object
playlist_genre               object
playlist_subgenre            object
danceability                float64
energy                      float64
key                           int64
loudness                    float64
mode                          int64
speechiness                 float64
acousticness                float64
instrumentalness            float64
liveness                    float64
valence                     float64
tempo                       float64
duration_ms                   int64
dtype: object

In [11]:
df = df.drop(['track_album_id','playlist_id'],axis=1)

In [12]:
dummy = pd.get_dummies(['track_name', 'track_artist','track_album_name','playlist_name','playlist_genre'])

In [13]:
df = pd.concat([df, dummy], axis=1)

In [14]:
df.dtypes

track_id                     object
track_name                   object
track_artist                 object
track_popularity              int64
track_album_name             object
track_album_release_date     object
playlist_name                object
playlist_genre               object
playlist_subgenre            object
danceability                float64
energy                      float64
key                           int64
loudness                    float64
mode                          int64
speechiness                 float64
acousticness                float64
instrumentalness            float64
liveness                    float64
valence                     float64
tempo                       float64
duration_ms                   int64
playlist_genre               object
playlist_name                object
track_album_name             object
track_artist                 object
track_name                   object
dtype: object

In [15]:
df.head()

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_name,track_album_release_date,playlist_name,playlist_genre,playlist_subgenre,danceability,...,instrumentalness,liveness,valence,tempo,duration_ms,playlist_genre.1,playlist_name.1,track_album_name.1,track_artist.1,track_name.1
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,I Don't Care (with Justin Bieber) [Loud Luxury...,2019-06-14,Pop Remix,pop,dance pop,0.748,...,0.0,0.0653,0.518,122.036,194754,False,False,False,False,True
1,0r7CVbZTWZgbTCYdfa2P31,Memories - Dillon Francis Remix,Maroon 5,67,Memories (Dillon Francis Remix),2019-12-13,Pop Remix,pop,dance pop,0.726,...,0.00421,0.357,0.693,99.972,162600,False,False,False,True,False
2,1z1Hg7Vb0AhHDiEmnDE79l,All the Time - Don Diablo Remix,Zara Larsson,70,All the Time (Don Diablo Remix),2019-07-05,Pop Remix,pop,dance pop,0.675,...,2.3e-05,0.11,0.613,124.008,176616,False,False,True,False,False
3,75FpbthrwQmzHlBJLuGdC7,Call You Mine - Keanu Silva Remix,The Chainsmokers,60,Call You Mine - The Remixes,2019-07-19,Pop Remix,pop,dance pop,0.718,...,9e-06,0.204,0.277,121.956,169093,False,True,False,False,False
4,1e8PAfcKUYoKkxPhrHqw4x,Someone You Loved - Future Humans Remix,Lewis Capaldi,69,Someone You Loved (Future Humans Remix),2019-03-05,Pop Remix,pop,dance pop,0.65,...,0.0,0.0833,0.725,123.976,189052,True,False,False,False,False


In [16]:
df.columns

Index(['track_id', 'track_name', 'track_artist', 'track_popularity',
       'track_album_name', 'track_album_release_date', 'playlist_name',
       'playlist_genre', 'playlist_subgenre', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'duration_ms', 'playlist_genre',
       'playlist_name', 'track_album_name', 'track_artist', 'track_name'],
      dtype='object')

In [17]:
df = df.drop(['track_id','track_album_release_date','playlist_subgenre','track_name', 'track_artist','track_album_name','playlist_name','playlist_genre'],axis=1)

In [18]:
df.columns

Index(['track_popularity', 'danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'duration_ms'],
      dtype='object')

In [19]:
df.dtypes

track_popularity      int64
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
duration_ms           int64
dtype: object

In [20]:
df = df.iloc[:,1:]
dfScaled = df.apply(zscore)

In [21]:
dfScaled

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,0.642015,1.201668,0.173348,1.367040,0.876132,-0.481253,-0.333961,-0.377985,-0.809246,0.031929,0.042833,-0.518802
1,0.490384,0.643388,1.557808,0.585736,0.876132,-0.688547,-0.468726,-0.359210,1.081094,0.782525,-0.777291,-1.056175
2,0.138877,1.284580,-1.211111,1.100025,-1.141381,-0.324301,-0.436856,-0.377881,-0.519571,0.439395,0.116133,-0.821933
3,0.435246,1.279053,0.450240,0.984251,0.876132,-0.049884,-0.667687,-0.377943,0.089589,-1.001750,0.039860,-0.947661
4,-0.033430,0.742884,-1.211111,0.685114,0.876132,-0.702366,-0.432758,-0.377985,-0.692598,0.919777,0.114944,-0.614097
...,...,...,...,...,...,...,...,...,...,...,...,...
32828,-1.563520,1.234833,-0.934219,1.641416,0.876132,-0.132801,-0.449604,-0.377985,-0.799525,-1.289121,0.270836,-0.358012
32829,-0.915644,0.483091,-1.488003,0.755381,0.876132,-0.642152,-0.790570,-0.358943,1.197742,-0.474188,0.266041,2.127884
32830,-0.867398,0.676554,0.173348,0.609159,-1.141381,-0.581938,-0.306644,-0.377980,-0.260354,-0.319779,0.264108,-0.262132
32831,-0.198845,1.046897,-0.934219,1.123782,0.876132,0.019214,-0.762296,0.188369,0.990369,-0.868787,0.264814,2.367073


In [None]:
sns.pairplot(dfScaled,diag_kind='kde')

<seaborn.axisgrid.PairGrid at 0x1fa778e8890>

In [None]:
OMP_NUM_THREADS=1
from scipy.spatial.distance import cdist  # compute distance between each pai of 2 colletion of imputs 
clusters = range(1,10)
meanDistortion=[]


for k in clusters :  
    model = KMeans(n_clusters=k,n_init=10)
    
    model.fit(dfScaled)
    prediction = model.predict(dfScaled)
    
    meanDistortion.append(sum(np.min(cdist(dfScaled,model.cluster_centers_,'euclidean'),axis=1))/dfScaled.shape[0])

In [None]:
meanDistortion

In [None]:
plt.plot(clusters,meanDistortion,'bx-')
plt.xlabel('k')
plt.ylabel('Mean distortion')
plt.title("Selecting k with elbow method")

In [None]:
final_model = KMeans(2,n_init=10)
final_model.fit(dfScaled)
prediction = final_model.predict(dfScaled)

In [None]:
# Append the prediction 

df['Clusters']= prediction
df.head()

In [None]:
df['Clusters'].value_counts()

In [None]:
df[df['Clusters'] == 1]

In [None]:
df[df['Clusters'] == 1]

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Create a sample dataset with two clusters
np.random.seed(42)
cluster_1 = np.random.normal(loc=[5, 5], scale=1.0, size=(50, 2))
cluster_2 = np.random.normal(loc=[15, 15], scale=1.0, size=(50, 2))

data = np.vstack((cluster_1, cluster_2))
df = pd.DataFrame(data, columns=['x', 'y'])
df['cluster'] = [0] * 50 + [1] * 50  # Assign cluster labels

# Step 2: Visualize the clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='x', y='y', hue='cluster', palette='viridis', style='cluster', s=100)
plt.title('Visualization of Two Clusters')
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.legend(title='Cluster')
plt.show()
