In [1]:
# Dependencies
import pandas as pd
from pathlib import Path

# begin pca stuff
from sklearn.decomposition import PCA
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [2]:
# Importing data
path = Path('Resources/Spotify_Song_Attributes.csv')
df = pd.read_csv(path)

In [3]:
# Dropping irrelevant columns
df.drop(columns=['type','uri','track_href','analysis_url','msPlayed','duration_ms'],inplace=True)

In [4]:
# Dropping nulls
df.dropna(inplace=True)

In [5]:
# Value count on genre
df['genre'].value_counts()[:10]

genre
alt z                    656
pop                      602
filmi                    412
dance pop                172
singer-songwriter pop    164
alternative metal        150
anime lo-fi              136
art pop                  126
drift phonk              124
brostep                  116
Name: count, dtype: int64

In [6]:
number_genres = df['genre'].nunique()
number_genres

523

In [7]:
df.head()

Unnamed: 0,trackName,artistName,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,time_signature
1,"""In The Hall Of The Mountain King"" from Peer G...",London Symphony Orchestra,british orchestra,0.475,0.13,7.0,-17.719,1.0,0.051,0.916,0.956,0.101,0.122,112.241,14Qcrx6Dfjvcj0H8oV8oUW,4.0
2,#BrooklynBloodPop!,SyKo,glitchcore,0.691,0.814,1.0,-3.788,0.0,0.117,0.0164,0.0,0.366,0.509,132.012,7K9Z3yFNNLv5kwTjQYGjnu,4.0
3,$10,Good Morning,experimental pop,0.624,0.596,4.0,-9.804,1.0,0.0314,0.475,0.203,0.119,0.896,120.969,3koAwrM1RO0TGMeQJ3qt9J,4.0
4,(I Just) Died In Your Arms,Cutting Crew,album rock,0.625,0.726,11.0,-11.402,0.0,0.0444,0.0158,0.000169,0.0625,0.507,124.945,4ByEFOBuLXpCqvO1kw8Wdm,4.0
5,(L)only Child,salem ilese,alt z,0.645,0.611,8.0,-5.925,0.0,0.137,0.29,2.1e-05,0.237,0.645,157.475,22lJaG2yxlSjIwdUIddcFk,3.0


In [8]:
pca_df_to_work_with = df.drop(columns=['trackName', 'artistName', 'id','time_signature'])
pca_df_to_work_with.head()

Unnamed: 0,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
1,british orchestra,0.475,0.13,7.0,-17.719,1.0,0.051,0.916,0.956,0.101,0.122,112.241
2,glitchcore,0.691,0.814,1.0,-3.788,0.0,0.117,0.0164,0.0,0.366,0.509,132.012
3,experimental pop,0.624,0.596,4.0,-9.804,1.0,0.0314,0.475,0.203,0.119,0.896,120.969
4,album rock,0.625,0.726,11.0,-11.402,0.0,0.0444,0.0158,0.000169,0.0625,0.507,124.945
5,alt z,0.645,0.611,8.0,-5.925,0.0,0.137,0.29,2.1e-05,0.237,0.645,157.475


In [55]:
# Geeks for Geeks; sidgautam
top_15 = pca_df_to_work_with['genre'].value_counts().head(15) 
top_15 = top_15.keys()
top_15.tolist()


['alt z',
 'pop',
 'filmi',
 'dance pop',
 'singer-songwriter pop',
 'alternative metal',
 'anime lo-fi',
 'art pop',
 'drift phonk',
 'brostep',
 'modern alternative rock',
 'lo-fi study',
 'edm',
 'anime',
 'chill pop']

In [60]:
# Saturn Cloud
mask = pca_df_to_work_with['genre'].isin(top_15)
mask

1        False
2        False
3        False
4        False
5         True
         ...  
10074     True
10075    False
10077    False
10078     True
10079    False
Name: genre, Length: 8580, dtype: bool

In [61]:
#Saturn Cloud
pca_df_to_work_with2 = pca_df_to_work_with[mask]
pca_df_to_work_with2

Unnamed: 0,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
5,alt z,0.645,0.611,8.0,-5.925,0.0,0.1370,0.290,0.000021,0.237,0.645,157.475
9,dance pop,0.759,0.699,0.0,-5.745,0.0,0.0307,0.202,0.000131,0.443,0.907,92.960
14,singer-songwriter pop,0.830,0.414,3.0,-7.387,1.0,0.1480,0.497,0.000000,0.187,0.797,87.990
16,singer-songwriter pop,0.459,0.214,5.0,-10.660,1.0,0.0403,0.634,0.000000,0.125,0.397,163.816
17,alt z,0.639,0.724,7.0,-6.346,1.0,0.0664,0.452,0.000000,0.159,0.522,129.712
...,...,...,...,...,...,...,...,...,...,...,...,...
10069,art pop,0.324,0.416,11.0,-8.920,0.0,0.0368,0.262,0.000037,0.110,0.151,113.986
10070,pop,0.799,0.539,1.0,-6.351,1.0,0.0421,0.199,0.000017,0.165,0.394,136.948
10073,pop,0.784,0.845,3.0,-2.793,1.0,0.0596,0.286,0.000016,0.074,0.888,105.981
10074,alt z,0.745,0.477,11.0,-7.706,0.0,0.0880,0.202,0.000000,0.120,0.454,136.055


In [63]:
pca_df2 = pca_df_to_work_with2.set_index('genre')
pca_df2

Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
alt z,0.645,0.611,8.0,-5.925,0.0,0.1370,0.290,0.000021,0.237,0.645,157.475
dance pop,0.759,0.699,0.0,-5.745,0.0,0.0307,0.202,0.000131,0.443,0.907,92.960
singer-songwriter pop,0.830,0.414,3.0,-7.387,1.0,0.1480,0.497,0.000000,0.187,0.797,87.990
singer-songwriter pop,0.459,0.214,5.0,-10.660,1.0,0.0403,0.634,0.000000,0.125,0.397,163.816
alt z,0.639,0.724,7.0,-6.346,1.0,0.0664,0.452,0.000000,0.159,0.522,129.712
...,...,...,...,...,...,...,...,...,...,...,...
art pop,0.324,0.416,11.0,-8.920,0.0,0.0368,0.262,0.000037,0.110,0.151,113.986
pop,0.799,0.539,1.0,-6.351,1.0,0.0421,0.199,0.000017,0.165,0.394,136.948
pop,0.784,0.845,3.0,-2.793,1.0,0.0596,0.286,0.000016,0.074,0.888,105.981
alt z,0.745,0.477,11.0,-7.706,0.0,0.0880,0.202,0.000000,0.120,0.454,136.055


In [9]:
pca_df_to_work_with["genre"].value_counts()

genre
alt z                         656
pop                           602
filmi                         412
dance pop                     172
singer-songwriter pop         164
                             ... 
children's folk                 2
detroit indie                   2
celtic rock                     2
electro                         2
australian alternative pop      2
Name: count, Length: 523, dtype: int64

In [10]:
pca_df = pca_df_to_work_with.set_index("genre")
pca_df.head()

Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
british orchestra,0.475,0.13,7.0,-17.719,1.0,0.051,0.916,0.956,0.101,0.122,112.241
glitchcore,0.691,0.814,1.0,-3.788,0.0,0.117,0.0164,0.0,0.366,0.509,132.012
experimental pop,0.624,0.596,4.0,-9.804,1.0,0.0314,0.475,0.203,0.119,0.896,120.969
album rock,0.625,0.726,11.0,-11.402,0.0,0.0444,0.0158,0.000169,0.0625,0.507,124.945
alt z,0.645,0.611,8.0,-5.925,0.0,0.137,0.29,2.1e-05,0.237,0.645,157.475


In [11]:
# Instantiate the PCA instance and declare the number of PCA variables
pca = PCA(n_components=3)

In [67]:
df_pca2 = pca.fit_transform(pca_df2)
df_pca2[:5]

array([[-39.08089811,  -0.73119004,   2.73493545],
       [ 25.3874272 ,  -3.10281663,  -5.28923631],
       [ 30.41331592,  -1.68886937,  -2.27139648],
       [-45.24323129,   4.31530579,  -0.17502281],
       [-11.32081254,  -1.30015066,   1.71948748]])

In [69]:
amountdata_explained2 = pca.explained_variance_ratio_
amountdata_explained2

array([0.95615071, 0.02838129, 0.01492738])

In [70]:
# Fit the PCA model on the transformed credit card DataFrame
df_pca = pca.fit_transform(pca_df)

# Review the first 5 rows of list data
df_pca[:5]

array([[  7.21235966,   8.94620249,   1.77528275],
       [-13.02052186,  -4.35497414,  -4.2571762 ],
       [ -1.78084845,   1.28612518,  -1.25047078],
       [ -5.70759488,   2.97511995,   5.76714394],
       [-38.40171827,  -1.35399245,   2.72496699]])

In [71]:
 # Calculate the PCA explained variance ratio
amountdata_explained = pca.explained_variance_ratio_
amountdata_explained

array([0.95365023, 0.03129739, 0.01451048])

In [14]:
# sum of variance accounted for
print(f"PCA info: {amountdata_explained}")
print(f"Explained Variance: {sum(amountdata_explained)}")

PCA info: [0.95365023 0.03129739 0.01451048]
Explained Variance: 0.9994580988691859


In [84]:
song_pca_df2 = pd.DataFrame(
    df_pca2,
    columns = ['PCA1','PCA2','PCA3']
)

In [85]:
 # Create the PCA DataFrame
song_pca_df = pd.DataFrame(
    df_pca,
    columns=["PCA1", "PCA2", "PCA3"]
)


In [86]:
song_pca_df2.head()

Unnamed: 0,PCA1,PCA2,PCA3
0,-39.080898,-0.73119,2.734935
1,25.387427,-3.102817,-5.289236
2,30.413316,-1.688869,-2.271396
3,-45.243231,4.315306,-0.175023
4,-11.320813,-1.300151,1.719487


In [87]:
song_pca_df.head()

Unnamed: 0,PCA1,PCA2,PCA3
0,7.21236,8.946202,1.775283
1,-13.020522,-4.354974,-4.257176
2,-1.780848,1.286125,-1.250471
3,-5.707595,2.97512,5.767144
4,-38.401718,-1.353992,2.724967


In [88]:
song_pca_df2.drop(columns=["PCA3"], inplace=True)
song_pca_df2.head()

Unnamed: 0,PCA1,PCA2
0,-39.080898,-0.73119
1,25.387427,-3.102817
2,30.413316,-1.688869
3,-45.243231,4.315306
4,-11.320813,-1.300151


In [89]:
song_pca_df.drop(columns=["PCA3"], inplace=True)
song_pca_df.head()

Unnamed: 0,PCA1,PCA2
0,7.21236,8.946202
1,-13.020522,-4.354974
2,-1.780848,1.286125
3,-5.707595,2.97512
4,-38.401718,-1.353992


In [90]:
pca_with_genre2 = pd.concat([pca_df_to_work_with2, song_pca_df2], axis="columns")[["genre", "PCA1", "PCA2"]].set_index("genre")
pca_with_genre2.head()

Unnamed: 0_level_0,PCA1,PCA2
genre,Unnamed: 1_level_1,Unnamed: 2_level_1
alt z,4.387375,-2.479845
dance pop,-25.564251,0.132
singer-songwriter pop,23.339888,-3.261445
singer-songwriter pop,23.296996,-2.983431
alt z,-23.60458,-1.587518


In [19]:
pca_with_genre = pd.concat([pca_df_to_work_with, song_pca_df], axis="columns")[["genre", "PCA1", "PCA2"]].set_index("genre")
pca_with_genre.head()

Unnamed: 0_level_0,PCA1,PCA2
genre,Unnamed: 1_level_1,Unnamed: 2_level_1
british orchestra,-13.020522,-4.354974
glitchcore,-1.780848,1.286125
experimental pop,-5.707595,2.97512
album rock,-38.401718,-1.353992
alt z,0.984533,-3.926671


In [91]:
pca_with_genre_index2 = pca_with_genre2.index.tolist()
pca_with_genre_index2

['alt z',
 'dance pop',
 'singer-songwriter pop',
 'singer-songwriter pop',
 'alt z',
 'alt z',
 'alternative metal',
 'pop',
 'pop',
 'singer-songwriter pop',
 'modern alternative rock',
 'pop',
 'singer-songwriter pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'alt z',
 'pop',
 'filmi',
 'art pop',
 'alt z',
 'singer-songwriter pop',
 'alt z',
 'alt z',
 'pop',
 'alt z',
 'alt z',
 'pop',
 'art pop',
 'edm',
 'alt z',
 'alt z',
 'alternative metal',
 'lo-fi study',
 'pop',
 'pop',
 'singer-songwriter pop',
 'filmi',
 'filmi',
 'filmi',
 'filmi',
 'filmi',
 'filmi',
 'filmi',
 'filmi',
 'filmi',
 'filmi',
 'pop',
 'lo-fi study',
 'alt z',
 'chill pop',
 'filmi',
 'modern alternative rock',
 'brostep',
 'anime',
 'pop',
 'alt z',
 'singer-songwriter pop',
 'filmi',
 'filmi',
 'filmi',
 'filmi',
 'pop',
 'pop',
 'alternative metal',
 'brostep',
 'dance pop',
 'brostep',
 'brostep',
 'art pop',
 'alt z',
 'pop',
 'dance pop',
 'pop',
 'pop',
 'filmi',
 'filmi',
 'anime lo-fi',
 'anime lo

In [20]:
pca_with_genre_index = pca_with_genre.index.tolist()
pca_with_genre_index

['british orchestra',
 'glitchcore',
 'experimental pop',
 'album rock',
 'alt z',
 'guitar case',
 'cloud rap',
 'dance pop',
 'desi hip hop',
 'lo-fi sleep',
 'contemporary country',
 'bedroom r&b',
 'singer-songwriter pop',
 'singer-songwriter pop',
 'alt z',
 'la pop',
 'lo-fi chill',
 'alt z',
 'orchestral soundtrack',
 'glitchcore',
 'comic',
 'alternative metal',
 'glitchcore',
 'deep underground hip hop',
 'pop',
 'brooklyn drill',
 'classical',
 'american orchestra',
 'classical',
 'pop',
 'singer-songwriter pop',
 'modern alternative rock',
 'pop',
 'scandipop',
 'alabama indie',
 'singer-songwriter pop',
 'stomp and holler',
 'pop',
 'anime score',
 'pop',
 'pop',
 'pop',
 'dfw rap',
 'punjabi pop',
 'folk-pop',
 'acoustic pop',
 'atl hip hop',
 'pop',
 'alt z',
 'pop',
 'filmi',
 'art pop',
 'japanese vgm',
 'sleep',
 'electronica',
 'alt z',
 'singer-songwriter pop',
 'australian pop',
 'danish pop',
 'melodic rap',
 'alt z',
 'boy band',
 'alt z',
 'solipsynthm',
 'pop',


In [92]:
pca_with_genre2.head()

Unnamed: 0_level_0,PCA1,PCA2
genre,Unnamed: 1_level_1,Unnamed: 2_level_1
alt z,4.387375,-2.479845
dance pop,-25.564251,0.132
singer-songwriter pop,23.339888,-3.261445
singer-songwriter pop,23.296996,-2.983431
alt z,-23.60458,-1.587518


In [21]:
pca_with_genre.head()

Unnamed: 0_level_0,PCA1,PCA2
genre,Unnamed: 1_level_1,Unnamed: 2_level_1
british orchestra,-13.020522,-4.354974
glitchcore,-1.780848,1.286125
experimental pop,-5.707595,2.97512
album rock,-38.401718,-1.353992
alt z,0.984533,-3.926671


In [94]:
pca_with_genre2.reset_index(inplace=True)

In [22]:
pca_with_genre.reset_index(inplace=True)

In [95]:
pca_with_genre2.dropna(inplace=True)

In [23]:
pca_with_genre.dropna(inplace=True)

In [96]:
pca_with_genre2.set_index('genre', inplace=True)

In [24]:
pca_with_genre.set_index('genre', inplace=True)

In [97]:
pca_with_genre2.head()

Unnamed: 0_level_0,PCA1,PCA2
genre,Unnamed: 1_level_1,Unnamed: 2_level_1
alt z,4.387375,-2.479845
dance pop,-25.564251,0.132
singer-songwriter pop,23.339888,-3.261445
singer-songwriter pop,23.296996,-2.983431
alt z,-23.60458,-1.587518


In [25]:
pca_with_genre.head()

Unnamed: 0_level_0,PCA1,PCA2
genre,Unnamed: 1_level_1,Unnamed: 2_level_1
british orchestra,-13.020522,-4.354974
glitchcore,-1.780848,1.286125
experimental pop,-5.707595,2.97512
album rock,-38.401718,-1.353992
alt z,0.984533,-3.926671


In [98]:
# elbow method with pca

# Create a a list to store inertia values and the values of k
inertia = []
k = list(range(1, 11))

# Append the value of the computed inertia from the `inertia_` attribute of teh KMeans model instance
for i in k:
    k_model = KMeans(n_clusters=i, random_state=42)
    k_model.fit(pca_with_genre2)
    inertia.append(k_model.inertia_)

# Define a DataFrame to hold the values for k and the corresponding inertia
elbow_data = {"k": k, "inertia": inertia}
df_elbow2 = pd.DataFrame(elbow_data)

# Review the DataFrame
df_elbow2.head()

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Unnamed: 0,k,inertia
0,1,818093.738576
1,2,312008.888462
2,3,163486.890496
3,4,112830.953307
4,5,78562.550732


In [26]:
# elbow method with pca

# Create a a list to store inertia values and the values of k
inertia = []
k = list(range(1, 11))

# Append the value of the computed inertia from the `inertia_` attribute of teh KMeans model instance
for i in k:
    k_model = KMeans(n_clusters=i, random_state=42)
    k_model.fit(pca_with_genre)
    inertia.append(k_model.inertia_)

# Define a DataFrame to hold the values for k and the corresponding inertia
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

# Review the DataFrame
df_elbow.head()

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Unnamed: 0,k,inertia
0,1,6326776.0
1,2,2344691.0
2,3,1195858.0
3,4,830220.7
4,5,589295.8


In [99]:
 # Plot the Elbow Curve
df_elbow2.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=k
)

In [27]:
 # Plot the Elbow Curve
df_elbow.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=k
)

In [100]:
# Define the model with 3 clusters
model = KMeans(n_clusters=3, random_state=42)

# Fit the model
model.fit(pca_with_genre2)

# Make predictions
k_3_2= model.predict(pca_with_genre2)

# Create a copy of the PCA DataFrame
song_pca_predictions_df2 = pca_with_genre2.copy()

# Add a class column with the labels
song_pca_predictions_df2["predictions"] = k_3_2

  super()._check_params_vs_input(X, default_n_init=10)


In [29]:
# Define the model with 3 clusters
model = KMeans(n_clusters=3, random_state=42)

# Fit the model
model.fit(pca_with_genre)

# Make predictions
k_3 = model.predict(pca_with_genre)

# Create a copy of the PCA DataFrame
song_pca_predictions_df = pca_with_genre.copy()

# Add a class column with the labels
song_pca_predictions_df["predictions"] = k_3

  super()._check_params_vs_input(X, default_n_init=10)


In [103]:
# Plot the clusters
song_pca_predictions_df2.hvplot.scatter(
    x="PCA1",
    y="PCA2",
    by="predictions",
    hover_cols=['genre'],
    title='pca'
)

In [30]:
# Plot the clusters
song_pca_predictions_df.hvplot.scatter(
    x="PCA1",
    y="PCA2",
    by="predictions"
)

In [31]:
song_pca_predictions_df.hvplot.scatter(
    x="PCA1",
    y="PCA2",
    by="predictions",
    hover_cols=['genre'],
    title='pca'
)

In [102]:
song_pca_predictions_df2.head()

Unnamed: 0_level_0,PCA1,PCA2,predictions
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
alt z,4.387375,-2.479845,0
dance pop,-25.564251,0.132,1
singer-songwriter pop,23.339888,-3.261445,2
singer-songwriter pop,23.296996,-2.983431,2
alt z,-23.60458,-1.587518,1


In [32]:
song_pca_predictions_df.head()

Unnamed: 0_level_0,PCA1,PCA2,predictions
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
british orchestra,-13.020522,-4.354974,0
glitchcore,-1.780848,1.286125,0
experimental pop,-5.707595,2.97512,0
album rock,-38.401718,-1.353992,2
alt z,0.984533,-3.926671,0
