In [70]:
# Dependencies
# Import required libraries and dependencies
import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from pathlib import Path
import numpy as np

In [71]:
# Importing data
path = Path('../data/genre_data.csv')
df = pd.read_csv(path)

df.head()

Unnamed: 0.1,Unnamed: 0,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,5,alt z,0.645,0.611,8.0,-5.925,0.0,0.137,0.29,2.1e-05,0.237,0.645,157.475
1,9,dance pop,0.759,0.699,0.0,-5.745,0.0,0.0307,0.202,0.000131,0.443,0.907,92.96
2,14,singer-songwriter pop,0.83,0.414,3.0,-7.387,1.0,0.148,0.497,0.0,0.187,0.797,87.99
3,16,singer-songwriter pop,0.459,0.214,5.0,-10.66,1.0,0.0403,0.634,0.0,0.125,0.397,163.816
4,17,alt z,0.639,0.724,7.0,-6.346,1.0,0.0664,0.452,0.0,0.159,0.522,129.712


In [72]:
df.drop(columns=['Unnamed: 0'], inplace=True)
df.head()

Unnamed: 0,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,alt z,0.645,0.611,8.0,-5.925,0.0,0.137,0.29,2.1e-05,0.237,0.645,157.475
1,dance pop,0.759,0.699,0.0,-5.745,0.0,0.0307,0.202,0.000131,0.443,0.907,92.96
2,singer-songwriter pop,0.83,0.414,3.0,-7.387,1.0,0.148,0.497,0.0,0.187,0.797,87.99
3,singer-songwriter pop,0.459,0.214,5.0,-10.66,1.0,0.0403,0.634,0.0,0.125,0.397,163.816
4,alt z,0.639,0.724,7.0,-6.346,1.0,0.0664,0.452,0.0,0.159,0.522,129.712


In [73]:
df.set_index('genre', inplace=True)
df.head()

Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
alt z,0.645,0.611,8.0,-5.925,0.0,0.137,0.29,2.1e-05,0.237,0.645,157.475
dance pop,0.759,0.699,0.0,-5.745,0.0,0.0307,0.202,0.000131,0.443,0.907,92.96
singer-songwriter pop,0.83,0.414,3.0,-7.387,1.0,0.148,0.497,0.0,0.187,0.797,87.99
singer-songwriter pop,0.459,0.214,5.0,-10.66,1.0,0.0403,0.634,0.0,0.125,0.397,163.816
alt z,0.639,0.724,7.0,-6.346,1.0,0.0664,0.452,0.0,0.159,0.522,129.712


In [74]:
# Use the `StandardScaler()` module from scikit-learn to normalize the data from the CSV file
data_scaled = StandardScaler().fit_transform(df)

In [75]:
data_scaled[:5]

array([[ 0.15437895,  0.11892202,  0.77789829,  0.4202189 , -1.23185239,
         0.76220316, -0.16633608, -0.41940106,  0.50764954,  0.85531226,
         1.37687301],
       [ 0.96969697,  0.48181997, -1.47912744,  0.45637079, -1.23185239,
        -0.63325824, -0.43950181, -0.41902298,  2.13945852,  1.99567627,
        -0.89907298],
       [ 1.47748276, -0.69347455, -0.63274279,  0.12658519,  0.81178557,
         0.9066065 ,  0.4762242 , -0.4194712 ,  0.1115794 ,  1.51689749,
        -1.07440355],
       [-1.17587678, -1.51824264, -0.06848636, -0.53077672,  0.81178557,
        -0.5072335 ,  0.90149357, -0.4194712 , -0.37954757, -0.22411628,
         1.60056942],
       [ 0.11146747,  0.58491598,  0.49577008,  0.33566364,  0.81178557,
        -0.16460375,  0.33653719, -0.4194712 , -0.11021988,  0.31995052,
         0.397456  ]])

In [76]:
# Create a DataFrame with the scaled data
df_scaled = pd.DataFrame(data_scaled, columns=df.columns)

In [77]:
# Create a list with the number of k-values from 1 to 11
scaled_k = list(range(1,11))

In [78]:
# Create an empty list to store the inertia values
scaled_inertia = []

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_market_data_scaled`
# 3. Append the model.inertia_ to the inertia list
for i in scaled_k:
    k_model = KMeans(n_clusters=i, random_state=42)
    k_model.fit(df_scaled)
    scaled_inertia.append(k_model.inertia_)

In [79]:
# Create a dictionary with the data to plot the Elbow curve
scaled_elbow_data = {'k':scaled_k,'inertia':scaled_inertia}

# Create a DataFrame with the data to plot the Elbow curve
scaled_elbow_df = pd.DataFrame(scaled_elbow_data)

In [80]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
scaled_elbow_plot = scaled_elbow_df.hvplot.line(
    x="k", 
    y="inertia", 
    title="Scaled Elbow Curve", 
    xticks=scaled_k)
scaled_elbow_plot

In [81]:
# Initialize the K-Means model using the best value for k
scaled_model = KMeans(n_clusters=4, random_state=42)

In [82]:
# Fit the K-Means model using the scaled data
scaled_model.fit(df_scaled)

In [83]:
# Predict the clusters to group the cryptocurrencies using the scaled data
scaled_k_4 = scaled_model.predict(df_scaled)

# Print the resulting array of cluster values.
scaled_k_4

array([3, 3, 2, ..., 2, 3, 1])

In [84]:
# Create a copy of the DataFrame
scaled_predictions = df_scaled.copy()

In [85]:
# Add a new column to the DataFrame with the predicted clusters
scaled_predictions['clusters'] = scaled_k_4

# Display sample data
scaled_predictions.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,clusters
0,0.154379,0.118922,0.777898,0.420219,-1.231852,0.762203,-0.166336,-0.419401,0.50765,0.855312,1.376873,3
1,0.969697,0.48182,-1.479127,0.456371,-1.231852,-0.633258,-0.439502,-0.419023,2.139459,1.995676,-0.899073,3
2,1.477483,-0.693475,-0.632743,0.126585,0.811786,0.906607,0.476224,-0.419471,0.111579,1.516897,-1.074404,2
3,-1.175877,-1.518243,-0.068486,-0.530777,0.811786,-0.507234,0.901494,-0.419471,-0.379548,-0.224116,1.600569,1
4,0.111467,0.584916,0.49577,0.335664,0.811786,-0.164604,0.336537,-0.419471,-0.11022,0.319951,0.397456,2


In [86]:
# Create a PCA model instance
pca = PCA(n_components=2)

In [87]:
key_features = scaled_predictions[['energy','acousticness','loudness','instrumentalness','clusters']]

# Use the PCA model with `fit_transform`
spotify_pca = pca.fit_transform(key_features)

# View the first five rows of the DataFrame. 
spotify_pca[:5]


array([[-0.97989333, -0.25974673],
       [-1.29925877, -0.03889177],
       [ 0.28913609, -0.77838331],
       [ 1.6446311 , -1.16879954],
       [-0.49944738, -0.34102069]])

In [88]:
# Retrieve the explained variance to determine how much information 
# can be attributed to each principal component.
amountdata_explained = pca.explained_variance_ratio_

print(f"PCA info: {amountdata_explained}")
print(f"Explained Variance: {sum(amountdata_explained)}")

PCA info: [0.69236076 0.15996367]
Explained Variance: 0.8523244261709437


In [89]:
# Create the PCA DataFrame
pca_df = pd.DataFrame(
    spotify_pca,
    columns=["PCA1", "PCA2"]
)

# Review the PCA DataFrame
pca_df.head()

Unnamed: 0,PCA1,PCA2
0,-0.979893,-0.259747
1,-1.299259,-0.038892
2,0.289136,-0.778383
3,1.644631,-1.1688
4,-0.499447,-0.341021


In [90]:
# add genre to pca df
pca_df = pd.concat([df.reset_index(),pca_df],axis='columns')[['genre','PCA1','PCA2']].set_index('genre')
pca_df.head()

Unnamed: 0_level_0,PCA1,PCA2
genre,Unnamed: 1_level_1,Unnamed: 2_level_1
alt z,-0.979893,-0.259747
dance pop,-1.299259,-0.038892
singer-songwriter pop,0.289136,-0.778383
singer-songwriter pop,1.644631,-1.1688
alt z,-0.499447,-0.341021


In [91]:
# elbow method with pca

# Create a a list to store inertia values and the values of k
inertia = []
k = list(range(1, 11))

# Append the value of the computed inertia from the `inertia_` attribute of teh KMeans model instance
for i in k:
    k_model = KMeans(n_clusters=i, random_state=42)
    k_model.fit(pca_df)
    inertia.append(k_model.inertia_)

# Define a DataFrame to hold the values for k and the corresponding inertia
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

# Review the DataFrame
df_elbow.head()

Unnamed: 0,k,inertia
0,1,13215.10966
1,2,5902.224093
2,3,2815.892665
3,4,2015.116533
4,5,1284.241782


In [92]:
 # Plot the Elbow Curve
df_elbow.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=k
)

In [93]:
# Define the model with 3 clusters
model = KMeans(n_clusters=3, random_state=42)

# Fit the model
model.fit(pca_df)

# Make predictions
k_3 = model.predict(pca_df)

# Create a copy of the PCA DataFrame
song_pca_predictions_df = pca_df.copy()

# Add a class column with the labels
song_pca_predictions_df["predictions"] = k_3

In [94]:
# Plot the clusters
song_pca_predictions_df.hvplot.scatter(
    x="PCA1",
    y="PCA2",
    by="predictions",
    hover_cols=['genre']
)

In [95]:
pca_df.head()

Unnamed: 0_level_0,PCA1,PCA2
genre,Unnamed: 1_level_1,Unnamed: 2_level_1
alt z,-0.979893,-0.259747
dance pop,-1.299259,-0.038892
singer-songwriter pop,0.289136,-0.778383
singer-songwriter pop,1.644631,-1.1688
alt z,-0.499447,-0.341021


In [96]:
spca = PCA(n_components=2)
spca.fit(key_features)
print('energy','acousticness','loudness','instrumentalness','clusters')
print(spca.components_)

energy acousticness loudness instrumentalness clusters
[[-0.48573831  0.45877056 -0.49520128  0.34611262 -0.43424602]
 [ 0.31864356 -0.40041962 -0.12398083  0.84926551  0.03882328]]


In [97]:
# Plot the clusters
song_pca_predictions_df.hvplot.scatter(
    x="PCA1",
    y="PCA2",
    by="predictions",
    hover_cols=['genre']
)

In [100]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict
import os

In [120]:
import requests
import json

In [121]:
url = "https://developer.spotify.com"

In [122]:
print(requests.get(url))

<Response [200]>


In [123]:
print(requests.get(url).json())

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [117]:
client_id = os.environ.get('client_id')

In [118]:
client_id

In [111]:
client_secret=os.environ.get('2a1756b4d1fe41efa1523507a17d6261')

In [119]:
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id,
                                                           client_secret))

SpotifyOauthError: No client_id. Pass it or set a SPOTIPY_CLIENT_ID environment variable.

In [None]:
def find_song(name, year):
    song_data = defaultdict()
    results = sp.search(q= 'track: {} year: {}'.format(name,year), limit=1)
    if results['tracks']['items'] == []:
        return None

    results = results['tracks']['items'][0]
    track_id = results['id']
    audio_features = sp.audio_features(track_id)[0]

    song_data['name'] = [name]
    song_data['year'] = [year]
    song_data['explicit'] = [int(results['explicit'])]
    song_data['duration_ms'] = [results['duration_ms']]
    song_data['popularity'] = [results['popularity']]

    for key, value in audio_features.items():
        song_data[key] = value

    return pd.DataFrame(song_data)