In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import colorcet as cc
import time
import pickle


# Load the two song datasets; both CSVs already carry the audio features.
# The paths are relative, so the files must sit in the notebook's working directory.
hot_songs, not_hot_songs = (
    pd.read_csv(csv_name)
    for csv_name in ('hot_songs_final.csv', 'not_hot_songs_final.csv')
)



FileNotFoundError: [Errno 2] No such file or directory: 'hot_songs_final.csv'

## 1/ add hot or not column

In [None]:

# Tag every row with its origin before the frames are merged:
# 'H' for songs from the hot list, 'N' for songs from the not-hot list.
for songs_frame, label in ((hot_songs, 'H'), (not_hot_songs, 'N')):
    songs_frame['hot_or_not'] = label

# display(hot_songs)
# print()
# display(not_hot_songs)

## 2/ concat both dataframes 

In [None]:
# Stack both datasets vertically; ignore_index=True renumbers the rows 0..n-1
# so the combined frame has a single continuous index.
frames_to_stack = [hot_songs, not_hot_songs]
hot_or_not = pd.concat(frames_to_stack, ignore_index=True)
hot_or_not.head()

In [None]:
# Remove the "Unnamed: 0" column (presumably the index written out by an earlier to_csv).
# errors="ignore" keeps this cell idempotent: it reassigns `hot_or_not`, so without it
# a second run would raise KeyError because the column is already gone.
hot_or_not = hot_or_not.drop(columns="Unnamed: 0", errors="ignore")

## 3/ Drop audio features that seem irrelevant

In [None]:
# Metadata / bookkeeping columns that are dropped before clustering.
columns_to_drop = ["type", "uri", "track_href", "analysis_url", "duration_ms"]

# drop() returns a new frame; `hot_or_not` itself keeps all of its columns.
hot_or_not_clean = hot_or_not.drop(columns=columns_to_drop)

# Preview the cleaned frame (not the original) so the effect of the drop is visible.
hot_or_not_clean.head()

In [None]:
# Drop the identifier/label columns so that only the remaining feature columns
# are passed on to scaling and clustering.
non_feature_columns = ["id", "hot_or_not", "track_name", "artists"]
hot_or_not_IDless = hot_or_not_clean.drop(columns=non_feature_columns)
hot_or_not_IDless.head()

## 4/ Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column; fitting and transforming happen on the same frame.
scaler = StandardScaler()
hot_or_not_scaled = scaler.fit_transform(hot_or_not_IDless)

# Persist the fitted scaler so the same transformation can be reapplied later.
filename = "hot_or_not_scaler.pickle"
with open(filename, "wb") as file:
    pickle.dump(scaler, file)

# Wrap the scaled array back into a DataFrame, reusing the original column names.
hot_or_not_scaled_df = pd.DataFrame(
    data=hot_or_not_scaled,
    columns=hot_or_not_IDless.columns,
)

display(hot_or_not_scaled_df.head())

## 5/ Test different dimensionality reduction techniques
### a/ Testing PCA

In [None]:
from sklearn.decomposition import PCA

# PCA with no n_components argument; fitted on the scaled features.
pca = PCA()
pca.fit(hot_or_not_scaled_df)
principal_components = pca.transform(hot_or_not_scaled_df)

# Name the output columns PCA_1 ... PCA_n, one per input feature column.
pca_column_names = [
    f"PCA_{component_number}"
    for component_number in range(1, hot_or_not_scaled_df.shape[1] + 1)
]
principal_components_df = pd.DataFrame(principal_components, columns=pca_column_names)
principal_components_df.head()


In [None]:
# Share of the total variance captured by each principal component.
print(pca.explained_variance_ratio_)

# Running total of the explained variance: entry i is the variance captured by the
# first i+1 components. np.cumsum does this in a single pass instead of re-summing
# a growing slice for every component.
cumulated_explained_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
cumulated_explained_variance_ratio

In [None]:
# Derive the x axis from the fitted PCA instead of hard-coding 12 components,
# so the plot keeps working if the set of feature columns changes.
component_numbers = np.arange(1, len(pca.explained_variance_ratio_) + 1)

fig, ax = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: variance explained by each individual component.
ax[0].plot(component_numbers, pca.explained_variance_ratio_)
ax[0].set_xlabel("Principal component")
ax[0].set_ylabel("Explained variance ratio")
ax[0].set_title("Variance explained by each Principal Component")

# Right panel: cumulative variance explained by the first k components.
ax[1].plot(component_numbers, cumulated_explained_variance_ratio)
ax[1].set_title("Acumulated variance explained by Principal Components")
ax[1].set_xlabel("Number of Principal components")
ax[1].set_ylabel("Cumulative explained variance ratio")

plt.tight_layout()
plt.show()

### b/ Testing Isomap

In [None]:
from sklearn.manifold import Isomap

# Isomap embedding into 3 components, using 12 neighbours per point.
iso = Isomap(n_components=3, n_neighbors=12)
iso.fit(hot_or_not_scaled_df)
hot_or_not_isomap_transformed = iso.transform(hot_or_not_scaled_df)

# One column per Isomap component: ISO_1, ISO_2, ISO_3.
isomap_column_names = [f"ISO_{axis_number}" for axis_number in (1, 2, 3)]
hot_or_not_isomap_transformed_df = pd.DataFrame(
    hot_or_not_isomap_transformed,
    columns=isomap_column_names,
)
hot_or_not_isomap_transformed_df.head()

In [None]:
# 3-D scatter plot of the Isomap embedding, one axis per component.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Pull the three embedding coordinates out of the Isomap DataFrame.
x, y, z = (
    hot_or_not_isomap_transformed_df[column_name]
    for column_name in ('ISO_1', 'ISO_2', 'ISO_3')
)

ax.scatter(x, y, z)

# Label each axis with the component it shows.
ax.set_xlabel("ISO_1")
ax.set_ylabel("ISO_2")
ax.set_zlabel("ISO_3")

plt.show()

### c/ Testing UMAP

In [None]:
from umap import UMAP

# 2-component UMAP embedding; random_state is fixed at 42.
reducer = UMAP(random_state=42, n_components=2)
reducer.fit(hot_or_not_scaled_df)

# Project the scaled features and keep the result as a DataFrame with named axes.
hot_or_not_umap_transformed = reducer.transform(hot_or_not_scaled_df)
umap_column_names = ["UMAP_1", "UMAP_2"]
hot_or_not_umap_transformed_df = pd.DataFrame(
    hot_or_not_umap_transformed,
    columns=umap_column_names,
)
hot_or_not_umap_transformed_df.head()

In [None]:
# 2-D scatter plot of the UMAP embedding.
umap_x = hot_or_not_umap_transformed_df['UMAP_1']
umap_y = hot_or_not_umap_transformed_df['UMAP_2']

fig, ax = plt.subplots()
ax.scatter(umap_x, umap_y)
ax.set_xlabel("UMAP_1")
ax.set_ylabel("UMAP_2")
plt.show()

# Clustering
## 1/ Using HDBSCAN

In [None]:
# 6/ Select the clustering method that performs best on our dataset.
# In the UMAP projection, roughly 5 groups appear with sufficient separation and density.



# We are selecting HDBSCAN.

In [None]:
from sklearn.datasets import make_classification
from sklearn.cluster import HDBSCAN
from scipy.spatial.distance import euclidean
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from kneed import KneeLocator

from dbcv import *

In [None]:
# 7/ Fit HDBSCAN on the 2-D UMAP embedding, then persist the *fitted* model.
import os

# HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise)
# is a density-based algorithm that can find clusters of varying shape and density and
# labels points that belong to no cluster as noise (-1). Main parameters:
#   min_cluster_size         - minimum number of samples for a group to count as a cluster;
#                              smaller groupings are left as noise.
#   min_samples              - neighbourhood size used when estimating local density;
#                              larger values make the clustering more conservative
#                              (more points end up as noise).
#   metric                   - distance metric used between points (e.g. euclidean, manhattan).
#   alpha                    - distance scaling parameter as used in robust single linkage.
#   cluster_selection_method - how flat clusters are picked from the hierarchy:
#                              "eom" (Excess of Mass) or "leaf".
#   allow_single_cluster     - if True, a result consisting of one single cluster is allowed.
#
# Other settings tried: HDBSCAN() with defaults, and
# HDBSCAN(min_cluster_size=10, min_samples=5).
model = HDBSCAN(min_cluster_size=30, min_samples=7)

# Fit on the two UMAP coordinates only. A later cell adds a "cluster" column to this
# DataFrame; selecting the columns explicitly keeps this cell safe to re-run.
hot_or_not_clustered = model.fit_predict(
    hot_or_not_umap_transformed_df[["UMAP_1", "UMAP_2"]]
)

# Save the model only after fitting, so the pickle contains the fitted estimator
# rather than an empty, unfitted one.
path = "models/"
if not os.path.exists(path):
    # Create the output directory because it does not exist yet.
    os.makedirs(path)
    print("The new directory is created!")

filename = "HDBSCAN.pkl"
with open(path + filename, "wb") as file:
    pickle.dump(model, file)

# Number of songs per cluster label (-1 is noise).
pd.Series(hot_or_not_clustered).value_counts().sort_index()

In [None]:
# (rows, columns) of the UMAP DataFrame at this point in the notebook.
hot_or_not_umap_transformed_df.shape

In [None]:
# Density-Based Clustering Validation score of the HDBSCAN labelling, computed on the
# two UMAP coordinates only (iloc[:, :2] excludes any label column added later).
# The score is kept at full precision: calling round() without a digits argument
# would collapse it to an integer before the "{:.2f}" formatting is applied.
dbcv_score = DBCV(
    hot_or_not_umap_transformed_df.iloc[:, :2].to_numpy(),
    hot_or_not_clustered,
    dist_function=euclidean,
)
print("The DBCV score is {:.2f}".format(dbcv_score))

In [None]:
# Store the HDBSCAN label of every song next to its UMAP coordinates.
# This adds (or overwrites) the "cluster" column on the existing DataFrame.
hot_or_not_umap_transformed_df["cluster"] = hot_or_not_clustered
hot_or_not_umap_transformed_df.head(5)

In [None]:
# (rows, columns) of the UMAP DataFrame; the preceding cell assigns a "cluster" column to it.
hot_or_not_umap_transformed_df.shape

In [None]:

# Build a high-contrast palette with exactly one colour per cluster label
# (not one per sample, which makes seaborn warn and discard the surplus colours).
cluster_labels = np.unique(hot_or_not_umap_transformed_df["cluster"])
glasbey_colors = sns.color_palette(cc.glasbey, n_colors=len(cluster_labels))

# Map each label to its colour; the noise label -1 is forced to black (0, 0, 0)
# so that unclustered points stand out from the real clusters.
custom_palette = {
    label: ((0., 0., 0.) if label == -1 else color)
    for label, color in zip(cluster_labels, glasbey_colors)
}

sns.scatterplot(
    data=hot_or_not_umap_transformed_df,
    x="UMAP_1",
    y="UMAP_2",
    hue="cluster",
    palette=custom_palette,
);
# Put the legend outside the axes so it does not cover the points.
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()

## Removing the noise: 

In [None]:
# hot_or_not_cluster_df = pd.concat([hot_or_not, hot_or_not_umap_transformed_df['cluster']], axis=1)
# hot_or_not_cluster_df

In [None]:
# hot_or_not_cluster_df_no_noise = hot_or_not_umap_transformed_df[hot_or_not_umap_transformed_df['cluster'] != -1]
# hot_or_not_cluster_df_no_noise

In [None]:
# # fit model and predict clusters again
# model = HDBSCAN(min_cluster_size=30,min_samples=7)

# hot_or_not_cluster_df_no_noise2 = model.fit_predict(hot_or_not_cluster_df_no_noise) # .fit(X_scaled_df)

# pd.Series(hot_or_not_cluster_df_no_noise2).value_counts().sort_index()

In [None]:
# hot_or_not_umap_transformed_df['cluster'] = hot_or_not_cluster_df_no_noise2
# hot_or_not_umap_transformed_df.head()

## 2/ Using K-Means

In [None]:
from sklearn.cluster import KMeans

In [None]:
# K-Means with 9 clusters on the scaled features.
# n_init is pinned explicitly: the library default changes between scikit-learn
# releases (10 -> "auto"), which would otherwise change the resulting clusters
# (and trigger a FutureWarning) depending on the installed version.
kmeans = KMeans(n_clusters=9, n_init=10, random_state=1234)
kmeans.fit(hot_or_not_scaled_df)

In [None]:
# Assign every song to its nearest K-Means centroid.
clusters2 = kmeans.predict(hot_or_not_scaled_df)

# Number of songs in each cluster, ordered by cluster label.
songs_per_cluster = pd.Series(clusters2).value_counts().sort_index()
songs_per_cluster

In [None]:
# Build the plotting frame for K-Means explicitly: the two UMAP coordinates plus the
# K-Means label of each song. Creating it here (instead of relying on a variable left
# over from an earlier kernel session) lets the notebook run top to bottom on a fresh
# kernel. Selecting only the UMAP columns also leaves out the HDBSCAN "cluster" column.
hot_or_not_umap_transformed_df_kmean = (
    hot_or_not_umap_transformed_df[["UMAP_1", "UMAP_2"]]
    .assign(clusters2=clusters2)
)

hot_or_not_umap_transformed_df_kmean.head()

In [None]:
# One high-contrast colour per K-Means cluster. Requesting exactly as many colours as
# there are labels avoids passing seaborn the whole Glasbey list, which it would
# truncate with a warning.
n_kmeans_clusters = hot_or_not_umap_transformed_df_kmean["clusters2"].nunique()
custom_palette = sns.color_palette(cc.glasbey, n_colors=n_kmeans_clusters)

# K-Means labels shown on the UMAP projection.
sns.scatterplot(
    data=hot_or_not_umap_transformed_df_kmean,
    x="UMAP_1",
    y="UMAP_2",
    hue="clusters2",
    palette=custom_palette,
);

plt.show()

In [None]:
# Silhouette score of the K-Means clustering, measured in the same scaled feature space
# the model was fitted on and using the K-Means labels themselves. (Re-fitting the
# HDBSCAN `model` here would score a different algorithm and overwrite its fitted state.)
kmeans_silhouette = silhouette_score(hot_or_not_scaled_df, clusters2)
print("The Silhouette score of the model is: ", round(kmeans_silhouette, 2))

### Finding the most relevant "K"

In [None]:
# Author's conclusion from this search: K=9 looks like the best choice, with a
# silhouette score of about 0.5 (0.5 and above is considered good) and enough granularity.

# Candidate numbers of clusters to evaluate.
K = range(2, 21)

inertia = []
silhouette = []

for k in K:
    print("Training a K-Means model with {} clusters! ".format(k))
    print()
    # Use a loop-local name so the K=9 model stored in `kmeans` by the earlier cell is
    # not silently replaced by the last model of this search (k=20).
    # n_init is pinned so results do not depend on the scikit-learn version's default;
    # verbose output is left off to avoid flooding the cell with per-iteration logs.
    kmeans_k = KMeans(n_clusters=k,
                      n_init=10,
                      random_state=1234)
    kmeans_k.fit(hot_or_not_scaled_df)

    # Persist each candidate model as kmeans_<k>.pickle in the working directory.
    filename = "kmeans_" + str(k) + ".pickle"
    with open(filename, "wb") as file:
        pickle.dump(kmeans_k, file)

    inertia.append(kmeans_k.inertia_)
    silhouette.append(silhouette_score(hot_or_not_scaled_df, kmeans_k.predict(hot_or_not_scaled_df)))

# Left: inertia per k (elbow method). Right: silhouette score per k.
fig, ax = plt.subplots(1, 2, figsize=(16, 8))
ax[0].plot(K, inertia, 'bx-')
ax[0].set_xlabel('k')
ax[0].set_ylabel('inertia')
ax[0].set_xticks(np.arange(min(K), max(K)+1, 1.0))
ax[0].set_title('Elbow Method showing the optimal k')
ax[1].plot(K, silhouette, 'bx-')
ax[1].set_xlabel('k')
ax[1].set_ylabel('silhouette score')
ax[1].set_xticks(np.arange(min(K), max(K)+1, 1.0))
ax[1].set_title('Silhouette Method showing the optimal k')
plt.tight_layout()
plt.show()

## Concat hot_or_not df with cluster

In [None]:
#adding cluster kmean to the original dataframe
hot_or_not["cluster_kmean"] = clusters2
hot_or_not.head()

In [None]:
#Saving the updated dataframe as CSV
hot_or_not.to_csv('hot_or_not_clustered.csv', index=False)

hot_or_not_umap_transformed_df.to_csv('hot_or_not_umap_transformed_df.csv', index=False)

Retrieving notices: ...working... done
Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\antho\anaconda3

  added / updated specs:
    - liblapack


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    libblas-3.9.0              |1_h8933c1f_netlib         193 KB  conda-forge
    liblapack-3.9.0            |5_hd5c7e75_netlib         2.7 MB  conda-forge
    m2w64-gcc-libgfortran-5.3.0|                6         342 KB  conda-forge
    m2w64-gcc-libs-5.3.0       |                7         520 KB  conda-forge
    m2w64-gcc-libs-core-5.3.0  |                7         214 KB  conda-forge
    m2w64-gmp-6.1.0            |                2         726 KB  conda-forge
    scikit-learn-1.3.0         |  py311hf62ec03_1         8.1 MB
    ------------------------------------------------------


The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - conda-forge/noarch::imbalanced-learn==0.11.0=pyhd8ed1ab_0
  - conda-forge/noarch::pynndescent==0.5.11=pyhca7485f_0
  - defaults/win-64::scikit-learn-intelex==2023.1.1=py311haa95532_0
  - conda-forge/win-64::umap-learn==0.5.5=py311h1ea47a8_0
  - defaults/win-64::_anaconda_depends==2023.09=py311_mkl_1


  current version: 23.7.4
  latest version: 23.11.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.11.0


