In [None]:
import copy
import os
import warnings

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import seaborn as sns
from matplotlib import transforms
from scipy import spatial
from scipy.spatial import distance
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
warnings.filterwarnings("ignore") 


In [None]:
# Load the Spotify track-features dataset.
# The CSV location can be overridden through the SPOTIFY_FEATURES_CSV environment
# variable so the notebook also runs on machines other than the author's; the
# original hard-coded path is kept as the fallback, so existing setups still work.
DATA_PATH = os.environ.get(
    "SPOTIFY_FEATURES_CSV",
    "C:\\Users\\Bryan\\Downloads\\SpotifyFeatures.csv",
)
songData = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded dataset.
songData.head()
# NOTE(review): the genre list below does not match the genre filters used later in
# this notebook ('Pop', 'Electronic', genres containing 'R') -- verify it against
# songData['genre'].unique() and update it if it was carried over from another dataset.
#genres in the data set include trap, techno, techhouse, psytrance, darktrap, DnB, Hrdstyle, underground rap, Trap metal, emo,
#rap. RnB, Pop, and HipHop

In [None]:
# Distinct genre labels present in the dataset.
songData.genre.unique()

In [None]:
# Number of rows that carry a (non-missing) genre label.
songData["genre"].notna().sum()

In [None]:
# Summary statistics, transposed so that each feature is a row.
songData.describe().transpose()

In [None]:
# Per-genre mean of every numeric feature.
# numeric_only=True is required: the frame also carries identifier columns
# ('artist_name', 'track_name', 'track_id' -- presumably text), and pandas >= 2.0
# raises TypeError instead of silently skipping columns it cannot average.
genres = songData.groupby("genre").mean(numeric_only=True)
# Bare last expression -> rich HTML table instead of a plain-text print().
genres

In [None]:
# Pair plot limited to the rows whose genre name contains "Pop".
songDataPop = songData.loc[songData['genre'].str.contains('Pop')]
# Work on an independent copy so plotting never touches the filtered frame.
df = songDataPop.copy(deep=True)
sns.set_style("darkgrid")
sns.pairplot(data=df, hue="genre", corner=True)

In [None]:
# Pair plot limited to the rows whose genre name contains "Electronic".
songDataElectronic = songData.loc[songData['genre'].str.contains('Electronic')]
# Work on an independent copy so plotting never touches the filtered frame.
df = songDataElectronic.copy(deep=True)
sns.set_style("darkgrid")
sns.pairplot(data=df, hue="genre", corner=True)

In [None]:
# Pair plot of every genre whose name begins with "R".
# str.startswith implements the stated intent; the previous str.contains('R')
# would also select any genre with a capital "R" elsewhere in its name.
# na=False treats rows with a missing genre as non-matching instead of yielding a
# NaN entry in the mask, which boolean indexing rejects.
songDataR = songData[songData['genre'].str.startswith('R', na=False)]
# Independent copy so plotting never touches the filtered frame.
df = songDataR.copy()
sns.set_style("darkgrid")
sns.pairplot(df, corner=True, hue="genre")

In [None]:
# Independent copy of the full dataset, used for the all-genre pair plot.
df = songData.copy(deep=True)

In [None]:
# Pair plot of whatever `df` currently holds, coloured by genre
# (lower triangle only).
sns.pairplot(data=df, hue="genre", corner=True)

In [None]:
# Box plots of each audio attribute, broken down by genre

In [None]:
# Distribution of danceability, one box per genre.
px.box(songData, y="danceability", color="genre")

In [None]:
# Distribution of energy, one box per genre.
px.box(songData, y="energy", color="genre")

In [None]:
# Distribution of loudness, one box per genre.
px.box(songData, y="loudness", color="genre")

In [None]:
# Distribution of speechiness, one box per genre.
px.box(songData, y="speechiness", color="genre")

In [None]:
# Distribution of acousticness, one box per genre.
px.box(songData, y="acousticness", color="genre")

In [None]:
# Distribution of instrumentalness, one box per genre.
px.box(songData, y="instrumentalness", color="genre")

In [None]:
# Distribution of liveness, one box per genre.
px.box(songData, y="liveness", color="genre")

In [None]:
# Distribution of tempo, one box per genre.
px.box(songData, y="tempo", color="genre")

In [None]:
# One histogram per column of the dataset, arranged on a 7 x 2 grid.
songData.hist(figsize=(20, 30), layout=(7, 2))

In [None]:
# Pie chart of how many songs each genre contributes to the dataset.
# The counts are computed once and handed to plotly as a tidy two-column frame, so
# `names`/`values` refer to real columns of `data_frame`. The previous call passed
# the raw genre Series as data_frame together with unrelated, shorter arrays, and
# gave `labels` an Index even though plotly expects a {column: display name} dict.
genre_counts = (
    songData["genre"]
    .value_counts()
    .rename_axis("genre")
    .reset_index(name="count")
)
px.pie(
    genre_counts,
    names="genre",
    values="count",
    title="Spotify Genres",
)

In [None]:
# Correlation heatmap of the numeric song features.
plt.figure(figsize=(15,10))
# Restrict to numeric columns explicitly: DataFrame.corr() on a frame that also
# holds text columns raises under pandas >= 2.0 (older versions dropped them
# silently), and select_dtypes works on every pandas version.
numeric_features = songData.select_dtypes(include="number")
sns.heatmap(numeric_features.corr(), annot=True)
plt.title('Heatmap of Correlation Between Elements of a Song')
plt.show()

In [None]:
# Line plot of danceability as a function of energy.
fig, ax = plt.subplots(figsize=(15, 10))
sns.lineplot(data=songData, x='energy', y='danceability', ax=ax)
ax.set_title('Energy vs Danceability')
ax.set_xlabel('Energy')
ax.set_ylabel('Danceability')
plt.show()

In [None]:
# Line plot of acousticness as a function of energy.
fig, ax = plt.subplots(figsize=(15, 10))
sns.lineplot(data=songData, x='energy', y='acousticness', ax=ax)
ax.set_title('Energy vs Acousticness')
ax.set_xlabel('Energy')
ax.set_ylabel('Acousticness')
plt.show()

In [None]:
# Joint KDE plot of energy (x) against popularity (y).
# Fixes relative to the previous version:
#   * jointplot builds its own figure, so the extra plt.figure() only produced an
#     empty figure and has been dropped;
#   * `size=` was renamed to `height=` in seaborn and the old name is rejected by
#     current releases;
#   * the axis labels were swapped (x is energy, y is popularity) and, like the
#     title, were applied through pyplot to whichever axes happened to be current
#     rather than to the joint axes.
grid = sns.jointplot(data=songData, x="energy", y="popularity", height=10, kind="kde")
grid.set_axis_labels("energy", "popularity", fontsize=10)
# Put the title on the figure and leave room for it above the marginal plot.
joint_fig = grid.ax_joint.figure
joint_fig.suptitle("Energy v Popularity", fontsize=10)
joint_fig.subplots_adjust(top=0.95)
plt.show()

In [None]:
# (number of rows, number of columns) of the loaded dataset.
songData.shape

In [None]:
def prepro(df, cols, num_components, num_numeric=11):
    """Build a normalised per-song feature table for clustering.

    Steps:
      1. One-hot encode every column of ``df`` that is not in ``cols``
         (``pd.get_dummies`` leaves non-categorical columns untouched).
      2. Collapse rows that share the same ``cols`` values, keeping the maximum of
         every column, so each song appears once.
      3. Compress the one-hot columns into ``num_components`` PCA components.
      4. Min-max scale the pass-through columns and the PCA components.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw song table.
    cols : list of str
        Identifier columns; they become the index of both returned frames.
    num_components : int
        Number of PCA components to keep for the one-hot columns.
    num_numeric : int, default 11
        How many leading columns (after step 2) are kept as-is; every column from
        this position on is treated as one-hot data and fed to PCA. The default
        reproduces the previously hard-coded split. This relies on
        ``pd.get_dummies`` placing untouched columns before the generated ones.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``features`` -- pass-through columns plus ``pca_1 .. pca_n``, min-max
        scaled; ``genres`` -- the columns whose name starts with ``genre``
        (unscaled), on the same index.
    """
    dummies = pd.get_dummies(df.drop(cols, axis=1))
    merged = pd.concat([df[cols], dummies], axis=1)
    merged = merged.groupby(cols).max()
    genres = merged.filter(regex='^genre')

    one_hot = merged.iloc[:, num_numeric:]
    x_pca = PCA(n_components=num_components).fit_transform(one_hot)

    # Build the PCA columns as their own frame and concatenate, rather than
    # assigning new columns onto an iloc slice of `merged` (chained assignment).
    pca_cols = ['pca_' + str(i + 1) for i in range(num_components)]
    pca_frame = pd.DataFrame(
        x_pca[:, :num_components], index=merged.index, columns=pca_cols
    )
    features = pd.concat([merged.iloc[:, :num_numeric], pca_frame], axis=1)

    # Min-max scaling. A constant column has zero range; divide by 1 instead so it
    # maps to 0 rather than NaN (NaNs would make the later KMeans fit fail).
    col_min = features.min()
    col_range = (features.max() - col_min).replace(0, 1)
    features = (features - col_min) / col_range
    return features, genres

# Pipeline settings: identifier columns and the number of PCA components.
cols = ['artist_name', 'track_name', 'track_id']
num_components = 2

# Build the scaled feature table, then order it from most to least popular.
X, genres = prepro(songData, cols, num_components)
X = X.sort_values(by='popularity', ascending=False)

# List every feature column together with its position.
for i, col in enumerate(X.columns):
    print(i, col)

# Pair plot on every 10th row only.
sns.pairplot(X.iloc[::10])
plt.show()

In [None]:
# Print the position and index label of the first 50 rows of X.
for i in range(50):
    print(i, X.index[i])

In [None]:
num_clust = 30
# Elbow method: fit KMeans for k = 1 .. num_clust and record the within-cluster
# sum of squares (inertia) of each fit to judge how many clusters to use.
X_numeric = X.select_dtypes(include=np.number)
wcss = []

for i in range(1, num_clust + 1):
    print(f'k={i}')
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42).fit(X_numeric)
    wcss.append(kmeans.inertia_)

fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(x=range(1, num_clust + 1), y=wcss, marker='x', color='blue', ax=ax)
ax.set_title('The Elbow Method')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('WCSS')
plt.show()

In [None]:
num_clust = 10
# Cluster the songs into num_clust groups.
# Fit on the feature columns only. This cell writes its labels back into X as a
# 'cluster' column, so without dropping that column a re-run of the cell would
# feed the previous labels to KMeans as a feature and change the result.
cluster_features = X.drop(columns='cluster', errors='ignore')
kmeans = KMeans(n_clusters = num_clust, init = 'k-means++', random_state = 42)
y_kmeans = kmeans.fit_predict(cluster_features)

# Later cells expect the cluster label to be the last column of X.
X.loc[:, 'cluster'] = y_kmeans

fig = plt.figure()
ax = fig.add_axes([0,0,1,1])

# Share of songs per cluster.
ax.hist(
    y_kmeans,
    bins=num_clust,
    density=True,
    alpha=0.8
)

plt.title('Distribution by cluster')
plt.show()

In [None]:
# Map each column name of X to its position, leaving out the last column
# (the 'cluster' label once the clustering cell has run).
col_nums = {name: position for position, name in enumerate(X.columns[:-1])}

col_nums



In [None]:
inds = {'index': 39} # positional index (into X) of the song we want recommendations for

target_ind = inds['index']
# Feature vector of the target song; [:-1] leaves out the trailing 'cluster' label.
target_vals = X.iloc[target_ind, :-1]

genres['cluster'] = X['cluster']

# Candidates: songs in the same cluster as the target, excluding the target itself.
# Previously the target was kept, so it was always reported as its own best match.
same_cluster = (X['cluster'] == X['cluster'].iloc[target_ind]).to_numpy(copy=True)
same_cluster[target_ind] = False
# .copy() so the new column below is not written onto a slice of X.
tmp = X[same_cluster].copy()

# Cosine *distance* (0 = same direction) between the target and every candidate,
# computed in one vectorised call instead of a Python-level apply over rows.
tmp['index'] = distance.cdist(
    tmp.iloc[:, :-1].to_numpy(dtype=float),
    target_vals.to_numpy(dtype=float).reshape(1, -1),
    metric='cosine',
).ravel()

print('Most similar songs:')
# Smallest distance first -> most similar first.
head = tmp.sort_values(['index']).head(15)

# Index labels are (artist_name, track_name, track_id) tuples.
for index in head.index:
    print(f'    {index[0]} - {index[1]}')