### Libraries

In [None]:
import pandas as pd
import numpy as np
import unicodedata

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

### Options & Data

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
# Load the tracks table; index_col=0 makes the first CSV column the DataFrame index.
df = pd.read_csv("/home/b4/Documents/VisualCodeStudio/spotify_project/data/tracks_dataframe.csv", index_col=0)

### Functions

In [None]:
def search_song(song, dataframe):
  """Return the rows of ``dataframe`` whose 'name' contains ``song``.

  Matching ignores case and accents: the query and the 'name' column are
  both NFKD-normalized and stripped of non-ASCII characters before being
  compared. The query is matched as literal text, so characters such as
  ``(`` or ``+`` in a title are not interpreted as regex syntax. Rows whose
  name is missing never match.

  Args:
    song: Text to look for inside the song names.
    dataframe: DataFrame with a string 'name' column.

  Returns:
    The subset of ``dataframe`` whose name contains the query.
  """
  query = unicodedata.normalize('NFKD', song).encode('ascii', 'ignore').decode('ascii')
  ascii_names = (
      dataframe['name']
      .str.normalize('NFKD')
      .str.encode('ascii', errors='ignore')
      .str.decode('utf-8')
  )
  # regex=False: titles often contain regex metacharacters.
  # na=False: a missing name must yield False, not NaN, or the mask is unusable.
  matches = ascii_names.str.contains(query, case=False, regex=False, na=False)
  return dataframe[matches]


In [None]:
def recomended(song):
  """Print the nearest-neighbour recommendations for one song.

  ``song`` is first compared with the 'id' column for an exact match. If no
  id matches, the first row whose 'name' contains ``song`` is used instead
  (case-insensitive, matched as literal text; rows with a missing name are
  skipped).

  Reads the module-level ``df``, ``indices`` and ``distancias``.

  Args:
    song: A Spotify track id, or (part of) a song name.

  Returns:
    None. The searched song and its neighbours are printed; a notice is
    printed instead when no row matches.
  """
  # Work with row positions, not index labels: ``indices``/``distancias``
  # and ``df.iloc`` are all positional, so a label would only be correct
  # while the index happens to be 0..n-1.
  hits = np.flatnonzero((df['id'] == song).to_numpy(dtype=bool))
  if hits.size == 0:
    by_name = df['name'].str.contains(song, case=False, regex=False, na=False)
    hits = np.flatnonzero(by_name.to_numpy(dtype=bool))
  if hits.size == 0:
    print('Hmm.. Song not on the list.')
    return

  position = hits[0]
  searched = df.iloc[position]
  print(f"\nSearch Song: {searched['name']} - {searched['artist_name']}")

  # The first neighbour is skipped.
  # NOTE(review): this assumes entry 0 is the searched song itself - confirm
  # for rows with identical feature vectors.
  for x, y in zip(indices[position][1:], distancias[position][1:]):
    row = df.iloc[x]
    print(f"""
            Distance: {round(y, 2)}
            Track: {row['track_number']}
            Name: {row['name']}
            Album: {row['album_name']}
            Artist: {row['artist_name']}
            Key: {row['key']}
            """)


## Code

In [None]:
df

In [None]:
df.isnull().sum()

In [None]:
df[df['danceability'].isnull()]

In [None]:
df[df['name']=='Meu Consolador']

In [None]:
# Remove the row labelled 1200, then renumber the index from 0 so that
# labels and row positions coincide again.
# NOTE(review): presumably 1200 is the row with missing values surfaced by the
# null checks above - confirm; the label is hard-coded and data-dependent.
df.drop(index=1200, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
df.isnull().sum()

### Model

In [None]:
# Columns selected as inputs for the scaling and clustering steps.
feature_columns = [
    'danceability',
    'energy',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'track_pop',
    'album_pop',
    'artist_pop',
]
features = df[feature_columns]

In [None]:
features.isnull().sum()

In [None]:
# Heatmap of the pairwise feature correlations, lower triangle only.
corr = features.corr()
# Boolean mask that is True on and above the diagonal; seaborn hides masked
# cells. It is built from ones rather than from the correlation values so a
# coefficient that is exactly 0 in the upper triangle is still hidden.
matrix = np.triu(np.ones_like(corr, dtype=bool))
plt.subplots(figsize=(20,8))
# Reuse the matrix computed above instead of calling corr() a second time.
sns.heatmap(corr, annot=True, mask=matrix, cmap='coolwarm')

In [None]:
std = StandardScaler()

In [None]:
features_std = std.fit_transform(features)
features_std_df = pd.DataFrame(features_std, columns=features.columns)

In [None]:
features_std_df

In [None]:
# Partition the standardized features into 5 k-means clusters.
# random_state pins the centroid initialisation so the cluster ids (which
# other cells refer to by number, e.g. cluster 4) are the same on every run.
# n_init is set explicitly because its default differs between scikit-learn
# versions.
model = KMeans(n_clusters=5, n_init=10, random_state=42)
cluster = model.fit_predict(features_std_df)

In [None]:
df['cluster'] = cluster

### Visualization

In [None]:
# Per-cluster median of every numeric column. numeric_only=True skips the
# text columns (name, artist_name, ...), which otherwise make median() raise
# a TypeError on pandas >= 2.0.
df.groupby(by='cluster').median(numeric_only=True)

In [None]:
df[df['cluster']==4].describe()

In [None]:
sns.boxplot(data=df, x='cluster', y='danceability')
plt.show()

### Recommendation

In [None]:
cluster

In [None]:
# Append the k-means label as an extra column of the scaled feature table.
# NOTE(review): the NearestNeighbors model is later fitted on this table, so
# the raw (unscaled) cluster label takes part in the distance - confirm this
# is intended.
features_std_df['cluster'] = cluster

In [None]:
features_std_df

In [None]:
# Rebinds `model` (previously the KMeans estimator) to a nearest-neighbour
# search returning 7 neighbours per query row.
# NOTE(review): presumably 7 = the song itself + 6 recommendations, since the
# model is queried with its own training rows - confirm.
model = NearestNeighbors(n_neighbors=7)

In [None]:
model.fit(features_std_df)

In [None]:
distancias, indices = model.kneighbors(features_std_df)

In [None]:
indices

In [None]:
distancias

In [None]:
song_index = df[df['name']=='Andando Sobre as Águas - Ao vivo'].index[0]

In [None]:
indices[song_index]

In [None]:
distancias[song_index]

In [None]:
index_dist = list(zip(indices[song_index], distancias[song_index]))

In [None]:
index_dist

In [None]:
teste = list(index_dist)

In [None]:
teste

In [None]:
# Show every (row position, distance) pair collected for the selected song,
# fetching each neighbour's row once and printing its details.
for neighbour_pos, distance in index_dist:
  row = df.iloc[neighbour_pos]
  print(f"""
            Distance: {round(distance, 2)}
            Track: {row['track_number']}
            Name:{row['name']}
            Album: {row['album_name']}
            Artist: {row['artist_name']}
            Key: {row['key']}
        """)

## Recommended

In [None]:
search_song('', df)

In [None]:
recomended('')