In [4]:
### spotify recommendation engine


import numpy as np
import pandas as pd

In [5]:
df = pd.read_csv('top50.csv',encoding = 'ISO-8859-1') ### load data and we use encoding so we can get full data

### by default data is in utf-8 which has only english chars. so to get all the chars we use encoding

In [7]:
df.head() ### displays first 5 rows and all columns

Unnamed: 0.1,Unnamed: 0,Track.Name,Artist.Name,Genre,Beats.Per.Minute,Energy,Danceability,Loudness..dB..,Liveness,Valence.,Length.,Acousticness..,Speechiness.,Popularity
0,1,Señorita,Shawn Mendes,canadian pop,117,55,76,-6,8,75,191,4,3,79
1,2,China,Anuel AA,reggaeton flow,105,81,79,-4,8,61,302,8,9,92
2,3,boyfriend (with Social House),Ariana Grande,dance pop,190,80,40,-4,16,70,186,12,46,85
3,4,Beautiful People (feat. Khalid),Ed Sheeran,pop,93,65,64,-8,8,55,198,12,19,86
4,5,Goodbyes (Feat. Young Thug),Post Malone,dfw rap,150,65,58,-4,11,18,175,45,7,94


In [10]:
df.columns = [i.lower().replace('.','') for i in df] ### here we are cleaning the columns for our ease. we replace dots

### with no spaces

In [11]:
df.columns ### display columns

Index(['unnamed: 0', 'trackname', 'artistname', 'genre', 'beatsperminute',
       'energy', 'danceability', 'loudnessdb', 'liveness', 'valence', 'length',
       'acousticness', 'speechiness', 'popularity'],
      dtype='object')

In [79]:
class SpotifyRecommendation:
    
    def __init__(self): ### we initialize our constructor here
        
        self.df = pd.read_csv('top50.csv', encoding='ISO-8859-1')
        self.numerical_df = None
        self.results = dict()
        
    def columns_rename(self): ### changing name of columns for our ease
        
        self.df.columns = [i.lower().replace('.','') for i in self.df]
        
    def remove_unwanted_columns(self):
        
        self.df.drop(columns = ['unnamed: 0'], axis = 1, inplace = True)
        
    def cleaning_str_column(self):
        
        for i in ['trackname','artistname','genre']:
            self.df[i] = self.df[i].str.lower()
            
    def normalized_algo(self,col):
        
        max_d = self.df[col].max()
        min_d = self.df[col].min()
        return (self.df[col] - min_d) / (max_d - min_d)
    
    def normalized_column(self):
        
        df_numerical = self.df.select_dtypes(include = ['int64'])
        self.numerical_df = df_numerical
        for col in df_numerical.columns:
            self.df[col] = self.normalized_algo(col)
            
    def cluster_creation(self):
        
        from sklearn.cluster import KMeans
        
        km = KMeans(n_clusters = 10)
        pred = km.fit_predict(self.numerical_df)
        self.df['cluster_val'] = pred
        
    def fit(self):
        
        total_songs = list(self.df['trackname'].unique())
        
        for song_name in total_songs:
            distances = []
            song_res = self.df[self.df['trackname'] == song_name].head(1).values[0]
            rem_data = self.df[self.df['trackname'] != song_name]
            
            for r_songs in rem_data.values:
                dist = 0
                
                for idx,col in enumerate(rem_data.columns):
                    if not col in ['trackname','artistname','genre']:
                        dist = dist + np.absolute(float(song_res[idx]) - float(r_songs[idx]))
                        
                distances.append(dist)
            rem_data['distance'] = distances
            rem_data = rem_data.sort_values('distance')
            temp = rem_data.to_dict(orient = 'records')
            self.results[song_name] = temp
            
    def predict(self,song_name,top=5):
        
        return self.results[song_name][0:top]
    
    
                

In [80]:
s = SpotifyRecommendation()

In [81]:
s.columns_rename()

In [82]:
s.cleaning_str_column()

In [83]:
s.remove_unwanted_columns()

In [84]:
s.normalized_column()

In [85]:
s.cluster_creation()

In [86]:
s.fit()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rem_data['distance'] = distances


In [87]:
s.predict('bad guy')

[{'trackname': 'bad guy (with justin bieber)',
  'artistname': 'billie eilish',
  'genre': 'electropop',
  'beatsperminute': 0.47619047619047616,
  'energy': 0.23214285714285715,
  'danceability': 0.6229508196721312,
  'loudnessdb': 0.0,
  'liveness': 0.1320754716981132,
  'valence': 0.6823529411764706,
  'length': 0.41237113402061853,
  'acousticness': 0.32432432432432434,
  'speechiness': 0.627906976744186,
  'popularity': 0.76,
  'cluster_val': 0,
  'distance': 0.80311619213925},
 {'trackname': '7 rings',
  'artistname': 'ariana grande',
  'genre': 'dance pop',
  'beatsperminute': 0.5238095238095238,
  'energy': 0.0,
  'danceability': 0.8032786885245902,
  'loudnessdb': 0.0,
  'liveness': 0.07547169811320754,
  'valence': 0.27058823529411763,
  'length': 0.32989690721649484,
  'acousticness': 0.7837837837837838,
  'speechiness': 0.6976744186046512,
  'popularity': 0.76,
  'cluster_val': 0,
  'distance': 1.4496013286013043},
 {'trackname': 'talk',
  'artistname': 'khalid',
  'genre':