## Spotify recommendation system

In [11]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

In [34]:
# reading and overviewing data
data = pd.read_csv("spotify.csv")
data.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.991,['Mamie Smith'],0.598,168333,0.224,0,0cS0A1fUEUd1EW3FcF8AEI,0.000522,5,0.379,-12.628,0,Keep A Song In Your Soul,12,1920,0.0936,149.976,0.634,1920
1,0.643,"[""Screamin' Jay Hawkins""]",0.852,150200,0.517,0,0hbkKFIJm7Z05H8Zl9w30f,0.0264,5,0.0809,-7.261,0,I Put A Spell On You,7,1920-01-05,0.0534,86.889,0.95,1920
2,0.993,['Mamie Smith'],0.647,163827,0.186,0,11m7laMUgmOKqI3oYzuhne,1.8e-05,0,0.519,-12.098,1,Golfing Papa,4,1920,0.174,97.6,0.689,1920
3,0.000173,['Oscar Velazquez'],0.73,422087,0.798,0,19Lc5SfJJ5O1oaxY0fpwfh,0.801,2,0.128,-7.311,1,True House Music - Xavier Santos & Carlos Gomi...,17,1920-01-01,0.0425,127.997,0.0422,1920
4,0.295,['Mixe'],0.704,165224,0.707,1,2hJjbsLCytGsnAHfdsLejp,0.000246,10,0.402,-6.036,0,Xuniverxe,2,1920-10-01,0.0768,122.076,0.299,1920


In [35]:
# gaining information regarding features in dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 174389 entries, 0 to 174388
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   acousticness      174389 non-null  float64
 1   artists           174389 non-null  object 
 2   danceability      174389 non-null  float64
 3   duration_ms       174389 non-null  int64  
 4   energy            174389 non-null  float64
 5   explicit          174389 non-null  int64  
 6   id                174389 non-null  object 
 7   instrumentalness  174389 non-null  float64
 8   key               174389 non-null  int64  
 9   liveness          174389 non-null  float64
 10  loudness          174389 non-null  float64
 11  mode              174389 non-null  int64  
 12  name              174389 non-null  object 
 13  popularity        174389 non-null  int64  
 14  release_date      174389 non-null  object 
 15  speechiness       174389 non-null  float64
 16  tempo             17

In [36]:
# check for null values
data.isnull().sum()

acousticness        0
artists             0
danceability        0
duration_ms         0
energy              0
explicit            0
id                  0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
name                0
popularity          0
release_date        0
speechiness         0
tempo               0
valence             0
year                0
dtype: int64

In [37]:
# if we think logically then we can remove some features as they do not contribute as per our problem statement
# columns to be removed: artists,id,name,release_date and year
df = data.drop(["artists","id","release_date","year"],axis=1)

In [38]:
# finding correlation between each and every features
df.corr()

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence
acousticness,1.0,-0.263217,-0.089169,-0.750852,-0.208176,0.221956,-0.028028,-0.029654,-0.546639,0.064633,-0.396744,-0.022437,-0.22384,-0.166968
danceability,-0.263217,1.0,-0.100757,0.204838,0.200842,-0.215589,0.026266,-0.110033,0.249541,-0.048358,0.123746,0.239962,0.005479,0.536713
duration_ms,-0.089169,-0.100757,1.0,0.060516,-0.033808,0.103621,0.00202,0.028942,0.019791,-0.046849,0.024717,-0.097838,-0.008182,-0.183199
energy,-0.750852,0.204838,0.060516,1.0,0.102561,-0.17775,0.03578,0.134815,0.779267,-0.05616,0.328939,-0.112616,0.266448,0.326418
explicit,-0.208176,0.200842,-0.033808,0.102561,1.0,-0.130609,0.005282,0.037288,0.106249,-0.062503,0.152545,0.353872,0.008075,-0.009275
instrumentalness,0.221956,-0.215589,0.103621,-0.17775,-0.130609,1.0,-0.004619,-0.047941,-0.317562,-0.056731,-0.300625,-0.133966,-0.068656,-0.219188
key,-0.028028,0.026266,0.00202,0.03578,0.005282,-0.004619,1.0,-0.003368,0.025227,-0.127397,0.001951,0.009648,0.005009,0.025592
liveness,-0.029654,-0.110033,0.028942,0.134815,0.037288,-0.047941,-0.003368,1.0,0.062695,0.001677,-0.078959,0.122034,0.008586,-0.005781
loudness,-0.546639,0.249541,0.019791,0.779267,0.106249,-0.317562,0.025227,0.062695,1.0,-0.01925,0.337194,-0.213504,0.217914,0.30252
mode,0.064633,-0.048358,-0.046849,-0.05616,-0.062503,-0.056731,-0.127397,0.001677,-0.01925,1.0,0.007652,-0.040711,0.002438,0.021592


In [39]:
# transforming numerical data using MinMax scaler
dtt = [ 'int16', 'int32', 'int64', 'float16', 'float32', 'float64' ]
data_num = data.select_dtypes(include=dtt)
for i in data_num.columns:
    MinMaxScaler(i)

In [40]:
# we want to find similarity between songs, this similarity can be easily found using K-means clustering algorithm.
km = KMeans(n_clusters=10)
features = km.fit_predict(data_num)
data['features'] = features
MinMaxScaler(data['features'])

MinMaxScaler(feature_range=0         4
1         4
2         4
3         3
4         4
         ..
174384    4
174385    4
174386    0
174387    0
174388    0
Name: features, Length: 174389, dtype: int32)

In [44]:
## Spotify recommendation
songs = input("Enter a songs for building recommendations: ")
amount = int(input("Enter number of songs to be recommended: "))
distance = [ ]
song = data[(data.name.str.lower()==songs.lower())].head(1).values[0]
rec = data[data.name.str.lower()!=songs.lower()]

for songs in tqdm(rec.values):
    d = 0
    for col in np.arange(len(rec.columns)):
        if not col in [1,6,12,14,18]:
            d+=np.absolute(float(song[col])-float(songs[col]))
    distance.append(d)
rec['distance'] = distance
rec.sort_values('distance')
columns = ['artists','name']
rec[columns][:amount]

Enter a songs for building recommendations: Lovers Rock
Enter number of songs to be recommended: 10


100%|████████████████████████████████| 174387/174387 [00:56<00:00, 3095.84it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,artists,name
0,['Mamie Smith'],Keep A Song In Your Soul
1,"[""Screamin' Jay Hawkins""]",I Put A Spell On You
2,['Mamie Smith'],Golfing Papa
3,['Oscar Velazquez'],True House Music - Xavier Santos & Carlos Gomi...
4,['Mixe'],Xuniverxe
5,['Mamie Smith & Her Jazz Hounds'],Crazy Blues - 78rpm Version
6,['Mamie Smith'],Don't You Advertise Your Man
7,['Mamie Smith & Her Jazz Hounds'],Arkansas Blues
8,['Francisco Canaro'],La Chacarera - Remasterizado
9,['Meetya'],Broken Puppet - Original Mix
