In [1]:
import os
import pickle
import sys

import numpy as np
import pandas as pd
import spotipy
from sklearn.metrics.pairwise import cosine_similarity,cosine_distances
from spotipy.oauth2 import SpotifyClientCredentials

import utils

pd.options.mode.chained_assignment = None

In [61]:
def to_32(df):
    """Downcast 64-bit numeric columns of ``df`` to their 32-bit counterparts.

    float64 columns become float32 and int64 columns become int32. The frame is
    modified in place and the same object is returned.
    """
    for wide_dtype, narrow_dtype in ((np.float64, np.float32), (np.int64, np.int32)):
        wide_part = df.select_dtypes(wide_dtype)
        df[wide_part.columns] = wide_part.astype(narrow_dtype)
    return df

# Importing complete feature set

Since our complete feature set (or even feature set 1 or 2) is huge (several GB), simply loading it into this notebook takes a lot of time and memory, and pandas converts the CSV's numeric columns to float64 by default. To avoid this, we take the following steps:

- load only first few rows of either the full feature set or any of its 2 parts

- store all column names in a variable 'float_cols'

- use float_cols to typecast all float columns (float64 by default) to float32


By doing so, we can decrease both the memory consumption and the time taken to import the CSV.

(currently it takes about 2 min 11 sec on average to load the 'complete_feature_set.csv' file, which is 3.36 GB of data)
(I'm using the C engine to load the CSV data, as pyarrow consumes too much memory and processor capacity)

In [4]:
# Spotify API credentials are read from the environment so that secrets never live in
# the notebook. SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET are the variable names that
# spotipy itself looks up. The key pair that used to be hardcoded here has been
# exposed and should be rotated in the Spotify developer dashboard.
client_id = os.environ.get('SPOTIPY_CLIENT_ID') # Your client id
client_secret = os.environ.get('SPOTIPY_CLIENT_SECRET') # Your secret

In [81]:
# Load the precomputed feature matrix from a local pickle file.
# NOTE: unpickling can execute arbitrary code, so only load files you produced or trust.
feature_set = pd.read_pickle('complete_feature_set.pkl')

In [82]:
# Preview the loaded feature matrix (pandas truncates the rendered rows and columns).
feature_set

Unnamed: 0,genre|432hz,genre|48g,genre|_brasileira,genre|_hip_hop,genre|a_cappella,genre|abstract,genre|abstract_hip_hop,genre|abstract_idm,genre|accordeon,genre|accordion,...,year|2016,year|2017,year|2018,year|2019,year|2020,year|2021,id,name,artists,id_artists
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5PsP2lVnRTrwKS1uurWpr0,Gli anni (96),883,6bMul6rmRS03x38tWKYifO
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3xzZiwmetUYGS8YHOyTBR7,Red Streamliner,Little Feat,0ZIwOAzDuGPspzK7yiTc4S
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,05F9Tb9SUQZ6YCDt9cHxQB,Dice que soy mujeriego,Pedro Infante,7y33enVLfDvft6HGNmcxdV
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0KSar1knxet2s1mBYW7QeN,Rock Salt & Nails,J.D. Crowe & The New South,6gfqvidGxCJgbqSaserlkF
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4Wu10lusWcdGgHOA28hXTI,Martha,Jefferson Airplane,2qFr8w5sWUITRlzZ9kZotF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205887,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.5,7Bk0uXKk1uPT0XuQbpFzvs,Fiel,Los Legendarios,0n6sKrG0xKAf8xmdqeNGke
205888,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.5,0.0,27OeeYzk6klgBh83TSvGMA,WITHOUT YOU,The Kid LAROI,2tIP7SsRs7vjIcLrU85W8J
205889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.5,4cG7HUWYHBV6R6tHn1gxrl,Friday (feat. Mufasa & Hypeman) - Dopamine Re-...,Riton,7i9j813KFoSBMldGqlh2Z1
205890,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.5,0.0,0VjIjW4GlUZAMYd2vXMi3b,Blinding Lights,The Weeknd,1Xyo4u8uXC1ZmMpatF05PJ


In [5]:
# Spotify OAuth scope names are hyphen-separated; 'user_library_read' is not a valid scope.
scope='user-library-read'

# Optional Spotify username taken from the command line.
# Inside a Jupyter kernel sys.argv[1] is typically a launcher flag (e.g. '-f'), which
# must not be mistaken for a username, and sys.exit() would abort "Run All". So when
# no usable argument is present, fall back to None instead of exiting.
if len(sys.argv)>1 and not sys.argv[1].startswith('-'):
    username=sys.argv[1]
else:
    username=None
    print('Usage: %s username' % (sys.argv[0],))

In [6]:
# Build a Spotify client authenticated with the app's client id / secret.
auth_manager=SpotifyClientCredentials(client_id=client_id,client_secret=client_secret)
sp=spotipy.Spotify(auth_manager=auth_manager)

In [7]:
# Obtain a user token through spotipy's prompt helper (redirects to localhost:8881).
# NOTE(review): `scope` is passed as the first positional argument; in spotipy's
# `prompt_for_user_token(username=None, scope=None, ...)` that slot appears to be
# `username`, so the scope may never be sent. Confirm, and make sure the scope value
# is a valid Spotify scope before switching to `scope=scope`.
token=spotipy.util.prompt_for_user_token(scope,client_id=client_id,client_secret=client_secret,redirect_uri='http://localhost:8881/')

In [8]:
# Rebind `sp` to a client that authenticates with the user token obtained above.
sp=spotipy.Spotify(auth=token)

In [9]:
# Final client: user token plus the client-credentials manager. A fresh instance is
# built instead of calling __init__ on the live object, which would re-run
# construction on a client that is already in use.
sp=spotipy.Spotify(auth=token,auth_manager=auth_manager)

In [10]:
# Fetch the profile of the user the client is authenticated as.
me=sp.current_user()


In [11]:
# Show the profile returned by Spotify. The rendered output contains account
# identifiers, so clear it before sharing the notebook.
me

{'display_name': 'DEMO',
 'external_urls': {'spotify': 'https://open.spotify.com/user/m576gyx643ryle5a4kq7f0e0z'},
 'followers': {'href': None, 'total': 0},
 'href': 'https://api.spotify.com/v1/users/m576gyx643ryle5a4kq7f0e0z',
 'id': 'm576gyx643ryle5a4kq7f0e0z',
 'images': [],
 'type': 'user',
 'uri': 'spotify:user:m576gyx643ryle5a4kq7f0e0z'}

In [15]:
# Map each of the current user's playlists to its id and cover-image URL.
id_name={}      # playlist name -> playlist id
list_photo={}   # playlist id -> cover image URL (None when the playlist has no image)
user_playlist=sp.current_user_playlists(50,0)  # a single request: arguments are (50, 0)
# `playlists` / `play` are kept so that any later cell referencing them still works.
playlists=pd.DataFrame(user_playlist)
play=pd.Series(playlists['items'])
for item in user_playlist['items']:
    # The playlist id is the third field of 'spotify:playlist:<id>'.
    playlist_id=item['uri'].split(':')[2]
    id_name[item['name']]=playlist_id
    # A playlist without a cover has an empty or missing image list; do not crash on it.
    images=item.get('images') or []
    list_photo[playlist_id]=images[0]['url'] if images else None


In [16]:
# Inspect the playlist name -> playlist id mapping built above.
id_name

{'BIBI + Ashuashu': '37i9dQZF1EXCb1XC3YPFU8',
 'Japanese': '1kScRPUAOTGdtzEWLA5bUV',
 'Kpop gg': '7rCKcrtZ9Amnl1HgzTK7WF',
 'Top tier hindi song lole': '7DCmMKqgh4wGyBKUVTLGxU',
 'Autumn': '5kF4PLlYEERA1aQGFJ1hV8',
 'RVbside vibe': '6l41Z5q1h2lTASJcR8hD7k'}

In [18]:
# Inspect the playlist id -> cover image URL mapping built above.
list_photo

{'37i9dQZF1EXCb1XC3YPFU8': 'https://blend-playlist-covers.spotifycdn.com/celebrity-blends/BIBI66679/en.jpg',
 '1kScRPUAOTGdtzEWLA5bUV': 'https://mosaic.scdn.co/640/ab67616d0000b2730979148729dbe13a648e560eab67616d0000b2733dcf67594e3c4f8f2dba0b82ab67616d0000b273684d81c9356531f2a456b1c1ab67616d0000b273e921fc686dbd113760a3c1ea',
 '7rCKcrtZ9Amnl1HgzTK7WF': 'https://mosaic.scdn.co/640/ab67616d0000b2736538b8e1b5c7b2a9d2211769ab67616d0000b273b64001fa6292caefc7605550ab67616d0000b273bc125f40131dd5869b2ec36cab67616d0000b273df5022bdf1ac4bf52135c4be',
 '7DCmMKqgh4wGyBKUVTLGxU': 'https://i.scdn.co/image/ab67706c0000bebb519cc9a0bb08faa7adc00137',
 '5kF4PLlYEERA1aQGFJ1hV8': 'https://i.scdn.co/image/ab67706c0000bebb7211a771459fbc3b65c6e2a2',
 '6l41Z5q1h2lTASJcR8hD7k': 'https://mosaic.scdn.co/640/ab67616d0000b2736017bca98dea58ceddea77c1ab67616d0000b2736538b8e1b5c7b2a9d2211769ab67616d0000b27370a04b3e66d6a4a38237dc7fab67616d0000b273b64001fa6292caefc7605550'}

In [17]:
def create_necessary_outputs(playlist_name,id_dic,df):
    """Return the tracks of a playlist that also exist in the feature dataset.

    Uses the module-level Spotify client ``sp``. Only the track items contained in
    the ``sp.playlist(...)`` response itself are read; no further pages are requested.

    Parameters
    ----------
    playlist_name : str
        Key into ``id_dic`` naming the playlist.
    id_dic : dict
        Maps playlist name to Spotify playlist id.
    df : pandas.DataFrame
        Dataset with an ``id`` column; playlist tracks not present in it are dropped.

    Returns
    -------
    pandas.DataFrame
        Columns ``artist``, ``name``, ``id``, ``url``, ``date_added``, indexed by the
        track's position in the playlist and sorted by ``date_added``, newest first.
        Empty (with the same columns) when the playlist has no usable tracks.
    """
    columns=['artist','name','id','url','date_added']
    records=[]
    positions=[]

    for ix,item in enumerate(sp.playlist(id_dic[playlist_name])['tracks']['items']):
        track=item.get('track')
        if not track:
            # Entries whose track payload is missing cannot be matched; skip them.
            continue
        images=track['album'].get('images') or []
        # Prefer the second image as before, but fall back when fewer are available.
        if len(images)>1:
            url=images[1]['url']
        elif images:
            url=images[0]['url']
        else:
            url=None
        records.append({
            'artist':track['artists'][0]['name'],
            'name':track['name'],
            'id':track['id'],
            'url':url,
            'date_added':item['added_at'],
        })
        positions.append(ix)

    # Build the frame once; explicit columns keep an empty playlist well-formed.
    playlist=pd.DataFrame(records,index=positions,columns=columns)
    playlist['date_added']=pd.to_datetime(playlist['date_added'])
    playlist=playlist[playlist['id'].isin(df['id'].values)].sort_values('date_added',ascending=False)
    return playlist


In [19]:
# Tracks of the 'RVbside vibe' playlist that are also present in the feature set.
dataset_playlist_intersection_songs=create_necessary_outputs('RVbside vibe',id_name,feature_set)

In [20]:
# Show up to the first 50 matched playlist tracks.
dataset_playlist_intersection_songs.head(50)

Unnamed: 0,artist,name,id,url,date_added
26,Wonder Girls,Why So Lonely,3mKK73NhylzXdHV4qZGxI4,https://i.scdn.co/image/ab67616d00001e02018ac0...,2021-11-02 05:12:54+00:00
19,Red Velvet,Psycho,3CYH422oy1cZNoo0GTG1TK,https://i.scdn.co/image/ab67616d00001e02df5022...,2021-09-18 06:11:48+00:00
14,Red Velvet,Be Natural,41qLzxymjkp0R5vl3REb1S,https://i.scdn.co/image/ab67616d00001e0272ee4e...,2021-09-18 06:11:07+00:00
11,TWICE,CRY FOR ME,2xtP8RNbo2BEMzLX7tK7aq,https://i.scdn.co/image/ab67616d00001e02cf3bd6...,2021-09-18 06:09:39+00:00
10,Red Velvet,"Bad Boy - English Version, Bonus Track",78HD9IN4cKE1MMHWeVJPWh,https://i.scdn.co/image/ab67616d00001e026017bc...,2021-09-18 06:09:26+00:00
8,Red Velvet,Bad Boy,5GKwq4sO5ZHKuWaDmdwMQc,https://i.scdn.co/image/ab67616d00001e02b64001...,2021-09-18 06:08:50+00:00
7,Red Velvet,Peek-A-Boo,42tFTth2jcF7iSo0RBjfJF,https://i.scdn.co/image/ab67616d00001e026538b8...,2021-09-18 06:08:47+00:00


In [22]:
# Column dtypes and non-null counts of the matched playlist tracks.
dataset_playlist_intersection_songs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7 entries, 26 to 7
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype              
---  ------      --------------  -----              
 0   artist      7 non-null      object             
 1   name        7 non-null      object             
 2   id          7 non-null      object             
 3   url         7 non-null      object             
 4   date_added  7 non-null      datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), object(4)
memory usage: 336.0+ bytes


In [21]:
def generate_playlist_vector(complete_feature_set,playlist_df,weight_factor):
    """Summarize a playlist as a single recency-weighted feature vector.

    Every playlist song found in ``complete_feature_set`` is weighted by
    ``weight_factor ** -months``, where ``months`` is the number of whole 30-day
    periods between the song's ``date_added`` and the most recent ``date_added``
    among the matched songs. The weighted feature rows are then summed.

    Parameters
    ----------
    complete_feature_set : pandas.DataFrame
        One row per song: numeric feature columns plus the metadata columns
        ``id``, ``name``, ``artists`` and ``id_artists``.
    playlist_df : pandas.DataFrame
        Playlist songs with at least ``id`` and a datetime ``date_added`` column.
    weight_factor : float
        Base of the decay; values above 1 down-weight older additions.

    Returns
    -------
    (pandas.Series, pandas.DataFrame)
        The summed weighted vector, indexed by feature column name (all zeros when
        no playlist song is in the feature set), and the rows of
        ``complete_feature_set`` whose ``id`` is not in the playlist.
    """
    # Feature columns are picked by name instead of by counting columns from the
    # right, so the result does not depend on where the metadata columns sit.
    metadata_columns=('id','name','artists','id_artists')
    feature_columns=[col for col in complete_feature_set.columns if col not in metadata_columns]

    # Compute the membership mask once and reuse it for both halves of the split.
    in_playlist=complete_feature_set['id'].isin(playlist_df['id'].values)
    complete_feature_set_nonplaylist=complete_feature_set[~in_playlist]

    playlist_feature_set=complete_feature_set[in_playlist].merge(playlist_df[['id','date_added']],on='id',how='inner')
    playlist_features=playlist_feature_set[feature_columns]

    if playlist_feature_set.empty:
        # Nothing to weight: the sum over zero rows is the zero vector.
        return playlist_features.sum(axis=0),complete_feature_set_nonplaylist

    date_added=playlist_feature_set['date_added']
    # Ages are never negative, so floor division matches truncating days/30 to an int.
    months_from_recent=((date_added.max()-date_added).dt.days//30).astype('float64')
    weight=float(weight_factor)**(-months_from_recent)

    playlist_feature_set_weighted=playlist_features.mul(weight,axis=0)
    return playlist_feature_set_weighted.sum(axis=0),complete_feature_set_nonplaylist

In [23]:
# Build the playlist vector with a decay base of 1.09.
# feature_set_nonplaylist holds the songs of feature_set that are not in
# dataset_playlist_intersection_songs.
playlist_vector,feature_set_nonplaylist=generate_playlist_vector(feature_set,dataset_playlist_intersection_songs,1.09)

In [24]:
# Cast the playlist vector to float32 to reduce memory use, then summarize it.
playlist_vector=playlist_vector.astype(dtype=np.float32)
playlist_vector.info()

<class 'pandas.core.series.Series'>
Index: 4174 entries, genre|432hz to year|2021
Series name: None
Non-Null Count  Dtype  
--------------  -----  
4174 non-null   float32
dtypes: float32(1)
memory usage: 48.9+ KB


In [31]:
# Inspect the weighted playlist vector (one value per feature column).
playlist_vector

genre|432hz          0.000000
genre|48g            0.000000
genre|_brasileira    0.000000
genre|_hip_hop       0.000000
genre|a_cappella     0.000000
                       ...   
year|2017            0.458716
year|2018            0.917431
year|2019            0.458716
year|2020            0.458716
year|2021            0.000000
Length: 4174, dtype: float32

In [30]:
# Size and dtypes of the candidate songs that are not in the playlist.
feature_set_nonplaylist.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 205885 entries, 0 to 205891
Columns: 4178 entries, genre|432hz to id_artists
dtypes: float32(4174), object(4)
memory usage: 3.2+ GB


In [33]:
def generate_recs(features,nonplaylist_features,n_recs=15):
    """Rank non-playlist songs by cosine similarity to the playlist vector.

    Uses the module-level Spotify client ``sp`` to look up an album image for each
    recommended track, and ``cosine_similarity`` imported at the top of the notebook.

    Parameters
    ----------
    features : pandas.Series
        Playlist vector; its values must be in the same order as the feature
        columns of ``nonplaylist_features``.
    nonplaylist_features : pandas.DataFrame
        Candidate songs: feature columns plus ``id``, ``name``, ``artists`` and
        ``id_artists``.
    n_recs : int, optional
        Number of recommendations to return (default 15, the previous fixed value).

    Returns
    -------
    pandas.DataFrame
        The ``n_recs`` most similar songs with columns ``id``, ``name``,
        ``artists``, ``id_artists``, ``sim`` and ``url``, most similar first.
        ``url`` is None when the track's album has no image.
    """
    metadata_columns=['id','name','artists','id_artists']

    def _album_image_url(track_id):
        # Prefer the second album image as before; fall back when fewer exist.
        images=sp.track(track_id)['album'].get('images') or []
        if len(images)>1:
            return images[1]['url']
        return images[0]['url'] if images else None

    candidate_matrix=nonplaylist_features.drop(metadata_columns,axis=1).values
    playlist_row=features.values.reshape(1,-1)

    recs=nonplaylist_features[metadata_columns].copy()
    recs['sim']=cosine_similarity(candidate_matrix,playlist_row)[:,0]
    recs=recs.sort_values('sim',ascending=False).head(n_recs)
    # One API call per recommended track, made only for the rows that are returned.
    recs['url']=recs['id'].apply(_album_image_url)
    return recs

In [34]:
# Recommend the songs outside the playlist that are most similar to the playlist vector.
top_15_recs=generate_recs(playlist_vector,feature_set_nonplaylist)

In [35]:
# Renumber the recommendations from 0. Rebinding the name (instead of inplace=True)
# keeps the cell a plain assignment that is safe to re-run.
top_15_recs=top_15_recs.reset_index(drop=True)
top_15_recs

Unnamed: 0,id,name,artisis,id_artists,sim,url
182168,5Ek40FyMPprcvtyZDjqylX,What is Love?,TWICE,7n2Ycct7Beij7Dj7meI4X0,0.936378,https://i.scdn.co/image/ab67616d00001e028e8191...
171200,54w5JUSWCJu64pnw8alImP,I WANT YOU BACK,TWICE,7n2Ycct7Beij7Dj7meI4X0,0.936232,https://i.scdn.co/image/ab67616d00001e028e0603...
171131,5DVgfulxeJZJYc8FseyfUf,AS IF IT’S YOUR LAST,BLACKPINK,41MozSoPIsD1dJM0CLPjZF,0.936083,https://i.scdn.co/image/ab67616d00001e025e07ce...
174849,2jL9sjFc2LZsQBGbQnrjXR,Time for the moon night,GFRIEND,0qlWcS66ohOIi0M8JZwPft,0.935369,https://i.scdn.co/image/ab67616d00001e02674f69...
122349,0dnFBKDhthNNAYy0RTCkEF,SWEET TALKER,TWICE,7n2Ycct7Beij7Dj7meI4X0,0.934612,https://i.scdn.co/image/ab67616d00001e028e8191...
175782,2CWnuF7ht55Ajyeg4WPvWu,DDU-DU DDU-DU - KR Ver.,BLACKPINK,41MozSoPIsD1dJM0CLPjZF,0.93446,https://i.scdn.co/image/ab67616d00001e02e5957d...
200188,26OVhEqFDQH0Ij77QtmGP9,YES or YES,TWICE,7n2Ycct7Beij7Dj7meI4X0,0.934358,https://i.scdn.co/image/ab67616d00001e028c66dd...
199837,4DYIDSMIB5y2UmZFv9fxeX,Dance The Night Away,TWICE,7n2Ycct7Beij7Dj7meI4X0,0.934181,https://i.scdn.co/image/ab67616d00001e02d610ab...
133505,7AWtaOSb7cgi16wJr3mkuf,REALLY - KR Ver.,BLACKPINK,41MozSoPIsD1dJM0CLPjZF,0.93386,https://i.scdn.co/image/ab67616d00001e02e5957d...
199839,3BPoSr2pO34Aan6alFfVto,BBoom BBoom,MOMOLAND,5RR0MLwcjc87wjSw2JYdwx,0.933692,https://i.scdn.co/image/ab67616d00001e02a5bb4e...


In [37]:
# Column dtypes, non-null counts and memory use of the recommendation table.
top_15_recs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          15 non-null     object 
 1   name        15 non-null     object 
 2   artisis     15 non-null     object 
 3   id_artists  15 non-null     object 
 4   sim         15 non-null     float32
 5   url         15 non-null     object 
dtypes: float32(1), object(5)
memory usage: 788.0+ bytes
