## Install Dependencies

In [1]:
import spotipy
import pandas as pd
import json
from datetime import datetime as dt
import re

from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer


## Get own data from Spotify (to add to the larger database)

In [2]:
#Set up spotify using an access token
access_token = 'ACCESS_TOKEN'
access_token = 'BQAme5gOGomcD7wrHPtTKNAPBx_egx6FVtbRxtcipic0cOcyTFn_41mf0QTkrQMpmGJil7tY2SMZ3-9T1ld_bRnKXC46z_BkTP8iztIEabbsZLCHJuc'
sp = spotipy.Spotify(auth=access_token)

In [52]:
#Grab all of data from a playlist
results = sp.playlist('5c4BBjb1RsgCgnq8cy1kRt')
test = results['tracks']['items']

ids=[]
songs = {
    "name": [],
    'artist': [],
    'genre': [],
    'date_added': [], 
    'popularity': []
}
for result in results['tracks']['items']:
    ids.append(result['track']['id'])
    songs['name'].append(result['track']['name'])
    songs['artist'].append(result['track']['artists'][0]['name'])
    artist_id=result['track']['artists'][0]['id']
    songs['genre'].append(sp.artist(artist_id)['genres'])
    songs['date_added'].append(dt.strptime(result['added_at'][:10], '%Y-%m-%d').date())
    songs['popularity'].append(result['track']['popularity'])



In [53]:
# Combine the audio features with the song information
audio_features = pd.DataFrame.from_dict(sp.audio_features(tracks=ids))
my_songs = pd.concat([pd.DataFrame.from_dict(songs), audio_features], axis=1)

In [54]:
#select the important columns to use
my_songs = my_songs[['id', 'name', 'popularity', 'danceability', 'energy', 'key', 'loudness', 'mode', 
                     'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'genre', 'artist', 'date_added']]

In [55]:
my_songs_exploded = my_songs.explode(column=['genre'])

## Download Song Database

From Kaggle

In [56]:
songbase = pd.read_csv('train.csv')

In [57]:
songbase.columns

Index(['Unnamed: 0', 'track_id', 'artists', 'album_name', 'track_name',
       'popularity', 'duration_ms', 'explicit', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature',
       'track_genre'],
      dtype='object')

In [58]:
songbase.rename(columns={'track_id':'id', 'track_name':'name', 'track_genre':'genre'}, inplace=True)

In [59]:
#select the important columns to use

# my_songs = my_songs[['id', 'name', 'popularity', 'danceability', 'energy', 'key', 'loudness', 'mode', 
                    #  'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'artist', 'artist_id', 'date_added']]

songbase = songbase[['id', 'name', 'popularity', 'danceability', 'energy', 'key', 'loudness', 'mode', 
                     'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'artists', 'genre']]

In [60]:
songbase.dropna(subset=['artists'], inplace=True)


In [61]:
songbase['artist'] = songbase['artists'].apply(lambda x: x.split(";"))

In [62]:
#Check to see if we have separated the songs with multiple artists
songbase[songbase['artist'].apply(lambda x: len(x)>1)]


Unnamed: 0,id,name,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,artists,genre,artist
2,1iJBSr7s7jYXzM8EGcbK5b,To Begin Again,57,0.438,0.359,0,-9.734,1,0.0557,0.21000,0.000000,0.1170,0.1200,76.332,Ingrid Michaelson;ZAYN,acoustic,"[Ingrid Michaelson, ZAYN]"
6,6Vc5wAMmXdKIAM7WUoEb7N,Say Something,74,0.407,0.147,2,-8.822,1,0.0355,0.85700,0.000003,0.0913,0.0765,141.284,A Great Big World;Christina Aguilera,acoustic,"[A Great Big World, Christina Aguilera]"
8,0IktbUcnAGrvD03AWnz3Q8,Lucky,74,0.625,0.414,0,-8.700,1,0.0369,0.29400,0.000000,0.1510,0.6690,130.088,Jason Mraz;Colbie Caillat,acoustic,"[Jason Mraz, Colbie Caillat]"
14,4LbWtBkN82ZRhz9jqzgrb3,Hold On - Remix,56,0.755,0.780,2,-6.084,1,0.0327,0.12400,0.000028,0.1210,0.3870,120.004,Chord Overstreet;Deepend,acoustic,"[Chord Overstreet, Deepend]"
16,6xKeQgzfjixSUld14qUezm,ily (i love you baby),56,0.706,0.112,2,-18.098,1,0.0391,0.82700,0.000004,0.1250,0.4140,110.154,Andrew Foy;Renee Foy,acoustic,"[Andrew Foy, Renee Foy]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113966,42bdU7oDyRvyRXaKbUrtfu,Victory Is Yours - Live,44,0.418,0.786,9,-5.257,1,0.0471,0.01050,0.000000,0.3420,0.0656,139.920,Bethel Music;Bethany Wohrle,world-music,"[Bethel Music, Bethany Wohrle]"
113967,1eZYPovTvmxk3QlVD2VpCX,We Will Not Be Shaken - Live,46,0.392,0.668,9,-7.342,1,0.0296,0.00356,0.000560,0.5880,0.1150,148.013,Bethel Music;Brian Johnson,world-music,"[Bethel Music, Brian Johnson]"
113969,7mD7yAQm4GcifSnWqJdZHi,Ain’t No Grave - Live,47,0.370,0.831,4,-4.484,1,0.0442,0.01290,0.000122,0.1090,0.1850,148.008,Bethel Music;Molly Skaggs,world-music,"[Bethel Music, Molly Skaggs]"
113975,2yAo8cJDVoMjBMS5MgrO8P,This Is Living,38,0.611,0.857,0,-6.248,0,0.0384,0.00182,0.000000,0.3260,0.4640,128.001,Hillsong Worship;Hillsong Young & Free,world-music,"[Hillsong Worship, Hillsong Young & Free]"


In [63]:
songbase_exploded = songbase.explode(['artist'])
songbase_exploded.drop(columns=['artists'], inplace=True)

In [64]:
my_songs_exploded.head(1)

Unnamed: 0,id,name,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre,artist,date_added
0,2RnBxdoRGOBzy2yvThWtMe,LET EM GO,73,0.364,0.512,11,-9.122,0,0.106,0.0337,0.0,0.107,0.134,118.392,gen z singer-songwriter,Matt Hansen,2024-08-11


In [65]:
songbase_exploded

Unnamed: 0,id,name,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre,artist
0,5SuOikwiRyPMVoIQDJUgSV,Comedy,73,0.676,0.4610,1,-6.746,0,0.1430,0.0322,0.000001,0.3580,0.7150,87.917,acoustic,Gen Hoshino
1,4qPNDBW1i3p13qLCt0Ki3A,Ghost - Acoustic,55,0.420,0.1660,1,-17.235,1,0.0763,0.9240,0.000006,0.1010,0.2670,77.489,acoustic,Ben Woodward
2,1iJBSr7s7jYXzM8EGcbK5b,To Begin Again,57,0.438,0.3590,0,-9.734,1,0.0557,0.2100,0.000000,0.1170,0.1200,76.332,acoustic,Ingrid Michaelson
2,1iJBSr7s7jYXzM8EGcbK5b,To Begin Again,57,0.438,0.3590,0,-9.734,1,0.0557,0.2100,0.000000,0.1170,0.1200,76.332,acoustic,ZAYN
3,6lfxq3CG4xtTiEg7opyCyx,Can't Help Falling In Love,71,0.266,0.0596,0,-18.515,1,0.0363,0.9050,0.000071,0.1320,0.1430,181.740,acoustic,Kina Grannis
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113995,2C3TZjDRiAzdyViavDJ217,Sleep My Little Boy,21,0.172,0.2350,5,-16.393,1,0.0422,0.6400,0.928000,0.0863,0.0339,125.995,world-music,Rainy Lullaby
113996,1hIz5L4IB9hN3WRYPOCGPw,Water Into Light,22,0.174,0.1170,0,-18.318,0,0.0401,0.9940,0.976000,0.1050,0.0350,85.239,world-music,Rainy Lullaby
113997,6x8ZfSoqDjuNa5SVP5QjvX,Miss Perfumado,22,0.629,0.3290,0,-10.895,0,0.0420,0.8670,0.000000,0.0839,0.7430,132.378,world-music,Cesária Evora
113998,2e6sXL2bYv4bSz6VTdnfLs,Friends,41,0.587,0.5060,7,-10.889,1,0.0297,0.3810,0.000000,0.2700,0.4130,135.960,world-music,Michael W. Smith


In [66]:
all_songs = pd.concat([songbase_exploded, my_songs_exploded[:-1]])

In [67]:
# Drop duplicates
all_songs['nameartist'] = all_songs.apply(lambda x: str(x['name']) + str(x['artist']), axis=1)

all_songs.drop_duplicates(subset = ['nameartist'], inplace=True)

all_songs.drop(columns=['nameartist'], inplace=True)


In [68]:
all_songs

Unnamed: 0,id,name,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre,artist,date_added
0,5SuOikwiRyPMVoIQDJUgSV,Comedy,73,0.676,0.4610,1,-6.746,0,0.1430,0.0322,0.000001,0.3580,0.715,87.917,acoustic,Gen Hoshino,
1,4qPNDBW1i3p13qLCt0Ki3A,Ghost - Acoustic,55,0.420,0.1660,1,-17.235,1,0.0763,0.9240,0.000006,0.1010,0.267,77.489,acoustic,Ben Woodward,
2,1iJBSr7s7jYXzM8EGcbK5b,To Begin Again,57,0.438,0.3590,0,-9.734,1,0.0557,0.2100,0.000000,0.1170,0.120,76.332,acoustic,Ingrid Michaelson,
2,1iJBSr7s7jYXzM8EGcbK5b,To Begin Again,57,0.438,0.3590,0,-9.734,1,0.0557,0.2100,0.000000,0.1170,0.120,76.332,acoustic,ZAYN,
3,6lfxq3CG4xtTiEg7opyCyx,Can't Help Falling In Love,71,0.266,0.0596,0,-18.515,1,0.0363,0.9050,0.000071,0.1320,0.143,181.740,acoustic,Kina Grannis,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32,1QNt0bhIXWu5XdlXlYI4iI,House of Cards,52,0.462,0.3090,8,-6.489,1,0.0308,0.8210,0.000000,0.0976,0.374,98.666,singer-songwriter pop,Alexander Stewart,2024-08-11
34,4rdIhBrlaAOPnlTlugZBgz,echo,54,0.631,0.5030,5,-7.451,0,0.0371,0.4040,0.000000,0.1160,0.514,127.926,singer-songwriter pop,Alexander Stewart,2024-08-11
35,4tUwBXtIker14xxcDxSXCO,Backwards,0,0.508,0.6610,1,-3.166,0,0.1280,0.0111,0.000000,0.3150,0.699,79.603,singer-songwriter pop,Alexander Stewart,2024-08-11
36,3W1FAAdNDVdyJxJwV8Lps2,WHERE YOU BELONG,65,0.562,0.4870,9,-5.476,0,0.0300,0.2310,0.000000,0.1110,0.134,96.056,gen z singer-songwriter,Matt Hansen,2024-08-12


In [69]:
all_songs = all_songs.fillna('')
all_songs.drop_duplicates(subset='id', inplace=True)

## Generate Features

- MinMaxScalar() for all of the float variables (danceability, energy, loudness, speechiness, acousticness, instrumentalness, liveness, valence, tempo)
- OHE the 'key', 'mode', and 'year' variables
- (if genre is added) TF-IDF for genre

In [70]:
def ohe(df, col, weight):
    ohe = OneHotEncoder(sparse_output=False)
    df_features = ohe.fit_transform(df[[col]])
    ohe_df = pd.DataFrame(df_features, columns=ohe.get_feature_names_out([col])) * weight
    return ohe_df

In [71]:
def tfidf(df, col):
    ti = TfidfVectorizer()
    matrix = ti.fit_transform(df[col])
    genre_df = pd.DataFrame(matrix.toarray(), columns=col + "|" + ti.get_feature_names_out())
    genre_df.reset_index(drop = True, inplace=True)

    return genre_df
    


In [72]:
def create_features(df):
    #minmaxscalar for float variables
    floats = df.dtypes[df.dtypes == 'float64'].index.values
    floats = df[floats].reset_index(drop = True)
    minmax = MinMaxScaler()
    floats_scaled = pd.DataFrame(minmax.fit_transform(floats), columns = floats.columns) * 0.5

    #ohe on key, mode, year
    ohes = ohe(df, 'mode', 0.2)
    # pd.concat([ohes, ohe(songbase_exploded, 'year', 0.75)], axis=1)
    pd.concat([ohes, ohe(df, 'key', 0.4)], axis=1)

    #TF IDF on genre
    genretfidf = tfidf(df, 'genre')

    # print(len(ohes))

    #Concat them all together
    final = pd.concat([floats_scaled, ohes, genretfidf], axis = 1)

     
    #Add the song id back in
    final['id']=df['id'].values
    final['name']=df['name'].values
    final['artist']=df['artist'].values
    
    return final
    

In [73]:
all_song_features = create_features(all_songs)
all_song_features.head()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,mode_0,...,genre|tonk,genre|trance,genre|trip,genre|tunes,genre|turkish,genre|wave,genre|world,id,name,artist
0,0.343147,0.2305,0.395696,0.074093,0.016165,5.05e-07,0.179,0.359296,0.180623,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5SuOikwiRyPMVoIQDJUgSV,Comedy,Gen Hoshino
1,0.213198,0.083,0.298689,0.039534,0.463855,2.78e-06,0.0505,0.134171,0.159199,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4qPNDBW1i3p13qLCt0Ki3A,Ghost - Acoustic,Ben Woodward
2,0.222335,0.1795,0.368061,0.02886,0.105422,0.0,0.0585,0.060302,0.156822,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1iJBSr7s7jYXzM8EGcbK5b,To Begin Again,Ingrid Michaelson
3,0.135025,0.0298,0.286851,0.018808,0.454317,3.535e-05,0.066,0.071859,0.373379,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6lfxq3CG4xtTiEg7opyCyx,Can't Help Falling In Love,Kina Grannis
4,0.313706,0.2215,0.368552,0.027254,0.235442,0.0,0.04145,0.08392,0.246431,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5vjLSffimiIP26QG5WcN2K,Hold On,Chord Overstreet


## Generate features for our own playlist

In [74]:
#Generate one list of features to describe our playlist, and find the dataset of other songs to choose from
def create_playlist_features(all_songs, playlist):
    playlist_songs = all_songs[all_songs['id'].isin(playlist['id'].values)]
    playlist_songs = playlist_songs.merge(playlist[['id', 'date_added']], on='id', how='inner')
    other_songs = all_songs[~all_songs['id'].isin(playlist['id'].values)]

    playlist_songs.sort_values('date_added',ascending=False, inplace=True)

    #Weight the songs based on when it was added to playlist
    most_recent = playlist_songs['date_added'].iloc[0]
    playlist_songs['recent'] = playlist_songs['date_added'].apply(lambda x: (x-most_recent).days)
    playlist_songs['weight'] = playlist_songs['recent'].apply(lambda x: 1.05 ** (x))

    #multiply the columns by the weight
    playlist_weighted = playlist_songs.copy()
    playlist_weighted = playlist_weighted.iloc[:, :-6]
    playlist_weighted = playlist_weighted.mul(playlist_songs['weight'], 0)

    return playlist_weighted.sum(axis=0), other_songs

    

In [75]:
my_playlist_features, non_songs = create_playlist_features(all_song_features, my_songs_exploded)

## Generate Recommendations

In [76]:
def generate_recs(features, nonplaylist):
    df = nonplaylist.copy()
    df['sim'] = cosine_similarity(df.drop(columns=['id', 'name', 'artist'], axis = 1).values, features.values.reshape(1, -1))[:,0]
    recs = df.sort_values('sim',ascending = False).head(40)

    return recs

In [77]:
recs = generate_recs(my_playlist_features, non_songs)

In [78]:
recs

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,mode_0,...,genre|trance,genre|trip,genre|tunes,genre|turkish,genre|wave,genre|world,id,name,artist,sim
71358,0.304061,0.3635,0.412824,0.04057,0.335341,1.29e-05,0.051,0.350251,0.345107,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3dvkpe9FEekxxgaQtxOHQI,Mặt Mộc,Phạm Nguyên Ngọc,0.834504
64584,0.332487,0.3655,0.388158,0.027876,0.23494,0.0,0.057,0.172864,0.267161,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,5wtGB6ojq1aPyOaBkhnElz,Tipo Uzui Tengen,MHRAP,0.831662
71343,0.424365,0.3005,0.406849,0.032073,0.366466,0.0,0.0489,0.283417,0.25469,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0W5JUrUOQmQbTmUOQwk7vx,I'm in a Rut,Sophie Pecora,0.831573
64729,0.367005,0.308,0.421656,0.051813,0.308735,0.0,0.03645,0.28794,0.283455,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,1zz2mZGTBNGU35XdCSgvB2,Mi Nombre Entre Tus Dientes (feat. Big Javy),Edwin Luna y La Trakalosa de Monterrey,0.82775
71100,0.339086,0.3635,0.397508,0.01399,0.258032,9.5e-06,0.05,0.471859,0.287607,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,3pCt2wRdBDa2kCisIdHWgF,To the Bone,Pamungkas,0.827721
64613,0.358376,0.4095,0.425097,0.047202,0.286647,0.0,0.05,0.319598,0.297861,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,7mg05jDLLNtuL9BA8hgsSA,O Herói do Escudo,Takr,0.82723
64664,0.343655,0.2865,0.384265,0.024352,0.159137,0.0,0.1455,0.141709,0.219949,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0Gdc7WEm7QbmFzqDfQCs2M,Tanjiro Kamado,Rodrigo Zin,0.826751
64547,0.308629,0.2765,0.425781,0.014041,0.197289,0.0,0.0765,0.271859,0.295829,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5xmO5SbFOiVrRGrMQhL4Jk,Millonario De Amor,Banda Cuisillos,0.826046
71116,0.345685,0.345,0.37977,0.024819,0.243976,0.0,0.1195,0.454271,0.320466,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0hulgYglBnlZ6a1uaiJYIM,Woo Woo,Sheryl Crow,0.825868
64508,0.312183,0.3495,0.407219,0.032642,0.345382,0.0,0.071,0.233166,0.345136,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,4pq5g8DHg6lX7gDqu4OGlg,NADA FÁCIL,Luccas Carlos,0.825575
