In [2]:
import pandas as pd
import numpy as np
from datetime import datetime

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth

In [3]:
import json
import re
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

from spotipy.oauth2 import SpotifyClientCredentials
import spotipy.util as util

import warnings
warnings.filterwarnings("ignore")

In [4]:
FILE_PATH = "C:/Users/user/OneDrive - National University of Singapore/Y3S2/IS3107 Data Engineering/Project/songs"

os.chdir(FILE_PATH)
mdp_files = os.listdir()

tracks_df = pd.DataFrame()

for f in mdp_files:
    json_file = json.loads(open(f).read())
    playlists = json_file["playlists"]
    sub_df = pd.json_normalize(playlists, record_path = "tracks", meta = ["name"])
    tracks_df = pd.concat([tracks_df, sub_df])

tracks_df.drop_duplicates(subset = "track_uri", inplace = True)

tracks_df

Unnamed: 0,pos,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms,album_name,name
0,0,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,Throwbacks
1,1,Britney Spears,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Toxic,spotify:album:0z7pVBGOD7HCIB7S8eLkLI,198800,In The Zone,Throwbacks
2,2,Beyoncé,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,spotify:artist:6vWDO969PvNqNYHIOW5v0m,Crazy In Love,spotify:album:25hVFAxTlDvXbx2X2QkUkE,235933,Dangerously In Love (Alben für die Ewigkeit),Throwbacks
3,3,Justin Timberlake,spotify:track:1AWQoqb9bSvzTjaLralEkT,spotify:artist:31TPClRtHm23RisEBtV3X7,Rock Your Body,spotify:album:6QPkyl04rXwTGlGlcYaRoW,267266,Justified,Throwbacks
4,4,Shaggy,spotify:track:1lzr43nnXAijIGYnCT8M8H,spotify:artist:5EvFsr3kj42KNv97ZEnqij,It Wasn't Me,spotify:album:6NmFmPX56pcLBOFMhIiKvF,227600,Hot Shot,Throwbacks
...,...,...,...,...,...,...,...,...,...
63215,17,NateWantsToBattle,spotify:track:4cKPzfOUIZWXhu3GDnESjL,spotify:artist:0Vb15td3iKkAzdGD5Sj9Ky,Reluctant Heroes,spotify:album:05XnUTo4HrVy5eLzqcD0lg,265039,Reluctant Heroes,game songs
63218,20,Dima Lancaster,spotify:track:0VICzaRdctD8XGfDstBjQY,spotify:artist:7tSqWbPIUCl6bTbPoxLXGn,Your Lie in April - Medley,spotify:album:4CKBpgzwIL7NPJrOvltCHj,368767,Your Lie in April - Medley,game songs
63220,22,Set It Off,spotify:track:0B4YuevrDNhh71edZrFiOs,spotify:artist:06bDwgCHeMAwhgI8il4Y5k,Uncontainable,spotify:album:0jGFC44cRkjE2b21aMEbIo,179266,Upside Down,game songs
63221,23,Set It Off,spotify:track:7i9q0VPbENCoYnJUsJhX9d,spotify:artist:06bDwgCHeMAwhgI8il4Y5k,Duality,spotify:album:5AKycvv2TUUh9h0KIletJt,242346,Duality,game songs


In [None]:
# Testing

# auth_manager = SpotifyClientCredentials(
#         client_id = "048ca22982da402e81d73f56c5b62c8f",
#         client_secret = "b710d3163f0747908258356f7f4324eb"
#     )
# sp = spotipy.Spotify(auth_manager = auth_manager)

# track_uri = "spotify:track:0UaMYEvWZi0ZqiDOoHU3YI"

# track_features = []

# audio_features = sp.audio_features(track_uri)[0]
# artist = sp.track(track_uri)["artists"][0]["id"]
# artist_genres = sp.artist(artist)["genres"] # List

# # Artist and track popularity
# artist_popularity = sp.artist(artist)["popularity"]
# track_popularity = sp.track(track_uri)["popularity"]

# track_features.append(audio_features)
# track_features[0]["artist_genres"] = artist_genres
# track_features[0]["artist_popularity"] = artist_popularity
# track_features[0]["track_popularity"] = track_popularity

# track_features = pd.DataFrame(track_features)
# track_features

In [5]:
def get_track_features(track_uri):
    
    auth_manager = SpotifyClientCredentials(
        client_id = "048ca22982da402e81d73f56c5b62c8f",
        client_secret = "b710d3163f0747908258356f7f4324eb"
    )
    sp = spotipy.Spotify(auth_manager = auth_manager)

    track_features = []

    # Audio features
    audio_features = sp.audio_features(track_uri)[0]
    
    # Year of release
    release_date_precision = sp.track(track_uri)["album"]["release_date_precision"]
    release_date = sp.track(track_uri)["album"]["release_date"]
    year = 0
    if release_date_precision == "year":
        rd = datetime.strptime(release_date, "%Y")
        year = rd.year
    elif release_date_precision == "month":
        rd = datetime.strptime(release_date, "%Y-%m")
        year = rd.year
    else:
        rd = datetime.strptime(release_date, "%Y-%m-%d")
        year = rd.year

    # Artist genre
    artist = sp.track(track_uri)["artists"][0]["id"]
    artist_genres = sp.artist(artist)["genres"] # List

    # Artist and track popularity
    artist_popularity = sp.artist(artist)["popularity"]
    track_popularity = sp.track(track_uri)["popularity"]

    track_features.append(audio_features)
    track_features[0]["year"] = year
    track_features[0]["artist_genres"] = artist_genres
    track_features[0]["artist_popularity"] = artist_popularity
    track_features[0]["track_popularity"] = track_popularity

    return track_features[0]

In [6]:
# Testing
get_track_features("spotify:track:0UaMYEvWZi0ZqiDOoHU3YI")

{'danceability': 0.904,
 'energy': 0.813,
 'key': 4,
 'loudness': -7.105,
 'mode': 0,
 'speechiness': 0.121,
 'acousticness': 0.0311,
 'instrumentalness': 0.00697,
 'liveness': 0.0471,
 'valence': 0.81,
 'tempo': 125.461,
 'type': 'audio_features',
 'id': '0UaMYEvWZi0ZqiDOoHU3YI',
 'uri': 'spotify:track:0UaMYEvWZi0ZqiDOoHU3YI',
 'track_href': 'https://api.spotify.com/v1/tracks/0UaMYEvWZi0ZqiDOoHU3YI',
 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/0UaMYEvWZi0ZqiDOoHU3YI',
 'duration_ms': 226864,
 'time_signature': 4,
 'year': 2005,
 'artist_genres': ['dance pop',
  'hip hop',
  'hip pop',
  'pop rap',
  'r&b',
  'rap',
  'urban contemporary',
  'virginia hip hop'],
 'artist_popularity': 74,
 'track_popularity': 68}

In [7]:
from tqdm import tqdm

In [8]:
# TEMP CODE TO SUBSET DF
# SMALLER DATASET FOR FASTER TESTING

tracks_df = tracks_df[:100]
tracks_df

Unnamed: 0,pos,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms,album_name,name
0,0,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,Throwbacks
1,1,Britney Spears,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Toxic,spotify:album:0z7pVBGOD7HCIB7S8eLkLI,198800,In The Zone,Throwbacks
2,2,Beyoncé,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,spotify:artist:6vWDO969PvNqNYHIOW5v0m,Crazy In Love,spotify:album:25hVFAxTlDvXbx2X2QkUkE,235933,Dangerously In Love (Alben für die Ewigkeit),Throwbacks
3,3,Justin Timberlake,spotify:track:1AWQoqb9bSvzTjaLralEkT,spotify:artist:31TPClRtHm23RisEBtV3X7,Rock Your Body,spotify:album:6QPkyl04rXwTGlGlcYaRoW,267266,Justified,Throwbacks
4,4,Shaggy,spotify:track:1lzr43nnXAijIGYnCT8M8H,spotify:artist:5EvFsr3kj42KNv97ZEnqij,It Wasn't Me,spotify:album:6NmFmPX56pcLBOFMhIiKvF,227600,Hot Shot,Throwbacks
...,...,...,...,...,...,...,...,...,...
96,5,Lovelyz,spotify:track:24psBRmEw3kHjBGZfl1dmb,spotify:artist:3g34PW5oNmDBxMVUTzx2XK,Ah-Choo,spotify:album:5ZJuawNI3RvxURIBtsDHs0,218474,Lovelyz8,korean
97,6,LEE HI,spotify:track:06L1apH8kLF47dbhZ4Zg9A,spotify:artist:7cVZApDoQZpS447nHTsNqu,BREATHE,spotify:album:1xnXVzinhfO4I9CzTocPfh,288992,SEOULITE,korean
98,7,LEE HI,spotify:track:2qWgqPdW1OiAP8KSBH1b93,spotify:artist:7cVZApDoQZpS447nHTsNqu,FXXK WIT US,spotify:album:1xnXVzinhfO4I9CzTocPfh,217861,SEOULITE,korean
99,8,Ailee,spotify:track:2PTf3zh9UUsgdxQ5b0eXg8,spotify:artist:3uGFTJ7JMllvhgGpumieHF,I Will Show You,spotify:album:7IyU3Bqm8ERDh7i4wq4OuN,234106,U&I,korean


In [9]:
tracks_feature_df = []

for track in tqdm(tracks_df["track_uri"]):
    track_features = get_track_features(track)
    tracks_feature_df.append(track_features)

tracks_feature_df = pd.DataFrame(tracks_feature_df)
tracks_feature_df

100%|██████████| 100/100 [01:05<00:00,  1.54it/s]


Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,id,uri,track_href,analysis_url,duration_ms,time_signature,year,artist_genres,artist_popularity,track_popularity
0,0.904,0.813,4,-7.105,0,0.1210,0.03110,0.006970,0.0471,0.810,...,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,226864,4,2005,"[dance pop, hip hop, hip pop, pop rap, r&b, ra...",74,68
1,0.774,0.838,5,-3.914,0,0.1140,0.02490,0.025000,0.2420,0.924,...,6I9VzXrHxO9rA9A5euc8Ak,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,https://api.spotify.com/v1/tracks/6I9VzXrHxO9r...,https://api.spotify.com/v1/audio-analysis/6I9V...,198800,4,2003,"[dance pop, pop]",81,83
2,0.664,0.758,2,-6.583,0,0.2100,0.00238,0.000000,0.0598,0.701,...,0WqIKmW4BTrj3eJFmnCKMv,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,https://api.spotify.com/v1/tracks/0WqIKmW4BTrj...,https://api.spotify.com/v1/audio-analysis/0WqI...,235933,4,2003,"[dance pop, pop, r&b]",87,21
3,0.892,0.714,4,-6.055,0,0.1410,0.20100,0.000234,0.0521,0.817,...,1AWQoqb9bSvzTjaLralEkT,spotify:track:1AWQoqb9bSvzTjaLralEkT,https://api.spotify.com/v1/tracks/1AWQoqb9bSvz...,https://api.spotify.com/v1/audio-analysis/1AWQ...,267267,4,2002,"[dance pop, pop]",81,77
4,0.853,0.606,0,-4.596,1,0.0713,0.05610,0.000000,0.3130,0.654,...,1lzr43nnXAijIGYnCT8M8H,spotify:track:1lzr43nnXAijIGYnCT8M8H,https://api.spotify.com/v1/tracks/1lzr43nnXAij...,https://api.spotify.com/v1/audio-analysis/1lzr...,227600,4,2000,"[pop rap, reggae fusion]",74,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.729,0.898,5,-1.029,1,0.0391,0.31500,0.000000,0.1070,0.517,...,24psBRmEw3kHjBGZfl1dmb,spotify:track:24psBRmEw3kHjBGZfl1dmb,https://api.spotify.com/v1/tracks/24psBRmEw3kH...,https://api.spotify.com/v1/audio-analysis/24ps...,218475,4,2015,"[k-pop, k-pop girl group]",43,0
96,0.609,0.246,8,-8.449,1,0.0376,0.78400,0.000000,0.0825,0.371,...,06L1apH8kLF47dbhZ4Zg9A,spotify:track:06L1apH8kLF47dbhZ4Zg9A,https://api.spotify.com/v1/tracks/06L1apH8kLF4...,https://api.spotify.com/v1/audio-analysis/06L1...,288993,4,2016,"[k-pop, pop]",67,0
97,0.763,0.658,5,-5.643,0,0.1470,0.02580,0.000000,0.0915,0.389,...,2qWgqPdW1OiAP8KSBH1b93,spotify:track:2qWgqPdW1OiAP8KSBH1b93,https://api.spotify.com/v1/tracks/2qWgqPdW1OiA...,https://api.spotify.com/v1/audio-analysis/2qWg...,217861,4,2016,"[k-pop, pop]",67,0
98,0.604,0.875,7,-1.995,0,0.0539,0.24400,0.000000,0.1140,0.219,...,2PTf3zh9UUsgdxQ5b0eXg8,spotify:track:2PTf3zh9UUsgdxQ5b0eXg8,https://api.spotify.com/v1/tracks/2PTf3zh9UUsg...,https://api.spotify.com/v1/audio-analysis/2PTf...,234107,4,2014,"[k-pop, korean pop]",59,0


In [10]:
# Remove duplicated `duration_ms` column prior to merge

tracks_df = tracks_df.drop(columns = ["duration_ms"])
tracks_complete_df = tracks_df.set_index("track_uri").join(tracks_feature_df.set_index("uri"))
tracks_complete_df

Unnamed: 0_level_0,pos,artist_name,artist_uri,track_name,album_uri,album_name,name,danceability,energy,key,...,type,id,track_href,analysis_url,duration_ms,time_signature,year,artist_genres,artist_popularity,track_popularity
track_uri,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,0,Missy Elliott,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,The Cookbook,Throwbacks,0.904,0.813,4,...,audio_features,0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,226864,4,2005,"[dance pop, hip hop, hip pop, pop rap, r&b, ra...",74,68
spotify:track:6I9VzXrHxO9rA9A5euc8Ak,1,Britney Spears,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Toxic,spotify:album:0z7pVBGOD7HCIB7S8eLkLI,In The Zone,Throwbacks,0.774,0.838,5,...,audio_features,6I9VzXrHxO9rA9A5euc8Ak,https://api.spotify.com/v1/tracks/6I9VzXrHxO9r...,https://api.spotify.com/v1/audio-analysis/6I9V...,198800,4,2003,"[dance pop, pop]",81,83
spotify:track:0WqIKmW4BTrj3eJFmnCKMv,2,Beyoncé,spotify:artist:6vWDO969PvNqNYHIOW5v0m,Crazy In Love,spotify:album:25hVFAxTlDvXbx2X2QkUkE,Dangerously In Love (Alben für die Ewigkeit),Throwbacks,0.664,0.758,2,...,audio_features,0WqIKmW4BTrj3eJFmnCKMv,https://api.spotify.com/v1/tracks/0WqIKmW4BTrj...,https://api.spotify.com/v1/audio-analysis/0WqI...,235933,4,2003,"[dance pop, pop, r&b]",87,21
spotify:track:1AWQoqb9bSvzTjaLralEkT,3,Justin Timberlake,spotify:artist:31TPClRtHm23RisEBtV3X7,Rock Your Body,spotify:album:6QPkyl04rXwTGlGlcYaRoW,Justified,Throwbacks,0.892,0.714,4,...,audio_features,1AWQoqb9bSvzTjaLralEkT,https://api.spotify.com/v1/tracks/1AWQoqb9bSvz...,https://api.spotify.com/v1/audio-analysis/1AWQ...,267267,4,2002,"[dance pop, pop]",81,77
spotify:track:1lzr43nnXAijIGYnCT8M8H,4,Shaggy,spotify:artist:5EvFsr3kj42KNv97ZEnqij,It Wasn't Me,spotify:album:6NmFmPX56pcLBOFMhIiKvF,Hot Shot,Throwbacks,0.853,0.606,0,...,audio_features,1lzr43nnXAijIGYnCT8M8H,https://api.spotify.com/v1/tracks/1lzr43nnXAij...,https://api.spotify.com/v1/audio-analysis/1lzr...,227600,4,2000,"[pop rap, reggae fusion]",74,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
spotify:track:24psBRmEw3kHjBGZfl1dmb,5,Lovelyz,spotify:artist:3g34PW5oNmDBxMVUTzx2XK,Ah-Choo,spotify:album:5ZJuawNI3RvxURIBtsDHs0,Lovelyz8,korean,0.729,0.898,5,...,audio_features,24psBRmEw3kHjBGZfl1dmb,https://api.spotify.com/v1/tracks/24psBRmEw3kH...,https://api.spotify.com/v1/audio-analysis/24ps...,218475,4,2015,"[k-pop, k-pop girl group]",43,0
spotify:track:06L1apH8kLF47dbhZ4Zg9A,6,LEE HI,spotify:artist:7cVZApDoQZpS447nHTsNqu,BREATHE,spotify:album:1xnXVzinhfO4I9CzTocPfh,SEOULITE,korean,0.609,0.246,8,...,audio_features,06L1apH8kLF47dbhZ4Zg9A,https://api.spotify.com/v1/tracks/06L1apH8kLF4...,https://api.spotify.com/v1/audio-analysis/06L1...,288993,4,2016,"[k-pop, pop]",67,0
spotify:track:2qWgqPdW1OiAP8KSBH1b93,7,LEE HI,spotify:artist:7cVZApDoQZpS447nHTsNqu,FXXK WIT US,spotify:album:1xnXVzinhfO4I9CzTocPfh,SEOULITE,korean,0.763,0.658,5,...,audio_features,2qWgqPdW1OiAP8KSBH1b93,https://api.spotify.com/v1/tracks/2qWgqPdW1OiA...,https://api.spotify.com/v1/audio-analysis/2qWg...,217861,4,2016,"[k-pop, pop]",67,0
spotify:track:2PTf3zh9UUsgdxQ5b0eXg8,8,Ailee,spotify:artist:3uGFTJ7JMllvhgGpumieHF,I Will Show You,spotify:album:7IyU3Bqm8ERDh7i4wq4OuN,U&I,korean,0.604,0.875,7,...,audio_features,2PTf3zh9UUsgdxQ5b0eXg8,https://api.spotify.com/v1/tracks/2PTf3zh9UUsg...,https://api.spotify.com/v1/audio-analysis/2PTf...,234107,4,2014,"[k-pop, korean pop]",59,0


In [11]:
tracks_complete_df = tracks_complete_df.drop(columns = ['pos', 'artist_name', 'artist_uri', 'track_name', 'album_uri', 'album_name', 'name', 'track_href', 'analysis_url', 'duration_ms', 'type'])

In [12]:
def feature_engineering(df):
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(df["artist_genres"].apply(lambda x: " ".join(x)))
    genre_df = pd.DataFrame(tfidf_matrix.toarray())
    genre_df.reset_index(drop = True, inplace = True)

    encoded_df = pd.get_dummies(df, columns = ["key", "mode"])
    encoded_df.reset_index(drop = True, inplace = True)

    scaler = MinMaxScaler()
    norm_cols = [
        "danceability",
        "energy",
        "loudness",
        "speechiness",
        "acousticness",
        "instrumentalness",
        "liveness",
        "valence",
        "tempo",
        "time_signature",
        "year",
        "artist_popularity",
        "track_popularity"
    ]
    scale_df = df[norm_cols].reset_index(drop = True)
    scaled_df = pd.DataFrame(scaler.fit_transform(scale_df), columns = norm_cols)

    final_df = pd.concat([genre_df, encoded_df, scaled_df], axis = 1)
    final_df["id"] = df["id"].values

    return final_df

In [18]:
# Testing
tracks_complete_engineered = feature_engineering(tracks_complete_df)
tracks_complete_engineered

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,year,artist_popularity,track_popularity
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.200841,0.034816,0.007033,0.029200,0.813702,0.433860,1.0,0.755102,0.721311,0.809524
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.186120,0.027873,0.025227,0.309551,0.950721,0.550401,1.0,0.714286,0.836066,0.988095
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.388013,0.002654,0.000000,0.047468,0.682692,0.260152,1.0,0.714286,0.934426,0.250000
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.242902,0.225076,0.000236,0.036392,0.822115,0.271508,1.0,0.693878,0.836066,0.916667
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.096320,0.062812,0.000000,0.411680,0.626202,0.230318,1.0,0.653061,0.721311,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.028601,0.352737,0.000000,0.115362,0.461538,0.443910,1.0,0.959184,0.213115,0.000000
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.025447,0.877938,0.000000,0.080121,0.286058,0.422669,1.0,0.979592,0.606557,0.000000
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.255521,0.028881,0.000000,0.093067,0.307692,0.132724,1.0,0.979592,0.606557,0.000000
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.059727,0.273228,0.000000,0.125432,0.103365,0.463454,1.0,0.938776,0.475410,0.000000


In [None]:
test_df = tracks_df[tracks_df["name"] == "Throwbacks"]
test_df

In [21]:
def vectorise_playlist(library, playlist):
    playlist_with_feature = library[library.index.isin(playlist["track_uri"].values)]
    nonplaylist_with_feature = library[~library.index.isin(playlist["track_uri"].values)]

    vector = playlist_with_feature.sum(axis = 0)
    return vector, nonplaylist_with_feature

In [22]:
# Testing
playlist_vector, nonplaylist = vectorise_playlist(tracks_complete_engineered, test_df)
playlist_vector

0                    0.0
1                    0.0
2                    0.0
3                    0.0
4                    0.0
                    ... 
tempo                0.0
time_signature       0.0
year                 0.0
artist_popularity    0.0
track_popularity     0.0
Length: 135, dtype: float64

In [23]:
nonplaylist_df = tracks_complete_engineered[tracks_complete_engineered.index.isin(nonplaylist["id"].values)]
nonplaylist_df["sim"] = cosine_similarity(nonplaylist.drop("id", axis = 1).values, playlist_vector.values.reshape(1, -1))[:,0]
recs = nonplaylist_df.sort_values("sim", ascending = False).head(10)