In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import json
import time
import os

from pyspark import SparkContext

import spotipy
import spotipy.oauth2 as oauth2
from spotipy.oauth2 import SpotifyOAuth,SpotifyClientCredentials

import pinecone

  from tqdm.autonotebook import tqdm


In [2]:
# Spotify API credentials are read from the environment instead of being
# hard-coded, so that secrets never end up in the saved notebook.
# SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET are the variable names spotipy
# documents for client-credential auth. A missing variable raises KeyError
# right here rather than surfacing later as an authentication failure.
# NOTE(review): the client id/secret that used to be committed in this cell
# should be treated as leaked and rotated in the Spotify developer dashboard.
spotify_details = {
    'Client_id': os.environ['SPOTIPY_CLIENT_ID'],
    'client_secret': os.environ['SPOTIPY_CLIENT_SECRET']
}

# Client-credentials flow: app-level access only, no user login involved.
auth_manager = SpotifyClientCredentials(client_id=spotify_details['Client_id'], client_secret=spotify_details['client_secret'])
sp = spotipy.client.Spotify(auth_manager=auth_manager)

In [3]:
# sp.tracks(["2qWgqPdW1OiAP8KSBH1b93", "2PTf3zh9UUsgdxQ5b0eXg8", "7tIJDktakabGoHjwTTa35W"])['tracks'][0]
# Quick smoke test of the client: run a text search and list each hit as
# "<name> - <release year>" followed by its Spotify URI.
ls = sp.search('old road')['tracks']['items']
for track in ls:
    print(f"{track['name']} - {track['album']['release_date'][:4]}", track['uri'])

# ls

Old Town Road - 2019 spotify:track:0F7FA14euOIX8KcbEturGH
Old Road - 2020 spotify:track:6BlGcNxBiaKR0BL81Wo4Xn
Old Town Road - Remix - 2019 spotify:track:2YpeDb67231RjR0MgVLzsG
Old Road - 2022 spotify:track:7snpxj9bTICE2VIlOzBLRP
Old Dirt Roads - 2023 spotify:track:0ayd4fDb6rMZl9uqCY3Hzl
Old Roads and Old Friends - 2023 spotify:track:5YsUIieJ340lBw8eQuG8Hf
Old Town Road - 2019 spotify:track:07cQIm99dnNSKs6skathg0
Old Road - 1969 spotify:track:6zS6G60qHcCOtmRQa427Ja
Old Road - 2022 spotify:track:18LumWwIv35eEuJ7bjccv0
On the Road Again - 2001 spotify:track:3MpK9vnxxgYvh0CNeGvx6G


In [4]:
# data_dict = {}

# with open("mpd.slice.0-999.json") as file:
# 	data = json.load(file)

# for playlist in tqdm(data['playlists']):
#     pid = playlist['pid']
#     data_dict[pid] = []
#     for tracks in playlist['tracks']:
#         data_dict[pid].append(tracks['track_uri'].split(':')[-1])

# with open('comprsd.json', 'w') as fp:
#     json.dump(data_dict, fp)

In [4]:
def get_slice_tracks(input_dir):
    """Collect the unique track IDs referenced by the playlist files in a directory.

    Every file in ``input_dir`` whose name ends in ``.json`` is parsed. Each
    file must hold a top-level ``'playlists'`` list whose entries carry a
    ``'tracks'`` list; the track ID is the last ``:``-separated component of
    each track's ``'track_uri'``.

    Args:
        input_dir: Path of the directory to scan (not recursive). Files that
            do not end in ``.json`` are ignored.

    Returns:
        list[str]: The distinct track IDs, sorted so that repeated runs over
        the same data yield the same order (and therefore the same batches
        downstream).

    Raises:
        OSError: If the directory or one of its JSON files cannot be read.
        json.JSONDecodeError: If a ``.json`` file is not valid JSON.
        KeyError: If a file lacks the expected 'playlists'/'tracks'/'track_uri' keys.
    """
    # De-duplicate while reading instead of materialising every occurrence first.
    unique_ids = set()

    for file_name in tqdm(sorted(os.listdir(input_dir))):
        if not file_name.endswith('.json'):
            continue

        # Explicit encoding: do not depend on the platform default.
        with open(os.path.join(input_dir, file_name), encoding='utf-8') as file:
            data = json.load(file)

        for playlist in data['playlists']:
            unique_ids.update(
                track['track_uri'].split(':')[-1] for track in playlist['tracks']
            )

    return sorted(unique_ids)

# Gather the distinct track IDs from the playlist files under "data10" and
# report how many there are.
slice_dir_tracks = get_slice_tracks(input_dir="data10")
print(f"{len(slice_dir_tracks)} Unique Tracks")

  0%|                                                    | 0/11 [00:00<?, ?it/s]

100%|███████████████████████████████████████████| 11/11 [00:01<00:00,  7.96it/s]

170089 Unique Tracks





In [5]:
def get_features(tracks, window_size=50):
    """Fetch and rescale Spotify audio/track features for a list of track IDs.

    The IDs are processed in consecutive batches of ``window_size``. For each
    batch the module-level ``sp`` client is asked for audio features and track
    metadata; entries returned as ``None`` are discarded and the two results
    are inner-joined on ``'id'``.

    Rescaling applied:
        * ``release_year``: (year - 1950) / 70, year taken from the album release date
        * ``popularity``: value / 100
        * ``explicit``: cast to int
        * ``loudness``: (value + 60) / 60
        * ``tempo``: value / 300
        * ``time_signature``: (value - 3) / 4
        * ``key``: 2 ** ((value - 12) / 12), i.e. a frequency ratio relative to the max key

    A batch that raises is reported (with the error) and skipped, so one bad
    request does not abort a long run; KeyboardInterrupt is not intercepted.

    Args:
        tracks: Sequence of Spotify track IDs (must support ``len`` and slicing).
        window_size: Number of IDs sent per request batch.

    Returns:
        pandas.DataFrame: One row per track present in both API responses.
        The index is not reset across batches. Empty if nothing was fetched.
    """
    batch_frames = []
    dropped_row_counter = 0

    for i in tqdm(range(0, len(tracks), window_size)):
        batch = tracks[i:i+window_size]

        try:
            audio_features = pd.DataFrame([t for t in sp.audio_features(batch) if t is not None])
            track_features = pd.DataFrame([t for t in sp.tracks(batch)['tracks'] if t is not None])

            track_features['release_year'] = track_features['album'].apply(lambda x: (int(x['release_date'][:4]) - 1950)/70)
            track_features['popularity'] = track_features['popularity'] / 100
            track_features['explicit'] = track_features['explicit'].astype(int)
            track_features = track_features[['explicit', 'popularity', 'release_year', 'id']]

            audio_features = audio_features.drop(columns=['type', 'uri', 'track_href', 'analysis_url', 'duration_ms'])
            audio_features['loudness'] = (audio_features['loudness'] + 60) / 60
            audio_features['tempo'] = audio_features['tempo'] / 300
            audio_features['time_signature'] = (audio_features['time_signature'] - 3) / 4
            # Converting key to frequency ratio wrt max key
            audio_features['key'] = np.power(2.0, (audio_features['key'] - 12) / 12)

            all_fts = pd.merge(audio_features, track_features, on='id', how='inner')
            batch_frames.append(all_fts)

            # Compare against the real batch length: the final batch is
            # usually shorter than window_size and must not count as a loss.
            if all_fts.shape[0] != len(batch):
                dropped = len(batch) - all_fts.shape[0]
                dropped_row_counter += dropped
                print(f"{dropped} rows dropped in section {i}-{i+window_size}")
        except Exception as exc:
            # Best effort: keep going, but say why the batch failed.
            print(f"{i}-{i+window_size} section failed: {exc!r}")

    if dropped_row_counter:
        print(f"{dropped_row_counter} rows dropped in total")

    if not batch_frames:
        return pd.DataFrame()

    # Single concat at the end instead of growing a frame inside the loop.
    return pd.concat(batch_frames)

In [6]:
# slice10_fts = get_features(slice_dir_tracks).reset_index(drop=True)

In [8]:
# slice10_fts
#  first_slice_fts.to_csv('first_slice_fts.csv', index=False)

# Search method

In [9]:
# Load the cached feature table and mean-centre every column except 'id'.
# `mean_vector` holds the per-column means that are subtracted.
db = pd.read_csv('first_slice_fts.csv')
mean_vector = db.drop(columns='id').mean()
feature_columns = [col for col in db.columns if col != 'id']
db[feature_columns] = db[feature_columns] - mean_vector

In [10]:
# Sanity check after centring: table dimensions and the first three rows.
print(db.shape)
db.head(n=3)

(34441, 16)


Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,time_signature,explicit,popularity,release_year
0,-0.010724,0.116427,-0.022902,0.010473,0.334863,-0.045186,0.046229,-0.076971,0.005533,0.477891,0.24266,5iXswDMklrAlNLEJ4XCHTY,0.018568,-0.182167,-0.160615,-0.233241
1,-0.164724,0.332427,-0.160591,0.074489,0.334863,-0.033186,-0.263742,-0.068361,0.181533,-0.304109,0.1745,4Y2glvLjQGOb4dXnwm1hQf,0.018568,-0.182167,0.349385,0.038188
2,0.199276,-0.253573,-0.190322,-0.043077,0.334863,0.071714,0.371229,-0.055371,0.171533,0.311891,-0.082437,7t8AhVb1nd1SOhutQgM62H,0.018568,-0.182167,-0.280615,0.081045


In [11]:
def get_vector(vec):
    """Return the feature values of ``vec`` as a numpy array in a fixed column order.

    Args:
        vec: Label-indexable object (e.g. a pandas Series for one track) that
            contains all fifteen feature labels listed below. Other labels,
            such as 'id', are ignored.

    Returns:
        numpy.ndarray: The selected values, ordered as in ``feature_order``.
    """
    feature_order = [
        'danceability', 'energy', 'key', 'loudness', 'mode',
        'speechiness', 'acousticness', 'instrumentalness', 'liveness',
        'valence', 'tempo', 'time_signature', 'explicit', 'popularity',
        'release_year',
    ]

    selected = vec[feature_order]
    return np.array(selected)

# Pinecone

In [12]:
# Pinecone credentials are read from the environment instead of being
# hard-coded, so that the API key never ends up in the saved notebook.
# A missing PINECONE_API_KEY raises KeyError here; the environment name falls
# back to the value this notebook previously used.
# NOTE(review): the API key that used to be committed in this cell should be
# treated as leaked and revoked in the Pinecone console.
PINECONE_API_KEY = os.environ['PINECONE_API_KEY']
PINECONE_ENV = os.environ.get('PINECONE_ENVIRONMENT', 'gcp-starter')

pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENV
)

In [13]:
INDEX_NAME = 'vector-similarity-search-spotify'
# Vector dimension = number of columns in `db` minus one
# (presumably the non-feature 'id' column — confirm if the table schema changes).
DIMS = len(db.columns) - 1

# Create the cosine-similarity index only if it is not already listed.
if INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(name=INDEX_NAME, dimension=DIMS, metric='cosine')

# gRPC handle used for the upsert and query calls.
pinecone_index = pinecone.GRPCIndex(INDEX_NAME)

In [14]:
def upload_pinecone(df, index, batch_size=100):
    """Upsert every row of ``df`` into ``index`` as an ``{'id', 'values'}`` vector.

    Rows are taken by position in chunks of ``batch_size``, so the frame's
    index labels do not matter (a filtered or re-indexed frame uploads fully).
    Each row's vector is built with ``get_vector`` and sent as a plain list of
    floats. A row whose vector cannot be built is skipped and counted, and the
    count is printed at the end. Empty batches are not sent.

    Args:
        df: DataFrame with an ``'id'`` column plus the columns ``get_vector`` reads.
        index: Object exposing ``upsert(list_of_dicts)`` (e.g. a Pinecone index).
        batch_size: Maximum number of vectors per ``upsert`` call.

    Raises:
        KeyError: If ``df`` has no ``'id'`` column.
    """
    skipped = 0

    for start in tqdm(range(0, df.shape[0], batch_size)):
        chunk = df.iloc[start:start + batch_size]

        upload_list = []
        for row_id, (_, row) in zip(chunk['id'], chunk.iterrows()):
            try:
                values = [float(v) for v in get_vector(row)]
            except (KeyError, TypeError, ValueError):
                # Missing feature column or non-numeric value: skip this row
                # but keep count instead of hiding the problem.
                skipped += 1
                continue
            upload_list.append({'id': row_id, 'values': values})

        if upload_list:
            index.upsert(upload_list)

    if skipped:
        print(f"{skipped} rows skipped during upload (feature vector could not be built)")
        
# upload_pinecone(db, pinecone_index)

In [15]:
# Similarity lookup by stored vector id: request the 20 best-scoring matches.
pinecone_index.query(
    id='5Q0Nhxo0l2bP3pNjpGJwV1',
    top_k=20,
)

{'matches': [{'id': '5Q0Nhxo0l2bP3pNjpGJwV1',
              'metadata': {},
              'score': 0.99861234,
              'sparse_values': {'indices': [], 'values': []},
              'values': []},
             {'id': '4JQSMg83F8qYwSBt5xOXsQ',
              'metadata': {},
              'score': 0.9883547,
              'sparse_values': {'indices': [], 'values': []},
              'values': []},
             {'id': '2Bs4jQEGMycglOfWPBqrVG',
              'metadata': {},
              'score': 0.9817177,
              'sparse_values': {'indices': [], 'values': []},
              'values': []},
             {'id': '6JY1IdkZGeIcPegKxjSKeb',
              'metadata': {},
              'score': 0.98066634,
              'sparse_values': {'indices': [], 'values': []},
              'values': []},
             {'id': '6RUKPb4LETWmmr3iAEQktW',
              'metadata': {},
              'score': 0.9788617,
              'sparse_values': {'indices': [], 'values': []},
              'values'