In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import json
import time
import os
import shutil


import spotipy
import spotipy.oauth2 as oauth2
from spotipy.oauth2 import SpotifyOAuth,SpotifyClientCredentials

import pinecone
from urllib.request import urlretrieve

  from tqdm.autonotebook import tqdm


In [97]:
# Spotify API credentials.
# The values are read from the environment so that secrets never have to be
# typed into (and saved with) the notebook. When the variables are unset the
# entries fall back to empty strings, exactly as in the original template.
# NOTE(review): SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET are the variable names
# documented by spotipy - confirm they match your deployment.
spotify_details = {
    'Client_id': os.environ.get('SPOTIPY_CLIENT_ID', ''),
    'client_secret': os.environ.get('SPOTIPY_CLIENT_SECRET', '')
}

# Client-credentials flow: app-level access only, no user login involved.
auth_manager = SpotifyClientCredentials(client_id=spotify_details['Client_id'], client_secret=spotify_details['client_secret'])
# `sp` is the shared Spotify client used by the cells below.
sp = spotipy.client.Spotify(auth_manager=auth_manager)

In [118]:
# Code to use spotify search
# For every row of the CSV, look the song up on Spotify by title + artist and
# store the first hit's metadata in new columns, then write the CSV back.
file = "songs.csv"
# Pass the variable itself: the previous f'file' was a literal string and tried
# to open a file literally called "file".
df = pd.read_csv(file)
new_cols = ['spotify_name', 'uri', 'preview_url', 'popularity', 'duration_ms', 'explicit', 'ext_url', 'id']
# Pre-create the output columns so rows without a match keep an empty value.
for col in new_cols:
    df[col] = ''

for idx in tqdm(df.index):
    artist_name = df.loc[idx, 'Artist']
    track_name = df.loc[idx, 'Title']

    # Spotify API Call #
    results = sp.search(q=f"track:{track_name} artist:{artist_name}", type="track", limit=1)
    try:
        track = results['tracks']['items'][0]
    except (KeyError, IndexError, TypeError):
        print(f"Index {idx} of {file} returned no results")
        # Skip this row. Falling through would reuse `track` from the previous
        # iteration and write another song's metadata into this row.
        continue
    # ---------------- #

    try:
        df.loc[idx, 'duration_ms'] = track['duration_ms']
        df.loc[idx, 'explicit'] = track['explicit']
        df.loc[idx, 'ext_url'] = track['external_urls']['spotify']
        df.loc[idx, 'id'] = track['id']
        df.loc[idx, 'spotify_name'] = track['name']
        df.loc[idx, 'popularity'] = track['popularity']
        df.loc[idx, 'uri'] = track['uri']
        df.loc[idx, 'preview_url'] = track['preview_url']
    except (KeyError, TypeError):
        # A field was absent from the result; fields assigned before the
        # failure are kept, the remaining ones stay empty.
        print(f"Result has missing value, index {idx} of {file}")

# Overwrites the input file in place with the enriched table.
df.to_csv(f'{file}', index=False)
print(f"{file} completed")

In [189]:
# Code to download audio files
# Saves each track's preview clip as audio/<track_id>.mp3.
# NOTE(review): `ss_df` is not created in this cell; in this file it is loaded
# by `pd.read_csv('ss_links.csv')` in the preprocessing cell, which must have
# been run first.
# exist_ok=True lets the cell be re-run without failing on an existing folder.
os.makedirs('audio', exist_ok=True)
for idx in tqdm(ss_df.index):
    track_id = ss_df.loc[idx, 'track_id']
    url = ss_df.loc[idx, 'preview_url']
    # A missing preview shows up as NaN/None/'' and would make urlretrieve
    # raise part-way through a multi-hour run, so report it and move on.
    if not isinstance(url, str) or not url:
        print(f"No preview_url for track {track_id}, skipping")
        continue
    urlretrieve(url, f"audio/{track_id}.mp3")

100%|█████████████████████████████████████| 5600/5600 [2:13:55<00:00,  1.43s/it]


In [3]:
# Code to preprocess (normalise) audio features
# Loads ss_links.csv, rescales selected feature columns, builds a stringified
# per-row feature vector in `ft_vec`, and writes the result to dataset.csv.
ss_df = pd.read_csv('ss_links.csv')

ft_cols = [
    'track_popularity', 'track_album_release_date', 'danceability', 'energy',
    'key', 'loudness', 'mode', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo',
]

# Column -> elementwise rescaling applied to that column.
scalers = {
    'track_popularity': lambda value: value/100,
    # First four characters of the date string are taken as the year.
    'track_album_release_date': lambda value: (int(value[:4]) - 1950)/70,
    'loudness': lambda value: (value+60)/60,
    'tempo': lambda value: value/300,
    'key': lambda value: np.power(2, (value-12)/12),
}
for column, scale in scalers.items():
    ss_df[column] = ss_df[column].apply(scale)

# One string per row: the list of that row's feature values, in ft_cols order.
ss_df['ft_vec'] = [
    str(list(ss_df.loc[row_label, ft_cols]))
    for row_label in ss_df.index
]

ss_df.to_csv('dataset.csv')

In [None]:
# Code to fetch features
def get_features(tracks, window_size=50):
    """Fetch and normalise Spotify audio and track features for track IDs.

    The IDs are processed in consecutive chunks of ``window_size``. For each
    chunk the audio features and the track metadata are requested through the
    module-level ``sp`` client, rescaled, and inner-joined on ``id``.

    Parameters
    ----------
    tracks : sequence of str
        Track identifiers; sliced with ``tracks[i:i+window_size]`` and passed
        to ``sp.audio_features`` and ``sp.tracks``.
    window_size : int, default 50
        Number of IDs sent per request.
        NOTE(review): Spotify's "Get Several Tracks" endpoint is documented as
        accepting at most 50 IDs - confirm before raising this value.

    Returns
    -------
    pandas.DataFrame
        One row per track present in both responses, holding the rescaled audio
        features plus ``explicit``, ``popularity``, ``release_year`` and
        ``id``. Each chunk keeps its own index, so index labels repeat across
        chunks. An empty DataFrame is returned when nothing was fetched.

    Notes
    -----
    A chunk that raises is reported and skipped (best effort); the remaining
    chunks are still processed.
    """
    dropped_row_counter = 0
    chunks = []  # collected per-chunk frames, concatenated once at the end

    for i in tqdm(range(0, len(tracks), window_size)):
        batch = tracks[i:i+window_size]

        try:
            # The API returns None for IDs it cannot resolve; filter them out.
            audio_features = pd.DataFrame([t for t in sp.audio_features(batch) if t is not None])
            track_features = pd.DataFrame([t for t in sp.tracks(batch)['tracks'] if t is not None])

            # Release year taken from the first four characters of the album's
            # release_date, rescaled with 1950 as the origin and a 70-year span.
            track_features['release_year'] = track_features['album'].apply(
                lambda album: (int(album['release_date'][:4]) - 1950) / 70
            )
            track_features['popularity'] = track_features['popularity'] / 100
            track_features['explicit'] = track_features['explicit'].astype(int)
            track_features = track_features[['explicit', 'popularity', 'release_year', 'id']]

            audio_features = audio_features.drop(
                columns=['type', 'uri', 'track_href', 'analysis_url', 'duration_ms']
            )
            audio_features['loudness'] = (audio_features['loudness'] + 60) / 60
            audio_features['tempo'] = audio_features['tempo'] / 300
            audio_features['time_signature'] = (audio_features['time_signature'] - 3) / 4
            # Converting key to frequency ratio wrt max key
            audio_features['key'] = np.power(2, (audio_features['key'] - 12) / 12)

            all_fts = pd.merge(audio_features, track_features, on='id', how='inner')
            chunks.append(all_fts)

            # Compare with the number of IDs actually requested: the final
            # chunk is usually shorter than window_size and must not be
            # reported as having dropped rows.
            missing = len(batch) - all_fts.shape[0]
            if missing != 0:
                dropped_row_counter += missing
                print(f"{missing} rows dropped in section {i}-{i+window_size}")
        except Exception as err:
            # Best effort: report why the chunk failed and carry on. Catching
            # Exception (not a bare except) keeps Ctrl-C able to stop the loop.
            print(f"{i}-{i+window_size} section failed: {err!r}")

    if dropped_row_counter:
        print(f"{dropped_row_counter} rows dropped in total")

    # pd.concat rejects an empty list, so fall back to an empty frame.
    return pd.concat(chunks) if chunks else pd.DataFrame()