# Load Credentials
Load the Spotify API credentials from `id.json` into environment variables.

In [None]:
import json
import os

# Read the credentials stored in the local id.json file.
with open('id.json', 'r') as f:
  data = json.load(f)

# Export the two secrets under their SPOTIPY_* environment-variable names.
for env_name, json_key in (
    ("SPOTIPY_CLIENT_ID", "client_id"),
    ("SPOTIPY_CLIENT_SECRET", "client_secret"),
):
    os.environ[env_name] = data[json_key]

# The redirect URI is a fixed value rather than something read from the file.
os.environ["SPOTIPY_REDIRECT_URI"] = 'http://localhost:8080'

# Get all liked songs

In [None]:
import spotipy
from spotipy.oauth2 import SpotifyOAuth

scope = "user-library-read"
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(scope=scope))

results = []

offset = 0
page_size = 50
# A one-item request is enough to read the total number of saved tracks.
num_songs_total = sp.current_user_saved_tracks(limit=1)['total']

# The saved tracks come back in pages; walk through them page_size at a time.
while offset < num_songs_total:
    print(f"Processing batch: {offset}/{num_songs_total}", end='\r')
    curr_batch = sp.current_user_saved_tracks(limit=page_size, offset=offset)['items']
    for track in curr_batch:
        res = track['track']
        # Discard the per-market availability lists of the track and its album.
        # pop() with a default is used instead of `del` so that a record that
        # does not carry the key no longer aborts the whole download with a
        # KeyError.
        res.pop('available_markets', None)
        res['album'].pop('available_markets', None)
        results.append(res)

    offset += page_size

print("\nFinished loading liked songs")

# Get audio features for all liked songs

In [None]:
offset = 0
batch_size = 100
while offset < num_songs_total:
    print(f"Processing batch: {offset}/{num_songs_total}", end='\r')

    # The final batch may hold fewer than batch_size songs.
    curr_size = min(batch_size, num_songs_total - offset)

    # Gather the URIs of the songs in this batch and query their audio features.
    uri_list = [results[offset + k]["uri"] for k in range(curr_size)]
    curr_batch = sp.audio_features(uri_list)

    # Remove the unwanted fields from each returned entry, then attach what is
    # left to the song record at the same position.
    # NOTE(review): a None entry in curr_batch would make the `del` below raise
    # a TypeError - confirm whether the API can return one for some tracks.
    keys_i_hate = ['id', 'type', 'uri', 'track_href', 'analysis_url', 'duration_ms', 'time_signature']
    for i in range(curr_size):
        features = curr_batch[i]
        for key in keys_i_hate:
            del features[key]
        results[offset + i]["audio_features"] = features

    # Move on to the next batch.
    offset += batch_size

# Group Liked Songs using metadata

# Extract "Useful" features for song recommendations

*Useful features queried from Spotify*

- **1–2. Danceability / Energy:** the gradient seems to go from more to less quite reliably; the two features seem highly correlated.
- **3. Mode (major/minor):** very well clustered. It could be useful or not; this should be tried out before drawing a conclusion.
- **4. Speechiness:** all speech-heavy songs are in one place, so this can probably be used.
- **5–6. Acousticness / Instrumentalness:** the instrumental songs seem to be almost a strict superset; both features are well clustered.
- **7. Liveness:** seems well clustered.
- **8. Valence ("happiness"):** seems very important, although it is not the best clustered; it forms local clusters.

### Convert audio feature dict into numpy array for processing

In [None]:
import numpy as np
# Audio features that are kept, listed in the column order of the array below.
af_used = ['danceability', 'energy', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence']

# One row per liked song, one column per entry of af_used.
audio_features_arr = np.zeros((num_songs_total, len(af_used)))

for i in range(num_songs_total):
    song_features = results[i]['audio_features']
    for j, audio_feature in enumerate(af_used):
        audio_features_arr[i, j] = song_features[audio_feature]

## Standardize Data

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Fit the scaler on the feature matrix, then rescale that same matrix with it.
sc = StandardScaler()
std_data = sc.fit(audio_features_arr).transform(audio_features_arr)

# Fit a PCA with num_comp components on the scaled data and project the data
# onto those components.
num_comp = 2
pca = PCA(n_components=num_comp)
pca_result = pca.fit(std_data).transform(std_data)


In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
def graph_embeddings(results: np.ndarray, names: list, figsize=(5,5), feature_values=None):
    """Plot a 2-D embedding once per feature, colouring the points by that feature.

    Args:
        results: The embedding to plot. It must have exactly two columns,
            which become the x and y axes of every plot.
        names: Feature names. names[i] is the title of the i-th plot, and
            column i of the feature values supplies its colours.
        figsize: Figure size passed to plt.figure for every plot.
        feature_values: Optional 2-D array-like with one row per point in
            `results` and one column per entry of `names`. When omitted, the
            notebook-level `audio_features_arr` is used, which is what the
            function always did before this parameter existed.
    """
    if feature_values is None:
        # Backward-compatible default: colour by the notebook's feature matrix.
        feature_values = audio_features_arr
    feature_values = np.asarray(feature_values)

    df_data = pd.DataFrame(results, columns=['data-one', 'data-two'])

    for i, name in enumerate(names):
        # The 'label' column is overwritten on every pass so that each plot is
        # coloured by a different feature.
        df_data['label'] = feature_values[:, i]
        plt.figure(figsize=figsize)
        sns.scatterplot(
            x="data-one", y="data-two",
            hue="label",
            data=df_data,
            alpha=0.5
        )
        plt.title(name)
        plt.show()

In [None]:
from sklearn.manifold import TSNE
import warnings
# NOTE: this silences every warning from here on, not only the ones raised by
# the embedding code below.
warnings.filterwarnings('ignore')

# The estimator gets its own name. Rebinding `TSNE` to the instance shadowed
# the imported class, so running this cell a second time failed with
# "'TSNE' object is not callable".
# random_state pins the estimator's random number generator so that the
# embedding (and the plots built from it) is the same on every run.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` -
# confirm against the installed version.
tsne_model = TSNE(n_components=2, perplexity=50, n_iter=5000, learning_rate=200, random_state=42)
tsne_results = tsne_model.fit_transform(std_data)

In [None]:
# Draw the t-SNE embedding once per audio feature.
graph_embeddings(results=tsne_results, names=af_used)

In [None]:
import umap
# random_state fixes the seed so that the embedding is reproducible from one
# run of the notebook to the next.
umap_results = umap.UMAP(random_state=42).fit_transform(std_data)

In [None]:
# Print the dimensions of the UMAP embedding as a quick sanity check.
print(umap_results.shape)

In [None]:
# Draw the UMAP embedding once per audio feature.
graph_embeddings(results=umap_results, names=af_used)

In [None]:
from sklearn.manifold import MDS
# random_state fixes the seed so that the embedding is reproducible from one
# run of the notebook to the next.
mds_results = MDS(random_state=42).fit_transform(std_data)

In [None]:
# Print the dimensions of the MDS embedding as a quick sanity check.
print(mds_results.shape)

In [None]:
# Draw the MDS embedding once per audio feature.
graph_embeddings(results=mds_results, names=af_used)

In [None]:
def return_sugggestions_on_song(song_coords: list, num_songs_return: int) -> list:
    """Placeholder for the song-suggestion lookup; not implemented yet.

    Both arguments are currently ignored and None is returned, even though
    the return annotation declares a list.
    """
    return None

In [None]:
# TODO: this cell is an outline only and intentionally runs nothing yet.
#
# 1. Receive a song name.
# 2. Look up the coordinates of that song in the big numpy table.
# 3. Plug the coordinates into return_sugggestions_on_song, e.g.
#
#        print(return_sugggestions_on_song(song_coords, num_songs_return))
#
# The previous placeholder call passed the undefined name `TODO` and left out
# the required num_songs_return argument, so it raised a NameError and stopped
# "Run All" at this cell.