In [2]:
import numpy as np 
import pandas as pd
import pickle
import re

from joblib import load, dump
from sklearn.neighbors import NearestNeighbors

# allow up to 24 columns to be shown when a DataFrame is displayed
pd.set_option('display.max_columns', 24)

In [8]:
# train our model, joblib it to disk, and return it
def joblib_model(csv_path='./spotify_rock.csv', model_path="knn_model.joblib", n_neighbors=5):
    """
    Trains a nearest neighbor model on our Spotify dataset and joblib dumps it.

    Reads the training .csv, keeps every column from the fourth one onward
    as the feature matrix, fits a kd-tree NearestNeighbors model on it,
    writes the fitted model to disk, and returns the fitted model.

    input parameters:
    ----------------
    csv_path : str or path-like, default './spotify_rock.csv'
        Location of the training .csv; its first column is used as the index.
    model_path : str or path-like, default 'knn_model.joblib'
        File the fitted model is dumped to (compressed).
    n_neighbors : int, default 5
        Number of neighbors the model returns per query by default.

    output:
    ------
    model : the fitted NearestNeighbors instance
        Only the model is returned; the joblib file is a side effect.

    """

    # import data into a dataframe
    df = pd.read_csv(csv_path, index_col=0)

    # training dataframe for NN model: skip the first three columns
    # (assumes these are the non-numeric columns such as 'artists'/'song' -- TODO confirm)
    # selecting with a list of columns already yields a new frame, so no extra copy is needed
    df_train = df[df.columns[3:]]

    # create an instance of the Nearest Neighbors class
    model = NearestNeighbors(n_neighbors=n_neighbors, algorithm='kd_tree')

    # fit the model
    model.fit(df_train)

    # joblib the model
    dump(model, model_path, compress=True)

    return model

In [9]:
# train the model, dump it to disk, and keep the fitted instance for querying
model_knn = joblib_model()

In [12]:
# create test user query(ies)
# other examples to try:
#   ('Magic Carpet Ride', 'Steppenwolf')
#   ('Know Your Enemy', 'Rage Against The Machine')
query_song, query_artist = 'Jump', 'Van Halen'

# the `df` inside joblib_model() is local to that function, so load the
# dataset here; otherwise this cell fails with NameError on a fresh kernel
df = pd.read_csv('./spotify_rock.csv', index_col=0)

# boolean mask of the rows matching the requested song/artist
is_match = (df['song'] == query_song) & (df['artists'] == query_artist)
user_req = df.index[is_match]
assert len(user_req) > 0, "no rows found for {!r} by {!r}".format(query_song, query_artist)

submit = user_req[0]

# return model inputs based on user artist/song entry
# (select with the mask: user_req holds index *labels*, which .iloc would
#  interpret as row positions)
series = df.loc[is_match, df.columns[3:]].to_numpy()

# query model based on input
neighbors = model_knn.kneighbors(series, return_distance=False)

# checkout results of NN
neighbors[0]

array([57473, 13812, 67196, 57031, 34491])

In [20]:
# load the artist/song lookup table; the CSV's first column becomes the index
song_artists = pd.read_csv("./songs_artists.csv", index_col=0)

In [21]:
# preview the first rows of the lookup table
song_artists.head()

Unnamed: 0,artists,song
0,Bohemia Suburbana,Retiro
1,Bohemia Suburbana,Tengo Que Llegar
2,Bohemia Suburbana,Pero Nadie
3,Daniela Araújo,Guia-Me
4,Audio Adrenaline,Big House


In [22]:
# show every row whose artist is exactly 'The Beatles'
song_artists[song_artists['artists'].eq('The Beatles')]

Unnamed: 0,artists,song
85267,The Beatles,Twist And Shout - Remastered 2009
85268,The Beatles,I Saw Her Standing There - Remastered 2009
85269,The Beatles,Love Me Do - Remastered 2009
85270,The Beatles,All My Loving - Remastered 2009
85271,The Beatles,Till There Was You - Remastered 2009
...,...,...
85710,The Beatles,Penny Lane - Take 6 / Instrumental
85711,The Beatles,A Day In The Life - First Mono Mix
85712,The Beatles,Something - Take 39 / Instrumental / Strings Only
85713,The Beatles,Matchbox - Remastered 2009


In [23]:
# explore what the model returns as neighbors
# (iterate over the indices directly with a named variable: assigning to `_`
#  would overwrite IPython's last-output variable in the notebook namespace)
for neighbor_idx in neighbors[0]:
    display(song_artists[song_artists.index == neighbor_idx])

Unnamed: 0,artists,song
57473,Van Halen,Jump


Unnamed: 0,artists,song
13812,Cielo Razzo,Qué Se Yo


Unnamed: 0,artists,song
67196,The Black Crowes,Hotel Illness


Unnamed: 0,artists,song
57031,Live,All Over You


Unnamed: 0,artists,song
34491,Los Jaivas,Valparaíso


In [24]:
# load the model back from the joblib file
# NOTE: joblib files are pickle-based; only load files from a trusted source
classifier = load('./knn_model.joblib')

In [25]:
# send the same query through the model loaded from the joblib file
# (note: this rebinds `neighbors`, replacing the earlier in-memory result)
neighbors = classifier.kneighbors(series, return_distance=False)

In [26]:
# check the results of the reloaded model's query (first, and only, query row)
neighbors[0]

array([57473, 13812, 67196, 57031, 34491])