In [1]:
from sklearn.neighbors import NearestNeighbors, NeighborhoodComponentsAnalysis
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import pickle
import re

In [11]:
def load_clean_data():
    """
    Loads data and performs basic cleaning / feature engineering.
    Returns one raw DataFrame, and one cleaned DataFrame.
    
    Example
    -------
    > raw, clean = load_clean_data()
    > type(raw), raw.shape, type(clean), df.clean
      (pandas.core.frame.DataFrame,
      (130326, 17),
      pandas.core.frame.DataFrame,
      (130326, 15))
    """
    def clean_text(doc):
        multi_ws = '[ ]{2,}'
        non_alpha = '[^a-zA-Z]'
        empty_start = '^ '
        empty_end = ' $'

        doc = re.sub(non_alpha, ' ', doc)
        doc = re.sub(multi_ws, ' ', doc)
        doc = re.sub(empty_start, '', doc)
        doc = re.sub(empty_end, '', doc)
        return doc

    data = pd.read_csv("Data/spotify2021.csv")
    data['artists'] = data['artists'].apply(clean_text)
    data['name'] = data['name'].apply(clean_text)

    df = data.drop(columns=['artists', 'name', 'year', 'release_date', 'duration_ms', 'popularity'])
    df = df[~df.id.duplicated(keep='first')]

    to_bins = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'valence']

    for col in to_bins:
        df[col] = round(df[col] * 4)

    def is_live(n):
        return (1 if n > 0.75 else 0)

    def round_10(n):
        a = (n // 10) * 10
        b = a + 10
        return (b if n - a > b - n else a)

    df['tempo'] = df['tempo'].apply(round_10)
    df['liveness'] = df['liveness'].apply(is_live)
    df['speechiness'] = round(df['speechiness'] * 10 / 3)
    df['loudness'] = round(df['loudness'])

    return data, df

def suggest(song_id=None, n_suggestions=1, output_format='records'):
    """
    Suggests Spotify song(s) given one song id.
    
    Parameters
    ----------
    song_id: str
        Song id from which to base suggestions
    
    n_suggestions: int {1, 3, 5, 10, 15, 20}, default 1
        Number of songs to suggest

    output_format: str, default 'records'
        Output format of the JSON string:
            ‘split’ : dict like {‘index’ -> [index], ‘columns’ -> [columns], ‘data’ -> [values]}
            ‘records’ : list like [{column -> value}, … , {column -> value}]
            ‘index’ : dict like {index -> {column -> value}}
            ‘columns’ : dict like {column -> {index -> value}}
            ‘values’ : just the values array
            ‘table’ : dict like {‘schema’: {schema}, ‘data’: {data}}
        Note: Directly passed to pandas.DataFrame.to_json(orient=output_format) 
    
    Example
    -------
    > example = df[df.track_id == '6Wosx2euFPMT14UXiWudMy']
    > example.artist_name
      R3HAB
    > example.track_name
      Radio Silence
    > suggestion = suggest('3ajZwXiT7qpanVm5DcvcQF', 3)
    > suggestion
             artists	      id                        name
       9117	 Joni Mitchell	  00xemFYjQNRpOlPhVaLAHa	Urge For Going Live...
      42142	 Carmen Miranda	  2heQBczLrbtAgOtkDk831k	Samba Rasgado
      47980	 Amalia Mendoza	  4QJsxMItrRbvn05gGuf0CZ	chame a M la Culpa

    """
    song = df[df.id == song_id].drop(columns='id').values

    loaded_model = pickle.load(open(f'./model/model_{n_suggestions}_suggestions.sav', 'rb'))

    output = data.iloc[loaded_model.kneighbors(song)[1][0][1:]]

    drop_cols = ['acousticness', 'danceability', 'duration_ms', 'energy',
                 'explicit', 'instrumentalness', 'key', 'liveness', 'loudness',
                 'mode', 'popularity', 'release_date', 'speechiness', 'tempo',
                 'valence', 'year']
    
    suggestion = output.drop(columns = drop_cols)#.to_json(orient=output_format)

    return suggestion

In [12]:
data, df = load_clean_data()
print(df.shape)
df.head()

(172230, 13)


Unnamed: 0,acousticness,danceability,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
0,4.0,2.0,1.0,0,0cS0A1fUEUd1EW3FcF8AEI,0.0,5,0,-13.0,0,0.0,150.0,3.0
1,3.0,3.0,2.0,0,0hbkKFIJm7Z05H8Zl9w30f,0.0,5,0,-7.0,0,0.0,90.0,4.0
2,4.0,3.0,1.0,0,11m7laMUgmOKqI3oYzuhne,0.0,0,0,-12.0,1,1.0,100.0,3.0
3,0.0,3.0,3.0,0,19Lc5SfJJ5O1oaxY0fpwfh,3.0,2,0,-7.0,1,0.0,130.0,0.0
4,1.0,3.0,3.0,1,2hJjbsLCytGsnAHfdsLejp,0.0,10,0,-6.0,0,0.0,120.0,1.0


In [10]:
data.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.991,Mamie Smith,0.598,168333,0.224,0,0cS0A1fUEUd1EW3FcF8AEI,0.000522,5,0.379,-12.628,0,Keep A Song In Your Soul,12,1920,0.0936,149.976,0.634,1920
1,0.643,Screamin Jay Hawkins,0.852,150200,0.517,0,0hbkKFIJm7Z05H8Zl9w30f,0.0264,5,0.0809,-7.261,0,I Put A Spell On You,7,1920-01-05,0.0534,86.889,0.95,1920
2,0.993,Mamie Smith,0.647,163827,0.186,0,11m7laMUgmOKqI3oYzuhne,1.8e-05,0,0.519,-12.098,1,Golfing Papa,4,1920,0.174,97.6,0.689,1920
3,0.000173,Oscar Velazquez,0.73,422087,0.798,0,19Lc5SfJJ5O1oaxY0fpwfh,0.801,2,0.128,-7.311,1,True House Music Xavier Santos Carlos Gomix Remix,17,1920-01-01,0.0425,127.997,0.0422,1920
4,0.295,Mixe,0.704,165224,0.707,1,2hJjbsLCytGsnAHfdsLejp,0.000246,10,0.402,-6.036,0,Xuniverxe,2,1920-10-01,0.0768,122.076,0.299,1920


In [13]:
suggest('3ajZwXiT7qpanVm5DcvcQF', 3)

Unnamed: 0,artists,id,name
9117,Joni Mitchell,00xemFYjQNRpOlPhVaLAHa,Urge For Going Live at The nd Fret Philadelphi...
42142,Carmen Miranda,2heQBczLrbtAgOtkDk831k,Samba Rasgado
47980,Amalia Mendoza,4QJsxMItrRbvn05gGuf0CZ,chame a M la Culpa


In [5]:
# # create and save models
# num = [1, 3, 5, 10, 15, 20]

# for i in num:
#     nn = NearestNeighbors(algorithm='brute', n_neighbors=i+1)
#     model = nn.fit(df.drop(columns='id'))
#     pickle.dump(model, open(f'model_{i}_suggestions.sav', 'wb'))

In [None]:
df_20 = pd.read_csv('./data/')

In [6]:
# CLIENT_ID = 'c7d6a5303acf47a99dacc2ac728ae4b0'
# CLIENT_SECRET = 'e7698c7069ab403f8b59c8a4719f8467'
# AUTH_URL = 'https://accounts.spotify.com/api/token'
# # POST
# auth_response = requests.post(AUTH_URL, {
#     'grant_type': 'client_credentials',
#     'client_id': CLIENT_ID,
#     'client_secret': CLIENT_SECRET,
# })
# auth_response_data = auth_response.json()
# # save the access token
# access_token = auth_response_data['access_token']
# headers = {'Authorization': f'Bearer {access_token}'}

# track_id = '6y0igZArWVi6Iz0rj35c1Y'
# request = f'https://api.spotify.com/v1/tracks/{track_id}'

# response = requests.get(request, headers=headers)
# response = response.json()
# print(response)

In [7]:
# api trials
# import spotipy
# from spotipy.oauth2 import SpotifyClientCredentials

# %env SPOTIPY_CLIENT_ID=365f1df0becc4069ab821f255250e97a
# %env SPOTIPY_CLIENT_SECRET=61aa6e153f314cb2959b1de5ceca6a1b

# auth_manager = SpotifyClientCredentials()
# sp = spotipy.Spotify(auth_manager=auth_manager)

# first_ex = sp.audio_features(tracks=['19Lc5SfJJ5O1oaxY0fpwfh'])
# first_ex[0]['time_signature']

# data['time_signature'] = np.nan
# ids = data.id.values.tolist()
# timesigs = []
# for track_id in ids:
#     timesig = sp.audio_features(track_id)[0]['time_signature']
#     timesigs.append(timesig)