In [2]:
# Import the required modules
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px 
%matplotlib inline

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# tensorflow
import tensorflow as tf

# Metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve, roc_auc_score

# Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv("Resources/clean_spotify.csv", encoding='latin1')

# Review the DataFrame
df.head(20)

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic
5,5,01MVOl9KtVTNfFiBU9I7dc,Tyrone Wells,Days I Will Remember,Days I Will Remember,58,214240,False,0.688,0.481,6,-8.807,1,0.105,0.289,0.0,0.189,0.666,98.017,4,acoustic
6,6,6Vc5wAMmXdKIAM7WUoEb7N,A Great Big World;Christina Aguilera,Is There Anybody Out There?,Say Something,74,229400,False,0.407,0.147,2,-8.822,1,0.0355,0.857,3e-06,0.0913,0.0765,141.284,3,acoustic
7,7,1EzrEOXmMH3G43AXT1y7pA,Jason Mraz,We Sing. We Dance. We Steal Things.,I'm Yours,80,242946,False,0.703,0.444,11,-9.331,1,0.0417,0.559,0.0,0.0973,0.712,150.96,4,acoustic
8,8,0IktbUcnAGrvD03AWnz3Q8,Jason Mraz;Colbie Caillat,We Sing. We Dance. We Steal Things.,Lucky,74,189613,False,0.625,0.414,0,-8.7,1,0.0369,0.294,0.0,0.151,0.669,130.088,4,acoustic
9,9,7k9GuJYLp2AzqokyEdwEw2,Ross Copperman,Hunger,Hunger,56,205594,False,0.442,0.632,1,-6.77,1,0.0295,0.426,0.00419,0.0735,0.196,78.899,4,acoustic


In [4]:
# Check for null values and data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89740 entries, 0 to 89739
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        89740 non-null  int64  
 1   track_id          89740 non-null  object 
 2   artists           89740 non-null  object 
 3   album_name        89740 non-null  object 
 4   track_name        89740 non-null  object 
 5   popularity        89740 non-null  int64  
 6   duration_ms       89740 non-null  int64  
 7   explicit          89740 non-null  bool   
 8   danceability      89740 non-null  float64
 9   energy            89740 non-null  float64
 10  key               89740 non-null  int64  
 11  loudness          89740 non-null  float64
 12  mode              89740 non-null  int64  
 13  speechiness       89740 non-null  float64
 14  acousticness      89740 non-null  float64
 15  instrumentalness  89740 non-null  float64
 16  liveness          89740 non-null  float6

In [5]:
# View the metadata for each column
df.describe()

Unnamed: 0.1,Unnamed: 0,popularity,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
count,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0
mean,53479.005739,33.198808,229144.4,0.562166,0.634458,5.28353,-8.498994,0.636973,0.087442,0.328285,0.173415,0.216971,0.469474,122.058134,3.897426
std,33410.141924,20.58064,112945.8,0.176692,0.256606,3.559912,5.221518,0.480875,0.113278,0.338321,0.323849,0.194885,0.262864,30.117651,0.453437
min,0.0,0.0,8586.0,0.0,0.0,0.0,-49.531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,23766.75,19.0,173040.0,0.45,0.457,2.0,-10.32225,0.0,0.036,0.0171,0.0,0.0982,0.249,99.26275,4.0
50%,50680.5,33.0,213295.5,0.576,0.676,5.0,-7.185,1.0,0.0489,0.188,5.8e-05,0.132,0.457,122.013,4.0
75%,80618.5,49.0,264293.0,0.692,0.853,8.0,-5.108,1.0,0.0859,0.625,0.097625,0.279,0.682,140.077,4.0
max,113999.0,100.0,5237295.0,0.985,1.0,11.0,4.532,1.0,0.965,0.996,1.0,1.0,0.995,243.372,5.0


In [6]:
null_rows = df.loc[df['album_name'].isnull()]
print(null_rows)

Empty DataFrame
Columns: [Unnamed: 0, track_id, artists, album_name, track_name, popularity, duration_ms, explicit, danceability, energy, key, loudness, mode, speechiness, acousticness, instrumentalness, liveness, valence, tempo, time_signature, track_genre]
Index: []


In [7]:
df = df.dropna(subset=['album_name'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89740 entries, 0 to 89739
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        89740 non-null  int64  
 1   track_id          89740 non-null  object 
 2   artists           89740 non-null  object 
 3   album_name        89740 non-null  object 
 4   track_name        89740 non-null  object 
 5   popularity        89740 non-null  int64  
 6   duration_ms       89740 non-null  int64  
 7   explicit          89740 non-null  bool   
 8   danceability      89740 non-null  float64
 9   energy            89740 non-null  float64
 10  key               89740 non-null  int64  
 11  loudness          89740 non-null  float64
 12  mode              89740 non-null  int64  
 13  speechiness       89740 non-null  float64
 14  acousticness      89740 non-null  float64
 15  instrumentalness  89740 non-null  float64
 16  liveness          89740 non-null  float6

In [8]:
df.track_genre.nunique()

113

In [9]:
genre_df = df.groupby('track_genre')['track_id'].nunique().reset_index(name='song_count')
print(genre_df)

     track_genre  song_count
0       acoustic        1000
1       afrobeat         999
2       alt-rock         999
3    alternative         407
4        ambient         999
..           ...         ...
108       techno         416
109       trance         708
110     trip-hop         904
111      turkish         870
112  world-music         923

[113 rows x 2 columns]


In [10]:
genre_df = genre_df.sort_values(by='song_count', ascending=True)
print(genre_df)

   track_genre  song_count
89   reggaeton          74
56       indie         134
53       house         210
85        punk         226
71       metal         232
..         ...         ...
4      ambient         999
2     alt-rock         999
1     afrobeat         999
12    cantopop         999
0     acoustic        1000

[113 rows x 2 columns]


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89740 entries, 0 to 89739
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        89740 non-null  int64  
 1   track_id          89740 non-null  object 
 2   artists           89740 non-null  object 
 3   album_name        89740 non-null  object 
 4   track_name        89740 non-null  object 
 5   popularity        89740 non-null  int64  
 6   duration_ms       89740 non-null  int64  
 7   explicit          89740 non-null  bool   
 8   danceability      89740 non-null  float64
 9   energy            89740 non-null  float64
 10  key               89740 non-null  int64  
 11  loudness          89740 non-null  float64
 12  mode              89740 non-null  int64  
 13  speechiness       89740 non-null  float64
 14  acousticness      89740 non-null  float64
 15  instrumentalness  89740 non-null  float64
 16  liveness          89740 non-null  float6

In [12]:
numeric_columns = ['popularity', 'duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']

numeric_df = df.loc[:, numeric_columns]
numeric_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89740 entries, 0 to 89739
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   popularity        89740 non-null  int64  
 1   duration_ms       89740 non-null  int64  
 2   explicit          89740 non-null  bool   
 3   danceability      89740 non-null  float64
 4   energy            89740 non-null  float64
 5   key               89740 non-null  int64  
 6   loudness          89740 non-null  float64
 7   mode              89740 non-null  int64  
 8   speechiness       89740 non-null  float64
 9   acousticness      89740 non-null  float64
 10  instrumentalness  89740 non-null  float64
 11  liveness          89740 non-null  float64
 12  valence           89740 non-null  float64
 13  tempo             89740 non-null  float64
 14  time_signature    89740 non-null  int64  
dtypes: bool(1), float64(9), int64(5)
memory usage: 9.7 MB


In [13]:
data_types_counts = df.dtypes.value_counts()
print(data_types_counts)

float64    9
int64      6
object     5
bool       1
Name: count, dtype: int64


In [15]:
df['explicit'] = df['explicit'].astype(float)

In [16]:
# Assuming 'df' is your DataFrame
int_columns = numeric_df.select_dtypes(include=['int64']).columns
numeric_df[int_columns] = numeric_df[int_columns].astype('float64')

# Verify the data types after conversion
print(numeric_df.dtypes)

popularity          float64
duration_ms         float64
explicit               bool
danceability        float64
energy              float64
key                 float64
loudness            float64
mode                float64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
time_signature      float64
dtype: object


In [17]:
# change bool column to float
numeric_df.loc[:, 'explicit'] = df.loc[:, 'explicit'].astype(int)
numeric_df.head()

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,73.0,230666.0,0,0.676,0.461,1.0,-6.746,0.0,0.143,0.0322,1e-06,0.358,0.715,87.917,4.0
1,55.0,149610.0,0,0.42,0.166,1.0,-17.235,1.0,0.0763,0.924,6e-06,0.101,0.267,77.489,4.0
2,57.0,210826.0,0,0.438,0.359,0.0,-9.734,1.0,0.0557,0.21,0.0,0.117,0.12,76.332,4.0
3,71.0,201933.0,0,0.266,0.0596,0.0,-18.515,1.0,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3.0
4,82.0,198853.0,0,0.618,0.443,2.0,-9.681,1.0,0.0526,0.469,0.0,0.0829,0.167,119.949,4.0


In [18]:
numeric_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89740 entries, 0 to 89739
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   popularity        89740 non-null  float64
 1   duration_ms       89740 non-null  float64
 2   explicit          89740 non-null  int32  
 3   danceability      89740 non-null  float64
 4   energy            89740 non-null  float64
 5   key               89740 non-null  float64
 6   loudness          89740 non-null  float64
 7   mode              89740 non-null  float64
 8   speechiness       89740 non-null  float64
 9   acousticness      89740 non-null  float64
 10  instrumentalness  89740 non-null  float64
 11  liveness          89740 non-null  float64
 12  valence           89740 non-null  float64
 13  tempo             89740 non-null  float64
 14  time_signature    89740 non-null  float64
dtypes: float64(14), int32(1)
memory usage: 9.9 MB


In [20]:
# Assuming 'df' is your DataFrame
int_columns = numeric_df.select_dtypes(include=['int32']).columns
numeric_df[int_columns] = numeric_df[int_columns].astype('float64')

# Verify the data types after conversion
print(numeric_df.dtypes)

popularity          float64
duration_ms         float64
explicit            float64
danceability        float64
energy              float64
key                 float64
loudness            float64
mode                float64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
time_signature      float64
dtype: object


In [21]:
# initialize
scaler = StandardScaler()

# fit
scaler.fit(numeric_df)

# predict/transform
scaled_data = scaler.transform(numeric_df)
df_scaled = pd.DataFrame(scaled_data, columns=numeric_columns, index = df.track_id)

df_scaled.head()

Unnamed: 0_level_0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
5SuOikwiRyPMVoIQDJUgSV,1.933925,0.013472,-0.306447,0.644253,-0.675975,-1.203275,0.335727,-1.324621,0.490458,-0.875166,-0.535482,0.723656,0.934047,-1.133599,0.226216
4qPNDBW1i3p13qLCt0Ki3A,1.059312,-0.704186,-0.306447,-0.804604,-1.825602,-1.203275,-1.673087,0.754933,-0.098364,1.76081,-0.535468,-0.595078,-0.770269,-1.479843,0.226216
1iJBSr7s7jYXzM8EGcbK5b,1.156491,-0.162188,-0.306447,-0.702731,-1.073473,-1.484183,-0.236524,0.754933,-0.280219,-0.349626,-0.535485,-0.512978,-1.329497,-1.518259,0.226216
6lfxq3CG4xtTiEg7opyCyx,1.836746,-0.240925,-0.306447,-1.676182,-2.240247,-1.484183,-1.918228,0.754933,-0.45148,1.70465,-0.535266,-0.436009,-1.241999,1.981635,-1.979174
5vjLSffimiIP26QG5WcN2K,2.371232,-0.268195,-0.306447,0.315996,-0.746122,-0.922368,-0.226373,0.754933,-0.307585,0.415925,-0.535485,-0.687954,-1.150696,-0.07003,0.226216


In [22]:
string_columns = ['track_id', 'track_name', 'album_name', 'artists' , 'track_genre']
string_df = df.loc[:, string_columns]
string_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89740 entries, 0 to 89739
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   track_id     89740 non-null  object
 1   track_name   89740 non-null  object
 2   album_name   89740 non-null  object
 3   artists      89740 non-null  object
 4   track_genre  89740 non-null  object
dtypes: object(5)
memory usage: 3.4+ MB


In [24]:
numerical_cols = df.select_dtypes(include=np.number).columns
data_df = pd.DataFrame(scaler.fit_transform(df[numerical_cols]), columns=numerical_cols, index=df['track_id'])

data_df.head()

Unnamed: 0_level_0,Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
5SuOikwiRyPMVoIQDJUgSV,-1.600691,1.933925,0.013472,-0.306447,0.644253,-0.675975,-1.203275,0.335727,-1.324621,0.490458,-0.875166,-0.535482,0.723656,0.934047,-1.133599,0.226216
4qPNDBW1i3p13qLCt0Ki3A,-1.600661,1.059312,-0.704186,-0.306447,-0.804604,-1.825602,-1.203275,-1.673087,0.754933,-0.098364,1.76081,-0.535468,-0.595078,-0.770269,-1.479843,0.226216
1iJBSr7s7jYXzM8EGcbK5b,-1.600631,1.156491,-0.162188,-0.306447,-0.702731,-1.073473,-1.484183,-0.236524,0.754933,-0.280219,-0.349626,-0.535485,-0.512978,-1.329497,-1.518259,0.226216
6lfxq3CG4xtTiEg7opyCyx,-1.600601,1.836746,-0.240925,-0.306447,-1.676182,-2.240247,-1.484183,-1.918228,0.754933,-0.45148,1.70465,-0.535266,-0.436009,-1.241999,1.981635,-1.979174
5vjLSffimiIP26QG5WcN2K,-1.600571,2.371232,-0.268195,-0.306447,0.315996,-0.746122,-0.922368,-0.226373,0.754933,-0.307585,0.415925,-0.535485,-0.687954,-1.150696,-0.07003,0.226216


In [25]:
# Check to see if song is in dataset
song= df[(df['track_name'] == 'See You Again')]
song

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
716,716,3XfMyT4Xf5LegDhvbrFEjp,Boyce Avenue;Bea Miller,"Cover Sessions, Vol. 4",See You Again,50,239802,0.0,0.57,0.413,9,-7.034,1,0.0345,0.535,0.0,0.076,0.278,152.032,4,acoustic
13486,14329,7qdNE9DyIKk3invtcxdGG8,One Voice Children's Choir,Memories,See You Again,43,209881,0.0,0.64,0.406,10,-9.337,1,0.0476,0.787,0.0,0.0725,0.247,80.029,4,children
19568,20696,62lmjlPu5Vwd3h18FMSz1G,Wiz Khalifa;Charlie Puth,Give You Love - Cozy Hits,See You Again,2,229525,0.0,0.69,0.48,10,-7.503,1,0.0816,0.369,0.0,0.0649,0.286,80.025,4,dance
19569,20697,0FtOxBrDP67usYNbqOuy7T,Wiz Khalifa;Charlie Puth,On Chill - Rap & RnB,See You Again,0,229525,0.0,0.69,0.48,10,-7.503,1,0.0816,0.369,0.0,0.0649,0.286,80.025,4,dance
27502,29795,4pXG8Q82L8WvypAm5Wo86y,Seven Lions;Jason Ross;Fiora,See You Again EP,See You Again,46,263200,0.0,0.248,0.588,8,-6.292,1,0.0459,0.0415,3.3e-05,0.123,0.0382,150.419,4,dubstep


In [26]:
df_model = data_df.reset_index()
df_model.head()

Unnamed: 0.1,track_id,Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,5SuOikwiRyPMVoIQDJUgSV,-1.600691,1.933925,0.013472,-0.306447,0.644253,-0.675975,-1.203275,0.335727,-1.324621,0.490458,-0.875166,-0.535482,0.723656,0.934047,-1.133599,0.226216
1,4qPNDBW1i3p13qLCt0Ki3A,-1.600661,1.059312,-0.704186,-0.306447,-0.804604,-1.825602,-1.203275,-1.673087,0.754933,-0.098364,1.76081,-0.535468,-0.595078,-0.770269,-1.479843,0.226216
2,1iJBSr7s7jYXzM8EGcbK5b,-1.600631,1.156491,-0.162188,-0.306447,-0.702731,-1.073473,-1.484183,-0.236524,0.754933,-0.280219,-0.349626,-0.535485,-0.512978,-1.329497,-1.518259,0.226216
3,6lfxq3CG4xtTiEg7opyCyx,-1.600601,1.836746,-0.240925,-0.306447,-1.676182,-2.240247,-1.484183,-1.918228,0.754933,-0.45148,1.70465,-0.535266,-0.436009,-1.241999,1.981635,-1.979174
4,5vjLSffimiIP26QG5WcN2K,-1.600571,2.371232,-0.268195,-0.306447,0.315996,-0.746122,-0.922368,-0.226373,0.754933,-0.307585,0.415925,-0.535485,-0.687954,-1.150696,-0.07003,0.226216


In [27]:
song_df = df[['track_id', 'track_name', 'artists', 'album_name']]
df_model = pd.merge(df_model, song_df, on='track_id', how='left')
df_model.head()

Unnamed: 0.1,track_id,Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_name,artists,album_name
0,5SuOikwiRyPMVoIQDJUgSV,-1.600691,1.933925,0.013472,-0.306447,0.644253,-0.675975,-1.203275,0.335727,-1.324621,0.490458,-0.875166,-0.535482,0.723656,0.934047,-1.133599,0.226216,Comedy,Gen Hoshino,Comedy
1,4qPNDBW1i3p13qLCt0Ki3A,-1.600661,1.059312,-0.704186,-0.306447,-0.804604,-1.825602,-1.203275,-1.673087,0.754933,-0.098364,1.76081,-0.535468,-0.595078,-0.770269,-1.479843,0.226216,Ghost - Acoustic,Ben Woodward,Ghost (Acoustic)
2,1iJBSr7s7jYXzM8EGcbK5b,-1.600631,1.156491,-0.162188,-0.306447,-0.702731,-1.073473,-1.484183,-0.236524,0.754933,-0.280219,-0.349626,-0.535485,-0.512978,-1.329497,-1.518259,0.226216,To Begin Again,Ingrid Michaelson;ZAYN,To Begin Again
3,6lfxq3CG4xtTiEg7opyCyx,-1.600601,1.836746,-0.240925,-0.306447,-1.676182,-2.240247,-1.484183,-1.918228,0.754933,-0.45148,1.70465,-0.535266,-0.436009,-1.241999,1.981635,-1.979174,Can't Help Falling In Love,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...
4,5vjLSffimiIP26QG5WcN2K,-1.600571,2.371232,-0.268195,-0.306447,0.315996,-0.746122,-0.922368,-0.226373,0.754933,-0.307585,0.415925,-0.535485,-0.687954,-1.150696,-0.07003,0.226216,Hold On,Chord Overstreet,Hold On


In [28]:
# define the number of nearest neighbors to consider
k = 6

# define a function to recommend songs based on a given song name
def recommend_song_artist(track_name, artist, df_model, numeric_columns):
    
    train = df_model.loc[df_model.artists == artist]
    
    k = min(len(train), 6)
    
    # initialize the model with the number of neighbors
    model = NearestNeighbors(n_neighbors=k, metric='cosine')

    # fit the model to the data
    model.fit(train[numeric_columns])
    
    # get the track_id of the given track name
    track_id = df_model[df_model['track_name'] == track_name]['track_id'].iloc[0]
    
    # get the index of the tracks in the model dataframe
    idx = df_model[df_model['track_id'] == track_id].index[0]
    
    # get the features of the tracks
    track_features = df_model.loc[idx, numeric_columns].values.reshape(1, -1)
    
    # find the k nearest neighbors
    distances, indices = model.kneighbors(track_features)
    
    # get the track names of the nearest neighbors
    tracks = train.iloc[indices[0]]
    tracks["distance"] = distances[0]
    
    return tracks

In [29]:
recommend_song_artist('The Hills', 'The Weeknd', df_model, numeric_columns)

Unnamed: 0.1,track_id,Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_name,artists,album_name,distance
67642,4g7118rCobKcU4XQXSFlBS,0.825951,-1.467349,0.116301,3.263203,0.016037,-0.325241,-1.484183,0.285741,-1.324621,-0.165456,-0.720578,-0.535485,-0.40009,-1.310476,-0.301424,0.226216,The Hills,The Weeknd,Drippy Drippy,3.330669e-16
67846,7fBv7CLKzipRk6EC6TWHOB,0.837205,2.662769,0.116062,3.263203,0.129229,-0.27458,-1.484183,0.275016,-1.324621,-0.317296,-0.772009,-0.535485,-0.420615,-1.264825,-0.30066,0.226216,The Hills,The Weeknd,Beauty Behind The Madness,0.3796517
67657,0MnmtOOke4uQVBMAjIgwVZ,0.826999,-1.613118,1.175806,-0.306447,0.502763,-0.262888,-0.079646,0.489897,-1.324621,-0.492089,-0.694271,-0.504884,-0.477059,-1.26102,-0.435134,0.226216,After Hours,The Weeknd,Halloween Party 2022,0.5255697
67697,5QO79kh1waicV47BqGRL3g,0.829723,2.711359,-0.11969,3.263203,0.666891,0.746445,-1.484183,0.576846,0.754933,-0.499151,-0.90768,-0.535446,1.67294,0.663943,-0.13305,0.226216,Save Your Tears,The Weeknd,After Hours,0.6802423
67658,52KDAbgFGCXZQVlOXy2XIQ,0.827029,-1.370169,-0.132404,-0.306447,0.378251,0.500932,-1.484183,0.808197,-1.324621,-0.372029,-0.249129,-0.535485,0.487618,1.291649,-0.964062,0.226216,Out of Time,The Weeknd,LUGNA HITS,0.7203179
67660,2eyXrWRxc7g5yx0D4J3WTS,0.827148,-1.32158,-0.132404,-0.306447,0.378251,0.500932,-1.484183,0.808197,-1.324621,-0.372029,-0.249129,-0.535485,0.487618,1.291649,-0.964062,0.226216,Out of Time,The Weeknd,HÃÂST,0.7235432
