In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances, cosine_distances, cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [32]:
pd.options.display.max_rows = 200

In [33]:
df = pd.read_csv("df_cleaned.csv")

In [34]:
df.drop(columns ='Unnamed: 0', inplace=True)

In [35]:
df.shape

(144166, 20)

In [36]:
# Makes it into an actual list instead of a string list
df['genres'] = df['genres'].apply(lambda x: x[1:-1].split(', '))

for i in df.index:
# filters out empty values in each list
    df['genres'].loc[i] = list(filter(None, df['genres'][i]))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [37]:
df

Unnamed: 0,acousticness,artists,danceability,energy,explicit,instrumentalness,key,liveness,loudness,mode,song_name,song_popularity,speechiness,tempo,valence,year,duration_min,genres,artist_popularity,song_decade
0,0.9910,Mamie Smith,0.598,0.2240,0,0.000522,5,0.3790,-12.628,0,Keep A Song In Your Soul,12,0.0936,149.976,0.634,1920,2.81,"[""'harlem renaissance'"", ""'traditional blues'""]",3,1920s
1,0.9930,Mamie Smith,0.647,0.1860,0,0.000018,0,0.5190,-12.098,1,Golfing Papa,4,0.1740,97.600,0.689,1920,2.73,"[""'harlem renaissance'"", ""'traditional blues'""]",3,1920s
2,0.9920,Mamie Smith,0.782,0.0573,0,0.000002,5,0.1760,-12.453,1,Don't You Advertise Your Man,5,0.0592,85.652,0.487,1920,3.25,"[""'harlem renaissance'"", ""'traditional blues'""]",3,1920s
3,0.9950,Mamie Smith,0.482,0.2290,0,0.000061,7,0.5490,-12.619,1,Kansas City Man Blues - 78rpm Version,4,0.0812,77.232,0.461,1920,3.30,"[""'harlem renaissance'"", ""'traditional blues'""]",3,1920s
4,0.9920,Mamie Smith,0.574,0.1380,0,0.000492,3,0.2330,-14.171,1,Miss Jenny's Ball (aka There'll Be No Freebies...,1,0.1090,158.422,0.764,1920,3.16,"[""'harlem renaissance'"", ""'traditional blues'""]",3,1920s
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144161,0.0292,Lil Xxel,0.748,0.4570,0,0.000294,6,0.4440,-7.560,1,LMK,71,0.0465,123.035,0.464,2020,3.14,"[""'pop r&b'""]",71,2020s
144162,0.2820,SPiCYSOL,0.608,0.3680,0,0.000002,5,0.0908,-8.539,0,10years vintage - LiVE from 2020.3.5 @EBISU LI...,12,0.0278,125.941,0.212,2020,5.22,"[""'city pop'"", ""'japanese r&b'""]",12,2020s
144163,0.8250,Lincoln,0.695,0.2320,0,0.000000,1,0.5900,-11.010,1,Saint Bernard,68,0.0972,100.488,0.451,2020,1.72,"[""'indie pop'"", ""'modern indie pop'""]",68,2020s
144164,0.0205,RAYE,0.749,0.8200,0,0.000685,7,0.0775,-6.114,1,Regardless,76,0.0478,120.066,0.534,2020,3.29,"[""'dance pop'"", ""'pop'"", ""'pop dance'"", ""'post...",80,2020s


In [38]:
df.describe()

Unnamed: 0,acousticness,danceability,energy,explicit,instrumentalness,key,liveness,loudness,mode,song_popularity,speechiness,tempo,valence,year,duration_min,artist_popularity
count,144166.0,144166.0,144166.0,144166.0,144166.0,144166.0,144166.0,144166.0,144166.0,144166.0,144166.0,144166.0,144166.0,144166.0,144166.0,144166.0
mean,0.493255,0.528681,0.492909,0.05837,0.183556,5.193825,0.209495,-11.50279,0.710181,29.692147,0.07812,117.436688,0.525736,1978.053071,3.919463,29.232628
std,0.377425,0.176051,0.271451,0.234443,0.32434,3.501892,0.182336,5.580913,0.45368,21.107551,0.109901,30.258919,0.26766,24.552015,2.150613,18.386307
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,1920.0,0.08,0.0
25%,0.0874,0.407,0.264,0.0,1e-06,2.0,0.0979,-14.505,0.0,11.0,0.0345,94.3425,0.306,1959.0,2.83,14.0
50%,0.504,0.538,0.482,0.0,0.000457,5.0,0.135,-10.654,1.0,30.0,0.0438,115.793,0.536,1978.0,3.5,31.0
75%,0.884,0.657,0.719,0.0,0.178,8.0,0.266,-7.362,1.0,45.0,0.0683,135.622,0.751,1997.0,4.48,42.0
max,0.996,0.988,1.0,1.0,1.0,11.0,1.0,3.744,1.0,100.0,0.97,243.507,1.0,2021.0,59.63,91.0


### Scaling Song Measures

In [39]:
mm_scaler = MinMaxScaler()

In [40]:
song_data= df[['acousticness', 'danceability', 'energy', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'valence']]
song_data

Unnamed: 0,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
0,0.9910,0.598,0.2240,0.000522,5,0.3790,-12.628,0,0.0936,149.976,0.634
1,0.9930,0.647,0.1860,0.000018,0,0.5190,-12.098,1,0.1740,97.600,0.689
2,0.9920,0.782,0.0573,0.000002,5,0.1760,-12.453,1,0.0592,85.652,0.487
3,0.9950,0.482,0.2290,0.000061,7,0.5490,-12.619,1,0.0812,77.232,0.461
4,0.9920,0.574,0.1380,0.000492,3,0.2330,-14.171,1,0.1090,158.422,0.764
...,...,...,...,...,...,...,...,...,...,...,...
144161,0.0292,0.748,0.4570,0.000294,6,0.4440,-7.560,1,0.0465,123.035,0.464
144162,0.2820,0.608,0.3680,0.000002,5,0.0908,-8.539,0,0.0278,125.941,0.212
144163,0.8250,0.695,0.2320,0.000000,1,0.5900,-11.010,1,0.0972,100.488,0.451
144164,0.0205,0.749,0.8200,0.000685,7,0.0775,-6.114,1,0.0478,120.066,0.534


In [41]:
song_features_sc = pd.DataFrame()
for col in song_data.columns:
    mm_scaler.fit(song_data[[col]])
    song_features_sc[col] = mm_scaler.transform(song_data[col].values.reshape(-1,1)).ravel() 
song_features_sc

Unnamed: 0,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
0,0.994980,0.605263,0.2240,0.000522,0.454545,0.3790,0.743160,0.0,0.096495,0.615900,0.634
1,0.996988,0.654858,0.1860,0.000018,0.000000,0.5190,0.751475,1.0,0.179381,0.400810,0.689
2,0.995984,0.791498,0.0573,0.000002,0.454545,0.1760,0.745905,1.0,0.061031,0.351743,0.487
3,0.998996,0.487854,0.2290,0.000061,0.636364,0.5490,0.743301,1.0,0.083711,0.317165,0.461
4,0.995984,0.580972,0.1380,0.000492,0.272727,0.2330,0.718954,1.0,0.112371,0.650585,0.764
...,...,...,...,...,...,...,...,...,...,...,...
144161,0.029317,0.757085,0.4570,0.000294,0.545455,0.4440,0.822666,1.0,0.047938,0.505263,0.464
144162,0.283133,0.615385,0.3680,0.000002,0.454545,0.0908,0.807307,0.0,0.028660,0.517197,0.212
144163,0.828313,0.703441,0.2320,0.000000,0.090909,0.5900,0.768543,1.0,0.100206,0.412670,0.451
144164,0.020582,0.758097,0.8200,0.000685,0.636364,0.0775,0.845350,1.0,0.049278,0.493070,0.534


In [42]:
data_merge = df.drop(['acousticness', 'danceability',
       'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'valence'], axis = 1)

final_df = data_merge.join(song_features_sc)
final_df

Unnamed: 0,artists,explicit,song_name,song_popularity,year,duration_min,genres,artist_popularity,song_decade,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
0,Mamie Smith,0,Keep A Song In Your Soul,12,1920,2.81,"[""'harlem renaissance'"", ""'traditional blues'""]",3,1920s,0.994980,0.605263,0.2240,0.000522,0.454545,0.3790,0.743160,0.0,0.096495,0.615900,0.634
1,Mamie Smith,0,Golfing Papa,4,1920,2.73,"[""'harlem renaissance'"", ""'traditional blues'""]",3,1920s,0.996988,0.654858,0.1860,0.000018,0.000000,0.5190,0.751475,1.0,0.179381,0.400810,0.689
2,Mamie Smith,0,Don't You Advertise Your Man,5,1920,3.25,"[""'harlem renaissance'"", ""'traditional blues'""]",3,1920s,0.995984,0.791498,0.0573,0.000002,0.454545,0.1760,0.745905,1.0,0.061031,0.351743,0.487
3,Mamie Smith,0,Kansas City Man Blues - 78rpm Version,4,1920,3.30,"[""'harlem renaissance'"", ""'traditional blues'""]",3,1920s,0.998996,0.487854,0.2290,0.000061,0.636364,0.5490,0.743301,1.0,0.083711,0.317165,0.461
4,Mamie Smith,0,Miss Jenny's Ball (aka There'll Be No Freebies...,1,1920,3.16,"[""'harlem renaissance'"", ""'traditional blues'""]",3,1920s,0.995984,0.580972,0.1380,0.000492,0.272727,0.2330,0.718954,1.0,0.112371,0.650585,0.764
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144161,Lil Xxel,0,LMK,71,2020,3.14,"[""'pop r&b'""]",71,2020s,0.029317,0.757085,0.4570,0.000294,0.545455,0.4440,0.822666,1.0,0.047938,0.505263,0.464
144162,SPiCYSOL,0,10years vintage - LiVE from 2020.3.5 @EBISU LI...,12,2020,5.22,"[""'city pop'"", ""'japanese r&b'""]",12,2020s,0.283133,0.615385,0.3680,0.000002,0.454545,0.0908,0.807307,0.0,0.028660,0.517197,0.212
144163,Lincoln,0,Saint Bernard,68,2020,1.72,"[""'indie pop'"", ""'modern indie pop'""]",68,2020s,0.828313,0.703441,0.2320,0.000000,0.090909,0.5900,0.768543,1.0,0.100206,0.412670,0.451
144164,RAYE,0,Regardless,76,2020,3.29,"[""'dance pop'"", ""'pop'"", ""'pop dance'"", ""'post...",80,2020s,0.020582,0.758097,0.8200,0.000685,0.636364,0.0775,0.845350,1.0,0.049278,0.493070,0.534


### Song Recommender Based On Song Measures

In [43]:
def song_recommender(data, song, artist, genre_parameter):
    song_and_artist_data = data[(data['artists'] == artist) & (data['song_name'] == song)].sort_values('year')[0:1]
    
    similarity_data = data.copy()
    
    data_values = similarity_data[['acousticness', 'danceability',
       'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'valence']]
    
    similarity_data['similarity_with_song'] =cosine_similarity(data_values, data_values.to_numpy()[song_and_artist_data.index[0],None]).squeeze()
    
    artist_genres = set(*song_and_artist_data['genres'])

    similarity_data['genres'] = similarity_data['genres'].apply(lambda genres: list(set(genres).intersection(artist_genres)))
    
    similarity_lengths = similarity_data['genres'].str.len()
    similarity_data = similarity_data.reindex(similarity_lengths[similarity_lengths >= genre_parameter].sort_values(ascending=False).index)
    
    similarity_data = similarity_data[similarity_data['song_decade'] == song_and_artist_data['song_decade'].values[0]]
 
    similarity_data.rename(columns={'song_name': f'Similar Song to {song}'}, inplace=True)
    
    similarity_data = similarity_data.sort_values(by= 'similarity_with_song', ascending = False)
    
    similarity_data = similarity_data[['artists', f'Similar Song to {song}',
       'song_popularity', 'year', 'genres', 'artist_popularity', 'song_decade', 'similarity_with_song',
       'acousticness', 'danceability', 'energy', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'valence']]
    
    return similarity_data.head(10)

In [53]:
song_recommender(final_df, 'Over', 'Drake', 1)

Unnamed: 0,artists,Similar Song to Over,song_popularity,year,genres,artist_popularity,song_decade,similarity_with_song,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
102936,Drake,Over,64,2010,"[""'hip hop'"", ""'rap'"", ""'canadian pop'"", ""'tor...",63,2010s,1.0,0.010743,0.354251,0.845,0.0,0.636364,0.123,0.853194,1.0,0.206186,0.4092,0.45
103025,Drake,Over,59,2010,"[""'hip hop'"", ""'rap'"", ""'canadian pop'"", ""'tor...",63,2010s,0.998891,0.010944,0.328947,0.848,0.0,0.636364,0.124,0.853241,1.0,0.287629,0.411048,0.433
88072,Dr. Dre,Kush - Main,64,2010,"[""'hip hop'"", ""'rap'""]",52,2010s,0.996403,0.000624,0.449393,0.866,4.7e-05,0.636364,0.105,0.843734,1.0,0.114433,0.390001,0.375
105671,Big Sean,My Last,57,2011,"[""'hip hop'"", ""'rap'"", ""'pop rap'""]",53,2010s,0.993359,0.098394,0.3917,0.773,0.0,0.727273,0.209,0.85208,1.0,0.175258,0.322902,0.368
104499,Frank Ocean,Crack Rock,59,2012,"[""'hip hop'""]",63,2010s,0.99231,0.06988,0.434211,0.736,0.0,0.454545,0.0852,0.815496,1.0,0.228866,0.366569,0.379
104071,Jason Derulo,Ridin' Solo,65,2010,"[""'pop rap'""]",67,2010s,0.991225,0.128514,0.447368,0.83,0.0,0.818182,0.129,0.8782,1.0,0.150515,0.366881,0.578
104084,Jason Derulo,Ridin' Solo,56,2010,"[""'pop rap'""]",67,2010s,0.991225,0.128514,0.447368,0.83,0.0,0.818182,0.129,0.8782,1.0,0.150515,0.366881,0.578
100354,Gym Class Heroes,The Fighter (feat. Ryan Tedder),59,2011,"[""'pop rap'""]",62,2010s,0.990239,0.077811,0.59919,0.926,0.0,0.636364,0.187,0.886844,1.0,0.129897,0.406633,0.432
104215,DJ Khaled,Welcome To My Hood - Remix,52,2011,"[""'hip hop'"", ""'rap'"", ""'pop rap'""]",62,2010s,0.989142,0.022189,0.592105,0.815,0.0,0.636364,0.14,0.875329,1.0,0.235052,0.575248,0.541
135839,Death Grips,Get Got,59,2012,"[""'hip hop'""]",56,2010s,0.98909,0.006305,0.576923,0.993,0.00796,0.636364,0.0876,0.882326,1.0,0.347423,0.357279,0.405


In [45]:
final_df.to_csv('final_df.csv')

In [54]:
final_df[['artists']]

Unnamed: 0,artists
0,Mamie Smith
1,Mamie Smith
2,Mamie Smith
3,Mamie Smith
4,Mamie Smith
...,...
144161,Lil Xxel
144162,SPiCYSOL
144163,Lincoln
144164,RAYE
