In [None]:
import os
import numpy as np
import pandas as pd
import io
import seaborn as sns
import plotly.express as px
from matplotlib import pyplot as plt

import warnings
warnings.filterwarnings("ignore")

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
!pip install spotipy



In [None]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
import difflib

In [None]:
from google.colab import files
uploaded = files.upload()

Saving data.csv to data.csv
Saving data_by_genres.csv to data_by_genres.csv
Saving data_by_year.csv to data_by_year.csv


In [None]:
data = pd.read_csv(io.BytesIO(uploaded['data.csv']))
genre_data = pd.read_csv(io.BytesIO(uploaded['data_by_genres.csv']))
year_data = pd.read_csv(io.BytesIO(uploaded['data_by_year.csv']))

In [None]:
data.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,1921,0.415,60.936
2,0.0394,1921,0.961,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913,3,0.101,-14.85,1,Gati Bali,5,1921,0.0339,110.339
3,0.165,1921,0.967,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,2.8e-05,5,0.381,-9.316,1,Danny Boy,3,1921,0.0354,100.109
4,0.253,1921,0.957,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,2e-06,3,0.229,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.038,101.665


In [None]:
genre_data.head()

Unnamed: 0,mode,genres,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
0,1,21st century classical,0.979333,0.162883,160297.7,0.071317,0.606834,0.3616,-31.514333,0.040567,75.3365,0.103783,27.833333,6
1,1,432hz,0.49478,0.299333,1048887.0,0.450678,0.477762,0.131,-16.854,0.076817,120.285667,0.22175,52.5,5
2,1,8-bit,0.762,0.712,115177.0,0.818,0.876,0.126,-9.18,0.047,133.444,0.975,48.0,7
3,1,[],0.651417,0.529093,232880.9,0.419146,0.205309,0.218696,-12.288965,0.107872,112.857352,0.513604,20.859882,7
4,1,a cappella,0.676557,0.538961,190628.5,0.316434,0.003003,0.172254,-12.479387,0.082851,112.110362,0.448249,45.820071,7


In [None]:
genre_data['genres'].nunique

<bound method IndexOpsMixin.nunique of 0       21st century classical
1                        432hz
2                        8-bit
3                           []
4                   a cappella
                 ...          
2968                      zolo
2969                   zouglou
2970                      zouk
2971              zurich indie
2972                    zydeco
Name: genres, Length: 2973, dtype: object>

In [None]:
year_data.head()

Unnamed: 0,mode,year,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
0,1,1921,0.886896,0.418597,260537.166667,0.231815,0.344878,0.20571,-17.048667,0.073662,101.531493,0.379327,0.653333,2
1,1,1922,0.938592,0.482042,165469.746479,0.237815,0.434195,0.24072,-19.275282,0.116655,100.884521,0.535549,0.140845,10
2,1,1923,0.957247,0.577341,177942.362162,0.262406,0.371733,0.227462,-14.129211,0.093949,114.01073,0.625492,5.389189,0
3,1,1924,0.9402,0.549894,191046.707627,0.344347,0.581701,0.235219,-14.231343,0.092089,120.689572,0.663725,0.661017,10
4,1,1925,0.962607,0.573863,184986.92446,0.278594,0.418297,0.237668,-14.146414,0.111918,115.521921,0.621929,2.604317,5


# Music Over Time

In [None]:
def get_decade(year):
    period_start = int(year/10) * 10
    decade = '{}s'.format(period_start)
    return decade

data['decade'] = data['year'].apply(get_decade)
px.histogram(data_frame = data['decade'], color = data['decade'])


In [None]:
sound_features = ['acousticness', 'danceability', 'energy', 'instrumentalness']
fig = px.line(year_data, x='year', y=sound_features)
fig.show()

In [None]:
fig = px.bar(year_data, x = 'year', y = 'popularity', color = 'popularity')
fig.show()

In [None]:
top_genre = genre_data.sort_values('popularity',ascending = False)
top_genre = top_genre[0:20]
top_genre

Unnamed: 0,mode,genres,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
237,1,basshall,0.213167,0.818,169799.166667,0.630167,2e-05,0.081067,-6.627833,0.134833,115.0925,0.588667,80.666667,2
2778,0,turkish edm,0.00829,0.698,186700.0,0.719,4.01e-06,0.326,-4.923,0.0455,120.062,0.364,80.0,0
2533,1,south african house,0.043833,0.847,311854.333333,0.562333,0.1303392,0.075133,-7.719,0.050733,123.676333,0.834333,80.0,1
2755,1,trap venezolano,0.0446,0.877,231848.0,0.777,3.46e-05,0.0863,-4.246,0.117,102.02,0.706,80.0,1
46,0,alberta hip hop,0.33,0.885,144000.0,0.685,0.0,0.148,-6.429,0.0627,99.954,0.937,78.5,11
536,0,chinese electropop,0.00257,0.66,217088.0,0.787,0.0,0.323,-4.592,0.032,142.018,0.199,78.5,1
37,0,afroswing,0.31845,0.71175,179995.375,0.580187,0.000257895,0.18995,-7.016687,0.195563,83.250125,0.676625,77.3125,11
31,0,afro soul,0.00627,0.766,202627.0,0.873,0.0,0.0764,-6.381,0.143,126.988,0.743,77.0,11
2383,1,russian dance,0.00561,0.653,198095.0,0.945,0.915,0.439,-2.634,0.096,126.093,0.326,77.0,5
1239,1,guaracha,0.00903,0.745,189818.0,0.972,0.465,0.297,-3.506,0.0774,128.031,0.556,77.0,7


In [None]:
fig = px.bar(top_genre, x = 'genres', y = 'popularity', color = 'popularity')
fig.show()

In [None]:
top_songs = data.sort_values('popularity',ascending = False)
top_songs = top_songs[0:20]
top_songs

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,decade
19611,0.145,2020,0.401,"['Bad Bunny', 'Jhay Cortez']",0.731,205090,0.573,1,47EiUVwUp4C9fGccaPuUCS,5.2e-05,4,0.113,-10.059,0,Dakiti,100,2020-10-30,0.0544,109.928,2020s
19606,0.756,2020,0.221,"['24kGoldn', 'iann dior']",0.7,140526,0.722,1,3tjFYV6RSFtuktYl3ZtYcq,0.0,7,0.272,-3.558,0,Mood (feat. iann dior),99,2020-07-24,0.0369,90.989,2020s
19618,0.737,2020,0.0112,['BTS'],0.746,199054,0.765,0,0t1kP63rueHleOhQkYSXFY,0.0,6,0.0936,-4.41,0,Dynamite,97,2020-08-28,0.0993,114.044,2020s
19608,0.357,2020,0.0194,"['Cardi B', 'Megan Thee Stallion']",0.935,187541,0.454,1,4Oun2ylbjFKMPTiaSbbCih,0.0,1,0.0824,-7.509,1,WAP (feat. Megan Thee Stallion),96,2020-08-07,0.375,133.073,2020s
19610,0.682,2020,0.468,['Ariana Grande'],0.737,172325,0.802,1,35mvY5S1H3J2QZyna3TFe0,0.0,0,0.0931,-4.771,1,positions,96,2020-10-30,0.0878,144.015,2020s
19612,0.543,2020,0.65,['Pop Smoke'],0.709,160000,0.548,1,1tkg4EHVoqnhR6iFEXb60y,2e-06,10,0.133,-8.493,1,What You Know Bout Love,96,2020-07-03,0.353,83.995,2020s
19616,0.334,2020,0.00146,['The Weeknd'],0.514,200040,0.73,0,0VjIjW4GlUZAMYd2vXMi3b,9.5e-05,1,0.0897,-5.934,1,Blinding Lights,96,2020-03-20,0.0598,171.005,2020s
19607,0.347,2020,0.114,"['Pop Smoke', 'Lil Baby', 'DaBaby']",0.823,190476,0.586,1,0PvFJmanyNQMseIFrU708S,0.0,6,0.193,-6.606,0,For The Night (feat. Lil Baby & DaBaby),95,2020-07-03,0.2,125.971,2020s
19615,0.372,2020,0.196,"['Justin Bieber', 'Chance the Rapper']",0.673,212093,0.704,0,5u1n1kITHCxxp8twBcZxWy,0.0,6,0.0898,-8.056,1,Holy (feat. Chance The Rapper),95,2020-09-18,0.36,86.919,2020s
19620,0.0927,2020,0.864,"['Justin Bieber', 'benny blanco']",0.631,149297,0.239,1,4y4spB9m0Q6026KfkAvy9Q,0.0,11,0.116,-7.071,0,Lonely (with benny blanco),95,2020-10-16,0.0398,79.859,2020s


In [None]:
fig = px.bar(top_songs, x = 'name', y = 'popularity', color = 'popularity')
fig.show()

In [None]:
fig = px.bar(top_songs, x ='name', y = ['energy', 'danceability', 'acousticness'], barmode='group')
fig.show()

In [None]:
genre_data['genres'].nunique()

2973

In [None]:
data['id'].nunique()

170653

# Clustering Genres

In [None]:
genre_cluster_pipeline = Pipeline([('scaler', StandardScaler()), 
                             ('kmeans', KMeans(n_clusters = 10))])
X = genre_data.select_dtypes(np.number)
genre_cluster_pipeline.fit(X)
genre_data['cluster'] = genre_cluster_pipeline.predict(X)

In [None]:
genre_data

Unnamed: 0,mode,genres,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,cluster
0,1,21st century classical,0.979333,0.162883,1.602977e+05,0.071317,0.606834,0.361600,-31.514333,0.040567,75.336500,0.103783,27.833333,6,1
1,1,432hz,0.494780,0.299333,1.048887e+06,0.450678,0.477762,0.131000,-16.854000,0.076817,120.285667,0.221750,52.500000,5,9
2,1,8-bit,0.762000,0.712000,1.151770e+05,0.818000,0.876000,0.126000,-9.180000,0.047000,133.444000,0.975000,48.000000,7,6
3,1,[],0.651417,0.529093,2.328809e+05,0.419146,0.205309,0.218696,-12.288965,0.107872,112.857352,0.513604,20.859882,7,2
4,1,a cappella,0.676557,0.538961,1.906285e+05,0.316434,0.003003,0.172254,-12.479387,0.082851,112.110362,0.448249,45.820071,7,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2968,1,zolo,0.222625,0.547082,2.580991e+05,0.610240,0.143872,0.204206,-11.295878,0.061088,125.494919,0.596155,33.778943,9,6
2969,0,zouglou,0.161000,0.863000,2.063200e+05,0.909000,0.000000,0.108000,-5.985000,0.081300,119.038000,0.845000,58.000000,7,5
2970,1,zouk,0.263261,0.748889,3.060728e+05,0.622444,0.257227,0.089678,-10.289222,0.038778,101.965222,0.824111,46.666667,5,6
2971,0,zurich indie,0.993000,0.705667,1.984173e+05,0.172667,0.468633,0.179667,-11.453333,0.348667,91.278000,0.739000,0.000000,7,2


In [None]:
genre_data['cluster'].sum

<bound method NDFrame._add_numeric_operations.<locals>.sum of 0       1
1       9
2       6
3       2
4       2
       ..
2968    6
2969    5
2970    6
2971    2
2972    6
Name: cluster, Length: 2973, dtype: int32>

# Clustering Songs

In [None]:
song_cluster_pipeline = Pipeline([('scaler', StandardScaler()), 
                                  ('kmeans', KMeans(n_clusters=20, 
                                   verbose=False))
                                 ], verbose=False)

X = data.select_dtypes(np.number)
number_cols = list(X.columns)
song_cluster_pipeline.fit(X)
data['cluster_label'] = song_cluster_pipeline.predict(X)

In [None]:
data

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,cluster_label
0,0.0594,1921,0.98200,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878000,10,0.6650,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954,10
1,0.9630,1921,0.73200,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.000000,7,0.1600,-12.441,1,Clancy Lowered the Boom,5,1921,0.4150,60.936,12
2,0.0394,1921,0.96100,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913000,3,0.1010,-14.850,1,Gati Bali,5,1921,0.0339,110.339,7
3,0.1650,1921,0.96700,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,0.000028,5,0.3810,-9.316,1,Danny Boy,3,1921,0.0354,100.109,1
4,0.2530,1921,0.95700,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,0.000002,3,0.2290,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.0380,101.665,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170648,0.6080,2020,0.08460,"['Anuel AA', 'Daddy Yankee', 'KAROL G', 'Ozuna...",0.786,301714,0.808,0,0KkIkfsLEJbrcIhYsCL7L5,0.000289,7,0.0822,-3.702,1,China,72,2020-05-29,0.0881,105.029,3
170649,0.7340,2020,0.20600,['Ashnikko'],0.717,150654,0.753,0,0OStKKAuXlxA0fMH54Qs6E,0.000000,7,0.1010,-6.020,1,Halloweenie III: Seven Days,68,2020-10-23,0.0605,137.936,3
170650,0.6370,2020,0.10100,['MAMAMOO'],0.634,211280,0.858,0,4BZXVFYCb76Q0Klojq4piV,0.000009,4,0.2580,-2.226,0,AYA,76,2020-11-03,0.0809,91.688,11
170651,0.1950,2020,0.00998,['Eminem'],0.671,337147,0.623,1,5SiZJoLXp3WOl3J4C8IK0d,0.000008,2,0.6430,-7.161,1,Darkness,70,2020-01-17,0.3080,75.055,4


# Spotipy

In [None]:
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id="e3c87bdffb3447269bc900976e49c372", client_secret="a7f8df73f97f472d82e9b5525811a56e"))

In [None]:
def find_song(name, year):
    song_data = defaultdict()
    results = sp.search(q = 'track: {} year: {}'.format(name,year), limit = 1)
    if results['tracks']['items'] == []:
        return None

    results = results['tracks']['items'][0]
    track_id = results['id']
    audio_features = sp.audio_features(track_id)[0]

    song_data['name'] = [name]
    song_data['year'] = [year]
    song_data['explicit'] = [int(results['explicit'])]
    song_data['duration_ms'] = [results['duration_ms']]
    song_data['popularity'] = [results['popularity']]

    for key, value in audio_features.items():
        song_data[key] = value

    return pd.DataFrame(song_data)

In [None]:
find_song("Life is a Highway", 2009)

Unnamed: 0,name,year,explicit,duration_ms,popularity,danceability,energy,key,loudness,mode,...,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,time_signature
0,Life is a Highway,2009,0,276707,52,0.562,0.91,5,-6.939,1,...,0,0.0676,0.606,103.057,audio_features,5YbeJyTQkdSAWe1Ie4sLAl,spotify:track:5YbeJyTQkdSAWe1Ie4sLAl,https://api.spotify.com/v1/tracks/5YbeJyTQkdSA...,https://api.spotify.com/v1/audio-analysis/5Ybe...,4


In [None]:
number_cols = ['valence', 'year', 'acousticness', 'danceability', 'duration_ms', 'energy', 'explicit',
 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'popularity', 'speechiness', 'tempo']


def get_song_data(song, spotify_data):
    
    try:
        song_data = spotify_data[(spotify_data['name'] == song['name']) 
                                & (spotify_data['year'] == song['year'])].iloc[0]
        return song_data
    
    except IndexError:
        return find_song(song['name'], song['year'])
        

def get_mean_vector(song_list, spotify_data):
    
    song_vectors = []
    
    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
            continue
        song_vector = song_data[number_cols].values
        song_vectors.append(song_vector)  
    
    song_matrix = np.array(list(song_vectors))
    return np.mean(song_matrix, axis=0)


def flatten_dict_list(dict_list):
    
    flattened_dict = defaultdict()
    for key in dict_list[0].keys():
        flattened_dict[key] = []
    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)
            
    return flattened_dict

In [None]:
def recommend_songs(song_list, spotify_data, n_songs = 10):
    
    metadata_cols = ['name', 'year', 'artists']
    song_dict = flatten_dict_list(song_list)
    
    song_center = get_mean_vector(song_list, spotify_data)
    scaler = song_cluster_pipeline.steps[0][1]
    scaled_data = scaler.transform(spotify_data[number_cols])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))
    distances = cdist(scaled_song_center, scaled_data, 'cosine')
    index = list(np.argsort(distances)[:, :n_songs][0])
    
    rec_songs = spotify_data.iloc[index]
    rec_songs = rec_songs[~rec_songs['name'].isin(song_dict['name'])]
    return rec_songs[metadata_cols].to_dict(orient='records')

In [None]:
recommend_songs([{'name':"i want it that way", 'year':1999}],  data)

[{'artists': "['Backstreet Boys']",
  'name': 'I Want It That Way',
  'year': 1999},
 {'artists': "['Backstreet Boys']",
  'name': 'I Want It That Way',
  'year': 2001},
 {'artists': "['Zedd', 'Alessia Cara']", 'name': 'Stay', 'year': 2017},
 {'artists': "['WayV']", 'name': 'Love Talk - English Version', 'year': 2019},
 {'artists': "['Sam Smith']", 'name': 'Like I Can', 'year': 2014},
 {'artists': "['BTS']", 'name': 'Telepathy', 'year': 2020},
 {'artists': "['Michael Bublé']", 'name': 'Everything', 'year': 2007},
 {'artists': "['Plan B']", 'name': 'Es un Secreto', 'year': 2010},
 {'artists': "['TLC']", 'name': 'No Scrubs', 'year': 1999},
 {'artists': "['Mario']", 'name': 'Let Me Love You', 'year': 2004}]