In [1]:
import os
import numpy as np
import pandas as pd

import seaborn as sns
import plotly.express as px 
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the three CSV files: individual tracks, data grouped by genre, data grouped by year
data, genre_data, year_data = (
    pd.read_csv(csv_path)
    for csv_path in ("data_o.csv", "data_by_genres_o.csv", "data_by_year_o.csv")
)

In [None]:
# Print a summary (columns, non-null counts, dtypes) of the track-level table
data.info()

In [None]:
# Print a summary (columns, non-null counts, dtypes) of the genre-level table
genre_data.info()

In [None]:
# Print a summary (columns, non-null counts, dtypes) of the year-level table
year_data.info()

All of the analysis below treats 'popularity' as the target variable. Before going further, let's check how a handful of features correlate with it, using the yellowbrick package.

In [None]:
from yellowbrick.target import FeatureCorrelation

# Columns whose correlation with popularity is examined
feature_names = [
    'acousticness', 'danceability', 'energy', 'instrumentalness',
    'liveness', 'loudness', 'speechiness', 'tempo', 'valence',
    'duration_ms', 'explicit', 'key', 'mode', 'year',
]

# Feature matrix and target column
X = data[feature_names]
y = data['popularity']

# The visualizer receives its labels as an array of the feature names
features = np.array(feature_names)
visualizer = FeatureCorrelation(labels=features)

# Enlarge the default figure size, then fit the visualizer and render it
plt.rcParams['figure.figsize'] = (20, 20)
visualizer.fit(X, y)
visualizer.show()

# Data Understanding by Visualization and EDA

## Music by Time

Using the data grouped by year, we can understand how the overall sound of music has changed from 1921 to 2020.

In [None]:
def get_decade(year):
    """Return the decade label for ``year``, e.g. ``1987`` -> ``'1980s'``."""
    decade_start = 10 * int(year / 10)
    return f'{decade_start}s'

# Tag every track with its decade label (e.g. '1980s')
data['decade'] = data['year'].apply(get_decade)

sns.set(rc={'figure.figsize': (11, 6)})

# Pass the column by keyword: from seaborn 0.12 on, the only positional
# argument of countplot is `data`, so a bare Series is no longer read as `x`.
# Sorting the labels keeps the bars in chronological order.
decade_order = sorted(data['decade'].unique())
sns.countplot(x='decade', data=data, order=decade_order)

# Clustering Genres with K-Means

Here, I use a simple K-means clustering algorithm to divide the genres in this dataset into ten clusters based on the numerical audio features of each genre.

In [None]:
# Display the genre-level table that is clustered in this section
genre_data

 Finding the Optimal Number of Clusters

In [None]:
from yellowbrick.cluster import KElbowVisualizer

# Build the elbow plot from the genre-level numeric features. When the notebook
# is run top to bottom, `X` still holds the track-level matrix created for the
# correlation plot at this point, so it must not be reused here.
# Dropping 'cluster' keeps the cell re-runnable after labels have been added.
genre_features = genre_data.select_dtypes(np.number).drop(columns='cluster', errors='ignore')

# Standardise first so the elbow is computed on scaled features, as the
# clustering pipeline itself does.
genre_features_scaled = StandardScaler().fit_transform(genre_features)

kmeans = KMeans(random_state=42)  # fixed seed so the curve is reproducible
visualizer = KElbowVisualizer(kmeans, k=(2, 20))
visualizer.fit(genre_features_scaled)
visualizer.show()  # `poof()` is the deprecated name of `show()`

Based on the decrease in fit time and distortion score, we choose k = 10. We could choose 5, but that would be a little too few clusters for the genres.

In [4]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Standardise the features, then group the genres into 10 clusters.
# `n_jobs` is not passed: KMeans deprecated it in scikit-learn 0.23 and
# removed it in 1.0, where it raises a TypeError.
# A fixed random_state makes the cluster assignments reproducible.
cluster_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('kmeans', KMeans(n_clusters=10, random_state=42)),
])

# Numeric columns only. A previously assigned 'cluster' column is dropped so
# that re-running this cell does not feed old labels back in as a feature.
X = genre_data.select_dtypes(np.number).drop(columns='cluster', errors='ignore')
cluster_pipeline.fit(X)
genre_data['cluster'] = cluster_pipeline.predict(X)

In [None]:
# Visualizing the Clusters with t-SNE

from sklearn.manifold import TSNE

# Recompute the genre feature matrix from `genre_data` rather than relying on
# whatever `X` currently holds; the 'cluster' label is not an input feature.
genre_features = genre_data.select_dtypes(np.number).drop(columns='cluster', errors='ignore')

# Standardise, then embed in 2-D. t-SNE is stochastic, so a fixed random_state
# is needed for the layout to be the same on every run.
tsne_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('tsne', TSNE(n_components=2, verbose=2, random_state=42)),
])
genre_embedding = tsne_pipeline.fit_transform(genre_features)

# Attach the genre name and cluster label to each embedded point
projection = pd.DataFrame(columns=['x', 'y'], data=genre_embedding)
projection['genres'] = genre_data['genres']
projection['cluster'] = genre_data['cluster']

fig = px.scatter(
    projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'genres'])
fig.show()

# Clustering Songs with K-Means

Finding the optimal number of clusters

In [None]:
from yellowbrick.cluster import KElbowVisualizer

# Build the elbow plot from the track-level numeric features. When the notebook
# is run top to bottom, `X` still holds the genre-level matrix from the genre
# clustering section at this point, so it must not be reused here.
# Dropping 'cluster_label' keeps the cell re-runnable after labels have been added.
song_features = data.select_dtypes(np.number).drop(columns='cluster_label', errors='ignore')

# Standardise first so the elbow is computed on scaled features, as the
# clustering pipeline itself does.
song_features_scaled = StandardScaler().fit_transform(song_features)

kmeans = KMeans(random_state=42)  # fixed seed so the curve is reproducible
visualizer = KElbowVisualizer(kmeans, k=(2, 20))
# One K-means model is fitted per candidate k on every track, so this can be slow.
visualizer.fit(song_features_scaled)
visualizer.show()  # `poof()` is the deprecated name of `show()`

We choose k = 20. We could choose 13 or some other value, but that would be a little too few clusters for all the songs.

In [3]:
# Standardise the features, then group the songs into 20 clusters.
# `n_jobs` is not passed: KMeans deprecated it in scikit-learn 0.23 and
# removed it in 1.0, where it raises a TypeError.
# A fixed random_state makes the cluster labels reproducible.
song_cluster_pipeline = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('kmeans', KMeans(n_clusters=20, verbose=2, random_state=42)),
    ],
    verbose=True,
)

# Numeric columns only. A previously assigned 'cluster_label' column is dropped
# so that re-running this cell does not change the set of features the scaler
# is fitted on.
X = data.select_dtypes(np.number).drop(columns='cluster_label', errors='ignore')
number_cols = list(X.columns)
song_cluster_pipeline.fit(X)
song_cluster_labels = song_cluster_pipeline.predict(X)
data['cluster_label'] = song_cluster_labels

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.3s
Initialization complete
Iteration 0, inertia 1559979.8019470547
...Converged at iteration 37: center shift 8.206930591686624e-05 within tolerance 0.00010000000000000789.
[Pipeline] ............ (step 2 of 2) Processing kmeans, total=  40.3s


In [None]:
# Visualizing the Clusters with PCA

from sklearn.decomposition import PCA

# Scale the song features and reduce them to two principal components
pca_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('PCA', PCA(n_components=2)),
])
song_embedding = pca_pipeline.fit_transform(X)

# Pair each 2-D point with the song's title and its cluster label
projection = pd.DataFrame(song_embedding, columns=['x', 'y']).assign(
    title=data['name'],
    cluster=data['cluster_label'],
)

# Scatter plot coloured by cluster, with coordinates and title on hover
fig = px.scatter(
    projection,
    x='x',
    y='y',
    color='cluster',
    hover_data=['x', 'y', 'title'],
)
fig.show()

# Building the RecSys

* Based on the visualizations above, similar genres tend to have data points that are located close to each other, and similar types of songs are also clustered together.
* This observation makes sense. Similar genres will sound similar and will come from similar time periods, and the same can be said for songs within those genres. This can be used to build a recommendation system by taking the data points of the songs a user has listened to and recommending songs corresponding to nearby data points.
* Spotipy is a Python client for the Spotify Web API that makes it easy for developers to fetch data and query Spotify’s catalog for songs. Install it with `pip install spotipy`.
* After installing Spotipy, you will need to create an app on the Spotify Developer’s page and save your Client ID and secret key; the code below reads them from the `SPOTIPY_CLIENT_ID` and `SPOTIPY_CLIENT_SECRET` environment variables.

In [7]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict

# Build the Spotify client. The client ID and secret are read from environment
# variables so that they never appear in the notebook itself.
spotify_credentials = SpotifyClientCredentials(
    client_id=os.environ['SPOTIPY_CLIENT_ID'],
    client_secret=os.environ['SPOTIPY_CLIENT_SECRET'],
)
sp = spotipy.Spotify(auth_manager=spotify_credentials)

def find_song(name, year):
    """Look up a track on Spotify and return its metadata and audio features.

    Args:
        name: Track title to search for.
        year: Release year included in the search query.

    Returns:
        A one-row ``pandas.DataFrame`` with the columns ``name``, ``year``,
        ``explicit`` (as 0/1), ``duration_ms``, ``popularity`` and one column
        per field of the track's audio-features record. Returns ``None`` when
        the search yields no track, or when no audio features are returned for
        the matched track.
    """
    results = sp.search(q= 'track: {} year: {}'.format(name,year), limit=1)
    items = results['tracks']['items']
    if not items:
        return None

    track = items[0]

    # The per-track entry may be None instead of a dict; treat that the same
    # way as "song not found" rather than failing on `.items()`.
    audio_features = sp.audio_features(track['id'])[0]
    if audio_features is None:
        return None

    song_data = {
        'name': name,
        'year': year,
        'explicit': int(track['explicit']),
        'duration_ms': track['duration_ms'],
        'popularity': track['popularity'],
    }
    # Fields present in both (e.g. duration_ms) take the audio-features value
    # and keep their original column position.
    song_data.update(audio_features)

    return pd.DataFrame([song_data])

In [8]:
from collections import defaultdict
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
import difflib

# Numeric feature columns selected, in this order, whenever a song is turned
# into a vector or the dataset is passed through the fitted scaler.
number_cols = [
    'valence', 'year', 'acousticness', 'danceability', 'duration_ms',
    'energy', 'explicit', 'instrumentalness', 'key', 'liveness',
    'loudness', 'mode', 'popularity', 'speechiness', 'tempo',
]


def get_song_data(song, spotify_data):
    """Fetch the data for one song, preferring the local dataset.

    ``song`` is a mapping with ``'name'`` and ``'year'`` keys. The first row of
    ``spotify_data`` matching both values is returned; when there is no such
    row, the result of ``find_song(name, year)`` is returned instead.
    """
    same_name = spotify_data['name'] == song['name']
    same_year = spotify_data['year'] == song['year']
    matches = spotify_data[same_name & same_year]

    # Nothing in the local dataset: fall back to the Spotify lookup
    if len(matches) == 0:
        return find_song(song['name'], song['year'])

    return matches.iloc[0]
        

def get_mean_vector(song_list, spotify_data):
    """Average the numeric feature vectors of the given songs.

    Args:
        song_list: Iterable of mappings with ``'name'`` and ``'year'`` keys.
        spotify_data: DataFrame searched for each song before falling back to
            the Spotify lookup.

    Returns:
        A 1-D float array with one entry per column in ``number_cols``: the
        mean over all songs that could be resolved.

    Raises:
        ValueError: If none of the songs could be resolved.
    """
    song_vectors = []

    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
            continue
        # A dataset hit is a Series (shape (n,)); a Spotify hit is a one-row
        # DataFrame (shape (1, n)). Flatten both to a 1-D float vector so that
        # they can be stacked together.
        song_vector = np.asarray(song_data[number_cols], dtype=float).ravel()
        song_vectors.append(song_vector)

    if not song_vectors:
        raise ValueError('None of the given songs could be found in Spotify or in database')

    return np.mean(np.vstack(song_vectors), axis=0)


def flatten_dict_list(dict_list):
    """Turn a list of dicts into a dict of lists.

    Each key maps to the list of values found under that key, in input order.
    An empty input gives an empty mapping, and keys that are missing from the
    first dictionary are still collected.

    Args:
        dict_list: Sequence of dictionaries.

    Returns:
        A ``defaultdict(list)`` mapping each key to its collected values.
    """
    flattened_dict = defaultdict(list)

    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)

    return flattened_dict


def recommend_songs( song_list, spotify_data, n_songs=10):
    """Recommend the songs closest to the average of the given songs.

    The songs in ``song_list`` are averaged into one feature vector. That
    vector and every row of ``spotify_data`` are standardised with the scaler
    of ``song_cluster_pipeline``, and rows are ranked by cosine distance to it.

    Args:
        song_list: List of dicts with ``'name'`` and ``'year'`` keys.
        spotify_data: DataFrame containing ``number_cols`` and the metadata
            columns ``name``, ``year`` and ``artists``.
        n_songs: Maximum number of recommendations to return.

    Returns:
        A list of ``{'name', 'year', 'artists'}`` dicts, nearest first. Rows
        whose name matches one of the input songs are excluded before the
        list is cut to ``n_songs``.
    """
    metadata_cols = ['name', 'year', 'artists']
    song_dict = flatten_dict_list(song_list)

    song_center = get_mean_vector(song_list, spotify_data)
    scaler = song_cluster_pipeline.steps[0][1]
    scaled_data = scaler.transform(spotify_data[number_cols])

    # Give the centre the same column labels as the data passed to the scaler
    # so that both inputs are transformed with matching feature names.
    center_frame = pd.DataFrame(np.asarray(song_center, dtype=float).reshape(1, -1),
                                columns=number_cols)
    scaled_song_center = scaler.transform(center_frame)

    distances = cdist(scaled_song_center, scaled_data, 'cosine')[0]
    ranked = np.argsort(distances)

    # Drop the input songs BEFORE truncating; filtering afterwards would
    # return fewer than n_songs whenever an input song ranks near the top.
    is_input_song = spotify_data['name'].isin(song_dict['name']).to_numpy()
    index = ranked[~is_input_song[ranked]][:n_songs]

    rec_songs = spotify_data.iloc[index]
    return rec_songs[metadata_cols].to_dict(orient='records')

In [11]:
# Songs the recommendations should be similar to
seed_songs = [
    {'name': 'Lucky Star', 'year': 1983},
    {'name': 'Round Round', 'year': 2002},
    {'name': 'Give It Away', 'year': 1991},
    {'name': 'Strict Machine', 'year': 2003},
    {'name': 'Smile Like You Mean It', 'year': 2004},
]
recommend_songs(seed_songs, data)

[{'name': 'Helena Beat', 'year': 2011, 'artists': "['Foster The People']"},
 {'name': 'The Riddle', 'year': 2006, 'artists': '["Gigi D\'Agostino"]'},
 {'name': 'Hail to the King',
  'year': 2013,
  'artists': "['Avenged Sevenfold']"},
 {'name': 'You Were Right', 'year': 2016, 'artists': "['RÜFÜS DU SOL']"},
 {'name': 'Sadi Gali', 'year': 2011, 'artists': "['Lehmber Hussainpuri']"},
 {'name': 'Sugar (feat. Francesco Yates)',
  'year': 2015,
  'artists': "['Robin Schulz', 'Francesco Yates']"},
 {'name': 'La Planta', 'year': 2014, 'artists': "['Caos']"},
 {'name': 'Summer', 'year': 2014, 'artists': "['Calvin Harris']"},
 {'name': 'The One', 'year': 1999, 'artists': "['Backstreet Boys']"},
 {'name': "DJ Got Us Fallin' In Love (feat. Pitbull)",
  'year': 2010,
  'artists': "['Usher', 'Pitbull']"}]

The final cell returns a list of recommended songs like the one shown above. You can change the input songs to get different recommendations.