In [None]:
import pandas as pd
import numpy as np
import json
import re 
import sys
import itertools

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

!pip install Spotipy 
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth
import spotipy.util as util

import warnings
warnings.filterwarnings("ignore")



In [None]:
pip install opendatasets



In [None]:
import opendatasets as od
# Kaggle dataset that provides the CSV/JSON files read further down.
dataset_url = 'https://www.kaggle.com/datasets/yamaerenay/spotify-dataset-19212020-600k-tracks'
# Fetch the dataset into the working directory; per the recorded output the
# download is skipped when the files are already present.
od.download(dataset_url)

Skipping, found downloaded files in "./spotify-dataset-19212020-600k-tracks" (use force=True to force download)


In [None]:
# Folder the dataset files live in (relative to the notebook's working directory).
data_dir = './spotify-dataset-19212020-600k-tracks'

In [None]:
import os
# Sanity check: show which files are available in the dataset folder.
os.listdir(data_dir)

['artists.csv', 'data.csv', 'tracks.csv', 'dict_artists.json']

In [None]:
# Load the track-level table that the rest of the notebook works on.
dataset = pd.read_csv(f"{data_dir}/data.csv")


In [None]:
# Restrict to the numeric columns and remember their names.
df = dataset.select_dtypes(include=np.number)
number_cols = df.columns.tolist()

## Standard Scaler
Before clustering our data we need to bring the features onto a comparable scale, so that the clustering algorithm doesn't favour a particular feature just because it has larger values.

we will be using Standard Scaler from the sci-kit learn library to scale down our data.

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the numeric columns and keep the result as a DataFrame
# with the same column names.
scaler = StandardScaler()
scaled_X = scaler.fit_transform(df.to_numpy())
scaled_df = pd.DataFrame(data=scaled_X, columns=df.columns)

## PCA
Linear dimensionality reduction using Singular Value Decomposition of the data to project it to a lower dimensional space. The input data is centered but not scaled for each feature before applying the SVD.

In [None]:
from sklearn.decomposition import PCA

# Project the scaled features onto their first two principal components.
# random_state pins the result when scikit-learn selects a randomized SVD
# solver, so a Restart & Run All reproduces the same projection.
pca = PCA(n_components=2, random_state=42)

pca.fit(scaled_df)

# 2-column array of component scores, used as the clustering input below.
pca_df = pca.transform(scaled_df)

In [None]:
# importing metric library
from sklearn import metrics

## Kmeans Clustering
K-Means clustering is an unsupervised learning algorithm, which is used when you have unlabeled data. The aim of this algorithm is to find groups in the data. K-means clustering algorithm has many uses for grouping text documents, images, videos, and much more. It works iteratively to assign each data point to one of the groups based on the Euclidean distance.

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Seed the centroid initialisation so cluster labels are reproducible between
# runs, and leave verbose off: verbose=2 prints one line per iteration and
# buries the notebook's narrative under log output.
kmeans = KMeans(n_clusters=8, random_state=42)

In [None]:
# Fit the clusterer on the 2-component PCA projection.
kmeans.fit(pca_df)

Initialization complete
Iteration 0, inertia 162711.45838506133
Iteration 1, inertia 153267.60945333185
Iteration 2, inertia 147808.12935921672
Iteration 3, inertia 144471.84023881017
Iteration 4, inertia 142642.24264643755
Iteration 5, inertia 141579.51845790114
Iteration 6, inertia 140858.2289886051
Iteration 7, inertia 140285.6106319623
Iteration 8, inertia 139849.62602773885
Iteration 9, inertia 139510.12658809702
Iteration 10, inertia 139233.0706734676
Iteration 11, inertia 138983.94187230448
Iteration 12, inertia 138774.14103733306
Iteration 13, inertia 138603.09488472622
Iteration 14, inertia 138459.38875163664
Iteration 15, inertia 138338.25458568404
Iteration 16, inertia 138244.57613943116
Iteration 17, inertia 138175.4883693821
Iteration 18, inertia 138120.9938910728
Iteration 19, inertia 138074.49774563347
Iteration 20, inertia 138037.99297306206
Iteration 21, inertia 138006.2987663648
Iteration 22, inertia 137978.61266895544
Iteration 23, inertia 137957.1271989558
Iteration

KMeans(verbose=2)

In [None]:
# Copy of the dataset with each track's K-Means cluster attached; `dataset`
# itself is left untouched.
spotify_dataKmeans = dataset.assign(cluster_label=kmeans.predict(pca_df))

In [None]:
# Calinski-Harabasz index of the clustering on the PCA projection
# (per the scikit-learn documentation, a higher score indicates better-defined clusters).
x = metrics.calinski_harabasz_score(pca_df, spotify_dataKmeans['cluster_label'])
print(x)

150791.69603218033


There are 2 major approaches for building recommendation systems — content-based and collaborative filtering. 

> Content based: The gist of this approach is that we match users to the content or items they have liked or bought. Here the attributes of the users and the products are important. For example, for movie recommendations, we use features such as director, actors, movie length, genre, etc. to find similarity between movies.

> Collaborative Filtering: The underlying assumption of the collaborative filtering approach is that if A and B buy similar products, A is more likely to buy a product that B has bought than a product which a random person has bought. Unlike content based, there are no features corresponding to users or items here.



In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics.pairwise import euclidean_distances
from collections import defaultdict

In [None]:
# Columns fed to the clustering pipeline (order matters: the fitted scaler
# expects its inputs in exactly this column order).
features = [
    'valence',
    'year',
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'explicit',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'popularity',
    'speechiness',
    'tempo',
]

# Descriptive (non-feature) columns.
metadata_cols = ['year', 'name', 'artists']

In [None]:
# Standardise the selected features, then cluster them with K-Means.
# random_state makes the clustering reproducible; KMeans' own verbose output
# is left off because it prints one line per iteration (the Pipeline-level
# verbose=True still reports per-step timing).
song_cluster_pipeline = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('kmeans', KMeans(n_clusters=8, random_state=42)),
    ],
    verbose=True,
)
X = dataset[features]
song_cluster_pipeline.fit(X)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
Initialization complete
Iteration 0, inertia 2062060.7781208793
Iteration 1, inertia 1514199.236063819
Iteration 2, inertia 1456188.0311555099
Iteration 3, inertia 1430521.0260133548
Iteration 4, inertia 1415298.8023327566
Iteration 5, inertia 1406237.3116557216
Iteration 6, inertia 1402286.7230520255
Iteration 7, inertia 1400127.089551833
Iteration 8, inertia 1398790.5259697842
Iteration 9, inertia 1397814.9929396787
Iteration 10, inertia 1397001.7923352444
Iteration 11, inertia 1396262.4447506652
Iteration 12, inertia 1395512.8487275164
Iteration 13, inertia 1394616.134199248
Iteration 14, inertia 1393662.6087383798
Iteration 15, inertia 1392716.694176499
Iteration 16, inertia 1391836.6112401923
Iteration 17, inertia 1391090.2618235885
Iteration 18, inertia 1390483.7355313157
Iteration 19, inertia 1389953.5647466485
Iteration 20, inertia 1389376.4435319374
Iteration 21, inertia 1388621.597684539
Iteration 22, iner

Pipeline(steps=[('scaler', StandardScaler()), ('kmeans', KMeans(verbose=2))],
         verbose=True)

## Generating song recommendations
Now we can finally build the music recommendation system!

This algorithm follows a common approach that is used in content-based recommender systems and is generalizable because we can mathematically define the term closest with a wide range of distance metrics ranging from the classic Euclidean distance to the cosine distance. For the purpose of this project, I used the cosine distance, which is defined below for two vectors u and v.

$$d(u, v) = 1 - \frac{u \cdot v}{\lVert u \rVert \, \lVert v \rVert}$$

Cosine distance formula.
In other words, the cosine distance is one minus the cosine similarity — the cosine of the angle between the two vectors. The cosine distance is commonly used in recommender systems and can work well even when the vectors being used have different magnitudes. If the vectors for two songs are parallel, the angle between them will be zero, meaning the cosine distance between them will also be zero because the cosine of zero is 1.

SciPy's `cdist` function is used to compute the distance between each pair of points drawn from two collections of points.

In [None]:
import os

from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict

# Credentials must never be committed to the notebook: read them from the
# environment (these are the variable names Spotipy itself documents).
# NOTE(review): the client id/secret that used to be hard-coded in this cell
# have been exposed and should be revoked in the Spotify developer dashboard.
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(
    client_id=os.environ["SPOTIPY_CLIENT_ID"],
    client_secret=os.environ["SPOTIPY_CLIENT_SECRET"]))


def find_song(name, year):

    """
    Look a song up on Spotify by name and release year.

    Uses the module-level Spotipy client `sp` to search for the track (first
    hit only) and then to fetch that track's audio features.

    Parameters
    ----------
    name : title of the song to search for.
    year : release year used to narrow the search.

    Returns
    -------
    A single-row pandas DataFrame holding the given name and year, the track's
    'explicit' (as int), 'duration_ms' and 'popularity' fields, and every
    key/value pair of the audio-features record; or None when the search has
    no hit or Spotify returns no audio features for the track.
    """

    results = sp.search(q= 'track: {} year: {}'.format(name,
                                                       year), limit=1)
    items = results['tracks']['items']
    if not items:
        return None

    track = items[0]

    # audio_features returns a list with one entry per requested id; the entry
    # is None when no features are available, which previously crashed on
    # `.items()`.
    feature_list = sp.audio_features(track['id'])
    if not feature_list or feature_list[0] is None:
        return None
    audio_features = feature_list[0]

    song_data = {
        'name': [name],
        'year': [year],
        'explicit': [int(track['explicit'])],
        'duration_ms': [track['duration_ms']],
        'popularity': [track['popularity']],
    }

    # Scalars are broadcast against the one-element lists above when the
    # DataFrame is built.
    for key, value in audio_features.items():
        song_data[key] = value

    return pd.DataFrame(song_data)

In [None]:
from collections import defaultdict
from scipy.spatial.distance import cdist
import difflib

# Numeric columns used to build song vectors. This is the same list, in the
# same order, as `features`, which the pipeline's scaler was fitted on.
number_cols = [
    'valence',
    'year',
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'explicit',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'popularity',
    'speechiness',
    'tempo',
]

def get_song_data(song, spotify_data):

    """
    Fetch the data for one song.

    `song` is a dictionary with 'name' and 'year' entries. The first row of
    `spotify_data` whose 'name' and 'year' both match is returned (as a
    Series). When no row matches, the lookup is delegated to find_song and
    its result is returned unchanged.
    """

    title, release_year = song['name'], song['year']
    is_match = (spotify_data['name'] == title) & (spotify_data['year'] == release_year)
    matches = spotify_data[is_match]

    # Not in the local dataset: fall back to the Spotify lookup.
    if len(matches) == 0:
        return find_song(title, release_year)

    return matches.iloc[0]
        

def get_mean_vector(song_list, spotify_data):

    """
    Gets the mean feature vector for a list of songs.

    Each song (a dict with 'name' and 'year') is resolved with get_song_data;
    songs that cannot be resolved are reported with a printed warning and
    skipped. The `number_cols` values of the remaining songs are averaged
    column-wise.

    Returns
    -------
    1-D float numpy array with one entry per column in `number_cols`.

    Raises
    ------
    ValueError
        If none of the songs could be resolved (previously this silently
        produced NaN and failed later with a less helpful error).
    """

    song_vectors = []

    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
            continue
        # get_song_data yields a Series for dataset hits but a one-row
        # DataFrame for the find_song fallback; flatten both to the same 1-D
        # float vector so they can be stacked.
        song_vector = np.asarray(song_data[number_cols].values, dtype=float).ravel()
        song_vectors.append(song_vector)

    if not song_vectors:
        raise ValueError('None of the given songs could be found in Spotify or in database')

    song_matrix = np.vstack(song_vectors)
    return np.mean(song_matrix, axis=0)

def flatten_dict_list(dict_list):

    """
    Utility function for flattening a list of dictionaries.

    Turns [{'k': v1, ...}, {'k': v2, ...}] into {'k': [v1, v2], ...}: every key
    maps to the list of values it had across the input dictionaries, in input
    order.

    An empty `dict_list` yields an empty mapping, and keys that appear only in
    some of the dictionaries are collected as well (both cases used to raise).
    The result is a defaultdict without a default factory, so looking up a
    missing key still raises KeyError.
    """

    flattened_dict = defaultdict()

    for dictionary in dict_list:
        for key, value in dictionary.items():
            # setdefault creates the list on first sight of a key, so the keys
            # no longer have to be taken from the first dictionary only.
            flattened_dict.setdefault(key, []).append(value)

    return flattened_dict

In [None]:
def recommend_songs(song_list, spotify_data, n_songs=10):

    """
    Recommends songs based on a list of previous songs that a user has listened to.

    The mean feature vector of the input songs is scaled with the scaler of
    `song_cluster_pipeline`, and the rows of `spotify_data` are ranked by
    cosine distance to it.

    Parameters
    ----------
    song_list : list of dicts, each with 'name' and 'year'.
    spotify_data : DataFrame containing the `number_cols` columns plus
        'name', 'year' and 'artists'.
    n_songs : maximum number of recommendations to return.

    Returns
    -------
    List of dicts with 'name', 'year' and 'artists', closest first. Songs whose
    name is in `song_list` are excluded *before* the top `n_songs` are taken,
    so the list is only shorter than `n_songs` when the dataset runs out of
    candidates.
    """

    metadata_cols = ['name', 'year', 'artists']
    song_dict = flatten_dict_list(song_list)

    song_center = get_mean_vector(song_list, spotify_data)
    scaler = song_cluster_pipeline.steps[0][1]
    scaled_data = scaler.transform(spotify_data[number_cols])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))

    # cdist returns a (1, n_rows) matrix; take its only row.
    distances = cdist(scaled_song_center, scaled_data, 'cosine')[0]
    ranked = np.argsort(distances)

    # Drop the seed songs first and only then cut to n_songs; cutting first
    # returned fewer results whenever a seed song was among the nearest rows.
    is_seed = spotify_data['name'].isin(song_dict['name']).to_numpy()
    index = ranked[~is_seed[ranked]][:n_songs]

    rec_songs = spotify_data.iloc[index]
    return rec_songs[metadata_cols].to_dict(orient='records')

In [None]:
# Example: recommendations for a small seed playlist of three 2020 tracks.
seed_songs = [
    {'name': 'Toosie Slide', 'year': 2020},
    {'name': 'Outta Time (feat. Drake)', 'year': 2020},
    {'name': 'Chicago Freestyle (feat. Giveon)', 'year': 2020},
]
results = recommend_songs(seed_songs, dataset)
results

[{'artists': "['Doja Cat', 'Gucci Mane']",
  'name': 'Like That (feat. Gucci Mane)',
  'year': 2019},
 {'artists': "['Rod Wave', 'Lil Baby']",
  'name': 'Rags2Riches 2 (feat. Lil Baby)',
  'year': 2020},
 {'artists': "['Pop Smoke', 'Lil Baby', 'DaBaby']",
  'name': 'For The Night (feat. Lil Baby & DaBaby)',
  'year': 2020},
 {'artists': "['Kodak Black', 'Offset', 'Travis Scott']",
  'name': 'ZEZE (feat. Travis Scott & Offset)',
  'year': 2018},
 {'artists': "['A$AP Rocky', 'Skepta']",
  'name': 'Praise The Lord (Da Shine) (feat. Skepta)',
  'year': 2018},
 {'artists': "['Jhay Cortez', 'Bad Bunny']",
  'name': 'CÓMO SE SIENTE - Remix',
  'year': 2020},
 {'artists': "['Joseph Black']", 'name': '(i hope you) miss me', 'year': 2020},
 {'artists': "['Young Thug', 'J. Cole', 'Travis Scott']",
  'name': 'The London (feat. J. Cole & Travis Scott)',
  'year': 2019},
 {'artists': "['$NOT', 'Wifisfuneral']",
  'name': 'BERETTA (feat. Wifisfuneral)',
  'year': 2020},
 {'artists': "['Sueco']", 'nam