# Recommendation using Genre

Spotify has over a hundred genres in their database. When asked, the average person can name only less than 10 genres so in reality, not all are relevant and too many genres can only be confusing. This part we're going to train a model to classify songs based on their features and predict the genre of future songs. 7 major genres are used : Pop, Rock, Country, R&B, Jazz, EDM and Hip-Hop.

In [25]:
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn import preprocessing
import pickle
import seaborn as sns

In [26]:
# set up spotify client

SPOTIFY_CLIENT_ID = '8bb89c78e01147559a8e3abdcdf84f4e'
SPOTIFY_CLIENT_SECRET = 'fdae390db3e14974bfd77b31b55d67c7'
client_credentials_manager = SpotifyClientCredentials(client_id=SPOTIFY_CLIENT_ID,client_secret=SPOTIFY_CLIENT_SECRET)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager,requests_timeout=None)

In [27]:
def getTrackFeatures(id):
    meta = sp.track(id)
    features = sp.audio_features(id)

  # meta
    track_id = meta['id']
    name = meta['name']
    album = meta['album']['name']
    artist = meta['album']['artists'][0]['name']
    artist_id =  meta['album']['artists'][0]['id']
    release_date = meta['album']['release_date']
    song_length = meta['duration_ms']
    popularity = meta['popularity']

  # features
    acousticness = features[0]['acousticness']
    danceability = features[0]['danceability']
    energy = features[0]['energy']
    instrumentalness = features[0]['instrumentalness']
    key=features[0]['key']
    liveness = features[0]['liveness']
    loudness = features[0]['loudness']
    speechiness = features[0]['speechiness']
    tempo = features[0]['tempo']
    time_signature = features[0]['time_signature']
    valence = features[0]['valence']
    mode=features[0]['mode']

    
    track = [track_id,name, album, artist,artist_id,release_date, song_length, popularity, acousticness, danceability,
             energy, instrumentalness, key, liveness, loudness, speechiness, tempo, time_signature,valence, mode]

    return track

### Creating the dataset
Get songs from 7 genres from Everynoise.com

In [8]:
# create the dataset

genres = 'pop hiphop r&b r&b rock edm jazz jazz country country'.split()

# the playlist of the different genres
playlists = '''6gS3HhOiI17QNojjPuPzqc 6s5MoZzR70Qef7x4bVxDO1 1rLnwJimWCmjp3f0mEbnkY 0Hwb2a9DJdom4yoe5V41K9 
                7dowgSWOmvdpwNkGFMUs6e 4luNnGhISZdURbFcCl2dB6 5EyFMotmvSfDAZ4hSdKrbx 2ZazIXecBCVmTlbyKJHxOc
                 4mijVkpSXJziPiOrK7YX4M 4fj8PNbbwGXBWHKodGQhfD'''.split()

# the features 
columns = ['track_id','release_date','song_length','popularity','acousticness','danceability', 'energy','instrumentalness','key', 
           'liveness','loudness', 'speechiness', 'tempo','time_signature', 'valence', 'mode', 'genre']
to_append = []

for g in zip(genres, playlists):
    
    song_count = sp.user_playlist_tracks(playlist_id=g[1])['total']
    offset_index=0
    tracks_list = []

    while offset_index<song_count:

        # get playlist for given genre 
        track_ids = sp.user_playlist_tracks('thesoundsofspotify',playlist_id=g[1],fields='items(track(id))',offset=offset_index)

        for track in track_ids['items']:
            if track['track']:
                tracks_list.append(track['track']['id'])
            else:
                pass
        offset_index = offset_index + 100
        
    tracks_af=[]
    meta=[]
    for track_num,track_id in tqdm(enumerate(tracks_list)):
        
        meta.append(sp.track(track_id))
        tracks_af.append(sp.audio_features(track_id))
        track_data = []
        track_data.append(tracks_af[track_num][0]['id'])
        track_data.append(meta[track_num]['album']['release_date'])
        track_data.append(meta[track_num]['duration_ms'])
        track_data.append(meta[track_num]['popularity'])
        track_data.append(tracks_af[track_num][0]['acousticness'])
        track_data.append(tracks_af[track_num][0]['danceability'])
        track_data.append(tracks_af[track_num][0]['energy'])
        track_data.append(tracks_af[track_num][0]['instrumentalness'])
        track_data.append(tracks_af[track_num][0]['key'])
        track_data.append(tracks_af[track_num][0]['liveness'])
        track_data.append(tracks_af[track_num][0]['loudness'])
        track_data.append(tracks_af[track_num][0]['speechiness'])
        track_data.append(tracks_af[track_num][0]['tempo'])
        track_data.append(tracks_af[track_num][0]['time_signature'])
        track_data.append(tracks_af[track_num][0]['valence'])
        track_data.append(tracks_af[track_num][0]['mode'])
        track_data.append(g[0])
        to_append.append(track_data)
dataset = pd.DataFrame(to_append, columns=columns)
dataset.to_csv("datasets/genres.csv", sep = ',', encoding="utf-8-sig")

1098it [03:19,  5.51it/s]
929it [02:51,  5.41it/s]
572it [01:41,  5.63it/s]
698it [02:10,  5.36it/s]
1173it [03:32,  5.53it/s]
1116it [03:22,  5.52it/s]
392it [01:09,  5.62it/s]
521it [01:35,  5.48it/s]
572it [01:45,  5.41it/s]
458it [01:25,  5.35it/s]


In [9]:
# the dataset
dataset = pd.read_csv('datasets/genres.csv',index_col=[0])
dataset

Unnamed: 0,track_id,release_date,song_length,popularity,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,speechiness,tempo,time_signature,valence,mode,genre
0,0prNGof3XqfTvNDxHonvdK,2015-11-13,230226,75,0.0285,0.573,0.739,0.000000,0,0.1110,-5.740,0.1290,97.085,4,0.4510,1,pop
1,3yOlyBJuViE2YSGn3nVE1K,2019-12-06,170746,83,0.0180,0.724,0.491,0.000013,8,0.0887,-6.024,0.0296,105.046,4,0.3830,1,pop
2,4l0Mvzj72xxOpRrp6h8nHi,2020-01-10,206458,85,0.5560,0.488,0.343,0.000000,4,0.2100,-8.985,0.0436,102.819,4,0.0978,1,pop
3,3e7sxremeOE3wTySiOhGiP,2018-12-14,239000,74,0.1020,0.259,0.437,0.000001,11,0.1060,-6.589,0.0386,180.042,4,0.0951,0,pop
4,4tCtwWceOPWzenK2HAIJSb,2016-05-27,214480,79,0.1030,0.803,0.585,0.000004,8,0.0644,-5.861,0.0432,105.017,4,0.5930,1,pop
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7524,3R8YbAsBsFnGIKL25s6iiO,2008-01-01,241720,63,0.2650,0.524,0.403,0.000031,7,0.3410,-8.772,0.0316,94.693,4,0.1830,1,country
7525,2IuazAsyvbb3fFZb0Uvsmk,2018-01-12,221600,64,0.0166,0.592,0.928,0.000015,11,0.1230,-5.402,0.0558,121.996,4,0.3930,1,country
7526,1dXUWskP4zy7Inqpfy5hf6,2011-09-13,219266,42,0.4750,0.512,0.532,0.000000,0,0.0993,-3.280,0.0301,147.473,4,0.2560,1,country
7527,67TFWxLDMQiElI6ongsK4V,2005,207053,18,0.4530,0.545,0.646,0.000000,2,0.1200,-4.386,0.0348,143.912,4,0.3600,1,country


###  Get the average,min,max features of each genre

In [10]:
min_max_scaler = preprocessing.MinMaxScaler()

# normalize the features
songs_features = dataset.copy()
songs_features = songs_features.drop(['genre','track_id','release_date'],axis=1)
songs_features =  min_max_scaler.fit_transform(songs_features)

In [11]:
df_features = pd.DataFrame(songs_features,columns=['song_length','popularity','acousticness','danceability','energy',
                                                   'instrumentalness','key','liveness','loudness','speechiness','tempo',
                                                   'time_signature','valence','mode'])
df_features['genre'] = dataset['genre']

In [12]:
df_features.groupby("genre").describe()

Unnamed: 0_level_0,song_length,song_length,song_length,song_length,song_length,song_length,song_length,song_length,popularity,popularity,...,valence,valence,mode,mode,mode,mode,mode,mode,mode,mode
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
genre,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
country,1030.0,0.193775,0.072378,0.006952,0.145542,0.186946,0.233168,0.556875,1030.0,0.553874,...,0.755818,0.996869,1030.0,0.915534,0.278221,0.0,1.0,1.0,1.0,1.0
edm,1116.0,0.1925,0.109452,0.007292,0.114638,0.166991,0.242763,0.875848,1116.0,0.504654,...,0.545028,0.975999,1116.0,0.508961,0.500144,0.0,0.0,1.0,1.0,1.0
hiphop,929.0,0.231937,0.106996,0.0,0.157298,0.228172,0.297578,0.871234,929.0,0.648064,...,0.704685,0.987478,929.0,0.53606,0.498967,0.0,0.0,1.0,1.0,1.0
jazz,913.0,0.314913,0.210075,0.0,0.144616,0.280612,0.433434,0.995244,913.0,0.433657,...,0.580507,0.977043,913.0,0.560789,0.496563,0.0,0.0,1.0,1.0,1.0
pop,1098.0,0.18254,0.072581,0.00233,0.133256,0.175014,0.223095,0.759267,1098.0,0.710801,...,0.629291,0.990608,1098.0,0.620219,0.485554,0.0,0.0,1.0,1.0,1.0
r&b,1270.0,0.234014,0.104045,0.002568,0.163983,0.225698,0.292242,0.955113,1270.0,0.540664,...,0.666858,0.992695,1270.0,0.46063,0.498644,0.0,0.0,0.0,1.0,1.0
rock,1173.0,0.277992,0.145725,0.002142,0.183719,0.252191,0.346426,1.0,1173.0,0.643336,...,0.741208,1.0,1173.0,0.704177,0.456606,0.0,0.0,1.0,1.0,1.0


In [13]:
avg_features = {}

avg_features['song_length'] =  dataset.groupby(['genre']).song_length.mean()
avg_features['popularity'] = dataset.groupby(['genre']).popularity.mean()
avg_features['acousticness'] = dataset.groupby(['genre']).acousticness.mean()
avg_features['danceability'] = dataset.groupby(['genre']).danceability.mean()
avg_features['energy'] = dataset.groupby(['genre']).energy.mean()
avg_features['instrumentalness'] = dataset.groupby(['genre']).instrumentalness.mean()
avg_features['key'] = dataset.groupby(['genre']).key.mean()
avg_features['liveness'] = dataset.groupby(['genre']).liveness.mean()
avg_features['loudness'] = dataset.groupby(['genre']).loudness.mean()
avg_features['speechiness'] = dataset.groupby(['genre']).speechiness.mean()
avg_features['tempo'] = dataset.groupby(['genre']).tempo.mean()
avg_features['time_signature'] = dataset.groupby(['genre']).time_signature.mean()
avg_features['valence'] = dataset.groupby(['genre']).valence.mean()
avg_features['mode'] = dataset.groupby(['genre']).mode.mean()

In [14]:
min_features = {}

min_features['song_length'] =  dataset.groupby(['genre']).song_length.min()
min_features['popularity'] = dataset.groupby(['genre']).popularity.min()
min_features['acousticness'] = dataset.groupby(['genre']).acousticness.min()
min_features['danceability'] = dataset.groupby(['genre']).danceability.min()
min_features['energy'] = dataset.groupby(['genre']).energy.min()
min_features['instrumentalness'] = dataset.groupby(['genre']).instrumentalness.min()
min_features['key'] = dataset.groupby(['genre']).key.min()
min_features['liveness'] = dataset.groupby(['genre']).liveness.min()
min_features['loudness'] = dataset.groupby(['genre']).loudness.min()
min_features['speechiness'] = dataset.groupby(['genre']).speechiness.min()
min_features['tempo'] = dataset.groupby(['genre']).tempo.min()
min_features['time_signature'] = dataset.groupby(['genre']).time_signature.min()
min_features['valence'] = dataset.groupby(['genre']).valence.min()
min_features['mode'] = dataset.groupby(['genre']).mode.min()


In [15]:
max_features = {}

max_features['song_length'] =  dataset.groupby(['genre']).song_length.max()
max_features['popularity'] = dataset.groupby(['genre']).popularity.max()
max_features['acousticness'] = dataset.groupby(['genre']).acousticness.max()
max_features['danceability'] = dataset.groupby(['genre']).danceability.max()
max_features['energy'] = dataset.groupby(['genre']).energy.max()
max_features['instrumentalness'] = dataset.groupby(['genre']).instrumentalness.max()
max_features['key'] = dataset.groupby(['genre']).key.max()
max_features['liveness'] = dataset.groupby(['genre']).liveness.max()
max_features['loudness'] = dataset.groupby(['genre']).loudness.max()
max_features['speechiness'] = dataset.groupby(['genre']).speechiness.max()
max_features['tempo'] = dataset.groupby(['genre']).tempo.max()
max_features['time_signature'] = dataset.groupby(['genre']).time_signature.max()
max_features['valence'] = dataset.groupby(['genre']).valence.max()
max_features['mode'] = dataset.groupby(['genre']).mode.max()

In [16]:
import pickle

# export to pickle file

f = open("datasets/avg_features.pkl","wb")
pickle.dump(avg_features,f)
f.close()


f = open("datasets/min_features.pkl","wb")
pickle.dump(min_features,f)
f.close()


f = open("datasets/max_features.pkl","wb")
pickle.dump(max_features,f)
f.close()

### Random Forest Classification
Use RFC to classify the songs

In [17]:
label = dataset['genre']

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.multiclass import unique_labels

X = songs_features
y = label

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

rfc = RandomForestClassifier(n_estimators=100,criterion='gini')
rfc.fit(X_train,y_train)

# Predicting the Test set results
y_pred = rfc.predict(X_test)

In [19]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[268  13   1  15  14  13  15]
 [ 12 298   3   0  32  14  15]
 [  2  14 192   0  36  32   5]
 [  3   0   1 282   4  19  10]
 [ 23  28  36   6 173  50  21]
 [ 14  18  49  23  41 269  23]
 [ 35  28   4  12  21  14 284]]
              precision    recall  f1-score   support

     country       0.75      0.79      0.77       339
         edm       0.75      0.80      0.77       374
      hiphop       0.67      0.68      0.68       281
        jazz       0.83      0.88      0.86       319
         pop       0.54      0.51      0.53       337
         r&b       0.65      0.62      0.63       437
        rock       0.76      0.71      0.74       398

    accuracy                           0.71      2485
   macro avg       0.71      0.71      0.71      2485
weighted avg       0.71      0.71      0.71      2485

0.7106639839034206


### Get recommendation based on selected genre's features


In [20]:
min_features = open('datasets/min_features.pkl', 'rb') 
min_features = pickle.load(min_features) 

In [21]:
selected_genre = 'pop' # user selects the genre 'pop'
list_genre = []
list_genre.append(selected_genre)
# the min features of pop
genre_min = pd.DataFrame(min_features)
genre_min =genre_min[genre_min.index == selected_genre]
genre_min

Unnamed: 0_level_0,song_length,popularity,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,speechiness,tempo,time_signature,valence,mode
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
pop,121250,1,6.5e-05,0.209,0.0848,0.0,0,0.0205,-18.675,0.0232,51.414,1,0.0346,0


In [22]:
# the max features of pop
genre_max = pd.DataFrame(max_features)
genre_max =genre_max[genre_max.index == selected_genre] 
genre_max

Unnamed: 0_level_0,song_length,popularity,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,speechiness,tempo,time_signature,valence,mode
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
pop,484146,98,0.983,0.967,0.97,0.889,11,0.856,-1.884,0.398,202.049,5,0.976,1


In [23]:
# get songs from spotify based on the selected genre's features
recommendations = sp.recommendations(seed_genres=list_genre,limit=100,
                                     min_acousticness=genre_min.iloc[0][2], max_acousticness =genre_max.iloc[0][2],
                                     min_danceability=genre_min.iloc[0][3], max_danceability =genre_max.iloc[0][3],
                                     min_energy=genre_min.iloc[0][4], max_energy =genre_max.iloc[0][4],
                                     min_instrumentalness=genre_min.iloc[0][5], max_instrumentalness =genre_max.iloc[0][5],
                                     min_liveness=genre_min.iloc[0][7], max_liveness=genre_max.iloc[0][7],
                                     min_loudness=genre_min.iloc[0][8], max_loudness=genre_max.iloc[0][8],
                                     min_speechiness=genre_min.iloc[0][9], max_speechiness=genre_max.iloc[0][9],
                                     min_tempo=genre_min.iloc[0][10], max_tempo=genre_max.iloc[0][10],
                                     min_valence=genre_min.iloc[0][12], max_valence=genre_max.iloc[0][12])

In [30]:
# use RFC to predict the songs' genre again

rec = []
for i in recommendations['tracks']:
    features = getTrackFeatures(i['id'])
    rec.append(features)
    
dataframe  = pd.DataFrame(rec, columns = ['track_id','name', 'album', 'artist','artist_id','release_date', 'song_length', 
                                        'popularity', 'acousticness', 'danceability','energy', 'instrumentalness', 'key', 
                                          'liveness', 'loudness', 'speechiness', 'tempo', 'time_signature','valence', 'mode'])
cols = dataframe.drop(['track_id','name','artist','artist_id','release_date','album'],axis=1)
genre_feature = min_max_scaler.fit_transform(cols)
dtree_predictions = rfc.predict(genre_feature) 
dataframe['genre'] = dtree_predictions
dataframe[dataframe['genre']=='pop'] # filter out the 'pop' songs

Unnamed: 0,track_id,name,album,artist,artist_id,release_date,song_length,popularity,acousticness,danceability,...,instrumentalness,key,liveness,loudness,speechiness,tempo,time_signature,valence,mode,genre
11,1CnPYaKxTVb4LWOtiGOm0m,All Time Low,The Human Condition,Jon Bellion,50JJSqHUf2RQ9xsHs0KMHg,2016-06-10,217603,72,0.0584,0.617,...,0.0,0,0.0933,-4.188,0.0828,90.246,4,0.505,1,pop
22,6YZdkObH88npeKrrkb8Ggf,DUELE EL CORAZON (feat. Wisin),DUELE EL CORAZON (feat. Wisin),Enrique Iglesias,7qG3b048QCHVRO5Pv1T5lw,2016-04-18,200813,72,0.0786,0.724,...,0.0,8,0.226,-3.354,0.0966,90.999,4,0.846,0,pop
35,4lnAN2S1fcI0SjxEbksZVr,Fetish (feat. Gucci Mane),Fetish (feat. Gucci Mane),Selena Gomez,0C8ZW7ezQVs4URX5aX7Kqx,2017-07-13,186112,71,0.0204,0.708,...,7e-06,2,0.062,-4.424,0.0592,123.013,4,0.265,1,pop
36,0AS63m1wHv9n4VVRizK6Hc,Mercy,Illuminate (Deluxe),Shawn Mendes,7n2wHs1TKAczGzO7Dd2rGr,2017-04-20,208733,80,0.125,0.555,...,0.0,11,0.111,-4.952,0.0827,148.128,4,0.356,0,pop
39,6t2ubAB4iSYOuIpRAOGd4t,Cake - Challenge Version,This Is A Challenge,Various Artists,0LyfQWJT6nXafLPZqxe9Of,2016-12-16,157164,60,0.0729,0.786,...,2e-06,2,0.179,-4.247,0.0523,105.078,4,0.687,1,pop
43,7COfe3P7KgfwDwIRB8LIDw,Mi Gente,Vibras,J Balvin,1vyhD5VmyZ7KMfW5gqLgo5,2018-05-25,185040,74,0.0168,0.548,...,2.3e-05,11,0.143,-4.838,0.0777,104.666,4,0.288,0,pop
51,4QtiVmuA88tPQiCOHZuQ5b,"1, 2, 3 (feat. Jason Derulo & De La Ghetto)","1, 2, 3 (feat. Jason Derulo & De La Ghetto)",Sofia Reyes,0haZhu4fFKt0Ag94kZDiz2,2018-02-16,201526,72,0.165,0.792,...,0.0,1,0.0501,-3.112,0.0589,94.968,4,0.794,0,pop
53,5jE48hhRu8E6zBDPRSkEq7,All About That Bass,Title (Deluxe),Meghan Trainor,6JL8zeS1NmiOftqZTRgdTz,2015-01-09,187920,73,0.0573,0.807,...,3e-06,9,0.124,-3.726,0.0503,134.052,4,0.961,1,pop
57,7ef4DlsgrMEH11cDZd32M6,One Kiss (with Dua Lipa),One Kiss (with Dua Lipa),Calvin Harris,7CajNmpbOovFoOoasH2HaY,2018-04-06,214846,83,0.037,0.791,...,2.2e-05,9,0.0814,-3.24,0.11,123.994,4,0.592,0,pop
77,2H881m3JRA8lpuuwaQL6zy,No Fear,No Fear,DeJ Loaf,7kFfY4UjNdNyaeUgLIEbIF,2017-06-16,169853,64,0.387,0.614,...,0.0,11,0.134,-4.117,0.102,100.202,4,0.566,0,pop
