# Importing the libraries

In [21]:
import os
import pandas as pd
import numpy as np
import json
import spotipy
import spotipy.oauth2 as oauth2
from spotipy.oauth2 import SpotifyOAuth,SpotifyClientCredentials
import yaml
import re
from tqdm import tqdm
import multiprocessing as mp
import time
import random
import datetime

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [22]:
stream= open("spotify/spotify2.yaml")
spotify_details = yaml.safe_load(stream)
auth_manager = SpotifyClientCredentials(client_id=spotify_details['Client_id'],
                                        client_secret=spotify_details['client_secret'])
sp = spotipy.client.Spotify(auth_manager=auth_manager)

# Importing the dataset

In [41]:
df=pd.read_csv('1m_dataset.csv')
artist_features=pd.read_csv('features/artist_features.csv')
audio_features=pd.read_csv('features/audio_features.csv')
track_features=pd.read_csv('features/track_features.csv')

In [24]:
track_features

Unnamed: 0,Track_uri,Track_release_date,Track_pop
0,0UaMYEvWZi0ZqiDOoHU3YI,2005-07-04,70
1,6I9VzXrHxO9rA9A5euc8Ak,2003-11-13,85
2,0WqIKmW4BTrj3eJFmnCKMv,2003-06-23,21
3,1AWQoqb9bSvzTjaLralEkT,2002-11-04,83
4,1lzr43nnXAijIGYnCT8M8H,2000,0
...,...,...,...
249037,2RXDREasw1esq7PDG6qFeI,2015-06-23,4
249038,5mG83Cjbtxczir6LcBUtKe,2012-09-07,0
249039,5743GQoRbTttYCiT1SWIfk,2012-12-30,0
249040,7LtduSzcNn40lGi3KdQDqT,2012-10-05,0


In [25]:
audio_features

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.904,0.813,4,-7.105,0,0.1210,0.03110,0.006970,0.0471,0.810,125.461,audio_features,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,226864,4
1,0.774,0.838,5,-3.914,0,0.1140,0.02490,0.025000,0.2420,0.924,143.040,audio_features,6I9VzXrHxO9rA9A5euc8Ak,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,https://api.spotify.com/v1/tracks/6I9VzXrHxO9r...,https://api.spotify.com/v1/audio-analysis/6I9V...,198800,4
2,0.664,0.758,2,-6.583,0,0.2100,0.00238,0.000000,0.0598,0.701,99.259,audio_features,0WqIKmW4BTrj3eJFmnCKMv,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,https://api.spotify.com/v1/tracks/0WqIKmW4BTrj...,https://api.spotify.com/v1/audio-analysis/0WqI...,235933,4
3,0.892,0.714,4,-6.055,0,0.1410,0.20100,0.000234,0.0521,0.817,100.972,audio_features,1AWQoqb9bSvzTjaLralEkT,spotify:track:1AWQoqb9bSvzTjaLralEkT,https://api.spotify.com/v1/tracks/1AWQoqb9bSvz...,https://api.spotify.com/v1/audio-analysis/1AWQ...,267267,4
4,0.853,0.606,0,-4.596,1,0.0713,0.05610,0.000000,0.3130,0.654,94.759,audio_features,1lzr43nnXAijIGYnCT8M8H,spotify:track:1lzr43nnXAijIGYnCT8M8H,https://api.spotify.com/v1/tracks/1lzr43nnXAij...,https://api.spotify.com/v1/audio-analysis/1lzr...,227600,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200095,0.558,0.262,4,-6.354,1,0.0281,0.80700,0.000000,0.1440,0.116,86.509,audio_features,1umsIAHkOWzOVjnBtsc8mF,spotify:track:1umsIAHkOWzOVjnBtsc8mF,https://api.spotify.com/v1/tracks/1umsIAHkOWzO...,https://api.spotify.com/v1/audio-analysis/1ums...,296626,3
200096,0.349,0.248,7,-7.300,0,0.0292,0.74400,0.000002,0.1010,0.164,105.400,audio_features,6nFHDmonvKsQDXuQHxGAJh,spotify:track:6nFHDmonvKsQDXuQHxGAJh,https://api.spotify.com/v1/tracks/6nFHDmonvKsQ...,https://api.spotify.com/v1/audio-analysis/6nFH...,323186,4
200097,0.623,0.208,9,-14.083,1,0.0408,0.90400,0.918000,0.1200,0.235,84.584,audio_features,0ytcZAli1HSG5qHNp76y6Y,spotify:track:0ytcZAli1HSG5qHNp76y6Y,https://api.spotify.com/v1/tracks/0ytcZAli1HSG...,https://api.spotify.com/v1/audio-analysis/0ytc...,230800,3
200098,0.467,0.159,9,-14.023,1,0.0334,0.96600,0.000250,0.1020,0.344,105.400,audio_features,2oFWHCb5xwvAAbUiMgZ1kU,spotify:track:2oFWHCb5xwvAAbUiMgZ1kU,https://api.spotify.com/v1/tracks/2oFWHCb5xwvA...,https://api.spotify.com/v1/audio-analysis/2oFW...,216931,4


In [26]:
artist_features

Unnamed: 0,Artist_uri,Artist_pop,Artist_genres
0,2wIVse2owClT7go1WT98tk,69,dance_pop hip_hop hip_pop neo_soul pop_rap r&b...
1,26dSoYclwsYLMAKD3tpOr4,78,dance_pop pop
2,6vWDO969PvNqNYHIOW5v0m,83,pop r&b
3,31TPClRtHm23RisEBtV3X7,78,dance_pop pop
4,5EvFsr3kj42KNv97ZEnqij,70,pop_rap reggae_fusion
...,...,...,...
239674,0pZMJM2veQsxONhGQrAq8q,0,unknown
239675,1XXRl7dZ57LTpVpKQkYd5E,0,unknown
239676,3ztkEwzA7r2OE3MXYbkU2b,18,unknown
239677,3aBQxSSBL1i8549nb6a5uL,0,unknown


# Merging all dataframes

In [43]:
df = pd.merge(df,audio_features, left_on = "track_uri", right_on= "id",how = 'outer')

In [44]:
df = pd.merge(df,track_features, left_on = "track_uri", right_on= "Track_uri",how = 'outer')

In [45]:
df = pd.merge(df,artist_features, left_on = "artist_uri", right_on= "Artist_uri",how = 'outer')

In [46]:
df.shape #1560393

(1760493, 28)

# Handling missing data 

In [14]:
df.isna().sum()

Unnamed: 0                        0
track_uri                         0
artist_uri                        0
album_uri                         0
danceability                1560393
energy                      1560393
key                         1560393
loudness                    1560393
mode                        1560393
speechiness                 1560393
acousticness                1560393
instrumentalness            1560393
liveness                    1560393
valence                     1560393
tempo                       1560393
type                        1560393
id                          1560393
uri                         1560393
track_href                  1560393
analysis_url                1560393
duration_ms                 1560393
time_signature              1560393
Track_uri                   1511451
Track_release_date          1511451
Track_pop                   1511451
Artist_uri                      250
Artist_pop                      250
Artist_genres               

## Handling audio_features missing From extraction

In [16]:
missing_t_uri=df.track_uri[df.id.isna()]
missing_t_uri=missing_t_uri.unique()
random.shuffle(missing_t_uri)

In [17]:
f = open('features/audio_features.csv','a')
for i in tqdm(range(0,len(missing_t_uri),1)):
    try:
     track_feature = sp.audio_features(missing_t_uri[i:i+1])
     track_df = pd.DataFrame(track_feature)
     csv_data = track_df.to_csv(header=False,index=False)
     f.write(csv_data)
    except Exception as e:
        r = open("extract_log0.txt", "a")
        r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")+": "+str(e)+'\n')
        r.close()
        time.sleep(1)
        continue
f.close()

  0%|                                                                                      | 0/1560393 [00:00<?, ?it/s]Max Retries reached
  0%|                                                                         | 1/1560393 [00:06<2761:10:31,  6.37s/it]

KeyboardInterrupt



## Handling track_features missing From extraction

In [42]:
missing_t_uri=df.track_uri[df.Track_uri.isna()]
missing_t_uri=missing_t_uri.unique()
random.shuffle(missing_t_uri)

AttributeError: 'DataFrame' object has no attribute 'Track_uri'

In [None]:
f = open('data/track_features.csv','a')
for i in tqdm(range(0,len(missing_t_uri),1)):
    try:
        track_features = sp.tracks(missing_t_uri[i:i+1])
        for x in range(1):
            track_pop=pd.DataFrame([missing_t_uri[i+x]])
            track_pop['release_date']=track_features['tracks'][x]['album']['release_date']
            track_pop['pop'] = track_features['tracks'][x]["popularity"]
            csv_data = track_pop.to_csv(header=False,index=False)
            f.write(csv_data)
    except Exception as e:
        r = open("extract_log.txt", "a")
        r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")+": "+str(e)+'\n')
        r.close()
        time.sleep(1)
        continue
f.close()

# Droping Unwanted Columns Save Space

There were still 101 from audio_features and 576 from track_features extraction that were missing from the soptify api, so I had to drop them.

In [47]:
df.dropna(axis=0,inplace=True)

In [48]:
df.isna().sum().sum()

0

In [49]:
df.columns

Index(['Unnamed: 0', 'track_uri', 'artist_uri', 'album_uri', 'danceability',
       'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'type', 'id', 'uri',
       'track_href', 'analysis_url', 'duration_ms', 'time_signature',
       'Track_uri', 'Track_release_date', 'Track_pop', 'Artist_uri',
       'Artist_pop', 'Artist_genres           '],
      dtype='object')

In [50]:
df.drop(columns=['Track_uri','Unnamed: 0','Artist_uri','type','id','uri','track_href','analysis_url'],axis=1,inplace=True)

In [51]:
df.head(1)

Unnamed: 0,track_uri,artist_uri,album_uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,Track_release_date,Track_pop,Artist_pop,Artist_genres
0,0UaMYEvWZi0ZqiDOoHU3YI,2wIVse2owClT7go1WT98tk,6vV5UrXcfyQD1wu4Qo2I9K,0.904,0.813,4.0,-7.105,0.0,0.121,0.0311,0.00697,0.0471,0.81,125.461,226864.0,4.0,2005-07-04,70.0,69.0,dance_pop hip_hop hip_pop neo_soul pop_rap r&b...


## Data Preprocessing

Create five point buckets for track and artist popularity .

and 50 point buckets for the track release date.

In [52]:
df['Track_pop'] = df['Track_pop'].apply(lambda x: int(x/5))
df['Artist_pop'] = df['Artist_pop'].apply(lambda x: int(x/5))

In [53]:
df['Track_release_date'] = df['Track_release_date'].apply(lambda x: x.split('-')[0])
df['Track_release_date']=df['Track_release_date'].astype('int16')
df['Track_release_date'] = df['Track_release_date'].apply(lambda x: int(x/50))

In [54]:
df.head(1)

Unnamed: 0,track_uri,artist_uri,album_uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,Track_release_date,Track_pop,Artist_pop,Artist_genres
0,0UaMYEvWZi0ZqiDOoHU3YI,2wIVse2owClT7go1WT98tk,6vV5UrXcfyQD1wu4Qo2I9K,0.904,0.813,4.0,-7.105,0.0,0.121,0.0311,0.00697,0.0471,0.81,125.461,226864.0,4.0,40,14,13,dance_pop hip_hop hip_pop neo_soul pop_rap r&b...


In [56]:
df.shape

(199492, 20)

In [55]:
df.to_csv('features/1M_unique_processed_data.csv',index=False)