# Data Cleaning

In [2]:
import os
import time
import warnings

import numpy as np
import pandas as pd
from tqdm import tqdm

warnings.filterwarnings('ignore')

In [2]:
# Load the library exported from Apple Music, discard the columns that are not
# used in this notebook, and drop the row labelled 0
# (NOTE(review): the reason row 0 is removed is not documented — confirm).
data = (
    pd.read_csv('Music.csv')
    .drop(columns=[
        'Grouping', 'Work', 'Movement Number', 'Movement Count', 'Movement Name',
        'Volume Adjustment', 'Equaliser', 'Comments', 'My Rating',
    ])
    .drop(index=0)
)

In [3]:
# Keep only the tracks whose 'Plays' value is present (non-null), restricted to
# the columns used in the rest of the notebook. The explicit copy detaches the
# result from `data`.
final_data = data.loc[
    data['Plays'].notna(),
    ['Name', 'Artist', 'Album', 'Year', 'Plays', 'Genre'],
].copy()

In [4]:
# Binary target: 1 for tracks played at least 15 times, 0 otherwise.
# Written as one vectorised column assignment; chained indexing of the form
# df['Label'][rows] = 1 may write to a temporary copy and leave the frame unchanged.
final_data['Label'] = (final_data['Plays'] >= 15).astype('int64')

In [5]:
# Fill the two columns that are allowed to be missing, then drop every row that
# still contains a null in any remaining column.
# fillna is applied to the frame itself (per-column values via a dict): calling
# .fillna(..., inplace=True) on a df.loc[:, col] selection acts on a temporary
# Series and is not guaranteed to update the frame.
final_data = (
    final_data
    .fillna({'Year': 0, 'Album': 'None'})
    .dropna(axis=0)
)

In [6]:
# Preview the first rows of the cleaned frame
final_data.head()

Unnamed: 0,Name,Artist,Album,Year,Plays,Genre,Label
2,Kasoor,Prateek Kuhad,Kasoor - Single,2020.0,1.0,Singer/Songwriter,0
4,Tujhse Naraz Nahi Zindagi,SANAM,Rewind With Sanam,2014.0,5.0,Indian Pop,0
5,Lag Ja Gale,SANAM,Rewind With Sanam,2014.0,3.0,Indian Pop,0
6,Abhi Mujh Mein Kahin,Ajay-Atul & Sonu Nigam,Agneepath (Original Motion Picture Soundtrack),2011.0,4.0,Bollywood,0
7,Tauba Tumhare Yeh Ishare,Abhijeet Bhattacharya & Alka Yagnik,Chalte Chalte (Original Motion Picture Soundtr...,2003.0,19.0,Bollywood,1


# API Request

In [76]:
import requests
import json
from requests.auth import HTTPBasicAuth

In [77]:
# Spotify API credentials are read from the environment so that no secret is
# stored in the notebook. Set SPOTIFY_CLIENT_ID and SPOTIFY_CLIENT_SECRET before
# starting the kernel; a missing variable raises KeyError here.
# NOTE(review): any credentials that were ever hard-coded in this notebook should
# be treated as leaked and rotated.
client_id = os.environ['SPOTIFY_CLIENT_ID']
client_secret = os.environ['SPOTIFY_CLIENT_SECRET']

In [78]:
# Token endpoint used to exchange the client id/secret for a bearer token
auth_url = 'https://accounts.spotify.com/api/token'

# POST the credentials as form data. The timeout keeps a stalled connection
# from blocking the kernel indefinitely.
auth_response = requests.post(
    auth_url,
    data={
        'grant_type': 'client_credentials',
        'client_id': client_id,
        'client_secret': client_secret,
    },
    timeout=30,
)

# Surface a rejected request as an HTTPError here instead of a KeyError on
# 'access_token' below.
auth_response.raise_for_status()

# convert the response to JSON
auth_response_data = auth_response.json()

# save the access token
access_token = auth_response_data['access_token']

In [79]:
# Authorization header sent with every Spotify API request below
headers = {'Authorization': f'Bearer {access_token}'}

In [80]:
# Root of every Spotify Web API endpoint used in this notebook
BASE_URL = 'https://api.spotify.com/v1/'

# ID of the track to look up (taken from its Spotify URI)
track_id = '6y0igZArWVi6Iz0rj35c1Y'

# Request the audio features of that track; the bearer token travels in `headers`
r = requests.get(f'{BASE_URL}audio-features/{track_id}', headers=headers)

In [12]:
# Decode the JSON body of the last response.
# NOTE: this rebinds `r` from the Response object to the decoded payload, so
# re-running this cell without re-running the request first fails (the decoded
# payload has no .json() method).
r = r.json()

In [13]:
artist_id = '36QJpDe2go2KgaRleHCDTp'  # Led Zeppelin

# Request the artist's albums: album releases only, at most 50 per call
r = requests.get(
    f'{BASE_URL}artists/{artist_id}/albums',
    params={'include_groups': 'album', 'limit': 50},
    headers=headers,
)
d = r.json()

### Requesting Track IDs based on Song and Artist name

In [14]:
song_name = 'Tauba Tumhare Yeh Ishare'
artist_name = 'Abhijeet Bhattacharya & Alka Yagnik'

# Search for the track by title and artist.
# The query is passed through `params` so that requests percent-encodes it:
# pasting the raw names into the URL breaks on characters such as '&' (present
# in this artist name), which would end the `q` parameter early.
r = requests.get(
    BASE_URL + 'search',
    headers=headers,
    params={
        'q': 'track:' + song_name + ' artist:' + artist_name,
        'type': 'track',
    },
)

In [18]:
# Audio features for a single, hand-picked track ID
track_id = '5QtQFSdyZEl0w4iDxAyv76'
r = requests.get(f'{BASE_URL}audio-features/{track_id}', headers=headers)

# Apple Playlist Data

In [19]:
# One plain (Name, Artist, Genre, Year, Plays) tuple per row of the cleaned library
tuples = list(
    final_data[['Name', 'Artist', 'Genre', 'Year', 'Plays']]
    .itertuples(index=False, name=None)
)

In [20]:
# Audio-feature fields copied verbatim from the audio-features response
audio_feature_keys = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
                      'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
                      'duration_ms', 'time_signature']

# Column order of the resulting frame: library metadata first, then audio features
new_df_columns = ['artist_name', 'track_name', 'track_id', 'genre', 'year', 'plays'] + audio_feature_keys

t0 = time.time()

# Rows are collected as dicts and turned into a DataFrame once after the loop,
# rather than writing 19 individual cells per track with .loc.
track_records = []
for track_name, artist_name, genre, year, plays in tqdm(tuples):
    # Look the track up by title and artist. `params` lets requests percent-encode
    # the query, so names containing '&', '#', '?' etc. are sent intact.
    r1 = requests.get(
        BASE_URL + 'search',
        headers=headers,
        params={'q': 'track:' + track_name + ' artist:' + artist_name, 'type': 'track'},
    )
    # Raise an HTTPError on a failed request (e.g. a rejected token) instead of
    # hitting a KeyError on 'tracks' below.
    r1.raise_for_status()
    items = r1.json()['tracks']['items']
    if not items:
        # No match on Spotify: skip this track
        continue
    # The first search hit is taken as the match
    track_id = items[0]['id']

    r2 = requests.get(BASE_URL + 'audio-features/' + track_id, headers=headers)
    r2.raise_for_status()
    features = r2.json()

    record = {
        'artist_name': artist_name,
        'track_name': track_name,
        'track_id': track_id,
        'genre': genre,
        'year': year,
        'plays': plays,
    }
    record.update((key, features[key]) for key in audio_feature_keys)
    track_records.append(record)

# `columns=` fixes the column order and keeps all columns even if no track matched
new_df = pd.DataFrame(track_records, columns=new_df_columns)

t1 = time.time()
print('Time taken:', t1-t0)

100%|██████████| 1346/1346 [15:45<00:00,  1.42it/s] 

Time taken: 945.3763649463654





In [25]:
# Save the collected features to disk; index=False leaves the row index out of the file
new_df.to_csv('new_df.csv', index = False)

In [22]:
# Show the first ten collected rows
new_df.head(10)

Unnamed: 0,artist_name,track_name,track_id,genre,year,plays,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Prateek Kuhad,Kasoor,08kTa3SL9sV6Iy8KLKtGql,Singer/Songwriter,2020,1,0.558,0.497,8,-7.175,1,0.0256,0.0285,5.36e-06,0.0799,0.444,107.047,197549,4
1,SANAM,Tujhse Naraz Nahi Zindagi,0AfVDxWlOZxto1gccmPB2f,Indian Pop,2014,5,0.69,0.266,8,-9.693,1,0.0263,0.928,0.00957,0.112,0.455,102.025,237696,4
2,SANAM,Lag Ja Gale,4QEytRwdtpdbexU2ZFVZbj,Indian Pop,2014,3,0.393,0.32,8,-8.648,1,0.0301,0.857,1.09e-05,0.0917,0.456,135.029,240953,3
3,Ajay-Atul & Sonu Nigam,Abhi Mujh Mein Kahin,73y649QhnXdcm6fRdvfraO,Bollywood,2011,4,0.334,0.387,2,-9.483,1,0.0332,0.755,3.94e-05,0.113,0.42,129.959,364787,1
4,Salim Merchant,Aye Khuda,1vgVsR5rkXGP4NmUcoroJH,Bollywood,2010,3,0.656,0.762,9,-6.328,1,0.0315,0.549,0.0,0.139,0.558,110.002,280497,4
5,SANAM,Pehla Nasha,70Nr8FkDKKZaRU8ztTG9Qi,Indian Pop,2016,10,0.547,0.639,6,-8.36,1,0.0382,0.495,0.000102,0.119,0.583,153.971,224210,4
6,SANAM & Sanah Moidutty,Aajkal Tere Mere Pyar Ke Charche,0yMBROL5b4zFngkrLTBnW7,Indian Pop,2019,12,0.65,0.588,9,-7.221,1,0.0548,0.295,5.33e-06,0.0996,0.639,104.857,186277,4
7,Xavier Rudd,Walk Away,53wIXzJGXvoAvsBLfs0eQT,Alternative,2018,2,0.528,0.488,11,-9.239,1,0.0455,0.541,8.21e-05,0.103,0.348,120.76,235106,4
8,SANAM,Tujhse Naraz Nahi Zindagi,0AfVDxWlOZxto1gccmPB2f,Indian Pop,2014,9,0.69,0.266,8,-9.693,1,0.0263,0.928,0.00957,0.112,0.455,102.025,237696,4
9,Roxette,Listen to Your Heart,1qIKynV6YQZgocodkPdCy7,Pop/Rock,1988,4,0.539,0.583,1,-4.935,0,0.0275,0.108,0.0,0.111,0.337,86.063,328093,4


# Spotify playlist Data

In [3]:
# Load the exported Spotify playlist. The file is RTF, not CSV; read_csv is used
# only to get its text into a frame (presumably one line per row — TODO confirm
# against the file).
playlist = pd.read_csv('Ashreet Spotify Playlist.rtf',)
# Discard the rows labelled 0-4 (presumably RTF header lines — TODO confirm).
playlist.drop([0,1,2,3,4], inplace = True)
# Overwrite the first remaining row with a hand-typed track URI. The trailing '2'
# is padding: the slice below removes the last character of every row.
playlist.iloc[0,:] = "spotify:track:204swkv9veaNPFDuX3Qkyn2"

# The frame has a single column; name it.
playlist.columns = ['track_id']
# 'spotify:track:' is 14 characters long, so [14:-1] keeps the ID and drops the
# final character (presumably an RTF line-break backslash — TODO confirm).
playlist.iloc[:,0] = playlist.iloc[:,0].apply(lambda x: x[14:-1])

In [5]:
# Preview the extracted track IDs
playlist.head()

Unnamed: 0,track_id
5,204swkv9veaNPFDuX3Qkyn
6,50Emj7dvfU5NTCQsS7ECJ7
7,27VqcCXF74iKuPAE7aeiDP
8,3xglYoGeL5l9oSt2HWEzrX
9,3qbToOWJKKc0HtruELjlE1


In [111]:
# Audio-feature fields copied verbatim from the audio-features response
audio_feature_keys = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
                      'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
                      'duration_ms', 'time_signature']

# Same column layout as the Apple-library frame. 'genre' and 'plays' are not
# available for these tracks and stay empty.
spotify_df_columns = ['artist_name', 'track_name', 'track_id', 'genre', 'year', 'plays'] + audio_feature_keys

# Rows are collected as dicts and turned into a DataFrame once after the loop,
# rather than writing each cell individually with .loc.
playlist_records = []
for track_id in tqdm(playlist.track_id.values):
    features_response = requests.get(BASE_URL + 'audio-features/' + track_id, headers=headers)
    track_response = requests.get(BASE_URL + 'tracks/' + track_id, headers=headers)

    # Raise an HTTPError on a failed request instead of a KeyError further down
    features_response.raise_for_status()
    track_response.raise_for_status()

    audio_features = features_response.json()
    song_info = track_response.json()

    record = {
        # Only the first listed artist is kept
        'artist_name': song_info['artists'][0]['name'],
        'track_name': song_info['name'],
        'track_id': track_id,
        # NOTE(review): this stores the album's release_date string as returned
        # by the API, not a bare year — confirm how downstream code reads 'year'.
        'year': song_info['album']['release_date'],
    }
    record.update((key, audio_features[key]) for key in audio_feature_keys)
    playlist_records.append(record)

# `columns=` fixes the column order and creates the unfilled 'genre'/'plays' columns
spotify_df = pd.DataFrame(playlist_records, columns=spotify_df_columns)

100%|██████████| 499/499 [04:42<00:00,  1.76it/s]


In [113]:
# Remove the two columns that are never filled for the Spotify playlist.
# `columns=` replaces the positional axis argument, which newer pandas releases
# reject; errors='ignore' lets the cell be re-run after the columns are gone.
spotify_df = spotify_df.drop(columns=['genre', 'plays'], errors='ignore')

In [115]:
# Save the playlist features to disk; index=False leaves the row index out of the file
spotify_df.to_csv('spotify_df.csv', index = False)