---
## 0. Imports

In [1]:
from bs4 import BeautifulSoup
import http.client
import urllib
import os

import pandas as pd
import numpy as np
import csv

import dateutil.parser
import datetime
import sys

import spotipy
import spotipy.util

In [2]:
# Path of the chart-crawl result CSV, assembled from its directory components.
filepath = '/'.join([
    './prev-chart-crawls',
    'pull_D-2019-11-19_T-19-56-59-692877',
    'Result_D-2019-11-19_T-19-56-59-693112_FrmD-2017-01-01_ToD-2019-11-18.csv',
])

# Load the crawl (first CSV column is the index) and parse 'Date' as year-first datetimes.
chartCrawl_df = pd.read_csv(filepath, index_col=0).assign(
    Date=lambda crawl: pd.to_datetime(crawl['Date'], yearfirst=True)
)

---
## 1. Spotify API

### 1.1. Connection

In [3]:
# Credentials are read from the environment so that no secret is stored in the notebook.
# SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET are the variable names spotipy documents for
# client-credential auth; export them before starting the kernel.
# NOTE(review): the id/secret pair that used to be hardcoded here should be rotated,
# since it has been committed in plain text.
client_id = os.environ['SPOTIPY_CLIENT_ID']
client_secret = os.environ['SPOTIPY_CLIENT_SECRET']
cred_manager = spotipy.util.oauth2.SpotifyClientCredentials(client_id, client_secret)

In [4]:
# Spotify client used by the API calls in the cells below; authenticates through cred_manager.
sp = spotipy.Spotify(client_credentials_manager=cred_manager)

### 1.2. Contents

In [5]:
def gen_type_str(obj, r_pat=' '):
    """Return a short, parenthesised type label for ``obj``.

    Strings that contain ``'http'`` are labelled ``'(url str)'``. Every other value
    is labelled with the tail of ``str(type(obj))``: the text after the last
    occurrence of ``r_pat``, minus the closing ``>`` and any single quotes
    (e.g. ``"<class 'dict'>"`` becomes ``'(dict)'``).
    """
    if isinstance(obj, str) and 'http' in obj:
        return '(url str)'

    type_repr_tail = str(type(obj)).rpartition(r_pat)[2]
    # Drop the trailing '>' of the repr, then the quotes around the type name.
    label = type_repr_tail[:-1].replace('\'', '').strip()
    return '(' + label + ')'

def print_dict_keys(print_arg, lvl=0, spc_mult=5, r_mrgn=50):
    """Print the keys of a (possibly nested) dict as an indented outline.

    Each line shows a key followed by a type label from ``gen_type_str``; nested
    dicts are printed recursively one level deeper.

    Parameters
    ----------
    print_arg : dict or list
        The mapping to outline. At the top level (``lvl == 0``) a list is accepted
        and only its first element is outlined; an empty list prints nothing.
    lvl : int
        Current nesting depth (0 for the outermost call).
    spc_mult : int
        Number of spaces of indentation added per nesting level.
    r_mrgn : int
        Column budget used to pad between the key and its type label.

    Returns
    -------
    None
    """
    if isinstance(print_arg, list) and lvl == 0:
        if not print_arg:
            # Nothing to outline; previously this raised IndexError.
            return
        print_arg = print_arg[0]

    indt_lvl = spc_mult * lvl
    key_spacing = ' ' * indt_lvl
    for key, item in print_arg.items():
        # str() keeps the padding computation valid for non-string keys as well.
        type_spacing = ' ' * (r_mrgn - len(str(key)) - indt_lvl)
        print(key_spacing, key, type_spacing, gen_type_str(item))
        if isinstance(item, dict):
            # Forward the layout settings so nested levels honour the caller's
            # spc_mult / r_mrgn instead of silently falling back to the defaults.
            print_dict_keys(item, lvl + 1, spc_mult, r_mrgn)

In [75]:
# Look up the album id of one sample track, then display that album's 'genres' field.
# The recorded output for this cell is an empty list.
sf_track_id = '3JWiDGQX2eTlFvKj3Yssj3'
sf_album_id = sp.track(sf_track_id)['album']['id']
sp.album(sf_album_id)['genres']

[]

In [7]:
# Outline the response structure of several endpoints for one sample track.
sf_track_id = '3JWiDGQX2eTlFvKj3Yssj3'
sf_album_id = sp.track(sf_track_id)['album']['id']
fmt_str = '{:_^70}'

# Section title -> [client method, id to call it with]
sp_api_dict = dict([
    ('TRACK DETAILS', [sp.track, sf_track_id]),
    ('AUDIO FEATURES', [sp.audio_features, sf_track_id]),
    ('AUDIO ANALYSIS', [sp.audio_analysis, sf_track_id]),
    ('ALBUM DETAILS', [sp.album, sf_album_id]),
])

for sp_name in sp_api_dict:
    sp_call, sp_arg = sp_api_dict[sp_name]
    print('\n', fmt_str.format(sp_name), '\n')
    print_dict_keys(sp_call(sp_arg))


 ____________________________TRACK DETAILS_____________________________ 

 album                                               (dict)
      album_type                                     (str)
      artists                                        (list)
      available_markets                              (list)
      external_urls                                  (dict)
           spotify                                   (url str)
      href                                           (url str)
      id                                             (str)
      images                                         (list)
      name                                           (str)
      release_date                                   (str)
      release_date_precision                         (str)
      total_tracks                                   (int)
      type                                           (str)
      uri                                            (str)
 artists                   

---
## 2. Helper Methods

In [8]:
def gen_internalTrackId_df(chartCrawl_df_arg):
    """Build a one-column DataFrame of the distinct track ids in a chart crawl.

    The id of a row is whatever follows the last '/' in its 'Spotify_URL' value.
    The result has a single string column named 'Internal Track ID', in order of
    first appearance.
    """
    url_pieces = chartCrawl_df_arg['Spotify_URL'].str.rpartition('/')
    # Column 2 of the partition holds the text after the final '/'.
    distinct_ids = url_pieces[2].unique()
    return pd.DataFrame(distinct_ids, columns=['Internal Track ID'], dtype=str)

In [9]:
# Distinct track ids taken from the 'Spotify_URL' column of the loaded chart crawl.
internalTrackId_df = gen_internalTrackId_df(chartCrawl_df)
internalTrackId_df

Unnamed: 0,Internal Track ID
0,4Km5HrUvYTaSUfiSGPJeQR
1,343YBumqHu19cGoGARUTsd
2,5aAx2yezTd8zXrkmtKl66Z
3,7BKLCZ1jbUBVqRi2FVlTVw
4,6fujklziTHa8uoM5OQSfIo
5,7yyRTcZmCiyzzJlNzGC9Ol
6,1xznGGDReH1oQq0xzbwXa3
7,7FB8l7UA1HKqnuSLjP9qDc
8,4pdPtRcBmOSQDlJ3Fk945m
9,0SGkqnVQo9KPytSri1H6cF


---
## 3. API Methods

In [34]:
def get_details_df(unq_id_row):
    """Fetch the track object for one row via ``sp.track``.

    ``unq_id_row`` must provide a 'Spotify_ID' entry. Returns a dict with that id
    under 'Spotify_ID' and the raw response under 'Track Details Obj'.
    """
    track_id = unq_id_row['Spotify_ID']
    return {
        'Spotify_ID': track_id,
        'Track Details Obj': sp.track(track_id),
    }

In [35]:
def get_features_df(unq_id_row):
    """Fetch the audio-features object for one row via ``sp.audio_features``.

    ``unq_id_row`` must provide a 'Spotify_ID' entry. Returns a dict with that id
    under 'Spotify_ID' and the first element of the response list under
    'Audio Features Obj'.
    """
    track_id = unq_id_row['Spotify_ID']
    # The response is indexed as a list; only its first entry is kept.
    first_features = sp.audio_features(track_id)[0]
    return {'Spotify_ID': track_id, 'Audio Features Obj': first_features}

In [36]:
def get_album_df(unq_id_row):
    """Fetch the album object for the track held in one details row via ``sp.album``.

    ``unq_id_row`` must provide 'Spotify_ID' and a 'Track Details Obj' whose
    ``['album']['id']`` is the album to fetch. Returns a dict with keys
    'Spotify_ID', 'Album_ID' and 'Album Details Obj'.
    """
    album_id = unq_id_row['Track Details Obj']['album']['id']
    album_details = sp.album(album_id)

    album_row = {'Spotify_ID': unq_id_row['Spotify_ID']}
    album_row['Album_ID'] = album_id
    album_row['Album Details Obj'] = album_details
    return album_row

In [37]:
def gen_trackInfo_df(unq_id_df):
    """Query track details, audio features and album details for every row.

    Each row of ``unq_id_df`` (which needs a 'Spotify_ID' column) is passed to
    ``get_details_df`` and ``get_features_df``; each resulting details row is then
    passed to ``get_album_df``. The dicts returned per row are expanded into columns.

    Returns the tuple ``(details_df, features_df, album_df)``. Nothing is written
    to disk here.
    """
    def expand_rows(frame, row_fetcher):
        # Row-wise apply; the dict returned for each row becomes that row's columns.
        return frame.apply(row_fetcher, axis=1, result_type='expand')

    details_df = expand_rows(unq_id_df, get_details_df)
    features_df = expand_rows(unq_id_df, get_features_df)
    # Album lookups read the album id out of the track details fetched above.
    album_df = expand_rows(details_df, get_album_df)

    return (details_df, features_df, album_df)

In [38]:
def get_newest_dirpath(data_dirpath):
    """Return the path of the newest pull directory inside ``data_dirpath``.

    Pull directories are the sub-directories whose name starts with ``'D-'``.
    "Newest" means the greatest name in lexicographic order, which matches
    chronological order for the ``D-YYYY-MM-DD_T-...`` names used in this notebook.

    Parameters
    ----------
    data_dirpath : str
        Directory that contains the pull directories.

    Returns
    -------
    str
        ``data_dirpath`` joined with the name of the newest pull directory.

    Raises
    ------
    FileNotFoundError
        If ``data_dirpath`` contains no directory whose name starts with ``'D-'``.
    """
    pull_dirs = [
        entry for entry in os.listdir(data_dirpath)
        # Plain files that happen to start with 'D-' are not pull directories.
        if entry.startswith('D-') and os.path.isdir(os.path.join(data_dirpath, entry))
    ]
    if not pull_dirs:
        raise FileNotFoundError(
            "no 'D-*' pull directory found in {!r}".format(data_dirpath)
        )
    return os.path.join(data_dirpath, max(pull_dirs))

In [39]:
def extract_api_values(api_df, api_obj_name, tgt_keys):
    """Copy selected fields of a column of API objects into their own columns.

    For every key in ``tgt_keys`` a column named after the key is added to
    ``api_df`` (modified in place), holding ``obj[key]`` for each object in the
    ``api_obj_name`` column. Rows whose API object is missing (``None`` or NaN,
    e.g. a track for which no audio features came back) get ``None`` instead of
    raising ``TypeError``.

    Parameters
    ----------
    api_df : pandas.DataFrame
        Frame holding the API objects; receives the new columns.
    api_obj_name : str
        Name of the column that holds the API objects.
    tgt_keys : iterable of str
        Keys to extract from each object.

    Returns
    -------
    pandas.DataFrame
        The same ``api_df`` object, for call chaining.

    Raises
    ------
    KeyError
        If a present object lacks one of ``tgt_keys``.
    """
    def is_missing(api_obj):
        # NaN is the only float that is not equal to itself.
        return api_obj is None or (isinstance(api_obj, float) and api_obj != api_obj)

    for key in tgt_keys:
        api_df[key] = api_df[api_obj_name].apply(
            lambda api_obj, k=key: None if is_missing(api_obj) else api_obj[k]
        )
    return api_df

In [40]:
def report_df(df, name):
    """Print a short text summary of ``df`` under a banner built from ``name``.

    The summary lists the shape, the column names and, per column, the number of
    NA values and the number of distinct values. The distinct count is left blank
    for columns whose values cannot be hashed (e.g. dicts or lists).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    name : str
        Title shown (upper-cased) in the banner line.

    Returns
    -------
    None
    """
    row_fmt = '{:<5}{:<20}{:<20}{:<20}'

    print('\n{:_^70}'.format(' '+name.upper()+' '), '\n')
    print('{:<0}{:<10}'.format('shape ', str(df.shape)), '\n')
    print('{:<0}'.format('columns '))
    print('{:<5}'.format(str(list(df.columns))), '\n')
    print(row_fmt.format(' ', 'col name', 'num na', 'num unique'))
    for col in df.columns:
        try:
            num_unq = len(df[col].unique())
        except TypeError:
            # Unhashable cell values cannot be de-duplicated. Only this expected
            # failure is tolerated; anything else (including KeyboardInterrupt)
            # now propagates instead of being swallowed by a bare except.
            num_unq = ''
        num_na = int(df[col].isna().sum())
        print(row_fmt.format(' ', col, num_na, num_unq))

In [131]:
def pull_api_data(pull_dirpath=None):
    """Fetch Spotify data for every track in a chart pull and write the result CSVs.

    Steps, in order:
      1. Read the first file in ``pull_dirpath`` whose name starts with 'Result'.
      2. Collect the unique 'Spotify_ID' values and query the API for them through
         ``gen_trackInfo_df`` (track details, audio features, album details).
      3. Flatten selected keys of each API object into columns and write
         'track_details.csv', 'audio_features.csv' and 'album_details.csv'.
      4. Merge the three frames on 'Spotify_ID' and write 'merged_track_info.csv'.
      5. Select the modelling columns and write 'ml_dataset.csv'.
    ``report_df`` prints a summary after each stage. All files are written into
    ``pull_dirpath``.

    Parameters
    ----------
    pull_dirpath : str, optional
        Directory of the chart pull. When None, ``get_newest_dirpath('./data')``
        is used.

    Returns
    -------
    pandas.DataFrame
        The modelling frame that was written to 'ml_dataset.csv'.

    NOTE(review): other cells in this notebook read a chart file named 'Top200_...'
    rather than 'Result...'; confirm which prefix the pull directories actually use.
    """
    
    pull_dirpath = get_newest_dirpath('./data') if (pull_dirpath is None) else pull_dirpath
    # Despite its name this holds a bare file name (an os.listdir entry), not a full path;
    # indexing [0] raises IndexError when no 'Result*' file is present.
    results_filepath = [d for d in os.listdir(pull_dirpath) if d.startswith('Result')][0]
    
    charts_df = pd.read_csv(os.path.join(pull_dirpath, results_filepath), index_col=0)
    report_df(charts_df, 'charts')
    
    # One row per distinct track so each track is queried only once.
    unq_id_df = pd.DataFrame({'Spotify_ID':charts_df['Spotify_ID'].unique()})
    report_df(unq_id_df, 'unq_id')
    
    # Tuple of (details_df, features_df, album_df).
    api_df_tup = gen_trackInfo_df(unq_id_df)
    
    # Keys copied out of each track-details object.
    tgt_details = [
        'id', 'external_ids', 'uri',  'name', 'album', 'artists', 
        'available_markets', 'duration_ms', 'explicit', 'track_number'
    ]
    details_df = extract_api_values(api_df_tup[0], 'Track Details Obj', tgt_details)
    details_df.to_csv(os.path.join(pull_dirpath, 'track_details.csv'))
    report_df(details_df, 'details')
    
    # Keys copied out of each audio-features object.
    tgt_features = [
        'id', 'uri',
        'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 
        'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature'
    ]
    features_df = extract_api_values(api_df_tup[1], 'Audio Features Obj', tgt_features)
    features_df.to_csv(os.path.join(pull_dirpath, 'audio_features.csv'))
    report_df(features_df, 'features')
    
    # Keys copied out of each album-details object.
    tgt_album = [
        'id', 'uri', 'external_ids', 'name', 'artists', 
        'label', 'album_type', 'genres', 
        'total_tracks'
    ]
    albums_df = extract_api_values(api_df_tup[2], 'Album Details Obj', tgt_album)
    albums_df.to_csv(os.path.join(pull_dirpath, 'album_details.csv'))
    report_df(albums_df, 'album')
    
    # Column names that clash with the track-details columns get a '_feature' or
    # '_album' suffix; the track-details versions keep their plain names.
    merged_df = unq_id_df.merge(details_df, on='Spotify_ID')
    merged_df = merged_df.merge(features_df, on=['Spotify_ID'], suffixes=('', '_feature'))
    merged_df = merged_df.merge(albums_df, on=['Spotify_ID'], suffixes=('', '_album'))
    merged_df.to_csv(os.path.join(pull_dirpath, 'merged_track_info.csv'))
    report_df(merged_df, 'merged')
    
    # Column groupings kept for reference only; none of these lists is used by the code.
#     id_info = ['Spotify_ID', 'id', 'uri', 'external_ids', 'name']
#     album_info = ['name_album', 'artists']
#     ordinal = ['explicit', 'mode', 'total_tracks', 'track_number', 'time_signature', 'key']
#     categorical = ['label', 'album_type']
#     continuous = ['danceability', 'energy', 'loudness', 'speechiness', 
#                   'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
    
    # Columns carried into the modelling dataset.
    ml_cols = [
        'Spotify_ID', 'id', 'uri', 'external_ids', 'name',
        'name_album', 'artists', 'label', 'album_type', 'total_tracks',
        'duration_ms', 
        'explicit', 'track_number', 'mode', 'time_signature',
        'danceability', 'energy', 'key', 'loudness', 'speechiness', 'acousticness', 
        'instrumentalness', 'liveness', 'valence', 'tempo', 
    ]
    ml_df = pd.DataFrame(merged_df.loc[:, ml_cols], copy=True)
    # Reduce the external-ids object to its 'isrc' entry (KeyError if that entry is absent).
    ml_df['external_ids'] = ml_df['external_ids'].apply(lambda e: e['isrc'])
    # Turn the label text into a list by splitting on '/'.
    ml_df['label'] = ml_df['label'].apply(lambda l: str(l).split('/'))
    ml_df.to_csv(os.path.join(pull_dirpath, 'ml_dataset.csv'))
    report_df(ml_df, 'ml dataset')
    
    return ml_df

In [166]:
# Chart rows of the 2019-12-09 pull; get_max_chart_rank below reads this module-level frame.
charts_df = pd.read_csv('./data/D-2019-12-09_T-17-09-37-538278/Top200_D-2019-12-09_T-17-09-37-538278_FrmD-2017-01-01_ToD-2019-12-08.csv', index_col=0)
def get_max_chart_rank(spotify_id):
    """Return the smallest 'Position' recorded in ``charts_df`` for ``spotify_id``.

    The smallest position number is treated as the track's highest rank. Reads the
    module-level ``charts_df``.
    """
    is_this_track = charts_df['Spotify_ID'] == spotify_id
    return charts_df.loc[is_this_track, 'Position'].min()

In [172]:
# Rebuild the modelling dataset from files written by an earlier pull and attach each
# track's best chart position.
charts_df = pd.read_csv('./data/D-2019-12-09_T-17-09-37-538278/Top200_D-2019-12-09_T-17-09-37-538278_FrmD-2017-01-01_ToD-2019-12-08.csv', index_col=0)
merged_df = pd.read_csv('./data/merged_track_info.csv', index_col=0)

# Columns carried into the modelling dataset.
ml_cols = [
    'Spotify_ID', 'name', 'name_album', 
    'album_type', 'total_tracks',
    'duration_ms', 
    'explicit', 'track_number', 'mode', 'time_signature',
    'danceability', 'energy', 'key', 'loudness', 'speechiness', 'acousticness', 
    'instrumentalness', 'liveness', 'valence', 'tempo', 
]
ml_df = merged_df.loc[:, ml_cols].copy()
report_df(ml_df, 'ml dataset')

# Smallest 'Position' per track, computed in a single grouped pass. The previous
# version re-filtered the whole charts frame once per track (rows x tracks work).
# Ids that never appear in charts_df map to NaN, exactly as min() of an empty
# selection did before.
best_position_by_id = charts_df.groupby('Spotify_ID')['Position'].min()
ml_df['max_chart_rank'] = ml_df['Spotify_ID'].map(best_position_by_id)

# Drop rows with any missing value; reassigning keeps the cell safe to re-run.
ml_df = ml_df.dropna()
ml_df.to_csv('./data/D-2019-12-09_T-17-09-37-538278/ml_dataset.csv')
ml_df


_____________________________ ML DATASET _____________________________ 

shape (9462, 20) 

columns 
['Spotify_ID', 'name', 'name_album', 'album_type', 'total_tracks', 'duration_ms', 'explicit', 'track_number', 'mode', 'time_signature', 'danceability', 'energy', 'key', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo'] 

     col name            num na              num unique          
     Spotify_ID          0                   9462                
     name                9                   7664                
     name_album          9                   5557                
     album_type          0                   3                   
     total_tracks        0                   55                  
     duration_ms         0                   7550                
     explicit            0                   2                   
     track_number        0                   37                  
     mode                0            

Unnamed: 0,Spotify_ID,name,name_album,album_type,total_tracks,duration_ms,explicit,track_number,mode,time_signature,...,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,max_chart_rank
0,4Km5HrUvYTaSUfiSGPJeQR,Bad and Boujee (feat. Lil Uzi Vert),Culture,album,13,343150,True,4,1,4,...,0.665,11,-5.313,0.2440,0.061000,0.000000,0.1230,0.1750,127.076,1.0
1,343YBumqHu19cGoGARUTsd,Fake Love,More Life,album,22,210937,True,20,0,4,...,0.481,9,-9.350,0.2870,0.105000,0.000000,0.1760,0.6130,134.007,2.0
2,5aAx2yezTd8zXrkmtKl66Z,Starboy,Starboy,album,18,230453,True,1,1,4,...,0.594,7,-7.028,0.2820,0.165000,0.000003,0.1340,0.5350,186.054,3.0
3,7BKLCZ1jbUBVqRi2FVlTVw,Closer,Closer,single,1,244960,False,1,1,4,...,0.524,8,-5.599,0.0338,0.414000,0.000000,0.1110,0.6610,95.010,4.0
4,6fujklziTHa8uoM5OQSfIo,Black Beatles,SremmLife 2 (Deluxe),album,14,291893,True,5,1,4,...,0.632,0,-6.163,0.0649,0.142000,0.000000,0.1280,0.3550,145.926,5.0
5,7yyRTcZmCiyzzJlNzGC9Ol,Broccoli (feat. Lil Yachty),Big Baby DRAM,album,14,225205,True,7,1,4,...,0.525,8,-7.390,0.1310,0.236000,0.000000,0.0570,0.7080,145.990,6.0
6,1xznGGDReH1oQq0xzbwXa3,One Dance,Views,album,20,173986,False,12,1,4,...,0.619,1,-5.886,0.0532,0.007840,0.004230,0.3510,0.3710,103.989,7.0
7,7FB8l7UA1HKqnuSLjP9qDc,Caroline,Good For You,album,15,209640,True,3,1,4,...,0.318,10,-10.357,0.4670,0.174000,0.000000,0.2050,0.6650,120.077,6.0
8,4pdPtRcBmOSQDlJ3Fk945m,Let Me Love You,Encore,album,14,205946,False,13,1,4,...,0.718,8,-5.309,0.0576,0.078400,0.000010,0.1220,0.1420,199.864,9.0
9,0SGkqnVQo9KPytSri1H6cF,Bounce Back,I Decided.,album,14,222360,True,3,0,4,...,0.574,1,-5.628,0.1410,0.104000,0.000000,0.1290,0.2730,81.502,3.0


In [None]:
# Run the full pull; with no argument pull_api_data uses get_newest_dirpath('./data').
res_df = pull_api_data()

In [None]:
# Columns of res_df whose distinct values are listed below.
unq_cols = [
    'album_type', 'explicit', 'mode', 'time_signature',
    'key',
    'total_tracks', 'track_number', 
]

for col in unq_cols:
    try:
        unq_arr = res_df[col].unique()
    except TypeError:
        # Cells hold unhashable sequences: flatten them, then de-duplicate.
        # Only this expected failure is handled; a bare except would also hide
        # unrelated errors and keyboard interrupts.
        unq_arr = np.unique(np.concatenate(res_df[col].to_numpy()))
    print('\n{:_^70}'.format(' '+col.upper()+' '), '\n')
    print('{:<0}{:>5}'.format('LEN:', len(unq_arr)), '\n')
    print('{:<0}'.format('VALUES:'))
    for val in unq_arr:
        print('{: >5}'.format(str(val)))

---
## 4. Testing

In [None]:
# Query the API for the chart rows whose 'Artist' is 'Drake'.
# NOTE(review): gen_trackInfo_df as defined in this notebook accepts only `unq_id_df`;
# the get_track_details / get_audio_features / get_audio_analysis keywords are not part
# of that signature, so this call would raise TypeError against it. Presumably this cell
# was written for an earlier version of the helper - confirm before running.
trackInfo_tup = gen_trackInfo_df(chartCrawl_df[chartCrawl_df['Artist']=='Drake'],
                                 get_track_details=True,
                                 get_audio_features=True,
                                 get_audio_analysis=False)

In [None]:
# First element of the tuple returned by gen_trackInfo_df.
drake_details_df = trackInfo_tup[0]

In [None]:
# Keys to copy out of each 'Audio Features Obj' into their own columns.
tgt_features = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 
                'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']

In [None]:
# Work on a copy of the features frame so trackInfo_tup[1] itself is left untouched.
drake_features_df = pd.DataFrame(trackInfo_tup[1], copy=True)
# Copy every target key of the audio-features object into its own column.
for feature in tgt_features:
    drake_features_df[feature] = drake_features_df['Audio Features Obj'].apply(lambda a: a[feature])
# set_index returns a new frame; the result must be assigned or the call has no effect
# and the CSV is written with the default integer index.
drake_features_df = drake_features_df.set_index('Internal Track ID')
drake_features_df.to_csv('./drake_audio_features.csv')

In [None]:
# Rows of the Drake details frame for two specific track ids.
talk_up_df = drake_details_df.loc[
    drake_details_df['Internal Track ID'].isin(
        ['3Yw09dj3cTXsAzlLjgBfIP', '4ksuI04WMvUnJbHQjgs3L5']
    )
]
talk_up_df

In [None]:
# Rows of the Drake details frame for four specific track ids.
gods_plan_df = drake_details_df.loc[
    drake_details_df['Internal Track ID'].isin([
        '2XW4DbS6NddZxRPm5rMCeY',
        '6T8cJz5lAqGer9GUHGyelE',
        '2VWbHHhWnMzKWPUs4IEEW9',
        '6DCZcSspjsKoFjzjrWoCdn',
    ])
]
gods_plan_df

In [None]:
# Rows of the Drake details frame for two specific track ids.
dont_talk_df = drake_details_df.loc[
    drake_details_df['Internal Track ID'].isin([
        '36ONiya0OANYknz0GgJmwB',
        '6G8kHiVZ1jW7vHMPVRNZU0',
    ])
]
dont_talk_df

In [None]:
# Track-object fields printed by default by print_track_compares.
tgt_keys_arr = ['disc_number', 'duration_ms', 'explicit', 'external_ids', 
                'external_urls', 'href', 'id', 'name', 'preview_url', 'track_number', 'uri']

def print_track_compares(track_df, keys_arr=tgt_keys_arr):
    """Print selected fields of every track object in ``track_df``, one block per row.

    Parameters
    ----------
    track_df : pandas.DataFrame
        Frame with a 'Track Details Obj' column holding dict-like track objects.
    keys_arr : list of str
        Fields to print for each track; defaults to ``tgt_keys_arr``.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If a track object lacks one of ``keys_arr``.
    """
    print('----------\n')
    for track_idx, details_obj in track_df['Track Details Obj'].items():
        print('track_idx:\t', track_idx, '\n')
        # Iterate the parameter, not the module-level default list, so that a
        # caller-supplied selection of keys is actually honoured.
        for tgt_key in keys_arr:
            print(tgt_key + ':\t\t', details_obj[tgt_key])
        print('\n----------\n')

In [None]:
# Print the default comparison fields for the rows selected into talk_up_df.
print_track_compares(talk_up_df)

In [None]:
# Print the default comparison fields for the rows selected into gods_plan_df.
print_track_compares(gods_plan_df)

In [None]:
# Print the default comparison fields for the rows selected into dont_talk_df.
print_track_compares(dont_talk_df)

In [None]:
# Second element of the tuple returned by gen_trackInfo_df.
trackInfo_tup[1]

In [None]:
# Third element of the tuple returned by gen_trackInfo_df.
trackInfo_tup[2]

In [None]:
# Keys of the 'track' section of the audio-analysis response for one track id.
sp.audio_analysis('2QpGZOhTCHHiKmpSO9FW4h')['track'].keys()

In [None]:
# Keys of the first audio-features object returned for the same track id.
sp.audio_features('2QpGZOhTCHHiKmpSO9FW4h')[0].keys()

In [None]:
# Same request, passing a full open.spotify.com track URL instead of a bare id.
# NOTE(review): presumably a check that the client accepts URLs as well as ids - confirm.
sp.audio_analysis('https://open.spotify.com/track/2QpGZOhTCHHiKmpSO9FW4h')

In [None]:
# Track lookup by full open.spotify.com URL instead of a bare id.
sp.track('https://open.spotify.com/track/2QpGZOhTCHHiKmpSO9FW4h')

In [None]:
# Free-text track search limited to one result in the US market; show the keys of that hit.
search_result = sp.search(q='Dance Monkey Tones and I', limit=1, type='track', market='US')
search_result['tracks']['items'][0].keys()

In [None]:
# Same query with up to three results; the full response is displayed.
sp.search(q='Dance Monkey Tones and I', limit=3, type='track', market='US')

In [None]:
# First of two track ids whose track objects are compared field by field below.
drake_url_test1 = sp.track('0w1ZtnzQmtmuuoKxHT0pLL')
drake_url_test1

In [None]:
# Second of the two track ids being compared.
drake_url_test2 = sp.track('4HG1YiGBseVKzjyKcmAJen')
drake_url_test2

In [None]:
# Print every top-level key of the first track object, each followed by its value.
keys_list = list(drake_url_test1.keys())
for key in keys_list:
    print(key)
    print(drake_url_test1[key], '\n')

In [None]:
# Show cell contents without truncation. None is the supported "no limit" value;
# the old -1 sentinel is deprecated since pandas 1.0 and rejected by newer versions.
pd.set_option('display.max_colwidth', None)
pd.reset_option('display.max_rows')

# Align both track objects on the keys of the first one, so every row compares the
# same field even if the two responses order their keys differently or the second
# one lacks a key (which then shows up as None).
compare_keys = list(drake_url_test1.keys())
drake_url_test1_list = [drake_url_test1[key] for key in compare_keys]
drake_url_test2_list = [drake_url_test2.get(key) for key in compare_keys]
test_df = pd.DataFrame({'0w1ZtnzQmtmuuoKxHT0pLL':drake_url_test1_list, 
                        '4HG1YiGBseVKzjyKcmAJen':drake_url_test2_list},
                       index=compare_keys)
# Element-wise equality of the two columns, one flag per field.
test_df['Same val?'] = test_df['0w1ZtnzQmtmuuoKxHT0pLL']==test_df['4HG1YiGBseVKzjyKcmAJen']

In [None]:
# Display the field-by-field comparison of the two track objects.
test_df

In [None]:
# Do the two track objects carry the same 'id'?
drake_url_test1['id'] == drake_url_test2['id']

In [None]:
# Do the two track objects carry the same 'href'?
drake_url_test1['href'] == drake_url_test2['href']

In [None]:
# 'href' value of the first track object.
drake_url_test1['href']

In [None]:
# Do the two track objects carry the same 'external_urls' mapping?
drake_url_test1['external_urls'] == drake_url_test2['external_urls']

In [None]:
# 'external_urls' mapping of the first track object.
drake_url_test1['external_urls']

In [None]:
# 'external_urls' mapping of the second track object.
drake_url_test2['external_urls']

In [None]:
# NOTE(review): repeats the 'external_urls' comparison made three cells earlier.
drake_url_test1['external_urls'] == drake_url_test2['external_urls']