In [None]:
import json
import os.path
import requests
import shutil
import time
from spotify_api import SpotifyAPI
from config import client_id, client_secret

In [None]:
# Create the API client used by the search and fetch cells below, passing the
# credentials imported from the local config module.
spotify = SpotifyAPI(client_id, client_secret)

Find the playlist's unique Spotify ID.

In [None]:
# Search for playlists matching the query and display the result as the cell output.
# NOTE(review): the query words are separated by a bare '%', which is not a valid
# URL escape on its own (a space is '%20' or '+'). Confirm how SpotifyAPI.search
# encodes the query before relying on these results.
spotify.search('Most%Popular%Songs', search_type = 'playlist')

## Fetch data from the Spotify API

Build a JSON file with the following data:

<ul>
<li>artist_id</li>
<li>artist_name</li>
<li>album_id</li>
<li>album_name</li>
<li>album_popularity</li>
<li>genres [list]</li>
<li>album_url (URL of the 64-pixel-high cover image; omitted when the album has none)</li>
</ul>

In [None]:
# Build the raw dataset: one record per playlist track, enriched with artist and
# album details, then saved as a JSON file.
playlist_id = '4o8NBsWreC3OnKePUYw0dg'
dataset_name = "data_full_64.txt"
# NOTE(review): Windows-style relative path. It is kept unchanged because the
# later cells read the file back using this exact string.
dataset_path = ".\\data\\"

page_size = 100      # number of playlist items requested per call
cover_height = 64    # height (px) of the album cover whose URL is recorded

data = {}
track_count = 0      # key of the next record (named so it does not shadow builtin id())
skipped = 0          # playlist items that could not be turned into a record
offset = 0

while True:
    r = spotify.get_playlist_items(playlist_id,
                                   'items(track(name,artists(id),id,album(id)))',
                                   page_size,
                                   offset,
                                   'playlists')
    page_items = r['items']

    for item in page_items:
        # The Spotify API can return a null track (removed/unavailable) or null
        # IDs (local files); such items cannot be looked up, so skip them
        # instead of crashing in the middle of a long fetch.
        track = item.get('track') or {}
        artists = track.get('artists') or []
        artist_id = artists[0].get('id') if artists else None
        album_id = (track.get('album') or {}).get('id')
        if not artist_id or not album_id:
            skipped += 1
            continue

        artist = spotify.get_artist(artist_id)
        album = spotify.get_album(album_id)

        record = {
            'artist_id': artist_id,
            'artist_name': artist['name'],
            'album_id': album_id,
            'album_name': album['name'],
            'album_popularity': album['popularity'],
            'genres': artist['genres'],
        }

        # 'album_url' is only set when a cover of the wanted height exists;
        # the download cell skips records without it.
        for image in album['images']:
            if image['height'] == cover_height:
                record['album_url'] = image['url']

        data[track_count] = record
        track_count += 1

    print(f"Fetched {track_count} tracks.")

    # A short page means the end of the playlist. Use the page length rather
    # than the number of stored records so skipped items do not end paging early.
    if len(page_items) < page_size:
        break

    offset += page_size

if skipped:
    print(f"Skipped {skipped} playlist items without a usable track.")

# Make sure the target directory exists before writing, so the fetched data is
# not lost to a FileNotFoundError. (`import os.path` also binds `os`.)
output_file = dataset_path + dataset_name
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

with open(output_file, 'w') as file:
    json.dump(data, file)

## Download album cover images

The playlist can contain multiple songs from the same album.
Each cover is saved under its album ID, so an album whose image file already exists is skipped.

In [None]:
# Download one cover image per album and record its local path in the dataset.
dataset_path = ".\\data\\"
image_path = ".\\images_64\\"

dataset_name = "data_full_64.txt"
dataset_name_img = "data_full_64_img.txt"

max_attempts = 3     # bound retries so a dead URL cannot stall the notebook forever
retry_delay = 1      # seconds to wait after every request


def download_image(url, destination, attempts=max_attempts):
    """Stream the image at `url` into the file `destination`.

    Tries up to `attempts` times, waiting `retry_delay` seconds after each
    request. Returns True once the file has been written, False if every
    attempt failed (non-200 status or an error while requesting/writing).
    """
    for attempt in range(1, attempts + 1):
        try:
            # The context manager closes the streamed connection in all cases.
            with requests.get(url, timeout=200, stream=True) as response:
                if response.status_code == 200:
                    response.raw.decode_content = True
                    with open(destination, 'wb') as f:
                        shutil.copyfileobj(response.raw, f)
                    return True
                print('Image Couldn\'t be retrieved')
        except Exception as exc:
            # Network and I/O errors are reported and retried. Any partially
            # written file is removed so a later run does not mistake it for a
            # completed download.
            print(f'Download attempt {attempt} failed: {exc}')
            if os.path.isfile(destination):
                os.remove(destination)
        finally:
            time.sleep(retry_delay)
    return False


with open(dataset_path+dataset_name) as json_file:
    data = json.load(json_file)

# Create the image directory if needed. (`import os.path` also binds `os`.)
image_dir = os.path.dirname(image_path)
if image_dir:
    os.makedirs(image_dir, exist_ok=True)

# Albums that already received a 'filepath' during this run. Later tracks from
# the same album are left without one, which marks them as duplicates.
seen_albums = set()

for item in data.keys():
    print(item, end= ' ')
    entry = data[item]

    image_url = entry.get('album_url')
    album_id = entry.get('album_id')
    if image_url is None or album_id is None:
        # No 64px cover was recorded for this track; nothing to download.
        continue

    album_id = album_id.replace('/','')
    filename = image_path + album_id + '.jpg'

    if album_id in seen_albums:
        # Duplicate album within this dataset.
        print('File already downloaded.')
        continue

    if os.path.isfile(filename):
        # Image left by a previous run: reuse it so re-running the notebook
        # yields the same dataset instead of an empty one.
        print('File already downloaded.')
        entry['filepath'] = filename
        seen_albums.add(album_id)
        continue

    if download_image(image_url, filename):
        entry['filepath'] = filename
        seen_albums.add(album_id)


with open(dataset_path+dataset_name_img, 'w') as file:
    json.dump(data, file)

## Remove duplicate album entries

The final dataset contains unique albums with successfully downloaded images.

In [None]:
# Keep only the records that carry a 'filepath', i.e. those for which the
# download cell stored an image, and save the result as the clean dataset.
dataset_path = ".\\data\\"
dataset_name_img = "data_full_64_img.txt"
dataset_name_clean = "data_full_64_clean.txt"

with open(dataset_path+dataset_name_img) as json_file:
    data = json.load(json_file)

# Collect the keys first: a dict must not change size while it is iterated.
dict_keys_to_del = [key for key, entry in data.items() if 'filepath' not in entry]

for key in dict_keys_to_del:
    data.pop(key)

with open(dataset_path+dataset_name_clean, 'w') as file:
    json.dump(data, file)

In [None]:
# Reload the clean dataset from disk and print it as indented JSON for inspection.
dataset_path = ".\\data\\"
dataset_name_clean = "data_full_64_clean.txt"

with open(dataset_path+dataset_name_clean) as json_file:
    data = json.load(json_file)

pretty = json.dumps(data, indent=4)
print(pretty)