In [None]:
# Imports
import os
import time
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import json
from collections import Counter

In [None]:
# CONSTANTS
# Path of the artist list to load, relative to ../../data/ and without the ".json" extension
ARTISTS_LIST = "wiki/WikiArtists"
# Base name (without ".json") of the file in ../../data/ where the `artists` list is dumped
NETWORK = "network"
# Popularity threshold: an artist must score strictly above this value to become a node
MIN_POPULARITY = 42

This notebook queries the **Spotify API** ([reference](https://developer.spotify.com/documentation/web-api/reference/)) to collect and store information about artists. The integration with the API is made possible by the **Spotipy** library ([documentation](https://spotipy.readthedocs.io/en/2.16.1/)).

Before proceeding, it is required to:

1. **Generate a list of artists**. This may be done using the `scraper-wikipedia.ipynb` notebook. After its execution, a JSON file containing all the artists' names will be stored.
1. **Create a Spotify Developer account** ([Authorization guide](https://developer.spotify.com/documentation/general/guides/authorization-guide/)). It is necessary to obtain a `client_id` and a `client_secret` to interact with the API.
1. **Adjust the path of the list of artists**. The notebook immediately loads the list of artists mentioned above. It might be necessary to adjust the path of the file in the `ARTISTS_LIST` variable.

In [None]:
# Build the Spotify client; the developer id and secret are read from
# the SP_CLIENT and SP_SECRET environment variables
auth_manager = SpotifyClientCredentials(
    client_id=os.environ["SP_CLIENT"],
    client_secret=os.environ["SP_SECRET"],
)
sp = spotipy.Spotify(auth_manager=auth_manager)

In [None]:
# Load the list of artists from the JSON file named by ARTISTS_LIST
artists_path = "../../data/" + ARTISTS_LIST + ".json"
with open(artists_path) as artists_file:
    artists = json.load(artists_file)

**Artist ID**

_Given an artist's name, find the matching Spotify ID_

HOW:

1. Query Spotify API - `/search` (specify `type="artist"`) ([reference](https://developer.spotify.com/documentation/web-api/reference/search/search/)).
1. Iterate over the results and create a list of all the matches with the same given name.
1. Sort the list based on the popularity in reversed order.
1. If the length of the list is greater than zero (at least one match has been found) - pop the first element of the list:
    1. Store `id`, `popularity` and `genres` values in the dictionary loaded before.
    1. Dump into a JSON file (path `data/spotify/artists/<artistID>.json`) the result of this artist.

In [None]:
# Search artist ID given the name
# Make sure the destination folder exists, otherwise the first open() below
# fails with FileNotFoundError on a fresh checkout.
os.makedirs('../../data/spotify/artists/', exist_ok=True)

for artist in artists:
    artist["name"] = artist["name"].strip(" ")
    print(artist["name"])  # progress indicator (the raw API response is no longer dumped)

    results = sp.search(q=artist["name"], type="artist")

    # Keep only the results whose name is exactly (case-sensitively) the one searched
    matches = [r for r in results["artists"]["items"] if r["name"] == artist["name"]]

    if matches:
        # Most popular match wins. max() returns the first maximal element,
        # which is the same item a stable descending sort would put first.
        top_artist = max(matches, key=lambda match: match["popularity"])
        artist["id"] = top_artist["id"]  # assign ID to dict
        artist["popularity"] = top_artist["popularity"]
        artist["genres"] = top_artist["genres"]

        # Keep the full Spotify record of the matched artist on disk
        with open('../../data/spotify/artists/' + artist["id"] + '.json', 'w') as outfile:
            json.dump(top_artist, outfile)

    # Pause between requests to stay gentle on the API
    time.sleep(0.75)


In [None]:
# Checkpoint: write the `artists` list (now carrying the search results) to the network file
network_path = "../../data/" + NETWORK + ".json"
with open(network_path, "w") as network_file:
    json.dump(artists, network_file)

**Filter Artists**

Plotting the popularity distribution (can be done running `src/analysis/viz-popularity.ipynb`) - it is clear that many artists have a low popularity.

An artist is kept as a node of the network only if all of the following hold:
* its popularity is greater than the threshold set in `MIN_POPULARITY`
* it has at least one genre
* a match was found in the previous step

Artists who are nodes of the network will have `isNode` set to `True`; all the others will have it set to `False`.


In [None]:
# Flag the artists that become nodes of the network.
# "id", "popularity" and "genres" are only assigned when a Spotify match was
# found, so they are read with .get(): indexing them directly raises KeyError
# for every unmatched artist.
for artist in artists:
    has_match = bool(artist.get("id"))
    popular_enough = (artist.get("popularity") or 0) > MIN_POPULARITY
    has_genres = len(artist.get("genres") or []) > 0
    artist["isNode"] = has_match and popular_enough and has_genres

In [None]:
# Checkpoint: write the `artists` list (now carrying the `isNode` flag) to the network file
network_path = "../../data/" + NETWORK + ".json"
with open(network_path, "w") as network_file:
    json.dump(artists, network_file)

**Discography**

It is possible to query the Spotify API to get all the albums released by an artist [(source)](https://github.com/plamere/spotipy/blob/2.16.1/examples/artist_discography.py).
Albums can be labelled as `album` or `single`. In the first case, it is therefore necessary to query all the songs included in an album and retrieve all the collaborations.
However, this task is not required for single albums.


In [None]:
def show_album_tracks(artist_id, album_id, album_type):
    """Fetch every track of an album and store the list as a JSON file.

    The tracks are written to
    ``../../data/spotify/<album_type>/<artist_id>/<album_id>.json``.

    Args:
        artist_id: Spotify ID of the artist; used as the folder name.
        album_id: Spotify ID of the album to download; used as the file name.
        album_type: Name of the sub-folder of ``data/spotify`` to write into
            (the callers in this notebook pass ``"album"`` or ``"single"``).

    Returns:
        None. The result is only written to disk.
    """
    tracks = []
    results = sp.album_tracks(album_id)
    tracks.extend(results['items'])
    # Results are paginated: follow the `next` links until the last page
    while results['next']:
        results = sp.next(results)
        tracks.extend(results['items'])

    # Create the target folder here too, so the function also works when it is
    # called on its own and not only after show_artist_albums created the folder.
    target_dir = "../../data/spotify/" + album_type + "/" + artist_id
    os.makedirs(target_dir, exist_ok=True)

    with open(target_dir + "/" + album_id + ".json", "w") as outfile:
        json.dump(tracks, outfile)


def show_artist_albums(artist_id, album_type):
    """Download the releases of an artist and the track list of each one.

    Releases sharing the same lower-cased name are treated as duplicates and
    only the first one encountered is kept. For every release kept,
    ``show_album_tracks`` is called to store its tracks on disk.

    Args:
        artist_id: Spotify ID of the artist.
        album_type: Value forwarded to ``sp.artist_albums`` and also used as
            the name of the sub-folder of ``data/spotify`` (the callers in
            this notebook pass ``"album"`` or ``"single"``).

    Returns:
        list: IDs of the releases kept, in the order returned by the API.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs("../../data/spotify/" + album_type + "/" + artist_id, exist_ok=True)

    albums = []
    results = sp.artist_albums(artist_id, album_type=album_type)
    albums.extend(results['items'])
    # Results are paginated: follow the `next` links until the last page
    while results['next']:
        results = sp.next(results)
        albums.extend(results['items'])
    print('Total albums: ', len(albums))

    seen_names = set()  # lower-cased names already handled (O(1) membership test)
    unique_id = []
    for album in albums:
        name = album['name'].lower()
        if name in seen_names:
            continue  # skip duplicate albums
        print('ALBUM: ', name)
        seen_names.add(name)
        unique_id.append(album["id"])
        show_album_tracks(artist_id, album["id"], album_type)

    return unique_id


In [None]:
# Download the discography (albums first, then singles) of every node artist
for artist in artists:
    if not artist["isNode"]:
        continue

    print(artist["name"])
    node_id = artist["id"]
    # Key stored on the artist -> album_type passed to the API helper
    for key, kind in (("albums", "album"), ("singles", "single")):
        artist[key] = show_artist_albums(node_id, album_type=kind)
        time.sleep(0.2)

In [None]:
# Checkpoint: write the `artists` list (now carrying album and single IDs) to the network file
network_path = "../../data/" + NETWORK + ".json"
with open(network_path, "w") as network_file:
    json.dump(artists, network_file)

In [None]:
# Spotify IDs of all the artists flagged as nodes
nodes = [entry["id"] for entry in artists if entry["isNode"]]

In [None]:
# Give every node artist an empty edge list
for artist in artists:
    if not artist["isNode"]:
        continue
    artist["edges"] = []

In [None]:
# Build the edges: a node artist is linked to every other node artist
# credited on a track of one of its albums.
node_ids = set(nodes)  # set membership is O(1); the list was scanned for every credit

for artist in artists:
    if not artist["isNode"]:
        continue

    print(artist["name"])
    # Start from the edges already stored (if any), as the original appended to them
    edges = set(artist.get("edges", []))
    for album in artist["albums"]:
        # Track lists are written by show_artist_albums(..., album_type="album"),
        # i.e. under data/spotify/album/<artist id>/ ("album", not "albums").
        with open("../../data/spotify/album/" + artist["id"] + "/" + album + ".json") as f:
            content = json.load(f)
        for item in content:
            for x in item["artists"]:
                if x["id"] != artist["id"] and x["id"] in node_ids:
                    edges.add(x["id"])

    # Sorted so the dumped JSON is identical from one run to the next
    artist["edges"] = sorted(edges)
    print(artist["edges"])


In [None]:
# Checkpoint: write the `artists` list (now carrying the edges) to the network file
network_path = "../../data/" + NETWORK + ".json"
with open(network_path, "w") as network_file:
    json.dump(artists, network_file)

In [None]:
# TOP 5

def get_top5(artist_id):
    """Describe the first five tracks returned by ``sp.artist_top_tracks``.

    The request is made for the ``"US"`` country.

    Args:
        artist_id: Spotify ID of the artist.

    Returns:
        list: At most five dicts with the keys ``"artist"`` (name of the
        first credited artist of the track), ``"track"`` (track name) and
        ``"id"`` (track ID).
    """
    response = sp.artist_top_tracks(artist_id, country="US")

    # Keep at most the first five entries and reduce each one to three fields
    return [
        {
            "artist": entry['artists'][0]['name'],
            "track": entry['name'],
            "id": entry['id'],
        }
        for entry in response['tracks'][:5]
    ]

In [None]:
# Attach the top-5 track descriptors to every node artist
for artist in artists:
    if not artist["isNode"]:
        continue

    print(artist["name"])
    artist["top5"] = get_top5(artist["id"])
    time.sleep(0.2)  # short pause between two API calls

In [None]:
# Checkpoint: write the `artists` list (now carrying the top-5 tracks) to the network file
network_path = "../../data/" + NETWORK + ".json"
with open(network_path, "w") as network_file:
    json.dump(artists, network_file)