# Spotify Playlist Analysis

Using the Spotify API, this project reports some interesting statistics about a public playlist.

The **spotipy** package is used throughout the project to handle the connection to, and the usage of, the Spotify API. The spotipy documentation is available at: https://spotipy.readthedocs.io/en/2.19.0/

Interesting Spotipy functions to explore: 
- artist_top_tracks
- artist_albums
- album

In [1]:
# Imports and authentication
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import cred
import datetime
import pandas as pd
from collections import Counter
from itertools import chain
import numpy as np

# Authentication without a user login: the client ID and secret are read
# from the local `cred` module and handed to spotipy's credentials manager.
client_credentials_manager = SpotifyClientCredentials(
    client_id=cred.client_id,
    client_secret=cred.client_secret,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [86]:
# Functions

# From a playlist link (or URI), this function returns the playlist ID
def get_id_playlist(playlist_link):
    """Extract the playlist ID from a Spotify playlist link.

    Accepts a share URL (with or without a query string, fragment or
    trailing slash), a ``spotify:playlist:<id>`` URI, or a bare ID.

    Args:
        playlist_link: The playlist URL, URI or ID as a string.

    Returns:
        The last path/URI component, i.e. the playlist ID.
    """
    # Remove the query string and fragment first, so that a "/" inside
    # them can never be mistaken for a path separator.
    location = playlist_link.strip().split("?", 1)[0].split("#", 1)[0]
    # A trailing slash would otherwise produce an empty ID.
    location = location.rstrip("/")
    # URLs separate components with "/", URIs with ":"; treat both alike.
    return location.replace(":", "/").rsplit("/", 1)[-1]

# Build a Spotify URI of the form "spotify:<category>:<id>"
def generate_uri(category, id):
    """Return the Spotify URI for the given category and ID.

    Args:
        category: The URI category segment as a string (e.g. "playlist").
        id: The Spotify ID of the resource as a string.

    Returns:
        The string ``"spotify:" + category + ":" + id``.
    """
    return ":".join(("spotify", category, id))

# From a playlist link, get the main information of the playlist
def playlist_info(playlist_link):
    """Return the main information of a playlist.

    Args:
        playlist_link: Link of the playlist, as accepted by
            ``get_id_playlist``.

    Returns:
        A dict with the keys ``'cover'`` (URL of the first cover image, or
        ``None`` when the playlist has no cover), ``'name'``,
        ``'description'`` and ``'owner'`` (the owner's display name).
    """
    # Parse the link once and reuse the ID for both API calls.
    playlist_id = get_id_playlist(playlist_link)
    playlist = sp.playlist(playlist_id)
    covers = sp.playlist_cover_image(playlist_id)
    return {
        # A playlist without a cover yields no images; do not index blindly.
        'cover': covers[0]['url'] if covers else None,
        'name': playlist["name"],
        'description': playlist["description"],
        'owner': playlist["owner"]["display_name"],  # Other information of the owner also available
    }

# Join the names of a list of artist objects into "A, B, C"
def _join_artist_names(artists):
    """Return the artists' names separated by ", ", skipping missing names."""
    return ", ".join(artist["name"] for artist in artists if artist.get("name"))


# From a playlist link, creates a Pandas DataFrame with the tracks information
def create_playlist_df(playlist_link):
    """Build a DataFrame with one row per track of a playlist.

    Every page of the playlist is fetched through the module-level ``sp``
    client.

    Args:
        playlist_link: Link of the playlist, as accepted by
            ``get_id_playlist``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``id``, ``name``, ``album``,
        ``duration`` (a ``datetime.timedelta`` truncated to whole seconds),
        ``popularity``, ``artist(s)`` and ``album_artist(s)``. The columns
        are present even when the playlist has no usable tracks.
    """
    columns = [
        'id',
        'name',
        'album',
        'duration',
        'popularity',
        'artist(s)',
        'album_artist(s)',
    ]
    playlist_id = get_id_playlist(playlist_link)

    page = sp.playlist_tracks(playlist_id)
    rows = []

    while page:
        for item in page["items"]:
            track = item.get("track")
            # Entries whose track is no longer available come back with a
            # null "track"; skip them instead of crashing on None["id"].
            if not track:
                continue
            rows.append({
                'id': track["id"],
                'name': track["name"],
                'album': track["album"]["name"],
                'duration': datetime.timedelta(seconds=int(track["duration_ms"] / 1000)),
                'popularity': track["popularity"],
                'artist(s)': _join_artist_names(track["artists"]),
                'album_artist(s)': _join_artist_names(track["album"]["artists"]),
            })
        # Follow the pagination link until the last page has been read.
        page = sp.next(page) if page["next"] else None

    return pd.DataFrame(rows, columns=columns)

# Return top 5 artists with most tracks appearances on a playlist
def top_artists_playlist(playlist_df, top_n=5):
    """Rank the artists that appear on the most tracks of a playlist.

    Args:
        playlist_df: DataFrame with an ``'artist(s)'`` column holding the
            artist names of each track joined by commas.
        top_n: Number of artists to return (default 5); ``None`` returns
            every artist.

    Returns:
        A DataFrame with the columns ``'artist'``, ``'appearances'`` and
        ``'% of all'`` (share of the playlist's tracks, formatted as a
        string such as ``'22.37%'``), ordered by descending appearances.
    """
    # NOTE: names are recovered by splitting on ",", so an artist whose own
    # name contains a comma is counted as several artists.
    counts = Counter(
        name.strip()
        for names in playlist_df['artist(s)'].dropna()
        for name in names.split(',')
        if name.strip()  # a track without artists must not count "" as one
    )
    # Counting directly avoids DataFrame.squeeze(), which collapses a
    # single-artist result into a scalar that cannot be sorted.
    ranked = counts.most_common(top_n)

    artists = [name for name, _ in ranked]
    appearances = np.array([count for _, count in ranked], dtype=np.int64)
    percentage = np.around(100 * appearances / len(playlist_df), decimals=2)

    return pd.DataFrame({
        'artist': artists,
        'appearances': appearances,
        '% of all': [str(share) + '%' for share in percentage],
    })

# Return the top 5 albums with the most track appearances on a playlist
def top_albums_playlist(playlist_df):
    """Rank the five albums that contribute the most tracks to a playlist.

    Args:
        playlist_df: DataFrame with one row per track and an ``'album'``
            column holding the album name.

    Returns:
        A DataFrame with the columns ``'album'``, ``'appearances'`` and
        ``'% of all'`` (share of the playlist's tracks, formatted as a
        string such as ``'3.65%'``), ordered by descending appearances.
    """
    total_tracks = len(playlist_df)

    tracks_per_album = playlist_df.groupby(['album'])['album'].count()
    leaders = tracks_per_album.sort_values(ascending=False)[0:5]

    shares = np.around(100 * leaders.values / total_tracks, decimals=2)
    formatted_shares = [str(share) + '%' for share in shares]

    return pd.DataFrame({
        'album': leaders.keys(),
        'appearances': leaders.values,
        '% of all': formatted_shares,
    })

# Return all artists with their appearances
def all_top_artists(playlist_df):
    """Rank every artist of a playlist by the number of tracks they appear on.

    Args:
        playlist_df: DataFrame with an ``'artist(s)'`` column holding the
            artist names of each track joined by commas.

    Returns:
        A DataFrame with the columns ``'artist'``, ``'appearances'`` and
        ``'% of all'`` (share of the playlist's tracks as a float rounded
        to two decimals), ordered by descending appearances.
    """
    # NOTE: names are recovered by splitting on ",", so an artist whose own
    # name contains a comma is counted as several artists.
    counts = Counter(
        name.strip()
        for names in playlist_df['artist(s)'].dropna()
        for name in names.split(',')
        if name.strip()  # a track without artists must not count "" as one
    )
    # Counting directly avoids DataFrame.squeeze(), which collapses a
    # single-artist result into a scalar that cannot be sorted.
    ranked = counts.most_common()

    artists = [name for name, _ in ranked]
    appearances = np.array([count for _, count in ranked], dtype=np.int64)
    percentage = np.around(100 * appearances / len(playlist_df), decimals=2)

    return pd.DataFrame({
        'artist': artists,
        'appearances': appearances,
        '% of all': percentage,
    })

# Return the track with most popularity (0 to 100)
def most_popular_track(playlist_df):
    """Return the row of the track with the highest popularity.

    Args:
        playlist_df: DataFrame with a ``'popularity'`` column.

    Returns:
        The matching row as a ``pandas.Series``.
    """
    # idxmax() yields an index *label*, so the lookup must be label-based
    # (.loc); .iloc only matched by accident on a default 0..n-1 index.
    return playlist_df.loc[playlist_df["popularity"].idxmax()]

# Return track with most duration on the playlist
def longest_track(playlist_df):
    """Return the row of the track with the longest duration.

    Args:
        playlist_df: DataFrame with a ``'duration'`` column.

    Returns:
        The matching row as a ``pandas.Series``.
    """
    # idxmax() yields an index *label*, so the lookup must be label-based
    # (.loc); .iloc only matched by accident on a default 0..n-1 index.
    return playlist_df.loc[playlist_df["duration"].idxmax()]

# Return track with least duration on the playlist
def shortest_track(playlist_df):
    """Return the row of the track with the shortest duration.

    Args:
        playlist_df: DataFrame with a ``'duration'`` column.

    Returns:
        The matching row as a ``pandas.Series``.
    """
    # idxmin() yields an index *label*, so the lookup must be label-based
    # (.loc); .iloc only matched by accident on a default 0..n-1 index.
    return playlist_df.loc[playlist_df["duration"].idxmin()]

In [3]:
def print_stats_playlist(playlist_df):
    """Print the summary statistics of a playlist to standard output.

    Prints, in order: the top 5 artists, the top 5 albums, the most popular
    track, the longest track and the shortest track.

    Args:
        playlist_df: Track DataFrame of the playlist to summarise (as built
            by ``create_playlist_df``).
    """
    # Every statistic is computed from the argument; the function previously
    # read the global `playlist_best_of_all` and ignored its parameter.
    print("The top 5 artists with more tracks appereances on the playlist: ")
    print(top_artists_playlist(playlist_df))
    print("\n")
    print("The top 5 albums with more tracks the playlist: ")
    print(top_albums_playlist(playlist_df))
    print("\n")
    print("The track with most popularity (0 to 100) on the playlist: ")
    print(most_popular_track(playlist_df))
    print("\n")
    print("The track with most duration on the playlist: ")
    print(longest_track(playlist_df))
    print("\n")
    print("The track with less duration on the playlist: ")
    print(shortest_track(playlist_df))
    print("\n")

In [4]:
# Load every track of the example playlist into a DataFrame, then print
# its summary statistics.
playlist_best_of_all = create_playlist_df(
    "https://open.spotify.com/playlist/6PPayXaPylORShOqea9n9S?si=df9104c8c58349bc"
)
print_stats_playlist(playlist_best_of_all)

The top 5 artists with more tracks appereances on the playlist: 
         artist  appearances  % of all
0         Drake           98     22.37
1    Juice WRLD           50     11.42
2        Polo G           48     10.96
3  Travis Scott           47     10.73
4     Pop Smoke           28      6.39


The top 5 albums with more tracks the playlist: 
album
Certified Lover Boy                     16
ASTROWORLD                              15
Goodbye & Good Riddance                 11
If You're Reading This It's Too Late     9
Scorpion                                 9
Name: album, dtype: int64


The track with most popularity (0 to 100) on the playlist: 
id                 1rDQ4oMwGJI7B4tovsBOxc
name                          First Class
album                         First Class
duration                  0 days 00:02:53
popularity                             91
artist(s)                     Jack Harlow
album_artist(s)               Jack Harlow
Name: 373, dtype: object


The track with most 

This is basically the functionality of the project that is going to be implemented in a web page.

In [87]:
# Build the top-5 album table for the example playlist and show it as the
# cell output.
top = top_albums_playlist(playlist_best_of_all)
top

Unnamed: 0,album,appearances,% of all
0,Certified Lover Boy,16,3.65%
1,ASTROWORLD,15,3.42%
2,Goodbye & Good Riddance,11,2.51%
3,If You're Reading This It's Too Late,9,2.05%
4,Scorpion,9,2.05%


In [81]:
# Scratch cell: append a '%' suffix to every value of `top_new`.
# NOTE(review): `top_new` is not defined in any visible cell; presumably it
# was left over from an earlier session. Confirm before re-running.
type(top_new[0])  # result is discarded; only the final expression is shown
percentage = []
for i in top_new:
    percentage.append(str(i) + '%')
    
percentage

['22.37%', '11.42%', '10.96%', '10.73%', '6.39%']