# Setup

## Libs and env preparation

In [2]:
import pandas as pd
import os
import numpy as np
import requests
import spotipy
from itertools import chain
from datetime import datetime
import seaborn as sns

In [2]:
%load_ext dotenv

In [3]:
%dotenv

In [4]:
base_path = r'C:\Users\Ale\Desktop\Data\MyData\\'

In [5]:
USERNAME = os.environ.get('USERNAME')
CLIENT_ID =os.environ.get('CLIENT_ID')
CLIENT_SECRET = os.environ.get('CLIENT_SECRET')
REDIRECT_URI = os.environ.get('REDIRECT_URI')

AUTH_URL = 'https://accounts.spotify.com/api/token'
URL_TRACKS = 'tracks/'
BASE_URL = 'https://api.spotify.com/v1/'

## Importing the data


In [34]:
streaming = pd.read_csv(base_path+'streaming.csv')
streaming.head()

Unnamed: 0,endTime,artistName,trackName,msPlayed,year,month,hoursPlayed
0,2020-10-17 20:55,Don Diablo,Mr. Brightside,193339,2020,10,0.053705
1,2020-10-18 16:47,Don Diablo,Mr. Brightside,17800,2020,10,0.004944
2,2020-10-18 16:51,Unknown Artist,Unknown Track,220148,2020,10,0.061152
3,2020-10-18 16:53,3 Are Legend,Khaleesi,165000,2020,10,0.045833
4,2020-10-18 16:57,Afrojack,Ten Feet Tall,229149,2020,10,0.063653


In [12]:
streaming.year.unique() # Checking that I only have data from 20/21

array([2020, 2021], dtype=int64)

In [13]:
streaming.tail()

Unnamed: 0,endTime,artistName,trackName,msPlayed,year,month,hoursPlayed
27569,2021-10-18 22:32,Jan Blomqvist,Dancing People Are Never Wrong - Extended Mix,853,2021,10,0.000237
27570,2021-10-18 22:35,Jan Blomqvist,The Space In Between - Ben Böhmer Remix,3667,2021,10,0.001019
27571,2021-10-18 22:35,LISA,MONEY,168227,2021,10,0.04673
27572,2021-10-18 22:35,LISA,MONEY,469,2021,10,0.00013
27573,2021-10-18 22:35,Jan Blomqvist,Dancing People Are Never Wrong (Live In Munich...,1877,2021,10,0.000521


In [35]:
library = pd.read_csv(base_path+'library.csv')
library.head()

Unnamed: 0,artist,album,trackName,uri
0,DVBBS,Somebody Like You,Somebody Like You,spotify:track:1wojSLvDvnie1dVaRiMxjS
1,Alan Fitzpatrick,We Do What We Want,We Do What We Want,spotify:track:4gve8Gz2iNDnOIfqQK2dAI
2,French Montana,Unforgettable (feat. Swae Lee),Unforgettable (feat. Swae Lee) - Major Lazer R...,spotify:track:5Xl8hwINbHKEYBHIU0WL1i
3,Maxim Lany,Renaissance,Renaissance,spotify:track:5DprKz35pYir1JQN75V28n
4,K├Âlsch,Now Here No Where,Shoulder Of Giants,spotify:track:0D9WPuMYmnsN7Z1kweZBy1


In [15]:
library.tail()

Unnamed: 0,artist,album,trackName,uri
769,J. Cole,The Off-Season,i n t e r l u d e,spotify:track:08LwMWf5Tcfsd752EPzFV0
770,"Tyler, The Creator",BEST INTEREST,BEST INTEREST,spotify:track:3jHdKaLCkuNEkWcLVmQPCX
771,Dimitri Vegas & Like Mike,Arcade,Arcade - Radio Edit,spotify:track:6ToupFpZbiTiRGEF2vVuzU
772,The Lumineers,The Lumineers,Ho Hey,spotify:track:1jdNcAD8Ir58RlsdGjJJdx
773,Diplo,One By One,One By One - Vintage Culture Remix,spotify:track:4V8PWNwHoiJU2WWWg2iQTC


In [16]:
streaming.shape 

(27574, 7)

In [17]:
library.shape

(774, 4)

# Analysis

## The top 100 most played tracks from the past year (October 2020 - 2021)

About the most played tracks, I need to be careful.
I wanna know the most played tracks by frequency, not exactly by hours played. But why? 'Cause I usually like to listen long tracks (5~10 minutes). So a track that has 10 minutes could count as most played even if I havent linstened it so much in the past year.

In [91]:
top_played = streaming[['artistName', 'trackName']].value_counts().to_frame('count')
top_played[:100]

Unnamed: 0_level_0,Unnamed: 1_level_0,count
artistName,trackName,Unnamed: 2_level_1
Bad Bunny,LA NOCHE DE ANOCHE,81
Kendrick Lamar,Money Trees,79
NTO,Invisible - Paul Kalkbrenner Remix,74
Olivia Rodrigo,drivers license,73
Keyshia Cole,Last Night,71
...,...,...
Jorja Smith,Gone,30
Funky Fool,Break My Heart,30
Paul Kalkbrenner,Sky and Sand,30
Tujamo,Take Control,30


In [99]:
top_played.tail(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,count
artistName,trackName,Unnamed: 2_level_1
Jazmine Sullivan,Girl Like Me (feat. H.E.R.),1
Jaymes Young,Infinity,1
Jaymes Young,Happiest Year,1
Jayda G,All I Need (DJ-Kicks) - Edit,1
Jay Wheeler,"La Curiosidad (feat. Dj Nelson, Jhay Cortez, Lunay & Kendo Kaponi) - Blue Grand Prix Remix",1
Jay Rock,Easy Bake (feat. Kendrick Lamar & SZA),1
Jay Pryor,Say Something (Club Mix),1
Jay Hardway,Vibes,1
Jay Hardway,Run Baby Run,1
ØGM,I Told You,1


In [92]:
top_played[top_played > 40].dropna()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
artistName,trackName,Unnamed: 2_level_1
Bad Bunny,LA NOCHE DE ANOCHE,81.0
Kendrick Lamar,Money Trees,79.0
NTO,Invisible - Paul Kalkbrenner Remix,74.0
Olivia Rodrigo,drivers license,73.0
Keyshia Cole,Last Night,71.0
Lost Frequencies,Here with You - Mastrovita X Mordkey Remix,67.0
Nora En Pure,Epiphany,64.0
Childish Gambino,Redbone,63.0
Frank Ocean,Nights,58.0
Anyma,Claire,56.0


In the selection below I'm selecting the three features and how many times they appearing *togheter*. It's up to note that it's a different subset from the ones above.

In [94]:
top_played_ms = streaming[['artistName', 'trackName', 'msPlayed']].value_counts().to_frame('count')
top_played_ms[:100]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
artistName,trackName,msPlayed,Unnamed: 3_level_1
Bad Bunny,LA NOCHE DE ANOCHE,203200,54
Olivia Rodrigo,drivers license,242013,49
Lost Frequencies,Here with You - Mastrovita X Mordkey Remix,195000,48
Kendrick Lamar,Money Trees,386906,48
Paramore,Misery Business,211520,45
...,...,...,...
Xamã,Câncer,236000,20
Paul Kalkbrenner,Altes Kamuffel,501252,20
Lost Frequencies,St. Peter (Mix Cut) - Deluxe Mix,230410,20
Dayne S,Reminiscence,321995,20


In [100]:
top_played_ms.tail(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
artistName,trackName,msPlayed,Unnamed: 3_level_1
Gilberto Gil,Água (Ao Vivo),242873,1
Gilsons,Devagarinho,12550,1
Giolì & Assia,Blame on Me - Club Edit,280000,1
Giorgia Angiuli,All The Little Things - Undercatt Remix,414019,1
Giorgia Angiuli,Love The Noise - Radio Edit,46154,1
Giveon,Heartbreak Anniversary,5570,1
Giveon,Heartbreak Anniversary,17770,1
Giveon,Heartbreak Anniversary,19230,1
Giveon,Heartbreak Anniversary,23750,1
ØGM,I Told You,86152,1


This case below is a case when I repeatedly played the same song but in different albuns - actually they're not the same, since one is a live version. 

In [41]:
len(streaming.loc[streaming['trackName'] == 'Dancing People Are Never Wrong - Extended Mix'])

17

In [43]:
streaming.iloc[27573].trackName

'Dancing People Are Never Wrong (Live In Munich) - Mixed'

In [44]:
len(streaming.loc[streaming['trackName'] == 'Dancing People Are Never Wrong (Live In Munich) - Mixed'])

26

## Hours spend listening Spotify

In [88]:
total_hours = streaming['hoursPlayed'].sum()
total_days = total_hours/(24)
print("I spent {:.2f} hours listening music and podcasts this past year or {:.2f} days from a year".format(total_hours, total_days))   

I spent 1183.56 hours listening music and podcasts this past year or 49.31 days from a year


It's not _exactly_ the hours I spent listening tracks, but it works for now.

## Top 5 most played artists 

Actually for this I'll have the artist's song - which is the person who wrote it, not exactly the one who sings it

In [104]:
top_played_artists = streaming[['artistName']].value_counts().to_frame('count')
top_played_artists[:100]

Unnamed: 0_level_0,count
artistName,Unnamed: 1_level_1
Lost Frequencies,881
Martin Garrix,601
Nora En Pure,541
RÜFÜS DU SOL,512
Kanye West,444
...,...
Ben Böhmer,62
DaniLeigh,62
Ytram,61
Tale Of Us,61


Top 10 

In [106]:
top_played_artists.head(10)

Unnamed: 0_level_0,count
artistName,Unnamed: 1_level_1
Lost Frequencies,881
Martin Garrix,601
Nora En Pure,541
RÜFÜS DU SOL,512
Kanye West,444
JAY-Z,412
Armin van Buuren,400
Dimitri Vegas & Like Mike,389
Paul Kalkbrenner,370
Jan Blomqvist,353


Bottom 10 

In [105]:
top_played_artists.tail(10)

Unnamed: 0_level_0,count
artistName,Unnamed: 1_level_1
FR!ES,1
FRENSHIP,1
Fabian Luttenberger,1
My Morning Jacket,1
Mustard,1
Murtagh,1
Murge,1
Mumbay,1
Fabrication,1
L.L.A.M.A,1


In [109]:
top_played_artists[top_played_artists > 55].dropna()

Unnamed: 0_level_0,count
artistName,Unnamed: 1_level_1
Lost Frequencies,881.0
Martin Garrix,601.0
Nora En Pure,541.0
RÜFÜS DU SOL,512.0
Kanye West,444.0
...,...
Aya Nakamura,57.0
Black Eyed Peas,57.0
Joachim Pastor,57.0
Outkast,56.0


## Top 5 genres most played

For the genres, I'll need to fetch the track data on Spotify API in order to get it.

## Preparing data to fetch the API

In [36]:
streaming.columns

Index(['endTime', 'artistName', 'trackName', 'msPlayed', 'year', 'month',
       'hoursPlayed'],
      dtype='object')

In [45]:
streaming.head()

Unnamed: 0,endTime,artistName,trackName,msPlayed,year,month,hoursPlayed
0,2020-10-17 20:55,Don Diablo,Mr. Brightside,193339,2020,10,0.053705
1,2020-10-18 16:47,Don Diablo,Mr. Brightside,17800,2020,10,0.004944
2,2020-10-18 16:51,Unknown Artist,Unknown Track,220148,2020,10,0.061152
3,2020-10-18 16:53,3 Are Legend,Khaleesi,165000,2020,10,0.045833
4,2020-10-18 16:57,Afrojack,Ten Feet Tall,229149,2020,10,0.063653


In [46]:
library.columns

Index(['artist', 'album', 'trackName', 'uri'], dtype='object')

In [47]:
library.head()

Unnamed: 0,artist,album,trackName,uri
0,DVBBS,Somebody Like You,Somebody Like You,spotify:track:1wojSLvDvnie1dVaRiMxjS
1,Alan Fitzpatrick,We Do What We Want,We Do What We Want,spotify:track:4gve8Gz2iNDnOIfqQK2dAI
2,French Montana,Unforgettable (feat. Swae Lee),Unforgettable (feat. Swae Lee) - Major Lazer R...,spotify:track:5Xl8hwINbHKEYBHIU0WL1i
3,Maxim Lany,Renaissance,Renaissance,spotify:track:5DprKz35pYir1JQN75V28n
4,K├Âlsch,Now Here No Where,Shoulder Of Giants,spotify:track:0D9WPuMYmnsN7Z1kweZBy1


In [52]:
my_data = streaming.copy()
# This code I grab from the reference about spotify and Tableau and it's pretty useful. 
# It attributes 1 to the common, 0 for the other
my_data['Common'] = np.where(my_data['trackName'].isin(library['trackName'].tolist()), 1, 0)
my_data.head()

Unnamed: 0,endTime,artistName,trackName,msPlayed,year,month,hoursPlayed,Common
0,2020-10-17 20:55,Don Diablo,Mr. Brightside,193339,2020,10,0.053705,0
1,2020-10-18 16:47,Don Diablo,Mr. Brightside,17800,2020,10,0.004944,0
2,2020-10-18 16:51,Unknown Artist,Unknown Track,220148,2020,10,0.061152,0
3,2020-10-18 16:53,3 Are Legend,Khaleesi,165000,2020,10,0.045833,0
4,2020-10-18 16:57,Afrojack,Ten Feet Tall,229149,2020,10,0.063653,0


In [53]:
my_data = pd.merge(my_data, library[['album', 'trackName', 'uri']], how='left', on=['trackName'])
my_data.head()

Unnamed: 0,endTime,artistName,trackName,msPlayed,year,month,hoursPlayed,Common,album,uri
0,2020-10-17 20:55,Don Diablo,Mr. Brightside,193339,2020,10,0.053705,0,,
1,2020-10-18 16:47,Don Diablo,Mr. Brightside,17800,2020,10,0.004944,0,,
2,2020-10-18 16:51,Unknown Artist,Unknown Track,220148,2020,10,0.061152,0,,
3,2020-10-18 16:53,3 Are Legend,Khaleesi,165000,2020,10,0.045833,0,,
4,2020-10-18 16:57,Afrojack,Ten Feet Tall,229149,2020,10,0.063653,0,,


In [55]:
common = my_data[my_data['Common']==1]
common

Unnamed: 0,endTime,artistName,trackName,msPlayed,year,month,hoursPlayed,Common,album,uri
6,2020-10-18 17:06,Alan Walker,"Alone, Pt. II",179052,2020,10,0.049737,1,"Alone, Pt. II",spotify:track:0bMbDctzMmTyK2j74j3nF3
7,2020-10-18 17:07,Alan Walker,All Falls Down (feat. Juliander),34356,2020,10,0.009543,1,Different World,spotify:track:1HvCFAUIWQsWV9zud3UhDl
9,2020-10-18 17:07,Alan Walker,Darkside,1470,2020,10,0.000408,1,Different World,spotify:track:6SRWhUJcD2YKahCwHavz3X
10,2020-10-18 17:10,Alan Walker,The Spectre,193787,2020,10,0.053830,1,The Spectre,spotify:track:2DGa7iaidT5s0qnINlwMjJ
11,2020-10-18 17:10,Alok,Vale Vale,1480,2020,10,0.000411,1,Vale Vale,spotify:track:3bTjd1iQJvsfIoH8v7A0Gy
...,...,...,...,...,...,...,...,...,...,...
28047,2021-10-18 22:32,J. Cole,c l o s e,133034,2021,10,0.036954,1,The Off-Season,spotify:track:4YiY551vHi6glMtgVxuqAy
28049,2021-10-18 22:32,Jan Blomqvist,Dancing People Are Never Wrong - Extended Mix,853,2021,10,0.000237,1,Dancing People Are Never Wrong,spotify:track:64uiYolN19fGGFrXn3Su94
28051,2021-10-18 22:35,LISA,MONEY,168227,2021,10,0.046730,1,LALISA,spotify:track:7hU3IHwjX150XLoTVmjD0q
28052,2021-10-18 22:35,LISA,MONEY,469,2021,10,0.000130,1,LALISA,spotify:track:7hU3IHwjX150XLoTVmjD0q


In [57]:
common.shape[0]

10750

In [74]:
top_100 = common.artistName.value_counts()[:100]
top_100

Lost Frequencies    653
Martin Garrix       416
Kanye West          305
RÜFÜS DU SOL        303
Nora En Pure        291
                   ... 
Aspyer               28
Blackstreet          28
Boris Brejcha        28
J Balvin             28
S1mba                28
Name: artistName, Length: 100, dtype: int64

In [None]:
condition = common.

In [73]:
common_unique_uri = pd.DataFrame(common.uri.unique(), columns=['uri'])
common_unique_uri

Unnamed: 0,uri
0,spotify:track:0bMbDctzMmTyK2j74j3nF3
1,spotify:track:1HvCFAUIWQsWV9zud3UhDl
2,spotify:track:6SRWhUJcD2YKahCwHavz3X
3,spotify:track:2DGa7iaidT5s0qnINlwMjJ
4,spotify:track:3bTjd1iQJvsfIoH8v7A0Gy
...,...
709,spotify:track:3zGpHUsSZ0xLRdDlbWbv5G
710,spotify:track:3ExRfIXdYwYFjRzsCw7L7S
711,spotify:track:2g95XDCx4GqcaJPv7TTk8C
712,spotify:track:4gvrJnKCKIPiacNsWVQwEU


In [68]:
artist_common = pd.DataFrame(common.artistName.unique(), columns=['artist'])
artist_common.sort_values(['artist']) 

Unnamed: 0,artist
334,2raumwohnung
289,A R I Z O N A
237,A-Trak
204,AC/DC
231,AREA21
...,...
107,chromonicci
55,illusionize
223,loafers
293,problem solved


In [71]:
uri_params = common['uri']
uri_params

6        spotify:track:0bMbDctzMmTyK2j74j3nF3
7        spotify:track:1HvCFAUIWQsWV9zud3UhDl
9        spotify:track:6SRWhUJcD2YKahCwHavz3X
10       spotify:track:2DGa7iaidT5s0qnINlwMjJ
11       spotify:track:3bTjd1iQJvsfIoH8v7A0Gy
                         ...                 
28047    spotify:track:4YiY551vHi6glMtgVxuqAy
28049    spotify:track:64uiYolN19fGGFrXn3Su94
28051    spotify:track:7hU3IHwjX150XLoTVmjD0q
28052    spotify:track:7hU3IHwjX150XLoTVmjD0q
28053    spotify:track:3ExRfIXdYwYFjRzsCw7L7S
Name: uri, Length: 10750, dtype: object

In [8]:
def get_token_scope(scope:str) -> str:
    return spotipy.util.prompt_for_user_token(username=USERNAME,
                                           scope=scope,
                                           client_id=CLIENT_ID,
                                           client_secret=CLIENT_SECRET,
                                           redirect_uri=REDIRECT_URI
                                           )

In [9]:
def get_headers(token) -> dict:
    return {
        'Accept': 'application/json',
        'Content-Type': 'application/json',
        'Authorization': 'Bearer {token}'.format(token=token),
        }

In [10]:
def get_request(url:str, headers: dict, params: dict) -> dict:
    try:
        response = requests.get(url, 
                    headers = headers, params = params)
        json = response.json()
        first_result = json['items']
        return first_result
    except:
        return None

The genres attribute is related to the artist, not the track. So I can access it getting several artists or getting the artists from the tracks.

In [6]:
def get_attributes_from_tracks(data: list):
    track_uri = []
    track_name = []
    duration = []
    artist_name = []
    genres = []
    popularity = []
    album_name = []
    preview_url = []
    
    for index in range(len(data)):
        track_uri.append(data[index]['uri'])
        track_name.append(data[index]['name'])
        duration.append(data[index]['duration_ms'])
        artist_name.append([k['name'] for k in list(chain(data[index]['artists']))])
        genres.append([k['genres'] for k in list(chain(data[index]['artists']))])
        popularity.append(data[index]['popularity'])
        album_name.append(data[index]['album']['name'])
        preview_url.append(data[index]['preview_url'])
    
    return track_uri, track_name, duration, genres, artist_name, popularity, album_name, preview_url

In [7]:
def create_df_tracks(track_uri: list, track_name: list, duration: list, genres: list, artist_name: list,
                    popularity: list, album_name: list, preview_url:list) -> pd.DataFrame:
    
    selected_data = {}
    
    selected_data[columns_tracks[0]] = track_uri
    selected_data[columns_tracks[1]] = track_name
    selected_data[columns_tracks[2]] = duration
    selected_data[columns_tracks[3]] = genres
    selected_data[columns_tracks[4]] = artist_name
    selected_data[columns_tracks[5]] = popularity
    selected_data[columns_tracks[6]] = album_name
    selected_data[columns_tracks[7]] = preview_url
    

    return pd.DataFrame.from_dict(selected_data)

In [None]:
token = get_token_scope(BASE_URL)

In [None]:
headers = get_headers(token)

In [None]:
artists = get_request(BASE_URL+URL_TRACKS, headers, params = {'time_range': 'long_term', 'limit': '50'})

# Further analysis


- Distribution of msPlayed per day based on my streaming
- Artists/genres per country - NLP processing