In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
pd.options.mode.chained_assignment = None

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [6]:
temp0 = pd.read_json('../MyData/StreamingHistory0.json')
temp1 = pd.read_json('../MyData/StreamingHistory1.json')
temp2 = pd.read_json('../MyData/StreamingHistory2.json')

# combining all streams
df_streams = temp0.append(temp1).append(temp2)
df_streams.shape

(22693, 4)

In [7]:
# converting the milliseconds values to minutes to improve readability
df_streams['minutes_played'] = round(df_streams['msPlayed'].divide(60000),2)

# renaming columns
columns_name = {'endTime':'end_time', 'artistName':'artist_name', 'trackName':'track_name', 
                'msPlayed':'milliseconds_played', 'minutes_played': 'minutes_played'}
df_streams.rename(columns= columns_name, inplace=True)

In [8]:
df_streams.head()

Unnamed: 0,end_time,artist_name,track_name,milliseconds_played,minutes_played
0,2020-05-14 21:52,OneRepublic,Apologize,188270,3.14
1,2020-05-14 21:52,AC/DC,Highway to Hell,1690,0.03
2,2020-05-14 21:52,Imagine Dragons,Believer,940,0.02
3,2020-05-14 21:55,Y2K,Lalala,160626,2.68
4,2020-05-14 21:56,Backstreet Boys,Shape of My Heart,930,0.02


***By skimming over the data, i've seen that some of the songs are just played while    
skipping to the next songs, hence I'm deleting the songs which are played less than 20 seconds  
I might increase it to 30 seconds.***

In [32]:
df_updated_streams =  df_streams[df_streams['minutes_played'] > 0.20]   

In [33]:
# converting the datatime format, to date, time, day, month values for better insights
df_updated_streams['end_time'] = df_updated_streams['end_time'].apply(pd.to_datetime)

df_updated_streams['date'] = df_updated_streams['end_time'].apply(lambda x: x.date())
df_updated_streams['time'] = df_updated_streams['end_time'].apply(lambda x: x.time())
df_updated_streams['day'] = pd.DatetimeIndex(df_updated_streams['end_time']).day_name()
df_updated_streams['month'] = pd.DatetimeIndex(df_updated_streams['end_time']).month_name()

In [35]:
total_min = round(df_updated_streams['minutes_played'].sum(),2)
total_hr = round(total_min/60,2)
total_day = round(total_hr/24,2)
min_date = min(df_updated_streams['date'])
max_date = max(df_updated_streams['date'])

print(f"Total music streamed = {total_min} minutes ~ {total_hr} hours ~ {total_day} days from {min_date} to {max_date}")

Total music streamed = 34454.88 minutes ~ 574.25 hours ~ 23.93 days from 2020-05-14 to 2021-05-15


In [9]:
grp_date = pd.DataFrame(df_updated_streams.groupby(by='date')['minutes_played'].sum()).reset_index()

fig = px.line(grp_date, x='date', y='minutes_played', title='Stream Timeline (May-2020 to May 2021)',
        labels={'date': 'Timeline', 'minutes_played': 'Minutes Played'}, width=600, height=400,
            color_discrete_sequence=px.colors.sequential.Aggrnyl)
fig.show()

On January 11th, 2021, I spent a  whopping *702.21 minutes (~12 hours)* on Spotify —   
the most in the past year! Surprisingly that was the 1st day of my school term, I couldn't think of anything else!


In [10]:

grp_month = pd.DataFrame(df_updated_streams.groupby(by='month')['minutes_played'].sum()).reset_index()

grp_month['minutes_played'] = round(grp_month['minutes_played'],2) 

# tried ordering months but not working
#month_order = ['May', 'June', 'July', 'August', 'September', 'October', 'November', 'December','January', 'February', 'March', 'April']
#grp_month['month'] = pd.Categorical(grp_month['month'], categories=month_order, ordered=True).sort_values()

# Line Chart
fig = px.line(grp_month, x='month', y='minutes_played', text='minutes_played', title='Songs streamed per Month',
        labels={'month': 'Month', 'minutes_played': 'Minutes Played'}, width=600, height=400,
            color_discrete_sequence=px.colors.sequential.Aggrnyl)
fig.update_traces(texttemplate='%{text:.2s}')
fig.update_layout(showlegend=False)
fig.show()

# Pie Chart
#fig = px.pie(grp_month, values='minutes_played', names='month', title='Songs played by Month')
#fig.update_traces(textposition='inside', textinfo='percent+label')
#fig.show()

I streamed more than 3500 minutes (~59 hours) of music in October and November, that's  understandable because   
I was on my vacation which included almost 3000 km (~1864 miles) of driving while playing music.

In [11]:
grp_day = pd.DataFrame(df_updated_streams.groupby(by='day')['minutes_played'].sum()).reset_index()
grp_day['minutes_played'] = round(grp_day.minutes_played, 2)

# Bar Plot
#fig = px.bar(grp_day, x='day', y='minutes_played', text='minutes_played', title='Songs streamed by day')
#fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
#fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
#fig.update_xaxes(categoryorder='total descending')
#fig.show()

# Pie Chart
fig = px.pie(grp_day, values='minutes_played', names='day', title='% Songs played per Day', width=600, height=400, color_discrete_sequence=px.colors.sequential.Aggrnyl)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(showlegend=False)
fig.show()

I listen to music at my part-time job, which is on weekends, so it makes sense that  
I spent most of my time listening to music on weekends.

In [12]:
top_artists = pd.DataFrame(df_updated_streams.groupby(by='artist_name')['minutes_played'].sum().sort_values(ascending=False)[:10]).reset_index()

top_arstist_no_songs = pd.DataFrame(df_updated_streams.groupby(by='artist_name')['track_name'].count().sort_values(ascending=False)[:10]).reset_index()

# fig = px.bar(top_artists, x='artist_name', y='minutes_played', title='Top 10 artists by total time songs played', text='minutes_played',
#                 labels={'artist_name':'Artist Name', 'minutes_played':'Minutes Played'}, width=600, height=400,
#                 color_discrete_sequence=px.colors.sequential.Aggrnyl)
# fig.update_traces(texttemplate='%{text:.5s} mins', textposition='inside')
# fig.show()

In [13]:


# fig = px.bar(top_arstist_no_songs, x='artist_name', y='track_name', title='Top 10 artists by no. of times songs played',                                    text='track_name',labels={'artist_name':'Artist Name', 'track_name':'No of songs played'}, width=600, height=400,
#                color_discrete_sequence=px.colors.sequential.Aggrnyl)
# fig.update_traces(texttemplate='%{text:.2s} songs', textposition='inside')
# fig.show()

In [14]:
fig = make_subplots(rows=1, cols=2)

fig.add_trace(go.Bar(x=top_artists['artist_name'], y=top_artists['minutes_played'],text=top_artists['minutes_played'],
                texttemplate='%{text:.5s} mins', name='By total time songs played'), row=1,col=1)

fig.add_trace(go.Bar(x=top_arstist_no_songs['artist_name'], y=top_arstist_no_songs['track_name'], text=top_arstist_no_songs['track_name'],
                texttemplate='%{text} times', name='By no. of times songs played'), row=1,col=2)

fig.update_layout(height=400, width=1000, title_text='Top 10 Artists')
fig.update_traces(textposition='inside')
fig.show()

According to the data, I do have some favorite artists.   
Following two factors were used:  
    1. Number of times an artist’s song played.       
    2. Total time spent listening to their songs.   
This list could change next year because of my habit of getting hooked on a song and listening to it repeatedly to bore me.    

*Artists: Arijit Singh, Pritam, A.R Rahman, Post Malone, Prateek Kuhad, Ed Sheeran maintain the top 5 spot.*


In [15]:
top_songs = pd.DataFrame(df_updated_streams.groupby(by=['track_name','artist_name'])['minutes_played'].sum().sort_values(ascending=False)[:10]).reset_index()

# remove artist names from the songs
top_songs['track_name'] = top_songs.track_name.apply(lambda x: x.split('(')[0].split('-')[0].strip())

fig = px.bar(top_songs, x='track_name', y='minutes_played', title='Top 10 songs', text='minutes_played', color='artist_name',
                 labels={'track_name':'Song Names', 'minutes_played':'Minutes Played', 'artist_name':'Artist Names'},
                 color_discrete_sequence=px.colors.sequential.RdBu)

fig.update_xaxes(tickangle=45)
fig.update_traces(texttemplate='%{text:.5s} mins', textposition='inside')
fig.show()

In [16]:
top_songs_no_played = pd.DataFrame(df_updated_streams.groupby(by=['track_name','artist_name'])['minutes_played'].count().sort_values(ascending=False)[:10]).reset_index()

# remove artist names from the songs
top_songs_no_played['track_name'] = top_songs_no_played.track_name.apply(lambda x: x.split('(')[0].split('-')[0].strip())

fig = px.bar(top_songs_no_played, x='track_name', y='minutes_played', title='Top 10 songs', text='minutes_played', color='artist_name',
                 labels={'track_name':'Song Names', 'minutes_played':'No. of Times Played', 'artist_name':'Artist Names'},
                 color_discrete_sequence=px.colors.sequential.RdBu)

fig.update_xaxes(tickangle=45)
fig.update_traces(texttemplate='%{text} times', textposition='inside')
fig.show()

There is a huge difference in the above two graphs, which includes my top 10 songs based on No. of times and Total Minutes it is played.    After some investigation, I figured out that songs such as (Do you?, What's Poppin, etc) are small in length so even though I have heard them a lot, <br>         the total time is less than others.    
But, **Modern Loneliness by Lauv** tops both the criteria, it makes sense as it is a lockdown song and was aligned with the situation.<br>
Strangely, only one song (Kalank) from my top 10 favorite songs list matches my top Artists (Arijit Singh) list above.

In [17]:
# fig = make_subplots(rows=1, cols=2)


# fig.add_trace(go.Bar(x=top_songs['track_name'], y=top_songs['minutes_played'], text=top_songs['minutes_played'],
#                 texttemplate='%{text: .5s} mins', name='By no. of Minutes played'), row=1,col=1)

# fig.add_trace(go.Bar(x=top_songs_no_played['track_name'], y=top_songs_no_played['minutes_played'],text=top_songs_no_played['minutes_played'],
#                 texttemplate='%{text} times', name='By No. of times song played'), row=1,col=2)

# fig.update_layout(height=400, width=1000, title_text='Top 10 Artists')
# fig.update_traces(textposition='inside')
# fig.show()

In [18]:
df_updated_streams.to_csv('../MyData/mystreams.csv', index=False)