In [21]:
from pathlib import Path

import pandas as pd
import plotly.express as px

In [22]:
# Load the Spotify streaming-history export and stack the parts into one frame.
# The export directory is defined once so that pointing the notebook at a
# different copy of the data is a single-line change.
DATA_DIR = Path(r'C:\Users\Ziv\Downloads\my_spotify_data\MyData')

df1 = pd.read_json(DATA_DIR / 'StreamingHistory0.json')
df2 = pd.read_json(DATA_DIR / 'StreamingHistory1.json')

# ignore_index=True renumbers the rows 0..n-1 across both parts.
df = pd.concat([df1, df2], axis=0, ignore_index=True)

In [23]:
# Display the concatenated streaming history (pandas abbreviates long frames)
df


Unnamed: 0,endTime,artistName,trackName,msPlayed
0,2022-09-28 12:03,Darko US,Infinite Beauty,198421
1,2022-09-28 12:06,Darko US,Evolving,176000
2,2022-09-28 12:09,Darko US,R.T.G.O.B.,165466
3,2022-09-28 12:13,Darko US,Ana,15954
4,2022-09-28 12:13,Darko US,Gantz,198333
...,...,...,...,...
13564,2023-10-01 11:15,Mental Cruelty,Forgotten Kings,1050
13565,2023-10-01 11:15,Mental Cruelty,Nordlys,1100
13566,2023-10-01 11:15,Venjent,Zone,9910
13567,2023-10-01 18:32,Mental Cruelty,Symphony of a Dying Star,89760


In [25]:
# Listening overview: minutes played per day, plus the 100 most-played tracks
# and the 100 most-played artists. Reads `df` and uses its endTime,
# artistName, trackName and msPlayed columns.

# Adding a column for minutes played (msPlayed / 1000 ms / 60 s)
df['minsPlayed'] = df['msPlayed'] / (1000 * 60)

# Group by each track and count how many times each track was played.
# A track is identified by (trackName, artistName), so same-titled tracks by
# different artists are counted separately.
df_grouped_tracks = df.groupby(["trackName", "artistName"]).size().reset_index(name="timesPlayed")

# Group by each artist and count how many times each artist was played
df_grouped_artists = df.groupby("artistName").size().reset_index(name="timesPlayed")

# Group by each day and sum how many minutes were played each day.
# The calendar day is derived on the fly and used only as the grouping key;
# df["endTime"] itself is NOT overwritten, so the time of day remains
# available to any other cell that reads df. The key Series is still named
# "endTime", so the result keeps the columns endTime / minsPlayed.
df_grouped_day = (
    df.groupby(pd.to_datetime(df["endTime"]).dt.date)["minsPlayed"]
    .sum()
    .reset_index()
)

# Sorting and get top 100 tracks and artists.
# trackLabel combines title and artist so that every (track, artist) row gets
# its own bar; plotting on trackName alone would merge same-titled tracks by
# different artists into a single stacked bar.
df_sorted_tracks = (
    df_grouped_tracks.sort_values("timesPlayed", ascending=False)
    .head(100)
    .assign(
        trackLabel=lambda top: top["trackName"].astype(str) + " - " + top["artistName"].astype(str)
    )
)
df_sorted_artists = df_grouped_artists.sort_values("timesPlayed", ascending=False).head(100)

# Time Series of listening behavior
fig = px.line(df_grouped_day, x='endTime', y='minsPlayed', title="Listening Behavior Over Time")
fig.show()

# Top 100 Tracks
fig = px.bar(
    df_sorted_tracks,
    x='trackLabel',
    y='timesPlayed',
    color='timesPlayed',
    labels={'trackLabel': 'Track'},
    title="Top 100 Tracks by Number of Plays",
)
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()

# Top 100 Artists
fig = px.bar(df_sorted_artists, x='artistName', y='timesPlayed', color='timesPlayed', title="Top 100 Artists by Number of Plays")
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()

In [16]:

# Parse endTime into pandas timestamps and record each play's calendar day
df["endTime"] = pd.to_datetime(df["endTime"])
df["Date"] = df["endTime"].dt.date

# Number of plays per (day, track name)
grouped_df = (
    df.groupby(["Date", "trackName"])
    .size()
    .reset_index(name="Count")
)

# Latest day first; within a day, highest play count first
sorted_df = grouped_df.sort_values(by=["Date", "Count"], ascending=False)

# Keep at most ten rows per day, in the sorted order established above
top_tracks = sorted_df.groupby("Date").head(10)

# Horizontal bar chart of the selected per-day counts
fig = px.bar(
    top_tracks,
    x="Count",
    y="trackName",
    orientation="h",
    title="Top streamed tracks",
)
fig.show()

In [17]:
# Display df again; it now carries the Date column added by the cell above
df

Unnamed: 0,endTime,artistName,trackName,msPlayed,Date
0,2022-09-28 12:03:00,Darko US,Infinite Beauty,198421,2022-09-28
1,2022-09-28 12:06:00,Darko US,Evolving,176000,2022-09-28
2,2022-09-28 12:09:00,Darko US,R.T.G.O.B.,165466,2022-09-28
3,2022-09-28 12:13:00,Darko US,Ana,15954,2022-09-28
4,2022-09-28 12:13:00,Darko US,Gantz,198333,2022-09-28
...,...,...,...,...,...
13564,2023-10-01 11:15:00,Mental Cruelty,Forgotten Kings,1050,2023-10-01
13565,2023-10-01 11:15:00,Mental Cruelty,Nordlys,1100,2023-10-01
13566,2023-10-01 11:15:00,Venjent,Zone,9910,2023-10-01
13567,2023-10-01 18:32:00,Mental Cruelty,Symphony of a Dying Star,89760,2023-10-01


In [19]:

# Keep only the rows whose trackName is not the empty string
df = df.loc[df["trackName"].ne("")]

# Count plays per track name and expose the result as a two-column frame
df_grouped = (
    df["trackName"]
    .value_counts()
    .rename_axis("trackName")
    .reset_index(name="timesPlayed")
)

# Order by play count (highest first) and keep the first 100 rows
df_sorted = df_grouped.sort_values(by="timesPlayed", ascending=False).iloc[:100]

# Bar chart of the 100 most-played track names, coloured by play count
fig = px.bar(
    df_sorted,
    x="trackName",
    y="timesPlayed",
    title="Top 100 Tracks by Number of Times Played",
    labels={"trackName": "Track Name", "timesPlayed": "Number of Times Played"},
    color="timesPlayed",
)
fig.show()

Unnamed: 0,endTime,artistName,trackName,msPlayed,Date
0,2022-09-28 12:03:00,Darko US,Infinite Beauty,198421,2022-09-28
1,2022-09-28 12:06:00,Darko US,Evolving,176000,2022-09-28
2,2022-09-28 12:09:00,Darko US,R.T.G.O.B.,165466,2022-09-28
3,2022-09-28 12:13:00,Darko US,Ana,15954,2022-09-28
4,2022-09-28 12:13:00,Darko US,Gantz,198333,2022-09-28
...,...,...,...,...,...
13564,2023-10-01 11:15:00,Mental Cruelty,Forgotten Kings,1050,2023-10-01
13565,2023-10-01 11:15:00,Mental Cruelty,Nordlys,1100,2023-10-01
13566,2023-10-01 11:15:00,Venjent,Zone,9910,2023-10-01
13567,2023-10-01 18:32:00,Mental Cruelty,Symphony of a Dying Star,89760,2023-10-01
