Import/clean data

In [50]:
import pandas as pd
import numpy as np

In [9]:
# gen. file names
files = [f"./data/StreamingHistory{i}.json" for i in range(0,4)]
files

['./data/StreamingHistory0.json',
 './data/StreamingHistory1.json',
 './data/StreamingHistory2.json',
 './data/StreamingHistory3.json']

In [14]:
# load JSON files into single df
df = pd.concat([pd.read_json(file) for file in files], axis=0)
df.head()

Unnamed: 0,endTime,artistName,trackName,msPlayed
0,2020-02-28 00:03,Lane 8,Road - Dirty South Remix,65547
1,2020-02-28 00:05,OTHERLiiNE,One Line,301
2,2020-02-28 00:05,Lane 8,Road - Dirty South Remix,394
3,2020-02-28 00:05,Dubfire,Lotus - Dub,85485
4,2020-02-28 00:05,Four Tet,Teenage Birdsong,1044


In [15]:
# inspect
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39193 entries, 0 to 9192
Data columns (total 4 columns):
endTime       39193 non-null object
artistName    39193 non-null object
trackName     39193 non-null object
msPlayed      39193 non-null int64
dtypes: int64(1), object(3)
memory usage: 1.5+ MB


In [89]:
# convert to datetime
df['endTime'] = pd.to_datetime(df['endTime'])

# create just date col
df['date'] = df['endTime'].dt.date

# examine time range
display((df['date'].max(), df['date'].min()))

df['date'].max() - df['date'].min()

(datetime.date(2021, 2, 28), datetime.date(2020, 2, 28))

datetime.timedelta(366)

39193 entries for 1 year's worth of data.

Let's examine top songs for 1 year of pandemmy listening!

In [90]:
# create minutes played, this will emlimate skips hopefully
df['minPlayed'] = round(df['msPlayed']/60000, 2)

# preview
df[['minPlayed', 'msPlayed']].head()

Unnamed: 0,minPlayed,msPlayed
0,1.09,65547
1,0.01,301
2,0.01,394
3,1.42,85485
4,0.02,1044


In [91]:
# view by most time played artist
df.groupby('artistName').agg({'trackName': 'nunique', 'minPlayed': 'sum'}).sort_values(by='minPlayed', ascending=False)

Unnamed: 0_level_0,trackName,minPlayed
artistName,Unnamed: 1_level_1,Unnamed: 2_level_1
Lane 8,60,1100.11
Yagya,30,965.12
Moby,19,859.37
Peter Cat Recording Co.,12,694.40
Dolly Parton,29,671.39
...,...,...
Valentini,1,0.00
Benny Benassi,1,0.00
Cher,1,0.00
The Equatics,1,0.00


In [92]:
# view most tracks played
df.groupby('artistName').agg({'trackName': 'nunique', 'minPlayed': 'sum'}).sort_values(by='trackName', ascending=False)

Unnamed: 0_level_0,trackName,minPlayed
artistName,Unnamed: 1_level_1,Unnamed: 2_level_1
Jim Gaffigan,140,460.01
Lane 8,60,1100.11
Frankie Valli & The Four Seasons,50,417.60
U.S. Girls,50,606.02
Stephan Bodzin,36,523.99
...,...,...
Chris Isaak,1,0.19
Les Rita Mitsouko,1,63.12
Chris Cohen,1,13.24
Chris Coco,1,7.52


In [109]:
# view by most time played song
top_songs = df.groupby('trackName')[['minPlayed']].sum().sort_values(by='minPlayed', ascending=False)
top_songs['hrPlayed'] = round(top_songs['minPlayed'] / 60, 2)
top_songs

Unnamed: 0_level_0,minPlayed,hrPlayed
trackName,Unnamed: 1_level_1,Unnamed: 2_level_1
A New Error,331.30,5.52
LA12,277.75,4.63
Little Raver,277.63,4.63
Another Dub In The Sun,261.73,4.36
He Would Have Laughed,260.80,4.35
...,...,...
So Far,0.00,0.00
KEEP MOVING,0.00,0.00
Ain't Deep Enough - DES3ETT Remix,0.00,0.00
Strong Enough,0.00,0.00


In [93]:
# TODO - calc max listening
# by date, song quantity
df.groupby('date').agg({'trackName': 'nunique', 'minPlayed': 'sum'}).sort_values(by='trackName', ascending=False)

Unnamed: 0_level_0,trackName,minPlayed
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-02-07,585,682.03
2021-01-29,463,476.12
2020-10-10,429,258.86
2020-05-08,403,567.66
2020-08-22,367,379.55
...,...,...
2020-04-15,2,24.47
2020-07-01,2,3.90
2020-10-29,2,11.03
2020-03-24,1,0.16


In [94]:
# by date, minutes listened
df.groupby('date').agg({'trackName': 'nunique', 'minPlayed': 'sum'}).sort_values(by='minPlayed', ascending=False)

Unnamed: 0_level_0,trackName,minPlayed
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-10-11,332,836.80
2021-01-23,327,802.59
2020-05-09,322,747.19
2020-09-25,46,724.85
2020-06-07,264,720.29
...,...,...
2020-10-29,2,11.03
2020-06-03,3,10.11
2020-07-01,2,3.90
2020-04-17,1,3.37
