In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Reading the data

In [2]:
# Load the three streaming-history JSON exports (parts 0, 1 and 2), one
# DataFrame per file.
history, history_1, history_2 = map(
    pd.read_json,
    (
        'data/StreamingHistory0.json',
        'data/StreamingHistory1.json',
        'data/StreamingHistory2.json',
    ),
)

In [3]:
# Peek at the last five rows of the first export to see where it ends.
history.tail()

Unnamed: 0,endTime,artistName,trackName,msPlayed
9995,2021-08-26 16:11,AC/DC,Dirty Deeds Done Dirt Cheap,231933
9996,2021-08-26 16:21,Jimi Hendrix,All Along the Watchtower,52293
9997,2021-08-26 16:24,Alberto,Z BRATEM ZARABIAM PAPIER,145070
9998,2021-08-26 16:24,Alberto,Z BRATEM ZARABIAM PAPIER,19655
9999,2021-08-26 16:26,Dj.Frodo,Kawasaki,5174


In [4]:
# Peek at the last five rows of the third export to see where it ends.
history_2.tail()

Unnamed: 0,endTime,artistName,trackName,msPlayed
4554,2022-04-15 21:21,Rammstein,Du hast,234226
4555,2022-04-15 21:21,Death,Spirit Crusher,5547
4556,2022-04-15 21:24,Gojira,The Cell,197906
4557,2022-04-15 21:29,Gojira,Stranded,265266
4558,2022-04-15 21:32,System Of A Down,Know,176693


In [5]:
# Column dtypes and non-null counts for the second export.
history_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   endTime     10000 non-null  object
 1   artistName  10000 non-null  object
 2   trackName   10000 non-null  object
 3   msPlayed    10000 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 312.6+ KB


In [6]:
# Display the first export (pandas truncates the rendering to head and tail rows).
history

Unnamed: 0,endTime,artistName,trackName,msPlayed
0,2021-04-14 23:14,Khalid,OTW,26493
1,2021-04-15 07:14,Khalid,OTW,9356
2,2021-04-15 07:18,Led Zeppelin,Black Dog - Remaster,295386
3,2021-04-15 07:22,Led Zeppelin,Rock and Roll - Remaster,220560
4,2021-04-15 07:28,Led Zeppelin,The Battle of Evermore - Remaster,351677
...,...,...,...,...
9995,2021-08-26 16:11,AC/DC,Dirty Deeds Done Dirt Cheap,231933
9996,2021-08-26 16:21,Jimi Hendrix,All Along the Watchtower,52293
9997,2021-08-26 16:24,Alberto,Z BRATEM ZARABIAM PAPIER,145070
9998,2021-08-26 16:24,Alberto,Z BRATEM ZARABIAM PAPIER,19655


### Looking for the longest single play in each file

In [7]:
# Row of the single longest play in `history`.
# `idxmax()` returns an index *label*, so it is paired with label-based `.loc`;
# positional `.iloc` only gives the same row while the frame has its default
# RangeIndex. The maximum is taken on the raw milliseconds: flooring to whole
# seconds first cannot change the maximum, it can only introduce ties.
history.loc[history['msPlayed'].idxmax()]

endTime                      2021-07-01 19:06
artistName                          Metallica
trackName     Enter Sandman - Remastered 2021
msPlayed                              1131241
Name: 6566, dtype: object

In [8]:
# Row of the single longest play in `history_1`.
# `idxmax()` yields an index label, hence label-based `.loc` rather than
# positional `.iloc`; the raw millisecond values are compared directly because
# flooring them to seconds first adds nothing but possible ties.
history_1.loc[history_1['msPlayed'].idxmax()]

endTime       2021-11-26 13:19
artistName     Nauka XXI wieku
trackName       #46 - Entropia
msPlayed               3307394
Name: 6858, dtype: object

In [9]:
# Row of the single longest play in `history_2`.
# `idxmax()` yields an index label, hence label-based `.loc` rather than
# positional `.iloc`; the raw millisecond values are compared directly because
# flooring them to seconds first adds nothing but possible ties.
history_2.loc[history_2['msPlayed'].idxmax()]

endTime            2022-02-15 14:25
artistName              Hans Zimmer
trackName     Planet Earth II Suite
msPlayed                     791300
Name: 865, dtype: object

In [10]:
# Order the third export by play duration (ascending) so the shortest plays
# appear at the top and the longest at the bottom of the display.
history_2.sort_values(by='msPlayed')

Unnamed: 0,endTime,artistName,trackName,msPlayed
962,2022-02-17 23:22,Korn,Coming Undone,0
963,2022-02-17 23:22,Megadeth,Skin O' My Teeth - Remastered,0
2422,2022-03-15 20:31,Martin Garrix,Animals,0
609,2022-02-10 13:36,Mötley Crüe,Shout At The Devil,0
961,2022-02-17 23:22,Slipknot,People = Shit,0
...,...,...,...,...
1685,2022-03-03 15:50,Gojira,Stranded,589933
986,2022-02-18 10:35,Gojira,Stranded,624824
1295,2022-02-22 16:58,Mayhem,Freezing Moon,651650
18,2022-01-29 22:22,Slipknot,Nero Forte,776826


### Looking for duplicate rows

In [11]:
# Flag rows of history_2 that exactly repeat an earlier row (all columns equal).
# NOTE(review): the truncated boolean Series shown below cannot reveal whether
# any row is True; `history_2.duplicated().sum()` would give an actual count.
history_2.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
4554    False
4555    False
4556    False
4557    False
4558    False
Length: 4559, dtype: bool

### Total listening time for one song (Nero Forte), in seconds

In [12]:
# Total milliseconds of 'Nero Forte' playback in the third export, selected in
# a single .loc call (row mask + column) ...
nero_forte = history_2.loc[history_2['trackName'] == 'Nero Forte', 'msPlayed'].sum()
# ... shown as whole seconds.
nero_forte // 1000

18303

### Looking for an artist's listening time

In [13]:
# Per-play durations (ms) of every Mayhem entry in the third export.
history_2.loc[history_2['artistName'] == 'Mayhem', 'msPlayed']

257     317720
331      21397
332     139541
334     185565
347     383053
         ...  
4357    417253
4389    106330
4390    276727
4425    356562
4527    211940
Name: msPlayed, Length: 260, dtype: int64

In [14]:
# All Mayhem rows in the second export (the result is empty for this data).
history_1.loc[history_1['artistName'] == 'Mayhem']

Unnamed: 0,endTime,artistName,trackName,msPlayed


### Merge all three datasets into one

In [15]:
# Stack the three exports into one frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported equivalent. Like append, it keeps every frame's
# own index labels, so labels repeat across the three parts (pass
# ignore_index=True if unique labels are ever needed).
history_all = pd.concat([history, history_1, history_2])
history_all

Unnamed: 0,endTime,artistName,trackName,msPlayed
0,2021-04-14 23:14,Khalid,OTW,26493
1,2021-04-15 07:14,Khalid,OTW,9356
2,2021-04-15 07:18,Led Zeppelin,Black Dog - Remaster,295386
3,2021-04-15 07:22,Led Zeppelin,Rock and Roll - Remaster,220560
4,2021-04-15 07:28,Led Zeppelin,The Battle of Evermore - Remaster,351677
...,...,...,...,...
4554,2022-04-15 21:21,Rammstein,Du hast,234226
4555,2022-04-15 21:21,Death,Spirit Crusher,5547
4556,2022-04-15 21:24,Gojira,The Cell,197906
4557,2022-04-15 21:29,Gojira,Stranded,265266


#### Functions for calculating the hours played for a specified artist

In [16]:
_MS_PER_HOUR = 1000 * 60 * 60  # milliseconds in one hour


def hours_listen_return(artist, data=None):
    """Return the total hours of playback recorded for ``artist``.

    Parameters
    ----------
    artist : str
        Value matched exactly against the ``artistName`` column.
    data : pandas.DataFrame, optional
        Frame with ``artistName`` and ``msPlayed`` columns. When omitted, the
        notebook-level ``history_all`` frame is used.

    Returns
    -------
    float
        Sum of ``msPlayed`` over the matching rows, converted from
        milliseconds to hours; ``0.0`` when no row matches.
    """
    if data is None:
        # Looked up at call time, so the merged frame only has to exist once
        # the function is actually called without an explicit frame.
        data = history_all
    played_ms = data.loc[data["artistName"] == artist, "msPlayed"].sum()
    return played_ms / _MS_PER_HOUR


def hours_listen(artist, data=None):
    """Print the total hours of playback for ``artist``, to two decimals.

    Takes the same arguments as ``hours_listen_return`` and delegates the
    computation to it so both functions always agree.
    """
    print(f'You listened to {artist} for {hours_listen_return(artist, data):.2f} hours')

In [17]:
# Total listening time for Mayhem across the merged history.
hours_listen('Mayhem')

You listened to Mayhem for 17.13 hours


In [18]:
# Total listening time for Slipknot across the merged history.
hours_listen('Slipknot')

You listened to Slipknot for 58.64 hours


In [19]:
# Total listening time for Yeat across the merged history.
hours_listen('Yeat')

You listened to Yeat for 7.93 hours


In [20]:
# Print the listening time of several artists one after another for comparison.
for artist in ('Rammstein', 'Slipknot', 'Korn'):
    hours_listen(artist)

You listened to Rammstein for 23.14 hours
You listened to Slipknot for 58.64 hours
You listened to Korn for 7.43 hours


In [21]:
# Distinct artist names in the merged history, in order of first appearance.
history_all['artistName'].unique()

array(['Khalid', 'Led Zeppelin', 'Rage Against The Machine', 'AC/DC',
       'Metallica', "Guns N' Roses", 'Kanye West', 'Pink Floyd',
       'Red Hot Chili Peppers', 'Nirvana', 'Sum 41', 'The Clash',
       'David Guetta', 'Lucky Luke', 'Riton', 'Arctic Monkeys',
       'Young Stoner Life', 'Playboi Carti', 'Marilyn Manson', 'Laszlo',
       'Justin Bieber', 'Kendrick Lamar', 'Mac Miller', 'Lil Peep',
       'Ms. Lauryn Hill', 'Future', 'Tujamo', 'Polo G', 'Travis Scott',
       'Eminem', 'King Von', 'Tyler, The Creator', 'Kid Cudi', 'J. Cole',
       'Big Sean', 'Madvillain', 'Mata', 'Michael Jackson',
       'Anderson .Paak', 'A$AP Rocky', 'chillwagon', 'Cordae',
       'Denzel Curry', 'Unknown Artist', 'Aminé', 'Drake',
       'Freddie Gibbs', 'City Morgue', 'Post Malone', 'Masno',
       'Bruno Mars', 'Lucenzo', 'White 2115', 'Quebonafide', 'Logic',
       'Juice WRLD', 'Childish Gambino', 'Roddy Ricch', 'Nas',
       'Isaiah Rashad', 'Iron Maiden', 'Linkin Park', 'Marshmello',
  

In [None]:
# Total hours per artist, computed in one grouped pass instead of re-filtering
# the whole frame once per unique artist. sort=False keeps the artists in
# order of first appearance (the order .unique() gave), so equal totals keep
# the same relative rank in the stable sort below.
hours_by_artist = (
    history_all.groupby('artistName', sort=False)['msPlayed'].sum() / (1000 * 60 * 60)
)
decreasing = hours_by_artist.to_dict()
# List of (artist, hours) pairs, most-listened artist first.
srt = sorted(decreasing.items(), key=lambda item: item[1], reverse=True)

In [None]:
# Show the (artist, hours) ranking, sorted by hours in descending order.
srt

### Function summing hours played 

In [None]:
def year_sum(data):
    """Return the sum of the second element of every item in ``data``.

    Parameters
    ----------
    data : iterable
        Items indexable at position 1, e.g. ``(artist, hours)`` pairs. Any
        iterable works, not only sequences that support ``len()``.

    Returns
    -------
    number
        Sum of ``item[1]`` over all items; ``0`` for an empty iterable.
    """
    # Iterate the items directly rather than indexing by position, and feed
    # sum() a generator so no intermediate list is built.
    return sum(item[1] for item in data)

In [None]:
# Report the overall listening time. The total is rounded to two decimals,
# matching the per-artist messages, instead of printing the raw float.
print(f'For the last year you were listening to music for {year_sum(srt):.2f} hours.')

### Days played

In [None]:
# Convert the total listening time from hours to days (24 hours per day).
year_sum(srt)/24