In [1]:
from google.colab import auth
auth.authenticate_user()

# Read single blob

In [2]:
from google.cloud import storage
import pandas as pd
import json

storage_client = storage.Client()

bucket_name = 'streaminghistory'
file_name = 'MyData1/StreamingHistory2.json'

bucket = storage_client.get_bucket(bucket_name)

blob = bucket.blob(file_name)

# Download the blob to a local file
blob.download_to_filename('StreamingHistory2.json')

# Read the local file into a pandas DataFrame
df = pd.read_json('StreamingHistory2.json')

# Print the first few rows of the DataFrame
print(df.head())

            endTime        artistName  \
0  2023-06-14 05:47   The Paper Kites   
1  2023-06-14 05:47        John Prine   
2  2023-06-14 13:17        Noah Kahan   
3  2023-06-14 13:21       Futurebirds   
4  2023-06-14 13:23  The Decemberists   

                                   trackName  msPlayed  
0                        Bloom - Bonus Track    210080  
1  In Spite of Ourselves (feat. Iris DeMent)     15237  
2                          She Calls Me Back    243815  
3                                      Rodeo    233173  
4                           Sons & Daughters    106693  


# Read all blobs in extended streaming history

In [3]:
# Prefix to filter files
prefix = 'MyData2/Streaming_History'
dfs = []
# List blobs (files) in the bucket with the specified prefix
blobs = bucket.list_blobs(prefix=prefix)

# Collect the file names
file_names = [blob.name for blob in blobs]
print(file_names)
for file in file_names:
  print(file)

['MyData2/Streaming_History_Audio_2015-2017_0.json', 'MyData2/Streaming_History_Audio_2017-2018_1.json', 'MyData2/Streaming_History_Audio_2018-2019_3.json', 'MyData2/Streaming_History_Audio_2018_2.json', 'MyData2/Streaming_History_Audio_2019-2020_4.json', 'MyData2/Streaming_History_Audio_2020-2021_6.json', 'MyData2/Streaming_History_Audio_2020_5.json', 'MyData2/Streaming_History_Audio_2021-2022_8.json', 'MyData2/Streaming_History_Audio_2021_7.json', 'MyData2/Streaming_History_Audio_2022-2023_9.json', 'MyData2/Streaming_History_Audio_2023_10.json', 'MyData2/Streaming_History_Video_2016-2023.json']
MyData2/Streaming_History_Audio_2015-2017_0.json
MyData2/Streaming_History_Audio_2017-2018_1.json
MyData2/Streaming_History_Audio_2018-2019_3.json
MyData2/Streaming_History_Audio_2018_2.json
MyData2/Streaming_History_Audio_2019-2020_4.json
MyData2/Streaming_History_Audio_2020-2021_6.json
MyData2/Streaming_History_Audio_2020_5.json
MyData2/Streaming_History_Audio_2021-2022_8.json
MyData2/Stream

In [4]:
for i in range(len(file_names)):
  file_name = file_names[i]
  local_file_name = "StreamingHistory"+str([i][0])+".json"
  #print(local_file_name)
  #print(file_name)
  blob = bucket.blob(file_name)
  blob.download_to_filename(local_file_name)
  df = pd.read_json(local_file_name)
  dfs.append(df)
combined_df = pd.concat(dfs, ignore_index=True)

# Initial EDA on StreamingHistory

In [7]:
combined_df.describe()

Unnamed: 0,ms_played,offline_timestamp
count,174861.0,174736.0
mean,117204.9,1334162000000.0
std,187246.3,571791500000.0
min,0.0,0.0
25%,3157.0,1515091000000.0
50%,99913.0,1562986000000.0
75%,200736.0,1611992000000.0
max,11470920.0,1665792000000.0


In [8]:
combined_df.head(4)

Unnamed: 0,ts,username,platform,ms_played,conn_country,ip_addr_decrypted,user_agent_decrypted,master_metadata_track_name,master_metadata_album_artist_name,master_metadata_album_album_name,...,episode_name,episode_show_name,spotify_episode_uri,reason_start,reason_end,shuffle,skipped,offline,offline_timestamp,incognito_mode
0,2015-08-10T14:10:00Z,benjaminkholland,"iOS 8.3 (iPhone7,2)",1020,US,65.78.179.118,unknown,Apollo - Radio Edit,Astronaut,Apollo,...,,,,appload,endplay,True,1.0,0.0,0.0,False
1,2015-08-10T14:18:03Z,benjaminkholland,"iOS 8.3 (iPhone7,2)",1020,US,65.78.179.118,unknown,679 (feat. Remy Boyz),Fetty Wap,679 (feat. Remy Boyz),...,,,,appload,endplay,True,1.0,0.0,0.0,False
2,2015-08-10T14:21:19Z,benjaminkholland,"iOS 8.3 (iPhone7,2)",196693,US,65.78.179.118,unknown,679 (feat. Remy Boyz),Fetty Wap,679 (feat. Remy Boyz),...,,,,clickrow,trackdone,True,0.0,0.0,0.0,False
3,2015-08-18T05:49:20Z,benjaminkholland,"iOS 8.3 (iPhone7,2)",185093,US,65.78.179.118,unknown,Rabbit Hole,The Temper Trap,The Temper Trap,...,,,,trackdone,trackdone,False,0.0,0.0,0.0,False




In [9]:
combined_df.shape

(174861, 21)

In [10]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 174861 entries, 0 to 174860
Data columns (total 21 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   ts                                 174861 non-null  object 
 1   username                           174861 non-null  object 
 2   platform                           174861 non-null  object 
 3   ms_played                          174861 non-null  int64  
 4   conn_country                       174861 non-null  object 
 5   ip_addr_decrypted                  174861 non-null  object 
 6   user_agent_decrypted               173172 non-null  object 
 7   master_metadata_track_name         173967 non-null  object 
 8   master_metadata_album_artist_name  173967 non-null  object 
 9   master_metadata_album_album_name   173967 non-null  object 
 10  spotify_track_uri                  173967 non-null  object 
 11  episode_name                       864 

In [27]:
combined_df.shape
#174861

(174861, 21)

In [84]:
# Evaluate the rows that have the nulls for each value
songs = combined_df[combined_df["episode_name"].isna()]
#173997 (there are 30 without either)
songs = combined_df[combined_df["master_metadata_track_name"].notna()]
#173967
songs.shape

(173967, 21)

In [33]:
pods = combined_df[combined_df["episode_name"].notna()]
#864
pods.shape

(864, 21)

In [25]:
no_song_or_pod = combined_df[combined_df["episode_name"].isna() & combined_df["master_metadata_track_name"].isna()]
no_song_or_pod.shape

(30, 21)

In [None]:
combined_df["incognito_mode"].value_counts()
combined_df[combined_df["incognito_mode"]==True]

In [44]:
combined_df["platform"].value_counts()
len(combined_df["platform"].unique())
combined_df["platform"].unique()
#combined_df[combined_df["platform"]==True]

array(['iOS 8.3 (iPhone7,2)', 'iOS 9.1 (iPhone7,2)',
       'Windows 7 (6.1.7601; x64; SP1; S)',
       'WebPlayer (websocket RFC6455)',
       'Partner amazon_echo Amazon;Echo;;', 'iOS 10.0.2 (iPhone7,2)',
       'iOS 10.1 (iPhone9,3)', 'OS X 10.11.6 [x86 8]',
       'iOS 10.2.1 (iPhone9,3)', 'iOS 10.2.1 (iPhone8,1)',
       'iOS 10.3.1 (iPhone8,1)',
       'Partner ios_sdk Apple;iPhone9.3;1bfd2066b99e4ddbb6c28a761e96c4f9;10.2.1',
       'iOS 10.3.3 (iPhone9,3)',
       'Partner ios_sdk Apple;iPhone9.3;1bfd2066b99e4ddbb6c28a761e96c4f9;10.3.3',
       'iOS 11.1.2 (iPhone9,3)', 'OS X 10.13.4 [x86 8]',
       'Partner ios_sdk Apple;iPhone9.3;1bfd2066b99e4ddbb6c28a761e96c4f9;11.1.2',
       'iOS 12.1.2 (iPhone9,3)',
       'Partner amazon_echo Amazon;Echo;27d4dfe427b34d57995b463e5d63198d;;tpapi',
       'iOS 12.1.4 (iPhone9,3)', 'iOS 12.2 (iPhone9,3)',
       'OS X 10.13.6 [x86 4]', 'OS X 10.14.5 [x86 8]',
       'iOS 11.4.1 (iPhone9,3)', 'iOS 12.1.1 (iPhone9,3)',
       'Partner tizen_tv

In [11]:
# x Top artists
# x Most streamed song
# x Overall stream time (could be different depending on song length)
# x Number of streams per song
# x Number of streams per artist
# Most intentionally played song (not shuffle queued)
# Number of podcasts
# x Most skipped song
# x Most frequently skipped song (weighted for plays)
# Most skipped genre
# x Most skipped artist
# x Most frequently skipped artist (weighted for plays)

# Does genre change based on the platform? ios vs. mac?
  # too vague

# Top genre
# Top sub-genre
# Lowest genre

#### Similar questions for podcasts
#### Similar questions for saved song trends

# Most common songs on playlists


# Rec sys for songs based on ones that I listened to in same session
# Rec sys for songs based on content

# marqueeReachableAudience???

## Songs

In [85]:
# Select ONLY songs
# Evaluate the rows that have the nulls for each value
#songs = combined_df[combined_df["episode_name"].isna()]
#173997 (there are 30 without either)
songs = combined_df[combined_df["master_metadata_track_name"].notna()]
#173967
songs.shape

(173967, 21)

### Top artist & songs by plays

#### Top artists by plays

In [174]:
# Top artists
print("Artist count: "+str(len(songs['master_metadata_album_artist_name'].value_counts())))
print("Top artist")
print(songs['master_metadata_album_artist_name'].value_counts().reset_index().rename(columns={'ms_played': 'ms_total',"master_metadata_album_artist_name":"Artist_Name"}).head())
print("\n")
# Most streamed song
print("Song count: " +str(len(songs['master_metadata_track_name'].value_counts())))
print("Top song")
#print(songs['master_metadata_track_name'].value_counts().reset_index().head())

Artist count: 6594
Top artist
      Artist_Name  count
0      Kanye West   3701
1           Drake   2734
2     Post Malone   2415
3         J. Cole   2164
4  Kendrick Lamar   1958


Song count: 21264
Top song
                            Track_Name            Artist_Name  counts
0                             Cardinal                Mt. Joy     172
1                              HUMBLE.         Kendrick Lamar     163
2                           scapegoats              Baby Keem     161
3                           Yes Indeed               Lil Baby     157
4                           goosebumps           Travis Scott     148
...                                ...                    ...     ...
23132                 Love out of Lust               Lykke Li       1
23133                  Love the Haters         Olivia Rodrigo       1
23134         Love the One You're With  Crosby, Stills & Nash       1
23135  Black Males (feat. Epicmustdie)        WESTSIDE BOOGIE       1
23136       Beautiful

Unnamed: 0,Artist_Name,count
0,Kanye West,3701
1,Drake,2734
2,Post Malone,2415
3,J. Cole,2164
4,Kendrick Lamar,1958
...,...,...
6589,D'Angelo,1
6590,ROZES,1
6591,Ball Park Music,1
6592,Maddison Hoolan,1


In [175]:
a = songs['master_metadata_album_artist_name'].value_counts().reset_index().rename(columns={'ms_played': 'ms_total',"master_metadata_album_artist_name":"Artist_Name"})
a[a["Artist_Name"]=="Noah Kahan"]

Unnamed: 0,Artist_Name,count
41,Noah Kahan,662


#### Top songs by plays

In [282]:
# Confirmed that the songs have the same name but different artist
# combined_df[combined_df["master_metadata_track_name"]=="Ghost Town"][["master_metadata_album_artist_name","master_metadata_track_name"]]

# songs.groupby(["master_metadata_track_name","master_metadata_album_artist_name"])["master_metadata_track_name"].agg("count")
# Different values when grouping by artist name. Potentially artist name change?
print(songs.groupby(["master_metadata_track_name","master_metadata_album_artist_name"]).size().reset_index(name='counts').rename(columns={'ms_played': 'ms_total','master_metadata_track_name':'Track_Name',"master_metadata_album_artist_name":"Artist_Name"}).sort_values("counts",ascending=False).reset_index(drop=True))
# Sticking with grouping by artist & song name. Sometimes same song can be on different albums. Some songs also have same name.
# songs.groupby(["master_metadata_track_name"]).size().reset_index(name='counts').sort_values("counts",ascending=False)
# 23137, 21264

                            Track_Name            Artist_Name  counts
0                             Cardinal                Mt. Joy     172
1                              HUMBLE.         Kendrick Lamar     163
2                           scapegoats              Baby Keem     161
3                           Yes Indeed               Lil Baby     157
4                           goosebumps           Travis Scott     148
...                                ...                    ...     ...
23132                 Love out of Lust               Lykke Li       1
23133                  Love the Haters         Olivia Rodrigo       1
23134         Love the One You're With  Crosby, Stills & Nash       1
23135  Black Males (feat. Epicmustdie)        WESTSIDE BOOGIE       1
23136       Beautiful Crazy - Acoustic             Luke Combs       1

[23137 rows x 3 columns]


### Top artist & songs by playtime

#### Top artist by playtime

In [281]:
# Top artists by playtime
#print(combined_df['master_metadata_album_artist_name'].value_counts().head())
artist_playtime = songs.groupby('master_metadata_album_artist_name')['ms_played'].sum().reset_index().rename(columns={'ms_played': 'ms_total','master_metadata_track_name':'Track_Name',"master_metadata_album_artist_name":"Artist_Name"}).sort_values("ms_total",ascending=False).reset_index(drop=True)
artist_playtime["min_played"] = artist_playtime["ms_total"]/60000
artist_playtime["hr_played"] = artist_playtime["min_played"]/60
print(artist_playtime)
#song_playtime

                  Artist_Name   ms_total   min_played   hr_played
0                  Kanye West  443276214  7387.936900  123.132282
1                       Drake  349928847  5832.147450   97.202458
2                 Post Malone  278284792  4638.079867   77.301331
3     Rainbow Kitten Surprise  263096251  4384.937517   73.082292
4              Kendrick Lamar  245243844  4087.397400   68.123290
...                       ...        ...          ...         ...
6589                     Zapp          0     0.000000    0.000000
6590                Sub Focus          0     0.000000    0.000000
6591          Rapper Big Pooh          0     0.000000    0.000000
6592             Wilder Woods          0     0.000000    0.000000
6593                José José          0     0.000000    0.000000

[6594 rows x 4 columns]


#### Top songs by playtime

In [279]:
song_playtime = songs.groupby(["master_metadata_track_name","master_metadata_album_artist_name"])['ms_played'].sum().reset_index().rename(columns={'ms_played': 'ms_total','master_metadata_track_name':'Track_Name',"master_metadata_album_artist_name":"Artist_Name"}).sort_values("ms_total",ascending=False).reset_index(drop=True)
song_playtime["min_played"] = song_playtime["ms_total"]/60000
song_playtime["hr_played"] = song_playtime["min_played"]/60
song_playtime = song_playtime.drop(columns="ms_total")
print(song_playtime)

                            Track_Name              Artist_Name  min_played  \
0                          First Class  Rainbow Kitten Surprise  432.213250   
1                        When It Lands  Rainbow Kitten Surprise  397.305650   
2                           Ghost Town               Kanye West  381.074800   
3                             Cardinal                  Mt. Joy  368.971400   
4      If You’re Too Shy (Let Me Know)                 The 1975  367.822467   
...                                ...                      ...         ...   
23132                    Make Yer Mark               The Garden    0.000000   
23133                     Every Season             Cousin Stizz    0.000000   
23134      My Friends (feat. TeamTwin)               Mr_hotspot    0.000000   
23135                        20 Joints                   Berner    0.000000   
23136                            Tupac                   DaBaby    0.000000   

       hr_played  
0       7.203554  
1       6.621

### Top skipped artist & songs

In [172]:
# The skipped songs count are weighed by artists I've been listening to more in the last year. This is a questionable stat.
# There are values for 2476 artists
songs.groupby(["master_metadata_album_artist_name"])["skipped"].agg("count").reset_index(name="skipped_count").sort_values("skipped_count",ascending=False).reset_index(drop=True)

Unnamed: 0,master_metadata_album_artist_name,skipped_count
0,Noah Kahan,623
1,Fred again..,464
2,Taylor Swift,462
3,Zach Bryan,436
4,Drake,280
...,...,...
6589,Jay Taj,0
6590,Jay Som,0
6591,Jay Prince,0
6592,Jay Isaiah,0


In [210]:
len(songs["reason_end"].unique())
songs["reason_end"].unique()

array(['endplay', 'trackdone', 'fwdbtn', 'backbtn', 'trackerror',
       'unexpected-exit', 'unexpected-exit-while-paused', 'playbtn', '',
       'remote', 'logout', 'unknown'], dtype=object)

In [190]:
# there are fwdbutton reason_ends that are not counted as skips.
# going to count anything that is a fwdbtn as a skip for purposes of the skip stat
songs[["master_metadata_album_artist_name","skipped","ts"]]
songs[songs["skipped"]==True][["master_metadata_album_artist_name","skipped","ts"]]
songs[songs["skipped"].isna()][["reason_start","reason_end"]].value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
reason_start,reason_end,Unnamed: 2_level_1
trackdone,trackdone,38601
fwdbtn,fwdbtn,33614
clickrow,endplay,18274
trackdone,fwdbtn,10421
fwdbtn,trackdone,9113
...,...,...
appload,trackerror,2
playbtn,unknown,1
playbtn,trackerror,1
trackerror,logout,1


In [211]:
# not many of these values
songs["skipped"].value_counts()
#reason_start	reason_end	shuffle	skipped
songs["reason_start"].value_counts()
songs["reason_end"].value_counts()
songs.groupby(["reason_end","skipped"])["skipped"].agg("count")

Unnamed: 0_level_0,Unnamed: 1_level_0,skipped
reason_end,skipped,Unnamed: 2_level_1
,0.0,100
backbtn,0.0,26
backbtn,1.0,407
endplay,0.0,60
endplay,1.0,2039
fwdbtn,0.0,235
fwdbtn,1.0,6829
logout,0.0,56
playbtn,0.0,5
remote,0.0,116


In [233]:
skipped_songs = songs[songs["reason_end"]=="fwdbtn"]

#### Top skipped artist by count

In [241]:
# The skipped songs count are weighed by artists I've been listening to more in the last year. This is a questionable stat.
# There are values for 2476 artists

len(skipped_songs)
#skipped_songs["master_metadata_album_artist_name"].value_counts()
skipped_artists_aggr = skipped_songs["master_metadata_album_artist_name"].value_counts().reset_index(name='count_artist_skipped').rename(columns={"master_metadata_album_artist_name":"Artist_Name"}).sort_values("count_artist_skipped",ascending=False).reset_index(drop=True)
print(skipped_artists_aggr)
 #[[""]].groupby(["master_metadata_album_artist_name"]).sum("count").reset_index()#.reset_index(name="skipped_count").sort_values("skipped_count",ascending=False).reset_index(drop=True)

             Artist_Name  count_artist_skipped
0             Kanye West                  1135
1                  Drake                   825
2            Post Malone                   774
3                J. Cole                   738
4              Lil Wayne                   666
...                  ...                   ...
3546       The Dirty Nil                     1
3547               SK-47                     1
3548  Fumez The Engineer                     1
3549                 Av9                     1
3550       Jesse Stewart                     1

[3551 rows x 2 columns]


#### Top skipped songs by count

In [242]:
skipped_songs_aggr = skipped_songs.groupby(["master_metadata_track_name","master_metadata_album_artist_name"]).size().reset_index(name='count_song_skipped').rename(columns={'ms_played': 'ms_total','master_metadata_track_name':'Track_Name',"master_metadata_album_artist_name":"Artist_Name"}).sort_values("count_song_skipped",ascending=False).reset_index(drop=True)
print(skipped_songs_aggr)

                             Track_Name        Artist_Name  count_song_skipped
0                            DEVASTATED        Joey Bada$$                  65
1                           Wyclef Jean         Young Thug                  62
2                         Tunnel Vision        Kodak Black                  61
3      Yamborghini High (feat. Juicy J)           A$AP Mob                  60
4                            Same Drugs  Chance the Rapper                  60
...                                 ...                ...                 ...
11375                      January 1979       mewithoutYou                   1
11376                           Jammin'         Mikey More                   1
11377                       James Brown  Cage The Elephant                   1
11378                             James               MGMT                   1
11379        Live SheckWes Die SheckWes          Sheck Wes                   1

[11380 rows x 3 columns]


### Top skipped artists & songs by percentage

In [264]:
# Overall listens
total_artists = songs['master_metadata_album_artist_name'].value_counts().reset_index().rename(columns={'ms_played': 'ms_total',"master_metadata_album_artist_name":"Artist_Name","count":"count_played"})
total_artists.head()

Unnamed: 0,Artist_Name,count_played
0,Kanye West,3701
1,Drake,2734
2,Post Malone,2415
3,J. Cole,2164
4,Kendrick Lamar,1958


#### Top skipped artists by %

In [278]:
# Number of skips
skipped_artists_percent = skipped_artists_aggr.merge(total_artists, how='left', on='Artist_Name')
skipped_artists_percent.head()
skipped_artists_percent["percent_skipped"]=skipped_artists_percent["count_artist_skipped"]/skipped_artists_percent["count_played"]#.sort_values("percent_skipped")
skipped_artists_percent.sort_values(["percent_skipped","count_played"],ascending=False).reset_index(drop=True)

#skipped_songs["master_metadata_album_artist_name"].value_counts()

Unnamed: 0,Artist_Name,count_artist_skipped,count_played,percent_skipped
0,B.B. King,9,9,1.000000
1,Chosen Music,7,7,1.000000
2,Dido,6,6,1.000000
3,SwagHollywood,6,6,1.000000
4,Kash On Deck,5,5,1.000000
...,...,...,...,...
3546,Pharrell Williams,1,19,0.052632
3547,Staring in Spaces,1,20,0.050000
3548,Lil Loaded,2,41,0.048780
3549,9lokknine,1,21,0.047619


#### Top skipped songs by %

In [270]:
total_songs = songs.groupby(["master_metadata_track_name","master_metadata_album_artist_name"]).size().reset_index(name='count_played').rename(columns={'ms_played': 'ms_total','master_metadata_track_name':'Track_Name',"master_metadata_album_artist_name":"Artist_Name"}).sort_values("count_played",ascending=False).reset_index(drop=True)
total_songs.head()

Unnamed: 0,Track_Name,Artist_Name,count_played
0,Cardinal,Mt. Joy,172
1,HUMBLE.,Kendrick Lamar,163
2,scapegoats,Baby Keem,161
3,Yes Indeed,Lil Baby,157
4,goosebumps,Travis Scott,148


In [271]:
# Number of skips
skipped_songs_percent = skipped_songs_aggr.merge(total_songs, how='left', on=['Artist_Name','Track_Name'])
skipped_songs_percent.head()
skipped_songs_percent["percent_skipped"]=skipped_songs_percent["count_song_skipped"]/skipped_songs_percent["count_played"]#.sort_values("percent_skipped")
skipped_songs_percent.sort_values(["percent_skipped","count_played"],ascending=False)

#skipped_songs["skip_percent"] =
#skipped_songs["master_metadata_album_artist_name"].value_counts()

Unnamed: 0,Track_Name,Artist_Name,count_song_skipped,count_played,percent_skipped
2653,Close Your Eyes (And Count to Fuck),Run The Jewels,7,7,1.000000
2709,Summit,Chosen Music,7,7,1.000000
2737,God Made The Automobile,Iron & Wine,7,7,1.000000
2917,Arms of a Thief,Iron & Wine,6,6,1.000000
2990,Thank You,Dido,6,6,1.000000
...,...,...,...,...,...
6657,the 1,Taylor Swift,1,38,0.026316
8210,Freestyle,Lil Baby,1,38,0.026316
6579,Wait so Long,Trampled by Turtles,1,39,0.025641
6668,YAH.,Kendrick Lamar,1,44,0.022727


### Shuffle

#### Most listened to unintentionally (with shuffle)

#### Most listened to intentionally (without shuffle)

### Top 5-10 songs/artists by year

## Podcasts

In [None]:
# Select ONLY podcasts

In [58]:
combined_df.nunique()

Unnamed: 0,0
ts,164429
username,1
platform,64
ms_played,53063
conn_country,9
ip_addr_decrypted,3051
user_agent_decrypted,17
master_metadata_track_name,21264
master_metadata_album_artist_name,6594
master_metadata_album_album_name,13998


In [60]:
artists = combined_df['master_metadata_album_artist_name'].unique()

In [67]:
combined_df['master_metadata_album_artist_name'].value_counts()

Unnamed: 0_level_0,count
master_metadata_album_artist_name,Unnamed: 1_level_1
Kanye West,11103
Drake,8202
Post Malone,7245
J. Cole,6492
Kendrick Lamar,5874
...,...
D'Angelo,3
ROZES,3
Ball Park Music,3
Maddison Hoolan,3


In [68]:
combined_df['master_metadata_album_album_name'].value_counts()

Unnamed: 0_level_0,count
master_metadata_album_album_name,Unnamed: 1_level_1
Tha Carter V,3699
beerbongs & bentleys,3120
The Life Of Pablo,3018
Stoney,2892
Mt. Joy,2715
...,...
Relaxation Time,3
Yoga Morning,3
Barcelona Atmosphere,3
Cameron's Meditation - Sleep,3


In [64]:
artists_df = combined_df.groupby('master_metadata_album_artist_name').agg('count')

In [65]:
combined_df

Unnamed: 0_level_0,ts,username,platform,ms_played,conn_country,ip_addr_decrypted,user_agent_decrypted,master_metadata_track_name,master_metadata_album_album_name,spotify_track_uri,episode_name,episode_show_name,spotify_episode_uri,reason_start,reason_end,shuffle,skipped,offline,offline_timestamp,incognito_mode
master_metadata_album_artist_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
"""DAVE""",3,3,3,3,3,3,3,3,3,3,0,0,0,3,3,3,0,3,3,3
"""Weird Al"" Yankovic",9,9,9,9,9,9,9,9,9,9,0,0,0,9,9,9,0,9,9,9
$NOT,93,93,93,93,93,93,93,93,93,93,0,0,0,93,93,93,6,93,93,93
$tupid Young,114,114,114,114,114,114,114,114,114,114,0,0,0,114,114,114,9,114,114,114
$uicideboy$,228,228,228,228,228,228,228,228,228,228,0,0,0,228,228,228,0,228,228,228
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
張露,3,3,3,3,3,3,3,3,3,3,0,0,0,3,3,3,0,3,3,3
愛THE BLXCK SINATRA死,3,3,3,3,3,3,3,3,3,3,0,0,0,3,3,3,0,3,3,3
暁テル子,3,3,3,3,3,3,3,3,3,3,0,0,0,3,3,3,0,3,3,3
謎女,3,3,3,3,3,3,3,3,3,3,0,0,0,3,3,3,0,3,3,3


# Export Data joining

## Playlist file

### Most common playlist song

#### Personal playlists

#### General playlists

## Query file

### Top query

#### Top with tokenization & stemming

## Library file

# API data joining

Song genres, etc.

# Machine learning

## Song clustering

## Playlist generation & song rec