In [39]:
import pandas as pd
import numpy as np

In [2]:
poorly_formatted_rows = [2120259, 2446317, 11141080, 11152098, 11152401, 11882086, 12902538, 12935043, 17589538]
df = pd.read_csv('lastfm_data/userid-timestamp-artid-artname-traid-traname.tsv', header=None, skiprows=poorly_formatted_rows, sep='\t')
df.columns = ['user_id', 'timestamp', 'artist_id', 'artist_name', 'track_id', 'track_name']

In [3]:
df.sample(10)

Unnamed: 0,user_id,timestamp,artist_id,artist_name,track_id,track_name
16032594,user_000830,2006-04-23T19:43:04Z,eab76c9f-ff91-4431-b6dd-3b976c598020,Infected Mushroom,5484da29-6a95-45de-abe1-25ff784370a6,Unbalanced (Baby Killer Remix)
14461949,user_000771,2007-11-02T23:11:11Z,45a663b5-b1cb-4a91-bff6-2bef7bbfdd76,Britney Spears,0897dfea-0393-4735-bb9a-9f3263b44970,Hot As Ice
88963,user_000003,2006-10-18T21:14:56Z,f9ef7a22-4262-4596-a2a8-1d19345b8e50,Garbage,c7ea9d60-4d60-4268-a07f-9071e941a93b,The Trick Is To Keep Breathing
93849,user_000004,2009-03-18T18:16:40Z,46d972b8-1199-412e-9d83-a9bdb0f9fc1d,The Heart Strings,,Her New Disaster
5032677,user_000260,2005-11-02T17:52:47Z,b88a8ada-40da-4008-9ac5-87c38773ca09,Dumdum Boys,267138ba-720d-43bd-bc28-c56a33bc326c,Boom Boom
14939346,user_000790,2008-12-23T01:13:06Z,d59c4cda-11d9-48db-8bfe-b557ee602aed,Billie Holiday,f33112ea-147e-4c3e-a4ad-a381c6082037,Deep Song
15321661,user_000798,2007-04-19T13:01:57Z,61a12e16-137b-4e6b-801c-0ec7e6287bfe,One Man Army And The Undead Quartet,36c2bf46-de48-4256-9cab-e4840cefa187,Bulldozer Frenzy
16220463,user_000837,2009-01-27T02:00:34Z,bb1f91d2-1b54-4ee5-b55a-626492a5904f,Freezepop,b326dd12-4790-49ea-bbd9-7775ffb00bed,Stakeout
2622610,user_000134,2006-12-31T21:53:55Z,f9ef7a22-4262-4596-a2a8-1d19345b8e50,Garbage,6e225cd7-5a1c-4de5-90a4-f18e9186293e,Stupid Girl
5528230,user_000281,2006-11-09T22:33:06Z,eb872766-98f6-453d-883f-2ae908a18315,Tv On The Radio,24df4051-4194-44a8-a49d-b7e7dc51e4cb,Province


In [4]:
df.describe()

Unnamed: 0,user_id,timestamp,artist_id,artist_name,track_id,track_name
count,19098853,19098853,18498005,19098853,16936134,19098841
unique,992,17454730,107295,173921,960402,1083471
top,user_000949,2009-02-26T21:29:15Z,a74b1b7f-71a5-4011-9441-d0b5e4122711,Radiohead,db16d0b3-b8ce-4aa8-a11a-e4d53cc7f8a6,Intro
freq,183103,248,115099,115099,3991,17561


#### Interesting facts:
- Last.FM users tend to like Radiohead
- Most used name for a song is Intro (which makes sense)
- We got data from ~1K users
- There's ~960K different tracks which makes it pretty diverse for a recommendation system
- ~107K different artists
- ~19M entries in the dataset (!!!)
- The unique estimator has different values for artist_id and artist_name... what does this mean?

In [5]:
del df['timestamp'] #don't need timestamps.
print("Radiohead is featured %d times" % len(df[df['artist_name'] == 'Radiohead']))

Radiohead is featured 115099 times


In [6]:
artist_name = df.loc[df['track_id'] == 'db16d0b3-b8ce-4aa8-a11a-e4d53cc7f8a6'].iloc[0]['artist_name']
track_name = df.loc[df['track_id'] == 'db16d0b3-b8ce-4aa8-a11a-e4d53cc7f8a6'].iloc[0]['track_name']
print("The most popular track is %s by %s" % (track_name, artist_name))

The most popular track is Such Great Heights by The Postal Service


### Data set is probably biased to indie rock.

In [7]:
print("TOP 25 ARTISTS FEATURED IN THE DATASET")
df.groupby(['artist_name']).count().sort_values('track_name', ascending=False)['track_id'].head(25)

TOP 25 ARTISTS FEATURED IN THE DATASET


artist_name
Radiohead                      111488
The Beatles                     98525
Nine Inch Nails                 82022
Muse                            60967
Coldplay                        60256
Depeche Mode                    57802
Pink Floyd                      55739
Death Cab For Cutie             56451
Placebo                         50632
Elliott Smith                   49306
The Cure                        48553
Britney Spears                  43462
David Bowie                     48285
The Killers                     46206
Kanye West                      44971
Sigur Rós                       40642
Red Hot Chili Peppers           44512
The Smiths                      44854
Metallica                       42546
Interpol                        41871
Björk                           39930
Bloc Party                      38447
Black Rebel Motorcycle Club     34381
Arcade Fire                     39317
Modest Mouse                    38829
Name: track_id, dtype: int64

# Data set is INDEED biased to indie rock.

### Let's check track data, same way

In [8]:
top_25_tracks_by_id = df.groupby(['track_id']).count().sort_values('track_name', ascending=False)['track_name'].head(25)

In [9]:
top_25_tracks = df[df['track_id'].isin(top_25_tracks_by_id.index)].drop_duplicates(subset='track_id')
top_25_tracks = top_25_tracks.merge(pd.DataFrame(top_25_tracks_by_id), left_on='track_id', right_index=True)
top_25_tracks.columns = ['user_id','artist_id','artist_name','track_id','track_name','count']
top_25_tracks.sort_values('count', ascending=False, inplace=True)
print("TOP 25 TRACKS FEATURED IN THE DATASET")
top_25_tracks[['artist_name', 'track_name', 'count']]

TOP 25 TRACKS FEATURED IN THE DATASET


Unnamed: 0,artist_name,track_name,count
24969,The Postal Service,Such Great Heights,3991
26687,Boy Division,Love Will Tear Us Apart,3651
26555,Radiohead,Karma Police,3533
19819,Death Cab For Cutie,Soul Meets Body,3479
17054,Muse,Supermassive Black Hole,3463
39355,The Knife,Heartbeats,3155
46238,Arcade Fire,Rebellion (Lies),3047
17556,Muse,Starlight,3040
321655,Britney Spears,Gimme More,3002
39366,The Killers,When You Were Young,2997


### Aside for the funny typo in Ian Curtis' band, this furthermore confirms the bias. And yey! Radiohead is amazing.

### So, what now with the artist id and the artist name amount not coinciding?

In [14]:
df['artist_id'].unique()

array(['f1b1cf71-bd35-4e99-8624-24a6e15f133a',
       'a7f7df4a-77d8-4f12-8acd-5c60c93f4de8',
       'ba2f4f3b-0293-4bc8-bb94-2f73b5207343', ...,
       '172d7924-1660-4d17-9fdb-53604dd30fdb',
       'a2749770-fcf8-41f0-928d-fa514b86ba61',
       '8c887739-8b37-4d13-a5e4-731c0abaffc3'], dtype=object)

In [33]:
n_artists_without_id = len(df[df['artist_id'].isnull()].artist_name.unique())
n_artists_id = len(df.artist_id.unique())
n_artists_name = len(df.artist_name.unique())
print("%d artists without id, %d unique ids, %d unique names and %d unaccounted cases" %(n_artists_without_id, n_artists_name, n_artists_id, n_artists_name - n_artists_id))

69418 artists without id, 173921 unique ids, 107296 unique names and 66625 unaccounted cases


##### So one reason, apparently, is because some artist_ids are Nan

In [43]:
erased_artists_wo_ids = df[df['artist_id'].notnull()]
unique_duples = erased_artists_wo_ids[['artist_id','artist_name']].drop_duplicates()
unique_duples[unique_duples.duplicated(subset='artist_id')]

Unnamed: 0,artist_id,artist_name
21955,66ea0139-149f-4a0c-8fbf-5ea9ec4a6e49,Disney
59666,e5257dc5-1edd-4fca-b7e6-1158e00522c8,Jackson 5
80209,b9472588-93f3-4922-a1a2-74082cdf9ce8,Panic! At The Disco
99353,def226b0-7990-4b38-ab1a-7740461844dc,Yael Naïm
132551,7b290da1-03f2-4d91-8f6e-0e4930e39831,Mlle Caro & Frank Garcia
207865,243c6f61-d83b-4459-bebd-5899df0da111,Lil' Jon & The East Side Boyz
324407,1fda852b-92e9-4562-82fa-c52820a77b23,Pussycat Dolls
415220,06732552-5a39-4cda-81b9-3215092266f9,André Ethier
490877,6bb4ccea-4717-449d-bbf8-075b2d2c19ba,Quasar Wut-Wut
602995,127f591a-7e27-4435-92db-0780f219f3a1,The B-52S


In [45]:
df[df['artist_id'] == '9044c2be-02a0-4de3-8e99-e0231d8f9f31'].artist_name.unique()

array(['Johnny Cash & June Carter', 'Johnny Cash & June Carter Cash'], dtype=object)

In [46]:
df[df['artist_id'] == '2e0031c3-c703-45c7-b7d8-5a9885b4ee3a'].artist_name.unique()

array(['Bob Marley And Peter Tosh', 'Peter Tosh & Bob Marley'], dtype=object)

In [47]:
df[df['artist_id'] == 'def226b0-7990-4b38-ab1a-7740461844dc'].artist_name.unique()

array(['Yael Naim', 'Yael Naïm'], dtype=object)

##### Apparently the other reason is to ensure diverse encodings and spellings for names