# MyDJ K-Nearest Neighbors on Last.fm dataset

This notebook explores a baseline result for running k-nearest neighbor on the [Last.fm](http://www.dtic.upf.edu/~ocelma/MusicRecommendationDataset/lastfm-1K.html) music recommendation dataset

### Helper functions

In [26]:
from pathlib import Path
import numpy as np
import pandas as pd

def load_song_data():
    path = Path('./lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv')
    return pd.read_csv(
        path.absolute(),
        sep="\t",
        names=["userid", "timestamp", "artist-id", "artist-name", "track-id", "track-name"])

## Load data and access distribution

In [27]:
song_data = load_song_data()

## How many songs does each user have?

In [116]:
print(len(song_data))
print(song_data.head())

user_values = song_data['userid'].value_counts().keys().tolist()
user_counts = song_data['userid'].value_counts().tolist()
user_id_with_song_counts = list(zip(user_values, user_counts))

19098862
        userid             timestamp                             artist-id  \
0  user_000001  2009-05-04T23:08:57Z  f1b1cf71-bd35-4e99-8624-24a6e15f133a   
1  user_000001  2009-05-04T13:54:10Z  a7f7df4a-77d8-4f12-8acd-5c60c93f4de8   
2  user_000001  2009-05-04T13:52:04Z  a7f7df4a-77d8-4f12-8acd-5c60c93f4de8   
3  user_000001  2009-05-04T13:42:52Z  a7f7df4a-77d8-4f12-8acd-5c60c93f4de8   
4  user_000001  2009-05-04T13:42:11Z  a7f7df4a-77d8-4f12-8acd-5c60c93f4de8   

  artist-name track-id                                  track-name  
0   Deep Dish      NaN  Fuck Me Im Famous (Pacha Ibiza)-09-28-2007  
1        坂本龍一      NaN           Composition 0919 (Live_2009_4_15)  
2        坂本龍一      NaN                        Mc2 (Live_2009_4_15)  
3        坂本龍一      NaN                     Hibari (Live_2009_4_15)  
4        坂本龍一      NaN                        Mc1 (Live_2009_4_15)  


In [115]:
limit = 1000

users_with_1000_songs = list(filter(lambda pair: pair[1] >= limit, user_id_with_song_counts))
print(f"{len(users_with_1000_songs)} have at least {limit} songs, out of a total of {len(user_song_counts)} users")

890 have at least 1000 songs, out of a total of 992 users


## How many unique songs are there?

In [120]:
song_values = song_data['track-id'].value_counts().keys().tolist()
song_counts = song_data['track-id'].value_counts().tolist()
song_id_with_listen_counts = list(zip(song_values, song_counts))

In [128]:
limit = 10

songs_with_1000_listens = list(filter(lambda pair: pair[1] >= limit, song_id_with_listen_counts))
print(f"{len(songs_with_1000_listens)} songs have at least {limit} listens, out of a total of {len(song_counts)} songs")

268184 songs have at least 10 listens, out of a total of 960402 songs


## Generate user-song matrix

In [213]:
from sortedcontainers import SortedList

user_order = SortedList(sorted(set(song_data['userid'])))
artist_order = SortedList(sorted(set(song_data['artist-name'])))
user_song_matrix = np.zeros((len(user_order), len(artist_order)))
print(user_song_matrix.shape)

(992, 173923)


In [203]:
user_artist_df = song_data.loc[:, ['userid','artist-name']]
user_artist_df = user_artist_df.drop_duplicates()
print(user_artist_df.head())

         userid      artist-name
0   user_000001        Deep Dish
1   user_000001             坂本龍一
14  user_000001       Underworld
16  user_000001  Ennio Morricone
17  user_000001          Minus 8


In [None]:
for i, row in enumerate(user_artist_df.iterrows()):
    user_id, artist_name = row[1]

    row_num = user_order.index(user_id)
    col_num = artist_order.index(artist_name)
    user_song_matrix[row_num, col_num] = 1
    if i % 100000 == 0:
        print(i)

0
100000
200000


In [223]:
with open('user-id-to-listened-artist-name-matrix.csv', 'w') as outfile:
    df = pd.DataFrame(user_song_matrix.astype(int))
    column_names = ["userid"] + list(artist_order)
    df.insert(0, "userid", user_order)
    df.columns = column_names
    df.to_csv(outfile, sep='\t', encoding='utf-8')