# Brute Force Collaborative Filtering

Notebook to perform brute force collaborative filtering on the LastFM dataset. 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from multiprocessing import Pool
from sklearn.model_selection import train_test_split

## Read Data

In [3]:
data = pd.read_csv("~/Columbia/Personalization Theory/lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv", 
                   delimiter="\t", header=None, 
                   names = ["userid","timestamp","musicbrainz-artist-id",
                            "artist-name","musicbrainz-track-id","track-name"])

In [None]:
print("Total number of rows:", len(data))
data.head(5)

Total number of rows: 19098862


Unnamed: 0,userid,timestamp,musicbrainz-artist-id,artist-name,musicbrainz-track-id,track-name
0,user_000001,2009-05-04T23:08:57Z,f1b1cf71-bd35-4e99-8624-24a6e15f133a,Deep Dish,,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007
1,user_000001,2009-05-04T13:54:10Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Composition 0919 (Live_2009_4_15)
2,user_000001,2009-05-04T13:52:04Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc2 (Live_2009_4_15)
3,user_000001,2009-05-04T13:42:52Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Hibari (Live_2009_4_15)
4,user_000001,2009-05-04T13:42:11Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc1 (Live_2009_4_15)


## Group data into artists and users

In [None]:
user_artist_groups = data.groupby(['userid','artist-name'])['timestamp'].count().reset_index()
user_artist_groups.columns = ['userid', 'artist-name', 'count']

In [5]:
user_artist_groups.head(5)

Unnamed: 0,userid,artist-name,count
0,user_000001,2562,26
1,user_000001,310,1
2,user_000001,3582,1
3,user_000001,4 Wings,2
4,user_000001,4Hero,146


In [6]:
user_artist_groups.tail(5)

Unnamed: 0,userid,artist-name,count
897416,user_001000,Zigmat,1
897417,user_001000,Zion I,2
897418,user_001000,Zy-,2
897419,user_001000,浜崎あゆみ,1
897420,user_001000,長沼英樹,18


### Save the file!

Let's save this grouped data into a csv for later use

In [7]:
user_artist_groups.to_csv("users_and_artists.csv", index=False)

In [8]:
unique_users = np.unique(user_artist_groups["userid"])
unique_artists = np.unique(user_artist_groups["artist-name"])

In [9]:
print('Number of unique users:', len(unique_users))
print('Number of unique artists:', len(unique_artists))

Number of unique users: 992
Number of unique artists: 173923


## Making Train-Test splits

In [70]:
train_data = pd.DataFrame()
test_data = pd.DataFrame()

In [64]:
user_artist_groups = pd.read_csv("users_and_artists.csv")

In [73]:
for u in unique_users:
    user_u_all_data = user_artist_groups.loc[user_artist_groups["userid"]==u,:]
    user_u_train, user_u_test = train_test_split(user_u_all_data, test_size=0.20, random_state=42)
    train_data = train_data.append(user_u_train)
    test_data = test_data.append(user_u_test)

In [74]:
train_data

Unnamed: 0,userid,artist-name,count
227,user_000001,Hippiehaus,5
559,user_000001,The Black Dog,149
363,user_000001,Marvin Gaye,5
61,user_000001,Billy Joel,1
578,user_000001,The Verve,46
569,user_000001,The Lost Men,12
29,user_000001,Angela Bofill,4
265,user_000001,Jazzanova Feat. Pedro Martins & Azymuth,18
148,user_000001,Dj Rels,2
209,user_000001,"Grover Washington, Jr.",1


In [75]:
test_data

Unnamed: 0,userid,artist-name,count
627,user_000001,Yanokami,42
271,user_000001,Jeff Beck,15
290,user_000001,Jonny Greenwood,2
63,user_000001,Björk,448
302,user_000001,Ken Ishii,13
78,user_000001,Boom Boom Satellites,16
644,user_000001,大樹,71
576,user_000001,The Sugarhill Gang,1
72,user_000001,Bobby Cole,17
221,user_000001,Henry Mancini,2


In [76]:
train_data.to_csv("train_data.csv", index=False)
test_data.to_csv("test_data.csv", index=False)

## Make the matrix

In [10]:
count_matrix = pd.DataFrame(columns = unique_users, index=unique_artists)

In [11]:
count_matrix

Unnamed: 0,user_000001,user_000002,user_000003,user_000004,user_000005,user_000006,user_000007,user_000008,user_000009,user_000010,...,user_000991,user_000992,user_000993,user_000994,user_000995,user_000996,user_000997,user_000998,user_000999,user_001000
! Europe - France - Cold Wave,,,,,,,,,,,...,,,,,,,,,,
! Germany - Surfbeat From Beyond The Iron Curtain,,,,,,,,,,,...,,,,,,,,,,
! Www.Polskie-Mp3.Tk ! Adam Mi,,,,,,,,,,,...,,,,,,,,,,
! Www.Polskie-Mp3.Tk ! Breakout,,,,,,,,,,,...,,,,,,,,,,
! Www.Polskie-Mp3.Tk ! Budka Suflera,,,,,,,,,,,...,,,,,,,,,,
"! Www.Polskie-Mp3.Tk ! Jacek Kaczmarski, Gintrowski And Lapinski",,,,,,,,,,,...,,,,,,,,,,
! Www.Polskie-Mp3.Tk ! Jan Krzysztof Kelus,,,,,,,,,,,...,,,,,,,,,,
! Www.Polskie-Mp3.Tk ! Justyna I Piotr,,,,,,,,,,,...,,,,,,,,,,
! Www.Polskie-Mp3.Tk ! Katarzyna Groniec,,,,,,,,,,,...,,,,,,,,,,
! Www.Polskie-Mp3.Tk ! Lista Przebojow Programu Iii,,,,,,,,,,,...,,,,,,,,,,


### Fill up the values of the matrix

In [12]:
for index, row in user_artist_groups.iterrows():
    count_matrix.loc[row['artist-name'], row['userid']] = row['count']

In [56]:
count_matrix = count_matrix.astype(float)

In [57]:
np.sum(1 * np.array(count_matrix > 0))

897421

In [58]:
count_matrix.to_csv("count_matrix.csv", index = True)

In [17]:
del data, user_artist_groups #cleaning up some RAM

In [62]:
user_corr_matrix = count_matrix.corr()
          
user_corr_matrix

Unnamed: 0,user_000001,user_000002,user_000003,user_000004,user_000005,user_000006,user_000007,user_000008,user_000009,user_000010,...,user_000991,user_000992,user_000993,user_000994,user_000995,user_000996,user_000997,user_000998,user_000999,user_001000
user_000001,1.000000,-0.090134,-0.026829,0.309235,-0.002373,0.308227,-0.121920,-0.416848,0.160085,-0.417063,...,-0.049403,-0.056587,0.783740,-0.224025,-0.157792,0.205558,-0.256438,0.671444,0.055667,0.341907
user_000002,-0.090134,1.000000,0.099694,-0.027784,0.061601,0.151186,-0.192577,-0.146346,-0.056150,-0.089299,...,-0.019535,-0.025551,0.075208,0.345693,0.046023,-0.025776,-0.090054,0.192135,0.028812,0.179434
user_000003,-0.026829,0.099694,1.000000,0.130054,0.162888,0.006667,-0.030372,0.119836,0.083526,-0.024390,...,0.102088,0.055483,-0.057917,0.133891,0.017632,0.200785,-0.285323,0.159963,-0.036582,0.081204
user_000004,0.309235,-0.027784,0.130054,1.000000,0.164050,0.059561,0.112745,-0.148280,-0.050679,-0.060382,...,-0.022017,0.199521,0.232730,-0.028535,0.081863,0.728982,-0.061126,0.492791,-0.039710,0.224270
user_000005,-0.002373,0.061601,0.162888,0.164050,1.000000,-0.043063,-0.157275,-0.086733,-0.095991,0.115276,...,-0.059015,-0.041216,0.345460,0.061010,0.065527,-0.043353,-0.243611,-0.108384,-0.088623,-0.118620
user_000006,0.308227,0.151186,0.006667,0.059561,-0.043063,1.000000,0.320662,-0.099821,-0.057728,0.137523,...,0.152269,0.047809,-0.024460,0.059325,0.179098,0.662574,-0.243176,0.202674,0.324494,0.597567
user_000007,-0.121920,-0.192577,-0.030372,0.112745,-0.157275,0.320662,1.000000,-0.208861,0.145230,-0.196596,...,1.000000,0.221044,-0.227779,0.286215,0.223694,0.288338,1.000000,,-0.058439,-0.087480
user_000008,-0.416848,-0.146346,0.119836,-0.148280,-0.086733,-0.099821,-0.208861,1.000000,-0.123663,0.319665,...,-0.071571,-0.209382,-0.095496,-0.097834,-0.072882,-0.261641,-0.454848,-0.280168,-0.121159,-0.069525
user_000009,0.160085,-0.056150,0.083526,-0.050679,-0.095991,-0.057728,0.145230,-0.123663,1.000000,-0.109364,...,0.067666,0.088589,-0.139668,0.002103,-0.030382,-0.001325,-0.195364,-0.002187,0.015427,0.542334
user_000010,-0.417063,-0.089299,-0.024390,-0.060382,0.115276,0.137523,-0.196596,0.319665,-0.109364,1.000000,...,0.325020,-0.160076,-0.070094,-0.007308,-0.094242,-0.422850,,-0.195749,0.061001,-0.274807


In [29]:
N_users

10

In [30]:
users

['user_000001',
 'user_000002',
 'user_000003',
 'user_000004',
 'user_000005',
 'user_000006',
 'user_000007',
 'user_000008',
 'user_000009',
 'user_000010']