# Building a song recommender

# Fire up Packages

In [1]:
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

# Load music data

In [3]:
song_data=pd.read_csv('song_data.csv')

# Explore data

Music data shows how many times a user listened to a song, as well as the details of the song.

In [4]:
song_data.head()

Unnamed: 0,user_id,song_id,listen_count,title,artist,song
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Jack Johnson,The Cove - Jack Johnson
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Paco De Lucia,Entre Dos Aguas - Paco De Lucia
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Kanye West,Stronger - Kanye West
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,Jack Johnson,Constellations - Jack Johnson
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,Foo Fighters,Learn To Fly - Foo Fighters


In [5]:
song_data.shape

(1116609, 6)

## Showing the most popular songs in the dataset

In [6]:
song_data.song.value_counts().head(10)

Sehr kosmisch - Harmonia                                                                                                                        5970
Undo - Björk                                                                                                                                    5281
You\'re The One - Dwight Yoakam                                                                                                                 4806
Dog Days Are Over (Radio Edit) - Florence + The Machine                                                                                         4536
Revelry - Kings Of Leon                                                                                                                         4339
Horn Concerto No. 4 in E flat K495: II. Romance (Andante cantabile) - Barry Tuckwell/Academy of St Martin-in-the-Fields/Sir Neville Marriner    3949
Secrets - OneRepublic                                                                                     

# User-Item Collaborative Filtering
In this case, the recommendation system should be built with the method of user-item collaborative filtering. Namely, the recommendation should be backed up by: “Customers who are similar to you also liked …”.

## A: Recommend Songs


### Build user-item matrix 

In [7]:
song=song_data.drop_duplicates(['user_id','song','title','artist'])
song=song.reset_index(drop=True)

In [8]:
song.shape

(1115985, 6)

### Due to the memory limitation of Jupyter notebook, I just use first 100000 rows of data. 

In [9]:
n_user=len(song['user_id'].unique())
n_song=len(song['song_id'].unique())
n_artist=len(song['artist'].unique())
print ('We have '+str(n_user)+' unique users,'+str(n_song)+' unique songs and '+str(n_artist)+' unique artist in the data table.')

We have 66346 unique users,10000 unique songs and 3375 unique artist in the data table.


### User_item matrix can be created by pivot table. 

In [10]:
song_pivot=song.pivot(index='user_id',columns='song_id',values='listen_count')

In [11]:
song_pivot.shape

(66346, 10000)

In [12]:
song_pivot=song_pivot.fillna(0)

In [13]:
song_pivot.head(5)

song_id,SOAAAGQ12A8C1420C8,SOAACPJ12A81C21360,SOAACSG12AB018DC80,SOAAEJI12AB0188AB5,SOAAFAC12A67ADF7EB,SOAAFYH12A8C13717A,SOAAJMQ12A6D4F7D17,SOAAKPM12A58A77210,SOAALWN12A6D4F7FDA,SOAAMOW12AB018149B,...,SOZZKPR12A6D4F8147,SOZZLTY12A67AE0AD0,SOZZLZN12A8AE48D6D,SOZZRHE12A6702165F,SOZZTCU12AB0182C58,SOZZTNF12A8C139916,SOZZVWB12AB0189C30,SOZZWZV12A67AE140F,SOZZYAO12A6701FF36,SOZZZPV12A8C1444B5
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00003a4459f33b92906be11abe0e93efc423c0ff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00005c6177188f12fb5e2e82cdbd93e8a3f35e64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00030033e3a2f904a48ec1dd53019c9969b6ef1f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0007235c769e610e3d339a17818a5708e41008d9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0007c0e74728ca9ef0fe4eb7f75732e8026a278b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Implement KNN method to recommend songs

In [14]:
from sklearn.neighbors import NearestNeighbors
knn=NearestNeighbors(n_neighbors=20,algorithm='brute',metric='cosine')
kmodel=knn.fit(song_pivot)

### Let us try a user

In [15]:
# change a pivot to DataFrame!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
song_pivot=song_pivot.reset_index(drop=True)

In [16]:
song_pivot.head(5)

song_id,SOAAAGQ12A8C1420C8,SOAACPJ12A81C21360,SOAACSG12AB018DC80,SOAAEJI12AB0188AB5,SOAAFAC12A67ADF7EB,SOAAFYH12A8C13717A,SOAAJMQ12A6D4F7D17,SOAAKPM12A58A77210,SOAALWN12A6D4F7FDA,SOAAMOW12AB018149B,...,SOZZKPR12A6D4F8147,SOZZLTY12A67AE0AD0,SOZZLZN12A8AE48D6D,SOZZRHE12A6702165F,SOZZTCU12AB0182C58,SOZZTNF12A8C139916,SOZZVWB12AB0189C30,SOZZWZV12A67AE140F,SOZZYAO12A6701FF36,SOZZZPV12A8C1444B5
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
kneighbors=kmodel.kneighbors(song_pivot.ix[1].values.reshape(1, -1))

In [18]:
print(kneighbors,kneighbors[1])

(array([[  2.22044605e-16,   3.16869949e-01,   4.47842370e-01,
          5.00000000e-01,   6.37857016e-01,   6.51257084e-01,
          6.62836282e-01,   6.66666667e-01,   6.72185764e-01,
          6.78366240e-01,   6.84550163e-01,   6.88195218e-01,
          7.05825797e-01,   7.25913042e-01,   7.29166667e-01,
          7.32028900e-01,   7.37199835e-01,   7.46416755e-01,
          7.73544593e-01,   7.73866492e-01]]), array([[    1, 38931, 60477, 41873, 65646, 33107, 16998, 49361,  7506,
        59869, 45327, 49752, 35205, 37806, 19504, 36494, 41770, 36358,
         1981, 28473]])) [[    1 38931 60477 41873 65646 33107 16998 49361  7506 59869 45327 49752
  35205 37806 19504 36494 41770 36358  1981 28473]]


In [19]:
User_Index=kneighbors[1][0]

### Now we have a list of indexes of the users that are similar to the user we want to recommend the songs. We will find out who are they and what songs do they like to listen. After that, we can create a list with the songs we want to recommend.


In [20]:
All_user=song['user_id'].unique()
Others=User_Index[1:]
Relevant_user=All_user[Others]

In [67]:
type(Relevant_user)

numpy.ndarray

In [103]:
song_relevant=song[song['user_id'].isin(Relevant_user)]
song_relevant.head()

Unnamed: 0,user_id,song_id,listen_count,title,artist,song
33481,09aa560703dff8a927955ac96b947577380807f8,SOILROA12A6D4FD0D7,3,Through The Fire And Flames (Album Version),Dragonforce,Through The Fire And Flames (Album Version) - ...
33482,09aa560703dff8a927955ac96b947577380807f8,SOLXXZI12A8AE4733A,1,Helena (So Long & Goodnight) (Album Version),My Chemical Romance,Helena (So Long & Goodnight) (Album Version) -...
33483,09aa560703dff8a927955ac96b947577380807f8,SOOIFDD12A8C13C468,1,Monsters (Album Version),Matchbook Romance,Monsters (Album Version) - Matchbook Romance
33484,09aa560703dff8a927955ac96b947577380807f8,SOOXMSN12A58A7A8D3,1,To The End (Album Version),My Chemical Romance,To The End (Album Version) - My Chemical Romance
33485,09aa560703dff8a927955ac96b947577380807f8,SOPKPSQ12A58A7A5E4,3,I\'m Not Okay (I Promise) (Live From Sessions@...,My Chemical Romance,I\'m Not Okay (I Promise) (Live From Sessions@...


In [22]:
song_list=song_relevant.groupby('song').listen_count.sum()

In [23]:
song_list.sort_values(inplace=True)

### The top 30 Recommended songs for the specific user


In [24]:
Recommended_Song=song_list[:-30:-1]
print (Recommended_Song)

song
My Immortal (Album Version) - Evanescence                                  29
Gypsy Woman (She\'s Homeless) - Crystal Waters                             27
Drop It Low - Ester Dean / Chris Brown                                     23
Electric Avenue - Eddy Grant                                               21
Forever - Drake / Kanye West / Lil Wayne / Eminem                          17
Because The Night [MTV Unplugged Version] - 10_000 Maniacs                 17
Frisch und g\'sund - Die Mooskirchner                                      14
Somebody To Love - Justin Bieber                                           14
Better The Devil You Know (Showgirl Tour) - Kylie Minogue                  13
Let Me Think About It - Ida Corr Vs Fedde Le Grand                         12
Please Mr. Postman - Carpenters                                            12
Pojo Pojo - Cyberfit                                                       11
Picture U & Me - Mo B. Dick                                

## B: Recommend Artists 

### Count how many times the users listen the music of artist


In [25]:
artist_count=song.groupby(['user_id','artist'],as_index=False).size().reset_index(name='count')

In [42]:
artist_count.sort_values(by='count',inplace=True)

In [72]:
artist_count.head()

Unnamed: 0,user_id,artist,count
0,00003a4459f33b92906be11abe0e93efc423c0ff,Black Eyed Peas,1
540097,a4481c01213c866b1158f26b74a5ae5f9d536b4f,Metric,1
540098,a4481c01213c866b1158f26b74a5ae5f9d536b4f,Michael Bublé,1
540099,a4481c01213c866b1158f26b74a5ae5f9d536b4f,Miley Cyrus,1
540100,a4481c01213c866b1158f26b74a5ae5f9d536b4f,Nancy Sinatra,1


### Create user artists matrix 

In [27]:
pivot_artist=artist_count.pivot(index='user_id',columns='artist',values='count')

In [28]:
pivot_artist=pivot_artist.fillna(0)

In [29]:
Model_artist=knn.fit(pivot_artist)

### Try a user 

In [30]:
Neighbours=Model_artist.kneighbors(pivot_artist.ix[0].values.reshape(1, -1))

In [55]:
Neighbours

(array([[  3.33066907e-16,   4.92907447e-01,   5.99108137e-01,
           6.22035527e-01,   6.22035527e-01,   6.22035527e-01,
           6.22035527e-01,   6.22035527e-01,   6.29671960e-01,
           6.32116396e-01,   6.33320601e-01,   6.41431417e-01,
           6.55415606e-01,   6.61938298e-01,   6.61938298e-01,
           6.61938298e-01,   6.61938298e-01,   6.61938298e-01,
           6.65923448e-01,   6.66666667e-01]]),
 array([[    0, 42005, 66102, 61178, 16745,  5414, 35436, 40071,  4333,
         59842, 30193, 15398,  1083, 24670, 30702, 14188, 19281, 15859,
         33651, 21848]]))

In [31]:
Neighbour_index=Neighbours[1][0]

In [54]:
Art_others=Neighbours[1:]
Art_others

(array([[    0, 42005, 66102, 61178, 16745,  5414, 35436, 40071,  4333,
         59842, 30193, 15398,  1083, 24670, 30702, 14188, 19281, 15859,
         33651, 21848]]),)

In [56]:
All_user2=song['artist'].unique()
Others2=Art_others[1:]
Relevant_user2=All_user2[Others2]

In [85]:
Relevant_user2

array(['Jack Johnson', 'Paco De Lucia', 'Kanye West', ...,
       "Mama\\'s Jasje", 'Elvis Perkins', 'Mott The Hoople'], dtype=object)

In [86]:
All_user_artist=artist_count[artist_count['artist'].isin (Relevant_user2)]


In [90]:
All_user_artist.sort_values(by='count',ascending=False,inplace=True)

In [91]:
All_user_artist.head(10)

Unnamed: 0,user_id,artist,count
278885,54e50d606af8ead2f8c23da08247071c897926d2,The New Pornographers,48
551626,a7bc28aa730f4247c850b0235505a276acdec825,Daft Punk,45
440843,868955d3452b7d4c06f7a3cf652685bf9aa030c9,Coldplay,43
433121,843db08e0497f08e1ea288943d4b4816280f0f09,The Black Keys,42
363235,6f153a78ba2cad9524a9b9db71494b0dd2acf252,The Black Keys,40
661122,c96cb1375b5d1f8bd88c84d7854dfbca1a6a7698,Gorillaz,38
109523,21a440c1537cd0b3d00735bc1d2d6d887ac119b5,Coldplay,38
525827,9fe6abb01de3165b478b2fb3dc5dbc3d531ef8ec,The Black Keys,38
743620,e2ffd660f074710961b5e214dfdc1cb19753b793,The Black Keys,36
481797,92da4d090f5129114655632f6108088e3ecd077e,Muse,35


In [92]:
Recommended_artists = All_user_artist['artist'][:10]
print (Recommended_artists)

278885    The New Pornographers
551626                Daft Punk
440843                 Coldplay
433121           The Black Keys
363235           The Black Keys
661122                 Gorillaz
109523                 Coldplay
525827           The Black Keys
743620           The Black Keys
481797                     Muse
Name: artist, dtype: object


# Assignment

In [38]:
len(song[song.artist=='Kanye West'].user_id.unique())

2522

In [39]:
len(song[song.artist=='Foo Fighters'].user_id.unique())

2055

In [40]:
len(song[song.artist=='Taylor Swift'].user_id.unique())

3246

In [41]:
len(song[song.artist=='Lady GaGa'].user_id.unique())

2928

In [102]:
song.groupby('artist').listen_count.sum().sort_values()

artist
William Tabbert                                                              14
Reel Feelings                                                                24
Beyoncé feat. Bun B and Slim Thug                                            26
Boggle Karaoke                                                               30
Diplo                                                                        30
harvey summers                                                               31
Nâdiya                                                                       36
Kanye West / Talib Kweli / Q-Tip / Common / Rhymefest                        38
Jody Bernal                                                                  38
Aneta Langerova                                                              38
John Altman                                                                  39
Trademark                                                                    40
Lloyd / Ashanti / Scarface       