In [1]:
import pandas as pd 
import numpy as np 
from scipy.sparse import csr_matrix

# Render floats with three decimal places in pandas output.
pd.set_option('display.float_format', '{:.3f}'.format)

In [2]:
# Load the per-user listening records. The TSV has no header row, so the
# column names are supplied explicitly; the MusicBrainz artist id column is
# named but not loaded (it is absent from `usecols`).
user_data = pd.read_table(
    '/Users/abhi/Recommender_Systems/recommend_music/lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv',
    header=None,
    nrows=20000000,  # `nrows` is documented as an int; the float 2e7 relied on lenient coercion
    names=['users', 'musicbrainz-artist-id', 'artist', 'plays'],
    usecols=['users', 'artist', 'plays'],
)
user_data.head()

Unnamed: 0,users,artist,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706


In [3]:
# Load the user profile table, keeping only the user id and country columns.
# The file has no header row, so all five column names are given up front.
user_profiles = pd.read_table(
    '/Users/abhi/Recommender_Systems/recommend_music/lastfm-dataset-360K/usersha1-profile.tsv',
    names=['users', 'gender', 'age', 'country', 'signup'],
    usecols=['users', 'country'],
    header=None,
)
user_profiles.head()

Unnamed: 0,users,country
0,00000c289a1829a808ac09c00daf10bc3c4e223b,Germany
1,00001411dc427966b17297bf4d69e7e193135d89,Canada
2,00004d2ac9316e22dc007ab2243d6fcb239e707d,Germany
3,000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,Mexico
4,00007a47085b9aab8af55f52ec8846ac479ac4fe,United States


In [4]:
# Drop listening records whose artist name is missing.
# (This is a null filter; the popularity filter happens in a later cell.)
if user_data['artist'].isnull().any():
    user_data = user_data.dropna(subset=['artist'])

In [5]:
# Total play count per artist across all users.
# Only the numeric 'plays' column is aggregated. Summing the whole frame also
# hands the string 'users' column to the aggregation, which is wasted work and,
# on pandas versions that no longer drop non-numeric columns silently, means
# concatenating every user hash per artist.
artist_plays = (
    user_data.groupby('artist')['plays']
    .sum()
    .reset_index()
    .rename(columns={'plays': 'total_artist_plays'})
)
artist_plays.head()

Unnamed: 0,artist,total_artist_plays
0,04)],6
1,2,1606
2,58725ab=>,23
3,80lİ yillarin tÜrkÇe sÖzlÜ aŞk Şarkilari,70
4,amy winehouse,23


In [6]:
# Attach each artist's overall play total to every (user, artist) record.
# A left join keeps every row of user_data.
user_data_with_artist_plays = pd.merge(
    user_data,
    artist_plays,
    how='left',
    on='artist',
)
user_data_with_artist_plays.head()

Unnamed: 0,users,artist,plays,total_artist_plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137,25651
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099,3704875
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897,180391
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717,410725
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706,90498


In [7]:
# Summary statistics of the per-artist play totals.
print(artist_plays.loc[:, 'total_artist_plays'].describe())

count     292364.000
mean       12907.037
std       185981.313
min            1.000
25%           53.000
50%          208.000
75%         1048.000
max     30466827.000
Name: total_artist_plays, dtype: float64


In [8]:
# Upper-tail quantiles (0.90 through 0.99 in steps of 0.01) of total plays per artist.
# np.linspace pins the number of points to exactly 10; np.arange with a
# floating-point step can gain or lose the last element through rounding.
artist_plays['total_artist_plays'].quantile(np.linspace(0.90, 0.99, 10))

0.900     6138.000
0.910     7410.000
0.920     9102.960
0.930    11475.590
0.940    14898.440
0.950    19964.250
0.960    28419.880
0.970    43541.330
0.980    79403.440
0.990   198482.590
Name: total_artist_plays, dtype: float64

In [9]:
# Minimum total plays an artist needs for its listening records to be kept.
popularity_threshold = 40000

# `@popularity_threshold` pulls the Python variable into the query expression.
user_data_popular_artists = user_data_with_artist_plays.query(
    'total_artist_plays >= @popularity_threshold'
)
user_data_popular_artists.head()

Unnamed: 0,users,artist,plays,total_artist_plays
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099,3704875
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897,180391
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717,410725
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706,90498
5,00000c289a1829a808ac09c00daf10bc3c4e223b,red hot chili peppers,691,13547741


In [10]:
# Add each user's country to the popular-artist records, then keep only the
# rows belonging to users from the United States.
combined = pd.merge(
    user_data_popular_artists,
    user_profiles,
    how='left',
    on='users',
)
usa_data = combined.query("country == 'United States'")
usa_data.head()

Unnamed: 0,users,artist,plays,total_artist_plays,country
156,00007a47085b9aab8af55f52ec8846ac479ac4fe,devendra banhart,456,2366807,United States
157,00007a47085b9aab8af55f52ec8846ac479ac4fe,boards of canada,407,6115545,United States
158,00007a47085b9aab8af55f52ec8846ac479ac4fe,cocorosie,386,2194862,United States
159,00007a47085b9aab8af55f52ec8846ac479ac4fe,aphex twin,213,4248296,United States
160,00007a47085b9aab8af55f52ec8846ac479ac4fe,animal collective,203,3495537,United States


In [11]:
# Remove repeated (user, artist) pairs and report how many rows were dropped.
# Nothing is printed or changed when there are no duplicates.
if usa_data.duplicated(['users', 'artist']).any():
    initial_rows = len(usa_data)
    print('Initial dataframe shape {0}'.format(usa_data.shape))

    usa_data = usa_data.drop_duplicates(['users', 'artist'])

    current_rows = len(usa_data)
    print('New dataframe shape {0}'.format(usa_data.shape))
    print('Removed {0} rows'.format(initial_rows - current_rows))

Initial dataframe shape (2788019, 5)
New dataframe shape (2788013, 5)
Removed 6 rows


# Implementing Nearest Neighbors Model 

In [13]:
# Reshape to one row per artist and one column per user, holding play counts.
# (artist, user) pairs with no record are filled with 0.
wide_artist_data = (
    usa_data
    .pivot(index='artist', columns='users', values='plays')
    .fillna(0)
)
# Sparse copy of the same values, used to fit the neighbour model.
wide_artist_data_sparse = csr_matrix(wide_artist_data.values)

In [14]:
# save the model for future 
from scipy.sparse import csr_matrix
    
def save_sparse_csr(filename, array):
    """Write the CSR components of ``array`` to ``filename`` via ``np.savez``.

    The archive holds four entries: ``data``, ``indices``, ``indptr`` and
    ``shape``, taken from the attributes of the same name on ``array``.
    """
    csr_parts = {
        'data': array.data,
        'indices': array.indices,
        'indptr': array.indptr,
        'shape': array.shape,
    }
    np.savez(filename, **csr_parts)

def load_sparse_csr(filename):
    """Rebuild a CSR matrix from an ``.npz`` archive of its components.

    Inputs:
    filename: path (or file object) passed to ``np.load``; the archive must
              contain the entries ``data``, ``indices``, ``indptr`` and ``shape``.

    Returns: a ``scipy.sparse.csr_matrix`` built from those four entries.
    """
    # np.load returns an NpzFile that keeps the archive open; the context
    # manager closes it once the arrays have been read into memory.
    with np.load(filename) as loader:
        # Plain Python ints keep the shape independent of the stored dtype.
        shape = tuple(int(dim) for dim in loader['shape'])
        return csr_matrix(
            (loader['data'], loader['indices'], loader['indptr']),
            shape=shape,
        )

# Persist the sparse play-count matrix.
# np.savez needs a file name: given the bare directory path ('.../model/') it
# appends '.npz' and writes a file literally named '.npz' inside the directory.
# 'model.npz' is the name the commented-out load_sparse_csr call in this
# notebook reads back.
save_sparse_csr('/Users/abhi/Recommender_Systems/recommend_music/model/model.npz',
               wide_artist_data_sparse)

In [None]:
#len(wide_artist_data)

In [None]:
#import cPickle as pickle

In [None]:
#filename = '/Users/abhi/Recommender_Systems/recommend_music/wide_artist_data.pickle'
#with open(filename,'wb') as fp:
 #   pickle.dump(wide_artist_data,fp)

In [4]:
#wide_artist_data_sparse = load_sparse_csr('/Users/abhi/Recommender_Systems/recommend_music/model/model.npz')

In [15]:
# Fitting the model
from sklearn.neighbors import NearestNeighbors

# Brute-force nearest-neighbour search using cosine distance, fitted on the
# sparse play-count matrix (one row per artist).
model_knn = NearestNeighbors(
    algorithm='brute',
    metric='cosine',
)
model_knn.fit(wide_artist_data_sparse)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [16]:
# Making recommendations for one randomly chosen artist row.
query_index = np.random.choice(wide_artist_data.shape[0])
print(query_index)
# Go through `.values` before reshaping: pandas Series no longer provides
# `.reshape`, while the underlying NumPy array does. kneighbors expects a 2-D
# (1, n_features) query, hence reshape(1, -1).
distances, indices = model_knn.kneighbors(
    wide_artist_data.iloc[query_index, :].values.reshape(1, -1),
    n_neighbors=6)
# Position 0 is used only for the header line; positions 1..5 are listed as
# the recommendations. NOTE(review): this presumes the closest neighbour is
# the query artist itself.
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}: \n'.format(wide_artist_data.index[query_index]))
    else:
        print('{0}: {1}, with distance: {2}'.format(i, wide_artist_data.index[indices.flatten()[i]],
                                                    distances.flatten()[i]))

3019
Recommendations for fu manchu: 

1: nebula, with distance: 0.499399557776
2: the atomic bitchwax, with distance: 0.539052743399
3: hermano, with distance: 0.609744577685
4: dozer, with distance: 0.612605053692
5: zeke, with distance: 0.617131966286


In [17]:
# Creating binary counts: np.sign turns every positive play count into 1 and
# leaves zeros at 0, so the matrix records only whether a user played an artist.
wide_artist_data_zero_one = np.sign(wide_artist_data)
wide_artist_data_zero_one_sparse = csr_matrix(wide_artist_data_zero_one.values)

# Persist the binary matrix alongside the play-count one.
save_sparse_csr(
    '/Users/abhi/Recommender_Systems/recommend_music/model/lastfm_sparse_artist_matrix_binary.npz',
    wide_artist_data_zero_one_sparse,
)

In [18]:
# Same brute-force cosine neighbour model, fitted on the binary matrix.
model_nn_binary = NearestNeighbors(
    algorithm='brute',
    metric='cosine',
)
model_nn_binary.fit(wide_artist_data_zero_one_sparse)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [19]:
import string
from fuzzywuzzy import fuzz



users,00007a47085b9aab8af55f52ec8846ac479ac4fe,0001a57568309b287363e72dc682e9a170ba6dc2,00024b5b85c40f990c28644d53257819980bf6bb,0002dd2154072434d26e5409faa591bfb260a01e,00032c7933e0eb05f2258f1147ef81a90f2d4d6c,00041cbfdd019b5431f926133266cc4ba38219bb,000429493d9716b66b02180d208d09b5b89fbe64,000701c3c006b091990162635b36b008c504c6a7,000752c87a61bc4247f5219b4769c347c0062c8a,0008538a0f505f72fdd66af3c4c71aef8d3bdea4,...,fff58a5c95280b7af63f9c552f9159b58ae5efa3,fff694cf332ed701dccbf17f1d9595ba8ad69f22,fff69e7cb53568c732909648527a778c31befec8,fff820efe22db6c868515436de82af39e013b910,fff89b6b5332f0f38996f11c88f908a3924926fe,fff9dc65e7f2763a7e8bce8d99cc1491c2ae4c6f,fffa9294e858a7c863b5ad363c748c2330d9bd45,fffa9d62caff0f038c7a35db70f109b1bba04a1d,fffaf6f9a1a3ad8bd0dff7b48b2eb9eef030fdee,fffe8c7f952d9b960a56ed4dcb40a415d924b224
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
!!!,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#####,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(+44),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(hed) planet earth,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
*nsync,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
def print_artist_recommendations(query_artist, artist_plays_matrix, knn_model, k):
    """
    Print the k nearest-neighbour artists for a fuzzily matched artist name.

    Inputs:
    query_artist: query artist name; compared case-insensitively against every
                  entry of the matrix index with fuzz.ratio, keeping scores >= 75
    artist_plays_matrix: artist play count dataframe (not the sparse one, the pandas dataframe),
                         indexed by artist name
    knn_model: our previously fitted sklearn knn model
    k: the number of nearest neighbors.

    Prints: the candidate matches, then the artist recommendations for the
            best-scoring match (or a "didn't match" message when there is none)
    Returns: None
    """
    query_lower = query_artist.lower()

    # Each entry is (artist name, fuzzy match ratio, row position in the matrix).
    # enumerate supplies the row position directly, instead of rebuilding the
    # index as a list and searching it for every match.
    ratio_tuples = []
    for position, artist_name in enumerate(artist_plays_matrix.index):
        ratio = fuzz.ratio(artist_name.lower(), query_lower)
        if ratio >= 75:
            ratio_tuples.append((artist_name, ratio, position))

    print('Possible matches: {0}\n'.format([(x[0], x[1]) for x in ratio_tuples]))

    # Explicit empty check instead of a bare `except:` around max(), which
    # would also hide unrelated errors.
    if not ratio_tuples:
        print('Your artist didn\'t match any artists in the data. Try again')
        return None

    # Row position of the best matched artist (first one wins on ties).
    query_index = max(ratio_tuples, key=lambda x: x[1])[2]

    # Reshape the underlying NumPy array: pandas Series no longer provides
    # `.reshape`. kneighbors expects a 2-D (1, n_features) query.
    query_vector = artist_plays_matrix.iloc[query_index, :].values.reshape(1, -1)
    distances, indices = knn_model.kneighbors(query_vector, n_neighbors=k + 1)
    distances = distances.flatten()
    indices = indices.flatten()

    # k + 1 neighbours are requested and position 0 is not listed.
    # NOTE(review): this presumes the closest neighbour is the query artist itself.
    for i in range(0, len(distances)):
        if i == 0:
            print('Recommendations for {0}:\n'.format(artist_plays_matrix.index[query_index]))
        else:
            print('{0}: {1}, with distance of {2}:'.format(i, artist_plays_matrix.index[indices[i]], distances[i]))

    return None

In [24]:
# Ten recommendations for an exactly spelled artist, using the binary model.
print_artist_recommendations(
    query_artist='red hot chili peppers',
    artist_plays_matrix=wide_artist_data_zero_one,
    knn_model=model_nn_binary,
    k=10,
)

Possible matches: [('red hot chili peppers', 100)]

Recommendations for red hot chili peppers:

1: incubus, with distance of 0.686632912166:
2: the beatles, with distance of 0.693856742888:
3: sublime, with distance of 0.70540037526:
4: foo fighters, with distance of 0.71155686859:
5: coldplay, with distance of 0.716691422348:
6: led zeppelin, with distance of 0.722488787624:
7: nirvana, with distance of 0.724943983169:
8: green day, with distance of 0.734603813118:
9: radiohead, with distance of 0.737372302802:
10: rage against the machine, with distance of 0.740136491957:


In [32]:
# Same call with a misspelled name, to exercise the fuzzy matching.
print_artist_recommendations(
    query_artist='emi ne m',
    artist_plays_matrix=wide_artist_data_zero_one,
    knn_model=model_nn_binary,
    k=10,
)

Possible matches: [('eminem', 86)]

Recommendations for eminem:

1: 50 cent, with distance of 0.668324212947:
2: 2pac, with distance of 0.693505542237:
3: ludacris, with distance of 0.707377082885:
4: jay-z, with distance of 0.709177423306:
5: the game, with distance of 0.716550022387:
6: t.i., with distance of 0.725279375768:
7: kanye west, with distance of 0.726752502358:
8: nas, with distance of 0.740194341248:
9: notorious b.i.g., with distance of 0.746848783097:
10: dr. dre, with distance of 0.747268461847:


In [31]:
# Shape of the binary matrix: (number of artists, number of users), since the
# pivot used 'artist' as the index and 'users' as the columns.
wide_artist_data_zero_one.shape

(9127, 66913)