# Musical Recommender v1.0
#### Sample data is from DataCamp

In [3]:
# Perform the necessary imports

import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix

from sklearn.decomposition import NMF
from sklearn.preprocessing import Normalizer, MaxAbsScaler
from sklearn.pipeline import make_pipeline

In [4]:
# Change it to the github!

artists = pd.read_csv("C:\\Users\\bbalogh7\\OneDrive\\DatasRev_fileok\\Musical_artists_-_DataCamp_Unsup_learning\\scrobbler-small-sample.csv")

artists.head()

Unnamed: 0,user_offset,artist_offset,playcount
0,1,79,58
1,1,84,80
2,1,86,317
3,1,89,64
4,1,96,159


In [20]:
print('summary for artists:', artists.info())
print('\nshape of the dataframe:', artists.shape)
print('\nnumber of different users:', artists['user_offset'].nunique())
print('number of different artists:', artists['artist_offset'].nunique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2894 entries, 0 to 2893
Data columns (total 3 columns):
user_offset      2894 non-null int64
artist_offset    2894 non-null int64
playcount        2894 non-null int64
dtypes: int64(3)
memory usage: 67.9 KB
summary for artists: None

shape of the dataframe: (2894, 3)

number of different users: 500
number of different artists: 111


#### We can see, that there are 3 columns, and 2894 rows in this csv. There are 500 different users, and 111 artists. For one artist there can be more playcounts from different users. 
#### Fortunately there are no NaN values.

In [8]:
# If we want a more visual representation of the dataframe I ordered it to artist_offset, so we can see that for one artist
# there should be more than one column.

artists.sort_values(by=['artist_offset', 'user_offset']).iloc[0:15, :]

Unnamed: 0,user_offset,artist_offset,playcount
21,2,0,105
96,15,0,165
147,20,0,91
151,21,0,98
208,29,0,120
366,48,0,236
512,70,0,67
671,95,0,77
685,96,0,93
759,109,0,98


#### This format (2894, 3) is not good for us, so we have to pivot the table to 'artist_offset'. Then we will have one row for every artist, and 500 columns for every possible playcount. When there are no playcounts, we will fill the NaN values with 0-s.

In [21]:
artists_pivot = artists.pivot_table(index ='artist_offset', columns ='user_offset', fill_value=0) 

artists_pivot.head()

Unnamed: 0_level_0,playcount,playcount,playcount,playcount,playcount,playcount,playcount,playcount,playcount,playcount,playcount,playcount,playcount,playcount,playcount,playcount,playcount,playcount,playcount,playcount,playcount
user_offset,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
artist_offset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,0,0,105,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,128,211,0,0,0,0,0,0,0,0,...,0,0,0,270,0,105,97,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Now we have the desired shape.

artists_pivot.shape

(111, 500)

In [24]:
artists_csr = csr_matrix(artists_pivot)

artists_csr

<111x500 sparse matrix of type '<class 'numpy.int64'>'
	with 2894 stored elements in Compressed Sparse Row format>

In [25]:
# Create a MaxAbsScaler: scaler
scaler = MaxAbsScaler()

# Create an NMF model: nmf
nmf = NMF(n_components=20)

# Create a Normalizer: normalizer
normalizer = Normalizer()

# Make a pipeline for the easier data handling
pipeline = make_pipeline(scaler, nmf, normalizer)

norm_features = pipeline.fit_transform(artists_csr)

print("norm_features' shape:", norm_features.shape)

norm_features' shape: (111, 20)


#### Our artist names are only numbers now, so we have to give them their real names from artists.csv.

In [30]:
artist_names = pd.read_csv("C:\\Users\\bbalogh7\\OneDrive\\DatasRev_fileok\\Musical_artists_-_DataCamp_Unsup_learning\\artists.csv", header=None)

artist_names_list = artist_names[0].tolist()

# The full list of the names to choose from
artist_names_list

['Massive Attack',
 'Sublime',
 'Beastie Boys',
 'Neil Young',
 'Dead Kennedys',
 'Orbital',
 'Miles Davis',
 'Leonard Cohen',
 'Van Morrison',
 'NOFX',
 'Rancid',
 'Lamb',
 'Korn',
 'Dropkick Murphys',
 'Bob Dylan',
 'Eminem',
 'Nirvana',
 'Van Halen',
 'Damien Rice',
 'Elvis Costello',
 'Everclear',
 'Jimi Hendrix',
 'PJ Harvey',
 'Red Hot Chili Peppers',
 'Ryan Adams',
 'Soundgarden',
 'The White Stripes',
 'Madonna',
 'Eric Clapton',
 'Bob Marley',
 'Dr. Dre',
 'The Flaming Lips',
 'Tom Waits',
 'Moby',
 'Cypress Hill',
 'Garbage',
 'Fear Factory',
 '50 Cent',
 'Ani DiFranco',
 'Matchbox Twenty',
 'The Police',
 'Eagles',
 'Phish',
 'Stone Temple Pilots',
 'Black Sabbath',
 'Britney Spears',
 'Fatboy Slim',
 'System of a Down',
 'Simon & Garfunkel',
 'Snoop Dogg',
 'Aimee Mann',
 'Less Than Jake',
 'Rammstein',
 'Reel Big Fish',
 'The Prodigy',
 'Pantera',
 'Foo Fighters',
 'The Beatles',
 'Incubus',
 'Audioslave',
 'Bright Eyes',
 'Machine Head',
 'AC/DC',
 'Dire Straits',
 'Motör

In [29]:
# Create a new dataframe from the calculated features and the artist names.

df = pd.DataFrame(norm_features, index=artist_names_list)

df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Massive Attack,0.0,0.0,0.0,0.0,0.005802,0.0,0.0,0.055953,0.0,0.0,0.005263,0.0,0.001415,0.997994,0.0,0.0,0.0,0.0,0.028547,0.0
Sublime,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005689,0.0,0.0,0.0,0.0,0.999984,0.0
Beastie Boys,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Neil Young,0.262535,0.0,0.0,0.056488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.960995,0.063939,0.0,0.016869,0.0
Dead Kennedys,0.0,0.013429,0.0,0.58078,0.0,0.0,0.0,0.0,0.0,0.74629,0.0,0.0,0.137658,0.0,0.0,0.080923,0.0,0.282962,0.0,0.0


In [31]:
# Select row of the artist you want
artist = df.loc['Interpol']

# Compute cosine similarities: similarities
similarities = df.dot(artist)

# Display those with highest cosine similarity
print(similarities.nlargest())

Interpol                 1.000000
The Flaming Lips         0.639217
Hot Hot Heat             0.554635
Death From Above 1979    0.453017
Mirah                    0.395152
dtype: float64


In [None]:
# utánanézni az NMF-nek, a df.dot-nak és hogy mi lenne Scaler nélkül.