# Musical Recommender v1.0
#### Sample data is from DataCamp
#### Balazs Balogh - 2019

In [1]:
# Perform the necessary imports

import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix

from sklearn.decomposition import NMF
from sklearn.preprocessing import Normalizer, MaxAbsScaler
from sklearn.pipeline import make_pipeline

In [2]:
# Read the csv file

artists = pd.read_csv("https://raw.githubusercontent.com/budapestpy-workshops/sample_files/master/scrobbler-small-sample.csv")

artists.head()

Unnamed: 0,user_offset,artist_offset,playcount
0,1,79,58
1,1,84,80
2,1,86,317
3,1,89,64
4,1,96,159


In [3]:
# Basic information about the data

print(artists.info())
print('\nshape of the dataframe:', artists.shape)
print('\nnumber of different users:', artists['user_offset'].nunique())
print('number of different artists:', artists['artist_offset'].nunique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2894 entries, 0 to 2893
Data columns (total 3 columns):
user_offset      2894 non-null int64
artist_offset    2894 non-null int64
playcount        2894 non-null int64
dtypes: int64(3)
memory usage: 67.9 KB
None

shape of the dataframe: (2894, 3)

number of different users: 500
number of different artists: 111


#### We can see, that there are 3 columns, and 2894 rows in this csv. There are 500 different users, and 111 artists. For one artist there can be more playcounts from different users. 
#### Fortunately there are no NaN values.

In [4]:
# If we want a more visual representation of the dataframe I ordered it to artist_offset, so we can see that for one artist
# there should be more than one column.

artists.sort_values(by=['artist_offset', 'user_offset']).iloc[0:15, :]

Unnamed: 0,user_offset,artist_offset,playcount
21,2,0,105
96,15,0,165
147,20,0,91
151,21,0,98
208,29,0,120
366,48,0,236
512,70,0,67
671,95,0,77
685,96,0,93
759,109,0,98


#### We have to pivot the table to 'artist_offset', because this layout (2894, 3) is not the best for us. Then we will have one row for every artist, and 500 columns for every possible playcount. When there are no playcounts, we will fill the NaN values with 0-s.

In [5]:
artists_pivot = artists.pivot_table(index ='artist_offset', columns ='user_offset', fill_value=0) 

artists_pivot.head()

Unnamed: 0_level_0,playcount,playcount,playcount,playcount,playcount,playcount,playcount,playcount,playcount,playcount,playcount,playcount,playcount,playcount,playcount,playcount,playcount,playcount,playcount,playcount,playcount
user_offset,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
artist_offset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,0,0,105,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,128,211,0,0,0,0,0,0,0,0,...,0,0,0,270,0,105,97,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Now we have the desired shape.

artists_pivot.shape

(111, 500)

In [7]:
"""
We need a numpy array or csr_matrix for NMF. The csr_matrix comes from the scipy library, and it's good for matrices
with a lot of 0 values. It takes all the non-zero values.
https://machinelearningmastery.com/sparse-matrices-for-machine-learning/ - more info here
An example from the article above:

Sample numpy array:
[[1 0 0 1 0 0]
 [0 0 2 0 0 1]
 [0 0 0 2 0 0]]

csr_matrix:
  (0, 0) 1
  (0, 3) 1
  (1, 2) 2
  (1, 5) 1
  (2, 3) 2
  
"""

artists_csr = csr_matrix(artists_pivot)

artists_csr

<111x500 sparse matrix of type '<class 'numpy.int64'>'
	with 2894 stored elements in Compressed Sparse Row format>

In [8]:
# Create a MaxAbsScaler - Scale each feature by max value (by column).
# https://joshlawman.com/preparing-data-scaling-and-normalization/ - more info about scalers

scaler = MaxAbsScaler()

In [9]:
# NMF = Non-negative Matrix Factorization, a dimension reduction technique. All sample features must be non-negative.
# Components represent topics, and we have to tell NMF how many topics we want.
# https://mlexplained.com/2017/12/28/a-practical-introduction-to-nmf-nonnegative-matrix-factorization/ - more info

nmf = NMF(n_components=20)

In [10]:
# Create a Normalizer
normalizer = Normalizer()

# Make a pipeline for the easier data handling
pipeline = make_pipeline(scaler, nmf, normalizer)

norm_features = pipeline.fit_transform(artists_csr)

print("norm_features' shape:", norm_features.shape)

norm_features' shape: (111, 20)


#### Our artist names are only numbers now, so we have to give them their real names from artists.csv.

In [11]:
artist_names = pd.read_csv("https://raw.githubusercontent.com/budapestpy-workshops/sample_files/master/artists.csv", header=None)

# We need to convert the names to a list, to have them as index later.
artist_names_list = artist_names[0].tolist()

# The full list of the names to choose from
artist_names_list

['Massive Attack',
 'Sublime',
 'Beastie Boys',
 'Neil Young',
 'Dead Kennedys',
 'Orbital',
 'Miles Davis',
 'Leonard Cohen',
 'Van Morrison',
 'NOFX',
 'Rancid',
 'Lamb',
 'Korn',
 'Dropkick Murphys',
 'Bob Dylan',
 'Eminem',
 'Nirvana',
 'Van Halen',
 'Damien Rice',
 'Elvis Costello',
 'Everclear',
 'Jimi Hendrix',
 'PJ Harvey',
 'Red Hot Chili Peppers',
 'Ryan Adams',
 'Soundgarden',
 'The White Stripes',
 'Madonna',
 'Eric Clapton',
 'Bob Marley',
 'Dr. Dre',
 'The Flaming Lips',
 'Tom Waits',
 'Moby',
 'Cypress Hill',
 'Garbage',
 'Fear Factory',
 '50 Cent',
 'Ani DiFranco',
 'Matchbox Twenty',
 'The Police',
 'Eagles',
 'Phish',
 'Stone Temple Pilots',
 'Black Sabbath',
 'Britney Spears',
 'Fatboy Slim',
 'System of a Down',
 'Simon & Garfunkel',
 'Snoop Dogg',
 'Aimee Mann',
 'Less Than Jake',
 'Rammstein',
 'Reel Big Fish',
 'The Prodigy',
 'Pantera',
 'Foo Fighters',
 'The Beatles',
 'Incubus',
 'Audioslave',
 'Bright Eyes',
 'Machine Head',
 'AC/DC',
 'Dire Straits',
 'Motör

In [12]:
# Create a new dataframe from the calculated features and the artist names.

df = pd.DataFrame(norm_features, index=artist_names_list)

df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Massive Attack,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.823474,0.0,0.01608,0.222741,0.392904,0.262324,0.060975,0.0,0.0,0.0,0.0,0.212397,0.0
Sublime,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
Beastie Boys,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Neil Young,0.267967,0.00614,0.0,0.056177,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.959573,0.06228,0.0,0.018478,0.0
Dead Kennedys,0.0,0.095714,0.0,0.583789,0.0,0.0,0.0,0.0,0.0,0.744166,0.0,0.0,0.136139,0.0,0.0,0.075812,0.0,0.268263,0.0,0.0


In [13]:
# Select row of the artist you want
artist = df.loc['The Beatles']

# Compute cosine similarities
similarities = df.dot(artist)

# Display those with highest cosine similarity
print(similarities.nlargest())

The Beatles          1.000000
The Beach Boys       0.768938
Bruce Springsteen    0.484257
Phish                0.479310
Leonard Cohen        0.468106
dtype: float64


## Additional info:

In [14]:
# pandas.dot, Dot Product example, we need to multiply the values, and add them, that's the dot product: a1 * b1 + a2 * b2 ...

print(df.loc['Interpol'])
print()
print(df.loc['The Flaming Lips'])

interpol_np = df.loc['Interpol'].to_numpy()
tfl_np = df.loc['The Flaming Lips'].to_numpy()

print('\nDot product of Interpol and The Flaming Lips, which means the similarity based on the user reviews:',
      interpol_np.dot(tfl_np))

0     0.000000
1     0.000000
2     0.000000
3     0.000000
4     0.000000
5     0.000000
6     0.000000
7     0.000000
8     0.004156
9     0.000000
10    0.999991
11    0.000000
12    0.000000
13    0.000000
14    0.000000
15    0.000000
16    0.000000
17    0.000000
18    0.000000
19    0.000000
Name: Interpol, dtype: float64

0     0.348815
1     0.000000
2     0.000000
3     0.000000
4     0.000000
5     0.000000
6     0.000000
7     0.206745
8     0.000000
9     0.007245
10    0.627072
11    0.000000
12    0.543430
13    0.000000
14    0.000000
15    0.000000
16    0.211676
17    0.319670
18    0.000000
19    0.000000
Name: The Flaming Lips, dtype: float64

Dot product of Interpol and The Flaming Lips, which means the similarity based on the user reviews: 0.6270666179876194


In [15]:
# dot product example 2.

from numpy import array

a = array([1, 2, 3])
print(a)

b = array([1, 2, 3])
print(b)

c = a.dot(b)
print(c)

# prints out:
# [1 2 3]
# [1 2 3]
# 14

[1 2 3]
[1 2 3]
14
