# Musical Recommender v1.0
#### Sample data is from DataCamp
#### Balazs Balogh - 2019

In [None]:
# Perform the necessary imports

import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix

from sklearn.decomposition import NMF
from sklearn.preprocessing import Normalizer, MaxAbsScaler
from sklearn.pipeline import make_pipeline

In [None]:
# Read the play-count CSV file into the `artists` dataframe, then inspect its first rows with head().
# NOTE: both `...` below appear to be exercise placeholders that still have to be filled in.

artists = ...

...

In [None]:
# Basic information about the data

# DataFrame.info() prints its summary itself and returns None, so it is called directly:
# wrapping it in print() would add a stray "None" line to the output.
artists.info()
print('\nshape of the dataframe:', ...)
print('\nnumber of different users:', ...)
print('number of different artists:', ...)

#### We can see that there are 3 columns and 2894 rows in this CSV. There are 500 different users and 111 artists. One artist can have several playcounts, one from each user who listened to them.
#### Fortunately, there are no NaN values.

In [None]:
# For a more visual representation of the dataframe, sort it by artist_offset (and then user_offset), so we can see
# that one artist can appear in more than one row.

artists. ...(by=['artist_offset', 'user_offset']).iloc[0:15, :]

#### We have to pivot the table on 'artist_offset', because the current layout (2894, 3) is not the best for us. After pivoting we will have one row for every artist and 500 columns, one for each user's playcount. Where a user has no playcount for an artist, we fill the NaN value with 0.

In [None]:
# Pivot the long table; the '...' strings are placeholders for the real column names.
# Cells that have no data after pivoting are filled with 0 instead of NaN.
artists_pivot = artists.pivot_table(
    index='...',
    columns='...',
    fill_value=0,
)

artists_pivot.head()

In [None]:
# Check that the pivoted table now has the desired shape (one row per artist, one column per user).

artists_pivot. ...

In [None]:
"""
We need a numpy array or a csr_matrix for NMF. The csr_matrix (compressed sparse row matrix) comes from the scipy
library, and it's good for matrices with a lot of 0 values, because it stores only the non-zero values.
More info: https://machinelearningmastery.com/sparse-matrices-for-machine-learning/
An example from the article above:

Sample numpy array:
[[1 0 0 1 0 0]
 [0 0 2 0 0 1]
 [0 0 0 2 0 0]]

csr_matrix:
  (0, 0) 1
  (0, 3) 1
  (1, 2) 2
  (1, 5) 1
  (2, 3) 2
  
"""

# Convert the pivoted table into a scipy sparse (CSR) matrix.
artists_csr = csr_matrix(artists_pivot)

# Bare expression: when run as a notebook cell, this displays the matrix summary as the cell output.
artists_csr

In [None]:
# Create a MaxAbsScaler: it scales each feature (column) by that column's maximum absolute value.
# More info about scalers: https://joshlawman.com/preparing-data-scaling-and-normalization/

scaler = MaxAbsScaler()

In [None]:
# NMF = Non-negative Matrix Factorization, a dimension reduction technique. All sample features must be non-negative.
# The components represent topics, and we have to tell NMF how many topics we want (here: 20).
# More info: https://mlexplained.com/2017/12/28/a-practical-introduction-to-nmf-nonnegative-matrix-factorization/
# NOTE(review): no random_state is passed, so the result may differ slightly between runs - confirm whether
# reproducibility matters here.

nmf = NMF(n_components=20)

In [None]:
# Create a Normalizer: it rescales each sample (row) to unit norm, which is what lets a plain dot product
# between two rows act as their cosine similarity later on.
normalizer = Normalizer()

# Make a pipeline for easier data handling.
# NOTE: `...` is a placeholder; presumably the scaler, nmf and normalizer objects are chained with
# make_pipeline - confirm when filling it in.
pipeline = ...

# NOTE: `...` is a placeholder; presumably the pipeline's fit_transform is applied to artists_csr - confirm.
norm_features = ... # fit_transform is a common method for pipelines

# Print the shape of the transformed feature matrix as a sanity check.
print("norm_features' shape:", norm_features.shape)

#### Our artists are identified only by numbers so far, so we have to give them their real names from artists.csv.

In [None]:
# Read the artist names WITHOUT headers (the `...` values below appear to be placeholders to be filled in).

artist_names = ...

# Convert the names to a list, so that they can be used as the dataframe index later.
artist_names_list = ...

# Display the full list of names to choose from.
artist_names_list

In [None]:
# Create a new dataframe from the calculated features, using the artist names as the row index
# so that a row can be looked up by artist name.

df = pd.DataFrame(norm_features, index=artist_names_list)

# Preview the first rows.
df.head()

In [None]:
# Pick the feature row of the artist we want recommendations for.
artist = df.loc['The Beatles']

# Cosine similarities: dot product of every artist's feature row with the chosen row.
similarities = df @ artist

# Show the artists with the highest cosine similarity (nlargest keeps the top 5 by default).
print(similarities.nlargest(n=5))

## Additional info:

In [None]:
# Dot product example: multiply the values pairwise and add them up - a1 * b1 + a2 * b2 ...
# The calculation itself is done on the numpy arrays taken from the two dataframe rows.

# Show both feature rows, separated by an empty line.
print(df.loc['Interpol'], df.loc['The Flaming Lips'], sep='\n\n')

interpol_np = df.loc['Interpol'].to_numpy()
tfl_np = df.loc['The Flaming Lips'].to_numpy()

# NOTE(review): the message below says "user reviews", while the data are described as playcounts elsewhere
# in this notebook - confirm the wording.
interpol_tfl_dot = interpol_np.dot(tfl_np)
print('\nDot product of Interpol and The Flaming Lips, which means the similarity based on the user reviews:',
      interpol_tfl_dot)

In [None]:
# Dot product example 2: the same calculation on two small, hand-made vectors.
# numpy is already imported as `np` at the top of the notebook, so no extra mid-file import is needed.

a = np.array([1, 2, 3])
print(a)

b = np.array([1, 2, 3])
print(b)

# 1 * 1 + 2 * 2 + 3 * 3 = 14
c = a.dot(b)
print(c)

# prints out:
# [1 2 3]
# [1 2 3]
# 14