In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
# Silence every warning for the rest of the session. No category is given, so this
# also hides RuntimeWarnings such as NumPy's divide-by-zero / invalid-value warnings.
warnings.filterwarnings('ignore')

In [2]:
# Directory holding the collaborative-filtering CSV inputs, followed by the full
# paths of the three files read from it.
save_dir = "../webUI/static/collab"
track_file, user_file, user_item_interactions = (
    os.path.join(save_dir, csv_name)
    for csv_name in ("IndianMusicTracks.csv", "UserList.csv", "user_item_interactions.csv")
)

In [3]:
# Load the track catalogue, the user list and the user-item interaction table, in that
# order. In every file the first CSV column becomes the DataFrame index.
# (Earlier drafts restricted the columns with usecols: track_id / artist_name / track_title
# for the tracks and user_id / user_name for the users; all columns are read now.)
track_df, user_df, user_item_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (track_file, user_file, user_item_interactions)
)

In [7]:
# Quick check: a single row selected with .iloc[0] comes back as a pandas Series.
print(type(user_item_df.iloc[0]))

<class 'pandas.core.series.Series'>


In [4]:
# Get the user-item interaction matrix in NumPy form: the DataFrame's values as a plain
# ndarray, without the index or column labels.
# NOTE(review): presumably one row per user and one column per track - confirm against the CSV.
user_item_matrix = user_item_df.to_numpy()

In [5]:
# Get the SVD decomposition of the user_item_matrix, with the matrix of user vectors represented by P
# and the matrix of item vectors represented by Q (np.linalg.svd returns it already transposed, as Qh).
# S holds the singular values as a 1-D array, not as a diagonal matrix.
# NOTE(review): full_matrices defaults to True, so P and Qh come back square; the rows of Qh
# beyond len(S) are not paired with any singular value.

P, S, Qh = np.linalg.svd(user_item_matrix)

In [6]:
# Left singular vectors returned by the SVD (the user-vector matrix).
P

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [7]:
# Shape of the (transposed) item-vector matrix returned by the SVD.
Qh.shape

(12, 12)

In [8]:
# Singular values returned by the SVD.
S

array([1.73205081, 1.73205081, 1.73205081, 1.73205081])

In [9]:
# Sanity check: multiply P, diag(S) and the first four rows of Qh together and print the
# product, which should reproduce the original user-item matrix.

print(P @ np.diag(S) @ Qh[:4, :])

[[1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1.]]


In [10]:
# First four columns of Qh: every row, restricted to the first four songs.
Qh[:,:4]

array([[ 0.57735027,  0.57735027,  0.57735027,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.57735027],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.33333333, -0.4553418 ,  0.12200847, -0.33333333],
       [ 0.33333333, -0.4553418 ,  0.12200847, -0.33333333],
       [ 0.33333333,  0.12200847, -0.4553418 ,  0.        ],
       [ 0.33333333,  0.12200847, -0.4553418 ,  0.        ],
       [ 0.33333333,  0.12200847, -0.4553418 ,  0.        ],
       [-0.19245009,  0.26289171, -0.07044162, -0.38490018],
       [-0.19245009,  0.26289171, -0.07044162, -0.38490018],
       [-0.19245009,  0.26289171, -0.07044162, -0.38490018]])

In [12]:
# Find the cosine similarity between one song and every song, using only the first `rank` rows of Qh (default 12)

def cosine_similarity(Qh, song_index, rank=12, n_recommendations=3, verbose=True):
  """Return the indices of the songs most similar to `song_index`.

  Every song is represented by its column of `Qh`, truncated to the first
  `rank` rows. Songs are ranked by the cosine similarity between that vector
  and the query song's vector.

  Parameters
  ----------
  Qh : np.ndarray
    2-D array whose columns are indexed by song.
  song_index : int
    Column of `Qh` used as the query song.
  rank : int, default 12
    Number of leading rows of `Qh` used to build the song vectors.
  n_recommendations : int, default 3
    Maximum number of song indices to return.
  verbose : bool, default True
    When true, print the similarity vector and the selected indices.

  Returns
  -------
  np.ndarray
    Up to `n_recommendations` song indices, most similar first. The query
    song is not filtered out, so it can appear in the result.

  Notes
  -----
  A song whose truncated vector is all zeros gets similarity 0 with every
  song rather than NaN, so it can never outrank a real match.
  """
  # Unit-length query vector; an all-zero vector is left as is to avoid 0/0.
  q = Qh[:rank, song_index]
  q_norm = np.linalg.norm(q)
  if q_norm > 0:
    q = q / q_norm

  # One row per song, each scaled to unit length (all-zero rows stay zero).
  Q_temp = Qh[:rank, :].T
  Q_temp_norm = np.linalg.norm(Q_temp, axis=1)
  safe_norm = np.where(Q_temp_norm > 0, Q_temp_norm, 1.0)
  Q_temp = Q_temp / safe_norm[:, None]

  # Dot products of unit vectors are the cosine similarities.
  cs_vector = np.dot(Q_temp, q)

  # argsort is ascending, so reverse the whole result to get most similar first.
  # (A [-1:0:-1] slice would always drop the element at position 0.)
  top_indices = np.argsort(cs_vector)[::-1][:n_recommendations]

  if verbose:
    print(cs_vector)
    print(top_indices)
  return top_indices

In [13]:
# Query song (positional row in track_df) and the number of leading Qh rows used for the song vectors.
song_index = 8
rank = 4

# Show the query track, then the tracks at the recommended positions.
# The query song itself is among the results because its similarity with itself is 1.
print(track_df.iloc[song_index])
print(track_df.iloc[cosine_similarity(Qh, song_index, rank)])

artist_name                            Saints
track_title    When the Saints go Marching In
Name: 9, dtype: object
[0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0.]
[8 7 6]
           artist_name                     track_title
track_id                                              
9               Saints  When the Saints go Marching In
8         Taylor Swift                    Cruel Summer
7          The Beatles                       Let it be
