<a href="https://colab.research.google.com/github/alpgokcek/music-recommendation/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [32]:
import pandas as pd
import numpy as np
from numpy import dot
from numpy.linalg import norm

# **Importing the dataset and extracting features**
In this part, the aim is to import the dataset and extract the features in it.

In [4]:
DATASET_PATH = '/content/drive/MyDrive/datasets/data.csv'
data_df = pd.read_csv(DATASET_PATH)

In [6]:
data_df.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.991,['Mamie Smith'],0.598,168333,0.224,0,0cS0A1fUEUd1EW3FcF8AEI,0.000522,5,0.379,-12.628,0,Keep A Song In Your Soul,12,1920,0.0936,149.976,0.634,1920
1,0.643,"[""Screamin' Jay Hawkins""]",0.852,150200,0.517,0,0hbkKFIJm7Z05H8Zl9w30f,0.0264,5,0.0809,-7.261,0,I Put A Spell On You,7,1920-01-05,0.0534,86.889,0.95,1920
2,0.993,['Mamie Smith'],0.647,163827,0.186,0,11m7laMUgmOKqI3oYzuhne,1.8e-05,0,0.519,-12.098,1,Golfing Papa,4,1920,0.174,97.6,0.689,1920
3,0.000173,['Oscar Velazquez'],0.73,422087,0.798,0,19Lc5SfJJ5O1oaxY0fpwfh,0.801,2,0.128,-7.311,1,True House Music - Xavier Santos & Carlos Gomi...,17,1920-01-01,0.0425,127.997,0.0422,1920
4,0.295,['Mixe'],0.704,165224,0.707,1,2hJjbsLCytGsnAHfdsLejp,0.000246,10,0.402,-6.036,0,Xuniverxe,2,1920-10-01,0.0768,122.076,0.299,1920


In [9]:
features = data_df.drop(labels=['artists', 'id', 'name', 'year', 'release_date'], axis=1).to_numpy()

In [15]:
features[:5]

array([[ 9.91000e-01,  5.98000e-01,  1.68333e+05,  2.24000e-01,
         0.00000e+00,  5.22000e-04,  5.00000e+00,  3.79000e-01,
        -1.26280e+01,  0.00000e+00,  1.20000e+01,  9.36000e-02,
         1.49976e+02,  6.34000e-01],
       [ 6.43000e-01,  8.52000e-01,  1.50200e+05,  5.17000e-01,
         0.00000e+00,  2.64000e-02,  5.00000e+00,  8.09000e-02,
        -7.26100e+00,  0.00000e+00,  7.00000e+00,  5.34000e-02,
         8.68890e+01,  9.50000e-01],
       [ 9.93000e-01,  6.47000e-01,  1.63827e+05,  1.86000e-01,
         0.00000e+00,  1.76000e-05,  0.00000e+00,  5.19000e-01,
        -1.20980e+01,  1.00000e+00,  4.00000e+00,  1.74000e-01,
         9.76000e+01,  6.89000e-01],
       [ 1.73000e-04,  7.30000e-01,  4.22087e+05,  7.98000e-01,
         0.00000e+00,  8.01000e-01,  2.00000e+00,  1.28000e-01,
        -7.31100e+00,  1.00000e+00,  1.70000e+01,  4.25000e-02,
         1.27997e+02,  4.22000e-02],
       [ 2.95000e-01,  7.04000e-01,  1.65224e+05,  7.07000e-01,
         1.00000e+00

In [None]:
features.shape

# **Normalizing the features of the dataset.**
In this part, the aim is to normalize the features using standard score formula.

In [51]:
def normalize_features(arr):
  # standard score
  cols_mean, cols_stdev = arr.mean(axis=0), arr.std(axis=0)
  output = (arr - cols_mean) / cols_stdev
  return output

In [28]:
normalized_features = normalize_features(features)

# **Importing the user input.**
In this part, the aim is to import the user input.

In [29]:
import json
USER_INPUT = '/content/drive/MyDrive/datasets/mood-pop.json' # '/content/drive/MyDrive/datasets/Oldies-but-Goldies--00's.json'

In [33]:
user_favorites_ids = None
with open(USER_INPUT, 'r') as f:
  user_favorites_ids = json.load(f)

In [84]:
user_favorites = []
for idx in user_favorites_ids['trackIds']:
  track = normalized_features[data_df['id'].str.contains(idx, na=False)]
  if len(track) > 0:
    user_favorites.append(track)
user_favorites = np.array(user_favorites)

# **Preprocessing and labelling the data.**
In this part, the aim is to label the data by preprocessing it.

In [None]:
# preprocess the data and extract the data

In [85]:
fav_mean = user_favorites.mean(axis=0)
fav_mean.reshape(14,1)

array([[-0.94057154],
       [ 0.70392987],
       [-0.17584712],
       [ 1.07112412],
       [ 0.25874655],
       [-0.58954557],
       [ 0.35851628],
       [-0.30688203],
       [ 1.28155487],
       [-0.80717956],
       [ 2.37312602],
       [-0.11296621],
       [ 0.24161251],
       [ 0.15099279]])

In [88]:
cosine_similarities = [dot(fav_mean, sample)/(norm(fav_mean)*norm(sample)) for sample in normalized_features]

In [109]:
zipped = zip(cosine_similarities, normalized_features)
sorted(zipped, reverse=True, key = lambda x: x[0])[:5]


[(array([0.96300016]),
  array([-1.06473145,  0.85921106, -0.27820296,  1.46058492, -0.27040128,
         -0.589564  ,  0.51010543, -0.72425926,  1.20561806, -1.53623943,
          2.62000938, -0.32003343,  0.19853514,  0.57648818])),
 (array([0.96166938]),
  array([-1.22502214,  0.71718558, -0.08093266,  1.38724013, -0.27040128,
         -0.589564  ,  0.51010543, -0.71539461,  1.46758423, -1.53623943,
          2.39141367,  0.01794592,  0.09772233,  0.48952378])),
 (array([0.95574557]),
  array([-1.28450603,  1.18871018, -0.22747362,  1.03518517, -0.27040128,
         -0.589564  ,  0.22587578, -0.6511259 ,  1.28977755, -1.53623943,
          2.9857625 , -0.03527486, -0.09792063,  0.80335183])),
 (array([0.95316632]),
  array([-1.28450603,  1.18871018, -0.22747362,  1.03518517, -0.27040128,
         -0.589564  ,  0.22587578, -0.6511259 ,  1.28977755, -1.53623943,
          2.80288594, -0.03527486, -0.09792063,  0.80335183])),
 (array([0.95305594]),
  array([-1.26529221,  0.56379806, -0

# **Naive Bayes Classifier**
Simple implementation of the Naive Bayes Classifier.

In [27]:
def naive_bayes(self, samples, labels):
    classes = np.unique(labels)
    n_classes, n_samples, n_features = len(classes), samples.shape

    mean = np.zeros((n_classes, n_features), dtype=np.float64)
    var = np.zeros((n_classes, n_features), dtype=np.float64)
    priors = np.zeros(n_classes, dtype=np.float64)

    for idx, c in enumerate(classes):
        samples_c = samples[labels == c]
        mean[idx, :] = samples_c.mean(axis=0)
        var[idx, :] = samples_c.var(axis=0)
        priors[idx] = samples_c.shape[0] / float(n_samples)

In [25]:
def pdf(class_idx, x, mean, var):
    mean, var = mean[class_idx], var[class_idx]
    numerator = np.exp(- (x-mean)**2 / (2 * var))
    denominator = np.sqrt(2 * np.pi * var)
    return numerator / denominator

In [26]:
def predict(self, samples, classes, priors, mean, var):
    y_pred = []
    for x in samples:
        posteriors = []
        for idx, c in enumerate(classes):
            prior = np.log(priors[idx])
            posterior = np.sum(np.log(self.pdf(idx, x, mean, var)))
            posterior = prior + posterior
            posteriors.append(posterior)
        y_pred.append(classes[np.argmax(posteriors)])
    return np.array(y_pred)