<a href="https://colab.research.google.com/github/alpgokcek/music-recommendation/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [172]:
import pandas as pd
import numpy as np

In [314]:
# CONSTANTS
LIKE, DISLIKE = 1, 0
N_TRACKS = 100

# **Importing the dataset and extracting features**
In this part, the aim is to import the dataset and extract the features in it.

In [14]:
DATASET_PATH = '/content/drive/MyDrive/datasets/data.csv'
data_df = pd.read_csv(DATASET_PATH)

In [15]:
data_df.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.991,['Mamie Smith'],0.598,168333,0.224,0,0cS0A1fUEUd1EW3FcF8AEI,0.000522,5,0.379,-12.628,0,Keep A Song In Your Soul,12,1920,0.0936,149.976,0.634,1920
1,0.643,"[""Screamin' Jay Hawkins""]",0.852,150200,0.517,0,0hbkKFIJm7Z05H8Zl9w30f,0.0264,5,0.0809,-7.261,0,I Put A Spell On You,7,1920-01-05,0.0534,86.889,0.95,1920
2,0.993,['Mamie Smith'],0.647,163827,0.186,0,11m7laMUgmOKqI3oYzuhne,1.8e-05,0,0.519,-12.098,1,Golfing Papa,4,1920,0.174,97.6,0.689,1920
3,0.000173,['Oscar Velazquez'],0.73,422087,0.798,0,19Lc5SfJJ5O1oaxY0fpwfh,0.801,2,0.128,-7.311,1,True House Music - Xavier Santos & Carlos Gomi...,17,1920-01-01,0.0425,127.997,0.0422,1920
4,0.295,['Mixe'],0.704,165224,0.707,1,2hJjbsLCytGsnAHfdsLejp,0.000246,10,0.402,-6.036,0,Xuniverxe,2,1920-10-01,0.0768,122.076,0.299,1920


In [16]:
features = data_df.drop(labels=['artists', 'id', 'name', 'year', 'release_date'], axis=1).to_numpy()

In [17]:
features[:5]

array([[ 9.91000e-01,  5.98000e-01,  1.68333e+05,  2.24000e-01,
         0.00000e+00,  5.22000e-04,  5.00000e+00,  3.79000e-01,
        -1.26280e+01,  0.00000e+00,  1.20000e+01,  9.36000e-02,
         1.49976e+02,  6.34000e-01],
       [ 6.43000e-01,  8.52000e-01,  1.50200e+05,  5.17000e-01,
         0.00000e+00,  2.64000e-02,  5.00000e+00,  8.09000e-02,
        -7.26100e+00,  0.00000e+00,  7.00000e+00,  5.34000e-02,
         8.68890e+01,  9.50000e-01],
       [ 9.93000e-01,  6.47000e-01,  1.63827e+05,  1.86000e-01,
         0.00000e+00,  1.76000e-05,  0.00000e+00,  5.19000e-01,
        -1.20980e+01,  1.00000e+00,  4.00000e+00,  1.74000e-01,
         9.76000e+01,  6.89000e-01],
       [ 1.73000e-04,  7.30000e-01,  4.22087e+05,  7.98000e-01,
         0.00000e+00,  8.01000e-01,  2.00000e+00,  1.28000e-01,
        -7.31100e+00,  1.00000e+00,  1.70000e+01,  4.25000e-02,
         1.27997e+02,  4.22000e-02],
       [ 2.95000e-01,  7.04000e-01,  1.65224e+05,  7.07000e-01,
         1.00000e+00

In [18]:
features.shape

(174389, 14)

# **Normalizing the features of the dataset.**
In this part, the aim is to normalize the features using standard score formula.

In [19]:
def normalize_features(arr):
  # standard score
  cols_mean, cols_stdev = arr.mean(axis=0), arr.std(axis=0)
  output = (arr - cols_mean) / cols_stdev
  return output

In [20]:
normalized_features = normalize_features(features)

# **Importing the user input.**
In this part, the aim is to import the user input.

In [304]:
import json
USER_FAVORITES_INPUT = '/content/drive/MyDrive/datasets/oldies-but-goldies-00s.json'
USER_DISLIKES_INPUT = '/content/drive/MyDrive/datasets/rock-classics.json'

In [305]:
user_favorites_ids, user_dislikes_ids = None, None
with open(USER_FAVORITES_INPUT, 'r') as f:
  user_favorites_ids = json.load(f)

with open(USER_DISLIKES_INPUT, 'r') as f:
  user_dislikes_ids = json.load(f)

In [306]:
def get_track_features(idx_arr):
  track_features = []
  for idx in idx_arr['trackIds']:
    track = normalized_features[data_df['id'].str.contains(idx, na=False)]
    if len(track) > 0:
      track_features.append(track)
  return np.array(track_features)

In [307]:
user_favorites = get_track_features(user_favorites_ids)
user_dislikes = get_track_features(user_dislikes_ids)

In [308]:
len(user_favorites), len(user_dislikes)

(21, 69)

In [309]:
# extracting samples and its labels
samples = np.concatenate((user_favorites, user_dislikes), axis=0)
samples = samples.reshape(samples.shape[0], -1)
labels = np.array([LIKE for _ in range(len(user_favorites))] + [DISLIKE for _ in range(len(user_dislikes))])

# **Naive Bayes Classifier**
Simple implementation of the Naive Bayes Classifier.


In [278]:
import math

class NaiveBayesClassifier:
    def __init__(self, features, labels):
        self.means, self.std = [], []
        self.features, self.labels = features, labels
        self.classes = np.unique(labels)
        cls_idx, cls_features = [0] * len(self.classes), [0] * len(self.classes)
        label, counts = np.unique(labels, return_counts=True)
        self.prior = dict(zip(label, counts))
        for c in self.classes:
            cls_idx[c] = np.argwhere(labels == c)
            cls_features[c] = features[cls_idx[c], :]
            self.prior[c] = self.prior[c] / sum(list(self.prior.values()))
        n_features = features.shape[1]
        self.means = [np.mean(cls_features[c], axis=0).reshape(n_features,)
                      for c in self.classes]
        self.std = [np.std(cls_features[c], axis=0)[0]
                    for c in self.classes]

    def _calc_likelihood(self, x, mean, stdev):
        exponent = math.exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
        return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent
    
    def _predict(self, sample):
        self.posterior = {label: math.log(
            self.prior[label], math.e) for label in self.classes}
        for c in self.classes:
            # calculate likelihood for each feature
            for i in range(len(self.means)):
                # log(prior) += log(feature_1 | class) + log(feature_2 | class) + ...
                self.posterior[c] += math.log(self._calc_likelihood(
                    sample[i], self.means[c][i], self.std[c][i]), math.e)
        
        evidence = 0 #math.log(sum([math.e ** val for val in self.posterior.values()]), math.e)
        # posterior = prior * likelihood / evidence
        self.posterior = {c: (math.e ** (self.posterior[c] - evidence)) for c in self.posterior}
                          
        return self.posterior

    def predict(self, samples):
        predictions = []
        for sample in samples:
            predicted_c, max_p = None, 0
            for c, p in self._predict(sample).items():
                if p > max_p:
                    max_p, predicted_c = p, c
            predictions.append((max_p, predicted_c))
        return predictions


# **Testing**
In this part, the aim is to test the algorithm.

In [310]:
nb = NaiveBayesClassifier(samples, list(labels))

In [311]:
predictions = nb.predict(normalized_features)

In [315]:
highest_probabilities = sorted(enumerate(predictions), key=lambda tup: tup[1], reverse=True)
like_predictions = sorted(highest_probabilities, key=lambda tup: list(tup[1])[1], reverse=True)[:N_TRACKS]


In [316]:
for idx, (p, c) in like_predictions:
  print("probability: \t{}\n{}\n".format(p, data_df.iloc[idx]))

probability: 	0.8182506046561416
acousticness                         0.0776
artists                   ['Eddie Rabbitt']
danceability                          0.691
duration_ms                          224013
energy                                0.474
explicit                                  0
id                   6KU5URgVCXtJbrTFGEOz7S
instrumentalness                   2.69e-05
key                                       7
liveness                              0.113
loudness                             -9.972
mode                                      1
name                You Can't Run From Love
popularity                               25
release_date                        1982-10
speechiness                          0.0279
tempo                               109.816
valence                               0.755
year                                   1982
Name: 119042, dtype: object

probability: 	0.8182447105045545
acousticness                                           0.0774
artists

In [323]:
nb.means

[array([-0.77003753,  0.16686771,  0.27240124,  0.79697407, -0.27040128,
        -0.47260683, -0.1530971 , -0.15311569,  0.52739968,  0.14376811,
         2.13167595, -0.20637968,  0.16075417,  0.42453337]),
 array([-1.11153648,  0.87436045, -0.05043273,  1.1782075 ,  0.10756145,
        -0.57012381, -0.26137506,  0.06675238,  1.26620051, -0.59887673,
         2.04525447, -0.08530813,  0.2469064 ,  0.30398172])]

In [325]:
ftrs = []
for idx, (p, c) in like_predictions:
  if c == LIKE:
    ftrs.append(normalized_features[idx])  
ftrs = np.array(ftrs)
print(ftrs.mean(axis=0))

[-1.11137104  0.87568602 -0.08450858  0.59581325  0.60269263 -0.48398487
  0.02975732 -0.10800547  0.45709759 -0.15831627  0.47441011  0.33663959
  0.05511222  0.51076199]


# **Sklearn Naive Bayes**
In this part, the aim is to run the NB algorithm of Sklearn on the dataset.

In [161]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(samples, labels)
predictions = clf.predict_proba(normalized_features)

In [163]:
count = 0
for idx, p in enumerate(predictions):
  if p[1] >= p[0]:
    count += 1
count

17241

# **Preprocessing and labelling the data.**
In this part, the aim is to label the data by preprocessing it.

In [16]:
# preprocess the data and extract the data

In [15]:
#fav_mean = user_favorites.mean(axis=0)
#fav_mean.reshape(14,1)

NameError: ignored

In [None]:
#cosine_similarities = [dot(fav_mean, sample)/(norm(fav_mean)*norm(sample)) for sample in normalized_features]
#zipped_similarities = zip(cosine_similarities, enumerate(normalized_features))
#sorted_similarities = sorted(zipped_similarities, reverse=True, key = lambda x: x[0])
#for _similarity,(idx,_features) in sorted_similarities[len(sorted_similarities)-5:]:
#  print("Similarity:\t{}\nFeatures:\n{}\n".format(_similarity,data_df.iloc[idx]))