In [64]:
! pip install kaggle



In [65]:
! kaggle datasets download -d andradaolteanu/gtzan-dataset-music-genre-classification

Dataset URL: https://www.kaggle.com/datasets/andradaolteanu/gtzan-dataset-music-genre-classification
License(s): other
^C
User cancelled operation


In [None]:
import zipfile

with zipfile.ZipFile("gtzan-dataset-music-genre-classification.zip", "r") as zip_ref:
    zip_ref.extractall("GTZAN")

In [30]:
import pandas as pd
df = pd.read_csv('GTZAN/Data/features_30_sec.csv')

In [31]:
len(df['label'].unique())

10

In [32]:
import numpy as np
from collections import Counter

def euclidean_distance(x1, x2):
    return np.sqrt((np.sum(x1-x2))**2)

class KNN:
    def __init__(self, k=11):
        self.k = k

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def predict_all(self, X_test):
        return [self.predict(x) for x in X_test]
    
    def predict(self, x):
        distance = [euclidean_distance(x, x_train) for x_train in self.X_train]
        k_index = np.argsort(distance)[:self.k]
        k_labels = [self.y_train[i] for i in k_index]
        winner = Counter(list(k_labels)).most_common(1)
        return winner[0][0]

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder

X = df.iloc[:, 1:-1].values  # Feature columns
y = df["label"].values  # Genre labels

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [34]:
knn = KNN()

In [35]:
knn.fit(X_train, y_train)

In [36]:
y = knn.predict_all(X_test)

In [37]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y, zero_division=0))

              precision    recall  f1-score   support

       blues       0.15      0.10      0.12        20
   classical       0.40      0.62      0.48        13
     country       0.18      0.19      0.18        27
       disco       0.17      0.19      0.18        21
      hiphop       0.05      0.07      0.06        15
        jazz       0.27      0.18      0.22        22
       metal       0.35      0.24      0.29        25
         pop       0.16      0.38      0.22        13
      reggae       0.31      0.17      0.22        23
        rock       0.06      0.05      0.05        21

    accuracy                           0.20       200
   macro avg       0.21      0.22      0.20       200
weighted avg       0.21      0.20      0.20       200



In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

class Naive_Bayes:
    def __init__(self, distribution='Gaussian', epsilon=1e-6, laplace=1):
        """
        distribution: 'Gaussian' for continuous features, 'multinomial' for discrete features.
        epsilon: small constant added to standard deviation (for Gaussian) to avoid division by zero.
        laplace: Laplace smoothing constant for discrete features.
        """
        self.distribution = distribution.lower()
        self.epsilon = epsilon
        self.laplace = laplace
        self.class_priors = {}
        self.feature_likelihoods = {}
        self.label_encoder = None

    def encode_labels(self, y):
        """Encode string labels to integers."""
        self.label_encoder = LabelEncoder()
        return self.label_encoder.fit_transform(y)
    
    def inverse_transform_labels(self, encoded_labels):
        """
        Convert encoded labels back to the original labels.
        
        Parameters:
            encoded_labels (array-like): Encoded labels from prediction.
            
        Returns:
            array-like: Original labels.
        """
        if self.label_encoder is None:
            raise ValueError("Label encoder not found. Ensure you encoded labels during fit.")
        return self.label_encoder.inverse_transform(encoded_labels)

    def _fit_discrete(self, x, y):
        """
        Fit model for discrete features using Laplace smoothing.
        For each class and feature, compute:
            P(feature_value | class) = (count + laplace) / (N_class + laplace * k)
        where k is the number of unique values for that feature.
        """
        self.classes, class_counts = np.unique(y, return_counts=True)
        total_count = len(y)
        n_classes = len(self.classes)
        
        # Compute class priors with smoothing
        for cls, count in zip(self.classes, class_counts):
            self.class_priors[cls] = (count + self.laplace) / (total_count + n_classes * self.laplace)
        
        self.feature_likelihoods = {}
        n_features = x.shape[1]
        for cls in self.classes:
            X_cls = x[y == cls]
            likelihoods = []
            for j in range(n_features):
                # Get all possible values for feature j (from the entire dataset)
                unique_vals = np.unique(x[:, j])
                k = len(unique_vals)
                col = X_cls[:, j]
                values, counts = np.unique(col, return_counts=True)
                counts_dict = {val: cnt for val, cnt in zip(values, counts)}
                # Denominator used in smoothing:
                denom = len(col) + self.laplace * k
                likelihood = {}
                for v in unique_vals:
                    count_v = counts_dict.get(v, 0)
                    likelihood[v] = (count_v + self.laplace) / denom
                # Store a default probability for unseen values:
                default_prob = self.laplace / denom
                # Save as a tuple: (likelihood dictionary, default probability)
                likelihoods.append((likelihood, default_prob))
            self.feature_likelihoods[cls] = likelihoods
    
    def _predict_probabilities_discrete(self, x):
        """Predict probabilities for discrete features using the smoothed likelihoods."""
        n_samples = x.shape[0]
        probabilities = []
        n_features = x.shape[1]
        for cls in self.classes:
            prob = np.full(n_samples, self.class_priors[cls])
            for j in range(n_features):
                likelihood_dict, default_prob = self.feature_likelihoods[cls][j]
                col = x[:, j]
                # For each sample, multiply by the likelihood for the observed feature value.
                feature_probs = np.array([likelihood_dict.get(val, default_prob) for val in col])
                prob *= feature_probs
            probabilities.append(prob)
        return np.array(probabilities)
    
    # ------------------------------
    # Public Fit and Predict Methods
    # ------------------------------
    def fit(self, x, y):
        """
        Fit the Naive Bayes model.
        If using continuous features (Gaussian), compute mean and std for each feature per class.
        If using discrete features (multinomial), compute smoothed likelihoods.
        """
        if self.distribution == 'gaussian':
            self._fit_continuous(x, y)
        elif self.distribution == 'multinomial':
            self._fit_discrete(x, y)
        else:
            raise ValueError("Unsupported distribution. Use 'Gaussian' or 'multinomial'.")
    
    def predict_probabilities(self, x):
        """
        Return an array of predicted probabilities of shape (n_classes, n_samples).
        """
        if self.distribution == 'gaussian':
            return self._predict_probabilities_continuous(x)
        elif self.distribution == 'multinomial':
            return self._predict_probabilities_discrete(x)
        else:
            raise ValueError("Unsupported distribution.")
    
    def predict(self, x):
        """
        Predict the class for each sample.
        Returns an array of predicted class indices.
        """
        probabilities = self.predict_probabilities(x)
        return np.argmax(probabilities, axis=0)


In [15]:
X = df.iloc[:, 1:-1].values  # Feature columns
y = df["label"].values  # Genre labels

nb = Naive_Bayes()

y = nb.encode_labels(y)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,

In [20]:
nb = Naive_Bayes(distribution='multinomial', laplace=1)
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)


In [21]:
y_pred

array([9, 7, 3, 2, 2, 7, 3, 5, 8, 5, 9, 9, 8, 5, 2, 6, 5, 3, 7, 9, 4, 0,
       6, 2, 5, 5, 2, 9, 5, 5, 3, 0, 7, 7, 7, 5, 7, 9, 5, 5, 9, 2, 2, 2,
       2, 5, 5, 8, 4, 5, 9, 8, 2, 5, 5, 5, 7, 0, 5, 9, 9, 2, 5, 4, 6, 6,
       6, 6, 2, 5, 5, 9, 2, 7, 0, 2, 0, 9, 2, 9, 3, 0, 5, 0, 9, 9, 5, 6,
       6, 2, 3, 9, 9, 2, 7, 4, 3, 9, 4, 2, 7, 7, 3, 5, 6, 6, 6, 9, 2, 9,
       5, 9, 5, 2, 9, 4, 2, 5, 5, 2, 8, 5, 8, 5, 5, 5, 5, 4, 7, 9, 2, 7,
       9, 9, 2, 2, 2, 5, 4, 9, 2, 5, 7, 2, 8, 9, 9, 0, 5, 5, 6, 2, 5, 3,
       9, 4, 9, 2, 5, 6, 9, 7, 5, 2, 8, 6, 9, 8, 2, 5, 3, 7, 9, 5, 5, 3,
       9, 2, 9, 8, 5, 6, 6, 5, 6, 0, 3, 3, 0, 6, 9, 5, 5, 5, 9, 9, 2, 2,
       6, 9])

In [26]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.30      0.15      0.20        20
           1       0.00      0.00      0.00        13
           2       0.23      0.30      0.26        27
           3       0.46      0.29      0.35        21
           4       0.56      0.33      0.42        15
           5       0.19      0.41      0.26        22
           6       0.47      0.36      0.41        25
           7       0.41      0.54      0.47        13
           8       0.50      0.22      0.30        23
           9       0.18      0.33      0.23        21

    accuracy                           0.29       200
   macro avg       0.33      0.29      0.29       200
weighted avg       0.33      0.29      0.29       200



In [None]:
class Naive_Bayes:
    def prior_prob(self, X, y):
        self.prior_p = (X.groupby(y).apply(lambda x: len(x) / self.rows)).to_numpy()
        return self.prior_p
    
    def post_prob(self, X, y):
        self.cond_p = X.groupby(y).apply(lambda x: x.sum() / len(x)).to_numpy()
        return self.cond_p
    
    def postProb(self, x):

        posteriors = []

        for i in range(self.count):

            prior = np.log(self.prior[i]) 
            conditional = np.sum(np.log(self.densGauss(i, x))) 
            posterior = prior + conditional
            posteriors.append(posterior)

        return self.classes[np.argmax(posteriors)]
        
    def train(self, X, y):
        self.classes = np.unique(y)
        self.count = len(self.classes)
        self.num_feature = X.shape[1]
        self.rows = X.shape[0]
        
        self.statParamteres(X, y)
        self.priorProb(X, y)
        
    def predict(self, X):
        preds = [self.postProb(f) for f in X.to_numpy()]
        return preds