## Classification based on Bayesian Decision Rule 
– which guarantees the Maximum A Posteriori (MAP) decision

\begin{align}
\hat w(x)
&= \arg\max_{i}\;P(w_i \mid x)
= \arg\max_{i}\;\bigl[p(x \mid w_i)\,P(w_i)\bigr]
\end{align}


In order to do this we need to estimate the class-conditional density

\begin{align}
p(x \mid w_i)
\end{align}

In [None]:
import pandas as pd
import numpy as np
from dataclasses import dataclass
import math
from sklearn.preprocessing import StandardScaler


In [None]:



# Load the tab-separated data file.
data = pd.read_csv("data/GenreClassData_30s.txt", sep="\t")

# Every column except the listed identifier / label / split columns is used as a feature.
all_features = [col for col in data.columns if col not in ['Track ID', 'File', 'Genre', 'GenreID', 'Type']]
print(all_features)

# The 'Type' column marks which split ('Train' or 'Test') each row belongs to.
data_train = data[data['Type'] == 'Train']
data_test = data[data['Type'] == 'Test']

# Use the full feature list (no manual feature selection).
features = all_features




# The 'Genre' column is the classification target.
X_train, y_train = data_train[features], data_train['Genre']
X_test, y_test = data_test[features], data_test['Genre']


# Standardise the features: the scaler is fit on the training split only and
# the same transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s  = scaler.transform(X_test)

# Project the standardised features with a 20-component PCA, again fit on the
# training split only.
from sklearn.decomposition import PCA
pca = PCA(20).fit(X_train_s)
X_train_p = pca.transform(X_train_s)
X_test_p  = pca.transform(X_test_s)



In [None]:
# Helper functions

def calculate_mean(X):
    """Return the per-feature sample mean of a 2-D data set.

    Args:
        X: Data of shape (n_samples, n_features). A pandas DataFrame, a
            NumPy array, or any nested sequence convertible to a float array.

    Returns:
        1-D ``np.ndarray`` of length ``n_features`` holding the column means.
        For zero samples the result is all-NaN (division by zero).

    Raises:
        ValueError: If ``X`` is not two-dimensional.
    """
    # np.asarray covers DataFrames, ndarrays and plain nested lists alike.
    X = np.asarray(X, dtype=float)
    if X.ndim != 2:
        raise ValueError(
            f"X must be 2-D (n_samples, n_features), got shape {X.shape}"
        )

    n_samples = X.shape[0]
    # Vectorised column sum instead of a Python-level loop over the rows.
    return X.sum(axis=0) / n_samples

def calculate_covariance(X, mean):
    """Return the maximum-likelihood (biased) covariance matrix of ``X``.

    The estimate is ``sum_k (x_k - mean)(x_k - mean)^T / n_samples``, i.e. it
    divides by ``n_samples`` rather than ``n_samples - 1``.

    Args:
        X: Data of shape (n_samples, n_features). A pandas DataFrame, a
            NumPy array, or any nested sequence convertible to a float array.
        mean: Per-feature mean to centre the data with, length ``n_features``.

    Returns:
        ``np.ndarray`` of shape (n_features, n_features). For zero samples the
        result is all-NaN (division by zero).

    Raises:
        ValueError: If ``X`` is not two-dimensional.
    """
    # np.asarray covers DataFrames, ndarrays and plain nested lists alike.
    X = np.asarray(X, dtype=float)
    if X.ndim != 2:
        raise ValueError(
            f"X must be 2-D (n_samples, n_features), got shape {X.shape}"
        )

    n_samples = X.shape[0]
    centered = X - np.asarray(mean, dtype=float)
    # centered.T @ centered equals the sum of the per-sample outer products,
    # computed in one matrix product instead of a Python loop.
    return (centered.T @ centered) / n_samples


def mahalanobis_distance(x, mean, inv_cov):
    """Return half the squared Mahalanobis form of ``x`` relative to ``mean``.

    Computes ``0.5 * (x - mean)^T @ inv_cov @ (x - mean)``.

    NOTE(review): despite the name, this is neither the Mahalanobis distance
    nor its square but half of the squared form (the magnitude of the Gaussian
    exponent). Confirm with callers before changing the scaling.

    Args:
        x: Sample vector.
        mean: Mean vector with the same shape as ``x``.
        inv_cov: Inverse covariance matrix.

    Returns:
        The scalar ``0.5 * (x - mean)^T inv_cov (x - mean)``.
    """
    offset = x - mean
    quadratic_form = offset.T @ inv_cov @ offset
    return 0.5 * quadratic_form


def gaussian_density_model(x, mean, cov, inv_cov):
    """Evaluate the multivariate Gaussian density N(x; mean, cov).

    The density is assembled in log space using ``np.linalg.slogdet`` so that
    the determinant of ``cov`` cannot underflow to 0 or overflow to inf in
    high dimensions, which would otherwise yield inf / 0 densities.

    Args:
        x: Sample vector of length ``d``.
        mean: Mean vector of length ``d``.
        cov: Covariance matrix of shape (d, d).
        inv_cov: Inverse of ``cov``, supplied by the caller.

    Returns:
        The density value at ``x``. NaN is returned when ``cov`` has a
        negative determinant (not a valid covariance matrix).
    """
    d = x.shape[0]
    diff = x - mean

    sign, log_det = np.linalg.slogdet(cov)
    if sign < 0:
        # sqrt of a negative determinant is undefined.
        return np.nan

    log_density = -0.5 * (
        d * np.log(2 * np.pi) + log_det + diff.T @ inv_cov @ diff
    )
    return np.exp(log_density)


# Dataclass for storing the density model and prior probability of one class
@dataclass(eq=False)
class ClassInformation:
    """Single-Gaussian density parameters and prior probability of one class.

    ``eq=False`` keeps the default identity-based comparison, so no
    element-wise ``==`` is ever attempted on the array fields.
    """

    # Mean vector of the class.
    mean: np.ndarray
    # Covariance matrix of the class.
    covariance: np.ndarray
    # Inverse of ``covariance``, stored alongside it.
    inv_covariance: np.ndarray
    # Prior probability P(w_i) of the class; a fraction, hence float.
    a_priori: float


In [None]:
from sklearn.mixture import GaussianMixture

def get_class_information(X, y, classes, n_components):
    """Fit one Gaussian mixture per class and record each class prior.

    Args:
        X: 2-D feature matrix; rows are selected with a boolean mask.
        y: Labels aligned with the rows of ``X``.
        classes: Iterable of class labels to build a model for.
        n_components: Number of mixture components for every class.

    Returns:
        Dict mapping each class label to ``{"gmm": fitted GaussianMixture,
        "a_priori": class row count / total row count}``.
    """
    total_rows, _ = X.shape

    def _fit_class(label):
        # Rows of X whose label equals the current class.
        rows = X[y == label]
        mixture = GaussianMixture(
            n_components=n_components,
            covariance_type="full",
            reg_covar=1e-3,
            random_state=0,
        )
        mixture.fit(rows)
        return {"gmm": mixture, "a_priori": rows.shape[0] / total_rows}

    return {label: _fit_class(label) for label in classes}


def classify_map(x, class_info):
    """Return the class with the highest log-posterior score for sample ``x``.

    The score of a class is ``log p(x | class) + log P(class)``, where the
    log-likelihood comes from the class's fitted mixture model.

    Args:
        x: 1-D feature vector (reshaped to a single row for scoring).
        class_info: Dict mapping class label to a dict with keys ``"gmm"``
            (object exposing ``score_samples``) and ``"a_priori"``.

    Returns:
        The label with the strictly largest score; the first such label wins
        ties. ``None`` if no class scores above ``-inf``.
    """
    winner = None
    top_score = -np.inf

    for label in class_info:
        entry = class_info[label]
        model = entry["gmm"]
        prior_term = np.log(entry["a_priori"])
        # score_samples yields one log-density per row; only one row is passed.
        likelihood_term = model.score_samples(x.reshape(1, -1))[0]
        posterior = likelihood_term + prior_term

        if not (posterior > top_score):
            continue
        top_score, winner = posterior, label

    return winner


# --- usage ---
# Fit one GMM (9 components) per genre on the PCA-projected training data,
# then classify every PCA-projected test sample with the MAP rule.
classes           = np.unique(y_train)
class_information = get_class_information(X_train_p, y_train, classes, n_components=9)
y_pred            = [classify_map(x, class_information) for x in X_test_p]

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Report accuracy, the per-class report and the confusion matrix on the test split.
print("Accuracy:          ", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))


In [None]:
def classify_map(x, class_info):
    """Classify one sample with the MAP rule under one Gaussian per class.

    For every class the score is
    ``log N(x; mean, covariance) + log P(class)``. The log-determinant is
    obtained with ``np.linalg.slogdet`` so it stays finite in high dimension,
    where ``det`` itself would underflow to 0 or overflow to inf.

    Args:
        x: 1-D feature vector of length ``d``.
        class_info: Dict mapping class label to an object exposing ``mean``,
            ``covariance``, ``inv_covariance`` and ``a_priori`` attributes.

    Returns:
        The label with the strictly largest score; the first such label wins
        ties. ``None`` if no class scores above ``-inf``.
    """
    best_score = -np.inf
    best_class = None

    d = x.shape[0]
    log_2pi_term = d * np.log(2 * np.pi)  # identical for every class

    for cls, info in class_info.items():
        sign, log_det = np.linalg.slogdet(info.covariance)
        if sign < 0:
            # A negative determinant has no real logarithm, so this class
            # cannot be scored and is never selected.
            continue

        diff = x - info.mean
        log_likelihood = -0.5 * (
            log_2pi_term + log_det + diff.T @ info.inv_covariance @ diff
        )
        score = log_likelihood + np.log(info.a_priori)

        if score > best_score:
            best_score = score
            best_class = cls

    return best_class

# Build the single-Gaussian models that this cell's classify_map expects: one
# ClassInformation per genre. The `class_information` dict returned by
# get_class_information holds {"gmm", "a_priori"} dicts, which have no
# .mean / .covariance attributes, so it cannot be passed here.
# NOTE(review): the models are fit on the raw (unscaled) training features
# because the predictions below are made on raw X_test.values — confirm
# whether the scaled or PCA-projected features were intended instead.
X_train_raw = X_train.values
gaussian_class_information = {}
for cls in np.unique(y_train):
    class_data = X_train_raw[(y_train == cls).values]
    class_mean = calculate_mean(class_data)
    class_cov = calculate_covariance(class_data, class_mean)
    gaussian_class_information[cls] = ClassInformation(
        mean=class_mean,
        covariance=class_cov,
        inv_covariance=np.linalg.inv(class_cov),
        a_priori=class_data.shape[0] / X_train_raw.shape[0],
    )

# predict on the whole test set
y_pred = [classify_map(x, gaussian_class_information) for x in X_test.values]

# evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
