In [14]:
import os
import numpy as np
from pathlib import Path
from sklearn.metrics import accuracy_score, recall_score
from collections import defaultdict
from sklearn.decomposition import PCA

# ------------------------------------------------------------
# Util: Load features and labels from a given folder
# ------------------------------------------------------------
def load_features_and_labels(root_dir):
    """
    Load feature vectors stored as .npy files, one sub-folder per class.

    Expected layout: root_dir/<class_name>/features/*.npy
    Every sub-directory of root_dir is treated as one class; classes are
    numbered 0..k-1 in sorted order of their directory names.

    root_dir: str or Path, folder containing one sub-directory per class
    Returns: (X, y, label_map)
        X: np array, the loaded arrays stacked along a new first axis
           ((N, d) when each file holds a 1-D vector of length d)
        y: np array, (N,) integer class index for each row of X
        label_map: dict, class name -> integer class index
    Raises: ValueError if no .npy file is found under root_dir
    """
    root = Path(root_dir)
    class_names = sorted(d.name for d in root.iterdir() if d.is_dir())
    label_map = {name: idx for idx, name in enumerate(class_names)}

    X = []
    y = []
    for class_name, idx in label_map.items():
        feature_dir = root / class_name / "features"
        # glob() yields files in filesystem order; sort so that the row order
        # of X / y is reproducible across machines and runs.
        for fpath in sorted(feature_dir.glob("*.npy")):
            X.append(np.load(fpath))
            y.append(idx)

    if not X:
        # np.stack([]) would fail with an unhelpful message; name the folder instead.
        raise ValueError(f"No .npy feature files found under '{root}'")

    X = np.stack(X)  # (N, d)
    y = np.array(y)  # (N,)

    return X, y, label_map

# ------------------------------------------------------------
# Util: Convert int labels to one-hot encoded matrix
# ------------------------------------------------------------
def one_hot_encode(y, num_classes):
    """
    Turn integer class labels into an indicator (one-hot) matrix.

    y: sequence of N integer class indices
    num_classes: int, number of columns of the result
    Returns: np array of floats, (N, num_classes); row r holds a 1 in
             column y[r] and 0 everywhere else
    """
    n_samples = len(y)
    row_index = np.arange(n_samples)
    encoded = np.zeros(shape=(n_samples, num_classes))
    # One (row, label) pair per sample selects the single entry to switch on.
    encoded[row_index, y] = 1
    return encoded

# ------------------------------------------------------------
class BayesClassifier:
    """
    Gaussian Bayes classifier with one mean vector and one full covariance
    matrix per class. A sample is assigned to the class with the largest
    log-posterior score (up to a constant shared by all classes):

        -1/2 * log|cov_i| - 1/2 * (x - mean_i)^T cov_i^{-1} (x - mean_i) + log(prior_i)
    """

    def __init__(self, k, X, y):
        """
        k: int, number of classes
        X: np array, Nxd array with N samples of dimension d
        y: ground truth, Nxk ground truth labels, one hot encoding for k classes
        """
        self.k = k
        self.X = X
        self.y = y
        self.N = self.X.shape[0]
        self.d = self.X.shape[1]
        self.samplesPerClass = None
        self.mean = np.zeros((self.k, self.d))
        self.cov = np.zeros((self.k, self.d, self.d))
        self.prior = None
        assert np.all(self.y.sum(axis=1) == 1), "Each row in y must be one-hot encoded"

    def computeNumberClasses(self):
        """Count the training samples of every class (column sums of y)."""
        self.samplesPerClass = self.y.sum(axis=0)

    def computePrior(self):
        """Estimate the class priors as relative class frequencies."""
        self.prior = self.samplesPerClass / self.N

    def estimateMean(self):
        """Estimate the mean vector of every class from its training samples."""
        for i in range(self.k):
            members = self.y[:, i] == 1
            # A class without samples keeps its current mean (zeros after
            # __init__) instead of the NaN that .mean() returns for an empty
            # slice; np.argmax treats NaN scores as the maximum.
            if members.any():
                self.mean[i, :] = self.X[members, :].mean(0)

    def estimateCovariance(self, epsilon=1e-6):
        """
        Estimate the covariance matrix of every class.

        epsilon: float, value added to the diagonal of each covariance matrix
                 as regularization
        """
        ridge = epsilon * np.eye(self.d)
        for i in range(self.k):
            X_i = self.X[self.y[:, i] == 1, :]
            X_center = X_i - self.mean[i, :]
            # Unbiased estimator divides by (n_i - 1); clamp the denominator
            # so a class with fewer than two samples yields the regularized
            # zero matrix rather than NaN / a sign-flipped matrix.
            dof = max(X_i.shape[0] - 1, 1)
            self.cov[i, :, :] = (X_center.T @ X_center) / dof + ridge

    def fit(self):
        """Estimate class counts, priors, means and covariances from (X, y)."""
        self.computeNumberClasses()
        self.computePrior()
        self.estimateMean()
        self.estimateCovariance()

    def predict(self, X):
        """
        X: Nxd test samples
        Output: (N,) array with the predicted class index (0..k-1) of each sample
        Raises: RuntimeError if called before the priors were estimated,
                numpy.linalg.LinAlgError if a class covariance does not have
                a positive determinant
        """
        if self.prior is None:
            raise RuntimeError("BayesClassifier.predict() called before fit()")

        # log(0) = -inf is the intended score contribution for a class that
        # has no training samples, so silence the divide-by-zero warning.
        with np.errstate(divide="ignore"):
            log_prior = np.log(self.prior)

        scores = np.zeros([X.shape[0], self.k])
        for i in range(self.k):
            cov_i = self.cov[i, :, :]
            # slogdet returns log|det| directly; det() itself under/overflows
            # for high-dimensional matrices, which turned every score into
            # +/-inf and made argmax meaningless.
            sign, logdet = np.linalg.slogdet(cov_i)
            if sign <= 0:
                raise np.linalg.LinAlgError(
                    f"Covariance of class {i} does not have a positive determinant"
                )
            diff = X - self.mean[i, :]
            # Row-wise quadratic form diff_n^T cov^{-1} diff_n. Solving the
            # linear system avoids the explicit inverse and the N x N matrix
            # that diag(diff @ inv @ diff.T) would build.
            maha = np.einsum("nd,dn->n", diff, np.linalg.solve(cov_i, diff.T))
            scores[:, i] = -0.5 * logdet - 0.5 * maha + log_prior[i]
        return np.argmax(scores, 1)
        
            


In [12]:
# ------------------------------------------------------------
# Main
# ------------------------------------------------------------
if __name__ == "__main__":
    train_dir = "dataset/train"
    test_dir = "dataset/test"

    # 1. Load train and test data
    X_train, y_train, label_map = load_features_and_labels(train_dir)
    X_test, y_test, test_label_map = load_features_and_labels(test_dir)

    # Class indices are derived independently from each folder's sub-directory
    # names, so a class present in both splits must map to the same index;
    # otherwise the metrics below would silently compare different classes.
    for class_name, class_idx in test_label_map.items():
        if label_map.get(class_name) != class_idx:
            raise ValueError(
                f"Class '{class_name}' has index {class_idx} in the test set but "
                f"{label_map.get(class_name)} in the training set"
            )

    # 2. Convert y to one-hot encoding
    k = len(label_map)
    y_train_oh = one_hot_encode(y_train, k)

    # Fit PCA to reduce dimensionality of features while capturing most information.
    # PCA cannot extract more components than min(n_samples, n_features), so the
    # target of 50 is clamped for small datasets instead of raising.
    n_components = min(50, X_train.shape[0], X_train.shape[1])
    pca = PCA(n_components=n_components, svd_solver='auto', random_state=0)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca  = pca.transform(X_test)

    # 3. Train Bayes Classifier
    clf = BayesClassifier(k=k, X=X_train_pca, y=y_train_oh)
    clf.fit()

    # 4. Predict
    y_pred_train = clf.predict(X_train_pca)
    y_pred_test = clf.predict(X_test_pca)

    # 5. Metrics (per-class recall is computed once and reused below)
    recall_train = recall_score(y_train, y_pred_train, average=None)
    recall_test = recall_score(y_test, y_pred_test, average=None)

    print("📊 Train Accuracy:", accuracy_score(y_train, y_pred_train))
    print("📊 Test Accuracy :", accuracy_score(y_test, y_pred_test))

    print("📈 Train Recall per class:", recall_train)
    print("📈 Test Recall per class :", recall_test)

    # 6. Optional: Map back to class names
    # (comprehension variables are named so they do not shadow the class count k)
    inv_label_map = {class_idx: class_name for class_name, class_idx in label_map.items()}
    for i, recall in enumerate(recall_test):
        print(f"🔍 Class '{inv_label_map[i]}' recall: {recall:.3f}")


📊 Train Accuracy: 1.0
📊 Test Accuracy : 1.0
📈 Train Recall per class: [1. 1.]
📈 Test Recall per class : [1. 1.]
🔍 Class 'Audi' recall: 1.000
🔍 Class 'airplane' recall: 1.000


In [13]:
X_test_pca.shape

(392, 50)