In [18]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
import math
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from dataclasses import dataclass



data = pd.read_csv('data/GenreClassData_30s.txt', sep='\t')
data["TrackID"] = range(len(data))

# Split the data into training and testing sets
train = data[data['Type'] == 'Train']
test = data[data['Type'] == 'Test']


all_features = [col for col in data.columns if col not in ['Track ID','TrackID', 'File', 'GenreID', 'Genre', 'Type']]

# Define the features and targets
features = ['spectral_rolloff_mean', 'mfcc_1_mean', 'spectral_centroid_mean', 'chroma_stft_10_mean']
targets = ['Genre']

# feature data
X_train = train[features]
# genre data
y_train = train[targets]

X_test, y_test = test[features], test[targets]



## Thoughts before implementing 

Idea: Implement an algorithm that 
* iterates through each class
    * splits the class into M clusters that are used as references for that class


So: We have an interative procedure for finding the number of clusters (class references).
In order to implement this procedure we need a stopping criteria, since we don't know the exact number beforehand.

Start with 1 cluster per class (M=1)
calculate
* Λ_s (contains the mean and covariance for M=1)
* D_1 (the sum of distances between all points in this class and the mean of that class - using Mahalanobis distance or euclidean for example)
These D matrices need to be stored somewhere so that we can compare them


Then iteratively we increase M by splitting one of the clusters in two.

1. Choose which cluster to split - pick the cluster with the largest accumulated distance (D). Or choose the 
2. The cluster to split is denoted as Λ_s. To split this cluster use a small noise vector w and create two new centers:
    * my_s1 = my_s + w 
    * my_s2 = my_s - w 
* The covariance matrices are simply copied

3. Now for each training point x_k in class w_i
    * Compute the distance to all M+1 cluster centers
        * d = d(x_k, my_j)
    * Assign x_k to the cluster with lowest distance

4. Now since we have reassigned our data, we need to recompute the parameters for our clusters
    * compute new my_j and covarianceMatrix_j as done earlier


5. Evaluate total distance D_M
    * Find the accumulated distance and compare with previous D_M-1
        * If D_M << D_M-1: continue to next split
        * if not stop splitting fo this class -> we are satsified.


Summarized:

Loop over classes:
* Start with 1 cluster
* While improvement:
    * Find worst cluster
    * Split μ with noise ±w
    * Assign training vectors
    * Recompute μ, Σ
    * Compute new total distance
    * Check for convergence


In [None]:
# Creating a dataclass for storing the structured data
@dataclass
class Cluster:
    mean: np.ndarray
    covariance: np.ndarray
    datapoints: np.ndarray
    accumulated_distance: float = 0.0

In [None]:
## Helper functions

# Calculates the Euclidean distance between two music samples
def eucledian_distance(datapoint, mu):
    distance = 0
    for i in range(len(datapoint)):
        distance += (datapoint[i] - mu[i]) ** 2
    return np.sqrt(distance)

def calculate_mean(X):
    n_samples, n_features = X.shape
    accumalated_sum = np.zeros(n_features)
    print(accumalated_sum)
    for k in range(n_samples):
        accumalated_sum += X[k][:]
    
    mean = accumalated_sum/n_samples
    return mean

def calculate_covariance(X, mean):
    n_samples, n_features = X.shape
    covariance_matrix = np.zeros((n_features, n_features))
    for k in range(n_samples):
        diff = X[k] - mean
        covariance_matrix += np.outer(diff, diff)

    covariance_matrix /= n_samples
    return covariance_matrix

# Calculates the sum of distances between all points in this class and the mean of that class
import numpy as np

def calculate_accumulated_distance(X, mean, covariance):
    '''
    Args:
        X
        mean
        covariance
    '''
    n_samples, _ = X.shape
    accumulated_distance = 0.0

    try:
        inv_cov = np.linalg.inv(covariance)
    except np.linalg.LinAlgError:
        covariance += np.eye(covariance.shape[0]) * 1e-6
        inv_cov = np.linalg.inv(covariance)

    for k in range(n_samples):
        diff = X[k] - mean
        distance = diff.T @ inv_cov @ diff  # Mahalanobis squared
        accumulated_distance += distance

    return accumulated_distance  # total D


def find_cluster_to_split(cluster_dict, current_class):
    clusters = cluster_dict[current_class]
    if(len(clusters) == 1):
        return clusters[0]
    else:
        
        cluster_to_split = clusters[0]
        for i in range(1, len(clusters)):
            if(clusters[i].accumulated_distance > cluster_to_split.accumulated_distance):
                cluster_to_split = clusters[i] 

    return cluster_to_split
    

# Not made by me – found online
def generate_noise_vector(dim, delta=0.1):
    """
    Generates a noise vector of shape (dim,) where each element is sampled from U(-delta, delta).
    
    Args:
        dim (int): Number of dimensions (length of the feature vector).
        delta (float): Maximum absolute size of the noise (small positive number).
    
    Returns:
        np.ndarray: Noise vector of shape (dim,)
    """
    return np.random.uniform(low=-delta, high=delta, size=dim)



In [None]:
def create_classifier(X_train, y_train, classes):
    cluster_dict = {}
    for current_class in classes:
        # Start with one cluster
        M = 1
        clusters = cluster_dict[current_class]

        # Calculate the mean of the current cluster
        mean = calculate_mean(X_train)
        cov = calculate_covariance(X_train, mean)
        acc_dist = calculate_accumulated_distance(X_train, mean, cov)
        
        cluster_1 = Cluster(mean=mean, covariance=cov, points=X_train, accumulated_distance=acc_dist)

        clusters.append(cluster_1)

        while(...):
            splitting_cluster = find_cluster_to_split(cluster_dict, current_class)
            mu = splitting_cluster.mean
            w = generate_noise_vector(mu.shape[0], delta=0.1)

            mu_1 = mu + w
            mu_2 = mu - w
            cov = splitting_cluster.covariance

            cluster_split_1 = Cluster(mean=mu_1, covariance=cov)
            cluster_split_2 = Cluster(mean=mu_2, covariance=cov)


            clusters.remove(splitting_cluster)
            clusters.append(cluster_split_1)
            clusters.append(cluster_split_2)

            # Before assigning points to clusters, need to clear the datapoints.
            for cluster in clusters:
                cluster.datapoints = []

            class_data = X_train[y_train['Genre'] == current_class] # X_train[mask] - selects the row where the mask is true (works because of numpy)
            for x in class_data:
                closest_cluster = clusters[0]
                closest_distance = eucledian_distance(x, closest_cluster.mean)
                for c in range(1, len(clusters)):
                    if eucledian_distance(x, clusters[c].mean) < closest_distance:
                        closest_distance = eucledian_distance(x, clusters[c].mean)
                        closest_cluster = clusters[c]
                
                closest_cluster.datapoints.append(x)


        
            for cluster in clusters:
                cluster.mean = calculate_mean(X_train)
                cluster.covariance = calculate_covariance(X_train, cluster.mean)
                cluster.accumulated_distance = calculate_accumulated_distance(X_train, cluster.mean, cluster.covariance)









classes = y_train['Genre'].unique()
#print(f"Classes: {classes}")
create_classifier(X_train, y_train, classes)


[0. 0. 0. 0.]


KeyError: 0