In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
import math
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from dataclasses import dataclass



data = pd.read_csv('data/GenreClassData_30s.txt', sep='\t')
data["TrackID"] = range(len(data))

# Split the data into training and testing sets
train = data[data['Type'] == 'Train']
test = data[data['Type'] == 'Test']


all_features = [col for col in data.columns if col not in ['Track ID','TrackID', 'File', 'GenreID', 'Genre', 'Type']]

# Define the features and targets
features = ['spectral_rolloff_mean', 'mfcc_1_mean', 'spectral_centroid_mean', 'chroma_stft_10_mean']
targets = ['Genre']

# feature data
X_train = train[features]
# genre data
y_train = train[targets]

X_test, y_test = test[features], test[targets]



## Thoughts before implementing 

Idea: Implement an algorithm that 
* iterates through each class
    * splits the class into M clusters that are used as references for that class


So: We have an interative procedure for finding the number of clusters (class references).
In order to implement this procedure we need a stopping criteria, since we don't know the exact number beforehand.

Start with 1 cluster per class (M=1)
calculate
* Λ_s (contains the mean and covariance for M=1)
* D_1 (the sum of distances between all points in this class and the mean of that class - using Mahalanobis distance or euclidean for example)
These D matrices need to be stored somewhere so that we can compare them


Then iteratively we increase M by splitting one of the clusters in two.

1. Choose which cluster to split - pick the cluster with the largest accumulated distance (D). Or choose the 
2. The cluster to split is denoted as Λ_s. To split this cluster use a small noise vector w and create two new centers:
    * my_s1 = my_s + w 
    * my_s2 = my_s - w 
* The covariance matrices are simply copied

3. Now for each training point x_k in class w_i
    * Compute the distance to all M+1 cluster centers
        * d = d(x_k, my_j)
    * Assign x_k to the cluster with lowest distance

4. Now since we have reassigned our data, we need to recompute the parameters for our clusters
    * compute new my_j and covarianceMatrix_j as done earlier


5. Evaluate total distance D_M
    * Find the accumulated distance and compare with previous D_M-1
        * If D_M << D_M-1: continue to next split
        * if not stop splitting fo this class -> we are satsified.


Summarized:

Loop over classes:
* Start with 1 cluster
* While improvement:
    * Find worst cluster
    * Split μ with noise ±w
    * Assign training vectors
    * Recompute μ, Σ
    * Compute new total distance
    * Check for convergence


In [2]:
# Creating a dataclass for storing the structured data
@dataclass
class Cluster:
    mean: np.ndarray
    covariance: np.ndarray
    datapoints: np.ndarray
    accumulated_distance: float = 0.0
    inv_covariance: np.ndarray = None

In [3]:
## Helper functions

# Calculates the Euclidean distance between two music samples
def eucledian_distance(datapoint, mu):
    distance = 0
    for i in range(len(datapoint)):
        distance += (datapoint[i] - mu[i]) ** 2
    return np.sqrt(distance)

def calculate_mean(X):
    n_samples, n_features = X.shape
    accumalated_sum = np.zeros(n_features)
    for k in range(n_samples):
        accumalated_sum += X[k][:]
    
    mean = accumalated_sum/n_samples
    return mean

def calculate_covariance(X, mean):
    n_samples, n_features = X.shape
    covariance_matrix = np.zeros((n_features, n_features))
    for k in range(n_samples):
        diff = X[k] - mean
        covariance_matrix += np.outer(diff, diff)

    covariance_matrix /= n_samples
    return covariance_matrix

# Calculates the sum of distances between all points in this class and the mean of that class
import numpy as np

def calculate_accumulated_distance(X, mean, covariance):
    '''
    Args:
        X
        mean
        covariance
    '''
    n_samples, _ = X.shape
    accumulated_distance = 0.0

    try:
        inv_cov = np.linalg.inv(covariance)
    except np.linalg.LinAlgError:
        covariance += np.eye(covariance.shape[0]) * 1e-6
        inv_cov = np.linalg.inv(covariance)

    for k in range(n_samples):
        diff = X[k] - mean
        distance = diff.T @ inv_cov @ diff  # Mahalanobis squared
        accumulated_distance += distance

    return accumulated_distance  # total D


def find_cluster_to_split(cluster_dict, current_class):
    clusters = cluster_dict[current_class]
    if(len(clusters) == 1):
        return clusters[0]
    else:
        
        cluster_to_split = clusters[0]
        for i in range(1, len(clusters)):
            if(clusters[i].accumulated_distance > cluster_to_split.accumulated_distance):
                cluster_to_split = clusters[i] 

    return cluster_to_split
    




In [None]:
def create_classifier(X_train, y_train, classes, beta):
    cluster_dict = {}
    for current_class in classes:
        # Start with one cluster
        M = 1
        cluster_dict[current_class] = []
        clusters = cluster_dict[current_class]

        class_data = X_train[y_train['Genre'] == current_class].to_numpy() # X_train[mask] - selects the row where the mask is true (works because of numpy)

        # Calculate the mean of the current cluster
        mean = calculate_mean(class_data)
        cov = calculate_covariance(class_data, mean)
        acc_dist = calculate_accumulated_distance(class_data, mean, cov)
        
        cluster_main = Cluster(mean=mean, covariance=cov, datapoints=class_data, accumulated_distance=acc_dist)

        clusters.append(cluster_main)
        we_are_improving = True

        previous_accumulated_distance = 0
        for cluster in clusters:
                previous_accumulated_distance += cluster.accumulated_distance

        while(we_are_improving):
            
            current_accumulated_distance = 0

            

            splitting_cluster = find_cluster_to_split(cluster_dict, current_class)
            mu = splitting_cluster.mean
            cov = splitting_cluster.covariance
            
            # Use the varianve of the splitting cluster to generate a noise vector
            
            stds = np.sqrt(np.diag(cov))
            w = np.random.uniform(low=-stds, high=stds)


            # Altered mean for the new two clusters
            mu_1 = mu + w
            mu_2 = mu - w
            cluster_split_1 = Cluster(mean=mu_1, covariance=cov, datapoints=[])
            cluster_split_2 = Cluster(mean=mu_2, covariance=cov, datapoints=[])

            print("Splitting cluster:")
            print("  mu =", mu)
            print("  w  =", w)
            print("  mu_1 =", mu_1)
            print("  mu_2 =", mu_2)



            # Update our clusters array
            clusters.remove(splitting_cluster)
            clusters.append(cluster_split_1)
            clusters.append(cluster_split_2)

            # Before assigning points to clusters, need to clear the datapoints.
            for cluster in clusters:
                cluster.datapoints = []


            # Assign all datapoints between our new clusters. Necessary since there are new distributions in the collection.

            
            for x in class_data:
                closest_cluster = clusters[0]
                closest_distance = eucledian_distance(x, closest_cluster.mean)
                for c in range(1, len(clusters)):
                    if eucledian_distance(x, clusters[c].mean) < closest_distance:
                        closest_distance = eucledian_distance(x, clusters[c].mean)
                        closest_cluster = clusters[c]
                
                closest_cluster.datapoints.append(x)
        
            # After assigning the datapoints, we need to calculate the new mean, covariance and accumulated distance
            for cluster in clusters:
                if len(cluster.datapoints) == 0:
                    continue 
                X_cluster = np.array(cluster.datapoints)
                cluster.mean = calculate_mean(X_cluster)
                cluster.covariance = calculate_covariance(X_cluster, cluster.mean)
                cluster.accumulated_distance = calculate_accumulated_distance(X_cluster, cluster.mean, cluster.covariance)

                current_accumulated_distance+= cluster.accumulated_distance

            
            if current_accumulated_distance >= beta* previous_accumulated_distance:
                we_are_improving = False
            else:
                previous_accumulated_distance = current_accumulated_distance



    return cluster_dict




classes = y_train['Genre'].unique()
#print(f"Classes: {classes}")
beta = 0.99
cluster_dict = create_classifier(X_train, y_train, classes, beta)


Splitting cluster:
  mu = [ 6.57230942e+03 -6.98543422e+01  3.02657942e+03  4.02214269e-01]
  w  = [ 5.31715547e+02  2.75079249e+01  5.33149144e+02 -6.53412071e-02]
  mu_1 = [ 7.10402497e+03 -4.23464172e+01  3.55972856e+03  3.36873062e-01]
  mu_2 = [ 6.04059388e+03 -9.73622671e+01  2.49343027e+03  4.67555476e-01]
Splitting cluster:
  mu = [ 5.07810409e+03 -6.48245061e+01  2.56655673e+03  4.95371905e-01]
  w  = [ 1.21147351e+02 -3.11244698e+01 -6.19111193e+01  1.60949183e-02]
  mu_1 = [ 5.19925144e+03 -9.59489759e+01  2.50464561e+03  5.11466824e-01]
  mu_2 = [ 4.95695673e+03 -3.37000364e+01  2.62846785e+03  4.79276987e-01]
Splitting cluster:
  mu = [ 5.57114500e+03 -9.08886046e+01  2.64650614e+03  4.11966273e-01]
  w  = [ 2.69633678e+02 -3.44751707e+01  1.60598482e+02 -6.32407093e-02]
  mu_1 = [ 5.84077868e+03 -1.25363775e+02  2.80710462e+03  3.48725564e-01]
  mu_2 = [ 5.30151133e+03 -5.64134338e+01  2.48590766e+03  4.75206983e-01]
Splitting cluster:
  mu = [ 3.65012887e+03 -1.66777030e

In [5]:
for i in classes:
    print(f"{i} has {len(cluster_dict[i])} clusters")

pop has 2 clusters
metal has 2 clusters
disco has 2 clusters
blues has 2 clusters
reggae has 2 clusters
classical has 2 clusters
rock has 2 clusters
hiphop has 2 clusters
country has 2 clusters
jazz has 2 clusters
