In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
import math
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from dataclasses import dataclass

from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA


# Read the tab-separated feature table and add a running row number as 'TrackID'.
data = pd.read_csv('data/GenreClassData_30s.txt', sep='\t')
data["TrackID"] = range(len(data))

# Split the data into training and testing sets
# (the 'Type' column marks each row as 'Train' or 'Test').
train = data[data['Type'] == 'Train']
test = data[data['Type'] == 'Test']


#all_features = [col for col in data.columns if col not in ['Track ID','TrackID', 'File', 'GenreID', 'Genre', 'Type']]
# NOTE: the triple-quoted block below is a bare string literal, not an assignment.
# It is never used at runtime and only preserves the longer candidate feature list.
'''all_features = [
    'zero_cross_rate_mean','zero_cross_rate_std','rmse_mean','rmse_var',
    'spectral_centroid_mean','spectral_centroid_var','spectral_bandwidth_mean','spectral_bandwidth_var',
    'spectral_rolloff_mean','spectral_rolloff_var','spectral_contrast_mean','spectral_contrast_var',
    'spectral_flatness_mean','spectral_flatness_var',
    'chroma_stft_1_mean','chroma_stft_2_mean','chroma_stft_3_mean','chroma_stft_4_mean',
    'chroma_stft_5_mean','chroma_stft_6_mean','chroma_stft_7_mean','chroma_stft_8_mean',
    'chroma_stft_9_mean','chroma_stft_10_mean','chroma_stft_11_mean','chroma_stft_12_mean',
    'chroma_stft_1_std','chroma_stft_2_std','chroma_stft_3_std','chroma_stft_4_std',
    'chroma_stft_5_std','chroma_stft_6_std','chroma_stft_7_std','chroma_stft_8_std',
    'chroma_stft_9_std','chroma_stft_10_std','chroma_stft_11_std','chroma_stft_12_std',
    'tempo',
    'mfcc_1_mean','mfcc_2_mean','mfcc_3_mean','mfcc_4_mean','mfcc_5_mean','mfcc_6_mean',
    'mfcc_7_mean','mfcc_8_mean','mfcc_9_mean','mfcc_10_mean','mfcc_11_mean','mfcc_12_mean',
    'mfcc_1_std','mfcc_2_std','mfcc_3_std','mfcc_4_std','mfcc_5_std','mfcc_6_std',
    'mfcc_7_std','mfcc_8_std','mfcc_9_std','mfcc_10_std','mfcc_11_std','mfcc_12_std'
]'''
# Feature subset actually used by the rest of the notebook.
all_features = [
    'zero_cross_rate_mean','zero_cross_rate_std','rmse_mean','rmse_var',
    'spectral_centroid_mean','spectral_centroid_var','spectral_bandwidth_mean','spectral_bandwidth_var',
    'spectral_rolloff_mean','spectral_rolloff_var','spectral_contrast_mean','spectral_contrast_var',
    'spectral_flatness_mean','spectral_flatness_var',
    'chroma_stft_7_mean',
    
    'tempo',
    'mfcc_1_mean','mfcc_2_mean','mfcc_3_mean','mfcc_4_mean','mfcc_5_mean','mfcc_6_mean',

    'mfcc_2_std','mfcc_3_std','mfcc_4_std','mfcc_5_std', 'mfcc_7_std'
]
# Lessons learned:
# the 'chroma_stft_x_std' features perform poorly and drag the accuracy down
features = all_features

#features = ['spectral_rolloff_mean', 'mfcc_1_mean', 'spectral_centroid_mean', 'chroma_stft_10_mean']

# Label column(s); kept as a list, so y_train / y_test below are single-column DataFrames.
targets = ['Genre']

# Feature matrices and label frames for the train and test partitions
X_train, y_train = train[features], train[targets]
X_test, y_test = test[features], test[targets]



## Thoughts before implementing 

Idea: Implement an algorithm that uses the training data to
* iterate through each class
    * split the class into M clusters that are used as references for that class


So: We have an iterative procedure for finding the number of clusters (class references).
In order to implement this procedure we need a stopping criterion, since we don't know the number of clusters that will give the best performance beforehand.

Starting with 1 cluster per class (M=1) we calculate

* Λ_s (contains the mean and covariance as a tuple)
* D_1 (the sum of distances between all points in this class and the mean of that class - using Mahalanobis distance or euclidean for example)
These D matrices need to be stored somewhere so that we can compare them


Then iteratively we increase M by splitting one of the clusters in two.

1. Choose which cluster to split - pick the cluster with the largest accumulated distance (D), or alternatively the cluster with the most data points.
2. The cluster to split is denoted as Λ_s. To split this cluster use a small noise vector w and create two new centers:
    * my_s1 = my_s + w 
    * my_s2 = my_s - w 
* The covariance matrices are simply copied

3. Now for each training point x_k in class w_i
    * Compute the distance to all M+1 cluster centers
        * d = d(x_k, my_j)
    * Assign x_k to the cluster with lowest distance

4. Now since we have reassigned our data, we need to recompute the parameters for our clusters
    * compute new my_j and covarianceMatrix_j as done earlier


5. Evaluate total distance D_M
    * Find the accumulated distance and compare with previous D_M-1
        * If D_M << D_M-1: continue to next split
        * If not, stop splitting for this class -> we are satisfied.


Summarized:

Loop over classes:
* Start with 1 cluster
* While improvement:
    * Find worst cluster
    * Split μ with noise ±w
    * Assign training vectors
    * Recompute μ, Σ
    * Compute new total distance
    * Check for convergence


In [None]:
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt

# Project the training features onto their first two principal components
# and colour every sample by its genre label.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train)

fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    hue=y_train['Genre'],
    palette='bright',
    ax=ax,
)
ax.set_title("PCA visualization of genres")
plt.show()


In [None]:
# Dataclass that stores the parameters and member samples of one cluster
@dataclass(eq=False)
class Cluster:
    """One cluster (class reference) of a genre.

    ``eq=False`` means no field-wise ``__eq__`` is generated, so instances
    compare by identity (the default ``object`` behaviour).
    """
    # Mean vector of the cluster.
    mean: np.ndarray
    # Covariance matrix of the cluster.
    covariance: np.ndarray
    # Samples assigned to the cluster; create_classifier passes the class's
    # rows as a 2-D array here.
    datapoints: np.ndarray
    # Value returned by calculate_accumulated_distance for this cluster.
    accumulated_distance: float = 0.0
    # Never assigned in the visible code; presumably meant to cache the
    # inverse of `covariance` -- TODO confirm or remove.
    inv_covariance: np.ndarray = None

In [None]:
## Helper functions

# Euclidean distance between a sample and a reference point
def eucledian_distance(datapoint, mu):
    """Return the Euclidean distance between ``datapoint`` and ``mu``.

    Both arguments are indexed position by position over
    ``range(len(datapoint))``; the squared differences are summed and the
    square root of the sum is returned.
    """
    squared_sum = sum(
        (datapoint[idx] - mu[idx]) ** 2 for idx in range(len(datapoint))
    )
    return np.sqrt(squared_sum)

def calculate_mean(X):
    """Return the per-feature mean of the rows of ``X``.

    Args:
        X: 2-D array of shape (n_samples, n_features).

    Returns:
        1-D array of length n_features holding the column means.
    """
    n_samples, n_features = X.shape

    # Accumulate the rows one by one into a running total.
    running_total = np.zeros(n_features)
    for row in X:
        running_total += row

    return running_total / n_samples

def calculate_covariance(X, mean):
    """Return the covariance matrix of the rows of ``X`` around ``mean``.

    The matrix is the sum of the outer products of the centred samples
    divided by ``n_samples`` (not ``n_samples - 1``).

    Args:
        X: 2-D array of shape (n_samples, n_features).
        mean: 1-D array of length n_features to centre the samples on.

    Returns:
        Array of shape (n_features, n_features).
    """
    n_samples, n_features = X.shape
    centred = X - mean

    # Sum the per-sample outer products, then normalise by the sample count.
    scatter = np.zeros((n_features, n_features))
    for deviation in centred:
        scatter += np.outer(deviation, deviation)

    return scatter / n_samples

def mahalanobis_distance(x, mean, cov):
    """Return the squared Mahalanobis distance between ``x`` and ``mean``.

    Computes ``(x - mean)^T @ inv(cov) @ (x - mean)``. No square root is
    taken, so the value is the squared distance.

    Args:
        x: Sample vector.
        mean: Mean vector of the reference distribution.
        cov: Covariance matrix of the reference distribution. It is never
            modified by this function.

    Returns:
        The quadratic form as a scalar.

    If ``cov`` cannot be inverted (``numpy.linalg.LinAlgError``), the inverse
    of ``cov + 1e-6 * I`` is used instead.
    """
    try:
        inv_cov = np.linalg.inv(cov)
    except np.linalg.LinAlgError:
        # Regularise a *copy*: an in-place `cov += ...` would silently change
        # the caller's matrix and fails outright for integer-typed input.
        cov_array = np.asarray(cov, dtype=float)
        regularised = cov_array + np.eye(cov_array.shape[0]) * 1e-6
        inv_cov = np.linalg.inv(regularised)

    diff = x - mean
    return diff.T @ inv_cov @ diff


# Sum of the squared Mahalanobis distances between all samples and a reference
def calculate_accumulated_distance(X, mean, covariance):
    '''
    Return the total squared Mahalanobis distance from the rows of X to mean.

    Args:
        X: 2-D array of shape (n_samples, n_features).
        mean: 1-D array of length n_features.
        covariance: (n_features, n_features) covariance matrix. It is not
            modified by this function.

    Returns:
        The accumulated distance D as a float (0.0 when X has no rows).

    The covariance is inverted once for the whole batch. If it cannot be
    inverted, covariance + 1e-6 * I is used, the same constant
    mahalanobis_distance uses for its fallback.

    Note: when mean and covariance are the estimates computed from X itself
    with calculate_mean / calculate_covariance (and the covariance is
    invertible), this sum equals n_samples * n_features, because
    sum_k d_k^T S^-1 d_k = trace(S^-1 * n * S) = n * n_features.
    It therefore cannot decrease when a cluster is split and re-estimated.
    '''
    X = np.asarray(X, dtype=float)
    n_samples, _ = X.shape
    if n_samples == 0:
        return 0.0

    covariance = np.asarray(covariance, dtype=float)
    try:
        inv_cov = np.linalg.inv(covariance)
    except np.linalg.LinAlgError:
        # Regularise a copy so the caller's matrix stays untouched.
        inv_cov = np.linalg.inv(covariance + np.eye(covariance.shape[0]) * 1e-6)

    # Row-wise quadratic form d_k^T @ inv_cov @ d_k, summed over all samples.
    diffs = X - np.asarray(mean, dtype=float)
    return float(np.sum((diffs @ inv_cov) * diffs))


def find_cluster_to_split_distance(cluster_dict, current_class):
    """
    Pick the cluster of `current_class` whose `accumulated_distance` is largest.

    On ties the cluster that appears first in `cluster_dict[current_class]`
    is returned.
    """
    candidates = cluster_dict[current_class]

    worst = candidates[0]
    for candidate in candidates[1:]:
        # Strict comparison keeps the earliest cluster on equal distances.
        if candidate.accumulated_distance > worst.accumulated_distance:
            worst = candidate

    return worst

def find_cluster_to_split_points(cluster_dict, current_class):
    """
    Pick the cluster of `current_class` that holds the most datapoints.

    A class with a single cluster returns that cluster directly, without
    looking at its datapoints. On ties the first cluster in the list wins.
    """
    candidates = cluster_dict[current_class]

    if len(candidates) != 1:
        # `datapoints` only needs to support len() (numpy array or list).
        return max(candidates, key=lambda candidate: len(candidate.datapoints))

    return candidates[0]

    




In [None]:
def create_classifier(X_train, y_train, classes):
    """Fit one cluster per class from the training data.

    For every label in ``classes`` the matching rows of ``X_train`` are
    selected and summarised by their mean, covariance and accumulated
    distance. No cluster splitting is performed, so every class ends up
    with exactly one ``Cluster``.

    Args:
        X_train: DataFrame of training features, one row per sample.
        y_train: DataFrame with a 'Genre' column, row-aligned with ``X_train``.
        classes: Iterable of class labels to build clusters for.

    Returns:
        dict mapping each class label to a list containing its ``Cluster``.
    """
    cluster_dict = {}

    for current_class in classes:
        # Boolean mask on the label column selects this class's rows.
        class_data = X_train[y_train['Genre'] == current_class].to_numpy()

        mean = calculate_mean(class_data)
        cov = calculate_covariance(class_data, mean)
        acc_dist = calculate_accumulated_distance(class_data, mean, cov)

        # The value stays a list so callers can treat every class as a
        # collection of clusters.
        cluster_dict[current_class] = [
            Cluster(
                mean=mean,
                covariance=cov,
                datapoints=class_data,
                accumulated_distance=acc_dist,
            )
        ]

    return cluster_dict




# Class labels in order of first appearance in the training labels.
classes = y_train['Genre'].unique()
#print(f"Classes: {classes}")
# Threshold for the splitting stop rule (continue while D_M < beta * D_{M-1}).
# create_classifier does not execute a splitting loop, so this value currently
# has no effect on the result.
beta = 0.99
cluster_dict = create_classifier(X_train, y_train, classes)


In [None]:
# Report how many clusters each class ended up with.
for genre in classes:
    cluster_count = len(cluster_dict[genre])
    print(f"{genre} has {cluster_count} clusters")

In [None]:
def plot_clusters_for_class(class_name, cluster_dict, X_train, y_train):
    """Scatter one class's samples and its cluster centres in 2-D PCA space.

    Args:
        class_name: Label of the class to plot.
        cluster_dict: Mapping from class label to a list of clusters; each
            cluster's ``mean`` attribute is projected and drawn.
        X_train: Training features, one row per sample.
        y_train: DataFrame with a 'Genre' column, row-aligned with ``X_train``.

    The PCA is fitted on this class's samples only, so the axes are not
    comparable between classes.
    """
    # Fit on a plain ndarray: the cluster means transformed below are plain
    # arrays as well, and mixing a DataFrame fit with an ndarray transform
    # makes scikit-learn emit a feature-name warning for every call.
    class_data = np.asarray(X_train[y_train['Genre'] == class_name])

    pca = PCA(n_components=2)  # reduce to 2 dimensions for plotting
    class_data_2d = pca.fit_transform(class_data)

    clusters = cluster_dict[class_name]

    # Plot the data points
    plt.figure(figsize=(8, 6))
    plt.scatter(class_data_2d[:, 0], class_data_2d[:, 1], c='lightgray', label='Data points')

    # Plot the cluster centres (all projected and drawn in one call, one legend entry)
    if clusters:
        means = np.vstack([np.asarray(cluster.mean).reshape(1, -1) for cluster in clusters])
        means_2d = pca.transform(means)
        plt.scatter(means_2d[:, 0], means_2d[:, 1], c='red', marker='X', s=100, label='Cluster')

    plt.title(f"Clusters for class '{class_name}'")
    plt.legend()
    plt.grid(True)
    plt.show()


In [None]:
# Draw the cluster plot for each listed genre, in this order.
# NOTE(review): the notebook's conclusion mentions 10 genres but only these
# eight are plotted -- confirm whether leaving the others out is intentional.
genres_to_plot = (
    'blues',
    'classical',
    'country',
    'disco',
    'hiphop',
    'jazz',
    'metal',
    'pop',
)
for genre_name in genres_to_plot:
    plot_clusters_for_class(genre_name, cluster_dict, X_train, y_train)

In [None]:
def classify_sample(x, cluster_dict):
    """Return the class label whose nearest cluster is closest to ``x``.

    Every cluster of every class is scored with ``mahalanobis_distance``;
    the label of the cluster with the smallest score wins. On ties the
    cluster visited first is kept. An empty string is returned when no
    cluster scores below infinity.
    """
    winner = ''
    winner_distance = float('inf')

    for label in cluster_dict:
        for candidate in cluster_dict[label]:
            distance = mahalanobis_distance(x, candidate.mean, candidate.covariance)
            if not distance < winner_distance:
                continue
            winner, winner_distance = label, distance

    return winner
    

# Sanity check: classify the first test sample and compare with its label.
first_true_label = y_test.iloc[0]['Genre']
predicted_label = classify_sample(X_test.iloc[0], cluster_dict)
print(f"Predicted label: {predicted_label}. True label: {first_true_label}")


In [None]:
def predict(X_test, cluster_dict):
    """Classify every row of ``X_test`` and return the labels as a list.

    Args:
        X_test: DataFrame of feature rows; converted with ``to_numpy()``.
        cluster_dict: Mapping from class label to its list of clusters.

    Returns:
        List of predicted labels, one per row, in row order.
    """
    return [classify_sample(row, cluster_dict) for row in X_test.to_numpy()]

In [None]:

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Evaluate the classifier on the test partition.
# The 1-D label array is used for every metric (the original mixed it with the
# single-column `y_test` DataFrame), and the accuracy is computed only once.
y_test_true = y_test['Genre'].to_numpy()
y_pred = predict(X_test, cluster_dict)

accuracy = accuracy_score(y_test_true, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Sorted unique test labels; fixes the row/column order of the report and matrix.
# Note: this rebinds the notebook-level name `classes`.
classes = np.unique(y_test_true)

print("Overall accuracy:   ", accuracy)
print()
print("Per‐class performance:")
print(classification_report(
    y_test_true,
    y_pred,
    labels=classes,
    target_names=classes,
    digits=4
))
print("Confusion matrix:")
print(confusion_matrix(
    y_test_true,
    y_pred,
    labels=classes
))


## Liten konklusjon:

Music often has overlapping properties with large variations. Classification of 10 genres is a demanding task, and a simple clustering approach may not be sufficient.

Also, by splitting a parent cluster into two "baby-clusters" that are simply shifted with a noise vector we are hurting the class-separator. We should instead have the splits sit on the actual peaks of the data density, and not on random perturbations.
In other words: move away from "split & hope".

List of improvements:
FIRST: Replace the splitting logic with GMM + EM
1) Normalize the data - we need to scale the features so that they all have a similar scale
2) Evaluer klyngene
    Plott klyngene i 2D (PCA) og se om datapunktene i hver klynge virkelig hører til den tilsiktede sjangeren, eller om mange punkter er åpenbart “feilplassert”.
3) For å finne ut om teknikken virkelig passer, bør du sammenligne med en standard klassifikator (f.eks. SVM, RF, MLP) og også undersøke parametere (antall klynger, normalisering, robust estimering av kovarianser, osv.). Hvis du raskt kan få bedre resultater med mer vanlige supervised-metoder, er det et tydelig tegn på at kluster-tilnærmingen ikke er særlig godt egnet eller i det minste krever langt mer avanserte justeringer.
