In [None]:
import math
from collections import Counter
from dataclasses import dataclass
from typing import Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler


# Load the tab-separated feature table and give every row a sequential id.
data = pd.read_csv('data/GenreClassData_30s.txt', sep='\t')
data["TrackID"] = range(len(data))

# Split the data into training and testing sets using the 'Type' column
train = data[data['Type'] == 'Train']
test = data[data['Type'] == 'Test']


# Feature selection
# -----------------
# To use every available column instead of the hand-picked subset below:
# all_features = [col for col in data.columns
#                 if col not in ['Track ID', 'TrackID', 'File', 'GenreID', 'Genre', 'Type']]
#
# An earlier variant listed 63 features explicitly. Compared with that list,
# the current subset (27 features) leaves out:
#   - chroma_stft_{1-6,8-12}_mean
#   - chroma_stft_{1-12}_std
#   - mfcc_{7-12}_mean
#   - mfcc_{1,6,8-12}_std
# Lessons learned: the 'chroma_stft_x_std' features are poor and pull the
# accuracy down.
all_features = [
    'zero_cross_rate_mean', 'zero_cross_rate_std', 'rmse_mean', 'rmse_var',
    'spectral_centroid_mean', 'spectral_centroid_var',
    'spectral_bandwidth_mean', 'spectral_bandwidth_var',
    'spectral_rolloff_mean', 'spectral_rolloff_var',
    'spectral_contrast_mean', 'spectral_contrast_var',
    'spectral_flatness_mean', 'spectral_flatness_var',
    'chroma_stft_7_mean',
    'tempo',
    'mfcc_1_mean', 'mfcc_2_mean', 'mfcc_3_mean', 'mfcc_4_mean', 'mfcc_5_mean', 'mfcc_6_mean',
    'mfcc_2_std', 'mfcc_3_std', 'mfcc_4_std', 'mfcc_5_std', 'mfcc_7_std',
]
features = all_features

# Small alternative subset that was also tried:
# features = ['spectral_rolloff_mean', 'mfcc_1_mean', 'spectral_centroid_mean', 'chroma_stft_10_mean']

targets = ['Genre']

# Feature matrices and label frames (y_* stay DataFrames with a 'Genre' column)
X_train, y_train = train[features], train[targets]
X_test, y_test = test[features], test[targets]

# Standardise the features. Mean and standard deviation are computed on the
# TRAINING data only and then applied to both splits, so no test statistics
# leak into the model. After this step X_train / X_test are the arrays
# returned by the scaler (later cells index them positionally, e.g. X_test[0]).
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)




In [None]:
# Dataclass storing the parameters of one Gaussian component (cluster)
@dataclass(eq=False)
class Cluster:
    """Container for a single cluster of a class model.

    ``eq=False`` stops the dataclass machinery from generating a field-wise
    ``__eq__``; instances therefore compare (and hash) by identity. A generated
    ``__eq__`` would compare the NumPy array fields element-wise, which can
    raise "truth value of an array is ambiguous".

    Attributes:
        mean: Mean vector of the cluster.
        covariance: Covariance matrix of the cluster.
        datapoints: Samples associated with the cluster, or ``None`` when they
            are not retained (the default).
        accumulated_distance: Running distance total for the cluster; starts
            at 0.0.
        inv_covariance: Presumably a slot for a cached inverse of
            ``covariance``; ``None`` until something fills it in.
    """

    mean: np.ndarray
    covariance: np.ndarray
    datapoints: Optional[np.ndarray] = None
    accumulated_distance: float = 0.0
    inv_covariance: Optional[np.ndarray] = None


def mahalanobis_distance(x, mean, cov, reg=1e-6):
    """Return the squared Mahalanobis distance between ``x`` and ``mean``.

    The value is ``(x - mean)^T @ inv(cov) @ (x - mean)``. No square root is
    taken, so this is the *squared* distance; the ordering of distances is the
    same as for the true distance.

    If ``cov`` is singular, ``reg`` times the identity matrix is added to a
    copy of it before inverting. The caller's ``cov`` array is never modified.

    Args:
        x: Sample vector (array-like).
        mean: Mean vector (array-like) with the same length as ``x``.
        cov: Square covariance matrix (array-like).
        reg: Diagonal regularisation used only when ``cov`` cannot be
            inverted as given. Defaults to 1e-6.

    Returns:
        The squared Mahalanobis distance as a NumPy scalar.

    Raises:
        numpy.linalg.LinAlgError: If the matrix is still singular after
            regularisation.
    """
    cov = np.asarray(cov)
    try:
        inv_cov = np.linalg.inv(cov)
    except np.linalg.LinAlgError:
        # Regularise out of place: an in-place `cov += ...` would silently
        # change the caller's covariance matrix and fails for integer dtypes.
        inv_cov = np.linalg.inv(cov + np.eye(cov.shape[0]) * reg)

    diff = np.asarray(x) - np.asarray(mean)
    return diff.T @ inv_cov @ diff

In [None]:
def create_classifier_with_gmm(X_train, y_train, classes, M):
    """Fit one Gaussian mixture per class and return its components as clusters.

    For each label in ``classes`` the rows of ``X_train`` whose
    ``y_train['Genre']`` equals that label are selected, and a
    ``GaussianMixture`` with ``M`` full-covariance components
    (``random_state=0``) is fitted to them. The mean and covariance of every
    fitted component are wrapped in a ``Cluster``; the mixture weights are not
    stored.

    Args:
        X_train: Training feature matrix, indexable by a boolean row mask.
        y_train: Label table with a 'Genre' column, row-aligned with
            ``X_train``.
        classes: Iterable of class labels to build a model for.
        M: Number of mixture components fitted per class.

    Returns:
        dict mapping each class label to the list of ``Cluster`` objects built
        from that class's fitted mixture components.
    """
    cluster_dict = {}

    for current_class in classes:
        # Boolean row mask selecting the training samples of this class
        class_data = X_train[y_train['Genre'] == current_class]

        mixture = GaussianMixture(
            n_components=M,
            covariance_type="full",
            random_state=0,
        )
        mixture.fit(class_data)

        # One Cluster per fitted component
        cluster_dict[current_class] = [
            Cluster(
                mean=component_mean,
                covariance=component_cov,
                datapoints=None,
                accumulated_distance=0.0,
            )
            for component_mean, component_cov in zip(mixture.means_, mixture.covariances_)
        ]

    return cluster_dict




# Distinct genre labels present in the training targets; one mixture model is
# built for each of them.
classes = y_train['Genre'].unique()
#print(f"Classes: {classes}")
# Number of Gaussian components fitted per class
M = 60
# Fit the per-class mixtures and collect their components as Cluster objects
cluster_dict = create_classifier_with_gmm(X_train, y_train, classes, M)


In [None]:
def classify_sample(x,cluster_dict):
    """Return the class whose closest cluster is nearest to ``x``.

    Every cluster of every class is scored with ``mahalanobis_distance`` using
    the cluster's own mean and covariance; the label owning the cluster with
    the smallest score wins. On ties the first cluster encountered is kept.

    Args:
        x: Feature vector of the sample to classify.
        cluster_dict: Mapping from class label to a list of clusters, each
            exposing ``mean`` and ``covariance``.

    Returns:
        The winning class label, or ``''`` when no cluster produced a distance
        below infinity (for example when ``cluster_dict`` is empty).
    """
    winner = ''
    smallest = float('inf')

    # Flatten the class -> clusters mapping into (label, cluster) pairs
    candidates = (
        (label, component)
        for label, components in cluster_dict.items()
        for component in components
    )
    for label, component in candidates:
        distance = mahalanobis_distance(x, component.mean, component.covariance)
        if not distance < smallest:
            continue
        smallest, winner = distance, label

    return winner
    

# Smoke test: classify the first test sample and print it next to its true genre
predicted_label = classify_sample(X_test[0],cluster_dict)
print(f"Predicted label: {predicted_label}. True label: {y_test.iloc[0]['Genre']}")


In [None]:
def predict(X_test, cluster_dict):
    """Classify every sample in ``X_test``.

    Args:
        X_test: Iterable of feature vectors; each element yielded by iterating
            it is passed to ``classify_sample`` as one sample.
        cluster_dict: Mapping from class label to that class's clusters.

    Returns:
        list with one predicted label per sample, in input order.
    """
    return [classify_sample(sample, cluster_dict) for sample in X_test]

In [None]:

# True genres of the test set and the classifier's prediction for each sample
y_test_true = y_test['Genre']
y_pred = predict(X_test, cluster_dict)

# Score the predictions against the true genres and report to four decimals
accuracy = accuracy_score(y_test_true, y_pred)
print(f"Accuracy: {accuracy:.4f}")
