In [1]:
from sklearn.mixture import GaussianMixture
from collections import Counter
import numpy as np
import os

In [2]:
def abspath(path, *paths):
    """Build a path located under the parent of the current working directory.

    The result is ``<cwd>/<os.pardir>/<path>/<paths...>`` joined with
    ``os.path.join``. The parent-directory component is kept verbatim; the
    path is not normalised.

    Args:
        path: First path component below the parent directory.
        *paths: Any further components, appended in order.

    Returns:
        The joined path string.
    """
    return os.path.join(os.getcwd(), os.pardir, path, *paths)

In [3]:
def evaluation_metrics(pred_labels, true_labels):
    """Print the purity and the mean per-cluster Gini index of a clustering.

    Samples are grouped by predicted label and the true labels inside each
    group are tallied. Purity is the sum of every group's largest tally
    divided by the number of samples. Each group's Gini index is
    ``1 - sum(p ** 2)`` over the shares ``p`` of its true labels; the
    reported value is the plain (unweighted) mean over groups.

    Args:
        pred_labels: Sized, indexable collection of predicted cluster labels.
        true_labels: Indexable collection of true labels, read at the same
            positions as ``pred_labels``.

    Returns:
        None. Both metrics are printed, rounded to four decimals.
    """
    sample_count = len(pred_labels)

    # Collect the true labels under the cluster each sample was assigned to.
    members_by_cluster = {}
    for idx in range(sample_count):
        members_by_cluster.setdefault(pred_labels[idx], []).append(true_labels[idx])

    # One tally of true labels per predicted cluster.
    tallies = {
        cluster_id: Counter(members)
        for cluster_id, members in members_by_cluster.items()
    }
    cluster_count = len(tallies)

    # Purity: share of samples that carry their cluster's majority label.
    purity = sum(max(tally.values()) for tally in tallies.values()) / sample_count

    # Gini index: accumulate 1 - sum(p^2) per cluster, then average.
    impurity_total = 0
    for tally in tallies.values():
        cluster_size = sum(tally.values())
        squared_shares = 0
        for count in tally.values():
            squared_shares += (count / cluster_size) ** 2
        impurity_total += 1 - squared_shares
    gini_index = impurity_total / cluster_count

    # Final result
    print('Purity -', round(purity, 4), 'Gini Index -', round(gini_index, 4), '\n')

In [4]:
# Fetch data: build the path to spambase.data via abspath and show it.
spambase_path = abspath('datasets', 'spambase.data')
print(spambase_path)

# Hand np.loadtxt the path itself: it then opens and closes the file on its
# own, so no file handle is left open in the kernel.
spambase_dataset = np.loadtxt(spambase_path, delimiter=',')

# Data and labels: every column except the last goes to the data matrix,
# the last column is used as the label vector.
spambase_data = spambase_dataset[:, :-1]
spambase_labels = spambase_dataset[:, -1]

# Sanity-check the shapes of the full table, the data and the labels.
print(spambase_dataset.shape)
print(spambase_data.shape)
print(spambase_labels.shape)

C:\Users\Ashton\Documents\GitHub\Machine-Learning-Experiments\Soft K-Means\..\datasets\spambase.data
(4601, 58)
(4601, 57)
(4601,)


In [5]:
# Two-component diagonal-covariance Gaussian mixture. random_state is pinned
# so the k-means initialisation, and with it the fitted model and the metrics
# printed below, are the same on every Restart & Run All.
model = GaussianMixture(
    n_components=2,
    covariance_type='diag',
    init_params='kmeans',
    max_iter=200,
    random_state=42,
)
model.fit(spambase_data)

# Hard cluster assignment for each row, scored against the true labels.
pred_labels = model.predict(spambase_data)
evaluation_metrics(pred_labels, spambase_labels)

Purity - 0.7485 Gini Index - 0.3076 

