In [1]:
from sklearn.mixture import GaussianMixture
from collections import Counter
import numpy as np
import os

In [2]:
# creates absolute path
def abspath(path, *paths):
    fpath = os.path.join(os.getcwd(), os.pardir, path)

    for p in paths:
        fpath = os.path.join(fpath, p)
    return fpath

In [3]:
def evaluation_metrics(pred_labels, true_labels):
    N = len(pred_labels)
    
    cluster_labels = {}
    for i in range(len(pred_labels)):
        cluster_labels.setdefault(pred_labels[i], []).append(true_labels[i])

    K = len(cluster_labels)

    # Store list of labels as a Counter
    for key,value in cluster_labels.items():
        cluster_labels[key] = Counter(value)

    # Calculate purity
    purity = 0
    for cluster in cluster_labels:
        purity += max(cluster_labels[cluster].values())
    purity /= N

    # Calculate gini index
    gini_index = 0
    for key,value in cluster_labels.items():
        gini = 0
        for k,v in value.items():
            gini += (v / sum(cluster_labels[key].values())) ** 2
        gini_index += 1 - gini
    gini_index /= K

    # Final result
    print('Purity -', round(purity, 4), 'Gini Index -', round(gini_index, 4), '\n')

In [4]:
# Fetch data
fashion_train_path = abspath('datasets', 'fashion-mnist_train.csv')
fashion_test_path = abspath('datasets', 'fashion-mnist_test.csv')

fashion_train_dataset = np.loadtxt(open(fashion_train_path, 'rb'), delimiter=',', skiprows=1)
fashion_test_dataset = np.loadtxt(open(fashion_test_path, 'rb'), delimiter=',', skiprows=1)

print(fashion_train_dataset.shape)
print(fashion_test_dataset.shape)

(60000, 785)
(10000, 785)


In [5]:
# Data and labels
fashion_train_data = fashion_train_dataset[:, list(range(1, fashion_train_dataset.shape[1]))]
fashion_train_labels = fashion_train_dataset[:, 0]
fashion_test_data = fashion_test_dataset[:, list(range(1, fashion_test_dataset.shape[1]))]
fashion_test_labels = fashion_test_dataset[:, 0]

print(fashion_train_data.shape)
print(fashion_train_labels.shape)
print(fashion_test_data.shape)
print(fashion_test_labels.shape)

(60000, 784)
(60000,)
(10000, 784)
(10000,)


In [6]:
print('Without Normalizing - Train Data')
model = GaussianMixture(n_components=10, covariance_type='diag', init_params='kmeans', max_iter=200)
model.fit(fashion_train_data)

pred_train_labels = model.predict(fashion_train_data)
evaluation_metrics(pred_train_labels, fashion_train_labels)

Without Normalizing - Train Data
Purity - 0.4513 Gini Index - 0.5619 



In [7]:
print('Without Normalizing - Test Data')
model = GaussianMixture(n_components=10, covariance_type='diag', init_params='kmeans', max_iter=200)
model.fit(fashion_test_data)

pred_train_labels = model.predict(fashion_test_data)
evaluation_metrics(pred_train_labels, fashion_test_labels)

Without Normalizing - Test Data
Purity - 0.4691 Gini Index - 0.5798 



In [8]:
# Normalize data
norm_fashion_train_data = np.divide(fashion_train_data, 255)
norm_fashion_test_data = np.divide(fashion_test_data, 255)

In [9]:
print('With Normalizing - Train Data')
model = GaussianMixture(n_components=10, covariance_type='diag', init_params='kmeans', max_iter=200)
model.fit(norm_fashion_train_data)

pred_train_labels = model.predict(norm_fashion_train_data)
evaluation_metrics(pred_train_labels, fashion_train_labels)

With Normalizing - Train Data
Purity - 0.5208 Gini Index - 0.5509 



In [10]:
print('With Normalizing - Test Data')
model = GaussianMixture(n_components=10, covariance_type='diag', init_params='kmeans', max_iter=200)
model.fit(norm_fashion_test_data)

pred_test_labels = model.predict(norm_fashion_test_data)
evaluation_metrics(pred_test_labels, fashion_test_labels)

With Normalizing - Test Data
Purity - 0.4794 Gini Index - 0.5924 

