In [1]:
import matplotlib.pyplot as plt
import matplotlib.pylab as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from pprint import pprint
from collections import Counter
from random import randrange
import numpy as np
import pandas as pd
import math
import os

In [2]:
# creates absolute path
def abspath(path, *paths):
    fpath = os.path.join(os.getcwd(), os.pardir, path)

    for p in paths:
        fpath = os.path.join(fpath, p)
    return fpath

In [10]:
def evaluation_metrics(pred_labels, true_labels=None):
    if true_labels is not None:
        N = len(pred_labels)

        cluster_labels = {}
        for i in range(len(pred_labels)):
            cluster_labels.setdefault(pred_labels[i], []).append(true_labels[i])

        cluster_labels.pop('Noise', None)
        K = len(cluster_labels)

        # Store list of labels as a Counter
        for key,value in cluster_labels.items():
            cluster_labels[key] = Counter(value)

        # Calculate purity
        purity = 0
        for cluster in cluster_labels:
            purity += max(cluster_labels[cluster].values())

        purity /= N

        # Calculate gini index
        gini_index = 0
        for key,value in cluster_labels.items():
            gini = 0
            for k,v in value.items():
                gini += (v / sum(cluster_labels[key].values())) ** 2
            gini_index += 1 - gini

        gini_index /= K if K != 0 else 1

        # Final result
        print('Purity -', round(purity, 4), 'Gini Index -', round(gini_index, 4))

    print('No. of clusters -', len(Counter(pred_labels)))
    print(Counter(pred_labels), '\n')

In [3]:
# k - index of current data point in data
# e - epsilon
def find_neighbors(k, e, distance_matrix):
    N = []      # Neighbors
    
    for i in range(len(distance_matrix[k])):
        if distance_matrix[k][i] <= e and i != k:   # Return neighbors within distance e, except for the point itself
            N.append(i)

    return N

In [4]:
# e - epsilon
# min_pts - min points
def dbscan(data, e, min_pts, labels=None):
    
    distance_matrix = euclidean_distances(data)

    clusters = []
    for i in range(data.shape[0]):
        clusters.append(math.nan)

    c = 0   # Cluster label
    for i in range(data.shape[0]):

        # Skip if already assigned a cluster
        if not pd.isnull(clusters[i]):
            continue

        S = find_neighbors(i, e, distance_matrix)

        # Density check - label Noise if no. of neighbors less than min_pts
        if len(S) < min_pts:
            clusters[i] = 'Noise'
            continue

        # Next cluster label
        c = c + 1

        # Add point to the new cluster
        clusters[i] = c

        # Process every point in neighborhood except the point itself
        for j in S:
            j = int(j)
            if j != i:

                # Change noise point to border point 
                if clusters[j] == 'Noise':
                    clusters[j] = c

                # Skip if already assigned a cluster
                if not pd.isnull([clusters[j]]):
                    continue

                # Add neighbor to the current cluster
                clusters[j] = c

                # Get neighbors
                N = find_neighbors(j, e, distance_matrix)

                # Density check - add new neighbors to seed set if no. of neighbors greater than min_pts
                if len(N) >= min_pts:
                    for k in N:
                        if int(k) != i:
                            S.append(k)

    # Evaluate results
    print('epsilon -', e, 'min_pts -', min_pts)
    evaluation_metrics(clusters, labels)

In [5]:
# N - size of sample
def get_samples(data, N, labels=None):
    sampled_data = np.zeros((N, data.shape[1]))
    
    if labels is None:
        for i in range(N):
            j = randrange(0, data.shape[0] - 1)
            sampled_data[i] = data[j]

        return sampled_data
        
    else:
        sampled_labels = []
        for i in range(N):
            j = randrange(0, data.shape[0] - 1)
            sampled_data[i] = data[j]
            sampled_labels.append(labels[j])

        return (sampled_data, sampled_labels)

In [6]:
# Fetch data
fashion_train_path = abspath('datasets', 'fashion-mnist_train.csv')
fashion_test_path = abspath('datasets', 'fashion-mnist_test.csv')

fashion_train_dataset = np.loadtxt(open(fashion_train_path, 'rb'), delimiter=',', skiprows=1)
fashion_test_dataset = np.loadtxt(open(fashion_test_path, 'rb'), delimiter=',', skiprows=1)

print(fashion_train_dataset.shape)
print(fashion_test_dataset.shape)

(60000, 785)
(60000, 784)
(60000,)
(60000, 784)
(10000, 785)
(10000, 784)
(10000,)
(10000, 784)


In [None]:
# Data and labels
fashion_train_data = fashion_train_dataset[:, list(range(1, fashion_train_dataset.shape[1]))]
fashion_train_labels = fashion_train_dataset[:, 0]
fashion_test_data = fashion_test_dataset[:, list(range(1, fashion_test_dataset.shape[1]))]
fashion_test_labels = fashion_test_dataset[:, 0]

print(fashion_train_data.shape)
print(fashion_train_labels.shape)
print(fashion_test_data.shape)
print(fashion_test_labels.shape)

In [7]:
print('Without Normalizing - Train Data')

sampled_data, sampled_labels = get_samples(fashion_train_data, 5000, fashion_mnist_train_labels)
for p in [1,3,5]:
    for e in [3.5,4.0]:
        dbscan(data=sampled_data, e=e, min_pts=p, labels=sampled_labels)
    print('------------------------------')

epsilon - 3.5 min_pts - 1
Purity - 0.2886 Gini Index - 0.053
No. of clusters - 288
Counter({'Noise': 3307, 3: 399, 1: 280, 2: 198, 33: 29, 41: 26, 73: 18, 28: 16, 45: 16, 13: 13, 133: 11, 9: 10, 83: 10, 4: 9, 51: 7, 49: 6, 55: 6, 95: 6, 104: 6, 176: 6, 187: 6, 226: 6, 227: 6, 25: 5, 84: 5, 97: 5, 119: 5, 157: 5, 171: 5, 11: 4, 16: 4, 27: 4, 35: 4, 46: 4, 125: 4, 145: 4, 165: 4, 183: 4, 195: 4, 197: 4, 12: 3, 14: 3, 19: 3, 23: 3, 26: 3, 32: 3, 37: 3, 54: 3, 59: 3, 60: 3, 69: 3, 72: 3, 75: 3, 85: 3, 91: 3, 93: 3, 103: 3, 108: 3, 110: 3, 116: 3, 118: 3, 124: 3, 128: 3, 142: 3, 191: 3, 210: 3, 216: 3, 218: 3, 240: 3, 247: 3, 254: 3, 258: 3, 259: 3, 5: 2, 6: 2, 7: 2, 8: 2, 10: 2, 15: 2, 17: 2, 18: 2, 20: 2, 21: 2, 22: 2, 24: 2, 29: 2, 30: 2, 31: 2, 34: 2, 36: 2, 38: 2, 39: 2, 40: 2, 42: 2, 43: 2, 44: 2, 47: 2, 48: 2, 50: 2, 52: 2, 53: 2, 56: 2, 57: 2, 58: 2, 61: 2, 62: 2, 63: 2, 64: 2, 65: 2, 66: 2, 67: 2, 68: 2, 70: 2, 71: 2, 74: 2, 76: 2, 77: 2, 78: 2, 79: 2, 80: 2, 81: 2, 82: 2, 86: 2, 8

epsilon - 3.9 min_pts - 1
Purity - 0.1926 Gini Index - 0.0536
No. of clusters - 231
Counter({'Noise': 2616, 1: 1800, 20: 24, 25: 19, 33: 11, 43: 10, 9: 7, 26: 7, 46: 7, 53: 7, 30: 6, 139: 5, 140: 5, 10: 4, 12: 4, 23: 4, 84: 4, 8: 3, 13: 3, 18: 3, 21: 3, 36: 3, 40: 3, 49: 3, 50: 3, 51: 3, 62: 3, 68: 3, 73: 3, 74: 3, 80: 3, 89: 3, 91: 3, 94: 3, 98: 3, 101: 3, 105: 3, 113: 3, 119: 3, 129: 3, 143: 3, 146: 3, 160: 3, 169: 3, 173: 3, 180: 3, 184: 3, 192: 3, 193: 3, 2: 2, 3: 2, 4: 2, 5: 2, 6: 2, 7: 2, 11: 2, 14: 2, 15: 2, 16: 2, 17: 2, 19: 2, 22: 2, 24: 2, 27: 2, 28: 2, 29: 2, 31: 2, 32: 2, 34: 2, 35: 2, 37: 2, 38: 2, 39: 2, 41: 2, 42: 2, 44: 2, 45: 2, 47: 2, 48: 2, 52: 2, 54: 2, 55: 2, 56: 2, 57: 2, 58: 2, 59: 2, 60: 2, 61: 2, 63: 2, 64: 2, 65: 2, 66: 2, 67: 2, 69: 2, 70: 2, 71: 2, 72: 2, 75: 2, 76: 2, 77: 2, 78: 2, 79: 2, 81: 2, 82: 2, 83: 2, 85: 2, 86: 2, 87: 2, 88: 2, 90: 2, 92: 2, 93: 2, 95: 2, 96: 2, 97: 2, 99: 2, 100: 2, 102: 2, 103: 2, 104: 2, 106: 2, 107: 2, 108: 2, 109: 2, 110: 2, 1

In [11]:
print('Without Normalizing - Test Data')

for p in [1,3,5]:
    for e in [3.0,3.5]:
        dbscan(normalized_fashion_mnist_test_data, e, p, fashion_mnist_test_labels)
    print('------------------------------')

epsilon - 3.1 min_pts - 1
Purity - 0.1959 Gini Index - 0.0557
No. of clusters - 261
Counter({'Noise': 7792, 1: 690, 2: 412, 11: 241, 7: 40, 53: 35, 27: 25, 22: 22, 64: 21, 68: 15, 131: 14, 33: 13, 4: 10, 10: 9, 66: 9, 5: 8, 18: 8, 98: 8, 109: 8, 141: 8, 8: 7, 91: 7, 171: 7, 198: 7, 34: 6, 61: 6, 129: 6, 185: 6, 41: 5, 51: 5, 106: 5, 147: 5, 216: 5, 14: 4, 15: 4, 19: 4, 24: 4, 26: 4, 30: 4, 39: 4, 40: 4, 54: 4, 58: 4, 65: 4, 71: 4, 75: 4, 85: 4, 115: 4, 118: 4, 126: 4, 130: 4, 189: 4, 192: 4, 205: 4, 3: 3, 13: 3, 23: 3, 28: 3, 31: 3, 32: 3, 36: 3, 37: 3, 52: 3, 55: 3, 57: 3, 76: 3, 78: 3, 84: 3, 92: 3, 96: 3, 100: 3, 104: 3, 105: 3, 107: 3, 116: 3, 128: 3, 133: 3, 137: 3, 138: 3, 139: 3, 145: 3, 152: 3, 154: 3, 169: 3, 172: 3, 173: 3, 187: 3, 195: 3, 214: 3, 229: 3, 252: 3, 6: 2, 9: 2, 12: 2, 16: 2, 17: 2, 20: 2, 21: 2, 25: 2, 29: 2, 35: 2, 38: 2, 42: 2, 43: 2, 44: 2, 45: 2, 46: 2, 47: 2, 48: 2, 49: 2, 50: 2, 56: 2, 59: 2, 60: 2, 62: 2, 63: 2, 67: 2, 69: 2, 70: 2, 72: 2, 73: 2, 74: 2, 7

epsilon - 3.5 min_pts - 1
Purity - 0.1679 Gini Index - 0.0824
No. of clusters - 251
Counter({'Noise': 6434, 1: 2588, 6: 126, 12: 64, 11: 40, 25: 29, 9: 19, 50: 18, 93: 18, 13: 11, 45: 11, 26: 10, 4: 9, 15: 9, 89: 8, 33: 7, 17: 6, 73: 6, 78: 6, 107: 6, 109: 6, 5: 5, 23: 5, 30: 5, 31: 5, 48: 5, 74: 5, 111: 5, 130: 5, 147: 5, 167: 5, 205: 5, 10: 4, 39: 4, 41: 4, 43: 4, 53: 4, 56: 4, 57: 4, 61: 4, 63: 4, 106: 4, 119: 4, 120: 4, 126: 4, 132: 4, 138: 4, 162: 4, 168: 4, 181: 4, 220: 4, 7: 3, 8: 3, 16: 3, 21: 3, 24: 3, 38: 3, 40: 3, 67: 3, 72: 3, 75: 3, 76: 3, 79: 3, 80: 3, 83: 3, 84: 3, 90: 3, 95: 3, 96: 3, 99: 3, 100: 3, 102: 3, 103: 3, 123: 3, 141: 3, 144: 3, 151: 3, 153: 3, 155: 3, 160: 3, 166: 3, 185: 3, 198: 3, 199: 3, 217: 3, 219: 3, 221: 3, 230: 3, 232: 3, 2: 2, 3: 2, 14: 2, 18: 2, 19: 2, 20: 2, 22: 2, 27: 2, 28: 2, 29: 2, 32: 2, 34: 2, 35: 2, 36: 2, 37: 2, 42: 2, 44: 2, 46: 2, 47: 2, 49: 2, 51: 2, 52: 2, 54: 2, 55: 2, 58: 2, 59: 2, 60: 2, 62: 2, 64: 2, 65: 2, 66: 2, 68: 2, 69: 2, 70: 

In [None]:
# Normalize data
norm_fashion_train_data = np.divide(fashion_train_data, 255)
norm_fashion_test_data = np.divide(fashion_test_data, 255)

In [7]:
print('With Normalizing - Train Data')

sampled_data, sampled_labels = get_samples(norm_fashion_train_data, 5000, fashion_mnist_train_labels)
for p in [1,3,5]:
    for e in [3.5,4.0]:
        dbscan(sampled_data, e, p, sampled_labels)
    print('------------------------------')

epsilon - 3.5 min_pts - 1
Purity - 0.2886 Gini Index - 0.053
No. of clusters - 288
Counter({'Noise': 3307, 3: 399, 1: 280, 2: 198, 33: 29, 41: 26, 73: 18, 28: 16, 45: 16, 13: 13, 133: 11, 9: 10, 83: 10, 4: 9, 51: 7, 49: 6, 55: 6, 95: 6, 104: 6, 176: 6, 187: 6, 226: 6, 227: 6, 25: 5, 84: 5, 97: 5, 119: 5, 157: 5, 171: 5, 11: 4, 16: 4, 27: 4, 35: 4, 46: 4, 125: 4, 145: 4, 165: 4, 183: 4, 195: 4, 197: 4, 12: 3, 14: 3, 19: 3, 23: 3, 26: 3, 32: 3, 37: 3, 54: 3, 59: 3, 60: 3, 69: 3, 72: 3, 75: 3, 85: 3, 91: 3, 93: 3, 103: 3, 108: 3, 110: 3, 116: 3, 118: 3, 124: 3, 128: 3, 142: 3, 191: 3, 210: 3, 216: 3, 218: 3, 240: 3, 247: 3, 254: 3, 258: 3, 259: 3, 5: 2, 6: 2, 7: 2, 8: 2, 10: 2, 15: 2, 17: 2, 18: 2, 20: 2, 21: 2, 22: 2, 24: 2, 29: 2, 30: 2, 31: 2, 34: 2, 36: 2, 38: 2, 39: 2, 40: 2, 42: 2, 43: 2, 44: 2, 47: 2, 48: 2, 50: 2, 52: 2, 53: 2, 56: 2, 57: 2, 58: 2, 61: 2, 62: 2, 63: 2, 64: 2, 65: 2, 66: 2, 67: 2, 68: 2, 70: 2, 71: 2, 74: 2, 76: 2, 77: 2, 78: 2, 79: 2, 80: 2, 81: 2, 82: 2, 86: 2, 8

epsilon - 3.9 min_pts - 1
Purity - 0.1926 Gini Index - 0.0536
No. of clusters - 231
Counter({'Noise': 2616, 1: 1800, 20: 24, 25: 19, 33: 11, 43: 10, 9: 7, 26: 7, 46: 7, 53: 7, 30: 6, 139: 5, 140: 5, 10: 4, 12: 4, 23: 4, 84: 4, 8: 3, 13: 3, 18: 3, 21: 3, 36: 3, 40: 3, 49: 3, 50: 3, 51: 3, 62: 3, 68: 3, 73: 3, 74: 3, 80: 3, 89: 3, 91: 3, 94: 3, 98: 3, 101: 3, 105: 3, 113: 3, 119: 3, 129: 3, 143: 3, 146: 3, 160: 3, 169: 3, 173: 3, 180: 3, 184: 3, 192: 3, 193: 3, 2: 2, 3: 2, 4: 2, 5: 2, 6: 2, 7: 2, 11: 2, 14: 2, 15: 2, 16: 2, 17: 2, 19: 2, 22: 2, 24: 2, 27: 2, 28: 2, 29: 2, 31: 2, 32: 2, 34: 2, 35: 2, 37: 2, 38: 2, 39: 2, 41: 2, 42: 2, 44: 2, 45: 2, 47: 2, 48: 2, 52: 2, 54: 2, 55: 2, 56: 2, 57: 2, 58: 2, 59: 2, 60: 2, 61: 2, 63: 2, 64: 2, 65: 2, 66: 2, 67: 2, 69: 2, 70: 2, 71: 2, 72: 2, 75: 2, 76: 2, 77: 2, 78: 2, 79: 2, 81: 2, 82: 2, 83: 2, 85: 2, 86: 2, 87: 2, 88: 2, 90: 2, 92: 2, 93: 2, 95: 2, 96: 2, 97: 2, 99: 2, 100: 2, 102: 2, 103: 2, 104: 2, 106: 2, 107: 2, 108: 2, 109: 2, 110: 2, 1

In [11]:
print('With Normalizing - Test Data')

for p in [1,3,5]:
    for e in [3.0,3.5]:
        dbscan(norm_fashion_test_data, e, p, fashion_mnist_test_labels)
    print('------------------------------')

epsilon - 3.1 min_pts - 1
Purity - 0.1959 Gini Index - 0.0557
No. of clusters - 261
Counter({'Noise': 7792, 1: 690, 2: 412, 11: 241, 7: 40, 53: 35, 27: 25, 22: 22, 64: 21, 68: 15, 131: 14, 33: 13, 4: 10, 10: 9, 66: 9, 5: 8, 18: 8, 98: 8, 109: 8, 141: 8, 8: 7, 91: 7, 171: 7, 198: 7, 34: 6, 61: 6, 129: 6, 185: 6, 41: 5, 51: 5, 106: 5, 147: 5, 216: 5, 14: 4, 15: 4, 19: 4, 24: 4, 26: 4, 30: 4, 39: 4, 40: 4, 54: 4, 58: 4, 65: 4, 71: 4, 75: 4, 85: 4, 115: 4, 118: 4, 126: 4, 130: 4, 189: 4, 192: 4, 205: 4, 3: 3, 13: 3, 23: 3, 28: 3, 31: 3, 32: 3, 36: 3, 37: 3, 52: 3, 55: 3, 57: 3, 76: 3, 78: 3, 84: 3, 92: 3, 96: 3, 100: 3, 104: 3, 105: 3, 107: 3, 116: 3, 128: 3, 133: 3, 137: 3, 138: 3, 139: 3, 145: 3, 152: 3, 154: 3, 169: 3, 172: 3, 173: 3, 187: 3, 195: 3, 214: 3, 229: 3, 252: 3, 6: 2, 9: 2, 12: 2, 16: 2, 17: 2, 20: 2, 21: 2, 25: 2, 29: 2, 35: 2, 38: 2, 42: 2, 43: 2, 44: 2, 45: 2, 46: 2, 47: 2, 48: 2, 49: 2, 50: 2, 56: 2, 59: 2, 60: 2, 62: 2, 63: 2, 67: 2, 69: 2, 70: 2, 72: 2, 73: 2, 74: 2, 7

epsilon - 3.5 min_pts - 1
Purity - 0.1679 Gini Index - 0.0824
No. of clusters - 251
Counter({'Noise': 6434, 1: 2588, 6: 126, 12: 64, 11: 40, 25: 29, 9: 19, 50: 18, 93: 18, 13: 11, 45: 11, 26: 10, 4: 9, 15: 9, 89: 8, 33: 7, 17: 6, 73: 6, 78: 6, 107: 6, 109: 6, 5: 5, 23: 5, 30: 5, 31: 5, 48: 5, 74: 5, 111: 5, 130: 5, 147: 5, 167: 5, 205: 5, 10: 4, 39: 4, 41: 4, 43: 4, 53: 4, 56: 4, 57: 4, 61: 4, 63: 4, 106: 4, 119: 4, 120: 4, 126: 4, 132: 4, 138: 4, 162: 4, 168: 4, 181: 4, 220: 4, 7: 3, 8: 3, 16: 3, 21: 3, 24: 3, 38: 3, 40: 3, 67: 3, 72: 3, 75: 3, 76: 3, 79: 3, 80: 3, 83: 3, 84: 3, 90: 3, 95: 3, 96: 3, 99: 3, 100: 3, 102: 3, 103: 3, 123: 3, 141: 3, 144: 3, 151: 3, 153: 3, 155: 3, 160: 3, 166: 3, 185: 3, 198: 3, 199: 3, 217: 3, 219: 3, 221: 3, 230: 3, 232: 3, 2: 2, 3: 2, 14: 2, 18: 2, 19: 2, 20: 2, 22: 2, 27: 2, 28: 2, 29: 2, 32: 2, 34: 2, 35: 2, 36: 2, 37: 2, 42: 2, 44: 2, 46: 2, 47: 2, 49: 2, 51: 2, 52: 2, 54: 2, 55: 2, 58: 2, 59: 2, 60: 2, 62: 2, 64: 2, 65: 2, 66: 2, 68: 2, 69: 2, 70: 