In [1]:
import matplotlib.pyplot as plt
import matplotlib.pylab as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from pprint import pprint
from collections import Counter
from random import randrange
import numpy as np
import pandas as pd
import math
import os

In [2]:
# creates absolute path
def abspath(path, *paths):
    fpath = os.path.join(os.getcwd(), os.pardir, path)

    for p in paths:
        fpath = os.path.join(fpath, p)
    return fpath

In [3]:
def evaluation_metrics(pred_labels, true_labels=None):
    if true_labels is not None:
        N = len(pred_labels)

        cluster_labels = {}
        for i in range(len(pred_labels)):
            cluster_labels.setdefault(pred_labels[i], []).append(true_labels[i])

        cluster_labels.pop('Noise', None)
        K = len(cluster_labels)

        # Store list of labels as a Counter
        for key,value in cluster_labels.items():
            cluster_labels[key] = Counter(value)

        # Calculate purity
        purity = 0
        for cluster in cluster_labels:
            purity += max(cluster_labels[cluster].values())

        purity /= N

        # Calculate gini index
        gini_index = 0
        for key,value in cluster_labels.items():
            gini = 0
            for k,v in value.items():
                gini += (v / sum(cluster_labels[key].values())) ** 2
            gini_index += 1 - gini

        gini_index /= K if K != 0 else 1

        # Final result
        print('Purity -', round(purity, 4), 'Gini Index -', round(gini_index, 4))

    print('No. of clusters -', len(Counter(pred_labels)))
    print(Counter(pred_labels), '\n')

In [4]:
# k - index of current data point in data
# e - epsilon
def find_neighbors(k, e, distance_matrix):
    N = []      # Neighbors
    
    for i in range(len(distance_matrix[k])):
        if distance_matrix[k][i] <= e and i != k:   # Return neighbors within distance e, except for the point itself
            N.append(i)

    return N

In [5]:
# e - epsilon
# min_pts - min points
def dbscan(data, e, min_pts, labels=None):
    
    distance_matrix = euclidean_distances(data)

    clusters = []
    for i in range(data.shape[0]):
        clusters.append(math.nan)

    c = 0   # Cluster label
    for i in range(data.shape[0]):

        # Skip if already assigned a cluster
        if not pd.isnull(clusters[i]):
            continue

        S = find_neighbors(i, e, distance_matrix)

        # Density check - label Noise if no. of neighbors less than min_pts
        if len(S) < min_pts:
            clusters[i] = 'Noise'
            continue

        # Next cluster label
        c = c + 1

        # Add point to the new cluster
        clusters[i] = c

        # Process every point in neighborhood except the point itself
        for j in S:
            j = int(j)
            if j != i:

                # Change noise point to border point 
                if clusters[j] == 'Noise':
                    clusters[j] = c

                # Skip if already assigned a cluster
                if not pd.isnull([clusters[j]]):
                    continue

                # Add neighbor to the current cluster
                clusters[j] = c

                # Get neighbors
                N = find_neighbors(j, e, distance_matrix)

                # Density check - add new neighbors to seed set if no. of neighbors greater than min_pts
                if len(N) >= min_pts:
                    for k in N:
                        if int(k) != i:
                            S.append(k)

    # Evaluate results
    print('epsilon -', e, 'min_pts -', min_pts)
    evaluation_metrics(clusters, labels)

In [6]:
# N - size of sample
def get_samples(data, N, labels=None):
    sampled_data = np.zeros((N, data.shape[1]))
    
    if labels is None:
        for i in range(N):
            j = randrange(0, data.shape[0] - 1)
            sampled_data[i] = data[j]

        return sampled_data
        
    else:
        sampled_labels = []
        for i in range(N):
            j = randrange(0, data.shape[0] - 1)
            sampled_data[i] = data[j]
            sampled_labels.append(labels[j])

        return (sampled_data, sampled_labels)

In [7]:
# Fetch data

dataset_path = abspath('datasets', 'household_power_consumption.txt')
household_dataset = pd.read_csv(dataset_path, sep=';', header=0)
household_dataset = household_dataset.values[:, range(2, household_dataset.shape[1])]

rows = 0
for i in range(household_dataset.shape[0]):
    flag = 0
    for j in range(household_dataset.shape[1]):
        if household_dataset[i][j] == '.' or household_dataset[i][j] == '?' or math.isnan(float(household_dataset[i][j])):
            flag = 1
            break
    
    if flag == 0:
        rows += 1

# Clean data
new_dataset = np.zeros((rows, household_dataset.shape[1]))
k = 0
for i in range(household_dataset.shape[0]):
    flag = 0
    for j in range(household_dataset.shape[1]):
        if household_dataset[i][j] == '.' or household_dataset[i][j] == '?' or math.isnan(float(household_dataset[i][j])):
            flag = 1
            break
    
    if flag == 0:
        for l in range(household_dataset.shape[1]):
            new_dataset[k][l] = float(household_dataset[i][l])
        k += 1
                    
print(new_dataset.shape)

  interactivity=interactivity, compiler=compiler, result=result)


(2049280, 7)


In [8]:
sampled_data = get_samples(new_dataset, 5000)
for p in [1,3,5]:
    for e in [2,4]:
        dbscan(data=sampled_data, e=e, min_pts=p)
    print('------------------------------')

epsilon - 2 min_pts - 1
No. of clusters - 36
Counter({1: 3186, 2: 1406, 'Noise': 217, 6: 54, 8: 32, 7: 10, 17: 9, 3: 8, 12: 6, 29: 6, 11: 5, 15: 5, 16: 4, 5: 3, 13: 3, 19: 3, 21: 3, 24: 3, 25: 3, 4: 2, 9: 2, 10: 2, 14: 2, 18: 2, 20: 2, 22: 2, 23: 2, 26: 2, 27: 2, 28: 2, 30: 2, 31: 2, 32: 2, 33: 2, 34: 2, 35: 2}) 

epsilon - 4 min_pts - 1
No. of clusters - 21
Counter({1: 4705, 6: 65, 2: 54, 'Noise': 51, 3: 37, 10: 19, 5: 17, 4: 14, 17: 6, 7: 5, 15: 4, 11: 3, 13: 3, 18: 3, 8: 2, 9: 2, 12: 2, 14: 2, 16: 2, 19: 2, 20: 2}) 

------------------------------
epsilon - 2 min_pts - 3
No. of clusters - 16
Counter({1: 3085, 2: 1405, 'Noise': 278, 3: 100, 4: 49, 5: 29, 9: 10, 12: 9, 7: 6, 14: 6, 6: 4, 8: 4, 13: 4, 11: 4, 15: 4, 10: 3}) 

epsilon - 4 min_pts - 3
No. of clusters - 11
Counter({1: 4704, 'Noise': 80, 4: 65, 3: 54, 7: 37, 8: 18, 6: 15, 2: 12, 9: 6, 5: 5, 10: 4}) 

------------------------------
epsilon - 2 min_pts - 5
No. of clusters - 9
Counter({1: 3077, 2: 1402, 'Noise': 333, 3: 97, 5: