In [3]:
import matplotlib.pyplot as plt
import matplotlib.pylab as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from pprint import pprint
from collections import Counter
from random import randrange
import numpy as np
import pandas as pd
import math
import os

In [4]:
# creates absolute path
def abspath(path, *paths):
    fpath = os.path.join(os.getcwd(), os.pardir, path)

    for p in paths:
        fpath = os.path.join(fpath, p)
    return fpath

In [5]:
def evaluation_metrics(pred_labels, true_labels=None):
    if true_labels is not None:
        N = len(pred_labels)

        cluster_labels = {}
        for i in range(len(pred_labels)):
            cluster_labels.setdefault(pred_labels[i], []).append(true_labels[i])

        cluster_labels.pop('Noise', None)
        K = len(cluster_labels)

        # Store list of labels as a Counter
        for key,value in cluster_labels.items():
            cluster_labels[key] = Counter(value)

        # Calculate purity
        purity = 0
        for cluster in cluster_labels:
            purity += max(cluster_labels[cluster].values())

        purity /= N

        # Calculate gini index
        gini_index = 0
        for key,value in cluster_labels.items():
            gini = 0
            for k,v in value.items():
                gini += (v / sum(cluster_labels[key].values())) ** 2
            gini_index += 1 - gini

        gini_index /= K if K != 0 else 1

        # Final result
        print('Purity -', round(purity, 4), 'Gini Index -', round(gini_index, 4))

    print('No. of clusters -', len(Counter(pred_labels)))
    print(Counter(pred_labels), '\n')

In [6]:
# k - index of current data point in data
# e - epsilon
def find_neighbors(k, e, distance_matrix):
    N = []      # Neighbors
    
    for i in range(len(distance_matrix[k])):
        if distance_matrix[k][i] <= e and i != k:   # Return neighbors within distance e, except for the point itself
            N.append(i)

    return N

In [7]:
# e - epsilon
# min_pts - min points
def dbscan(data, e, min_pts, labels=None):
    
    distance_matrix = euclidean_distances(data)

    clusters = []
    for i in range(data.shape[0]):
        clusters.append(math.nan)

    c = 0   # Cluster label
    for i in range(data.shape[0]):

        # Skip if already assigned a cluster
        if not pd.isnull(clusters[i]):
            continue

        S = find_neighbors(i, e, distance_matrix)

        # Density check - label Noise if no. of neighbors less than min_pts
        if len(S) < min_pts:
            clusters[i] = 'Noise'
            continue

        # Next cluster label
        c = c + 1

        # Add point to the new cluster
        clusters[i] = c

        # Process every point in neighborhood except the point itself
        for j in S:
            j = int(j)
            if j != i:

                # Change noise point to border point 
                if clusters[j] == 'Noise':
                    clusters[j] = c

                # Skip if already assigned a cluster
                if not pd.isnull([clusters[j]]):
                    continue

                # Add neighbor to the current cluster
                clusters[j] = c

                # Get neighbors
                N = find_neighbors(j, e, distance_matrix)

                # Density check - add new neighbors to seed set if no. of neighbors greater than min_pts
                if len(N) >= min_pts:
                    for k in N:
                        if int(k) != i:
                            S.append(k)

    # Evaluate results
    print('epsilon -', e, 'min_pts -', min_pts)
    evaluation_metrics(clusters, labels)

In [11]:
# N - size of sample
def get_samples(data, N, labels=None):
    sampled_data = np.zeros((N, data.shape[1]))
    
    if labels is None:
        for i in range(N):
            j = randrange(0, data.shape[0] - 1)
            sampled_data[i] = data[j]

        return sampled_data
        
    else:
        sampled_labels = []
        for i in range(N):
            j = randrange(0, data.shape[0] - 1)
            sampled_data[i] = data[j]
            sampled_labels.append(labels[j])

        return (sampled_data, sampled_labels)

In [9]:
# Fetch data

dataset_path = abspath('datasets', 'household_power_consumption.txt')
household_dataset = pd.read_csv(dataset_path, sep=';', header=0)
household_dataset = household_dataset.values[:, range(2, household_dataset.shape[1])]

rows = 0
for i in range(household_dataset.shape[0]):
    flag = 0
    for j in range(household_dataset.shape[1]):
        if household_dataset[i][j] == '.' or household_dataset[i][j] == '?' or math.isnan(float(household_dataset[i][j])):
            flag = 1
            break
    
    if flag == 0:
        rows += 1

# Clean data
new_dataset = np.zeros((rows, household_dataset.shape[1]))
k = 0
for i in range(household_dataset.shape[0]):
    flag = 0
    for j in range(household_dataset.shape[1]):
        if household_dataset[i][j] == '.' or household_dataset[i][j] == '?' or math.isnan(float(household_dataset[i][j])):
            flag = 1
            break
    
    if flag == 0:
        for l in range(household_dataset.shape[1]):
            new_dataset[k][l] = float(household_dataset[i][l])
        k += 1
                    
print(new_dataset.shape)

  interactivity=interactivity, compiler=compiler, result=result)


(2049280, 7)


In [12]:
sampled_data = get_samples(new_dataset, 5000)
for p in [1,3,5]:
    for e in [1,2,3,4,5,6]:
        dbscan(data=sampled_data, e=e, min_pts=p)
    print('------------------------------')

epsilon - 1 min_pts - 1
No. of clusters - 159
Counter({1: 2139, 'Noise': 694, 2: 500, 5: 296, 8: 196, 4: 179, 11: 129, 16: 110, 6: 92, 17: 82, 33: 55, 15: 32, 10: 21, 19: 20, 38: 18, 32: 16, 13: 12, 27: 10, 54: 8, 80: 7, 3: 6, 22: 6, 46: 6, 59: 6, 63: 6, 75: 6, 12: 5, 18: 5, 21: 5, 39: 5, 81: 5, 82: 5, 87: 5, 91: 5, 136: 5, 30: 4, 36: 4, 37: 4, 44: 4, 49: 4, 51: 4, 52: 4, 65: 4, 73: 4, 96: 4, 97: 4, 102: 4, 107: 4, 118: 4, 150: 4, 154: 4, 14: 3, 25: 3, 40: 3, 42: 3, 43: 3, 47: 3, 53: 3, 57: 3, 60: 3, 64: 3, 67: 3, 68: 3, 69: 3, 76: 3, 84: 3, 90: 3, 92: 3, 95: 3, 111: 3, 115: 3, 121: 3, 126: 3, 138: 3, 7: 2, 9: 2, 20: 2, 23: 2, 24: 2, 26: 2, 28: 2, 29: 2, 31: 2, 34: 2, 35: 2, 41: 2, 45: 2, 48: 2, 50: 2, 55: 2, 56: 2, 58: 2, 61: 2, 62: 2, 66: 2, 70: 2, 71: 2, 72: 2, 74: 2, 77: 2, 78: 2, 79: 2, 83: 2, 85: 2, 86: 2, 88: 2, 89: 2, 93: 2, 94: 2, 98: 2, 99: 2, 100: 2, 101: 2, 103: 2, 104: 2, 105: 2, 106: 2, 108: 2, 109: 2, 110: 2, 112: 2, 113: 2, 114: 2, 116: 2, 117: 2, 119: 2, 120: 2, 122: 2

KeyboardInterrupt: 