In [None]:
import matplotlib.pyplot as plt
import matplotlib.pylab as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from pprint import pprint
from collections import Counter
from random import randrange
import numpy as np
import pandas as pd
import math

In [None]:
def evaluation_metrics(pred_labels, true_labels=None):
    """Print a summary of a clustering and, if ground truth is given, its quality.

    Parameters
    ----------
    pred_labels : sequence
        Predicted cluster label of every point. The label ``'Noise'`` marks
        points that belong to no cluster.
    true_labels : sequence, optional
        Ground-truth class of every point, indexed like ``pred_labels``.
        When omitted, only the cluster summary is printed.

    Notes
    -----
    * Purity is the sum of the majority-class counts of the non-noise
      clusters divided by the total number of points. Noise points stay in
      the denominator, so they lower purity.
    * The Gini index is the unweighted mean, over non-noise clusters, of
      ``1 - sum(p_class ** 2)``.
    * Both metrics are reported as 0.0 when there is nothing to average over
      (no points / no non-noise clusters) instead of dividing by zero.
    * The function only prints; it returns ``None``.
    """
    label_counts = Counter(pred_labels)

    if true_labels is not None:
        n_points = len(pred_labels)

        # Collect the true classes that landed in each predicted cluster.
        members = {}
        for i, pred in enumerate(pred_labels):
            members.setdefault(pred, []).append(true_labels[i])

        # Noise is not a cluster, so it takes no part in purity / Gini.
        members.pop('Noise', None)
        class_counts = {cluster: Counter(classes) for cluster, classes in members.items()}
        n_clusters = len(class_counts)

        # Purity: how many points agree with the majority class of their cluster.
        majority_total = sum(max(counts.values()) for counts in class_counts.values())
        purity = majority_total / n_points if n_points else 0.0

        # Gini index: average impurity of the clusters.
        gini_total = 0.0
        for counts in class_counts.values():
            cluster_size = sum(counts.values())  # hoisted out of the per-class loop
            gini_total += 1 - sum((v / cluster_size) ** 2 for v in counts.values())
        gini_index = gini_total / n_clusters if n_clusters else 0.0

        # Final result
        print('Purity -', round(purity, 4), 'Gini Index -', round(gini_index, 4))

    # Report real clusters only, consistent with the metrics above; the noise
    # count is still visible in the Counter printed on the next line.
    n_found = sum(1 for label in label_counts if label != 'Noise')
    print('No. of clusters -', n_found)
    print(label_counts, '\n')

In [None]:
def find_neighbors(k, e, distance_matrix):
    """Return the indices of every point lying within distance ``e`` of point ``k``.

    Parameters
    ----------
    k : int
        Index of the current data point (row of ``distance_matrix``).
    e : number
        Epsilon; the radius is inclusive (``distance <= e``).
    distance_matrix : indexable
        ``distance_matrix[k][i]`` is the distance between points ``k`` and ``i``.

    Returns
    -------
    list of int
        Neighbor indices in ascending order. The point itself is never
        included, whatever its self-distance is.
    """
    row = distance_matrix[k]
    return [idx for idx, dist in enumerate(row) if dist <= e and idx != k]

In [None]:
def dbscan(data, e, min_pts, labels=None):
    """Cluster the rows of ``data`` with DBSCAN and print evaluation metrics.

    Parameters
    ----------
    data : array-like with a ``shape`` attribute
        One row per point; passed to ``euclidean_distances`` to build the
        full pairwise distance matrix.
    e : number
        Epsilon, the neighborhood radius (inclusive).
    min_pts : int
        Minimum number of neighbors (as returned by ``find_neighbors``, i.e.
        not counting the point itself) for a point to be a core point.
    labels : sequence, optional
        Ground-truth class of every point, forwarded to ``evaluation_metrics``.

    Returns
    -------
    list
        One entry per point: a cluster number starting at 1, or the string
        ``'Noise'``.
    """
    distance_matrix = euclidean_distances(data)
    n_points = data.shape[0]

    # None marks a point that has not been visited yet.
    clusters = [None] * n_points

    c = 0   # Cluster label
    for i in range(n_points):

        # Skip if already assigned a cluster (or already marked as noise)
        if clusters[i] is not None:
            continue

        seeds = find_neighbors(i, e, distance_matrix)

        # Density check - label Noise if no. of neighbors less than min_pts
        if len(seeds) < min_pts:
            clusters[i] = 'Noise'
            continue

        # Start the next cluster with this core point
        c += 1
        clusters[i] = c

        # Every index that is (or was) in the seed list. Queuing each point at
        # most once keeps the list bounded by the number of points instead of
        # re-adding every neighbor of every core point.
        queued = set(seeds)
        queued.add(i)

        # The seed list grows while it is being processed, so walk it with an
        # explicit cursor rather than mutating a list under a for-loop.
        cursor = 0
        while cursor < len(seeds):
            j = int(seeds[cursor])
            cursor += 1

            # A point previously marked as noise is density-reachable after
            # all: it becomes a border point and is not expanded further.
            if clusters[j] == 'Noise':
                clusters[j] = c
                continue

            # Skip if already assigned a cluster
            if clusters[j] is not None:
                continue

            # Add neighbor to the current cluster
            clusters[j] = c

            # Density check - a neighbor with at least min_pts neighbors is a
            # core point too, so its neighborhood joins the seed list.
            reachable = find_neighbors(j, e, distance_matrix)
            if len(reachable) >= min_pts:
                for k in reachable:
                    if k not in queued:
                        queued.add(k)
                        seeds.append(k)

    # Evaluate results
    print('epsilon -', e, 'min_pts -', min_pts)
    evaluation_metrics(clusters, labels)

    return clusters

In [None]:
# Download (or load from the local cache) the 20 Newsgroups corpus
ng_all = fetch_20newsgroups(subset='all')

# Raw documents, and the newsgroup name of each one
ng_data = ng_all.data
ng_labels = [ng_all.target_names[target_idx] for target_idx in ng_all.target[:len(ng_data)]]

# Sanity check: both lengths should match
print(len(ng_data))
print(len(ng_labels))

In [None]:
# Turn every document into a TF-IDF vector, ignoring English stop words
tfidf = TfidfVectorizer(stop_words='english')
vect_ng_all = tfidf.fit_transform(ng_all.data)

# (number of documents, vocabulary size)
print(vect_ng_all.shape)

In [None]:
# Sweep DBSCAN over min_pts (outer loop) and epsilon (inner loop); a separator
# line is printed after each min_pts setting.
# NOTE(review): TfidfVectorizer presumably L2-normalises its rows by default —
# confirm. If so, pairwise Euclidean distances cannot exceed sqrt(2), and e=2
# would put every point in every other point's neighborhood.
for p in (1, 3, 5):
    for e in (1, 2):
        dbscan(vect_ng_all, e, p, ng_labels)
    print('------------------------------')