In [1]:
import matplotlib.pyplot as plt
import matplotlib.pylab as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from pprint import pprint
from collections import Counter
from random import randrange
import numpy as np
import pandas as pd
import math

In [2]:
def evaluation_metrics(pred_labels, true_labels=None):
    """Print quality statistics for a clustering result.

    Parameters
    ----------
    pred_labels : sequence
        Predicted cluster label of every point. The string 'Noise' marks
        points that were not assigned to any cluster.
    true_labels : sequence, optional
        Ground-truth class of every point, indexed like ``pred_labels``.
        When omitted, only the label counts are printed.

    Notes
    -----
    * Purity is the sum of each cluster's majority-class count divided by
      the total number of points (noise points included in the divisor).
    * The Gini index is the unweighted mean over non-noise clusters of
      ``1 - sum(p_class ** 2)``.
    * The printed "No. of clusters" counts distinct predicted labels, so
      'Noise' is included when present.
    * Nothing is returned; all results go to stdout.
    """
    if true_labels is not None:
        total_points = len(pred_labels)

        # Collect the ground-truth classes that fall into each predicted cluster
        members = {}
        for idx, pred in enumerate(pred_labels):
            members.setdefault(pred, []).append(true_labels[idx])

        # Noise is not a cluster: leave it out of purity and Gini
        members.pop('Noise', None)

        # Class histogram per cluster
        histograms = {label: Counter(classes) for label, classes in members.items()}
        n_clusters = len(histograms)

        # Purity: majority-class hits over all points.
        # Guard the empty input, which previously raised ZeroDivisionError.
        majority_hits = 0
        for histogram in histograms.values():
            majority_hits += max(histogram.values())
        purity = majority_hits / total_points if total_points else 0.0

        # Gini index, averaged over clusters. The squared shares are added in
        # an explicit loop so the floating-point result does not depend on
        # how the built-in sum() accumulates floats.
        gini_index = 0
        for histogram in histograms.values():
            cluster_size = sum(histogram.values())   # hoisted: constant per cluster
            squared_shares = 0
            for count in histogram.values():
                squared_shares += (count / cluster_size) ** 2
            gini_index += 1 - squared_shares

        gini_index /= n_clusters if n_clusters else 1

        # Final result
        print('Purity -', round(purity, 4), 'Gini Index -', round(gini_index, 4))

    label_counts = Counter(pred_labels)
    print('No. of clusters -', len(label_counts))
    print(label_counts, '\n')

In [3]:
def find_neighbors(k, e, distance_matrix):
    """Return the indices of all points lying within distance ``e`` of point ``k``.

    k - index of the current data point (selects ``distance_matrix[k]``)
    e - epsilon, the neighbourhood radius (boundary included)
    distance_matrix - pairwise distances, read as ``distance_matrix[k][i]``

    The point itself is never reported as its own neighbor.
    """
    distances_from_k = distance_matrix[k]
    return [
        idx
        for idx, dist in enumerate(distances_from_k)
        if idx != k and dist <= e
    ]

In [4]:
def dbscan(data, e, min_pts, labels=None):
    """Cluster the rows of ``data`` with DBSCAN and print evaluation metrics.

    Parameters
    ----------
    data : matrix-like with a ``.shape`` attribute
        One row per point; passed to ``euclidean_distances`` to build the
        full pairwise distance matrix.
    e : float
        Epsilon - neighbourhood radius (boundary included).
    min_pts : int
        Minimum number of neighbors, not counting the point itself, that a
        point needs in order to be a core point.
    labels : sequence, optional
        Ground-truth labels, forwarded to ``evaluation_metrics``.

    Returns
    -------
    list
        One entry per point: an integer cluster id (starting at 1) or the
        string 'Noise'. Earlier versions returned nothing; callers that
        ignore the return value are unaffected.
    """
    n_points = data.shape[0]
    distance_matrix = euclidean_distances(data)

    # None marks a point that has not been assigned yet. Every point is
    # visited by the outer loop, so no None survives into the result.
    clusters = [None] * n_points

    c = 0   # Cluster label
    for i in range(n_points):

        # Skip if already assigned a cluster (or already marked as noise)
        if clusters[i] is not None:
            continue

        S = find_neighbors(i, e, distance_matrix)

        # Density check - label Noise if no. of neighbors less than min_pts
        if len(S) < min_pts:
            clusters[i] = 'Noise'
            continue

        # Start the next cluster with this core point
        c = c + 1
        clusters[i] = c

        # Indices that are (or were) in the seed list. A point only needs to
        # be processed once: after its first visit it always carries a
        # cluster id, so queuing it again would only inflate S.
        queued = set(S)
        queued.add(i)

        # S grows while it is being iterated; newly appended seeds are
        # picked up by this same loop.
        for j in S:
            j = int(j)

            # Change noise point to border point; it is not expanded further
            if clusters[j] == 'Noise':
                clusters[j] = c
                continue

            # Skip if already assigned a cluster
            if clusters[j] is not None:
                continue

            # Add neighbor to the current cluster
            clusters[j] = c

            # Get neighbors
            N = find_neighbors(j, e, distance_matrix)

            # Density check - a neighbor with at least min_pts neighbors is
            # itself a core point, so its neighborhood joins the seed list
            if len(N) >= min_pts:
                for k in N:
                    k = int(k)
                    if k not in queued:
                        queued.add(k)
                        S.append(k)

    # Evaluate results
    print('epsilon -', e, 'min_pts -', min_pts)
    evaluation_metrics(clusters, labels)

    return clusters

In [6]:
# Load the 20 Newsgroups corpus (subset='all')
ng_all = fetch_20newsgroups(subset='all')

# Raw documents, and for each document the name of its newsgroup
ng_data = ng_all.data
ng_labels = [ng_all.target_names[ng_all.target[i]] for i in range(len(ng_data))]

# Both lengths should match: one label per document
print(len(ng_data))
print(len(ng_labels))

18846
18846


In [7]:
# Turn the raw documents into TF-IDF vectors, ignoring English stop words.
# ng_data is the same list object as ng_all.data.
tfidf = TfidfVectorizer(stop_words='english')
vect_ng_all = tfidf.fit_transform(ng_data)

# (number of documents, vocabulary size)
print(vect_ng_all.shape)

(18846, 173451)


In [10]:
# Parameter sweep: run DBSCAN for every (min_pts, epsilon) combination
min_pts_grid = [1, 3, 5]
epsilon_grid = [0.8, 0.9, 1, 1.1, 1.2]

for p in min_pts_grid:
    for e in epsilon_grid:
        dbscan(data=vect_ng_all, e=e, min_pts=p, labels=ng_labels)
    # Visual separator between min_pts groups
    print('------------------------------')

epsilon - 0.8 min_pts - 1
Purity - 0.2662 Gini Index - 0.0194
No. of clusters - 2019
Counter({'Noise': 13723, 65: 16, 175: 15, 379: 15, 289: 14, 55: 12, 147: 12, 549: 12, 284: 11, 625: 10, 7: 9, 184: 9, 250: 9, 571: 9, 656: 9, 1007: 9, 1027: 9, 1048: 9, 43: 8, 194: 8, 216: 8, 293: 8, 412: 8, 476: 8, 899: 8, 154: 7, 371: 7, 502: 7, 596: 7, 619: 7, 621: 7, 622: 7, 697: 7, 905: 7, 1009: 7, 1092: 7, 1210: 7, 1280: 7, 1569: 7, 1582: 7, 25: 6, 35: 6, 94: 6, 122: 6, 195: 6, 227: 6, 229: 6, 335: 6, 339: 6, 344: 6, 350: 6, 406: 6, 430: 6, 448: 6, 454: 6, 458: 6, 474: 6, 477: 6, 499: 6, 518: 6, 602: 6, 603: 6, 611: 6, 647: 6, 691: 6, 710: 6, 739: 6, 813: 6, 846: 6, 1185: 6, 1467: 6, 1: 5, 3: 5, 10: 5, 20: 5, 40: 5, 56: 5, 59: 5, 76: 5, 82: 5, 129: 5, 173: 5, 196: 5, 212: 5, 232: 5, 247: 5, 267: 5, 286: 5, 362: 5, 382: 5, 407: 5, 434: 5, 436: 5, 464: 5, 467: 5, 481: 5, 484: 5, 486: 5, 489: 5, 513: 5, 520: 5, 527: 5, 612: 5, 688: 5, 716: 5, 726: 5, 745: 5, 752: 5, 756: 5, 781: 5, 866: 5, 911: 5, 9

epsilon - 0.9 min_pts - 1
Purity - 0.4031 Gini Index - 0.0242
No. of clusters - 2678
Counter({'Noise': 11015, 114: 28, 234: 27, 110: 26, 148: 23, 556: 23, 96: 21, 435: 21, 83: 20, 496: 20, 273: 19, 4: 18, 429: 18, 667: 16, 11: 15, 244: 15, 408: 15, 43: 14, 102: 14, 171: 14, 231: 14, 481: 14, 60: 13, 189: 13, 332: 13, 926: 13, 609: 12, 756: 12, 804: 12, 919: 12, 66: 11, 288: 11, 413: 11, 1329: 11, 44: 10, 178: 10, 212: 10, 362: 10, 539: 10, 600: 10, 742: 10, 747: 10, 774: 10, 894: 10, 944: 10, 1054: 10, 1400: 10, 2: 9, 42: 9, 72: 9, 247: 9, 309: 9, 333: 9, 384: 9, 683: 9, 876: 9, 1145: 9, 1265: 9, 1627: 9, 1645: 9, 1857: 9, 1936: 9, 23: 8, 87: 8, 151: 8, 264: 8, 268: 8, 301: 8, 331: 8, 388: 8, 390: 8, 431: 8, 441: 8, 575: 8, 582: 8, 642: 8, 689: 8, 695: 8, 697: 8, 757: 8, 807: 8, 860: 8, 861: 8, 988: 8, 994: 8, 1505: 8, 2143: 8, 39: 7, 53: 7, 90: 7, 142: 7, 147: 7, 261: 7, 310: 7, 365: 7, 422: 7, 547: 7, 569: 7, 686: 7, 690: 7, 703: 7, 707: 7, 725: 7, 733: 7, 745: 7, 748: 7, 755: 7, 765

epsilon - 1 min_pts - 1
Purity - 0.5425 Gini Index - 0.0351
No. of clusters - 2749
Counter({'Noise': 7869, 33: 173, 5: 165, 15: 157, 123: 133, 176: 101, 2: 85, 106: 73, 43: 65, 234: 59, 4: 55, 3: 53, 136: 50, 179: 35, 13: 33, 461: 33, 662: 33, 40: 32, 67: 32, 433: 32, 52: 31, 273: 31, 304: 28, 373: 26, 516: 26, 555: 26, 220: 25, 258: 25, 25: 24, 57: 24, 95: 24, 240: 23, 370: 23, 376: 23, 609: 23, 83: 22, 339: 22, 277: 21, 851: 21, 572: 20, 624: 20, 771: 20, 6: 19, 274: 19, 291: 19, 295: 19, 319: 19, 463: 19, 773: 19, 921: 19, 243: 18, 355: 18, 412: 18, 510: 18, 828: 18, 19: 17, 169: 17, 219: 17, 246: 17, 270: 17, 278: 17, 398: 17, 496: 17, 521: 17, 568: 17, 762: 17, 998: 17, 93: 16, 99: 16, 196: 16, 213: 16, 730: 16, 31: 15, 186: 15, 188: 15, 394: 15, 514: 15, 524: 15, 718: 15, 859: 15, 44: 14, 205: 14, 214: 14, 430: 14, 827: 14, 847: 14, 1056: 14, 1268: 14, 1372: 14, 35: 13, 78: 13, 111: 13, 122: 13, 236: 13, 459: 13, 634: 13, 883: 13, 1081: 13, 8: 12, 70: 12, 142: 12, 206: 12, 247: 1

epsilon - 1.1 min_pts - 1
Purity - 0.384 Gini Index - 0.0591
No. of clusters - 1887
Counter({2: 6795, 'Noise': 4961, 4: 112, 28: 100, 164: 46, 17: 44, 61: 37, 79: 35, 327: 35, 473: 35, 71: 33, 37: 32, 1: 31, 86: 29, 217: 29, 176: 27, 869: 27, 9: 26, 260: 26, 33: 24, 45: 23, 68: 23, 329: 23, 134: 22, 238: 22, 271: 22, 104: 21, 80: 20, 208: 20, 766: 20, 722: 19, 411: 18, 491: 18, 578: 18, 627: 18, 124: 17, 190: 17, 200: 16, 713: 16, 92: 15, 296: 15, 374: 15, 398: 15, 85: 14, 173: 14, 186: 14, 312: 14, 330: 14, 383: 14, 404: 14, 437: 14, 497: 14, 39: 13, 100: 13, 107: 13, 218: 13, 227: 13, 294: 13, 407: 13, 548: 13, 556: 13, 1272: 13, 57: 12, 152: 12, 174: 12, 289: 12, 315: 12, 410: 12, 431: 12, 610: 12, 638: 12, 804: 12, 13: 11, 110: 11, 496: 11, 693: 11, 1129: 11, 10: 10, 15: 10, 23: 10, 58: 10, 65: 10, 130: 10, 199: 10, 406: 10, 409: 10, 489: 10, 512: 10, 614: 10, 624: 10, 725: 10, 783: 10, 1085: 10, 21: 9, 24: 9, 127: 9, 256: 9, 283: 9, 295: 9, 372: 9, 550: 9, 607: 9, 646: 9, 702: 9, 

epsilon - 1.2 min_pts - 1
Purity - 0.1504 Gini Index - 0.0987
No. of clusters - 740
Counter({1: 14240, 'Noise': 2442, 224: 16, 372: 16, 164: 12, 248: 12, 11: 11, 57: 11, 284: 11, 17: 10, 27: 10, 54: 10, 270: 10, 140: 9, 361: 9, 33: 8, 50: 8, 113: 8, 126: 8, 159: 8, 161: 8, 173: 8, 174: 8, 175: 8, 207: 8, 403: 8, 8: 7, 59: 7, 76: 7, 145: 7, 259: 7, 269: 7, 285: 7, 474: 7, 12: 6, 72: 6, 92: 6, 107: 6, 148: 6, 160: 6, 189: 6, 232: 6, 304: 6, 345: 6, 388: 6, 405: 6, 409: 6, 416: 6, 443: 6, 448: 6, 533: 6, 553: 6, 14: 5, 70: 5, 71: 5, 86: 5, 88: 5, 91: 5, 102: 5, 106: 5, 117: 5, 124: 5, 134: 5, 137: 5, 165: 5, 170: 5, 180: 5, 202: 5, 203: 5, 209: 5, 236: 5, 251: 5, 261: 5, 271: 5, 277: 5, 278: 5, 313: 5, 316: 5, 320: 5, 327: 5, 358: 5, 390: 5, 433: 5, 436: 5, 452: 5, 488: 5, 515: 5, 550: 5, 560: 5, 633: 5, 651: 5, 28: 4, 31: 4, 45: 4, 46: 4, 68: 4, 90: 4, 128: 4, 144: 4, 152: 4, 162: 4, 172: 4, 194: 4, 219: 4, 220: 4, 227: 4, 229: 4, 231: 4, 243: 4, 245: 4, 249: 4, 256: 4, 260: 4, 272: 4, 3

epsilon - 1 min_pts - 3
Purity - 0.2743 Gini Index - 0.0787
No. of clusters - 682
Counter({'Noise': 13153, 3: 103, 47: 102, 59: 87, 2: 82, 104: 80, 73: 72, 10: 62, 23: 62, 5: 56, 62: 52, 39: 50, 35: 48, 57: 43, 78: 34, 60: 32, 18: 30, 191: 29, 155: 28, 105: 27, 227: 27, 76: 25, 234: 25, 6: 24, 156: 24, 95: 23, 32: 22, 113: 22, 29: 21, 217: 21, 404: 21, 94: 20, 96: 20, 257: 20, 223: 20, 99: 19, 101: 19, 117: 19, 120: 19, 136: 19, 146: 19, 224: 19, 238: 18, 128: 18, 82: 18, 423: 18, 180: 18, 242: 18, 25: 17, 44: 17, 228: 17, 127: 17, 173: 17, 169: 17, 259: 17, 301: 17, 19: 16, 359: 16, 488: 16, 75: 16, 115: 16, 158: 16, 306: 16, 291: 15, 34: 15, 55: 15, 147: 15, 425: 15, 135: 15, 161: 15, 212: 15, 125: 14, 253: 14, 313: 14, 383: 14, 561: 13, 151: 13, 31: 13, 318: 13, 174: 13, 182: 13, 245: 13, 314: 13, 410: 12, 49: 12, 74: 12, 88: 12, 263: 12, 123: 12, 145: 12, 202: 12, 261: 12, 262: 12, 350: 12, 464: 12, 415: 12, 421: 12, 12: 11, 41: 11, 51: 11, 79: 11, 131: 11, 239: 11, 272: 11, 209: 1

epsilon - 1.2 min_pts - 3
Purity - 0.0875 Gini Index - 0.1768
No. of clusters - 169
Counter({1: 13800, 'Noise': 4152, 39: 16, 93: 15, 29: 12, 45: 12, 11: 11, 101: 11, 5: 10, 7: 10, 51: 10, 57: 10, 113: 10, 3: 9, 120: 9, 22: 9, 84: 9, 6: 8, 8: 8, 65: 8, 82: 8, 19: 8, 26: 8, 27: 8, 144: 8, 38: 8, 76: 8, 81: 8, 2: 7, 131: 7, 23: 7, 31: 7, 47: 7, 50: 7, 110: 7, 100: 7, 16: 6, 24: 6, 89: 6, 60: 6, 41: 6, 77: 6, 80: 6, 96: 6, 87: 6, 91: 6, 132: 6, 118: 6, 4: 5, 34: 5, 15: 5, 147: 5, 40: 5, 130: 5, 37: 5, 13: 5, 17: 5, 32: 5, 20: 5, 30: 5, 161: 5, 133: 5, 35: 5, 36: 5, 83: 5, 42: 5, 122: 5, 59: 5, 52: 5, 73: 5, 54: 5, 55: 5, 153: 5, 85: 5, 62: 5, 67: 5, 152: 5, 72: 5, 109: 5, 78: 5, 86: 5, 90: 5, 92: 5, 95: 5, 105: 5, 107: 5, 117: 5, 127: 5, 128: 5, 129: 5, 135: 5, 140: 5, 142: 5, 104: 4, 158: 4, 49: 4, 9: 4, 10: 4, 125: 4, 136: 4, 74: 4, 164: 4, 150: 4, 146: 4, 134: 4, 75: 4, 21: 4, 151: 4, 25: 4, 28: 4, 56: 4, 33: 4, 114: 4, 123: 4, 157: 4, 145: 4, 143: 4, 111: 4, 43: 4, 44: 4, 64: 4, 46: 4

------------------------------
