In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import math

In [2]:
print("Fecthing data..................................")
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))
newsgroups_data = newsgroups_train.data + newsgroups_test.data

print("Data fetched successfully")
print("Turning data into Tf-IDF format................")
vectorizer = TfidfVectorizer(stop_words="english")
vectors = vectorizer.fit_transform(newsgroups_data)
print("Done")


Fecthing data..................................
Data fetched successfully
Turning data into Tf-IDF format................
Done


In [3]:
def make_clusters(vectors, centroids, k):
    number_of_docs = vectors.shape[0]
    cluster_assignment = []
    for i in range(0, number_of_docs - 1):
        similarities = []
        for j in range(0, k):
            similarity = cosine_similarity(vectors[i], centroids[j])
            similarity = similarity[0][0]
            similarities.append((similarity, j))

        similarities = sorted(similarities, reverse = True)
        cluster_number = similarities[0][1]
        cluster_assignment.append((cluster_number, i))

    clusters = [[] for i in range(0, k)]

    for assignment in cluster_assignment:
        cluster = assignment[0]
        doc_index = assignment[1]
        clusters[cluster].append(doc_index)

    return clusters

In [4]:
def recalculate_centroids(vectors, centroids, clusters):
    for i in range (0, len(clusters)):
        centroid = vectors[clusters[i][0]]
        for j in range(1, len(clusters[i])):
            centroid += vectors[clusters[i][j]]
        centroid = centroid / len(clusters[i])
        centroids[i] = centroid
    return centroids

In [5]:
def remake_cluster(clusters, centroids, vectors):
    total_docs_reclustered = 0
    for i in range(0, len(clusters)):
        cluster = clusters[i]
        number_of_docs_reclustered = 0
        for doc_index in cluster:
            similarity_to_each_centroid = {}
            for j in range(0 , len(centroids)):
                similarity = cosine_similarity(vectors[doc_index], centroids[j])
                similarity_to_each_centroid[j] = similarity[0][0]

            max_similarity = similarity_to_each_centroid[0]
            max_similarity_index = 0
            for cluster_no in similarity_to_each_centroid.keys():
                if similarity_to_each_centroid[cluster_no] > max_similarity:
                    max_similarity = similarity_to_each_centroid[cluster_no]
                    max_similarity_index = cluster_no
            if max_similarity_index != i:
                clusters[i].remove(doc_index)
                clusters[max_similarity_index].append(doc_index)
                change_occured = True
                number_of_docs_reclustered += 1
        total_docs_reclustered += number_of_docs_reclustered
        
    return clusters, total_docs_reclustered

In [6]:
def calculate_gini_index(clusters):
    gini_total = 0
    for cluster in clusters:
        cluster_gini = 1
        doc_label_count = create_doc_label_count_dict(cluster)
        for label in doc_label_count.keys():
            cluster_gini = cluster_gini - math.pow((doc_label_count[label] / len(cluster)) , 2)
        gini_total += cluster_gini * len(cluster)
    return gini_total / vectors.shape[0]
    
    

In [7]:
def create_doc_label_count_dict(cluster):
    doc_label_count = {}
    for doc_index in cluster:
        if doc_index > 11313:
            label = newsgroups_test.target_names[newsgroups_test.target[11314-doc_index]]
        else:
            label = newsgroups_train.target_names[newsgroups_train.target[doc_index]]
            
        if label in doc_label_count:
            doc_label_count[label] += 1
        else:
            doc_label_count[label] = 1
    return doc_label_count

In [8]:
def calculate_purity_of_clusters(clusters):
    total_purity = 0
    for i in range(0, len(clusters)):
        majority_count = majority_count_of_cluster(clusters[i])
        total_purity += majority_count
    total_purity = total_purity / vectors.shape[0]
    return total_purity

In [9]:
def majority_count_of_cluster(cluster):
    label_occurance = {}
    for doc_index in cluster:
        if doc_index > 11313:
            label = newsgroups_test.target_names[newsgroups_test.target[11314-doc_index]]
        else:
            label = newsgroups_train.target_names[newsgroups_train.target[doc_index]]
            
        if label in label_occurance:
            label_occurance[label] += 1
        else:
            label_occurance[label] = 1
    majority_label = ""
    max_occurance = 0
    for label in label_occurance.keys():
        if label_occurance[label] > max_occurance:
            max_occurance = label_occurance[label]
            majority_label = label
    
    majority_count = 0
    for doc_index in cluster:
        if doc_index > 11313:
            label = newsgroups_test.target_names[newsgroups_test.target[11314-doc_index]]
        else:
            label = newsgroups_train.target_names[newsgroups_train.target[doc_index]]
        if label == majority_label:
            majority_count += 1
    
    return majority_count 

In [10]:
def KMeans(k):

    number_of_docs = vectors.shape[0]


    centroid_indexs = np.linspace(0, number_of_docs-1, k)
    centroid_indexs = list(map(int, centroid_indexs))
    
    centroids = []
    for i in range(0, len(centroid_indexs)):
        centroids.append(vectors[centroid_indexs[i]])
    # step 1
    clusters = make_clusters(vectors, centroids, k)

    iterate = True
    while(iterate):
        centroids = recalculate_centroids(vectors, centroids, clusters)
        clusters, total_docs_reclustered = remake_cluster(clusters, centroids, vectors)
        purity_of_clusters = calculate_purity_of_clusters(clusters)
        gini_index = calculate_gini_index(clusters)
        
        print("Number of documents reclustered: " + str(total_docs_reclustered))
        print("Total purity: " + str(purity_of_clusters))
        print("Gini index of clusters: " + str(gini_index))
        print()
        if(total_docs_reclustered == 0):
            iterate = False
    return clusters

In [11]:
print("Starting KMeans.....\n")
clusters = KMeans(20)
print("Done")

Starting KMeans.....

Number of documents reclustered: 5091
Total purity: 0.19691181152499204
Gini index of clusters: 0.9107886258537689

Number of documents reclustered: 3575
Total purity: 0.22821818953624112
Gini index of clusters: 0.8936143344562979

Number of documents reclustered: 2456
Total purity: 0.24588772153242067
Gini index of clusters: 0.8821873141048198

Number of documents reclustered: 1552
Total purity: 0.254589833386395
Gini index of clusters: 0.8767052614567643

Number of documents reclustered: 1054
Total purity: 0.25899395097102834
Gini index of clusters: 0.873518843134441

Number of documents reclustered: 734
Total purity: 0.2611694789345219
Gini index of clusters: 0.8718082183076427

Number of documents reclustered: 554
Total purity: 0.2626552053486151
Gini index of clusters: 0.8702882009625796

Number of documents reclustered: 445
Total purity: 0.2633450068980155
Gini index of clusters: 0.8689366654182014

Number of documents reclustered: 402
Total purity: 0.264883