In [112]:
from sklearn.cluster import KMeans, MiniBatchKMeans
import numpy as np
import sklearn.datasets as ds
import itertools as it
import matplotlib.pyplot as plt

In [113]:
def kmeans(number_of_clusters, data_set, random_state=0):
    """Fit a scikit-learn ``KMeans`` model on ``data_set``.

    Parameters
    ----------
    number_of_clusters : passed to ``KMeans`` as ``n_clusters``.
    data_set : samples handed to ``KMeans.fit``.
    random_state : seed forwarded to ``KMeans`` (defaults to 0).

    Returns
    -------
    The fitted ``KMeans`` estimator.
    """
    model = KMeans(n_clusters=number_of_clusters, random_state=random_state)
    # ``fit`` trains the estimator in place and returns that same object.
    model.fit(data_set)
    return model

def mini_batch_kmeans(number_of_clusters, data_set, random_state=0):
    """Fit a scikit-learn ``MiniBatchKMeans`` model on ``data_set``.

    Parameters
    ----------
    number_of_clusters : passed to ``MiniBatchKMeans`` as ``n_clusters``.
    data_set : samples handed to ``MiniBatchKMeans.fit``.
    random_state : seed forwarded to ``MiniBatchKMeans`` (defaults to 0).

    Returns
    -------
    The fitted ``MiniBatchKMeans`` estimator.
    """
    model = MiniBatchKMeans(n_clusters=number_of_clusters, random_state=random_state)
    # ``fit`` trains the estimator in place and returns that same object.
    model.fit(data_set)
    return model

In [114]:
def get_points_in_each_cluster(labels):
    """Group sample positions by their cluster label.

    Parameters
    ----------
    labels : one-dimensional sequence or array holding one label per sample.

    Returns
    -------
    list of lists of int: one inner list per distinct label, in the order
    returned by ``np.unique``, holding the ascending positions of the samples
    that carry that label. Empty input yields an empty list.
    """
    label_array = np.asarray(labels)
    # A vectorised mask per label replaces a pure-Python scan of every sample
    # for every label; ``tolist`` yields plain Python ints.
    return [
        np.flatnonzero(label_array == cluster_name).tolist()
        for cluster_name in np.unique(label_array)
    ]

def get_points_from_same_cluster(clusters, dimension):
    """List every ``dimension``-sized combination of members within each cluster.

    Parameters
    ----------
    clusters : iterable of clusters, each an iterable of member identifiers.
    dimension : size of the combinations to build (2 gives pairs).

    Returns
    -------
    A flat list of tuples: the combinations of the first cluster, followed by
    those of the second, and so on. Clusters with fewer than ``dimension``
    members contribute nothing.
    """
    return [
        group
        for members in clusters
        for group in it.combinations(members, dimension)
    ]

def count_true_positives(original_pairs, assigned_pairs):
    """Count the entries of ``assigned_pairs`` that also occur in ``original_pairs``.

    Parameters
    ----------
    original_pairs : iterable of hashable items (e.g. index tuples) taken from
        the reference clustering.
    assigned_pairs : iterable of hashable items taken from the clustering
        under evaluation.

    Returns
    -------
    int: how many entries of ``assigned_pairs`` are present in ``original_pairs``.
    """
    # A set gives constant-time membership tests; scanning a list for every
    # assigned pair is quadratic in the number of pairs.
    reference = set(original_pairs)
    return sum(1 for pair in assigned_pairs if pair in reference)

def count_false_negatives(original_pairs, assigned_pairs):
    """Count the entries of ``original_pairs`` that are missing from ``assigned_pairs``.

    Parameters
    ----------
    original_pairs : iterable of hashable items (e.g. index tuples) taken from
        the reference clustering.
    assigned_pairs : iterable of hashable items taken from the clustering
        under evaluation.

    Returns
    -------
    int: how many entries of ``original_pairs`` do not occur in ``assigned_pairs``.
    """
    # A set gives constant-time membership tests; scanning a list for every
    # original pair is quadratic in the number of pairs.
    predicted = set(assigned_pairs)
    return sum(1 for pair in original_pairs if pair not in predicted)

## F1 score, also known as the Czekanowski-Dice index

In [115]:
def f1_score(original_labels, assigned_labels, dimension=2):
    """Compute an F1 score between two labelings by comparing co-clustered tuples.

    Both labelings are turned into lists of ``dimension``-sized index tuples
    whose members share a cluster. Precision is the fraction of assigned
    tuples that also occur in the original labeling, recall is the fraction
    of original tuples recovered by the assigned labeling, and the result is
    their harmonic mean.

    Parameters
    ----------
    original_labels : reference label of every sample.
    assigned_labels : label given to every sample by the clustering under
        evaluation, in the same sample order.
    dimension : size of the index tuples that are compared; the default of 2
        compares pairs of samples.

    Returns
    -------
    float: the score in [0, 1]. Returns 0.0 when either labeling produces no
    tuples (e.g. every cluster is smaller than ``dimension``) or when no
    tuple is shared, where the formula would otherwise divide by zero.
    """
    original_clusters = get_points_in_each_cluster(original_labels)
    assigned_clusters = get_points_in_each_cluster(assigned_labels)
    original_pairs = get_points_from_same_cluster(original_clusters, dimension)
    assigned_pairs = get_points_from_same_cluster(assigned_clusters, dimension)

    # With no tuples on one side, precision or recall has a zero denominator.
    if not original_pairs or not assigned_pairs:
        return 0.0

    tp = count_true_positives(original_pairs, assigned_pairs)
    # Zero true positives make precision + recall zero, so the harmonic mean
    # is undefined; report the worst score instead of raising.
    if tp == 0:
        return 0.0

    fn = count_false_negatives(original_pairs, assigned_pairs)
    precision = float(tp) / len(assigned_pairs)
    recall = float(tp) / (tp + fn)
    return 2 * (precision * recall) / (precision + recall)

# Test

In [120]:
def test_k_means(data_set, labels):
    """Cluster ``data_set`` with ``kmeans`` and print its F1 score against ``labels``.

    Reads two module-level names that must be defined before the call:
    ``clusters`` (number of clusters to fit) and ``dim`` (forwarded to
    ``f1_score`` as its third argument).
    """
    fitted_model = kmeans(clusters, data_set)
    score = f1_score(labels, fitted_model.labels_, dim)
    message = 'kmeans score %f' % score
    print(message)

def test_mini_batch_kmeans(data_set, labels):
    """Cluster ``data_set`` with ``mini_batch_kmeans`` and print its F1 score against ``labels``.

    Reads two module-level names that must be defined before the call:
    ``clusters`` (number of clusters to fit) and ``dim`` (forwarded to
    ``f1_score`` as its third argument).
    """
    fitted_model = mini_batch_kmeans(clusters, data_set)
    score = f1_score(labels, fitted_model.labels_, dim)
    message = 'mini batch kmeans score %f' % score
    print(message)
    
# Experiment configuration. ``clusters`` and ``dim`` are also read as globals
# by test_k_means / test_mini_batch_kmeans, so these names must stay as they are.
clusters = 32  # number of blob centers to generate and of clusters to fit
samples = 400  # number of points requested from make_blobs
dim = 2        # number of features; also forwarded to f1_score by the test helpers

# Arguments are passed by keyword: current scikit-learn releases declare
# ``centers`` as keyword-only, so the positional form raises a TypeError there.
data_set, labels = ds.make_blobs(
    n_samples=samples,
    n_features=dim,
    centers=clusters,
    cluster_std=1,
    random_state=3,
)
test_k_means(data_set, labels)
test_mini_batch_kmeans(data_set, labels)


kmeans score 0.417359
mini batch kmeans score 0.423610
