In [17]:
import math
import os
import random
import shutil

import numpy as np
import pandas as pd

In [11]:
# Read the tab-separated BCLL table and discard its IDENTIFIER column in one
# expression. Later cells treat column 0 as a text label and columns 1-21 as
# the numeric features.
df = pd.read_csv('BCLL.txt', sep='\t', index_col=False).drop('IDENTIFIER', axis=1)

In [12]:
def calculate_distance(case1, case2):
    """Return the Euclidean distance between two indexable numeric sequences.

    The number of coordinates compared is ``len(case1)`` (21 in this notebook),
    so the cost is linear in that length. ``case2`` must be at least as long
    as ``case1``; any extra trailing entries in ``case2`` are ignored.
    """
    squared_gaps = (
        (case1[pos] - case2[pos]) ** 2
        for pos in range(len(case1))
    )
    return math.sqrt(sum(squared_gaps))

In [13]:
def calculate_centroids(clusters):
    """Compute the mean feature vector of every cluster.

    Parameters
    ----------
    clusters : dict
        Maps a cluster key to a list of cases. Each case is an indexable row
        whose positions 1..21 hold the numeric features (position 0 is skipped).

    Returns
    -------
    dict
        Maps each cluster key to a 21-element list of per-feature means, in
        the same key order as ``clusters``.

    Raises
    ------
    ZeroDivisionError
        If any cluster has no cases.
    """
    feature_positions = range(1, 22)
    return {
        label: [
            sum(member[pos] for member in members) / len(members)
            for pos in feature_positions
        ]
        for label, members in clusters.items()
    }

In [14]:
def compare_clusters(centroids_1, centroids_2):
    """Tell whether two centroid dictionaries are identical.

    Returns ``False`` when the dictionaries hold a different number of keys or
    when any key of ``centroids_1`` maps to a different value in
    ``centroids_2``; returns ``True`` otherwise. Values are compared with
    ``!=``, i.e. exactly, with no numeric tolerance.
    """
    if len(centroids_1) != len(centroids_2):
        return False
    has_mismatch = any(
        centroids_1[label] != centroids_2[label] for label in centroids_1
    )
    return not has_mismatch

In [15]:
def KMeans(N, k):
    """Partition the rows of the module-level ``df`` into ``k`` clusters.

    Centers are seeded with ``k`` distinct randomly chosen rows, then the
    assign / re-average loop repeats until the centroids stop changing
    (exact equality, as decided by ``compare_clusters``).

    Parameters
    ----------
    N : int
        Number of rows to draw the initial centers from (row positions
        ``0 .. N-1`` of ``df``).
    k : int
        Number of clusters; must not exceed ``N``.

    Returns
    -------
    dict
        Maps each cluster key (the row position of its initial center) to the
        list of rows assigned to it. Every row is a plain list: the label in
        position 0 followed by the feature values.

    Notes
    -----
    Each pass costs roughly N * k * d distance terms (d = 21 features).
    """
    # Seed the centers; the sampled row position doubles as the cluster key.
    initial_center_indexes = random.sample(range(0, N), k)
    cluster_centers = {}
    for i in initial_center_indexes:
        # ``DataFrame.ix`` was removed in pandas 1.0; ``iloc`` selects the same
        # row and the 21 feature columns (positions 1..21) by position.
        cluster_centers[i] = df.iloc[i, 1:22].tolist()

    # Converting the frame to row lists does not depend on the current
    # centers, so do it once instead of on every iteration.
    all_rows = df.values.tolist()

    while True:
        clusters = {key: [] for key in cluster_centers}
        for rows in all_rows:  # complexity is N
            features = rows[1:]
            mini = math.inf
            min_key = -1
            for key, center in cluster_centers.items():  # complexity is k
                dis = calculate_distance(features, center)  # complexity is d = 21
                # Strict comparison: on ties the earliest center wins.
                if mini > dis:
                    mini = dis
                    min_key = key
            clusters[min_key].append(rows)

        # A center can lose all of its members; averaging an empty cluster
        # would divide by zero, so such clusters keep their previous center.
        populated = {key: cases for key, cases in clusters.items() if cases}
        recomputed = calculate_centroids(populated)
        centroids_new = {
            key: recomputed.get(key, cluster_centers[key]) for key in clusters
        }

        if compare_clusters(cluster_centers, centroids_new):
            return clusters
        cluster_centers = centroids_new

In [22]:
# Run k-means for up to ten randomly chosen cluster counts and write the row
# labels of every cluster to 'cluster_number_<k>/cluster<i>.txt'.
N = len(df)
sqrt_N = math.floor(math.sqrt(N))
# Candidate k values are 2 .. floor(sqrt(N)) - 1. Cap the sample size so a
# small table does not make random.sample raise ValueError.
candidate_ks = range(2, sqrt_N)
Ks = random.sample(candidate_ks, min(10, len(candidate_ks)))
print("Go for walk or something because this may take hours.")
for k in Ks:
    print("Processing k-means for k =", k)
    clusters = KMeans(N, k)
    folder = 'cluster_number_' + str(k)
    print(k, "clusters found. Storing them in a folder named '" + folder + "'")
    # Test the folder itself: os.path.dirname() of a bare folder name is '',
    # which never exists, so the old check always fell through to makedirs and
    # failed on a re-run. Wipe any earlier results so the appended files below
    # start empty.
    if os.path.isdir(folder):
        shutil.rmtree(folder)
    os.makedirs(folder)
    for itr, cases in enumerate(clusters.values(), start=1):
        # 'with' guarantees the file is closed even if a write fails.
        with open(folder + '/cluster' + str(itr) + '.txt', 'a+') as f:
            for case in cases:
                # case[0] is the row's label column.
                f.write(case[0] + '\n')

Go for walk or something because this may take hours.
Processing k-means for k = 15
15 clusters found. Storing them in a folder named 'cluster_number_'15
Processing k-means for k = 24
24 clusters found. Storing them in a folder named 'cluster_number_'24
Processing k-means for k = 34
34 clusters found. Storing them in a folder named 'cluster_number_'34
Processing k-means for k = 33
33 clusters found. Storing them in a folder named 'cluster_number_'33
Processing k-means for k = 20
20 clusters found. Storing them in a folder named 'cluster_number_'20
Processing k-means for k = 4
4 clusters found. Storing them in a folder named 'cluster_number_'4
Processing k-means for k = 43
43 clusters found. Storing them in a folder named 'cluster_number_'43
Processing k-means for k = 17
17 clusters found. Storing them in a folder named 'cluster_number_'17
Processing k-means for k = 46
46 clusters found. Storing them in a folder named 'cluster_number_'46
Processing k-means for k = 10
10 clusters found. 