In [3]:
import numpy as np
import pandas as pd
import random
from collections import defaultdict
import copy
import math
import os

In [4]:
# Load the tab-separated file 'BCLL.txt' into a DataFrame. index_col=False
# tells pandas not to use the first column as the row index, so every column
# in the file stays a regular data column.
df = pd.read_csv('BCLL.txt', sep='\t', index_col=False)

In [5]:
# Remove the 'IDENTIFIER' column before clustering. The axis is selected with
# the `columns=` keyword because passing it positionally (`drop(label, 1)`)
# was deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
# Re-binding `df` (instead of inplace=True) yields the same frame contents.
df = df.drop(columns='IDENTIFIER')

In [6]:
# Convert the remaining columns to a 2-D NumPy array. K_means below reads
# column 0 of each row as the node id and the rest of the row as its features.
X = df.values

In [7]:
def get_distance(point1:'numpy array', point2:'numpy array'):
    """Return the Euclidean distance between two feature vectors.

    Both arguments are expected to hold features only; callers strip the
    leading id element (index 0 of a data row) before calling.
    """
    delta = point1 - point2
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)

In [8]:
def get_id_best_centroid(node, centroid_list:'dictionary of centroids id, value'):
    """Return the id of the centroid closest to ``node``.

    Parameters
    ----------
    node : numpy array
        Feature vector of the point to assign.
    centroid_list : dict
        Maps centroid id -> centroid feature vector.

    Returns
    -------
    The key of the nearest centroid. On ties the centroid that comes first
    in the dictionary's iteration order wins (strict ``<`` comparison).

    Raises
    ------
    ValueError
        If ``centroid_list`` is empty, or if no centroid has a finite
        distance to ``node`` (e.g. NaN values in the features). Previously
        both situations surfaced as an opaque UnboundLocalError.
    """
    if not centroid_list:
        raise ValueError("centroid_list is empty; cannot choose a nearest centroid")

    found = False
    best_id = None
    mini = math.inf
    for key, centroid in centroid_list.items():
        distance = get_distance(node, centroid)
        # NaN and inf distances never satisfy this comparison.
        if distance < mini:
            mini = distance
            best_id = key
            found = True

    if not found:
        raise ValueError(
            "no centroid has a finite distance to the node; "
            "check the features for NaN or infinite values"
        )
    return best_id
    

In [9]:
def get_new_centroids(node_id_to_cluster_id:'node_id:cluster id', node_id_to_node_features:'node id: node vector'):
    """Recompute each cluster's centroid as the mean of its member vectors.

    Nodes are grouped by their assigned cluster id, then every group is
    averaged element-wise. Only clusters that have at least one member
    appear in the result.

    Returns a defaultdict mapping cluster id -> centroid vector.
    """
    members_by_cluster = defaultdict(list)
    for node_id, cluster_id in node_id_to_cluster_id.items():
        features = node_id_to_node_features[node_id]
        members_by_cluster[cluster_id].append(features)

    centroid_list_new = defaultdict(list)
    centroid_list_new.update(
        (cluster_id, sum(members) / len(members))
        for cluster_id, members in members_by_cluster.items()
    )
    return centroid_list_new

In [10]:

def K_means(k, X, seed=None):
    """Cluster the rows of ``X`` into ``k`` groups with Lloyd's k-means.

    Each row of ``X`` is read as ``[node_id, feature_1, ..., feature_n]``:
    element 0 is the node id and the remainder is the feature vector. If two
    rows share an id, the later row overwrites the earlier one.

    Parameters
    ----------
    k : int
        Number of clusters; must satisfy ``1 <= k <= len(X)``.
    X : sequence of rows (e.g. a 2-D numpy array)
        The data to cluster.
    seed : optional
        When given, initial centroids are chosen with a private
        ``random.Random(seed)`` so the run is reproducible. When ``None``
        (the default) the module-level ``random`` generator is used.

    Returns
    -------
    (centroid_dictionary, node_id_to_cluster_id, node_id_to_node_features)
        cluster id -> centroid vector, node id -> cluster id, and
        node id -> feature vector.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or larger than the number of rows.
    """
    n_rows = len(X)
    if k < 1 or k > n_rows:
        raise ValueError(
            "k must be between 1 and the number of rows ({}), got {}".format(n_rows, k)
        )

    node_id_to_node_features = defaultdict(list)  # node id -> feature vector
    centroid_dictionary = defaultdict(list)       # cluster id -> centroid vector
    node_id_to_cluster_id = defaultdict(list)     # node id -> cluster id

    for node in X:
        node_id_to_node_features[node[0]] = node[1:]

    # Use k distinct, randomly chosen rows as the initial centroids.
    rng = random if seed is None else random.Random(seed)
    indexes = list(range(n_rows))
    rng.shuffle(indexes)
    for i in range(k):
        # Copy so that centroids never alias rows of X.
        centroid_dictionary[i] = copy.deepcopy(X[indexes[i]][1:])

    while True:
        # Assignment step: move every node to its nearest centroid and count
        # how many assignments changed.
        count = 0
        for node_id, node_features in node_id_to_node_features.items():
            id_best_centroid = get_id_best_centroid(node_features, centroid_dictionary)
            # .get() returns None for a node not assigned yet, which never
            # equals a cluster id, so the first pass counts every node.
            if node_id_to_cluster_id.get(node_id) != id_best_centroid:
                count += 1
            node_id_to_cluster_id[node_id] = id_best_centroid

        # Converged: no node changed cluster in this pass.
        if count == 0:
            return centroid_dictionary, node_id_to_cluster_id, node_id_to_node_features

        # Update step: recompute the centroids from the current assignment.
        centroid_dictionary_new = get_new_centroids(node_id_to_cluster_id, node_id_to_node_features)
        # A cluster that lost all of its members has no entry in the new map;
        # keep its previous centroid so the number of clusters stays at k.
        for cluster_id, old_centroid in centroid_dictionary.items():
            if cluster_id not in centroid_dictionary_new:
                centroid_dictionary_new[cluster_id] = old_centroid
        centroid_dictionary = centroid_dictionary_new

In [None]:
# Run k-means for up to 10 randomly chosen values of k in [2, sqrt(len(X)))
# and write the node ids of every cluster to
# clusters/cluster_number_<k>/cluster<cluster_id + 1>.txt (one id per line).
total_clusters = list(range(2, int(math.sqrt(len(X)))))
random.shuffle(total_clusters)
for k in total_clusters[:10]:
    print("Starting K means for k={}".format(k))
    cluster_id_to_centroid_features, node_id_to_cluster_id, node_id_to_node_features = K_means(k, X)
    print("K means finished!. Starting writing i")

    # exist_ok=True makes the cell re-runnable. The previous check tested
    # os.path.dirname('cluster_number_<k>'), which is '', so it was always
    # False and os.makedirs raised FileExistsError on a second run.
    output_dir = 'clusters/cluster_number_{}'.format(k)
    os.makedirs(output_dir, exist_ok=True)

    # Group node ids per cluster so each output file is opened once instead
    # of once per node.
    cluster_id_to_node_ids = defaultdict(list)
    for node_id, cluster_id in node_id_to_cluster_id.items():
        cluster_id_to_node_ids[cluster_id].append(node_id)

    for cluster_id, node_ids in cluster_id_to_node_ids.items():
        # 'w' rather than 'a+': a re-run must replace the previous result,
        # not append a second copy of the ids to it.
        with open('{}/cluster{}.txt'.format(output_dir, cluster_id + 1), 'w') as f:
            f.writelines(str(node_id) + '\n' for node_id in node_ids)