In [106]:
import numpy as np
import pandas as pd
import random
from collections import defaultdict
import copy
import math
import os

In [2]:
# Load the tab-separated table; index_col=False tells pandas not to
# promote the first column to the index.
df = pd.read_csv(
    filepath_or_buffer='BCLL.txt',
    delimiter='\t',
    index_col=False,
)

In [3]:
# Remove the IDENTIFIER column so that only ID_REF and the sample columns remain.
# `columns=` is used because pandas >= 2.0 no longer accepts `axis` positionally,
# and errors='ignore' keeps the cell re-runnable once the column is already gone.
df = df.drop(columns='IDENTIFIER', errors='ignore')
print(list(df))

['ID_REF', 'GSM45021', 'GSM45022', 'GSM45023', 'GSM45024', 'GSM45025', 'GSM45066', 'GSM45067', 'GSM45068', 'GSM45069', 'GSM45070', 'GSM45071', 'GSM45072', 'GSM45073', 'GSM45074', 'GSM45075', 'GSM45076', 'GSM45077', 'GSM45078', 'GSM45079', 'GSM45080', 'GSM45081']


In [4]:
# Row-wise array view of the frame: column 0 is ID_REF, the rest are the
# per-sample values.
X = df.to_numpy()

In [5]:
# Number of rows in the data matrix.
print(X.shape[0])

4655


In [6]:
# Peek at the first two rows (id followed by the sample values).
print(X[:2, :])

[['1009_at' 1145.5 1014.9 1103.1 850.4 886.6 859.3 1228.6 1231.9 1118.3
  762.1 1438.9 763.7 1382.4 1008.2 1025.9 1197.0 735.2 865.3 485.0 811.1
  783.0]
 ['100_g_at' 169.0 200.6 196.3 151.4 167.8 111.3 164.9 240.6 155.9 215.2
  121.7 277.2 172.7 163.6 194.0 117.1 130.5 164.2 212.2 108.4 114.7]]


In [7]:
# Sanity check: the ids in column 0 should be unique, i.e. the last two
# printed counts should match.
temp = X[:, 0].tolist()
print(temp[:10])
print(len(temp))
print(len(set(temp)))

['1009_at', '100_g_at', '1011_s_at', '1012_at', '1013_at', '1018_at', '1019_g_at', '1020_s_at', '1025_g_at', '1029_s_at']
4655
4655


In [8]:
def get_distance(point1, point2):
    """Return the Euclidean distance between two feature vectors.

    Parameters
    ----------
    point1, point2 : array-like of numbers
        Feature vectors of equal length. They must contain only the numeric
        features; this function does not strip an id column.

    Returns
    -------
    numpy.float64
        sqrt(sum((point1 - point2) ** 2)).
    """
    # Coerce to float arrays so that lists and object-dtype rows (what
    # `DataFrame.values` yields for mixed columns) use vectorised float math.
    difference = np.asarray(point1, dtype=float) - np.asarray(point2, dtype=float)
    return np.sqrt(np.sum(difference ** 2))

In [17]:
def get_id_best_centroid(node, centroid_list):
    """Return the key of the centroid closest to `node`.

    Parameters
    ----------
    node : array-like
        Feature vector of the point to assign.
    centroid_list : dict
        Maps centroid id -> centroid feature vector.

    Returns
    -------
    The key of the centroid with the smallest `get_distance` to `node`.
    On ties the first such key in the dictionary's iteration order wins.

    Raises
    ------
    ValueError
        If `centroid_list` is empty or no distance compares below infinity
        (e.g. every distance is NaN), so no centroid can be chosen.
    """
    best_id = None
    found = False
    mini = math.inf
    for key, centroid in centroid_list.items():
        distance = get_distance(node, centroid)
        # Strict `<` keeps the earliest key on ties and never selects NaN.
        if distance < mini:
            mini = distance
            best_id = key
            found = True

    if not found:
        raise ValueError(
            "cannot pick a centroid: centroid_list is empty or all distances are NaN/inf"
        )
    return best_id
    

In [56]:
def get_new_centroids(node_id_to_cluster_id:'node_id:cluster id', node_id_to_node_features:'node id: node vector'):
    """Recompute every cluster's centroid as the mean of its member vectors.

    Parameters
    ----------
    node_id_to_cluster_id : dict
        Maps node id -> id of the cluster the node is currently assigned to.
    node_id_to_node_features : dict
        Maps node id -> feature vector of that node.

    Returns
    -------
    collections.defaultdict
        Maps cluster id -> element-wise mean of the vectors assigned to it.
        Only clusters that have at least one member appear as keys.
    """
    # Group the feature vectors by the cluster they are assigned to.
    members_by_cluster = defaultdict(list)
    for node_id in node_id_to_cluster_id:
        assigned_cluster = node_id_to_cluster_id[node_id]
        members_by_cluster[assigned_cluster].append(node_id_to_node_features[node_id])

    # Mean of each group: summed vectors divided by the group size.
    centroid_list_new = defaultdict(list)
    centroid_list_new.update(
        (cluster_id, sum(vectors) / len(vectors))
        for cluster_id, vectors in members_by_cluster.items()
    )
    return centroid_list_new

In [97]:

def K_means(k, X):
    """Cluster the rows of `X` with k-means, starting from k random rows.

    Parameters
    ----------
    k : int
        Number of initial centroids; must satisfy 1 <= k <= len(X).
    X : sequence of rows
        Each row holds an identifier at position 0 followed by numeric
        feature values. If an identifier occurs more than once, the last
        row with that identifier wins.

    Returns
    -------
    tuple
        (centroid_dictionary, node_id_to_cluster_id, node_id_to_node_features):
        cluster id -> centroid vector, node id -> cluster id, and
        node id -> feature vector (as a float array).
        Fewer than k centroids may be returned, because a cluster that loses
        all of its members is not re-created by `get_new_centroids`.

    Raises
    ------
    ValueError
        If `k` is smaller than 1 or larger than the number of rows.
    """
    if not 1 <= k <= len(X):
        raise ValueError(
            "k must be between 1 and the number of rows ({}), got {}".format(len(X), k)
        )

    # The containers stay defaultdicts so the returned objects keep the type
    # that existing callers receive.
    node_id_to_node_features = defaultdict(list)  # node id -> feature vector
    centroid_dictionary = defaultdict(list)       # cluster id -> centroid vector
    node_id_to_cluster_id = defaultdict(list)     # node id -> cluster id

    # Store features as float arrays: rows taken from a mixed-type DataFrame are
    # object-dtype, which makes every distance computation needlessly slow.
    for node in X:
        node_id_to_node_features[node[0]] = np.asarray(node[1:], dtype=float)

    # Seed the centroids with k distinct, randomly chosen rows (copied).
    indexes = list(range(len(X)))
    random.shuffle(indexes)
    for i in range(k):
        centroid_dictionary[i] = np.array(X[indexes[i]][1:], dtype=float)

    while True:
        # Assignment step: count how many nodes change cluster in this pass.
        count = 0
        for node_id, node_features in node_id_to_node_features.items():
            id_best_centroid = get_id_best_centroid(node_features, centroid_dictionary)
            # A node that has no assignment yet always counts as a change.
            if (node_id not in node_id_to_cluster_id
                    or node_id_to_cluster_id[node_id] != id_best_centroid):
                count += 1
            node_id_to_cluster_id[node_id] = id_best_centroid

        # Converged: a full pass left every assignment untouched.
        if count == 0:
            return centroid_dictionary, node_id_to_cluster_id, node_id_to_node_features

        # Update step: get_new_centroids builds a fresh mapping, so no copy is needed.
        centroid_dictionary = get_new_centroids(node_id_to_cluster_id, node_id_to_node_features)

In [113]:
# Run k-means for ten randomly chosen cluster counts in [2, sqrt(len(X))) and
# write each cluster's member ids to clusters/cluster_number_<k>/cluster<j>.txt.
total_clusters = list(range(2, int(math.sqrt(len(X)))))
random.shuffle(total_clusters)
for k in total_clusters[:10]:
    print("Starting K means for k={}".format(k))
    cluster_id_to_centroid_features, node_id_to_cluster_id, node_id_to_node_features = K_means(k, X)
    print("K means finished!. Starting writing i")

    # exist_ok=True makes the cell re-runnable; the previous existence check
    # inspected os.path.dirname('cluster_number_<k>'), which is always ''.
    output_dir = 'clusters/cluster_number_{}'.format(k)
    os.makedirs(output_dir, exist_ok=True)

    # Group ids per cluster first so every file is opened exactly once.
    cluster_id_to_node_ids = defaultdict(list)
    for node_id, cluster_id in node_id_to_cluster_id.items():
        cluster_id_to_node_ids[cluster_id].append(node_id)

    # 'w' (not 'a+') so a re-run replaces the file instead of appending duplicates.
    for cluster_id, node_ids in cluster_id_to_node_ids.items():
        with open('{}/cluster{}.txt'.format(output_dir, cluster_id + 1), 'w') as f:
            f.writelines(str(node_id) + '\n' for node_id in node_ids)

k: 32


KeyboardInterrupt: 