In [2]:
import enum
import sys
from graphkitlearn.graphkitlearn.gklearn.utils import graphfiles
import networkx
import matplotlib.pyplot as plt
import numpy as np
from gklearn.utils import *
import os
import random
from gklearn.ged.env import GEDEnv
import numpy as np
from time import process_time
from sklearn.cluster import KMeans
from sklearn.metrics import *
import datetime
pathAids = "./AIDS/AIDS/data/"
path1Grec = "./GREC/GREC/data/"
import pickle

In [3]:
def LoadData(filename, childrentagName):
    import xml.etree.ElementTree as ET
    dirname_dataset = os.path.dirname(filename)
    tree = ET.parse(filename)
    root = tree.getroot()
    data = []
    y = []
    children = list([elem for elem in root.find(childrentagName).iter() if elem is not root.find(childrentagName)])
    for graph in children:
        mol_filename = graph.attrib['file']
        mol_class = graph.attrib['class']
        data.append(graphfiles.loadGXL(dirname_dataset + '/' + mol_filename))
        y.append(mol_class)
    return data, y


In [4]:
class GraphHelper:
    def __init__(self, trainData, mProtoTypes):
        self.trainData = trainData
        self.InitializeGraphToVector(trainData, mProtoTypes)
        self.mProtoTypes = mProtoTypes
        
    def GetDistanceBetweenGraphs(self,graph1, graph2):
        ged_env = GEDEnv() # initailize GED environment.
        ged_env.set_edit_cost('CONSTANT', # GED cost type.
                            edit_cost_constants=[3, 3, 1, 3, 3, 1] # edit costs.
                            )  
        ged_env.add_nx_graph(graph1, '') # add graph1
        ged_env.add_nx_graph(graph2, '') # add graph2
        listID = ged_env.get_all_graph_ids() # get list IDs of graphs
        ged_env.init(init_type='LAZY_WITHOUT_SHUFFLED_COPIES') # initialize GED environment.
        options = {'initialization_method': 'RANDOM', # or 'NODE', etc.
                'threads': 1 # parallel threads.
                }
        ged_env.set_method('BIPARTITE', # GED method.
                        options # options for GED method.
                        )
        ged_env.init_method() # initialize GED method.

        ged_env.run_method(listID[0], listID[1]) # run.
        dis = ged_env.get_upper_bound(listID[0], listID[1])
        return dis

    def MaxEditDistance(self, graphSets, nodes, addedIndices):
        distanceVector = np.empty(shape=(len(graphSets ), len(nodes)))
        for graphIndex,graph in enumerate(graphSets):
            for nodeIndex,node in enumerate(nodes):
                dis = self.GetDistanceBetweenGraphs(graph, node)
                distanceVector[graphIndex][nodeIndex] = dis
        maxValue = -1
        maxIndex = -1
        for graphIndex in range(len(graphSets)):
            if(graphIndex not in addedIndices):
                maxDistanceIndex = np.argmax(distanceVector[graphIndex])
                if(distanceVector[graphIndex][maxDistanceIndex] > maxValue):
                    maxValue = distanceVector[graphIndex][maxDistanceIndex]
                    maxIndex = graphIndex

        return maxIndex

    def SelectSpanningPrototypes(self, graphData, mprototypes):
        choiceIndex = random.randrange(len(graphData))
        graphSelected = [graphData[choiceIndex]]
        graphSelectedIndex = [choiceIndex]

        for selectors in range(mprototypes - 1):
            maxEditDistanceIndex = self.MaxEditDistance(graphData, graphSelected, graphSelectedIndex)
            graphSelectedIndex.append(maxEditDistanceIndex)
            graphSelected.append(graphData[maxEditDistanceIndex])
        return graphSelectedIndex
    
    def GraphToVector(self, graphSet):
        vectorMatrix = np.empty(shape= (len(graphSet), self.mProtoTypes))
        for row, graph in enumerate(graphSet):
            for col,prototypeIndex in enumerate(self.selectedProtoTypes):
                vectorMatrix[row][col] = self.GetDistanceBetweenGraphs(graph, self.trainData[prototypeIndex])
        return vectorMatrix
    
    def InitializeGraphToVector(self, graphSet, mprotoTypes):
        self.selectedProtoTypes = self.SelectSpanningPrototypes(graphSet, mprotoTypes)
        #return self.GraphToVector(graphSet)

In [5]:
XtrainAids, y_train = LoadData(pathAids +"train.cxl", "fingerprints")
XvalidateAids, y_validate = LoadData(pathAids +"valid.cxl", "fingerprints")
XtestAids, y_test = LoadData(pathAids +"test.cxl", "fingerprints")

In [5]:
t1_start = process_time() 
graph = GraphHelper(XtrainAids, 10)
t1_stop = process_time()
print(t1_stop - t1_start)
now = datetime.datetime.now()
print("Completed Prototype Selection at ", str(now))

853.046875
Completed Prototype Selection at  2021-04-24 13:18:20.758335


In [6]:
t1_start = process_time() 
trainVector = graph.GraphToVector(XtrainAids)
t1_stop = process_time()
print(t1_stop - t1_start)
now = datetime.datetime.now()
print("Completed Train Conversion at ", str(now))

171.171875
Completed Train Conversion at  2021-04-24 13:21:13.061440


In [7]:
t1_start = process_time() 
testVector = graph.GraphToVector(XtestAids)
t1_stop = process_time()
print(t1_stop - t1_start)
now = datetime.datetime.now()
print("Completed Test at ", str(now))



1034.9375
Completed Test at  2021-04-24 13:38:34.677509


In [8]:
t1_start = process_time() 
validationVector = graph.GraphToVector(XvalidateAids)
t1_stop = process_time()
print(t1_stop - t1_start)
now = datetime.datetime.now()
print("Completed Validation at ", str(now))


172.8125
Completed Validation at  2021-04-24 13:41:28.737996


## Saving the computed Data

In [9]:
np.save("TrainVectorAids", trainVector)
np.save("TestVectorAids", testVector)
np.save("validateVectorAids", validationVector)
with open('GraphHelperObjAids', 'wb') as config_dictionary_file:
    pickle.dump(graph, config_dictionary_file)

## Loading from saved files 


In [15]:
trainVector = np.load("TrainVectorAids.npy")
testVector = np.load("TestVectorAids.npy")
validationVector = np.load("validateVectorAids.npy")
with open('GraphHelperObjAids', 'rb') as config_dictionary_file:
    graph = pickle.load(config_dictionary_file)

In [51]:
import math
import warnings



def get_intra_cluster_distance(cluster_centroids,train_data_labels, trainVector) :
    
    number_of_clusters = cluster_centroids.shape[0]
    
    dist_list = []
    for i in range(number_of_clusters) :
        dist_list.append(0)
    
    for i in range(trainVector.shape[0]) :
        
        cluster_number = train_data_labels[i]
        eucl_dist = cluster_centroids[cluster_number] - trainVector[i]
        eucl_dist = np.square(eucl_dist)
        eucl_dist = np.sum(eucl_dist)
        eucl_dist = math.sqrt(eucl_dist)
        
        dist_list[cluster_number] = dist_list[cluster_number] + eucl_dist
    #for i in range(number_of_clusters) :
    #    dist_list[i] /= np.where(train_data_labels == i)[0].shape[0]
    return max(dist_list)


from scipy.spatial.distance import cdist, euclidean

def geometric_median_weinsfeild(X, eps=1e-5):
    y = np.mean(X, 0)

    while True:
        D = cdist(X, [y])
        nonzeros = (D != 0)[:, 0]

        Dinv = 1 / D[nonzeros]
        Dinvs = np.sum(Dinv)
        W = Dinv / Dinvs
        T = np.sum(W * X[nonzeros], 0)

        num_zeros = len(X) - np.sum(nonzeros)
        if num_zeros == 0:
            y1 = T
        elif num_zeros == len(X):
            return y
        else:
            R = (T - y) * Dinvs
            r = np.linalg.norm(R)
            rinv = 0 if r == 0 else num_zeros/r
            y1 = max(0, 1-rinv)*T + min(1, rinv)*y

        if euclidean(y, y1) < eps:
            return y1

        y = y1

In [52]:
def get_inter_cluster_distance(cluster_centroids) :
    
    number_of_clusters = cluster_centroids.shape[0]
    intra_cluster_list = []
    
    for i in range(number_of_clusters) :
        for j in range(i+1,number_of_clusters) :
            eucl_dist = cluster_centroids[i] - cluster_centroids[j]
            eucl_dist = np.square(eucl_dist)
            eucl_dist = np.sum(eucl_dist)
            eucl_dist = math.sqrt(eucl_dist)
            
            intra_cluster_list.append(eucl_dist)
    
    return min(intra_cluster_list)

In [53]:
def return_dunn_index(kmeans_object, trainData, givenLables = None) :
    cluster_centroids = kmeans_object.cluster_centers_
    if(givenLables is None):
        train_data_labels = kmeans_object.labels_
    else:
        train_data_labels = givenLables
    
    #print(type(cluster_centroids))
    #print(type(train_data_labels))
    #print(cluster_centroids.shape)
    #print(train_data_labels.shape)
    #print(kmeans.inertia_)
    
    intra_cluster_dist = get_intra_cluster_distance(cluster_centroids,train_data_labels, trainData)
    inter_cluster_dist = get_inter_cluster_distance(cluster_centroids)
    
    return inter_cluster_dist/intra_cluster_dist

In [16]:
kmeans = KMeans(n_clusters=2, random_state=0).fit(trainVector)

In [22]:
from sklearn.metrics.cluster import rand_score
from sklearn.metrics.cluster import homogeneity_score

pred = kmeans.predict(trainVector)
score = rand_score(y_train,pred)
print('Rand Index Accuracy of Train:{0:f}'.format(score))
print("Homogenity Score of Train %.6f" % homogeneity_score(y_train, pred))

print()
print()
pred = kmeans.predict(validationVector)
score = rand_score(y_validate,pred)
print('Rand Index Accuracy of Validation:{0:f}'.format(score))
print("Homogenity Score of Validation %.6f" % homogeneity_score(y_validate, pred))


print()
print()
pred = kmeans.predict(testVector)
score = rand_score(y_test,pred)
print('Rand Index Accuracy of Test:{0:f}'.format(score))
print("Homogenity Score of Test %.6f" % homogeneity_score(y_test, pred))

print()
print()
print('Dunn Index of the Cluster is :{0:f}'.format(return_dunn_index(kmeans, trainVector)))


Rand Index Accuracy of Train:0.781880
Homogenity Score of Train 0.271935


Rand Index Accuracy of Validation:0.781880
Homogenity Score of Validation 0.271935


Rand Index Accuracy of Test:0.780604
Homogenity Score of Test 0.266596


Dunn Index of the Cluster is :0.057186


In [13]:
from sklearn import svm
clf = svm.SVC()
clf.fit(trainVector, y_train)

SVC()

In [14]:
svmPredtrain = clf.predict(trainVector)
score = accuracy_score(y_train,svmPredtrain)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.972000


In [15]:
svmPredvalidate = clf.predict(validationVector)
score = accuracy_score(y_validate,svmPredvalidate)
print('Accuracy Validate:{0:f}'.format(score))

svmPredtest = clf.predict(testVector)
score = accuracy_score(y_test,svmPredtest)
print('Accuracy Test:{0:f}'.format(score))

Accuracy Validate:0.972000
Accuracy Test:0.960667


In [36]:
print('Dunn Index of the Cluster is :{0:f}'.format(return_dunn_index(kmeans, trainVector)))

Dunn Index of the Cluster is :0.057186


In [16]:
print(testVector.shape)
print(validationVector.shape)
print(trainVector.shape)

(1500, 10)
(250, 10)
(250, 10)


In [10]:
geometric_median_weinsfeild(trainVector)

array([ 69.34128354, 510.22491515, 336.46999172, 343.71676551,
       234.77264256, 353.8099154 , 242.16562397, 165.87642801,
       289.06716178, 147.81402484])

# Generalized Median

In [23]:
class GraphHelperGM:
    def __init__(self, trainData):
        self.trainData = trainData
        
    def GraphToVector(self,graphSet):
        ged_env = GEDEnv() # initailize GED environment.
        ged_env.set_edit_cost('CONSTANT', # GED cost type.
                            edit_cost_constants=[3, 3, 1, 3, 3, 1] # edit costs.
                            )  
        for graph in graphSet:
            ged_env.add_nx_graph(graph, '') # add graph1
        for graph in self.trainData:
            ged_env.add_nx_graph(graph, '') # add graph1
        listID = ged_env.get_all_graph_ids() # get list IDs of graphs
        ged_env.init(init_type='LAZY_WITHOUT_SHUFFLED_COPIES') # initialize GED environment.
        options = {'initialization_method': 'RANDOM', # or 'NODE', etc.
                'threads': 1 # parallel threads.
                }
        ged_env.set_method('BIPARTITE', # GED method.
                        options # options for GED method.
                        )
        ged_env.init_method() # initialize GED method.
        
        
        vectorMatrix = np.empty(shape= (len(graphSet), len(self.trainData)))
        for row, graph in enumerate(graphSet):
            for col  in range(len(self.trainData)):
                ged_env.run_method(listID[row], listID[len(graphSet) + col]) # run.
                dis = ged_env.get_upper_bound(listID[row], listID[len(graphSet) + col])
                vectorMatrix[row][col] = dis
        return vectorMatrix

In [25]:
graphGM = GraphHelperGM(XtrainAids)


In [26]:
t1_start = process_time() 
trainVectorGM = graphGM.GraphToVector(XtrainAids)
t1_stop = process_time()
print(t1_stop - t1_start)
now = datetime.datetime.now()
print("Completed Test at ", str(now))

1448.453125
Completed Test at  2021-04-27 16:23:26.291111


In [27]:
t1_start = process_time() 
testVectorGM = graphGM.GraphToVector(XtestAids)
t1_stop = process_time()
print(t1_stop - t1_start)
now = datetime.datetime.now()
print("Completed Test at ", str(now))

9167.9375
Completed Test at  2021-04-27 19:11:36.378615


In [28]:
t1_start = process_time() 
validationVectorGM = graphGM.GraphToVector(XvalidateAids)
t1_stop = process_time()
print(t1_stop - t1_start)
now = datetime.datetime.now()
print("Completed Test at ", str(now))

1479.140625
Completed Test at  2021-04-27 19:36:36.678905


## Saving Data Grec GM

In [29]:
np.save("TrainVectorGRECGM", trainVectorGM)
np.save("TestVectorGRECGM", testVectorGM)
np.save("validateVectorGRECGM", validationVectorGM)
with open('GraphHelperObjGRECGM', 'wb') as config_dictionary_file:
    pickle.dump(graphGM, config_dictionary_file)

## Loading Data Grec GM

In [30]:
trainVectorGM = np.load("TrainVectorGRECGM.npy")
testVectorGM = np.load("TestVectorGRECGM.npy")
validationVectorGM = np.load("validateVectorGRECGM.npy")
with open('GraphHelperObjGRECGM', 'rb') as config_dictionary_file:
    graphGM = pickle.load(config_dictionary_file)

In [54]:
from sklearn.metrics.cluster import rand_score
from sklearn.metrics.cluster import homogeneity_score
from sklearn.preprocessing import LabelEncoder  


kmeansGM = KMeans(n_clusters=2, random_state=0).fit(trainVectorGM)
pred = kmeansGM.predict(trainVectorGM)
score = rand_score(y_train,pred)
print('Rand Index Accuracy of Train:{0:f}'.format(score))
print("Homogenity Score of Train %.6f" % homogeneity_score(y_train, pred))

print()
print()
pred = kmeansGM.predict(validationVectorGM)
score = rand_score(y_validate,pred)
print('Rand Index Accuracy of Validation:{0:f}'.format(score))
print("Homogenity Score of Validation %.6f" % homogeneity_score(y_validate, pred))


print()
print()
pred = kmeansGM.predict(testVectorGM)
score = rand_score(y_test,pred)
print('Rand Index Accuracy of Test:{0:f}'.format(score))
print("Homogenity Score of Test %.6f" % homogeneity_score(y_test, pred))

print()
print()
print('Dunn Index of the Cluster is :{0:f}'.format(return_dunn_index(kmeansGM, trainVectorGM)))

le = LabelEncoder()
y_train_labels = le.fit_transform(y_train)
print("Dunn Index of Ground Truth is : {0:f}".format(return_dunn_index(kmeansGM, trainVectorGM, y_train_labels)))

Rand Index Accuracy of Train:0.825735
Homogenity Score of Train 0.390315


Rand Index Accuracy of Validation:0.825735
Homogenity Score of Validation 0.390315


Rand Index Accuracy of Test:0.829558
Homogenity Score of Test 0.399280


Dunn Index of the Cluster is :3.175717
Dunn Index of Ground Truth is : 0.969642


In [43]:
print(y_train_labels)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
