In [11]:
import enum
import sys
from graphkitlearn.graphkitlearn.gklearn.utils import graphfiles
import networkx
import matplotlib.pyplot as plt
import numpy as np
from gklearn.utils import *
import os
import random
from gklearn.ged.env import GEDEnv
import numpy as np
from time import process_time
from sklearn.cluster import KMeans
from sklearn.metrics import *
import datetime
pathWeb = "./Web/Web/data/"
import pickle

In [12]:
def LoadData(filename, childrentagName):
    """Load every graph listed in a CXL index file together with its class label.

    Parameters
    ----------
    filename : str
        Path of the XML index file. Graph files are resolved relative to the
        directory that contains this file.
    childrentagName : str
        Tag (or ElementTree path) of the element whose descendants describe the
        graphs; each descendant must carry ``file`` and ``class`` attributes.

    Returns
    -------
    (list, list)
        The objects returned by ``graphfiles.loadGXL`` for each entry, and the
        matching ``class`` attribute values, in document order.

    Raises
    ------
    ValueError
        If ``childrentagName`` is not found in the document.
    KeyError
        If an entry lacks the ``file`` or ``class`` attribute.
    """
    import xml.etree.ElementTree as ET

    dirname_dataset = os.path.dirname(filename)
    root = ET.parse(filename).getroot()

    # Look the container up once instead of once per element.
    container = root.find(childrentagName)
    if container is None:
        raise ValueError(
            "tag {!r} not found in {!r}".format(childrentagName, filename))

    data = []
    y = []
    # iter() yields the container itself first; only its descendants are entries.
    for graph in container.iter():
        if graph is container:
            continue
        mol_filename = graph.attrib['file']
        mol_class = graph.attrib['class']
        # os.path.join keeps the path relative when the index file has no
        # directory part ('' + '/' + name would point at the filesystem root).
        data.append(graphfiles.loadGXL(os.path.join(dirname_dataset, mol_filename)))
        y.append(mol_class)
    return data, y

In [13]:
class GraphHelper:
    """Embed graphs as vectors of graph edit distances (GED) to prototype graphs.

    A set of prototype graphs is picked from the training data when the object
    is created; ``GraphToVector`` then maps any graph to the vector of its
    distances to those prototypes.

    Attributes
    ----------
    trainData : sequence of graphs the prototypes are chosen from.
    mProtoTypes : number of prototypes requested.
    selectedProtoTypes : list of indices into ``trainData`` of the prototypes.
    """

    def __init__(self, trainData, mProtoTypes):
        """Store the training graphs and select ``mProtoTypes`` prototypes.

        Raises ValueError (from ``SelectSpanningPrototypes``) when
        ``mProtoTypes`` is not between 1 and ``len(trainData)``.
        """
        self.trainData = trainData
        # Assigned before selection runs so the instance is fully populated
        # whatever the selection code decides to read.
        self.mProtoTypes = mProtoTypes
        self.InitializeGraphToVector(trainData, mProtoTypes)

    def GetDistanceBetweenGraphs(self, graph1, graph2):
        """Return the GED upper bound between two graphs, or -1 on failure.

        A fresh GED environment is built for every pair, using the 'CONSTANT'
        edit cost with constants [3, 3, 1, 3, 3, 1] and the 'BIPARTITE' method.
        If running the method raises, a RuntimeWarning is emitted and -1 is
        returned so that long batch computations are not aborted.
        """
        import warnings

        ged_env = GEDEnv()  # initialize GED environment.
        ged_env.set_edit_cost('CONSTANT',  # GED cost type.
                              edit_cost_constants=[3, 3, 1, 3, 3, 1]  # edit costs.
                              )
        ged_env.add_nx_graph(graph1, '')  # add graph1
        ged_env.add_nx_graph(graph2, '')  # add graph2
        listID = ged_env.get_all_graph_ids()  # get list IDs of graphs
        ged_env.init(init_type='LAZY_WITHOUT_SHUFFLED_COPIES')  # initialize GED environment.
        options = {'initialization_method': 'RANDOM',  # or 'NODE', etc.
                   'threads': 1  # parallel threads.
                   }
        ged_env.set_method('BIPARTITE',  # GED method.
                           options  # options for GED method.
                           )
        ged_env.init_method()  # initialize GED method.
        try:
            ged_env.run_method(listID[0], listID[1])  # run.
            return ged_env.get_upper_bound(listID[0], listID[1])
        except Exception as e:
            # Keep the best-effort contract (-1 marks a failed pair) but make
            # the failure visible instead of swallowing it silently.
            warnings.warn(
                "GED computation failed, using distance -1: {!r}".format(e),
                RuntimeWarning)
            return -1

    def MaxEditDistance(self, graphSets, nodes, addedIndices):
        """Return the index of the not-yet-added graph that is farthest from ``nodes``.

        For every graph in ``graphSets`` whose index is not in ``addedIndices``
        the largest distance to any graph in ``nodes`` is computed; the index
        with the greatest such value wins (first one on ties). Returns -1 only
        when every index is already in ``addedIndices``.

        NOTE(review): the usual "spanning" prototype selector maximizes the
        *minimum* distance to the chosen prototypes; this code maximizes the
        maximum. Behavior is kept as-is - confirm which one is intended.
        """
        excluded = set(addedIndices)
        # Start below any possible distance: a failed pair yields -1, and with a
        # -1 starting value such graphs could never be selected, so the -1
        # "not found" index leaked out and aliased graphSets[-1].
        maxValue = -np.inf
        maxIndex = -1
        for graphIndex, graph in enumerate(graphSets):
            if graphIndex in excluded:
                continue  # no need to compute distances for excluded graphs
            rowMax = max(self.GetDistanceBetweenGraphs(graph, node) for node in nodes)
            if rowMax > maxValue:
                maxValue = rowMax
                maxIndex = graphIndex
        return maxIndex

    def SelectSpanningPrototypes(self, graphData, mprototypes):
        """Pick ``mprototypes`` distinct prototype indices from ``graphData``.

        The first prototype is drawn with ``random.randrange`` (seed the
        ``random`` module beforehand for reproducible runs). Each further
        prototype is the unselected graph with the largest distance to any
        already selected prototype, i.e. the same criterion as
        ``MaxEditDistance``.

        Raises
        ------
        ValueError
            If ``mprototypes`` is not between 1 and ``len(graphData)``.
        """
        graphCount = len(graphData)
        if not 1 <= mprototypes <= graphCount:
            raise ValueError(
                "mprototypes must be between 1 and {} (got {})".format(
                    graphCount, mprototypes))

        choiceIndex = random.randrange(graphCount)
        graphSelectedIndex = [choiceIndex]

        # farthest[i] = largest distance from graph i to any prototype chosen so
        # far. Updating it with only the newest prototype each round gives the
        # same maxima as recomputing every pair, with far fewer GED runs.
        farthest = np.full(graphCount, -np.inf)
        for _ in range(mprototypes - 1):
            newestPrototype = graphData[graphSelectedIndex[-1]]
            taken = set(graphSelectedIndex)
            bestValue = -np.inf
            bestIndex = -1
            for graphIndex, graph in enumerate(graphData):
                if graphIndex in taken:
                    continue
                dis = self.GetDistanceBetweenGraphs(graph, newestPrototype)
                if dis > farthest[graphIndex]:
                    farthest[graphIndex] = dis
                if farthest[graphIndex] > bestValue:
                    bestValue = farthest[graphIndex]
                    bestIndex = graphIndex
            graphSelectedIndex.append(bestIndex)
        return graphSelectedIndex

    def GraphToVector(self, graphSet):
        """Return a ``(len(graphSet), n_prototypes)`` array of distances.

        Entry ``[row][col]`` is the distance between ``graphSet[row]`` and the
        ``col``-th selected prototype (-1 where the distance computation failed).
        """
        prototypes = [self.trainData[index] for index in self.selectedProtoTypes]
        # Size by the prototypes actually selected so no column is left unfilled.
        vectorMatrix = np.empty(shape=(len(graphSet), len(prototypes)))
        for row, graph in enumerate(graphSet):
            for col, prototype in enumerate(prototypes):
                vectorMatrix[row][col] = self.GetDistanceBetweenGraphs(graph, prototype)
        return vectorMatrix

    def InitializeGraphToVector(self, graphSet, mprotoTypes):
        """Select the prototypes; the embedding itself is computed on demand
        by ``GraphToVector``."""
        self.selectedProtoTypes = self.SelectSpanningPrototypes(graphSet, mprotoTypes)

In [14]:
# Read the train / validation / test index files of the Web dataset. Each
# LoadData call returns (graphs, class labels) for the "fingerprints" entries.
(XtrainWeb, y_train), (XvalidateWeb, y_validate), (XtestWeb, y_test) = (
    LoadData(pathWeb + cxl_name, "fingerprints")
    for cxl_name in ("train.cxl", "valid.cxl", "test.cxl")
)

In [None]:
# Build the prototype-based embedder: selects 10 prototype graphs from the
# training set. process_time() counts CPU time of this process, not wall-clock.
t1_start = process_time() 
graph = GraphHelper(XtrainWeb, 10)
t1_stop = process_time()
# CPU seconds spent on prototype selection.
print(t1_stop - t1_start)
now = datetime.datetime.now()
print("Completed Prototype Selection at ", str(now))

In [None]:
# Embed the training graphs: one row per graph, one column per prototype.
t1_start = process_time() 
trainVector = graph.GraphToVector(XtrainWeb)
t1_stop = process_time()
# CPU seconds (process_time), not wall-clock time.
print(t1_stop - t1_start)
now = datetime.datetime.now()
print("Completed Train Conversion at ", str(now))

In [None]:
# Embed the test graphs against the prototypes chosen from the training set.
t1_start = process_time() 
testVector = graph.GraphToVector(XtestWeb)
t1_stop = process_time()
# CPU seconds (process_time), not wall-clock time.
print(t1_stop - t1_start)
now = datetime.datetime.now()
print("Completed Test at ", str(now))



In [None]:
# Embed the validation graphs against the prototypes chosen from the training set.
t1_start = process_time() 
validationVector = graph.GraphToVector(XvalidateWeb)
t1_stop = process_time()
# CPU seconds (process_time), not wall-clock time.
print(t1_stop - t1_start)
now = datetime.datetime.now()
print("Completed Validation at ", str(now))


## Saving the computed Data

In [None]:
# Persist the embeddings so the expensive distance computations need not be
# repeated. np.save appends ".npy" to these names (the load cell reads
# "TrainVectorWeb.npy" etc.).
np.save("TrainVectorWeb", trainVector)
np.save("TestVectorWeb", testVector)
np.save("validateVectorWeb", validationVector)
# Pickle the helper as well: it holds the training graphs and the indices of
# the selected prototypes.
with open('GraphHelperObjWeb', 'wb') as config_dictionary_file:
    pickle.dump(graph, config_dictionary_file)

## Loading from saved files 


In [None]:
# Restore the embeddings written by the save cell.
trainVector = np.load("TrainVectorWeb.npy")
testVector = np.load("TestVectorWeb.npy")
validationVector = np.load("validateVectorWeb.npy")
# SECURITY: pickle.load can execute arbitrary code from the file - only load a
# 'GraphHelperObjWeb' file that this notebook produced itself.
# The GraphHelper class must already be defined when unpickling.
with open('GraphHelperObjWeb', 'rb') as config_dictionary_file:
    graph = pickle.load(config_dictionary_file)

In [None]:
from sklearn.metrics.pairwise import euclidean_distances

def delta_fast(ck, cl, distances):
    """Smallest non-zero distance between a point of cluster k and one of cluster l.

    Parameters
    ----------
    ck, cl : boolean masks over the points, selecting each cluster's members.
    distances : square array of pairwise distances between all points.

    Returns
    -------
    The minimum non-zero entry of the ck-by-cl sub-matrix, or 0 when every
    entry is zero (the clusters consist of coincident points).

    Raises
    ------
    ValueError
        If either mask selects no point.

    NOTE(review): zero distances are ignored whenever a non-zero one exists, so
    clusters sharing a duplicated point still report a positive separation.
    Kept from the original implementation - confirm this is wanted.
    """
    rows = np.flatnonzero(ck)
    cols = np.flatnonzero(cl)
    if rows.size == 0 or cols.size == 0:
        raise ValueError("delta_fast needs at least one point in each cluster")

    values = distances[np.ix_(rows, cols)]
    values = values[np.nonzero(values)]
    if values.size == 0:
        # All pairwise distances are zero: np.min would raise on the empty
        # selection, while the true separation is simply zero.
        return values.dtype.type(0)

    return np.min(values)
    
def big_delta_fast(ci, distances):
    """Largest pairwise distance between the points selected by the mask ``ci``.

    ``distances`` is the square matrix of pairwise distances between all
    points. Zero entries (such as a point's distance to itself) take part in
    the maximum, so a one-point cluster yields 0.
    """
    members = np.flatnonzero(ci)
    within = distances[np.ix_(members, members)]
    return within.max()

def dunn_fast(points, labels):
    """ Dunn index - FAST (using sklearn pairwise euclidean_distance function)

    The index is the smallest inter-cluster separation (``delta_fast``) divided
    by the largest cluster diameter (``big_delta_fast``).

    Parameters
    ----------
    points : np.array
        np.array([N, p]) of all points
    labels: np.array
        np.array([N]) labels of all points (any array-like is accepted)

    Returns
    -------
    The Dunn index as a numpy float. When every cluster has zero diameter the
    division yields inf/nan under numpy's floating-point rules.

    Raises
    ------
    ValueError
        If ``labels`` contains fewer than two distinct values; the index is
        undefined without at least one pair of clusters.
    """
    labels = np.asarray(labels)
    distances = euclidean_distances(points)
    ks = np.unique(labels)  # np.unique already returns sorted values

    if len(ks) < 2:
        raise ValueError(
            "Dunn index needs at least 2 clusters, got {}".format(len(ks)))

    # Unfilled entries (the diagonal) must never win the minimum. A finite
    # placeholder such as 1000000 would cap the result whenever the real
    # separations are larger, so use infinity.
    deltas = np.full([len(ks), len(ks)], np.inf)
    big_deltas = np.zeros([len(ks), 1])

    masks = [labels == value for value in ks]

    for k, mask_k in enumerate(masks):
        for l, mask_l in enumerate(masks):
            if l == k:
                continue
            deltas[k, l] = delta_fast(mask_k, mask_l, distances)

        big_deltas[k] = big_delta_fast(mask_k, distances)

    di = np.min(deltas)/np.max(big_deltas)
    return di

In [None]:
# Cluster the prototype-distance embedding of the training graphs.
# random_state=0 fixes the KMeans initialization for repeatable runs.
# NOTE(review): 22 is presumably the number of classes in the dataset - confirm.
kmeans = KMeans(n_clusters=22, random_state=0).fit(trainVector)

In [None]:
from sklearn.metrics.cluster import rand_score

# Compare the KMeans cluster assignments with the class labels on every split.
# The printed label says "Accuracy", but the value is sklearn's rand_score.
for message, truth, vectors in (
    ('Accuracy of Train:{0:f}', y_train, trainVector),
    ('Accuracy of Validation:{0:f}', y_validate, validationVector),
    ('Accuracy of Test:{0:f}', y_test, testVector),
):
    score = rand_score(truth, kmeans.predict(vectors))
    print(message.format(score))

In [None]:
class GraphHelperGM:
    """Embed graphs as their GED upper bounds to *every* training graph."""

    def __init__(self, trainData):
        """Keep the training graphs; they act as the embedding's reference set."""
        self.trainData = trainData

    def GraphToVector(self, graphSet):
        """Return a ``(len(graphSet), len(self.trainData))`` distance matrix.

        All graphs are registered in a single GED environment (the graphs of
        ``graphSet`` first, then the training graphs), and entry ``[row, col]``
        is the upper bound reported for ``graphSet[row]`` versus
        ``self.trainData[col]``. Errors raised by the GED environment propagate.
        """
        query_count = len(graphSet)
        reference_count = len(self.trainData)

        env = GEDEnv()  # one shared GED environment for the whole batch
        env.set_edit_cost('CONSTANT', edit_cost_constants=[3, 3, 1, 3, 3, 1])

        # Registration order matters: ids are later looked up by position,
        # queries first and references after them.
        for collection in (graphSet, self.trainData):
            for member in collection:
                env.add_nx_graph(member, '')
        graph_ids = env.get_all_graph_ids()

        env.init(init_type='LAZY_WITHOUT_SHUFFLED_COPIES')
        method_options = {'initialization_method': 'RANDOM', 'threads': 1}
        env.set_method('BIPARTITE', method_options)
        env.init_method()

        embedding = np.empty(shape=(query_count, reference_count))
        for row in range(query_count):
            for col in range(reference_count):
                query_id = graph_ids[row]
                reference_id = graph_ids[query_count + col]
                env.run_method(query_id, reference_id)
                embedding[row, col] = env.get_upper_bound(query_id, reference_id)
        return embedding

In [None]:
# Embedder that uses every training graph as a reference (no prototype
# selection), so each embedded vector has len(XtrainWeb) components.
graphGM = GraphHelperGM(XtrainWeb)

In [None]:
# Embed the training graphs against all training graphs (full distance matrix).
# process_time() counts CPU time of this process, not wall-clock time.
t1_start = process_time()
trainVectorGM = graphGM.GraphToVector(XtrainWeb)
t1_stop = process_time()
print(t1_stop - t1_start)
now = datetime.datetime.now()
# The message previously said "Test" although this cell converts the train split.
print("Completed Train Conversion at ", str(now))

In [None]:
# Embed the test graphs against all training graphs.
t1_start = process_time() 
testVectorGM = graphGM.GraphToVector(XtestWeb)
t1_stop = process_time()
# CPU seconds (process_time), not wall-clock time.
print(t1_stop - t1_start)
now = datetime.datetime.now()
print("Completed Test at ", str(now))

In [None]:
# Embed the validation graphs against all training graphs.
# process_time() counts CPU time of this process, not wall-clock time.
t1_start = process_time()
validationVectorGM = graphGM.GraphToVector(XvalidateWeb)
t1_stop = process_time()
print(t1_stop - t1_start)
now = datetime.datetime.now()
# The message previously said "Test" although this cell converts the validation split.
print("Completed Validation at ", str(now))

## Saving Data

In [None]:
# Persist the full-distance embeddings; np.save appends ".npy" to these names
# (the load cell reads "TrainVectorWebGM.npy" etc.).
np.save("TrainVectorWebGM", trainVectorGM)
np.save("TestVectorWebGM", testVectorGM)
np.save("validateVectorWebGM", validationVectorGM)
# Pickle the helper too; it carries the training graphs used as references.
with open('GraphHelperObjWebGM', 'wb') as config_dictionary_file:
    pickle.dump(graphGM, config_dictionary_file)

## Loading Data

In [None]:
# Restore the embeddings written by the save cell.
trainVectorGM = np.load("TrainVectorWebGM.npy")
testVectorGM = np.load("TestVectorWebGM.npy")
validationVectorGM = np.load("validateVectorWebGM.npy")
# SECURITY: pickle.load can execute arbitrary code from the file - only load a
# 'GraphHelperObjWebGM' file that this notebook produced itself.
# The GraphHelperGM class must already be defined when unpickling.
with open('GraphHelperObjWebGM', 'rb') as config_dictionary_file:
    graphGM = pickle.load(config_dictionary_file)

In [None]:
from sklearn.metrics.cluster import rand_score
from sklearn.metrics.cluster import homogeneity_score
from sklearn.preprocessing import LabelEncoder


# Fit KMeans on the full-distance embedding of the training graphs, then report
# for each split: Rand index and homogeneity of the clustering against the
# class labels, plus the Dunn index of the predicted clusters and of the
# ground-truth classes (labels are integer-encoded for dunn_fast).
kmeansGM = KMeans(n_clusters=22, random_state=0).fit(trainVectorGM)
pred = kmeansGM.predict(trainVectorGM)
score = rand_score(y_train,pred)
print('Rand Index Accuracy of Train:{0:f}'.format(score))
print("Homogenity Score of Train %.6f" % homogeneity_score(y_train, pred))
print('Dunn Index of the Cluster is :{0:f}'.format(dunn_fast(trainVectorGM, pred)))
le = LabelEncoder()
y_train_labels = le.fit_transform(y_train)
print('Dunn Index of the Ground Truth is :{0:f}'.format(dunn_fast(trainVectorGM, y_train_labels)))


print()
print()
pred = kmeansGM.predict(validationVectorGM)
score = rand_score(y_validate,pred)
print('Rand Index Accuracy of Validation:{0:f}'.format(score))
print("Homogenity Score of Validation %.6f" % homogeneity_score(y_validate, pred))
print('Dunn Index of the Cluster is :{0:f}'.format(dunn_fast(validationVectorGM, pred)))
le = LabelEncoder()
y_validate_labels = le.fit_transform(y_validate)
print('Dunn Index of the Ground Truth is :{0:f}'.format(dunn_fast(validationVectorGM, y_validate_labels)))

print()
print()
pred = kmeansGM.predict(testVectorGM)
score = rand_score(y_test,pred)
print('Rand Index Accuracy of Test:{0:f}'.format(score))
print("Homogenity Score of Test %.6f" % homogeneity_score(y_test, pred))
print('Dunn Index of the Cluster is :{0:f}'.format(dunn_fast(testVectorGM, pred)))
le = LabelEncoder()
# Use a dedicated name: this used to overwrite y_train_labels with the encoded
# test labels, leaving a mislabeled variable in the notebook state.
y_test_labels = le.fit_transform(y_test)
print('Dunn Index of the Ground Truth is :{0:f}'.format(dunn_fast(testVectorGM, y_test_labels)))