In [1]:
import keras
from keras.models import Sequential, Model
from keras.callbacks import EarlyStopping
from keras.layers import Input, Dense, Activation

Using Theano backend.


In [2]:
import numpy as np

In [3]:
from keras import backend as K

In [4]:
from collections import defaultdict, Counter
from scipy.sparse import csr_matrix

class MultiGraph:
    """Directed graph with labelled edges, stored as per-relation triplets.

    Node and relation labels are mapped to consecutive integer indices in the
    order in which they are first seen by ``add_connection``.

    Attributes:
        n_nodes: number of distinct node labels seen so far.
        n_rels: number of distinct relation labels seen so far.
        nodes: dict mapping node label -> node index.
        rels: dict mapping relation label -> relation index.
        node_labels: list where ``node_labels[i]`` is the label of node ``i``.
        rel_labels: list where ``rel_labels[k]`` is the label of relation ``k``.
        sparse_graph: dict mapping relation index -> ``[rows, cols, data]``
            lists (source indices, target indices, edge values).
        rel_counter: Counter mapping relation label -> number of edges added.
    """

    def __init__(self):
        self.n_nodes = 0
        self.n_rels = 0
        self.nodes = {}
        self.rels = {}
        self.node_labels = []
        self.rel_labels = []
        self.sparse_graph = {}  # {relation index: [row, col, data]}
        self.rel_counter = Counter()

    def _node_index(self, label):
        """Return the index of node ``label``, registering it if it is new."""
        # Test membership on the dict itself: on Python 2 `.keys()` builds a
        # list, which makes every lookup O(n_nodes).
        if label not in self.nodes:
            self.nodes[label] = self.n_nodes
            self.node_labels.append(label)
            self.n_nodes += 1
        return self.nodes[label]

    def _relation_index(self, label):
        """Return the index of relation ``label``, registering it if it is new."""
        if label not in self.rels:
            self.rels[label] = self.n_rels
            self.rel_labels.append(label)
            self.sparse_graph[self.n_rels] = [[], [], []]  # [row, col, data]
            self.n_rels += 1
        return self.rels[label]

    def add_connection(self, connection):
        """Add one directed edge to the graph.

        Args:
            connection: iterable of exactly three items,
                ``(source, relation, target)``.

        Raises:
            ValueError: if ``connection`` does not unpack into three items.
        """
        src, rel, targ = connection
        # Register the source before the target so indices follow the order
        # of first appearance.
        src_idx = self._node_index(src)
        targ_idx = self._node_index(targ)
        rel_idx = self._relation_index(rel)
        # count number of edges per relation
        self.rel_counter[rel] += 1
        # add new connection to graph
        rows, cols, data = self.sparse_graph[rel_idx]
        rows.append(src_idx)
        cols.append(targ_idx)
        data.append(1)

    def get_adjacency_matrix(self):
        """Return the out-going adjacency matrix over all relations.

        The columns of the per-relation adjacency matrices are interleaved:
        an edge ``source -> target`` with relation index ``k`` is stored at
        ``[source, target * n_rels + k]``.

        Returns:
            scipy.sparse.csr_matrix of shape ``(n_nodes, n_nodes * n_rels)``
            and dtype float32. Repeated identical edges are summed by the
            csr_matrix constructor.
        """
        rows, cols, data = [], [], []
        for k in range(self.n_rels):
            rel_rows, rel_cols, rel_data = self.sparse_graph[k]
            rows.extend(rel_rows)
            # shift the column index to fit the added relation
            cols.extend(target * self.n_rels + k for target in rel_cols)
            data.extend(rel_data)
        shape = (self.n_nodes, self.n_nodes * self.n_rels)
        return csr_matrix((data, (rows, cols)), shape=shape, dtype=np.float32)

    def get_adjacency_matrix_k(self, k):
        """Return the out-going adjacency matrix of a single relation.

        Args:
            k: relation *label* (not index), as passed to ``add_connection``.

        Returns:
            scipy.sparse.csr_matrix of shape ``(n_nodes, n_nodes)`` and dtype
            float32.

        Raises:
            KeyError: if the relation label has never been added.
        """
        rows, cols, data = self.sparse_graph[self.rels[k]]
        shape = (self.n_nodes, self.n_nodes)
        return csr_matrix((data, (rows, cols)), shape=shape, dtype=np.float32)

    def get_node_label(self, index):
        """Return the label of the node with the given index."""
        return self.node_labels[index]

    def get_relation_label(self, index):
        """Return the label of the relation with the given index."""
        return self.rel_labels[index]

    def get_relation_counter(self):
        """Return the Counter mapping relation label -> number of edges."""
        # The attribute is `rel_counter`; `self.counter` was never defined
        # and raised AttributeError.
        return self.rel_counter

In [5]:
#import csv
import unicodecsv as csv
""" EXAMPLE
add,IS,operator
subtract,IS,operator
multiply,IS,operator
divide,IS,operator
open_closure,IS,operator
close_closure,IS,operator
"""
# Build the graph from the per-relation CSV exports; every non-empty row is
# passed to MultiGraph.add_connection as a (source, relation, target) triple.
N_RELATION_FILES = 45  # files aifb_relation_0.csv ... aifb_relation_44.csv
op_graph = MultiGraph()
for i in range(N_RELATION_FILES):
    print('loading relation '+str(i)+'...')
    # unicodecsv decodes the raw byte stream itself, so the file must be
    # opened in binary mode (text mode breaks on Python 3 and can mangle
    # newlines on Windows).
    with open('aifb_csv/aifb_relation_'+str(i)+'.csv', 'rb') as csvfile:
        graphreader = csv.reader(csvfile, delimiter=",")
        for row in graphreader:
            # csv readers yield [] for blank lines; those cannot be unpacked
            # into a triple, so skip them instead of crashing mid-load.
            if not row:
                continue
            op_graph.add_connection(row)

loading relation 0...
loading relation 1...
loading relation 2...
loading relation 3...
loading relation 4...
loading relation 5...
loading relation 6...
loading relation 7...
loading relation 8...
loading relation 9...
loading relation 10...
loading relation 11...
loading relation 12...
loading relation 13...
loading relation 14...
loading relation 15...
loading relation 16...
loading relation 17...
loading relation 18...
loading relation 19...
loading relation 20...
loading relation 21...
loading relation 22...
loading relation 23...
loading relation 24...
loading relation 25...
loading relation 26...
loading relation 27...
loading relation 28...
loading relation 29...
loading relation 30...
loading relation 31...
loading relation 32...
loading relation 33...
loading relation 34...
loading relation 35...
loading relation 36...
loading relation 37...
loading relation 38...
loading relation 39...
loading relation 40...
loading relation 41...
loading relation 42...
loading relation 43..

In [6]:
# Assemble the interleaved adjacency matrix and report basic graph statistics.
a_graph = op_graph.get_adjacency_matrix()
print("number of nodes",op_graph.n_nodes)
print("number of relations",op_graph.n_rels)
print("relations:")
# `.items()` is available on both Python 2 and 3, whereas `.iteritems()` was
# removed in Python 3 and raises AttributeError there.
for rel_uri, n_edges in op_graph.rel_counter.items():
    # keep only the last path segment of the relation URI for readability
    print(rel_uri.rsplit('/',1)[-1], n_edges)

('number of nodes', 8284)
('number of relations', 45)
relations:
(u'ontology#number', 145)
(u'ontology#isWorkedOnBy', 571)
(u'ontology#worksAtProject', 200)
(u'owl#allValuesFrom', 152)
(u'ontology#dealtWithIn', 357)
(u'owl#onProperty', 152)
(u'ontology#type', 50)
(u'ontology#author', 3986)
(u'ontology#abstract', 534)
(u'ontology#carriedOutBy', 79)
(u'ontology#month', 759)
(u'ontology#phone', 227)
(u'22-rdf-syntax-ns#type', 4124)
(u'ontology#address', 202)
(u'ontology#note', 114)
(u'ontology#publication', 4163)
(u'ontology#financedBy', 65)
(u'ontology#chapter', 15)
(u'ontology#editor', 190)
(u'ontology#pages', 548)
(u'owl#inverseOf', 10)
(u'ontology#projectInfo', 952)
(u'type', 129)
(u'ontology#edition', 12)
(u'ontology#booktitle', 765)
(u'ontology#isAbout', 2477)
(u'ontology#finances', 68)
(u'ontology#howpublished', 49)
(u'ontology#member', 339)
(u'ontology#hasProject', 952)
(u'ontology#isbn', 16)
(u'ontology#journal', 161)
(u'ontology#year', 1227)
(u'ontology#title', 1227)
(u'ontology

In [7]:
# The adjacency matrix is used directly as the training data: one row per node.
x_train = a_graph
print('input dims:',x_train.shape)
print(type(x_train))
# Display row 5 as a 1 x n sparse matrix (its repr reports the number of
# stored elements). This replaces `sum(list(x_train[5]))`, which wrapped the
# single row in a list and relied on `0 + sparse_matrix` returning a copy,
# i.e. it produced this same row in a roundabout way.
# NOTE(review): if a plain count of outgoing edges was intended, use
# `x_train[5].nnz` instead -- confirm the intent.
x_train[5]

('input dims:', (8284, 372780))
<class 'scipy.sparse.csr.csr_matrix'>


<1x372780 sparse matrix of type '<type 'numpy.float32'>'
	with 18 stored elements in Compressed Sparse Row format>

In [8]:
def _tanh_dense(units, name):
    """Return a named fully-connected layer with tanh activation."""
    return Dense(units, activation='tanh', name=name)


# Width of the bottleneck layer; the other hidden widths are multiples of it.
encoding_dim = 128
# One input feature per column of the adjacency matrix.
input_dim = x_train.shape[1]

inputs = Input(shape=(input_dim,))

# Encoder: input -> 4x -> 2x -> code
encoding_1 = _tanh_dense(4 * encoding_dim, "encoding_1")(inputs)
encoding_2 = _tanh_dense(2 * encoding_dim, "encoding_2")(encoding_1)
the_code = _tanh_dense(encoding_dim, "the_code")(encoding_2)

# Decoder: code -> 4x -> 2x -> input
# NOTE(review): the decoder widths (4x then 2x) repeat the encoder order
# rather than mirroring it (2x then 4x) -- confirm this is intentional.
decoding_1 = _tanh_dense(4 * encoding_dim, "decoding_1")(the_code)
decoding_2 = _tanh_dense(2 * encoding_dim, "decoding_2")(decoding_1)
reconstruction = _tanh_dense(input_dim, "reconstruction")(decoding_2)

# Full autoencoder, trained to minimise the mean squared reconstruction error.
ae = Model(inputs=inputs, outputs=reconstruction)
# Early stopping is currently disabled:
#monitor = EarlyStopping(monitor='loss', min_delta=0.0001, patience=5, verbose=1, mode='auto')
ae.compile(optimizer='adam', loss='mse')
ae.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 372780)            0         
_________________________________________________________________
encoding_1 (Dense)           (None, 512)               190863872 
_________________________________________________________________
encoding_2 (Dense)           (None, 256)               131328    
_________________________________________________________________
the_code (Dense)             (None, 128)               32896     
_________________________________________________________________
decoding_1 (Dense)           (None, 512)               66048     
_________________________________________________________________
decoding_2 (Dense)           (None, 256)               131328    
_________________________________________________________________
reconstruction (Dense)       (None, 372780)            95804460  
Total para

In [9]:
# Train the autoencoder for 2 epochs to reconstruct its own input: the sparse
# matrix x_train is the input and its densified copy is the target.
# NOTE(review): toarray() materialises the whole matrix as a dense array; with
# the (8284, 372780) float32 shape printed earlier that is on the order of
# 12 GB of RAM -- consider feeding densified mini-batches from a generator.
ae.fit(x_train, x_train.toarray(), epochs=2, verbose=1)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1096e8890>

In [10]:
# Encoder-only model: maps an input row to the output of the "the_code" layer.
# It is built from the same `inputs` / `the_code` tensors as `ae`, so it reuses
# the layers (and therefore the weights) that `ae` was trained with.
# Alternative construction that only needs the `ae` object:
#coding_model = Model(inputs=ae.inputs, outputs=ae.get_layer("the_code").output)
coding_model = Model(inputs=inputs, outputs=the_code)

In [43]:
# Save the encoder model (architecture + weights) so it can be reloaded later,
# e.g. to compute and plot embeddings. Note that this cell stores the model,
# not the embeddings themselves.
# NOTE(review): the recorded execution count of this cell (43) is out of
# sequence with the preceding cells -- confirm that a fresh
# "Restart & Run All" reproduces these files.
# serialize the model architecture to JSON
model_json = coding_model.to_json()
with open("coding_model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
coding_model.save_weights("coding_model.h5")
print("Saved model to disk")

Saved model to disk
