In [1]:
import stellargraph as sg
from stellargraph import StellarGraph

In [2]:
from stellargraph.mapper import HinSAGELinkGenerator
from stellargraph.layer import GCN
from sklearn import model_selection
from stellargraph.layer import HinSAGE

In [3]:
import getpass
import os

import pandas as pd
import py2neo

# Connection settings are read from the environment so that no credentials
# live in the notebook. Each STELLARGRAPH_NEO4J_* variable is optional; the
# fallbacks are the host/port/user values previously hard-coded here.
default_host = os.environ.get("STELLARGRAPH_NEO4J_HOST")
neo4j_port = int(os.environ.get("STELLARGRAPH_NEO4J_PORT", "7687"))
neo4j_user = os.environ.get("STELLARGRAPH_NEO4J_USER", "neo4j")
# Never commit the password: take it from the environment, otherwise prompt.
neo4j_password = os.environ.get("STELLARGRAPH_NEO4J_PASSWORD") or getpass.getpass(
    "Neo4j password: "
)

# Create the Neo4j Graph database object used by every query below.
neo4j_graph = py2neo.Graph(
    host=default_host or "localhost",
    port=neo4j_port,
    user=neo4j_user,
    password=neo4j_password,
)

In [4]:
# One row per distinct *first* label carried by any node in the database.
node_label_query = """
MATCH (n) RETURN distinct labels(n)[0] AS nodeLabel
    """
node_labels = neo4j_graph.run(node_label_query).to_data_frame()


In [5]:
# Show the distinct node labels returned by the query above.
print(node_labels)

         nodeLabel
0              uid
1               ts
2        id.orig_h
3        id.orig_p
4        id.resp_h
5        id.resp_p
6            proto
7         duration
8       orig_bytes
9       resp_bytes
10      conn_state
11    missed_bytes
12         history
13       orig_pkts
14   orig_ip_bytes
15       resp_pkts
16   resp_ip_bytes
17  tunnel_parents
18           label
19  detailed-label
20         service


In [6]:
# Build one node table per Neo4j label, keyed as 'n_<label>'.
#
# * `uid` nodes keep their Neo4j id as the node name.
# * Every other label gets one node per (attribute node, uid node) pair; its
#   name is the two Neo4j ids concatenated as decimal strings and cast back to
#   an integer.
#
# NOTE(review): each table has the columns `idFeature` and `label`, and the
# tables are later handed to StellarGraph as-is, so the class label appears to
# become a node feature -- confirm this is intended.
nodeDict = {}

# Cypher fragments shared by both query shapes.
label_case = "case when c.id = 'Malicious' then 1 else 0 end as label"
composite_id = "toInteger(toString(ID(a)) + toString(ID(n)))"

for _, node_label in node_labels.itertuples():
    if node_label != 'uid':
        # Attribute node `a` reached from a uid node `n`; the class comes from
        # the uid's `label` relationship.
        query = (
            "MATCH (a:`" + node_label + "`) "
            + "MATCH (n:`uid`)-[r1:`" + node_label + "`]->(a) "
            + "MATCH(n)-[r2:`label`]->(c) "
            + "RETURN DISTINCT " + composite_id + " as name, "
            + composite_id + " as idFeature, "
            + label_case
        )
    else:
        query = (
            "MATCH (a:`" + node_label + "`) "
            + "MATCH (a)-[r1:`label`]->(c) "
            + "RETURN DISTINCT ID(a) as name, ID(a) as idFeature, "
            + label_case
        )
    node_frame = neo4j_graph.run(query).to_data_frame()
    nodeDict[f'n_{node_label}'] = node_frame.set_index('name')

In [7]:
# Inspect the node table built for the `uid` label.
print(nodeDict['n_uid'])

        idFeature  label
name                    
315056     315056      1
315076     315076      1
315082     315082      1
315086     315086      1
315091     315091      1
...           ...    ...
472571     472571      1
472573     472573      1
472575     472575      0
472579     472579      1
472582     472582      1

[50000 rows x 2 columns]


In [8]:
# One row per distinct relationship type present in the database.
rel_type_query = """
MATCH ()-[r]->() RETURN distinct type(r)
    """
rel_types = neo4j_graph.run(rel_type_query).to_data_frame()

In [9]:
# Build one edge table per relationship type, keyed as 'e_<type>'.
# Rows are indexed by the Neo4j relationship id. `source` is the id of the
# start node and `target` is the concatenated (end node id, start node id)
# integer. `label` is 1 when the start node's `label` neighbour has
# id 'Malicious', else 0.
edgeDict = {}

edge_query_head = "MATCH (a)-[r1:`"
edge_query_tail = (
    "`]->(b) MATCH (a)-[r2:`label`]->(c) "
    + "RETURN DISTINCT ID(r1) AS rid, ID(a) as source, "
    + "toInteger(toString(ID(b)) + toString(ID(a))) as target, "
    + "case when c.id = 'Malicious' then 1 else 0 end as label"
)

for _, rel_type in rel_types.itertuples():
    edge_frame = neo4j_graph.run(edge_query_head + rel_type + edge_query_tail).to_data_frame()
    edgeDict[f'e_{rel_type}'] = edge_frame.set_index('rid')

In [10]:
# Inspect the edge table built for the `ts` relationship type.
print(edgeDict['e_ts'])

         source        target  label
rid                                 
1620054  315056  315057315056      1
1620073  315076  315077315076      1
1620089  315082  315083315082      1
1620105  315086  315087315086      1
1620124  315091  315092315091      1
...         ...           ...    ...
2429999  472571  472572472571      1
2430015  472573  472574472573      1
2430031  472575  472576472575      0
2430046  472579  472580472579      1
2430065  472582  472583472582      1

[50000 rows x 3 columns]


In [11]:
# for idx, val in enumerate(nodeDict):
#     nodeDict[int(idx)] = nodeDict[val]
#     del nodeDict[val]
    
# for idx, val in enumerate(edgeDict):
#     edgeDict[int(idx)] = edgeDict[val]
#     del edgeDict[val]

In [12]:
# Assemble the heterogeneous graph: dict keys become node/edge type names and
# edges are treated as undirected.
hetereogeneousGraph = StellarGraph(
    nodes=nodeDict,
    edges=edgeDict,
    is_directed=False,
)

In [13]:
# Print StellarGraph's textual summary of the assembled graph.
print(hetereogeneousGraph.info())

StellarGraph: Undirected multigraph
 Nodes: 860027, Edges: 810027

 Node types:
  n_uid: [50000]
    Features: float32 vector, length 2
    Edge types: n_uid-e_conn_state->n_conn_state, n_uid-e_detailed-label->n_detailed-label, n_uid-e_duration->n_duration, n_uid-e_history->n_history, n_uid-e_id.orig_h->n_id.orig_h, ... (15 more)
  n_tunnel_parents: [50000]
    Features: float32 vector, length 2
    Edge types: n_tunnel_parents-e_tunnel_parents->n_uid
  n_ts: [50000]
    Features: float32 vector, length 2
    Edge types: n_ts-e_ts->n_uid
  n_resp_pkts: [50000]
    Features: float32 vector, length 2
    Edge types: n_resp_pkts-e_resp_pkts->n_uid
  n_resp_ip_bytes: [50000]
    Features: float32 vector, length 2
    Edge types: n_resp_ip_bytes-e_resp_ip_bytes->n_uid
  n_proto: [50000]
    Features: float32 vector, length 2
    Edge types: n_proto-e_proto->n_uid
  n_orig_pkts: [50000]
    Features: float32 vector, length 2
    Edge types: n_orig_pkts-e_orig_pkts->n_uid
  n_orig_ip_bytes: [

In [14]:
# Run StellarGraph's built-in validation before feeding the graph to a model.
# NOTE(review): `expensive_check=True` presumably enables slower, more thorough
# checks -- confirm against the installed StellarGraph version.
hetereogeneousGraph.check_graph_for_ml(expensive_check=True)

In [15]:
#edges_train, edges_test = model_selection.train_test_split(pd.concat(list(edgeDict.values())), train_size=0.7, test_size=0.3)

# Fixed seed so the train/val/test membership (and every metric derived from
# it) is reproducible after "Restart & Run All".
SPLIT_SEED = 42

# train: 0.7, val: 0.1, test: 0.2
edges_train, edges_test = model_selection.train_test_split(
    edgeDict['e_label'], train_size=0.8, test_size=0.2, random_state=SPLIT_SEED
)
# 0.875 of the remaining 80% is 70% of all edges; the other 0.125 is 10%.
edges_train, edges_val = model_selection.train_test_split(
    edges_train, train_size=0.875, test_size=0.125, random_state=SPLIT_SEED
)
print(edges_train)
print(edges_val)
print(edges_test)

         source        target  label
rid                                 
1626318  316359  315074316359      1
1890497  368732  315105368732      0
1653935  321937  315105321937      0
2408564  468567  315074468567      1
2016523  393361  315074393361      1
...         ...           ...    ...
1620854  315241  315074315241      1
2425588  471745  315074471745      1
1819735  354893  315105354893      0
1808969  352777  315074352777      1
1935704  377586  315105377586      0

[35000 rows x 3 columns]
         source        target  label
rid                                 
2349382  457356  315074457356      1
2149154  418991  315105418991      0
2204764  429670  315105429670      0
1925183  375524  315074375524      1
1984288  387040  315105387040      0
...         ...           ...    ...
2325393  452631  315074452631      1
2155169  420140  315074420140      1
2304498  448674  315074448674      1
1913693  373275  315074373275      1
1843145  359478  315074359478      1

[5000 rows 

In [16]:
def _edge_pairs(edge_frame):
    """Return the (source, target) rows of ``edge_frame`` as a list of named tuples."""
    endpoints = edge_frame.loc[:, ["source", "target"]]
    return list(endpoints.itertuples(index=False))


# Link ids for each split, in the same row order as the split frames.
edgelist_train = _edge_pairs(edges_train)
edgelist_val = _edge_pairs(edges_val)
edgelist_test = _edge_pairs(edges_test)

print(edgelist_train[1000])

Pandas(source=408882, target=315105408882)


In [17]:
# Binary targets for each split, kept as one-field named tuples so they stay
# row-aligned with the corresponding edge lists.
labels_train, labels_val, labels_test = (
    list(split_frame.loc[:, ['label']].itertuples(index=False))
    for split_frame in (edges_train, edges_val, edges_test)
)

print(labels_train[1000])

Pandas(label=0)


In [18]:
# Mini-batch size and the number of neighbours sampled at each of the three hops.
batch_size = 50
num_samples = [8, 8, 8]

# Links to classify run from a `n_uid` node to a `n_label` node.
generator = HinSAGELinkGenerator(
    G=hetereogeneousGraph,
    batch_size=batch_size,
    num_samples=num_samples,
    head_node_types=['n_uid', 'n_label'],
)

In [19]:
# Keras-compatible sequences for each split; only the training one is shuffled.
train_gen = generator.flow(link_ids=edgelist_train, targets=labels_train, shuffle=True)
val_gen = generator.flow(link_ids=edgelist_val, targets=labels_val)
test_gen = generator.flow(link_ids=edgelist_test, targets=labels_test)

In [20]:
# Display the node-type sampling tree the generator derives for the head node
# types, one level per entry in `num_samples`.
generator.schema.type_adjacency_list(generator.head_node_types, len(num_samples))

[('n_uid',
  [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]),
 ('n_label', [22]),
 ('n_conn_state', [23]),
 ('n_detailed-label', [24]),
 ('n_duration', [25]),
 ('n_history', [26]),
 ('n_id.orig_h', [27]),
 ('n_id.orig_p', [28]),
 ('n_id.resp_h', [29]),
 ('n_id.resp_p', [30]),
 ('n_label', [31]),
 ('n_missed_bytes', [32]),
 ('n_orig_bytes', [33]),
 ('n_orig_ip_bytes', [34]),
 ('n_orig_pkts', [35]),
 ('n_proto', [36]),
 ('n_resp_bytes', [37]),
 ('n_resp_ip_bytes', [38]),
 ('n_resp_pkts', [39]),
 ('n_service', [40]),
 ('n_ts', [41]),
 ('n_tunnel_parents', [42]),
 ('n_uid',
  [43,
   44,
   45,
   46,
   47,
   48,
   49,
   50,
   51,
   52,
   53,
   54,
   55,
   56,
   57,
   58,
   59,
   60,
   61,
   62]),
 ('n_uid',
  [63,
   64,
   65,
   66,
   67,
   68,
   69,
   70,
   71,
   72,
   73,
   74,
   75,
   76,
   77,
   78,
   79,
   80,
   81,
   82]),
 ('n_uid',
  [83,
   84,
   85,
   86,
   87,
   88,
   89,
   90,
   91,
   92,
   93,
   94,
   95,


In [21]:
# Display the graph schema: for each node type, the edge types leaving it.
generator.schema.schema

{'n_resp_ip_bytes': [EdgeType(n1='n_resp_ip_bytes', rel='e_resp_ip_bytes', n2='n_uid')],
 'n_detailed-label': [EdgeType(n1='n_detailed-label', rel='e_detailed-label', n2='n_uid')],
 'n_orig_pkts': [EdgeType(n1='n_orig_pkts', rel='e_orig_pkts', n2='n_uid')],
 'n_tunnel_parents': [EdgeType(n1='n_tunnel_parents', rel='e_tunnel_parents', n2='n_uid')],
 'n_id.orig_h': [EdgeType(n1='n_id.orig_h', rel='e_id.orig_h', n2='n_uid')],
 'n_ts': [EdgeType(n1='n_ts', rel='e_ts', n2='n_uid')],
 'n_missed_bytes': [EdgeType(n1='n_missed_bytes', rel='e_missed_bytes', n2='n_uid')],
 'n_duration': [EdgeType(n1='n_duration', rel='e_duration', n2='n_uid')],
 'n_uid': [EdgeType(n1='n_uid', rel='e_conn_state', n2='n_conn_state'),
  EdgeType(n1='n_uid', rel='e_detailed-label', n2='n_detailed-label'),
  EdgeType(n1='n_uid', rel='e_duration', n2='n_duration'),
  EdgeType(n1='n_uid', rel='e_history', n2='n_history'),
  EdgeType(n1='n_uid', rel='e_id.orig_h', n2='n_id.orig_h'),
  EdgeType(n1='n_uid', rel='e_id.orig

In [22]:
# One HinSAGE layer per sampling hop, so the two lists must have equal length.
hinsage_layer_sizes = [8, 8, 8]
assert len(num_samples) == len(hinsage_layer_sizes)

hinsage = HinSAGE(
    generator=generator,
    layer_sizes=hinsage_layer_sizes,
    dropout=0.0,
    bias=True,
)

# Input tensors and the output node-embedding tensors for the link endpoints.
x_inp, x_out = hinsage.in_out_tensors()

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'


In [23]:
from stellargraph.layer import link_classification

# Combine the two endpoint embeddings with an inner product ('ip') and squash
# the score with a sigmoid. The model is trained with BinaryCrossentropy on
# probabilities, so the output must lie in [0, 1]; relying on the function's
# default activation (relu, per the StellarGraph API) would produce unbounded
# scores.
score_prediction = link_classification(
    output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
)(x_out)

link_classification: using 'ip' method to combine node embeddings into edge embeddings
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'


In [24]:
import tensorflow.keras.backend as K
from tensorflow.keras import Model, optimizers, losses, metrics

# End-to-end link classifier: HinSAGE inputs -> link score.
model = Model(inputs=x_inp, outputs=score_prediction)
model.compile(
    # `learning_rate` is the supported argument name; `lr` is a deprecated
    # alias that newer Keras releases reject.
    optimizer=optimizers.Adam(learning_rate=1e-2),
    loss=losses.BinaryCrossentropy(),
    metrics=[metrics.BinaryAccuracy()]
)

In [25]:
#model.summary()

In [None]:
# Worker count reused by the training and final evaluation cells.
num_workers = 8

# Baseline: score the still-untrained model on the held-out test links.
test_metrics = model.evaluate(test_gen, verbose=1)

print("Untrained model's Test Evaluation:")
for metric_name, metric_value in zip(model.metrics_names, test_metrics):
    print(f"\t{metric_name}: {metric_value:0.4f}")

In [None]:
# Number of passes over the training links.
epochs = 10

# Train on the training flow and validate on the validation flow each epoch.
history = model.fit(
    train_gen,
    epochs=epochs,
    validation_data=val_gen,
    shuffle=False,
    workers=num_workers,
    use_multiprocessing=False,
    verbose=1,
)

In [None]:
# Plot the per-epoch metrics recorded by `model.fit` above.
sg.utils.plot_history(history)

In [None]:
# save model
#
# Use Keras' own serialisation rather than pickle. Pickling a Keras Model is
# not reliably supported (it fails on older TensorFlow releases), and the
# previous `open(...)` call never closed its file handle.
# NOTE(review): reloading presumably needs the StellarGraph layer classes, e.g.
# `tf.keras.models.load_model("model.h5", custom_objects=sg.custom_keras_layers)`
# -- confirm against the installed StellarGraph version.

import pickle  # no longer used by this cell; retained in case other cells rely on it
model.save("model.h5")

In [None]:
# Final score of the trained model on the held-out test links.
test_metrics = model.evaluate(
    test_gen,
    verbose=1,
    workers=num_workers,
    use_multiprocessing=False,
)

print("Test Evaluation:")
for metric_name, metric_value in zip(model.metrics_names, test_metrics):
    print(f"\t{metric_name}: {metric_value:0.4f}")

In [None]:
###########################################################################################################
# ComplEx (NOT WORKING NOW!!!)

In [20]:
from stellargraph import datasets, utils
from tensorflow.keras import callbacks, optimizers, losses, metrics, regularizers, Model
import numpy as np
import pandas as pd

from stellargraph.mapper import KGTripleGenerator
from stellargraph.layer import ComplEx

from IPython.display import HTML

In [23]:
# Hyper-parameters for the ComplEx experiment.
# NOTE: `epochs` and `batch_size` reuse the names set for the HinSAGE section,
# so running this cell overwrites those values for any cell executed afterwards.
epochs = 50  # upper bound on training epochs
embedding_dimension = 200  # passed to ComplEx as `embedding_dimension`
negative_samples = 10  # passed to the triple generator's `flow` as `negative_samples`
batch_size = 50  # passed to KGTripleGenerator as `batch_size`

In [24]:
# Triple generator over the same heterogeneous graph used for HinSAGE.
# With the 35,000 training edges shown earlier and batch_size=50 this is
# about 700 batches per epoch.
hg_gen = KGTripleGenerator(
    hetereogeneousGraph, batch_size=batch_size
)

# ComplEx scoring model with L2-regularised embeddings.
hg_complex = ComplEx(
    hg_gen,
    embedding_dimension=embedding_dimension,
    embeddings_regularizer=regularizers.l2(1e-7),
)

hg_inp, hg_out = hg_complex.in_out_tensors()

hg_model = Model(inputs=hg_inp, outputs=hg_out)

hg_model.compile(
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    optimizer=optimizers.Adam(learning_rate=0.001),
    # The model emits raw scores (logits), hence from_logits=True and a
    # decision threshold of 0.0 for the accuracy metric.
    loss=losses.BinaryCrossentropy(from_logits=True),
    metrics=[metrics.BinaryAccuracy(threshold=0.0)],
)

hg_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 200)       172005400   input_1[0][0]                    
                                                                 input_3[0][0]                

In [None]:
# KGTripleGenerator.flow reads the `label` column of the edge frame as the
# *relation type* of each (source, label, target) triple. In `edges_train` and
# `edges_val` that column holds the 0/1 malicious flag, which is not an edge
# type of the graph. Both frames come from edgeDict['e_label'], so every row's
# relation is 'e_label'; substitute it here without mutating the split frames.
hg_train_gen = hg_gen.flow(
    edges_train.assign(label="e_label"),
    negative_samples=negative_samples,
    shuffle=True,
)
hg_valid_gen = hg_gen.flow(
    edges_val.assign(label="e_label"), negative_samples=negative_samples
)

In [None]:
# Stop once validation loss has not improved for 10 consecutive epochs.
wn18_es = callbacks.EarlyStopping(monitor="val_loss", patience=10)
# Train the ComplEx model compiled in this notebook (`hg_model`). The earlier
# reference to `wn18_model` named a model that is never defined here and
# raised NameError. The `wn18_*` result names are kept so any later cell that
# reads them still works.
wn18_history = hg_model.fit(
    hg_train_gen, validation_data=hg_valid_gen, epochs=epochs, callbacks=[wn18_es]
)

In [None]:
###########################################################################################################