In [1]:
from stellargraph import StellarGraph

In [2]:
from stellargraph.mapper import HinSAGELinkGenerator
from stellargraph.layer import GCN
from sklearn import model_selection
from stellargraph.layer import HinSAGE

In [3]:
import os
import py2neo
import pandas as pd

default_host = os.environ.get("STELLARGRAPH_NEO4J_HOST")

# Create the Neo4j Graph database object; the parameters can be edited to specify location and authentication
neo4j_graph = py2neo.Graph(host='localhost', port=7687, user='neo4j', password='anis')

In [4]:
node_labels = neo4j_graph.run(
    """
MATCH (n) RETURN distinct labels(n)[0] AS nodeLabel
    """
).to_data_frame()


In [5]:
nodeDict = {}

for idx, row in node_labels.itertuples():
    if row == 'uid':
        cypherQ = "MATCH (a:`" + row + "`) MATCH (a)-[r1:`label`]->(c) RETURN DISTINCT ID(a) as name, ID(a) as idFeature, case when c.id = 'Malicious' then 1 else 0 end as label"
    else:
        cypherQ = "MATCH (a:`" + row + "`) MATCH (n:`uid`)-[r1:`" + row + "`]->(a) MATCH(n)-[r2:`label`]->(c) RETURN DISTINCT toInteger(toString(ID(a)) + toString(ID(n))) as name, toInteger(toString(ID(a)) + toString(ID(n))) as idFeature, case when c.id = 'Malicious' then 1 else 0 end as label"
    currNode = neo4j_graph.run(cypherQ).to_data_frame().set_index('name')
    nodeDict[row] = currNode

In [6]:
rel_types = neo4j_graph.run(
    """
MATCH ()-[r]->() RETURN distinct type(r)
    """
).to_data_frame()

In [7]:
edgeDict = {}

for idx, row in rel_types.itertuples():
    currEdge = neo4j_graph.run("MATCH (a)-[r1:`" + row + "`]->(b) MATCH (a)-[r2:`label`]->(c) RETURN DISTINCT ID(r1) AS rid, ID(a) as source, toInteger(toString(ID(b)) + toString(ID(a))) as target, case when c.id = 'Malicious' then 1 else 0 end as label").to_data_frame().set_index('rid')
    edgeDict[row] = currEdge

In [8]:
hetereogeneousGraph = StellarGraph(nodeDict, edgeDict, is_directed=True)

In [9]:
print(hetereogeneousGraph.info())

StellarGraph: Directed multigraph
 Nodes: 860027, Edges: 810027

 Node types:
  uid: [50000]
    Features: float32 vector, length 2
    Edge types: uid-conn_state->conn_state, uid-detailed-label->detailed-label, uid-duration->duration, uid-history->history, uid-id.orig_h->id.orig_h, ... (15 more)
  tunnel_parents: [50000]
    Features: float32 vector, length 2
    Edge types: none
  ts: [50000]
    Features: float32 vector, length 2
    Edge types: none
  resp_pkts: [50000]
    Features: float32 vector, length 2
    Edge types: none
  resp_ip_bytes: [50000]
    Features: float32 vector, length 2
    Edge types: none
  proto: [50000]
    Features: float32 vector, length 2
    Edge types: none
  orig_pkts: [50000]
    Features: float32 vector, length 2
    Edge types: none
  orig_ip_bytes: [50000]
    Features: float32 vector, length 2
    Edge types: none
  missed_bytes: [50000]
    Features: float32 vector, length 2
    Edge types: none
  label: [50000]
    Features: float32 vector, le

In [10]:
#edges_train, edges_test = model_selection.train_test_split(pd.concat(list(edgeDict.values())), train_size=0.7, test_size=0.3)
edges_train, edges_test = model_selection.train_test_split(edgeDict["label"], train_size=0.7, test_size=0.3)

In [11]:
edgelist_train = list(edges_train[["source", "target"]].itertuples(index=False))#.rename(columns={"source":"uid", "target":"uid"}, inplace=False)
edgelist_test = list(edges_test[["source", "target"]].itertuples(index=False))
print(edgelist_train[0])

Pandas(source=264775, target=154483264775)


In [12]:
labels_train = list(edges_train[["label"]].itertuples(index=False))
labels_test = list(edges_test[["label"]].itertuples(index=False))

In [13]:
batch_size = 50
num_samples = [8, 8, 8]

generator = HinSAGELinkGenerator(hetereogeneousGraph, batch_size, num_samples, head_node_types=["uid", "label"])

In [14]:
train_gen = generator.flow(edgelist_train, labels_train, shuffle=True)
test_gen = generator.flow(edgelist_test, labels_test)

In [15]:
generator.schema.type_adjacency_list(generator.head_node_types, len(num_samples))

[('uid',
  [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]),
 ('label', []),
 ('conn_state', []),
 ('detailed-label', []),
 ('duration', []),
 ('history', []),
 ('id.orig_h', []),
 ('id.orig_p', []),
 ('id.resp_h', []),
 ('id.resp_p', []),
 ('label', []),
 ('missed_bytes', []),
 ('orig_bytes', []),
 ('orig_ip_bytes', []),
 ('orig_pkts', []),
 ('proto', []),
 ('resp_bytes', []),
 ('resp_ip_bytes', []),
 ('resp_pkts', []),
 ('service', []),
 ('ts', []),
 ('tunnel_parents', [])]

In [16]:
generator.schema.schema

{'id.orig_h': [],
 'resp_bytes': [],
 'missed_bytes': [],
 'orig_pkts': [],
 'resp_ip_bytes': [],
 'id.resp_p': [],
 'tunnel_parents': [],
 'history': [],
 'proto': [],
 'ts': [],
 'conn_state': [],
 'detailed-label': [],
 'uid': [EdgeType(n1='uid', rel='conn_state', n2='conn_state'),
  EdgeType(n1='uid', rel='detailed-label', n2='detailed-label'),
  EdgeType(n1='uid', rel='duration', n2='duration'),
  EdgeType(n1='uid', rel='history', n2='history'),
  EdgeType(n1='uid', rel='id.orig_h', n2='id.orig_h'),
  EdgeType(n1='uid', rel='id.orig_p', n2='id.orig_p'),
  EdgeType(n1='uid', rel='id.resp_h', n2='id.resp_h'),
  EdgeType(n1='uid', rel='id.resp_p', n2='id.resp_p'),
  EdgeType(n1='uid', rel='label', n2='label'),
  EdgeType(n1='uid', rel='missed_bytes', n2='missed_bytes'),
  EdgeType(n1='uid', rel='orig_bytes', n2='orig_bytes'),
  EdgeType(n1='uid', rel='orig_ip_bytes', n2='orig_ip_bytes'),
  EdgeType(n1='uid', rel='orig_pkts', n2='orig_pkts'),
  EdgeType(n1='uid', rel='proto', n2='prot

In [17]:
hinsage_layer_sizes = [8, 8, 8]

assert len(hinsage_layer_sizes) == len(num_samples)

hinsage = HinSAGE(
    layer_sizes=hinsage_layer_sizes, generator=generator, bias=True, dropout=0.0
)

IndexError: list index out of range