In [21]:
import xgt
import os

In [22]:
# Drop any proxy configuration inherited from the environment before connecting.
# NOTE(review): presumably so the xgt connection below is not routed through a proxy — confirm.
# Only variables that are set to a non-empty value are removed.
for proxy_var in ('https_proxy', 'http_proxy'):
    if os.environ.get(proxy_var):
        del os.environ[proxy_var]

In [23]:
# Open a connection to the xGT server; no arguments are passed, so the client's defaults are used.
conn = xgt.Connection()
# Trailing expression: the notebook displays the version reported by the server.
conn.server_version

'1.2.0'

In [24]:
# Get-or-create: reuse the 'Devices' vertex frame when the server already has it,
# otherwise create it with a single text column that also serves as the vertex key.
try:
    devices = conn.get_vertex_frame('Devices')
except xgt.XgtNameError:
    # The frame does not exist yet on this server.
    devices_schema = [['device', xgt.TEXT]]
    devices = conn.create_vertex_frame(
        name='Devices', schema=devices_schema, key='device')
devices

<xgt.graph.VertexFrame at 0x7f3274380438>

In [25]:
# Get-or-create: reuse the 'Netflow' edge frame when the server already has it,
# otherwise create it. Edges run between Devices vertices, keyed by srcDevice/dstDevice.
try:
    netflow = conn.get_edge_frame('Netflow')
except xgt.XgtNameError:
    # Column order matters: it defines the schema order of the frame.
    netflow_columns = (
        'epochtime', 'duration', 'srcDevice', 'dstDevice', 'protocol',
        'srcPort', 'dstPort', 'srcPackets', 'dstPackets',
        'srcBytes', 'dstBytes',
    )
    # The two endpoint columns are text; every other column is an integer.
    netflow_text_columns = ('srcDevice', 'dstDevice')
    netflow_schema = [
        [column, xgt.TEXT if column in netflow_text_columns else xgt.INT]
        for column in netflow_columns
    ]
    netflow = conn.create_edge_frame(
        name='Netflow',
        schema=netflow_schema,
        source=devices,
        target=devices,
        source_key='srcDevice',
        target_key='dstDevice')
netflow

<xgt.graph.EdgeFrame at 0x7f3274382390>

In [26]:
# Get-or-create: reuse the 'Events1v' edge frame when the server already has it,
# otherwise create it. Source and target keys are both 'logHost', so every event
# is an edge from a Devices vertex back to itself.
try:
    events1v = conn.get_edge_frame('Events1v')
except xgt.XgtNameError:
    events1v_schema = [
        ['epochtime', xgt.INT], ['eventID', xgt.INT],
        ['logHost', xgt.TEXT], ['userName', xgt.TEXT],
        ['domainName', xgt.TEXT], ['logonID', xgt.INT],
        ['processName', xgt.TEXT], ['processID', xgt.INT],
        ['parentProcessName', xgt.TEXT], ['parentProcessID', xgt.INT],
    ]
    events1v = conn.create_edge_frame(
        name='Events1v',
        schema=events1v_schema,
        source=devices,
        target=devices,
        source_key='logHost',
        target_key='logHost')
events1v

<xgt.graph.EdgeFrame at 0x7f32743827f0>

In [27]:
# Helper that reports how much data each xGT frame currently holds
def print_data_summary():
    """Print one line per frame with its current vertex or edge count.

    Reads the notebook-level `devices`, `netflow`, and `events1v` frames and
    prints each count with thousands separators. Returns nothing.
    """
    report = (
        ('Devices (vertices): {:,}', devices, 'num_vertices'),
        ('Netflow (edges): {:,}', netflow, 'num_edges'),
        ('Host event (edges): {:,}', events1v, 'num_edges'),
    )
    for template, frame, size_attr in report:
        # Look the size up right before printing so each line reflects a fresh read.
        print(template.format(getattr(frame, size_attr)))

print_data_summary()

Devices (vertices): 933,314
Netflow (edges): 17,882,795,024
Host event (edges): 1,468,936,024


In [28]:
%%time
# Load the host event data only when the frame is still empty, so re-running
# this cell against an already-populated server does nothing.
if events1v.num_edges == 0:
    # Single-day alternatives over HTTPS, kept for reference:
    #   https://datasets.trovares.com/LANL/xgt/wls_day-04_1v.csv
    #   https://datasets.trovares.com/LANL/xgt/wls_day-85_1v.csv
    # One file per day, days 02 through 90 inclusive.
    # NOTE(review): the xgtd:// scheme presumably names files local to the server — confirm.
    urls = ["xgtd://wls_day-{:02d}_1v.csv".format(day) for day in range(2, 91)]
    events1v.load(urls)

CPU times: user 0 ns, sys: 1.02 ms, total: 1.02 ms
Wall time: 614 µs


In [29]:
%%time
# Load the netflow data only when the frame is still empty, so re-running
# this cell against an already-populated server does nothing.
if netflow.num_edges == 0:
    # Single-day alternatives over HTTPS, kept for reference:
    #   https://datasets.trovares.com/LANL/xgt/nf_day-04.csv
    #   https://datasets.trovares.com/LANL/xgt/nf_day-85.csv
    # One file per day, days 02 through 90 inclusive.
    # NOTE(review): the xgtd:// scheme presumably names files local to the server — confirm.
    urls = ["xgtd://nf_day-{:02d}.csv".format(day) for day in range(2, 91)]
    netflow.load(urls)

CPU times: user 807 µs, sys: 183 µs, total: 990 µs
Wall time: 541 µs


In [30]:
# Build an empty edge frame that will hold only the C2 (Command and Control) edges.
# Any existing 'C2flow' frame is dropped first so the frame starts out fresh on
# every run. It reuses the Netflow schema and the same Devices endpoints and keys.
conn.drop_frame('C2flow')
c2flow = conn.create_edge_frame(
    name='C2flow', schema=netflow.schema,
    source=devices, target=devices,
    source_key='srcDevice', target_key='dstDevice')
c2flow

<xgt.graph.EdgeFrame at 0x7f3274380d68>

In [31]:
# Utility method to execute a query and fetch its result table

def run_query(query, table_name="answers", show_query=False):
    """Run a query on the server and return its results as a table frame.

    An ``INTO <table_name>`` clause is appended to ``query`` so the results
    land in a named table frame, which is then fetched and returned. Any
    existing frame called ``table_name`` is dropped first.

    Args:
        query: Query text without an ``INTO`` clause. A trailing newline is
            added if missing.
        table_name: Name of the table frame that receives the results.
        show_query: When true, print the final query text before running it.

    Returns:
        The table frame returned by ``conn.get_table_frame(table_name)``.

    Raises:
        ValueError: If ``query`` is empty. This is raised before the existing
            results frame is dropped.
    """
    if not query:
        # Validate first: indexing an empty query used to fail with IndexError
        # only after the previous results frame had already been dropped.
        raise ValueError("query must be a non-empty string")
    conn.drop_frame(table_name)
    if not query.endswith('\n'):
        query += '\n'
    query += 'INTO {}'.format(table_name)
    if show_query:
        print("Query:\n" + query)
    job = conn.schedule_job(query)
    print("Launched job {}".format(job.id))
    # Block until the job is done so the results frame can be fetched.
    conn.wait_for_job(job)
    return conn.get_table_frame(table_name)

In [32]:
%%time

# "Forward" edges: Netflow edges whose dstPort is 3128. Each one is copied into
# the C2flow edge frame with the same endpoints and unchanged property values.
q = """
MATCH (v0)-[edge:Netflow]->(v1)
WHERE edge.dstPort=3128
CREATE (v0)-[e:C2flow {epochtime : edge.epochtime,
  duration : edge.duration, protocol : edge.protocol,
  srcPort : edge.srcPort, dstPort : edge.dstPort,
  srcPackets : edge.srcPackets, dstPackets : edge.dstPackets,
  srcBytes : edge.srcBytes, dstBytes : edge.dstBytes}]->(v1)
RETURN count(*)
"""
r = run_query(q)
# The result table has a single row and column holding the count.
print('Number of answers: {:,}'.format(r.get_data()[0][0]))

Launched job 20
Number of answers: 4,249
CPU times: user 87.7 ms, sys: 32.4 ms, total: 120 ms
Wall time: 59.6 s


In [33]:
%%time

# "Reverse" edges: Netflow edges whose srcPort is 3128. Each one is added to the
# C2flow frame flipped: the edge runs from v1 to v0 and the src/dst port, packet,
# and byte properties are swapped, so the 3128 port ends up as dstPort.
q = """
MATCH (v0)-[edge:Netflow]->(v1)
WHERE edge.srcPort=3128
CREATE (v1)-[e:C2flow {epochtime : edge.epochtime,
  duration : edge.duration, protocol : edge.protocol,
  srcPort : edge.dstPort, dstPort : edge.srcPort,
  srcPackets : edge.dstPackets, dstPackets : edge.srcPackets,
  srcBytes : edge.dstBytes, dstBytes : edge.srcBytes}]->(v0)
RETURN count(*)
"""
r = run_query(q)
# The result table has a single row and column holding the count.
print('Number of answers: {:,}'.format(r.get_data()[0][0]))

Launched job 23
Number of answers: 261,834
CPU times: user 87.8 ms, sys: 24 ms, total: 112 ms
Wall time: 59.6 s


In [34]:
%%time

# First part of the pattern: a boot event and a program-start event on the same
# device A, followed by a C2flow edge (from the preprocessing above) from A to a
# different device B.
# The events must occur in order (boot, then program start, then C2 flow), and the
# last condition bounds the whole sequence: the C2 flow must begin fewer than
# 4 epochtime units after the boot.

q = """
MATCH (A)-[boot:Events1v]->(A)-[program:Events1v]->(A)
        -[c2:C2flow]->(B)
WHERE A <> B
  AND boot.eventID = 4608
  AND program.eventID = 4688
  AND program.epochtime >= boot.epochtime
  AND c2.epochtime >= program.epochtime
  AND c2.epochtime - boot.epochtime < 4
RETURN COUNT(*)
"""

r = run_query(q)
# The result table has a single row and column holding the count.
print('Number of boot, programstart, & c2 events: {:,}'.format(r.get_data()[0][0]))

Launched job 26
Number of boot, programstart, & c2 events: 5,084
CPU times: user 191 ms, sys: 56.7 ms, total: 248 ms
Wall time: 3min 34s


In [35]:
%%time

# Zombie-Reboot pattern: the boot -> program start -> C2 flow sequence from device A
# to device B, extended with a Netflow edge nf2 from B to a third device C.
# The three devices must be distinct. The nf2 flow must last at least 3600
# epochtime units, start before A's boot, and still be active when the C2 flow begins.
q = """
MATCH (A)-[boot:Events1v]->(A)-[program:Events1v]->(A)
    -[c2:C2flow]->(B)-[nf2:Netflow]->(C)
WHERE A <> B AND B <> C AND A <> C
  AND boot.eventID = 4608
  AND program.eventID = 4688
  AND program.epochtime >= boot.epochtime
  AND c2.epochtime >= program.epochtime
  AND c2.epochtime - boot.epochtime < 4
  AND nf2.duration >= 3600
  AND nf2.epochtime < boot.epochtime
  AND nf2.epochtime + nf2.duration >= c2.epochtime
RETURN COUNT(*)
"""

r = run_query(q)
# The result table has a single row and column holding the count.
print('Number of zombie reboot events: {:,}'.format(r.get_data()[0][0]))

Launched job 29
Number of zombie reboot events: 986,440
CPU times: user 4.77 s, sys: 1.41 s, total: 6.18 s
Wall time: 54min 36s
