In [1]:
import xgt
import os
import pandas

from platform import python_version
print (python_version())

3.7.4


In [2]:
if os.environ.get('https_proxy'):
 del os.environ['https_proxy']
if os.environ.get('http_proxy'):
 del os.environ['http_proxy']

In [3]:
conn=xgt.Connection()
conn.server_version

'1.3.0'

In [4]:
try:
  devices = conn.get_vertex_frame('Devices')
except xgt.XgtNameError:
  devices = conn.create_vertex_frame(
      name='Devices',
      schema=[['device', xgt.TEXT]],
      key='device')
devices

<xgt.graph.VertexFrame at 0x7f01e3628890>

In [5]:
try:
  netflow = conn.get_edge_frame('Netflow')
except xgt.XgtNameError:
  netflow = conn.create_edge_frame(
      name='Netflow',
      schema=[['epoch_time', xgt.INT],
              ['duration', xgt.INT],
              ['src_device', xgt.TEXT],
              ['dst_device', xgt.TEXT],
              ['protocol', xgt.INT],
              ['src_port', xgt.INT],
              ['dst_port', xgt.INT],
              ['src_packets', xgt.INT],
              ['dst_packets', xgt.INT],
              ['src_bytes', xgt.INT],
              ['dst_bytes', xgt.INT]],
      source=devices,
      target=devices,
      source_key='src_device',
      target_key='dst_device')
netflow

<xgt.graph.EdgeFrame at 0x7f01e127a410>

In [6]:
try:
  host_events = conn.get_edge_frame('HostEvents')
except xgt.XgtNameError:
  host_events = conn.create_edge_frame(
      name='HostEvents',
      schema=[['epoch_time', xgt.INT],
              ['event_id', xgt.INT],
              ['log_host', xgt.TEXT],
              ['user_name', xgt.TEXT],
              ['domain_name', xgt.TEXT],
              ['logon_id', xgt.INT],
              ['process_name', xgt.TEXT],
              ['process_id', xgt.INT],
              ['parent_process_name', xgt.TEXT],
              ['parent_process_id', xgt.INT]],
           source=devices,
           target=devices,
           source_key='log_host',
           target_key='log_host')
host_events

<xgt.graph.EdgeFrame at 0x7f01e127ad10>

In [7]:
try:
  auth_events = conn.get_edge_frame('AuthEvents')
except xgt.XgtNameError:
  auth_events = conn.create_edge_frame(
           name='AuthEvents',
           schema = [['epoch_time',xgt.INT],
                     ['event_id',xgt.INT],
                     ['log_host',xgt.TEXT],
                     ['logon_type',xgt.INT],
                     ['logon_type_description',xgt.TEXT],
                     ['user_name',xgt.TEXT],
                     ['domain_name',xgt.TEXT],
                     ['logon_id',xgt.INT],
                     ['subject_user_name',xgt.TEXT],
                     ['subject_domain_name',xgt.TEXT],
                     ['subject_logon_id',xgt.TEXT],
                     ['status',xgt.TEXT],
                     ['src',xgt.TEXT],
                     ['service_name',xgt.TEXT],
                     ['destination',xgt.TEXT],
                     ['authentication_package',xgt.TEXT],
                     ['failure_reason',xgt.TEXT],
                     ['process_name',xgt.TEXT],
                     ['process_id',xgt.INT],
                     ['parent_process_name',xgt.TEXT],
                     ['parent_process_id',xgt.INT]],
            source = 'Devices',
            target = 'Devices',
            source_key = 'src',
            target_key = 'destination')
auth_events

<xgt.graph.EdgeFrame at 0x7f01e128ef50>

In [8]:
# Utility to print the sizes of data currently in xGT
def print_data_summary():
  print('Devices (vertices): {:,}'.format(devices.num_vertices))
  print('Netflow (edges): {:,}'.format(netflow.num_edges))
  print('Host events (edges): {:,}'.format(host_events.num_edges))
  print('Authentication events (edges): {:,}'.format(auth_events.num_edges))
  print('Total (edges): {:,}'.format(
      netflow.num_edges + host_events.num_edges + auth_events.num_edges))
    
print_data_summary()

Devices (vertices): 12,288
Netflow (edges): 0
Host events (edges): 18,637,483
Authentication events (edges): 47,790,045
Total (edges): 66,427,528


In [9]:
%%time

# Load the HostEvents event data:
if host_events.num_edges == 0:
    # urls = ["https://datasets.trovares.com/LANL/xgt/wls_day-85_1v.csv"]
    urls = ["xgtd://nvme_data1/data_1v/wls_day-{:02d}_1v.csv".format(_) for _ in range(2,91)]
    # urls = ["xgtd://data_1v/wls_day-11_1v.csv"]
    host_events.load(urls)
    print_data_summary()

CPU times: user 713 µs, sys: 0 ns, total: 713 µs
Wall time: 615 µs


In [10]:
%%time

# Load the AuthEvents event data:
if auth_events.num_edges == 0:
    # urls = ["https://datasets.trovares.com/LANL/xgt/wls_day-85_2v.csv"]
    urls = ["xgtd://nvme_data9/data_2v/wls_day-{:02d}_2v.csv".format(_) for _ in range(2,91)]
    #urls = ["xgtd://data_2v/wls_day-11_2v.csv"]
    auth_events.load(urls)
    print_data_summary()

Devices (vertices): 18,925
Netflow (edges): 0
Host events (edges): 1,468,936,024
Authentication events (edges): 4,022,436,222
Total (edges): 5,491,372,246
CPU times: user 588 ms, sys: 374 ms, total: 962 ms
Wall time: 4min 52s


In [11]:
%%time

# Load the netflow data:
if netflow.num_edges == 0:
    #urls = ["https://datasets.trovares.com/LANL/xgt/nf_day-85.csv"]
    urls = ["xgtd://nvme_data1/data_nf/nf_day-{:02d}.csv".format(_) for _ in range(2,91)]
    # urls = ["xgtd://nvme_data1/data_nf/nf_day-11.csv"]
    netflow.load(urls)
    print_data_summary()

Devices (vertices): 933,714
Netflow (edges): 17,882,795,024
Host events (edges): 1,468,936,024
Authentication events (edges): 4,022,436,222
Total (edges): 23,374,167,270
CPU times: user 1.55 s, sys: 990 ms, total: 2.54 s
Wall time: 14min 47s


In [12]:
# Utility function to launch queries and show job number:
#   The job number may be useful if a long-running job needs
#   to be canceled.

def run_query(query, table_name = "answers", drop_answer_table=True, show_query=False):
    if drop_answer_table:
        conn.drop_frame(table_name)
    if query[-1] != '\n':
        query += '\n'
    query += 'INTO {}'.format(table_name)
    if show_query:
        print("Query:\n" + query)
    job = conn.schedule_job(query)
    print("Launched job {}".format(job.id))
    conn.wait_for_job(job)
    table = conn.get_table_frame(table_name)
    return table

In [13]:
# Generate a new edge frame for holding only the tgt_req_events edges
import time
query_start_time = time.time()

conn.drop_frame('tgt_req_events')
TGT_Req = conn.create_edge_frame(
            name='tgt_req_events',
            schema=auth_events.schema,
            source=devices,
            target=devices,
            source_key='src',
            target_key='destination')
TGT_Req

<xgt.graph.EdgeFrame at 0x7fb35d3a54d0>

In [None]:
# Generate a new edge frame for holding only the service_ticket_req edges
import time
query_start_time = time.time()

conn.drop_frame('service_ticket_req')
Service_Req = conn.create_edge_frame(
            name='service_ticket_req',
            schema=auth_events.schema,
            source=devices,
            target=devices,
            source_key='src',
            target_key='destination')
Service_Req

In [14]:
%%time

#Filter out all the Nodes in which 4768(TGT request has been logged)
q = """
MATCH (n1:Devices)-[r:AuthEvents]->[n2:Devices]
WHERE r.event_id = 4768
CREATE (n1)-[r1:tgt_req_events {epoch_time:r.epoch_time, event_id:r.event_id,
 log_host:r.log_host, logon_type:r.logon_type,
 logon_type_description:r.logon_type_description, user_name:r.user_name,
 domain_name:r.domain_name, logon_id:r.logon_id,
 subject_user_name:r.subject_user_name, subject_domain_name:r.subject_domain_name,
 subject_logon_id:r.subject_logon_id, status:r.status,
 src:r.src, service_name:r.service_name,
 destination:r.destination, authentication_package:r.authentication_package,
 failure_reason: r.failure_reason, process_name:r.process_name,
 process_id:r.process_id, parent_process_name:r.parent_process_name,
 parent_process_id:r.parent_process_id}]->(n2)
RETURN count(*)
"""
data = run_query(q)
print('Number of answers: {:,}'.format(data.get_data()[0][0]))

Launched job 3914
Number of answers: 2,176
CPU times: user 304 ms, sys: 165 ms, total: 469 ms
Wall time: 1min 53s


In [15]:
%%time

#Filter out all the Nodes in which 4769(Service ticket request has been logged)
q = """
MATCH (n1:Devices)-[r:AuthEvents]->[n2:Devices]
WHERE r.event_id = 4769
CREATE (n1)-[r1:service_ticket_req {epoch_time:r.epoch_time, event_id:r.event_id,
 log_host:r.log_host, logon_type:r.logon_type,
 logon_type_description:r.logon_type_description, user_name:r.user_name,
 domain_name:r.domain_name, logon_id:r.logon_id,
 subject_user_name:r.subject_user_name, subject_domain_name:r.subject_domain_name,
 subject_logon_id:r.subject_logon_id, status:r.status,
 src:r.src, service_name:r.service_name,
 destination:r.destination, authentication_package:r.authentication_package,
 failure_reason: r.failure_reason, process_name:r.process_name,
 process_id:r.process_id, parent_process_name:r.parent_process_name,
 parent_process_id:r.parent_process_id}]->(n2)
RETURN count(*)
"""
data = run_query(q)
print('Number of answers: {:,}'.format(data.get_data()[0][0]))

Launched job 5036
Number of answers: 755,260
CPU times: user 58.3 ms, sys: 42 ms, total: 100 ms
Wall time: 53 s


In [16]:
data=None
if service_ticket_req.num_edges == 0:
    print("service_ticket_req is empty")
elif service_ticket_req.num_edges <= 1000:
    data = service_ticket_req.get_data_pandas()
else:
    data = 'service_ticket_req (edges): {:,}'.format(service_ticket_req.num_edges)
data

'RDPflow (edges): 757,436'

In [17]:
# Utility to print the data sizes currently in xGT
def print_netflow_data_summary():
  print_data_summary()
  print('service_ticket_req (edges): {:,}'.format(service_ticket_req.num_edges))

print_netflow_data_summary()

Devices (vertices): 933,714
Netflow (edges): 17,882,795,024
Host events (edges): 1,468,936,024
Authentication events (edges): 4,022,436,222
Total (edges): 23,374,167,270
RDPFlow (edges): 757,436


In [None]:
# Delete all the service_ticket_req relationships which does not have tgt_req_events with in the interval

q = """
MATCH (n1:Devices)-[r1:service_ticket_req]->(n2:Devices),
      (n1)-[r2:tgt_req_events]->(n2)
WHERE r1.src = r2.src
AND r1.epoch_time > r2.epoch_time
AND r1.epoch_time - r2.epoch_time < 60
DELETE r1, r2
RETURN count(*)
"""
answer_table = run_query(q)
print('Number of answers: {:,}'.format(answer_table.num_rows))

In [18]:
%%time

#Lateral Movement Query
#time_threshold_between_step = 3600   # one hour
#time_threshold_hijack = 180          # three minutes
#time_threshold_one_step = 480        # eight minutes
q = """
MATCH (n1:Devices)-[r1:service_ticket_req]->(n2:Devices),
(n2)-[event:HostEvents]->(n2),
(n1)-[r3:AuthEvents]->(n3:Devices),
(n1)-[r4:Netflow]->(n3)
WHERE event.event_id = 4688 AND event.process_name = "lsass.exe"
AND r3.event_id = 4624
AND r3.src = r1.src
AND r3.destination = r1.service_name
AND r3.authentication_package = "Kerberos"
AND r1.epoch_time < r3.epoch_time
AND r3.epoch_time - r1.epoch_time < 60
AND r4.src_device = r1.src
AND r4.dst_device = r1.service_name
AND r3.epoch_time < r4.epoch_time
AND r4.epoch_time - r3.epoch_time < 20
AND r4.duration < 300
RETURN DISTINCT r1.epoch_time, r1.log_host, r1.src, r1.service_name, r1.authentication_package, r3.epoch_time, r4.epoch_time, r4.duration
"""
answer_table = run_query(q)
print('Number of answers: {:,}'.format(answer_table.num_rows))

Launched job 5547
Number of answers: 57,057
CPU times: user 296 ms, sys: 209 ms, total: 505 ms
Wall time: 7min 48s


In [19]:
%%time
# Build HijackEvents table

import time
start_optimized_query_time = time.time()

conn.drop_frame('HijackEvents')
hijack_events = conn.create_edge_frame(
    name   ='HijackEvents',
    schema = [['epoch_time', xgt.INT],
              ['src_host', xgt.TEXT],
              ['dst_host', xgt.TEXT]],
    source = devices,
    target = devices,
    source_key = 'src_host',
    target_key = 'dst_host')

query = """
MATCH (v0)-[edge:HostEvents]->(v0)
WHERE edge.process_name = "Proc249569.exe"
  AND edge.event_id = 4688
CREATE (v0)-[e:HijackEvents { epoch_time : edge.epoch_time }]->(v0)
RETURN count(*)
"""
run_query(query)
print('HijackEvents (edges): {:,}'.format(hijack_events.num_edges))

Launched job 10151
HijackEvents (edges): 11,715,150
CPU times: user 42.9 ms, sys: 46 ms, total: 88.9 ms
Wall time: 40.8 s


In [20]:
%%time
# Build a PrivEscEvents table

conn.drop_frame('PrivEscEvents')
priv_esc_events = conn.create_edge_frame(
    name   ='PrivEscEvents',
    schema = [['epoch_time', xgt.INT],
              ['src_host', xgt.TEXT],
              ['dst_host', xgt.TEXT]],
    source = devices,
    target = devices,
    source_key = 'src_host',
    target_key = 'dst_host')

query = """
MATCH (v0)-[edge:HostEvents]->(v0)
WHERE edge.process_name = "Proc336322.exe" OR
      edge.process_name = "Proc695356.exe"
  AND edge.event_id = 4688
CREATE (v0)-[e:PrivEscEvents { epoch_time : edge.epoch_time }]->(v0)
RETURN count(*)
"""
run_query(query)
print('PrivEscEvents (edges): {:,}'.format(priv_esc_events.num_edges))

Launched job 10398
PrivEscEvents (edges): 8,695,220
CPU times: user 27 ms, sys: 8.8 ms, total: 35.8 ms
Wall time: 17.4 s


In [21]:
%%time
# Now run the lateral movement query using these new index tables

q = """
MATCH (A)-[rdp1:RDPFlow]->(B)-[rdp2:RDPFlow]->(C),
      (A)-[hijack1:HijackEvents]->(A)-[priv_esc1:PrivEscEvents]->(A),
      (B)-[hijack2:HijackEvents]->(B)-[priv_esc2:PrivEscEvents]->(B)
WHERE A <> B AND B <> C AND A <> C 
  // Check time constraints on the overall pattern
  AND rdp1.epoch_time <= rdp2.epoch_time
  AND rdp2.epoch_time - rdp1.epoch_time < {0}

  // Check time constraints on step from A to B
  AND priv_esc1.epoch_time <= hijack1.epoch_time
  AND hijack1.epoch_time <= rdp1.epoch_time
  AND rdp1.epoch_time - hijack1.epoch_time < {1}
  AND rdp1.epoch_time - priv_esc1.epoch_time < {2}

  // Check time constraints on step from B to C
  AND priv_esc2.epoch_time <= hijack2.epoch_time
  AND hijack2.epoch_time <= rdp2.epoch_time
  AND rdp2.epoch_time - hijack2.epoch_time < {1}
  AND rdp2.epoch_time - priv_esc2.epoch_time < {2}
RETURN rdp1.src_device, rdp1.dst_device, rdp1.epoch_time, rdp2.dst_device, rdp2.epoch_time
""".format(time_threshold_between_step, time_threshold_hijack, time_threshold_one_step)
start_timer = time.time()
answer_table = run_query(q)
end_timer = time.time()
query_end_time = time.time()
print('Number of answers: {:,}'.format(answer_table.num_rows))

print("Time for query: {:,.2f}".format(end_timer - start_timer))
print("Overall time for query: {:,.2f}".format(query_end_time - query_start_time))

Launched job 10402
Number of answers: 57,057
Time for query: 4.45
Overall time for query: 697.91
CPU times: user 25.6 ms, sys: 6.39 ms, total: 32 ms
Wall time: 4.45 s


In [22]:
# retrieve the answer rows to the client in a pandas frame
data = answer_table.get_data_pandas()
data[0:10]

Unnamed: 0,rdp1_src_device,rdp1_dst_device,rdp1_epoch_time,rdp2_dst_device,rdp2_epoch_time
0,ActiveDirectory,EnterpriseAppServer,4374441,Comp872390,4375296
1,ActiveDirectory,EnterpriseAppServer,4374441,Comp872390,4375296
2,ActiveDirectory,EnterpriseAppServer,4374441,Comp872390,4375296
3,ActiveDirectory,EnterpriseAppServer,4374441,Comp872390,4375296
4,ActiveDirectory,EnterpriseAppServer,4374441,Comp872390,4375296
5,ActiveDirectory,EnterpriseAppServer,4374441,Comp872390,4375296
6,ActiveDirectory,EnterpriseAppServer,4374441,Comp872390,4375296
7,ActiveDirectory,EnterpriseAppServer,1827566,Comp005825,1830021
8,ActiveDirectory,EnterpriseAppServer,4374441,Comp872390,4375296
9,ActiveDirectory,EnterpriseAppServer,4374441,Comp872390,4375296
