In [1]:
import xgt
import os

In [2]:
# Remove any non-empty proxy settings from this process's environment
# (presumably so the xGT connection made below is not routed through a proxy).
for proxy_var in ('https_proxy', 'http_proxy'):
    if os.environ.get(proxy_var):
        del os.environ[proxy_var]

In [3]:
# Open a connection to the xGT server using the client's default settings,
# then display the server version as this cell's output.
conn = xgt.Connection()
conn.server_version

'1.2.0'

In [4]:
# Get-or-create the vertex frame named "Devices": reuse it when the server
# already has it, otherwise create it with a single TEXT key column.
try:
    devices = conn.get_vertex_frame('Devices')
except xgt.XgtNameError:
    # The lookup failed by name, so the frame does not exist yet.
    devices = conn.create_vertex_frame(
        name='Devices',
        schema=[['device', xgt.TEXT]],
        key='device',
    )
devices

<xgt.graph.VertexFrame at 0x7fc3ee37e5f8>

In [5]:
# Get-or-create the edge frame named "Netflow". Both endpoints are vertices
# of the Devices frame, matched through the srcDevice / dstDevice columns.
try:
    netflow = conn.get_edge_frame('Netflow')
except xgt.XgtNameError:
    # Column order matters: it is the schema the frame is created with.
    netflow_schema = [
        ['epochtime', xgt.INT], ['duration', xgt.INT],
        ['srcDevice', xgt.TEXT], ['dstDevice', xgt.TEXT],
        ['protocol', xgt.INT],
        ['srcPort', xgt.INT], ['dstPort', xgt.INT],
        ['srcPackets', xgt.INT], ['dstPackets', xgt.INT],
        ['srcBytes', xgt.INT], ['dstBytes', xgt.INT],
    ]
    netflow = conn.create_edge_frame(
        name='Netflow',
        schema=netflow_schema,
        source=devices, target=devices,
        source_key='srcDevice', target_key='dstDevice',
    )
netflow

<xgt.graph.EdgeFrame at 0x7fc3ee38c828>

In [6]:
# Get-or-create the edge frame named "Events1v". Source and target are both
# keyed on logHost, so every edge starts and ends on the same Devices vertex.
try:
    events1v = conn.get_edge_frame('Events1v')
except xgt.XgtNameError:
    events1v_schema = [
        ['epochtime', xgt.INT], ['eventID', xgt.INT],
        ['logHost', xgt.TEXT],
        ['userName', xgt.TEXT], ['domainName', xgt.TEXT],
        ['logonID', xgt.INT],
        ['processName', xgt.TEXT], ['processID', xgt.INT],
        ['parentProcessName', xgt.TEXT], ['parentProcessID', xgt.INT],
    ]
    events1v = conn.create_edge_frame(
        name='Events1v',
        schema=events1v_schema,
        source=devices, target=devices,
        source_key='logHost', target_key='logHost',
    )
events1v

<xgt.graph.EdgeFrame at 0x7fc3ee38ce80>

In [7]:
# Utility to print the sizes of data currently in xGT
def print_data_summary():
  """Print the Devices vertex count, the Netflow and Events1v edge counts,
  and the combined edge count.

  Reads the notebook-level ``devices``, ``netflow`` and ``events1v`` frames.
  Each count is read exactly once, so the "Total" line always equals the sum
  of the two edge lines printed above it.
  """
  device_count = devices.num_vertices
  netflow_count = netflow.num_edges
  events_count = events1v.num_edges
  print('Devices (vertices): {:,}'.format(device_count))
  print('Netflow (edges): {:,}'.format(netflow_count))
  print('Host event 1-vertex (edges): {:,}'.format(events_count))
  print('Total (edges): {:,}'.format(netflow_count + events_count))
    
print_data_summary()

Devices (vertices): 0
Netflow (edges): 0
Host event 1-vertex (edges): 0
Total (edges): 0


In [8]:
%%time
# Load the 1-sided host event data; skipped when Events1v already holds edges,
# so re-running the cell does not load the files a second time.
if events1v.num_edges == 0:
    # Single-file alternative:
    # urls = ["https://datasets.trovares.com/LANL/xgt/wls_day-85_1v.csv"]
    urls = [
        "xgtd://wls_day-{:02d}_1v.csv".format(day)
        for day in range(2, 91)   # day files 02 through 90
    ]
    events1v.load(urls)
    print_data_summary()

Devices (vertices): 13,491
Netflow (edges): 0
Host event 1-vertex (edges): 1,468,936,024
Total (edges): 1,468,936,024
CPU times: user 495 ms, sys: 110 ms, total: 605 ms
Wall time: 3min 50s


In [9]:
%%time
# Load the netflow data; skipped when Netflow already holds edges,
# so re-running the cell does not load the files a second time.
if netflow.num_edges == 0:
    # Single-file alternative:
    # urls = ["https://datasets.trovares.com/LANL/xgt/nf_day-85.csv"]
    urls = [
        "xgtd://nf_day-{:02d}.csv".format(day)
        for day in range(2, 91)   # day files 02 through 90
    ]
    netflow.load(urls)
    print_data_summary()

Devices (vertices): 932,165
Netflow (edges): 17,882,795,024
Host event 1-vertex (edges): 1,468,936,024
Total (edges): 19,351,731,048
CPU times: user 1.62 s, sys: 515 ms, total: 2.13 s
Wall time: 15min 16s


In [10]:
# Utility function to launch queries and show job number:
#   The job number may be useful if a long-running job needs
#   to be canceled.

def run_query(query, table_name = "answers", drop_answer_table=True, show_query=False):
    """Run ``query`` with its results stored in a table frame; return that frame.

    An ``INTO <table_name>`` clause is appended to the query text (on its own
    line), the job is scheduled on the notebook-level ``conn``, its id is
    printed, and the call waits for the job before fetching the table frame.

    Args:
        query: Query text without an ``INTO`` clause. Must contain something
            other than whitespace.
        table_name: Name of the table frame that receives the results.
        drop_answer_table: When true, ``conn.drop_frame(table_name)`` is
            called before the query is launched.
        show_query: When true, print the final query text before launching.

    Returns:
        The frame returned by ``conn.get_table_frame(table_name)``.

    Raises:
        ValueError: If ``query`` is empty or only whitespace. This is checked
            before anything is dropped, so a bad call leaves the existing
            answer table untouched.
    """
    if not query or not query.strip():
        raise ValueError("query must be a non-empty string")
    if drop_answer_table:
        conn.drop_frame(table_name)
    # Keep the INTO clause on its own line.
    if not query.endswith('\n'):
        query += '\n'
    query += 'INTO {}'.format(table_name)
    if show_query:
        print("Query:\n" + query)
    job = conn.schedule_job(query)
    print("Launched job {}".format(job.id))
    conn.wait_for_job(job)
    return conn.get_table_frame(table_name)

In [11]:
# Build a fresh edge frame for holding only the RDP edges: drop any frame
# already named "RDPflow", then recreate it with the same schema and the same
# Devices endpoints as Netflow.
conn.drop_frame('RDPflow')
rdpflow = conn.create_edge_frame(
    name='RDPflow',
    schema=netflow.schema,
    source=devices, target=devices,
    source_key='srcDevice', target_key='dstDevice',
)
rdpflow

<xgt.graph.EdgeFrame at 0x7fc3ee37e9e8>

In [12]:
%%time
# Extract forward RDP edges.
# For every Netflow edge whose dstPort is 3389, the query creates an RDPflow
# edge from the edge's srcDevice vertex (v0) to its dstDevice vertex (v1),
# copying the remaining properties unchanged, and returns how many it matched.
q = """
MATCH ()-[edge:Netflow]->()
WHERE edge.dstPort=3389
MERGE (v0: Devices { device : edge.srcDevice })
MERGE (v1: Devices { device : edge.dstDevice })
CREATE (v0)-[e:RDPflow {epochtime : edge.epochtime,
  duration : edge.duration, protocol : edge.protocol,
  srcPort : edge.srcPort, dstPort : edge.dstPort,
  srcPackets : edge.srcPackets, dstPackets : edge.dstPackets,
  srcBytes : edge.srcBytes, dstBytes : edge.dstBytes}]->(v1)
RETURN count(*)
"""
# run_query appends "INTO answers"; the count is the single cell of that table.
data = run_query(q)
print('Number of answers: {:,}'.format(data.get_data()[0][0]))

Launched job 6
Number of answers: 2,176
CPU times: user 228 ms, sys: 49 ms, total: 277 ms
Wall time: 25.6 s


In [13]:
%%time
# Extract reverse RDP edges.
# For every Netflow edge whose srcPort is 3389, the query creates an RDPflow
# edge in the opposite direction (from the dstDevice vertex v1 to the
# srcDevice vertex v0) and swaps the src/dst port, packet and byte properties
# to match, so the stored edge has dstPort 3389 like the forward edges.
q = """
MATCH ()-[edge:Netflow]->()
WHERE edge.srcPort=3389
MERGE (v0: Devices { device : edge.srcDevice })
MERGE (v1: Devices { device : edge.dstDevice })
CREATE (v1)-[e:RDPflow {epochtime : edge.epochtime,
  duration : edge.duration, protocol : edge.protocol,
  srcPort : edge.dstPort, dstPort : edge.srcPort,
  srcPackets : edge.dstPackets, dstPackets : edge.srcPackets,
  srcBytes : edge.dstBytes, dstBytes : edge.srcBytes}]->(v0)
RETURN count(*)
"""
# run_query appends "INTO answers"; the count is the single cell of that table.
data = run_query(q)
print('Number of answers: {:,}'.format(data.get_data()[0][0]))

Launched job 9
Number of answers: 755,260
CPU times: user 161 ms, sys: 48.4 ms, total: 209 ms
Wall time: 25.5 s


In [14]:
# Preview the resulting RDPflow frame: show the edges themselves when there
# are at most 1000 of them, otherwise show only the edge count.
if rdpflow.num_edges > 1000:
    data = 'RDPflow (edges): {:,}'.format(rdpflow.num_edges)
elif rdpflow.num_edges > 0:
    data = rdpflow.get_data_pandas()
else:
    data = None
    print("RDPflow is empty")
data

'RDPflow (edges): 757,436'

In [15]:
# Utility to print the data sizes currently in xGT
def print_netflow_data_summary():
  """Print the output of print_data_summary(), then the RDPflow edge count."""
  print_data_summary()
  rdp_line = 'RDPflow (edges): {:,}'.format(rdpflow.num_edges)
  print(rdp_line)

print_netflow_data_summary()

Devices (vertices): 932,165
Netflow (edges): 17,882,795,024
Host event 1-vertex (edges): 1,468,936,024
Total (edges): 19,351,731,048
RDPflow (edges): 757,436


In [None]:
%%time
# Search for a two-hop RDP chain A -> B -> C in RDPflow where both A and B
# also carry two Events1v self-loop edges (hijack*, privEsc*) with eventID 4688
# and specific process names, all within the time windows below.
# (eventID 4688 is presumably the Windows "process creation" event -- confirm
# against the dataset documentation.)
#
# Thresholds are in the same units as the epochtime column and are substituted
# into the query text by str.format:
#   {0} -> time_threshold_between_step  (max gap between rdp1 and rdp2)
#   {1} -> time_threshold_hijack        (max gap between hijack and its rdp edge)
#   {2} -> time_threshold_one_step      (max gap between privEsc and its rdp edge)
time_threshold_between_step = 3600   # one hour
time_threshold_hijack = 180          # three minutes
time_threshold_one_step = 480        # eight minutes
q = """
MATCH (A)-[rdp1:RDPflow]->(B)-[rdp2:RDPflow]->(C),
      (A)-[hijack1:Events1v]->(A)-[privEsc1:Events1v]->(A),
      (B)-[hijack2:Events1v]->(B)-[privEsc2:Events1v]->(B)
WHERE A <> B AND B <> C AND A <> C 
  AND privEsc1.eventID = 4688 
  AND (privEsc1.processName = "Proc336322.exe" OR privEsc1.processName = "Proc695356.exe")
  AND hijack1.eventID = 4688 AND hijack1.processName = "Proc249569.exe"
  AND privEsc2.eventID = 4688 
  AND (privEsc2.processName = "Proc336322.exe" OR privEsc2.processName = "Proc695356.exe")
  AND hijack2.eventID = 4688 AND hijack2.processName = "Proc249569.exe"

  // Check time constraints on the overall pattern
  AND rdp1.epochtime <= rdp2.epochtime
  AND rdp2.epochtime - rdp1.epochtime < {0}

  // Check time constraints on step from A to B
  AND privEsc1.epochtime <= hijack1.epochtime
  AND hijack1.epochtime <= rdp1.epochtime
  AND rdp1.epochtime - hijack1.epochtime < {1}
  AND rdp1.epochtime - privEsc1.epochtime < {2}

  // Check time constraints on step from B to C
  AND privEsc2.epochtime <= hijack2.epochtime
  AND hijack2.epochtime <= rdp2.epochtime
  AND rdp2.epochtime - hijack2.epochtime < {1}
  AND rdp2.epochtime - privEsc2.epochtime < {2}
RETURN rdp1.srcDevice, rdp1.dstDevice, rdp1.epochtime, rdp2.dstDevice, rdp2.epochtime
""".format(time_threshold_between_step, time_threshold_hijack, time_threshold_one_step)
# Results land in the default "answers" table; one row per matched pattern.
answer_table = run_query(q)
print('Number of answers: {:,}'.format(answer_table.num_rows))

In [None]:
# Retrieve the answer rows to the client in a pandas frame and display them.
# NOTE(review): this fetches the whole answer table; if the row count printed
# by the previous cell is large, consider displaying only data.head().
data = answer_table.get_data_pandas()
data