In [20]:
from glob import glob
import sys,os,os.path
import linecache
import re
import shutil
import subprocess
import sys
import time

import dask.dataframe as dd
from dask.distributed import Client
import numpy as np
import pandas as pd
import yaml

In [21]:
# Environment setup for the ENSIGN toolchain.
#
# Everything below is derived from two install roots so that moving the
# installation only requires editing these constants.
ENSIGN_BASE = '/root/ashish/reservoir_lab/ENSIGN-42'
ENSIGN_HOME = f'{ENSIGN_BASE}/ENSIGN_4.2'
ANACONDA_HOME = '/root/anaconda3/'

# Remove any proxy settings inherited by the kernel.
for proxy_var in ('https_proxy', 'http_proxy'):
    os.environ.pop(proxy_var, None)

os.environ['ENSIGN_BASE'] = ENSIGN_BASE
os.environ['ANACONDA_HOME'] = ANACONDA_HOME
# NOTE(review): PYTHONPATH is read at interpreter start-up, so setting it here
# only affects child processes, not the import path of this running kernel.
# Confirm that the `ensign` package is importable by other means.
os.environ['PYTHONPATH'] = f'{ENSIGN_HOME}/Ensign-Py3'

# Guard against duplicates so re-running this cell does not keep growing sys.path.
for extra_path in (f'{ENSIGN_HOME}/bin',
                   f'{ENSIGN_HOME}/Ensign-Py3/bin',
                   f'{ANACONDA_HOME}bin/'):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

# Assign through os.environ rather than os.putenv(): the assignment still calls
# putenv() under the hood, but also keeps os.environ in sync, so the value is
# visible to later cells and to subprocesses launched with an explicit env copy.
os.environ['LD_LIBRARY_PATH'] = f'{ENSIGN_HOME}/Ensign-CAPI/lib'

#%env

In [22]:
# Import ENSIGN Package for Tensor Decomposition
import ensign.cp_decomp as cpd

# Import ENSIGN Package to Convert csv Input Data to Tensor
import ensign.csv2tensor as c2t

# Import ENSIGN Package for Report Generation
import ensign.report as report


import ensign.sptensor as spt

# Import ENSIGN Package for Visual Report Generation
import ensign.visualize as viz

#Import Anomaly Detection Modules
from ensign.cyber_detectors.beaconing_detector import beacon_scores_from_decomp_mem, decomp_has_beacon_mem
from ensign.cyber_detectors.network_mapping_detector import decomp_has_netmap_mem
from ensign.cyber_detectors.portscan_detector import decomp_has_portscan

In [23]:
# Load the workflow configuration (YAML) into `cfg`, then make sure the
# output directory named by cfg['save_dir'] exists.
config_path = '/root/ashish/reservoir_lab/ENSIGN-42/workflow_cfg.yml'
with open(config_path, 'r') as config_file:
    cfg = yaml.safe_load(config_file)

os.makedirs(cfg['save_dir'], exist_ok=True)

In [24]:
# Build tensor
#
# If tensor_data.txt already exists in the save dir, this step is skipped and
# `tensor` stays None (the decomposition cell then reads it from disk).
# Otherwise the input CSV / Bro logs are loaded as Dask dataframes and handed
# to csv2tensor. The Dask client and scheduler are shut down even when reading
# or tensor construction fails, so a failed run does not leave them running.
tensor = None
if os.path.exists(os.path.join(cfg['save_dir'], 'tensor_data.txt')):
    print('Tensor detected in save dir: {}, skipping csv2tensor.'.format(cfg['save_dir']))
else:
    print('Converting CSV to tensor ...')

    # Expand the whitespace-separated glob patterns before starting Dask so a
    # bad pattern fails fast instead of after the scheduler has been launched.
    input_files = [filename
                   for pattern in cfg['input_file'].split()
                   for filename in glob(pattern)]
    if not input_files:
        raise FileNotFoundError(
            'No input files matched cfg[\'input_file\']: {!r}'.format(cfg['input_file']))

    print('Setting up Dask client ...')
    dask_script_dir = '{}/ENSIGN_4.2/Ensign-Py3/ensign/ensign_dask'.format(os.environ['ENSIGN_BASE'])
    start_dask = '{}/start_dask.sh'.format(dask_script_dir)
    stop_dask = '{}/stop_dask.sh'.format(dask_script_dir)
    # check=True: surface a failed start script here rather than as a
    # connection error from Client() below (stop script is checked as well).
    subprocess.run(['bash', start_dask, str(cfg['num_threads'])], check=True)

    client = None
    try:
        client = Client('127.0.0.1:8786')

        dfs = []
        for filename in input_files:
            print(f'  Reading in {filename} ...')
            # os.path.join returns `filename` unchanged when it is absolute;
            # a relative match is resolved against save_dir.
            # NOTE(review): glob() resolved the pattern against the current
            # working directory, so relative patterns only work when that is
            # also save_dir - confirm intended behavior.
            csv_path = os.path.join(cfg['save_dir'], filename)
            if cfg['bro_log']:
                # Column names come from the 7th line of the log, minus its
                # first token (presumably the '#fields' marker - confirm).
                col_names = linecache.getline(csv_path, 7).split()[1:]
                dfs.append(dd.read_csv(csv_path, sep='\t', skiprows=8, header=0,
                                       names=col_names, usecols=cfg['columns']))
            else:
                dfs.append(dd.read_csv(csv_path, usecols=cfg['columns']))

        start = time.time()
        tensor = c2t.df2tensor(
            dfs,
            dask_client=client,
            columns=cfg['columns'],
            types=cfg['types'],
            binning=cfg['binning']
        )
        end = time.time()
        print('  csv2tensor took {} seconds.'.format(end - start))
    finally:
        print('Closing dask ...')
        if client is not None:
            client.close()
        subprocess.check_output(['bash', stop_dask])

    if cfg['dump_tensor_files']:
        print('  Writing tensor files to disk ...')
        start = time.time()
        spt.write_sptensor(cfg['save_dir'], tensor)
        end = time.time()
        print(f'    tensor dump took: {end - start} seconds')

Converting CSV to tensor ...
Setting up Dask client ...
  Reading in /nvme_data14/CICDataset/CICDDoS2019/Dataset/csv/01-12-Backup/DrDoS_NetBIOS.csv ...
  Reading in /nvme_data14/CICDataset/CICDDoS2019/Dataset/csv/01-12-Backup/DrDoS_MSSQL.csv ...
  Reading in /nvme_data14/CICDataset/CICDDoS2019/Dataset/csv/01-12-Backup/DrDoS_LDAP.csv ...
  Reading in /nvme_data14/CICDataset/CICDDoS2019/Dataset/csv/01-12-Backup/DrDoS_DNS.csv ...
  Reading in /nvme_data15/CICDataset/DrDoS_NTP.csv ...
  Reading in /nvme_data15/CICDataset/DrDoS_SNMP.csv ...
  Reading in /nvme_data15/CICDataset/DrDoS_SSDP.csv ...
  Reading in /nvme_data15/CICDataset/DrDoS_UDP.csv ...
  Reading in /nvme_data15/CICDataset/Syn.csv ...
  Reading in /nvme_data15/CICDataset/TFTP.csv ...
  Reading in /nvme_data15/CICDataset/UDPLag.csv ...
Validating ...
Filtering ...
Casting column types ...
Binning ...
  Binning mode 0 (second)
  Binning mode 1 (none)
  Binning mode 2 (none)
  Binning mode 3 (none)
  Binning mode 4 (log10)
Fusing 



  csv2tensor took 246.41413187980652 seconds.
Closing dask ...
  Writing tensor files to disk ...
  Writing ' Timestamp' to /root/ashish/reservoir_lab/ENSIGN-42/output/map_mode_0.txt (size=19031)
  Writing ' Source IP' to /root/ashish/reservoir_lab/ENSIGN-42/output/map_mode_1.txt (size=550)
  Writing ' Destination IP' to /root/ashish/reservoir_lab/ENSIGN-42/output/map_mode_2.txt (size=598)
  Writing ' Destination Port' to /root/ashish/reservoir_lab/ENSIGN-42/output/map_mode_3.txt (size=65535)
  Writing 'Flow Bytes/s' to /root/ashish/reservoir_lab/ENSIGN-42/output/map_mode_4.txt (size=10)
  Writing 43730652 nonzeroes to /root/ashish/reservoir_lab/ENSIGN-42/output/tensor_data.txt
    tensor dump took: 203.21294617652893 seconds


In [25]:
# CANDECOMP/PARAFAC (CP) decomposition
#
# A decomposition already present in the save dir (detected via weights.txt)
# is loaded from disk; otherwise CP-APR is run on the tensor and the result is
# optionally written back to the save dir.
weights_path = os.path.join(cfg['save_dir'], 'weights.txt')
if not os.path.exists(weights_path):
    print('Decomposing tensor w/ CP APR ...')

    # A thread count below 1 means "do not pin": drop any OMP_NUM_THREADS
    # override; otherwise export the configured count.
    if cfg['num_threads'] < 1:
        os.environ.pop('OMP_NUM_THREADS', None)
    else:
        os.environ['OMP_NUM_THREADS'] = str(cfg['num_threads'])

    # The build-tensor cell leaves `tensor` as None when it was skipped.
    if tensor is None:
        tensor = spt.read_sptensor(cfg['save_dir'])

    apr_options = dict(
        max_outer_iter=cfg['max_outer_iter'],
        max_inner_iter=cfg['max_inner_iter'],
        mem_limit_gb=cfg['mem_limit_gb'],
    )
    start = time.time()
    decomp = cpd.cp_apr(tensor, cfg['rank'], **apr_options)
    end = time.time()
    print('  decomposition took {} seconds.'.format(end - start))

    if cfg['dump_decomposition_files']:
        print('  Writing decomposition files to disk ...')
        start = time.time()
        cpd.write_cp_decomp_dir(cfg['save_dir'], decomp, write_tensor=False)
        end = time.time()
        print(f'    decomp dump took: {end - start} seconds')
else:
    print('weights.txt detected in save dir: {}, reading in decomposition ...'.format(cfg['save_dir']))
    decomp = cpd.read_cp_decomp_dir(cfg['save_dir'])

Decomposing tensor w/ CP APR ...
  decomposition took 98.56361365318298 seconds.
  Writing decomposition files to disk ...
    decomp dump took: 1.9467532634735107 seconds


In [26]:
 # Generate Results Visualization
# Skipped when viz.pdf already exists in the save dir. Otherwise: write a
# temporary metadata file, run the ENSIGN visualizer, merge the per-component
# PNGs into viz.pdf with ImageMagick `convert`, then delete the temporaries.
viz_pdf = os.path.join(cfg['save_dir'], 'viz.pdf')
if os.path.exists(viz_pdf):
    print('Visualization file detected in save dir: {}. Skipping visualization.'.format(cfg['save_dir']))
else:
    print('Starting visualization ...')
    start = time.time()

    # presumably read back by viz.visualize - confirm against ENSIGN docs
    metadata_path = os.path.join(cfg['save_dir'], 'viz_beta_metadata.yml')
    with open(metadata_path, 'w') as f:
        yaml.dump({key: cfg[key] for key in ('time_mode', 'port_mode', 'use_detectors')}, f)

    viz.visualize(decomp, cfg['save_dir'])

    # One PNG per component; the names are relative, so they are resolved
    # against the current working directory.
    component_pngs = ['{}_comp_{}.png'.format(i, i) for i in range(decomp.rank)]
    convert_args = ['convert', '-delay', '10', *component_pngs, viz_pdf]
    print(subprocess.check_output(convert_args).decode('utf-8'))

    for png_path in component_pngs:
        os.remove(png_path)
    os.remove(metadata_path)

    end = time.time()
    print('  visualization took {} seconds.'.format(end - start))

Starting visualization ...

  visualization took 93.31784105300903 seconds.


In [27]:
# Textual report generation
#
# Skipped when report.txt already exists in the save dir. Otherwise every
# domain_map_<N>.tsv file in the save dir is loaded (keyed by the integer N)
# and passed to the ENSIGN report generator together with the decomposition.
report_path = os.path.join(cfg['save_dir'], 'report.txt')
if os.path.exists(report_path):
    print('Report file detected in save dir: {}. Skipping report generation.'.format(cfg['save_dir']))
else:
    print('Generating textual report ...')
    start = time.time()

    # Escaped dot + capture group: only real "domain_map_<digits>.tsv" names
    # match, and the index comes from the match instead of string slicing.
    domain_map_pattern = re.compile(r'domain_map_([0-9]+)\.tsv')
    domain_maps = {}
    for fn in sorted(os.listdir(cfg['save_dir'])):
        match = domain_map_pattern.fullmatch(fn)
        if match is not None:
            domain_maps[int(match.group(1))] = report.read_domain_map(os.path.join(cfg['save_dir'], fn))

    # A configured mode of -1 means "not present" and is passed on as None.
    time_mode = None if cfg['time_mode'] == -1 else cfg['time_mode']
    port_mode = None if cfg['port_mode'] == -1 else cfg['port_mode']

    # Generate the text BEFORE opening the output file: if generation fails,
    # no empty report.txt is left behind to make the guard above skip the
    # step on the next run.
    text_report = report.generate_report(decomp, domain_maps, time_mode, port_mode, cfg['use_detectors'])
    with open(report_path, 'w') as f:
        f.write(text_report)

    end = time.time()
    print('  report generation took {} seconds.'.format(end - start))



Generating textual report ...
  report generation took 13.544945240020752 seconds.


In [28]:
# Apply Detectors
#
# Runs the portscan, network-mapping and beaconing detectors on the
# decomposition and writes their findings to <save_dir>/detector_log.
# Skipped when detectors are disabled in the config or the log already exists.
if cfg['use_detectors']:
    detector_log_path = os.path.join(cfg['save_dir'], 'detector_log')
    if os.path.exists(detector_log_path):
        print('Detector log file detected in save dir: {}. Skipping detectors.'.format(cfg['save_dir']))
    else:
        print('Starting detectors ...')
        start = time.time()

        # Results are buffered and written in one go at the end, so a detector
        # that raises leaves neither an open file handle nor a partial
        # detector_log that would make the guard above skip the next run.
        log_lines = []

        print('Running portscan detector ...')
        scans = decomp_has_portscan(decomp)
        log_lines.append("PORT SCAN RESULTS\n")
        log_lines.append("There are {} possible portscans\n\n".format(len(scans)))
        for comp, actor, machine in scans:
            log_lines.append("Component {} likely contains a port scan\n".format(comp))
            log_lines.append("Bad Actor: {}\n".format(actor))
            log_lines.append("Affected Machine: {}\n\n".format(machine))

        print('Running network mapping detector ...')
        log_lines.append("NETWORK MAPPING RESULTS\n")
        netmaps = decomp_has_netmap_mem(decomp.factors, decomp.labels)
        if len(netmaps) == 0:
            log_lines.append('No suspected netmaps\n\n')
        for comp_id, bad_actor, attacked_port in netmaps:
            log_lines.append('Component {} likely contains a network mapping\nBad actor: {}\nAttacked Port: {}\n\n'.format(comp_id, bad_actor, attacked_port))

        print('Running beacon detector ...')
        log_lines.append("BEACON RESULTS\n")
        beacons = decomp_has_beacon_mem(decomp.factors)
        log_lines.append("The following components display beaconing behavior: {}\n\n".format(beacons))

        log_lines.append("BEACON SCORES\n")
        beacon_scores = beacon_scores_from_decomp_mem(decomp.factors)
        log_lines.append("Beacon scores for all components: {}\n".format(beacon_scores))

        with open(detector_log_path, 'w') as log:
            log.writelines(log_lines)

        end = time.time()
        print('  detection took {} seconds.'.format(end - start))

Starting detectors ...
Running portscan detector ...
Running network mapping detector ...
Running beacon detector ...
  detection took 1.4661805629730225 seconds.
