# Comparison with SOTA

# Grill et al.; Detecting DGA malware using NetFlow; 

from IPython.display import Image
print("For each paper, we show the implementation (snippiet of the paper) and subsequently, implement it")
<center><img src="Detecting DGA malware using NetFlow.pdf - 1.png"/></center></br>

# Build the dataset in intervals of 5 minutes (based on the paper)

In [92]:
import csv
import json
import ast
import statistics
import matplotlib.pyplot as plt
import pandas as pd
import math


def load_events_dataset(csvfile_name: str, traffic_type: str, duration: int) -> (list, list):
    all_nxd_iarrivals = {}
    dns_ip_ratio = []
    with open(csvfile_name, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        row_indx = 0
        for row in reader:
            row_indx += 1

            if row[2] == '8.8.8.8':
                continue

            if traffic_type == "CTU": # benign
                start_col = 3
                family = 'benign'
                if len(row) <= 3:
                    continue
            elif traffic_type == "DGA":
                start_col = 4
                family = row[0]
                
                if len(row) <= 4:
                    continue    

            curr_nxd_iarrivals = []
            
            first_flag = 0
            first_tmstamp = -1
            collect = 1
            first_uniq_ips = 0
            for col_indx in range(start_col, len(row)):
                curr_col = row[col_indx]
                features = "\"" + curr_col + "\""
                features = features.replace("Decimal('", "")
                features = features.replace("')", "")
                try:
                    features = json.loads(features)
                    features = ast.literal_eval(features)
                    tmstamp = features['tmstamp']
                    if tmstamp == 0:
                        collect = 0
                        break
                    if first_flag == 0:
                        first_tmstamp = tmstamp
                        first_uniq_ips = features['uniq_pkt_cntctd_all']
                        first_flag = 1
                        
                    if tmstamp - first_tmstamp > duration*60:
                        break

                except Exception as e:
                    print(e)
                    print("\t\t", row_indx, col_indx, csvfile_name )
            
            
            if collect == 1:
                uniq_ips = features['uniq_pkt_cntctd_all'] - first_uniq_ips
                if traffic_type == "CTU":
                    dns_reqs = col_indx - 2
                else:
                    dns_reqs = col_indx - 3
                dns_ip_ratio.append([dns_reqs, uniq_ips])
                # print(row_indx, dns_reqs, uniq_ips)

    return dns_ip_ratio


# '''

#     print(dns_ip_ratio)
#     all_nxd_iarrivals = all_nxd_iarrivals['benign']
#     x = "       CTU-" + str(i)
#     median = statistics.median(all_nxd_iarrivals)
#     mean = statistics.mean(all_nxd_iarrivals)
#     data_ctu[x] = {"Mean": mean, "Median": median}
    
#     print(ctu_rel_path)
#     print("\t", statistics.median(all_nxd_iarrivals), statistics.mean(all_nxd_iarrivals))
# '''

# Load datasets CTU (normal) and DGA

In [86]:
X = [] 
y1 = []
y2 = []

durations = [1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5]
ctu_all_dns_ip_durations = []
dga_all_dns_ip_durations = []

for duration in durations:
    print("T = ", duration)
    ctu_all_dns_ip_ratio = []
    for i in range(42, 55): # 42, 55 !!! 
        ctu_rel_path = "Dataset\\dataset_DNSreqs\\CTU_DNSreqs\\data_no_infection_dns-" + str(i) + "_DNSreq_ft.csv"
        dns_ip_ratio = load_events_dataset(ctu_rel_path, traffic_type="CTU", duration=duration)
        ctu_all_dns_ip_durations.append(dns_ip_ratio)
        # ctu_all_dns_ip_ratio += dns_ip_ratio

    '''
        ctu_all_dns_ip_ratio is a list of lists. Each sublist corresponds to an endpoint, where the first element is DNS requests
        and the second element is the number of unique IP addresses contacted.

    '''

    dga_all_dns_ip_ratio = []
    dga_rel_path =  "Dataset\\dataset_DNSreqs\\DGA_DNSreqs\\trans_icc_DNSreq_ft.csv"
    dga_all_dns_ip_ratio = load_events_dataset(dga_rel_path, traffic_type="DGA", duration=duration)
    dga_all_dns_ip_durations.append(dga_all_dns_ip_ratio)


T =  1
T =  1.5
T =  2
T =  2.5
T =  3
T =  3.5
T =  4
T =  4.5
T =  5


In [90]:
if 'dga_all_dns_ip_durations' not in locals():
  # myVar exists.
    dga_all_dns_ip_durations = json.load(open("netflow_dataset_t1_to_t5.json"))

# Distribution of CTU (normal traffic)
<center><img src="Detecting DGA malware using NetFlow.pdf - 2.png"/></center> </br> <hr> <hr> <hr>
<center><img src="Detecting DGA malware using NetFlow.pdf - 3.png"/></center>

In [100]:
import math
import statistics
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

def anomaly_detector(x, mean, std_dev):
    if x <= mean + t1*std_dev:
        return 0
    elif x < mean + t2*std_dev and x > mean + t1*std_dev:
        return (x - (mean + t1*std_dev)) / ((t2 - t1)*std_dev)
    elif x >= mean + t2*std_dev:
        return 1

for duration_indx in range(len(durations)):
    
    ctu_test_accs = []   
    dga_accs = []
    for i in range(10):
        # X = ctu_all_dns_ip_ratio
        X = ctu_all_dns_ip_durations[duration_indx]
        y = [0]*len(X)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # train (get the mean and std_dev) on 80% of the traffic
        ctu_ratios = []
        for endpoint in X_train:
            x = endpoint[0]/(endpoint[1]+1)
            ctu_ratios.append(x)
        mean = statistics.mean(ctu_ratios)
        std_dev = statistics.stdev(ctu_ratios)
        # print("Mean = %f\nStd_dev = %f" %(mean, std_dev))

        # acc = 0
        # testing on CTU
        # for endpoint in X_test:
        #     x = endpoint[0]/(endpoint[1]+1)
        #     if anomaly_detector(x, mean, std_dev) <= 0.5:
        #         acc += 1

        # ctu_test_accs.append(tp/len(X_test))

        # testing on DGAs
        acc = 0
        for endpoint in dga_all_dns_ip_durations[duration_indx]:
            x = endpoint[0]/(endpoint[1]+1)
            if anomaly_detector(x, mean, std_dev) > 0.5:
                acc += 1
        
        # print(acc/len(dga_all_dns_ip_durations[duration_indx]))
        dga_accs.append(acc/len(dga_all_dns_ip_durations[duration_indx]))




    print("At duration = %f, average accuracy on DGAs 10 iterations = %f" %(durations[duration_indx],
                                                                            sum(dga_accs)/len(dga_accs)))

At duration = 1.000000, average accuracy on DGAs 10 iterations = 0.569579
At duration = 1.500000, average accuracy on DGAs 10 iterations = 0.875274
At duration = 2.000000, average accuracy on DGAs 10 iterations = 0.915179
At duration = 2.500000, average accuracy on DGAs 10 iterations = 0.901714
At duration = 3.000000, average accuracy on DGAs 10 iterations = 0.900115
At duration = 3.500000, average accuracy on DGAs 10 iterations = 0.889663
At duration = 4.000000, average accuracy on DGAs 10 iterations = 0.889020
At duration = 4.500000, average accuracy on DGAs 10 iterations = 0.897810
At duration = 5.000000, average accuracy on DGAs 10 iterations = 0.937107
