In [None]:
import pickle
import random
from datetime import datetime
import math
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from pathlib import Path

In [None]:
def loadfile(path, filename):
    """Load and return the object pickled at ``path + filename + ".pickle"``.

    Parameters
    ----------
    path : str
        Directory prefix. It is concatenated as-is, so it must already end
        with a path separator.
    filename : str
        File name without the ``.pickle`` extension.

    Returns
    -------
    object
        Whatever object was stored in the pickle file.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. ``FileNotFoundError``).
    """
    # SECURITY: unpickling can execute arbitrary code -- only load trusted files.
    # The context manager guarantees the handle is closed, even when
    # unpickling fails; previously the handle was never closed explicitly.
    with open(path + filename + ".pickle", "rb") as pickle_in:
        return pickle.load(pickle_in)

# Base directory holding the pickled inputs. Figures are written to its
# "figures" sub-directory, which is created here if it does not exist yet.
path = "C:/Users/Akarsh/Downloads/DP_scripts/store_emb/"
Path(path + "figures/").mkdir(parents=True, exist_ok=True)

# Suffix that selects which pair of pickles to load and tags every saved figure.
main_str = str(437)

# Generated ("fake") flows.
filename = "final_output_" + main_str
final_output = loadfile(path, filename)
display(final_output.head())

# Original flows, with the attackType column removed.
filename = "updated_train_" + main_str
updated_train = loadfile(path, filename).drop(['attackType'], axis=1)
display(updated_train.head())

print('original flows shape', updated_train.shape)
print('fake output shape', final_output.shape)
# DataFrame.info() prints its report and returns None, so it is called
# directly; wrapping it in display() only rendered an extra "None".
final_output.info()

In [None]:
def plot_violin_single(data, label_str, label_title):
    """Draw a single violin plot of ``data``, save it as a PNG and show it.

    Parameters
    ----------
    data
        Values handed straight to ``Axes.violinplot``.
    label_str : str
        Tick label placed under the violin.
    label_title : str
        Plot title; also the stem of the saved file name, which is suffixed
        with ``_`` and the module-level ``main_str``.
    """
    figure, axes = plt.subplots()
    axes.violinplot(data, showmeans=True, showmedians=True)

    axes.set(title=label_title, xlabel='x-axis', ylabel='y-axis')
    # One violin only, so a single tick at position 1 carries the label.
    axes.set_xticks([1])
    axes.set_xticklabels([label_str])
    axes.yaxis.grid(True)

    target = ('C:/Users/Akarsh/Downloads/DP_scripts/store_emb/figures/'
              + label_title + '_' + main_str + '.png')
    plt.savefig(target, bbox_inches='tight')
    plt.show()

In [None]:
def plot_freq(data, title_):
    """Bar-chart how often each value occurs in ``data`` and save it as a PNG.

    Parameters
    ----------
    data
        Object exposing ``value_counts()`` (the callers pass DataFrame columns).
    title_ : str
        Used for the x-axis label, the figure title (``"<title_> Frequency"``)
        and the stem of the saved file name, which is suffixed with
        ``_Frequency_`` and the module-level ``main_str``.
    """
    figure = plt.figure(figsize=(10, 3))
    figure.suptitle(title_ + " Frequency", fontsize=14)
    figure.subplots_adjust(top=0.85, wspace=0.3)

    axes = figure.add_subplot(1, 1, 1)
    axes.set_xlabel(title_)
    axes.set_ylabel("Frequency")
    axes.tick_params(axis='both', which='major', labelsize=8.5)

    # One bar per distinct value, in the order value_counts() returns them.
    counts = data.value_counts()
    categories = list(counts.index)
    heights = list(counts.values)
    axes.bar(categories, heights, color='steelblue',
             edgecolor='black', linewidth=1)

    target = ('C:/Users/Akarsh/Downloads/DP_scripts/store_emb/figures/'
              + title_ + "_Frequency" + '_' + main_str + '.png')
    figure.savefig(target, bbox_inches='tight')

# Frequency plots of the categorical columns, original flows first and
# generated flows second for each column.
for tag, flows in (("Real", updated_train), ("Fake", final_output)):
    plot_freq(flows['Proto'], tag + " Protocol")

for tag, flows in (("Real", updated_train), ("Fake", final_output)):
    plot_freq(flows['Flags'], tag + " Flags")

# Distinct flag values on each side, followed by how many there are.
real_flag_values = updated_train['Flags'].unique()
print('Real flag length:', real_flag_values, len(real_flag_values))
fake_flag_values = final_output['Flags'].unique()
print('Fake flag length:', fake_flag_values, len(fake_flag_values), '\n')

for tag, flows in (("Real", updated_train), ("Fake", final_output)):
    plot_freq(flows['class'], tag + " Class")

In [None]:
def _numeric_where_possible(column):
    """Return ``column`` converted by ``pd.to_numeric``, or unchanged if it cannot be parsed.

    Explicit replacement for ``pd.to_numeric(column, errors='ignore')``,
    which pandas has deprecated in favour of catching the exception.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Histograms of the original flows.
updated_train.hist(bins=15, color='steelblue', edgecolor='black', linewidth=1.0,
           xlabelsize=8, ylabelsize=8, grid=False)
plt.tight_layout(rect=(0, 0, 1.2, 1.2))
plt.savefig('C:/Users/Akarsh/Downloads/DP_scripts/store_emb/figures/' + 'real_hist_' + main_str +'.png', bbox_inches='tight')

# Histograms of the generated flows; each column is converted to a numeric
# dtype first when all of its values can be parsed.
final_output.apply(_numeric_where_possible).hist(bins=15, color='steelblue', edgecolor='black', linewidth=1.0,
           xlabelsize=8, ylabelsize=8, grid=False)
plt.tight_layout(rect=(0, 0, 1.2, 1.2))
plt.savefig('C:/Users/Akarsh/Downloads/DP_scripts/store_emb/figures/' + 'fake_hist_' + main_str +'.png', bbox_inches='tight')

In [None]:
# Switch for the (slow) per-attribute violin plots below.
plot_dist = True

if plot_dist:
    def int_encode(data):
        """Return ``data`` mapped to integer labels by a freshly fitted LabelEncoder."""
        return LabelEncoder().fit_transform(data)

    # (tick label / file-name prefix, flows to plot) -- always real, then fake.
    flow_sets = (('Real', updated_train), ('Fake', final_output))

    # Day is plotted from the raw column values.
    for tag, flows in flow_sets:
        plot_violin_single(flows['Day'].tolist(), tag, tag + '_' + 'Day')

    # Ports: the real column is cast to int, the generated one to float.
    for port_col in ('Src Pt', 'Dst Pt'):
        plot_violin_single(updated_train[port_col].astype(int).tolist(), 'Real', 'Real_' + port_col)
        plot_violin_single(final_output[port_col].astype(float).tolist(), 'Fake', 'Fake_' + port_col)

    # Categorical columns are integer-encoded before plotting.
    # Each pair is (column name, name used in the title / file name).
    encoded_columns = (
        ('Src IP Addr', 'Src IP Addr'),
        ('Dst IP Addr', 'Dst IP Addr'),
        ('Proto', 'Protocol'),
        ('Flags', 'Flags'),
        ('class', 'Class'),
    )
    for column, shown_as in encoded_columns:
        for tag, flows in flow_sets:
            plot_violin_single(list(int_encode(flows[column])), tag, tag + '_' + shown_as)

    # Bytes: keep only flows of at least 1024 * 1024 bytes.
    byte_floor = 1024 * 1024
    updated_real_mb_flows = [
        size for size in updated_train['Bytes'].astype(int).values if size >= byte_floor
    ]
    print(len(updated_real_mb_flows))
    plot_violin_single(updated_real_mb_flows, 'Real', 'Real_' + 'Bytes')

    updated_fake_mb_flows = [
        size for size in final_output['Bytes'].astype(int).values if size >= byte_floor
    ]
    print(len(updated_fake_mb_flows))
    plot_violin_single(updated_fake_mb_flows, 'Fake', 'Fake_' + 'Bytes')

    # Packets: keep only flows with at least 100 packets.
    packet_floor = 100
    updated_real_pkts = [
        n_pkts for n_pkts in updated_train['Packets'].astype(int).values if n_pkts >= packet_floor
    ]
    print(len(updated_real_pkts))
    plot_violin_single(updated_real_pkts, 'Real', 'Real_' + 'Packets')

    updated_fake_pkts = [
        n_pkts for n_pkts in final_output['Packets'].astype(int).values if n_pkts >= packet_floor
    ]
    print(len(updated_fake_pkts))
    plot_violin_single(updated_fake_pkts, 'Fake', 'Fake_' + 'Packets')

In [None]:
# Positional index of each attribute inside a flow row.
# Check these ids against the column order of the loaded files.
(id_day, id_time, id_dur, id_proto,
 id_srcIP, id_srcPt, id_dstIP, id_dstPt,
 id_packets, id_bytzes, id_flags, id_clazz) = range(12)

# Test 1: UDP should have no TCP flags
succeeded_flags_and_udp = 0
failed_flags_and_udp = 0


def checkFlagsAndUDP(flags, proto):
    """Update the Test 1 counters for one flow.

    Only flows whose ``proto`` equals ``"UDP"`` are counted: an empty
    ``flags`` value is a success, anything else a failure.
    """
    global succeeded_flags_and_udp, failed_flags_and_udp
    if proto != "UDP":
        return
    if len(flags) > 0:
        failed_flags_and_udp += 1
    else:
        succeeded_flags_and_udp += 1


# Test 2: At least one IP should be internal
succeeded_one_ip_intern = 0
failed_one_ip_intern = 0


def checkOneIPIntern(srcIP, dstIP):
    """Update the Test 2 counters for one flow.

    A flow succeeds when either address starts with ``192.168``, the source
    is ``0.0.0.0`` or the destination is ``255.255.255.255``; every other
    flow is counted as a failure.
    """
    global succeeded_one_ip_intern, failed_one_ip_intern
    has_internal_endpoint = (
        srcIP[:7] == "192.168"
        or dstIP[:7] == "192.168"
        or srcIP == "0.0.0.0"
        or dstIP == "255.255.255.255"
    )
    if not has_internal_endpoint:
        failed_one_ip_intern += 1
        return
    succeeded_one_ip_intern += 1


# Test 3: Port 80 and 443 only TCP
succeeded_tcp_80 = 0
failed_tcp_80 = 0


def checkPort80TCP(proto, srcPt, dstPt, clazz):
    """Update the Test 3 counters for one flow.

    Only flows with source or destination port ``"80"`` or ``"443"`` (compared
    as strings) are considered. TCP flows succeed; non-TCP flows fail only
    when ``clazz`` is ``"normal"`` and are otherwise not counted.
    """
    global succeeded_tcp_80, failed_tcp_80
    web_ports = ("80", "443")
    if srcPt not in web_ports and dstPt not in web_ports:
        return
    if proto == "TCP":
        succeeded_tcp_80 += 1
    elif clazz == "normal":
        failed_tcp_80 += 1


# Test 4: Port 53 only UDP
succeeded_udp_53 = 0
failed_udp_53 = 0


def checkPort53UDP(proto, srcPt, dstPt, clazz):
    """Update the Test 4 counters for one flow.

    Only flows with source or destination port ``"53"`` (compared as a
    string) are considered. UDP flows succeed; non-UDP flows fail only when
    ``clazz`` is ``"normal"`` and are otherwise not counted.
    """
    global succeeded_udp_53, failed_udp_53
    if "53" not in (srcPt, dstPt):
        return
    if proto == "UDP":
        succeeded_udp_53 += 1
    elif clazz == "normal":
        failed_udp_53 += 1


# Test 5: If a multicast/broadcast address appears in a flow, it must be the DST-IP
succeeded_multicast = 0
failed_multicast = 0


def _is_multi_or_broadcast(first_octet, last_octet):
    """Return True for a first octet above 223, or for a 192.x.x.255 address."""
    return first_octet > 223 or (first_octet == 192 and last_octet == 255)


def checkMultiBroadcast(srcIP, dstIP, row):
    """Update the Test 5 counters for one flow.

    An address is treated as multicast/broadcast when its first octet is
    greater than 223, or when it has the form ``192.x.x.255``.

    * destination is such an address and the source is not -> success
    * source is such an address                             -> failure
    * neither address is such an address                    -> not counted

    Parameters
    ----------
    srcIP, dstIP : str
        Dotted-quad IPv4 addresses.
    row
        Unused; kept so existing callers keep working.

    Raises
    ------
    ValueError, IndexError
        If an address is not made of four dot-separated integers.
    """
    global succeeded_multicast, failed_multicast
    src_octets = srcIP.split(".")
    dst_octets = dstIP.split(".")

    # The broadcast test on the source must look at the FIRST octet (== 192)
    # and the LAST octet (== 255). The earlier code compared the last octet
    # against both values, which can never be true, so a broadcast source was
    # never reported as a failure.
    src_special = _is_multi_or_broadcast(int(src_octets[0]), int(src_octets[3]))
    dst_special = _is_multi_or_broadcast(int(dst_octets[0]), int(dst_octets[3]))

    if dst_special and not src_special:
        succeeded_multicast += 1
    elif src_special:
        failed_multicast += 1


# Test 6: Netbios only to internal broadcasts and from internal hosts
succeeded_netbios = 0
failed_netbios = 0


def checkNetbios(srcIP, dstIP, dstPt, proto):
    """Update the Test 6 counters for one flow.

    Only flows with destination port ``"137"`` or ``"138"`` (compared as
    strings) are considered. Such a flow succeeds when it is UDP, its source
    is ``192.168.x.x`` and its destination is ``192.x.x.255``; otherwise it
    is counted as a failure. The address octets are parsed for every flow,
    so a malformed address raises even when the port does not match.
    """
    global succeeded_netbios, failed_netbios
    src_parts = srcIP.split(".")
    src_first, src_second = int(src_parts[0]), int(src_parts[1])
    dst_parts = dstIP.split(".")
    dst_first, dst_last = int(dst_parts[0]), int(dst_parts[3])

    if dstPt not in ("137", "138"):
        return

    from_internal_host = src_first == 192 and src_second == 168
    to_internal_broadcast = dst_first == 192 and dst_last == 255
    if from_internal_host and proto == "UDP" and to_internal_broadcast:
        succeeded_netbios += 1
    else:
        failed_netbios += 1


# Test 7: Check Bytes
succeeded_byte_packet = 0
failed_byte_packet = 0


def checkRelationBytePackets(bytzes, packets, row):
    """Update the Test 7 counters for one flow.

    The flow succeeds when ``bytzes`` lies between ``packets * 42`` and
    ``packets * 65536`` (both bounds inclusive) and fails otherwise.
    ``row`` is accepted but not used.
    """
    global succeeded_byte_packet, failed_byte_packet
    lower_bound = packets * 42
    upper_bound = packets * 65536
    if lower_bound <= bytzes <= upper_bound:
        succeeded_byte_packet += 1
    else:
        failed_byte_packet += 1

In [None]:
# Run the seven domain checks on every generated flow; each check function
# updates its own pair of succeeded/failed counters.
for row in final_output.values:
    # day, time and dur are read but not used by any of the checks below.
    day   = row[id_day]
    time  = row[id_time].strip()
    srcIP = row[id_srcIP].strip()
    srcPt = row[id_srcPt]
    dstIP = row[id_dstIP].strip()
    dstPt = row[id_dstPt]
    proto = row[id_proto].strip()
    flags = row[id_flags].strip()
    # Normalise Bytes to stripped text like the other text fields, so values
    # with surrounding whitespace (or a numeric dtype) parse the same way.
    bytzes = str(row[id_bytzes]).strip()
    packets = row[id_packets]
    dur   = row[id_dur]
    clazz = row[id_clazz].strip()

    # Test 1: UDP should have no TCP flags
    # A flag field of six dots is treated as "no flags set".
    if flags == "......":
        flags = ""
    checkFlagsAndUDP(flags, proto)

    # Test 2: At least one IP should be internal
    checkOneIPIntern(srcIP, dstIP)

    # Test 3: Port 80 and 443 only TCP
    checkPort80TCP(proto, srcPt, dstPt, clazz)

    # Test 4: Port 53 only UDP
    checkPort53UDP(proto, srcPt, dstPt, clazz)

    # Test 5: Multi/Broadcast Address appears in flow then, IP must be DST-IP
    checkMultiBroadcast(srcIP, dstIP, row)

    # Test 6: Netbios only to internal broadcasts and from internal hosts
    checkNetbios(srcIP, dstIP, dstPt, proto)

    # Values carrying an "M" marker (e.g. "1.2 M") are scaled by 1024 * 1024.
    # Taking the text before the marker tolerates any spacing around the
    # number; float() ignores the remaining whitespace.
    if "M" in bytzes:
        bytzes = float(bytzes.split("M")[0]) * 1024 * 1024
    else:
        bytzes = float(bytzes)
    packets = float(packets)

    # Test 7: Check Byte/Packets Relation
    checkRelationBytePackets(bytzes, packets, row)

In [None]:
# Print Results
def _report_test(title, succeeded, failed, trailing_newline=True):
    """Print one test's summary and return its success percentage as a string.

    The percentage is ``succeeded / (succeeded + failed) * 100``; when no flow
    was counted at all, the division is skipped.
    """
    total = succeeded + failed
    share = float(succeeded)
    if total > 0:
        share /= total
    share *= 100
    percentage = str(share)

    print(title)
    pieces = [str(succeeded), " from ", str(total), " correct. (", percentage, "%)"]
    if trailing_newline:
        pieces.append('\n')
    print(*pieces)
    return percentage


# The testN_per strings are kept as module-level names because the results
# file is written from them afterwards.
test1_per = _report_test("Test 1: UDP should not have TCP flags",
                         succeeded_flags_and_udp, failed_flags_and_udp)
test2_per = _report_test("Test 2: At least one IP should be internal",
                         succeeded_one_ip_intern, failed_one_ip_intern)
test3_per = _report_test("Test 3: Port 80 and 443 only TCP ",
                         succeeded_tcp_80, failed_tcp_80)
test4_per = _report_test("Test 4: Port 53 only UDP ",
                         succeeded_udp_53, failed_udp_53)
test5_per = _report_test("Test 5: Multi- and Broadcast Addresses only as a target",
                         succeeded_multicast, failed_multicast)
test6_per = _report_test("Test 6: Netbios only to internal broadcasts and from internal hosts",
                         succeeded_netbios, failed_netbios)
# The last summary line is printed without the extra blank line.
test7_per = _report_test("Test 7: Check Byte Packet Relation",
                         succeeded_byte_packet, failed_byte_packet,
                         trailing_newline=False)

# Append the seven test summaries to the results file. Each summary is
# written as a run of space-separated fields (np.savetxt with newline=" ").
with open(path + "final_results.txt", "ab") as f:
    summaries = [
        # Only the first entry carries a line break right after its title.
        ["Test 1: UDP should not have TCP flags", '\n', str(succeeded_flags_and_udp), " from ",
         str(succeeded_flags_and_udp+failed_flags_and_udp), " correct. (", test1_per, "%)", '\n'],
        ["Test 2: At least one IP should be internal", str(succeeded_one_ip_intern), " from ",
         str(succeeded_one_ip_intern+failed_one_ip_intern), " correct. (", test2_per, "%)", '\n'],
        ["Test 3: Port 80 and 443 only TCP ", str(succeeded_tcp_80), " from ",
         str(succeeded_tcp_80+failed_tcp_80), " correct. (", test3_per, "%)", '\n'],
        ["Test 4: Port 53 only UDP ", str(succeeded_udp_53), " from ",
         str(succeeded_udp_53+failed_udp_53), " correct. (", test4_per, "%)", '\n'],
        ["Test 5: Multi- and Broadcast Addresses only as a target", str(succeeded_multicast), " from ",
         str(succeeded_multicast+failed_multicast), " correct. (", test5_per, "%)", '\n'],
        ["Test 6: Netbios only to internal broadcasts and from internal hosts", str(succeeded_netbios), " from ",
         str(succeeded_netbios+failed_netbios), " correct. (", test6_per, "%)", '\n'],
        ["Test 7: Check Byte Packet Relation", str(succeeded_byte_packet), " from ",
         str(succeeded_byte_packet+failed_byte_packet), " correct. (", test7_per, "%)", '\n'],
    ]
    for fields in summaries:
        np.savetxt(f, fields, fmt='%s', newline=" ")

In [None]:
# Number of leading columns that are compared between the two flow sets.
length_attributes = 12

# One value -> occurrence-count dict per attribute, for the generated and the
# original flows. The flow totals stay at -1.0 until the flows are counted.
gen_flows = [dict() for _ in range(length_attributes)]
num_gen_flows = -1.0

org_flows = [dict() for _ in range(length_attributes)]
num_org_flows = -1.0

# generated flows: count how often each value occurs, per attribute
counter = 0
for counter, row in enumerate(final_output.values, start=1):
    if counter % 500000 == 0:
        print("Reading generated Flows: ", str(counter))

    for col_idx in range(0, length_attributes):
        val = str(row[col_idx]).strip()
        # Duration (column 2) is written with at least three decimal places.
        if col_idx == 2:
            if "." in val:
                decimals = len(val.split(".")[1])
                # A non-positive repeat count adds nothing, so values that
                # already have three or more decimals stay as they are.
                val = val + "0" * (3 - decimals)
            else:
                val = val + ".000"
        bucket = gen_flows[col_idx]
        bucket[val] = bucket.get(val, 0) + 1
num_gen_flows = float(counter)

In [None]:
# original flows: count how often each value occurs, per attribute
# NOTE(review): unlike the generated-flow pass, Duration is not padded to
# three decimals here -- confirm both sides use the same text form.
counter = 0
for counter, row in enumerate(updated_train.values, start=1):
    if counter % 1000000 == 0:
        print("Reading original Flows: ", str(counter))

    for col_idx in range(0, length_attributes):
        val = str(row[col_idx]).strip()
        bucket = org_flows[col_idx]
        bucket[val] = bucket.get(val, 0) + 1
num_org_flows = float(counter)

In [None]:
print("Calculating Euclidean distances...", '\n')
# For every attribute, compare the relative frequency of each value in the
# original and the generated flows, and append the Euclidean distance between
# the two frequency vectors to the results file.
with open(path + "final_results.txt", "ab") as f:
    for attr_idx in range(0, length_attributes):
        generated_counts = gen_flows[attr_idx]
        original_counts = org_flows[attr_idx]

        squared_sum = 0
        # Values seen in the original flows (frequency 0 if never generated).
        for value, org_count in original_counts.items():
            org_share = org_count / num_org_flows
            if value in generated_counts:
                gen_share = generated_counts[value] / num_gen_flows
            else:
                gen_share = 0
            gap = org_share - gen_share
            squared_sum += gap * gap

        # Values that only the generated flows contain.
        for value, gen_count in generated_counts.items():
            if value in original_counts:
                continue
            gen_share = gen_count / num_gen_flows
            squared_sum += gen_share * gen_share

        distance = math.sqrt(squared_sum)
        attribute = final_output.columns[attr_idx]

        f.write(b"\n")
        np.savetxt(f, [attribute, distance], fmt='%s')
        print("Distance in Attribute ", attribute, ":", distance)