# Automatic Detection of DGA-Enabled Malware Using SDN and Traffic Behavioral Modeling

# Loading dataset

In [4]:
import os
import csv
import ast

def read_data(csvfilename):
    fr = open(csvfilename, newline='')
    reader = csv.reader(fr, delimiter=',')
    row0 = 0
    X_udp = []
    X_http = []
    X_https = []
    for row in reader:
        if row0 == 0:
            # print(row)
            row0 = 1
            continue
        flow_id = row[1]
        flow_id = ast.literal_eval(flow_id)
        
        if flow_id[4] == 17:
            X_udp.append(row[2:])
        if flow_id[2] == 80 or flow_id[3] == 80:
            X_http.append(row[2:])
        if flow_id[2] == 443 or flow_id[3] == 443:
            X_https.append(row[2:])
    return X_udp, X_http, X_https


dgas_udp, dgas_http, dgas_https = read_data("SOTA_Comparison\dgas_flow_8_attrib.csv")


ctus = {}
for i in range (42,44):
    print(i)
    X_udp, X_http, X_https = read_data("SOTA_Comparison\CTU-" + str(i) + "_flow_8_attrib.csv")
    ctus[i] = [X_udp, X_http, X_https]
    
for i in range (45, 55):
    print(i)
    X_udp, X_http, X_https = read_data("SOTA_Comparison\CTU-" + str(i) + "_flow_8_attrib.csv")
    ctus[i] = [X_udp, X_http, X_https]
    


42
43
45
46
47
48
49
50
51
52
53
54


# Python Isolation Forest (IForest) train

In [14]:
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.utils import resample
import pandas as pd
from sklearn.metrics import *
import warnings
import pickle
import time

warnings.filterwarnings('ignore')

def train_iforest(X_train, y_train, X_test, y_test):
    clf = IsolationForest(n_estimators=10, contamination=0.01, max_features=8)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_pred[y_pred == -1] = 0
    acc = accuracy_score(y_test, y_pred)
    # print("done")
    return acc

def train_kmeans(X_train, y_train, X_test, y_test):
    # Normalize data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.fit_transform(X_test)
    kmeans = KMeans(n_clusters=5, n_init=1)
    kmeans.fit(X_train_scaled, y_train)
    y_pred = kmeans.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    return acc

def train_oc_svm(X_train, y_train, X_test, y_test):
    clf = OneClassSVM(kernel='rbf', gamma=0.125, nu=0.05)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_pred[y_pred == -1] = 0
    acc = accuracy_score(y_test, y_pred)
    # print("done")
    return acc

def save_model(model, name):
    with open(name,'wb') as f:
        pickle.dump(model,f)

ll = [42, 43, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54]
udp_accs = {'irf': [], 'kmean': [], 'training_time_irf': [], 'training_time_kmeans': []}
http_accs = {'irf': [], 'kmean': [], 'training_time_irf': [], 'training_time_kmeans': []}
https_accs = {'irf': [], 'kmean': [], 'training_time_irf': [], 'training_time_kmeans': []}

for i in ll:
    print(i)
    
    # UDP 
    ctu_udp = ctus[i][0]
    min_len = min(len(ctu_udp), len(dgas_udp))
    X = ctu_udp[:min_len] + dgas_udp[:min_len]
    y = [0]*min_len + [1]*min_len
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    # print("\t UDP, data len = ", len(X))
    start_t = time.time()
    acc_rf = train_iforest(X_train, y_train, X_test, y_test)
    udp_accs['training_time_irf'].append(time.time() - start_t)
    start_t = time.time()
    acc_kmean = train_kmeans(X_train, y_train, X_test, y_test)
    udp_accs['training_time_kmeans'].append(time.time() - start_t)
    acc_svm = 0 # train_oc_svm(X_train, y_train, X_test, y_test)
    print("\t\t UDP Accs (RF/kmean/SVM): ", acc_rf, "\t", acc_kmean, "\t", acc_svm)
    udp_accs['irf'].append(acc_rf)
    udp_accs['kmean'].append(acc_kmean)
#     save_model(acc_rf, "SOTA_comparison/saved_models/udp_iforest_" + str(i) + ".pkl")
#     save_model(acc_kmean, "SOTA_comparison/saved_models/udp_knn_" + str(i) + ".pkl")
#     save_model(acc_svm, "SOTA_comparison/saved_models/udp_svm_" + str(i) + ".pkl")
    

    
    # HTTP
    ctu_http = ctus[i][1]
    min_len = min(len(ctu_http), len(dgas_http))
    X = ctu_http[:min_len] + dgas_http[:min_len]
    y = [0]*min_len + [1]*min_len
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    # print("\t HTTP, data len = ", len(X))
    start_t = time.time()
    acc_rf = train_iforest(X_train, y_train, X_test, y_test)
    http_accs['training_time_irf'].append(time.time() - start_t)
    start_t = time.time()
    acc_kmean = train_kmeans(X_train, y_train, X_test, y_test)
    http_accs['training_time_kmeans'].append(time.time() - start_t)
    acc_svm = 0 # train_oc_svm(X_train, y_train, X_test, y_test)
    print("\t\t HTTP Accs (RF/kmean/SVM): ", acc_rf, "\t", acc_kmean, "\t", acc_svm)
    http_accs['irf'].append(acc_rf)
    http_accs['kmean'].append(acc_kmean)
#     save_model(acc_rf, "SOTA_comparison/saved_models/http_iforest_" + str(i) + ".pkl")
#     save_model(acc_kmean, "SOTA_comparison/saved_models/http_knn_" + str(i) + ".pkl")
#     save_model(acc_svm, "SOTA_comparison/saved_models/http_svm_" + str(i) + ".pkl")
    
    
    # HTTPS 
    ctu_https = ctus[i][2]
    min_len = min(len(ctu_https), len(dgas_https))
    X = ctu_https[:min_len] + dgas_https[:min_len]
    y = [0]*min_len + [1]*min_len
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    # print("\t HTTPS, data len = ", len(X))
    start_t = time.time()
    acc_rf = train_iforest(X_train, y_train, X_test, y_test)
    https_accs['training_time_irf'].append(time.time() - start_t)
    start_t = time.time()
    acc_kmean = train_kmeans(X_train, y_train, X_test, y_test)
    https_accs['training_time_kmeans'].append(time.time() - start_t)
    acc_svm = 0 # train_oc_svm(X_train, y_train, X_test, y_test)
    print("\t\t HTTPS Accs (RF/kmean/SVM): ", acc_rf, "\t", acc_kmean, "\t", acc_svm)
    https_accs['irf'].append(acc_rf)
    https_accs['kmean'].append(acc_kmean)
#     save_model(acc_rf, "SOTA_comparison/saved_models/https_iforest_" + str(i) + ".pkl")
#     save_model(acc_kmean, "SOTA_comparison/saved_models/https_knn_" + str(i) + ".pkl")
#     save_model(acc_svm, "SOTA_comparison/saved_models/https_svm_" + str(i) + ".pkl")
    
    

42
		 UDP Accs (RF/kmean/SVM):  0.5097778195121861 	 0.3454817286146447 	 0
		 HTTP Accs (RF/kmean/SVM):  0.5097728291219966 	 0.244630976593471 	 0
		 HTTPS Accs (RF/kmean/SVM):  0.506621633632566 	 0.6679406274997778 	 0
43
		 UDP Accs (RF/kmean/SVM):  0.5105775470473718 	 0.3643358270174966 	 0
		 HTTP Accs (RF/kmean/SVM):  0.51037609017891 	 0.5646523492709159 	 0
		 HTTPS Accs (RF/kmean/SVM):  0.5078659674695583 	 0.40885254644031643 	 0
45
		 UDP Accs (RF/kmean/SVM):  0.508610797420859 	 0.09583894940075043 	 0
		 HTTP Accs (RF/kmean/SVM):  0.5101175497259471 	 0.248440139267124 	 0
		 HTTPS Accs (RF/kmean/SVM):  0.5055550617722869 	 0.22460225757710425 	 0
46
		 UDP Accs (RF/kmean/SVM):  0.5095617529880478 	 0.419217248652449 	 0
		 HTTP Accs (RF/kmean/SVM):  0.5032995631564272 	 0.33841435077609444 	 0
		 HTTPS Accs (RF/kmean/SVM):  0.5037700282752121 	 0.25589066918001885 	 0
47
		 UDP Accs (RF/kmean/SVM):  0.5098596540446361 	 0.33762024643598054 	 0
		 HTTP Accs (RF/kmean/SV

In [18]:
udp_irf_acc = sum(udp_accs['irf']) / len(udp_accs['irf'])
udp_irf_time = sum(udp_accs['training_time_irf']) / len(udp_accs['training_time_irf'])

udp_kmeans_acc = sum(udp_accs['kmean']) / len(udp_accs['kmean'])
udp_kmeans_time = sum(udp_accs['training_time_kmeans']) / len(udp_accs['training_time_kmeans'])

http_irf_acc = sum(http_accs['irf']) / len(http_accs['irf'])
http_irf_time = sum(http_accs['training_time_irf']) / len(http_accs['training_time_irf'])

http_kmeans_acc = sum(http_accs['kmean']) / len(http_accs['kmean'])
http_kmeans_time = sum(http_accs['training_time_kmeans']) / len(http_accs['training_time_kmeans'])

https_irf_acc = sum(https_accs['irf']) / len(https_accs['irf'])
https_irf_time = sum(https_accs['training_time_irf']) / len(https_accs['training_time_irf'])

https_kmeans_acc = sum(https_accs['kmean']) / len(https_accs['kmean'])
https_kmeans_time = sum(https_accs['training_time_kmeans']) / len(https_accs['training_time_kmeans'])

print("UDP IRF/kmean accs = ", udp_irf_acc, udp_knn_acc, "\t\t Training times = ", udp_irf_time, udp_kmeans_time)
print("HTTP IRF/kmean accs = ", http_irf_acc, http_knn_acc,http_irf_time, "\t\t Training times = ", http_irf_time, http_kmeans_time)
print("HTTPS IRF/kmean accs = ", https_irf_acc, https_knn_acc, "\t\t Training times = ", https_irf_time, https_kmeans_time)

UDP IRF/kmean accs =  0.5091145707604188 0.4199823406057424 		 Training times =  6.319638768831889 5.579049686590831
HTTP IRF/kmean accs =  0.5086425720126847 0.31246546676536474 1.1948212186495464 		 Training times =  1.1948212186495464 1.1494445006052654
HTTPS IRF/kmean accs =  0.5062934355631924 0.4348678369344499 		 Training times =  0.26012925306955975 0.22841578722000122


In [3]:
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.utils import resample
import pandas as pd
from sklearn.metrics import *
import warnings
import pickle
warnings.filterwarnings('ignore')

def train_oc_svm(X_train, y_train, X_test, y_test):
    clf = OneClassSVM(kernel='rbf', gamma=0.125, nu=0.05, cache_size=7000)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_pred[y_pred == -1] = 0
    acc = accuracy_score(y_test, y_pred)
    return acc

ll = [42, 43, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54]
for i in ll:
    print(i)
    
    # UDP 
    ctu_udp = ctus[i][0]
    min_len = min(len(ctu_udp), len(dgas_udp))
    X = ctu_udp[:min_len] + dgas_udp[:min_len]
    y = [0]*min_len + [1]*min_len
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.55, random_state=42)
    print("\t UDP, data len = ", len(X))
    acc_svm = train_oc_svm(X_train, y_train, X_test, y_test)
    print("\t\t Accs: ", acc_svm)
    

    # HTTP
    ctu_http = ctus[i][1]
    min_len = min(len(ctu_http), len(dgas_http))
    X = ctu_http[:min_len] + dgas_http[:min_len]
    y = [0]*min_len + [1]*min_len
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.55, random_state=42)
    print("\t HTTP, data len = ", len(X))
    acc_svm = train_oc_svm(X_train, y_train, X_test, y_test)
    print("\t HTTP: ", acc_svm)
    
    
    # HTTPS 
    ctu_https = ctus[i][2]
    min_len = min(len(ctu_https), len(dgas_https))
    X = ctu_https[:min_len] + dgas_https[:min_len]
    y = [0]*min_len + [1]*min_len
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.55, random_state=42)
    print("\t HTTPS, data len = ", len(X))
    acc_svm = train_oc_svm(X_train, y_train, X_test, y_test)
    print("\t HTTPS: ", acc_svm)
    

42
	 UDP, data len =  2172468


KeyboardInterrupt: 

In [None]:
# resources
# oc-svm complexity (not running bcz it is taking too long)
# https://datascience.stackexchange.com/questions/989/svm-using-scikit-learn-runs-endlessly-and-never-completes-execution
