In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import check_is_fitted
from sklearn.cluster import KMeans
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, plot_roc_curve
import numpy as np

%matplotlib inline

In [3]:
mailicious_files = ["bruteforce.csv", "capture_flood.csv", "capture_malariaDoS.csv", "malformed.csv", "slowite.csv"]
benign_files = ["capture_1w.csv", "normal.csv"]
#list of models that are trained, remove some to test fewer.
model_names = ["isolation_forest", "random_forest", "k_means", "one_class_svm"]
isolation_forest = None
random_forest = None
k_means = None
one_class_svm = None

In [4]:
df = pd.read_csv("all.csv")
df.head()

Unnamed: 0,index,packet_len,ip_len,ip_df,ip_mf,ip_ttl,tcp_len,tcp_pdu_size,tcp_ack,tcp_cwr,...,tcp_urg,tcp_src_port,tcp_dst_port,tcp_tdelta,tcp_l20_avg,mqtt_len,mqtt_topic_len,mqtt_msg_type,mqtt_qos_lvl,output
0,5380348,66,52,1,0,63,32,8,1,0,...,0,1883,35821,134,31,0,0,0,0,1
1,397915,66,52,1,0,64,32,8,1,0,...,0,35042,1883,30,-1216,0,0,0,0,0
2,11873128,66,52,1,0,64,32,8,1,0,...,0,44715,1883,45,0,0,0,0,0,1
3,2222605,66,52,1,0,64,32,8,1,0,...,0,52961,1883,53,1,0,0,0,0,1
4,11898510,66,52,1,0,64,32,8,1,0,...,0,59159,1883,64,0,0,0,0,0,1


In [4]:
def train_supervised(X, y):
    global random_forest, k_means, isolation_clf
    for model_name in model_names:
        if model_name == "random_forest":
            if random_forest == None:
                random_forest = RandomForestClassifier(max_depth=2, random_state=0)
            random_forest.fit(X, y)
            print("random_forest:", random_forest.score(X, y))    
        elif model_name == "k_means":
            if k_means == None:
                k_means = KMeans(n_clusters=2, random_state=0)
            k_means.fit(X)
            print("random_forest:", k_means.score(X, y))
            continue

In [12]:
def train_unsupervised(X):
    global random_forest, k_means, isolation_clf, one_class_svm
    for model_name in model_names:
        if model_names == "isolation_forest":
            if isolation_forest == None:
                isolation_forest = IsolationForest(random_state=0)
            isolation_forest.fit(X)   
        elif model_name == "one_class_svm":
            if one_class_svm == None:
                one_class_svm = OneClassSVM(gamma='auto')
            one_class_svm.fit(X)

In [None]:
def show_metrics(clf, X, y_test, y_pred_test):
    print("accuracy score: {}".format(accuracy_score(y_test, y_pred_test)
    print(classification_report(y_test, y_pred_test))
    plot_roc_curve(clf, X_test, y_test)

In [6]:
def test(X, y):
    global random_forest, k_means, isolation_clf
    for model_name in model_names:
        if model_names == "isolation_forest":
            if isolation_clf == None:
                print("isolation_forest model not trained or fitted")
            else:
                print("isolation_forest:::")
                show_metrics(isolation_clf, X, y, isolation_clf.predict(X))
        elif model_name == "random_forest":
            if random_forest == None or not check_is_fitted(random_forest):
                print("Random_forest model not trained or fitted")
            else:
                print("random forest:::")
                show_metrics(random_forest, X, y, random_forest.predict(X))
        elif model_name == "k_means":
            if k_means == None:
                print("k_means model not trained or fitted")
            else:
                print("k_means:::")
                show_metrics(k_means, X, y, k_means.predict(X))

In [6]:
y = df['output'].to_numpy().ravel()
df.drop(['output'], axis=1)
X = df.to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [10]:
benign_df = pd.read_csv("benign.csv")
benign_df.drop(['output'], axis=1)
X = benign_df.to_numpy()
X1_train, X1_test = train_test_split(X, test_size=0.33, random_state=42)

In [None]:
train_unsupervised(X1_train)

In [7]:
train(X_train, y_train)

In [9]:
test(X_test, y_test)

random_forest: 1.0
