In [3]:
import pandas as pd
import numpy as np
from itertools import combinations, product

In [4]:
# Load the two labelled MCFP flow captures; each variable holds the file its name says.
normal = pd.read_csv('../../Dataset/MCFP/normal.csv')
malware = pd.read_csv('../../Dataset/MCFP/malware.csv')

# Rows from normal.csv come first, then malware.csv; the seeded shuffle below
# depends on this order, so keep it stable.
df = pd.concat([normal, malware])
# frac=1 without replacement is a full shuffle; random_state pins the permutation.
df = df.sample(frac=1, replace=False, random_state=27)
df.head(5)

Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
100360,192.168.1.113-82.202.226.189-61944-443-6,82.202.226.189,443,192.168.1.113,61944,6,19/01/1970 03:38:29 AM,1101101,2,1,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
121468,10.0.2.15-52.94.220.16-56266-443-6,10.0.2.15,56266,52.94.220.16,443,6,01/01/1970 10:12:03 AM,6984634,8,10,...,0,1264518.0,0.0,1264518.0,1264518.0,5719560.0,0.0,5719560.0,5719560.0,0
37401,195.216.249.89-10.0.2.15-443-57438-6,10.0.2.15,57438,195.216.249.89,443,6,01/01/1970 09:47:24 AM,119272058,21,28,...,0,321522.6,870723.526291,2974285.0,1346.0,9617593.0,1250030.0,10090434.0,5699785.0,0
232565,185.33.220.26-10.0.2.15-443-50794-6,10.0.2.15,50794,185.33.220.26,443,6,01/01/1970 10:43:19 AM,10766858,9,18,...,0,776271.0,0.0,776271.0,776271.0,9987692.0,0.0,9987692.0,9987692.0,0
64279,192.168.1.113-92.53.91.20-53883-443-6,192.168.1.113,53883,92.53.91.20,443,6,10/01/1970 05:57:23 AM,1148309,2,4,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [5]:
# Number of rows per value of the Label column in the combined frame.
df['Label'].value_counts()

Label
0    237835
1    116528
Name: count, dtype: int64

In [6]:
# Work on a copy: clean_df modifies the frame it receives in place, and `df`
# should stay as loaded.
df_cp = df.copy()

In [7]:
def clean_df(df):
    """Clean a flow-feature DataFrame in place and return it.

    Steps, in order (a progress line is printed after each stage):

    1. strip surrounding whitespace from the column names;
    2. set negative values in numeric columns to 0;
    3. drop columns that hold a single distinct value;
    4. turn +/-inf into NaN and drop every row containing NaN;
    5. drop duplicate rows;
    6. for every pair of columns with identical contents, drop the later one.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Remove the space before each feature names
    df.columns = df.columns.str.strip()
    print('dataset shape', df.shape)

    # This set of feature should have >= 0 values.
    # Assign the clamped values back explicitly: writing through a derived
    # numeric-only frame only reaches `df` when the two share memory, which is
    # not the case under pandas Copy-on-Write.
    # This runs before the inf handling below, so -inf is zeroed here.
    numeric_cols = df.select_dtypes(include='number').columns
    if len(numeric_cols):
        df[numeric_cols] = df[numeric_cols].clip(lower=0)

    zero_variance_cols = [col for col in df.columns if len(df[col].unique()) == 1]
    df.drop(zero_variance_cols, axis = 1, inplace = True)
    print('zero variance columns', zero_variance_cols, 'dropped')
    print('shape after removing zero variance columns:', df.shape)

    df.replace([np.inf, -np.inf], np.nan, inplace = True)
    print(df.isna().any(axis = 1).sum(), 'rows dropped')
    df.dropna(inplace = True)
    print('shape after removing nan:', df.shape)

    # Drop duplicate rows
    df.drop_duplicates(inplace = True)
    print('shape after dropping duplicates:', df.shape)

    # Compare every pair of columns; of two identical columns the first is kept.
    column_pairs = [(i, j) for i, j in combinations(df, 2) if df[i].equals(df[j])]
    # A column can be the second member of several pairs; list it only once.
    ide_cols = list(dict.fromkeys(second for _, second in column_pairs))
    df.drop(ide_cols, axis = 1, inplace = True)
    print('columns which have identical values', column_pairs, 'dropped')
    print('shape after removing identical value columns:', df.shape)
    return df
# clean_df edits the frame in place and returns that same frame.
df_cp = clean_df(df_cp)

dataset shape (354363, 84)
zero variance columns ['Protocol', 'Fwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'URG Flag Cnt', 'CWE Flag Count', 'ECE Flag Cnt', 'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg', 'Bwd Byts/b Avg', 'Bwd Pkts/b Avg', 'Bwd Blk Rate Avg', 'Init Fwd Win Byts', 'Fwd Seg Size Min'] dropped
shape after removing zero variance columns: (354363, 69)
122 rows dropped
shape after removing nan: (354241, 69)
shape after dropping duplicates: (240776, 69)
columns which have identical values [('Tot Fwd Pkts', 'Subflow Fwd Pkts'), ('Tot Bwd Pkts', 'Subflow Bwd Pkts'), ('Bwd PSH Flags', 'PSH Flag Cnt')] dropped
shape after removing identical value columns: (240776, 66)


In [9]:
# Remove the flow identifier, endpoint address/port and timestamp columns.
drop_columns = ['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Timestamp']
df_cp = df_cp.drop(columns=drop_columns)

In [16]:
# NOTE(review): this cell's execution count is higher than that of the
# columns/shape cells that follow, and their recorded output still lists this
# column, so those outputs predate this drop.
# TODO: record why 'Init Bwd Win Byts' is excluded.
df_cp = df_cp.drop(columns=['Init Bwd Win Byts'])

In [10]:
# Column labels currently in df_cp.
df_cp.columns

Index(['Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts',
       'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min',
       'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max',
       'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s',
       'Flow Pkts/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
       'Flow IAT Min', 'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std',
       'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Tot', 'Bwd IAT Mean',
       'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Bwd PSH Flags',
       'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s',
       'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std',
       'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt',
       'ACK Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg', 'Fwd Seg Size Avg',
       'Bwd Seg Size Avg', 'Subflow Fwd Byts', 'Subflow Bwd Byts',
       'Init Bwd Win Byts', 'Fwd Act Data Pkts', 'Active Mean', 'Active Std',
       'A

In [11]:
# (rows, columns) of df_cp.
df_cp.shape

(240776, 60)

In [7]:
# df_cp = df_cpz.copy()

In [17]:
from sklearn.feature_selection import SelectKBest


def select_k_best(score, X, Y, k=20):
    """Rank the ``k`` best-scoring features of ``X`` for target ``Y``.

    Parameters
    ----------
    score : callable
        Scoring function handed to ``SelectKBest``.
    X : pandas.DataFrame
        Feature matrix; its column labels are reported as the feature names.
    Y : array-like
        Target values passed to ``SelectKBest.fit``.
    k : int or 'all', default 20
        Passed to ``SelectKBest`` as the number of features to keep.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature_names`` and ``score``, one row per selected feature,
        sorted by descending score with ties broken by feature name. The frame
        is also printed.
    """
    # Only the fitted scores and support mask are used, so fit() is enough;
    # the reduced matrix fit_transform() builds would be thrown away.
    selector = SelectKBest(score, k=k).fit(X, Y)
    selected = selector.get_support()

    names = X.columns.values[selected]
    scores = selector.scores_[selected]
    names_scores = list(zip(names, scores))
    df_reduced = pd.DataFrame(data=names_scores, columns=['feature_names', 'score'])

    df_reduced = df_reduced.sort_values(['score', 'feature_names'], ascending=[False, True])
    print(df_reduced)
    return df_reduced

In [19]:
from sklearn.feature_selection import mutual_info_classif, f_classif

# Score every non-Label column against Label. k='all' keeps all of them without
# hard-coding the current column count. mutual_info_classif uses random numbers
# internally, so its random_state is pinned to make the ranking reproducible.
frs_mi = select_k_best(
    lambda X, y: mutual_info_classif(X, y, random_state=27),
    df_cp[df_cp.columns.difference(['Label'])],
    df_cp.Label,
    k='all',
)

        feature_names     score
22       Flow IAT Max  0.514097
21      Flow Duration  0.510489
26        Flow Pkts/s  0.479085
23      Flow IAT Mean  0.478589
38         Fwd Pkts/s  0.474143
16         Bwd Pkts/s  0.472928
24       Flow IAT Min  0.419831
29        Fwd IAT Max  0.413511
33        Fwd IAT Tot  0.412804
5      Bwd Header Len  0.398011
30       Fwd IAT Mean  0.390519
31        Fwd IAT Min  0.386703
25       Flow IAT Std  0.363523
47        Pkt Len Std  0.307235
48        Pkt Len Var  0.306844
45       Pkt Len Mean  0.305790
49       Pkt Size Avg  0.305517
37    Fwd Pkt Len Std  0.300837
35   Fwd Pkt Len Mean  0.298744
39   Fwd Seg Size Avg  0.298652
53   Subflow Fwd Byts  0.296726
57    TotLen Fwd Pkts  0.294227
6         Bwd IAT Max  0.279532
44        Pkt Len Max  0.276974
10        Bwd IAT Tot  0.273731
12    Bwd Pkt Len Max  0.261420
7        Bwd IAT Mean  0.253682
20        Flow Byts/s  0.250771
40           Idle Max  0.246019
41          Idle Mean  0.245559
42      

In [28]:
# Names of the features whose score in frs_mi is at least 0.3.
features = frs_mi.loc[frs_mi['score'] >= 0.3, 'feature_names']
features

22       Flow IAT Max
21      Flow Duration
26        Flow Pkts/s
23      Flow IAT Mean
38         Fwd Pkts/s
16         Bwd Pkts/s
24       Flow IAT Min
29        Fwd IAT Max
33        Fwd IAT Tot
5      Bwd Header Len
30       Fwd IAT Mean
31        Fwd IAT Min
25       Flow IAT Std
47        Pkt Len Std
48        Pkt Len Var
45       Pkt Len Mean
49       Pkt Size Avg
37    Fwd Pkt Len Std
Name: feature_names, dtype: object

In [10]:
# X = df_cp.drop(['Label'], axis=1)
# y = df_cp.Label
# Hand-written feature list. Run top to bottom, this rebinds `features` and
# replaces the Series produced by the score-threshold cell.
# NOTE(review): this cell's execution count repeats one used by an earlier cell,
# so it may not have been run in the session that produced the results below -
# confirm which `features` the evaluation cells actually used.
features = ['Flow Duration', 'Flow IAT Max', 'Flow Pkts/s', 'Flow IAT Mean', 'Fwd Pkts/s', 'Bwd Pkts/s']
# get_dummies one-hot encodes any non-numeric columns among the selected ones.
X = pd.get_dummies(df_cp[features])
# X = df_test.drop(['Label'], axis=1)
# NOTE(review): X and y are not referenced by the later cells visible here;
# the evaluation calls rebuild their inputs from df_cp directly.
y = df_cp.Label

Test ML

In [38]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
from xgboost import XGBClassifier
from sklearn import metrics
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn import svm
import time

In [35]:
def test_ml(X, y, model=None):
    """Fit a classifier on a stratified 70/30 split and print its test metrics.

    Prints the prediction time on the test split, then accuracy, precision,
    recall and F1 (one per line, in that order).

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Feature matrix.
    y : array-like
        Class labels; also used to stratify the split.
    model : estimator with ``fit``/``predict``, optional
        Classifier to evaluate. Defaults to
        ``RandomForestClassifier(random_state=27)``.

    Returns
    -------
    None
    """
    if model is None:
        model = RandomForestClassifier(random_state=27)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, shuffle = True, stratify = y, random_state = 0, test_size = 0.3
    )

    # Fit the scaler on the training rows only and reuse it for the test rows,
    # so no statistics of the held-out data leak into preprocessing.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model.fit(X_train, y_train)

    # Time prediction only (not training); perf_counter is a monotonic clock.
    tic = time.perf_counter()
    preds_val = model.predict(X_test)
    tac = time.perf_counter()
    print ("Test time: " + str(tac - tic))

    print(accuracy_score(y_test, preds_val))
    print(precision_score(y_test, preds_val))
    print(recall_score(y_test, preds_val))
    print(f1_score(y_test, preds_val))

# Evaluate on the selected `features` subset, then on every non-Label column.
test_ml(pd.get_dummies(df_cp[features]), df_cp['Label'])
test_ml(df_cp.drop(columns=['Label']), df_cp['Label'])

Test time: 1.3202862739562988
0.9912090041947587
0.9930441780920353
0.9887527903382749
0.9908938379246554
Test time: 0.9070513248443604
0.9974803760054268
0.9974811083123426
0.9973098277144983
0.9973954606600074


In [36]:
def test_ml2(X, y, model=None):
    """Fit an XGBoost classifier on a stratified 70/30 split and print test metrics.

    Prints the prediction time on the test split, then accuracy, precision,
    recall and F1 (one per line, in that order).

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Feature matrix.
    y : array-like
        Class labels; also used to stratify the split.
    model : estimator with ``fit``/``predict``, optional
        Classifier to evaluate. Defaults to ``xgb.XGBClassifier`` with
        ``n_estimators=1000, max_depth=3, gamma=0.1, min_child_weight=5``.

    Returns
    -------
    None
    """
    if model is None:
        model = xgb.XGBClassifier(n_estimators=1000, max_depth=3, gamma=0.1, min_child_weight=5)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, shuffle = True, stratify = y, random_state = 0, test_size = 0.3
    )

    # Fit the scaler on the training rows only and reuse it for the test rows,
    # so no statistics of the held-out data leak into preprocessing.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model.fit(X_train, y_train)

    # Time prediction only (not training); perf_counter is a monotonic clock.
    tic = time.perf_counter()
    preds_val = model.predict(X_test)
    tac = time.perf_counter()
    print ("Test time: " + str(tac - tic))

    print(accuracy_score(y_test, preds_val))
    print(precision_score(y_test, preds_val))
    print(recall_score(y_test, preds_val))
    print(f1_score(y_test, preds_val))

# Evaluate on the selected `features` subset, then on every non-Label column.
test_ml2(pd.get_dummies(df_cp[features]), df_cp['Label'])
test_ml2(df_cp.drop(columns=['Label']), df_cp['Label'])

Test time: 0.1848912239074707
0.992870294740631
0.993718628997562
0.9915288191860798
0.9926225163665535
Test time: 0.1916670799255371
0.9978818545540127
0.9980244509977954
0.9975960162555091
0.9978101876368632


In [39]:
def test_ml3(X, y, model=None):
    """Fit an RBF-kernel SVM on a stratified 70/30 split and print test metrics.

    Prints the prediction time on the test split, then accuracy, precision,
    recall and F1 (one per line, in that order).

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Feature matrix.
    y : array-like
        Class labels; also used to stratify the split.
    model : estimator with ``fit``/``predict``, optional
        Classifier to evaluate. Defaults to
        ``svm.SVC(kernel='rbf', C=110, gamma=0.1)``.

    Returns
    -------
    None
    """
    if model is None:
        model = svm.SVC(kernel='rbf', C=110, gamma=0.1)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, shuffle = True, stratify = y, random_state = 0, test_size = 0.3
    )

    # Fit the scaler on the training rows only and reuse it for the test rows,
    # so no statistics of the held-out data leak into preprocessing.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model.fit(X_train, y_train)

    # Time prediction only (not training); perf_counter is a monotonic clock.
    tic = time.perf_counter()
    preds_val = model.predict(X_test)
    tac = time.perf_counter()
    print ("Test time: " + str(tac - tic))

    print(accuracy_score(y_test, preds_val))
    print(precision_score(y_test, preds_val))
    print(recall_score(y_test, preds_val))
    print(f1_score(y_test, preds_val))

# Evaluate on the selected `features` subset, then on every non-Label column.
test_ml3(pd.get_dummies(df_cp[features]), df_cp['Label'])
test_ml3(df_cp.drop(columns=['Label']), df_cp['Label'])

Test time: 173.5073480606079
0.9615826561267011
0.9657968664021547
0.9543815465628757
0.9600552748628924
Test time: 88.47316765785217
0.9958744618110835
0.996816382722423
0.9946482742830977
0.9957311482924593
