In [1]:
import pandas as pd
import numpy as np
from itertools import combinations, product

In [2]:
# Load the two MCFP flow captures. Each variable now matches the file it reads
# (previously `normal` held malware.csv and `malware` held normal.csv).
normal = pd.read_csv('../Dataset/MCFP/normal.csv')
malware = pd.read_csv('../Dataset/MCFP/malware.csv')

# Stack normal.csv rows first, then malware.csv rows -- the same row order the
# original cell produced -- so the seeded shuffle below is unchanged.
df = pd.concat([normal, malware])
# Shuffle all rows once (frac=1, without replacement) with a fixed seed.
df = df.sample(frac=1, replace=False, random_state=27)
df.head(5)

Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
100360,192.168.1.113-82.202.226.189-61944-443-6,82.202.226.189,443,192.168.1.113,61944,6,19/01/1970 03:38:29 AM,1101101,2,1,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
121468,10.0.2.15-52.94.220.16-56266-443-6,10.0.2.15,56266,52.94.220.16,443,6,01/01/1970 10:12:03 AM,6984634,8,10,...,0,1264518.0,0.0,1264518.0,1264518.0,5719560.0,0.0,5719560.0,5719560.0,0
37401,195.216.249.89-10.0.2.15-443-57438-6,10.0.2.15,57438,195.216.249.89,443,6,01/01/1970 09:47:24 AM,119272058,21,28,...,0,321522.6,870723.526291,2974285.0,1346.0,9617593.0,1250030.0,10090434.0,5699785.0,0
232565,185.33.220.26-10.0.2.15-443-50794-6,10.0.2.15,50794,185.33.220.26,443,6,01/01/1970 10:43:19 AM,10766858,9,18,...,0,776271.0,0.0,776271.0,776271.0,9987692.0,0.0,9987692.0,9987692.0,0
64279,192.168.1.113-92.53.91.20-53883-443-6,192.168.1.113,53883,92.53.91.20,443,6,10/01/1970 05:57:23 AM,1148309,2,4,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [3]:
# Number of rows per Label value in the combined, shuffled data.
df['Label'].value_counts()

Label
0    237835
1    116528
Name: count, dtype: int64

In [4]:
# Work on a copy: clean_df below modifies its argument in place, and this keeps
# the shuffled raw frame `df` intact.
df_cp = df.copy()

In [5]:
def clean_df(df):
    """Clean a flow-feature DataFrame in place and return it.

    Steps, in order (a progress line is printed after each):
      1. Strip surrounding whitespace from the column names.
      2. Clamp negative values in numeric columns to 0.
      3. Drop columns that hold a single distinct value.
      4. Replace +/-inf with NaN, then drop every row containing a NaN.
      5. Drop duplicate rows.
      6. For every pair of columns with identical contents, drop the
         later column of the pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    df.columns = df.columns.str.strip()
    print('dataset shape', df.shape)

    # Clamp negatives via an explicit column assignment. The previous
    # `num = df._get_numeric_data(); num[num < 0] = 0` used a private API and
    # only worked while `num` shared memory with `df`; with copy-on-write
    # enabled it silently leaves `df` unchanged.
    numeric_cols = df.select_dtypes(include='number').columns
    if len(numeric_cols) > 0:
        df[numeric_cols] = df[numeric_cols].clip(lower=0)

    # dropna=False counts NaN as a value, matching len(Series.unique()).
    zero_variance_cols = [
        col for col in df.columns if df[col].nunique(dropna=False) == 1
    ]
    df.drop(columns=zero_variance_cols, inplace=True)
    print('zero variance columns', zero_variance_cols, 'dropped')
    print('shape after removing zero variance columns:', df.shape)

    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    print(df.isna().any(axis=1).sum(), 'rows dropped')
    df.dropna(inplace=True)
    print('shape after removing nan:', df.shape)

    # Drop duplicate rows
    df.drop_duplicates(inplace=True)
    print('shape after dropping duplicates:', df.shape)

    column_pairs = [
        (i, j) for i, j in combinations(df.columns, 2) if df[i].equals(df[j])
    ]
    # Three or more mutually identical columns yield the same "later" column
    # several times; keep each name once, in first-seen order.
    ide_cols = list(dict.fromkeys(later for _, later in column_pairs))
    df.drop(columns=ide_cols, inplace=True)
    print('columns which have identical values', column_pairs, 'dropped')
    print('shape after removing identical value columns:', df.shape)
    return df
# clean_df modifies df_cp in place and returns it; the assignment keeps the name bound.
df_cp = clean_df(df_cp)

dataset shape (354363, 84)
zero variance columns ['Protocol', 'Fwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'URG Flag Cnt', 'CWE Flag Count', 'ECE Flag Cnt', 'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg', 'Bwd Byts/b Avg', 'Bwd Pkts/b Avg', 'Bwd Blk Rate Avg', 'Init Fwd Win Byts', 'Fwd Seg Size Min'] dropped
shape after removing zero variance columns: (354363, 69)
122 rows dropped
shape after removing nan: (354241, 69)
shape after dropping duplicates: (240776, 69)
columns which have identical values [('Tot Fwd Pkts', 'Subflow Fwd Pkts'), ('Tot Bwd Pkts', 'Subflow Bwd Pkts'), ('Bwd PSH Flags', 'PSH Flag Cnt')] dropped
shape after removing identical value columns: (240776, 66)


In [6]:
# Columns remaining after cleaning.
df_cp.columns

Index(['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Timestamp',
       'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts',
       'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min',
       'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max',
       'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s',
       'Flow Pkts/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
       'Flow IAT Min', 'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std',
       'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Tot', 'Bwd IAT Mean',
       'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Bwd PSH Flags',
       'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s',
       'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std',
       'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt',
       'ACK Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg', 'Fwd Seg Size Avg',
       'Bwd Seg Size Avg', 'Subflow Fwd Byts', 'Subflow Bwd Byts',
       'Init

In [7]:
# Flow identifier, endpoint and timestamp fields are removed before modelling.
drop_columns = [
    'Flow ID',
    'Src IP', 'Src Port',
    'Dst IP', 'Dst Port',
    'Timestamp',
]
df_cp = df_cp.drop(columns=drop_columns)

In [7]:
# df_cp = df_cpz.copy()

In [34]:
from sklearn.feature_selection import SelectKBest


def select_k_best(score, X, Y, k=20):
    """Rank the ``k`` highest-scoring features of ``X`` for predicting ``Y``.

    Parameters
    ----------
    score : callable
        Scoring function handed to ``SelectKBest`` (e.g. ``f_classif``).
    X : pandas.DataFrame
        Feature frame; its column labels become the reported feature names.
    Y : array-like
        Target values passed to the selector.
    k : int, default 20
        Number of features to keep.

    Returns
    -------
    pandas.Series
        Names of the selected features, ordered by descending score with
        ties broken by name. The ranking table is also printed.
    """
    selector = SelectKBest(score, k=k)
    # Only the fitted scores and support mask are used below, so fit() is
    # enough; fit_transform() also built a reduced matrix that was discarded.
    selector.fit(X, Y)
    support = selector.get_support()
    df_reduced = pd.DataFrame({
        'feature_names': X.columns.values[support],
        'score': selector.scores_[support],
    })

    df_reduced = df_reduced.sort_values(['score', 'feature_names'], ascending=[False, True])
    print(df_reduced)
    return df_reduced.feature_names

In [35]:
from sklearn.feature_selection import mutual_info_classif, f_classif

# Rank features by mutual information with Label.
# mutual_info_classif draws random numbers internally, so it is wrapped with a
# fixed random_state (27, the seed used elsewhere in this notebook) to make the
# ranking reproducible across runs.
frs_mi = select_k_best(
    lambda X, y: mutual_info_classif(X, y, random_state=27),
    df_cp[df_cp.columns.difference(['Label'])],
    df_cp.Label,
)

        feature_names     score
15  Init Bwd Win Byts  0.576895
3        Flow IAT Max  0.514007
2       Flow Duration  0.510121
7         Flow Pkts/s  0.478815
4       Flow IAT Mean  0.478207
13         Fwd Pkts/s  0.474387
1          Bwd Pkts/s  0.472658
5        Flow IAT Min  0.421042
11        Fwd IAT Tot  0.412886
8         Fwd IAT Max  0.412566
0      Bwd Header Len  0.397485
9        Fwd IAT Mean  0.389084
10        Fwd IAT Min  0.384907
6        Flow IAT Std  0.364212
16       Pkt Len Mean  0.306988
18        Pkt Len Var  0.305920
17        Pkt Len Std  0.305860
19       Pkt Size Avg  0.304943
12    Fwd Pkt Len Std  0.299988
14   Fwd Seg Size Avg  0.298739


In [None]:
# Same top-k ranking over every non-Label column, scored with f_classif instead of mutual information.
frs_anova = select_k_best(f_classif, df_cp[df_cp.columns.difference(['Label'])], df_cp.Label)

In [10]:
# Pairwise correlation between all columns of df_cp, then each column's
# correlation with Label, highest first.
# NOTE(review): the saved output for this cell lists names such as
# 'Init_Win_bytes_forward' that do not appear in the df_cp.columns listings in
# this notebook -- it looks stale; re-run to confirm.
correlation =df_cp.corr()
correlation['Label'].sort_values(ascending = False)

Label                     1.000000
Fwd IAT Total             0.033909
Init_Win_bytes_forward    0.029300
Fwd IAT Max               0.028861
PSH Flag Count            0.025404
                            ...   
ACK Flag Count           -0.040995
URG Flag Count           -0.043195
Bwd PSH Flags                  NaN
CWE Flag Count                 NaN
Fwd Avg Bytes/Bulk             NaN
Name: Label, Length: 63, dtype: float64

In [11]:
# Drop every column whose correlation with Label is undefined (NaN).
# The list is derived from `correlation` instead of hard-coded names: the
# previous literal list named 'CWE Flag Count' and 'Fwd Avg Bytes/Bulk', which
# are not among df_cp's columns here, so DataFrame.drop raised KeyError.
# Intersecting with the current columns keeps the cell safe to re-run.
drop_columns = (
    correlation.index[correlation['Label'].isna()]
    .intersection(df_cp.columns)
    .tolist()
)
df_cp = df_cp.drop(columns=drop_columns)

In [17]:
# Keep the columns whose correlation with Label is at least 0.02.
# NOTE(review): 'Label' itself (correlation 1.0) passes this filter, so it must
# be removed before `features` is used as model input.
# NOTE(review): the threshold is signed, so negatively correlated columns are
# excluded whatever their magnitude -- confirm that is intended.
corrlabel = correlation['Label'].sort_values(ascending = False)
features = corrlabel[corrlabel >= 0.02].index
features

Index(['Label', 'Fwd IAT Total', 'Init_Win_bytes_forward', 'Fwd IAT Max',
       'PSH Flag Count', 'Fwd IAT Std', 'Flow Duration', 'Min Packet Length',
       'Active Mean', 'Active Max', 'Active Min'],
      dtype='object')

In [26]:
# NOTE(review): none of the cells visible above this one assigns X (it is first
# bound in a later cell), so this presumably fails on Restart & Run All.
X.head()

Unnamed: 0,Fwd IAT Total,Init_Win_bytes_forward,Fwd IAT Max,PSH Flag Count,Fwd IAT Std,Flow Duration,Min Packet Length,Active Mean,Active Max,Active Min
0,194.0,1420.0,194.0,0.0,0.0,194.0,0.0,0.0,0.0,0.0
1,5.0,1593.0,5.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0
2,150895.0,65535.0,51773.0,1.0,20568.245793,199542.0,0.0,0.0,0.0,0.0
3,254.0,1594.0,254.0,0.0,0.0,254.0,0.0,0.0,0.0,0.0
4,0.0,349.0,0.0,0.0,0.0,2164751.0,0.0,0.0,0.0,0.0


In [27]:
# Collect (and print) every column of X that contains at least one negative
# value, together with the number of offending rows.
ftpos = []
for ft in X.columns:
    n_negative = int((X[ft] < 0).sum())
    if n_negative == 0:
        continue
    ftpos.append(ft)
    print(ft)
    print(n_negative)

In [8]:
# Columns currently present in df_cp.
df_cp.columns

Index(['Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts',
       'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min',
       'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max',
       'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s',
       'Flow Pkts/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
       'Flow IAT Min', 'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std',
       'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Tot', 'Bwd IAT Mean',
       'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Bwd PSH Flags',
       'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s',
       'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std',
       'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt',
       'ACK Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg', 'Fwd Seg Size Avg',
       'Bwd Seg Size Avg', 'Subflow Fwd Byts', 'Subflow Bwd Byts',
       'Init Bwd Win Byts', 'Fwd Act Data Pkts', 'Active Mean', 'Active Std',
       'A

In [10]:
# Hand-picked duration / inter-arrival / packet-rate features for the baseline models.
features = [
    'Flow Duration',
    'Flow IAT Max',
    'Flow Pkts/s',
    'Flow IAT Mean',
    'Fwd Pkts/s',
    'Bwd Pkts/s',
]
# get_dummies one-hot encodes any non-numeric column among the selection.
X = pd.get_dummies(df_cp[features])
y = df_cp.Label

Scaler

In [13]:
from sklearn.preprocessing import StandardScaler

# Standardise the feature columns. X is rebound to the scaler's output here.
# NOTE(review): the scaler is fit on all rows before the train/test split in
# the next cell, so statistics from the future test rows influence the
# transform. Consider splitting first and fitting on the training rows only.
scaler = StandardScaler()
X = scaler.fit_transform(X)

Test ML models

In [14]:
from sklearn.model_selection import train_test_split

# Shuffled 70/30 split, stratified on y and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, stratify = y, random_state = 0, test_size = 0.3)

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from xgboost import XGBClassifier


In [16]:
# Baseline: random forest with default hyper-parameters and a fixed seed, fit on the training split.
model = RandomForestClassifier(random_state=27)
model.fit(X_train, y_train)

In [17]:
from sklearn import metrics

# Score the fitted model on the rows it was trained on.
# NOTE: despite its name, preds_val holds training-set predictions here.
preds_val = model.predict(X_train)
classification = metrics.classification_report(y_train, preds_val)
confusion_matrix = metrics.confusion_matrix(y_train, preds_val)

In [18]:
# Training-split accuracy, confusion matrix and per-class report.
print(accuracy_score(y_train, preds_val))
print("Confusion matrix:\n", confusion_matrix)
print("Classification report:\n", classification)

0.993265813471933
Confusion matrix:
 [[86376   637]
 [  498 81032]]
Classification report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     87013
           1       0.99      0.99      0.99     81530

    accuracy                           0.99    168543
   macro avg       0.99      0.99      0.99    168543
weighted avg       0.99      0.99      0.99    168543



In [26]:
from sklearn import metrics

# Score the same model on the held-out split; this overwrites the training-set
# results stored under the same three names.
preds_val = model.predict(X_test)
classification = metrics.classification_report(y_test, preds_val)
confusion_matrix = metrics.confusion_matrix(y_test, preds_val)

1.1618549823760986


In [20]:
# Held-out accuracy, confusion matrix and per-class report.
print(accuracy_score(y_test, preds_val))
print("Confusion matrix:\n", confusion_matrix)
print("Classification report:\n", classification)

0.9752467708665015
Confusion matrix:
 [[36558   733]
 [ 1055 33887]]
Classification report:
               precision    recall  f1-score   support

           0       0.97      0.98      0.98     37291
           1       0.98      0.97      0.97     34942

    accuracy                           0.98     72233
   macro avg       0.98      0.98      0.98     72233
weighted avg       0.98      0.98      0.98     72233



In [27]:
# Class counts in the held-out split.
y_test.value_counts()

Label
0    37291
1    34942
Name: count, dtype: int64

In [21]:
import xgboost as xgb

# Second model: XGBoost classifier with 1000 estimators of max depth 3, fit on the same training split.
model2 = xgb.XGBClassifier(n_estimators=1000, max_depth=3, gamma=0.1, min_child_weight=5)
model2.fit(X_train, y_train)

In [22]:
# XGBoost on the training split: predictions, then accuracy / confusion matrix / report.
preds_val = model2.predict(X_train)
confusion_matrix = metrics.confusion_matrix(y_train, preds_val)
classification = metrics.classification_report(y_train, preds_val)

print(accuracy_score(y_train, preds_val))
print("Confusion matrix:\n", confusion_matrix)
print("Classification report:\n", classification)

0.9816070676325923
Confusion matrix:
 [[85789  1224]
 [ 1876 79654]]
Classification report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98     87013
           1       0.98      0.98      0.98     81530

    accuracy                           0.98    168543
   macro avg       0.98      0.98      0.98    168543
weighted avg       0.98      0.98      0.98    168543



In [23]:
# XGBoost on the held-out split: predictions, then accuracy / confusion matrix / report.
preds_val = model2.predict(X_test)
confusion_matrix = metrics.confusion_matrix(y_test, preds_val)
classification = metrics.classification_report(y_test, preds_val)

print(accuracy_score(y_test, preds_val))
print("Confusion matrix:\n", confusion_matrix)
print("Classification report:\n", classification)

0.975426744008971
Confusion matrix:
 [[36612   679]
 [ 1096 33846]]
Classification report:
               precision    recall  f1-score   support

           0       0.97      0.98      0.98     37291
           1       0.98      0.97      0.97     34942

    accuracy                           0.98     72233
   macro avg       0.98      0.98      0.98     72233
weighted avg       0.98      0.98      0.98     72233



In [39]:
# Model input restricted to the features selected by mutual information (frs_mi);
# unlike X above, this frame is not passed through the scaler.
df_test = pd.get_dummies(df_cp[frs_mi])
X1 = df_test
y1 = df_cp.Label

In [24]:
from sklearn.model_selection import train_test_split

# Random forest on the mutual-information feature set: stratified 70/30 split,
# fit, then report on the training split followed by the held-out split.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X1,
    y1,
    shuffle=True,
    stratify=y1,
    random_state=0,
    test_size=0.3,
)

model = RandomForestClassifier(random_state=27)
model.fit(X_train1, y_train1)

for _X_split, _y_split in ((X_train1, y_train1), (X_test1, y_test1)):
    preds_val = model.predict(_X_split)
    classification = metrics.classification_report(_y_split, preds_val)
    confusion_matrix = metrics.confusion_matrix(_y_split, preds_val)

    print(accuracy_score(_y_split, preds_val))
    print("Confusion matrix:\n", confusion_matrix)
    print("Classification report:\n", classification)

NameError: name 'X1' is not defined

In [None]:
def model_run(X1, y1, features):
    """Train a random forest on the ``features`` columns of ``X1`` and print its scores.

    Parameters
    ----------
    X1 : pandas.DataFrame
        Feature frame; it must contain every name in ``features``.
    y1 : array-like
        Labels aligned with the rows of ``X1``.
    features : iterable of str
        Column names to train on (e.g. the Series returned by
        ``select_k_best``).

    Returns
    -------
    None
        Accuracy, confusion matrix and classification report are printed for
        the training split and then for the held-out split.
    """
    # Previously the subset was built from the global df_cp and then never
    # used: the model was fit on all of X1, so `features` had no effect.
    X = pd.get_dummies(X1[list(features)])
    y = y1

    X_train1, X_test1, y_train1, y_test1 = train_test_split(
        X, y, shuffle=True, stratify=y, random_state=0, test_size=0.3
    )

    model = RandomForestClassifier(random_state=27)
    model.fit(X_train1, y_train1)

    # Training split first, then the held-out split.
    for X_part, y_part in ((X_train1, y_train1), (X_test1, y_test1)):
        preds_val = model.predict(X_part)
        classification = metrics.classification_report(y_part, preds_val)
        confusion_matrix = metrics.confusion_matrix(y_part, preds_val)

        print(accuracy_score(y_part, preds_val))
        print("Confusion matrix:" "\n", confusion_matrix)
        print("Classification report:" "\n", classification)

In [None]:
# Sweep the number of selected features from 3 to 19 and report a random
# forest for each subset.
# The original call passed `X` and `Y`: no visible cell defines `Y`, and `X`
# has been rebound to the scaler's output, which has no `.columns` for
# select_k_best to read. Both selection and training therefore start from the
# full, unscaled feature frame.
X_all = df_cp[df_cp.columns.difference(['Label'])]
y_all = df_cp.Label
for i in range(3, 20):
    model_run(X_all, y_all, select_k_best(mutual_info_classif, X_all, y_all, k=i))