# Random Forest - Classification Step 5 (Binary classification - BENIGN/MALICIOUS)

In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import numpy as np
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

In [2]:
# importing dataset
ids_dataset = pd.read_csv("IDS_merged_dataset.csv")
ids_dataset.head()

  ids_dataset = pd.read_csv("IDS_merged_dataset.csv")


Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,Total Backward Packets,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,192.168.10.5-8.254.250.126-49188-80-6,8.254.250.126,80.0,192.168.10.5,49188.0,6.0,03/07/2017 08:55:58,4.0,2.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
1,192.168.10.5-8.254.250.126-49188-80-6,8.254.250.126,80.0,192.168.10.5,49188.0,6.0,03/07/2017 08:55:58,1.0,2.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
2,192.168.10.5-8.254.250.126-49188-80-6,8.254.250.126,80.0,192.168.10.5,49188.0,6.0,03/07/2017 08:55:58,1.0,2.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
3,192.168.10.5-8.254.250.126-49188-80-6,8.254.250.126,80.0,192.168.10.5,49188.0,6.0,03/07/2017 08:55:58,1.0,2.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
4,192.168.10.14-8.253.185.121-49486-80-6,8.253.185.121,80.0,192.168.10.14,49486.0,6.0,03/07/2017 08:56:22,3.0,2.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN


In [3]:
# Classes
ids_dataset[' Label'].unique()

array(['BENIGN', 'FTP-Patator', 'SSH-Patator', 'DoS slowloris',
       'DoS Slowhttptest', 'DoS Hulk', 'DoS GoldenEye', 'Heartbleed',
       'Infiltration', 'Web Attack – Brute Force', 'Web Attack – XSS',
       'Web Attack – Sql Injection', nan, 'Bot', 'DDoS', 'PortScan'],
      dtype=object)

In [4]:
ids_dataset.shape

(3119345, 85)

In [5]:
# dataset information
# ids_dataset.info()

In [6]:
# Dataset columns
# ids_dataset.columns

In [7]:
# # dropping unwanted columns
# columns_to_be_dropped = [
#     'Flow ID', 
#     ' Source IP', 
#     ' Source Port',
#     ' Destination IP', 
#     ' Destination Port', 
#     ' Protocol',
#     ' Timestamp',
#     ' Label'
# ]

# pre_pca_ids_dataset = ids_dataset.drop(columns_to_be_dropped, axis=1)
# pre_pca_ids_dataset.head()

In [8]:
selected_features = [' Destination Port', ' ACK Flag Count', ' Flow Packets/s', 'Fwd IAT Total', ' Flow Packets/s', 
                    ' Fwd IAT Mean', ' Packet Length Variance', ' Flow Duration', ' Flow IAT Max', ' Bwd Packet Length Std',
                    ' Fwd IAT Std', ' Bwd Packets/s', ' Flow IAT Std', 'Init_Win_bytes_forward', ' Label']
selected_dataset = ids_dataset[selected_features].copy()
selected_dataset.head()

Unnamed: 0,Destination Port,ACK Flag Count,Flow Packets/s,Fwd IAT Total,Flow Packets/s.1,Fwd IAT Mean,Packet Length Variance,Flow Duration,Flow IAT Max,Bwd Packet Length Std,Fwd IAT Std,Bwd Packets/s,Flow IAT Std,Init_Win_bytes_forward,Label
0,49188.0,1.0,500000.0,4.0,500000.0,4.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,329.0,BENIGN
1,49188.0,1.0,2000000.0,1.0,2000000.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,329.0,BENIGN
2,49188.0,1.0,2000000.0,1.0,2000000.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,329.0,BENIGN
3,49188.0,1.0,2000000.0,1.0,2000000.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,329.0,BENIGN
4,49486.0,1.0,666666.7,3.0,666666.7,3.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,245.0,BENIGN


In [9]:
selected_dataset.shape

(3119345, 15)

In [10]:
# Dropping missing values
selected_dataset.dropna(inplace=True)

In [11]:
selected_dataset.shape

(2830743, 15)

In [12]:
# Converting infinity to nan
selected_dataset.replace([np.inf, -np.inf], np.nan, inplace=True)

In [13]:
# Dropping na
selected_dataset.dropna(inplace=True)

In [14]:
selected_dataset.shape

(2827876, 15)

In [15]:
selected_dataset[' Label'] = selected_dataset[' Label'].apply(lambda value: value if value == 'BENIGN' else 'MALICIOUS')
selected_dataset[' Label'].value_counts()

BENIGN       2271320
MALICIOUS     556556
Name:  Label, dtype: int64

In [16]:
desired_samples = selected_dataset[' Label'].value_counts()['BENIGN'] - selected_dataset[' Label'].value_counts()['MALICIOUS']
desired_samples = (desired_samples/selected_dataset[' Label'].value_counts()['BENIGN'])

In [17]:
# generating synthetic samples
smote = SMOTE(random_state=42, sampling_strategy=desired_samples)
X = selected_dataset.drop(' Label', axis=1)
y = selected_dataset[' Label']
new_X, new_y = smote.fit_resample(X, y)
new_X[' Label'] = new_y
print(new_X[' Label'].value_counts())

BENIGN       2271320
MALICIOUS    1714764
Name:  Label, dtype: int64


In [18]:
# combining generated data with original
dataset_with_synthetic_data = pd.concat([selected_dataset, new_X[new_X[' Label'] == 'MALICIOUS']])
dataset_with_synthetic_data.head()

Unnamed: 0,Destination Port,ACK Flag Count,Flow Packets/s,Fwd IAT Total,Flow Packets/s.1,Fwd IAT Mean,Packet Length Variance,Flow Duration,Flow IAT Max,Bwd Packet Length Std,Fwd IAT Std,Bwd Packets/s,Flow IAT Std,Init_Win_bytes_forward,Label
0,49188.0,1.0,500000.0,4.0,500000.0,4.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,329.0,BENIGN
1,49188.0,1.0,2000000.0,1.0,2000000.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,329.0,BENIGN
2,49188.0,1.0,2000000.0,1.0,2000000.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,329.0,BENIGN
3,49188.0,1.0,2000000.0,1.0,2000000.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,329.0,BENIGN
4,49486.0,1.0,666666.7,3.0,666666.7,3.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,245.0,BENIGN


In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    dataset_with_synthetic_data.drop(' Label', axis=1), 
    dataset_with_synthetic_data[' Label'],
    test_size=0.2,
    random_state=42)

In [20]:
# random forest object
rf_model = RandomForestClassifier()

In [21]:
selected_dataset.columns

Index([' Destination Port', ' ACK Flag Count', ' Flow Packets/s',
       'Fwd IAT Total', ' Flow Packets/s', ' Fwd IAT Mean',
       ' Packet Length Variance', ' Flow Duration', ' Flow IAT Max',
       ' Bwd Packet Length Std', ' Fwd IAT Std', ' Bwd Packets/s',
       ' Flow IAT Std', 'Init_Win_bytes_forward', ' Label'],
      dtype='object')

In [22]:
# fitting model
rf_model.fit(X_train, y_train)

In [23]:
# model evaluation
y_pred = rf_model.predict(X_test)

In [24]:
pd.DataFrame(confusion_matrix(y_test, y_pred))

Unnamed: 0,0,1
0,453841,545
1,269,453873


In [25]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      BENIGN       1.00      1.00      1.00    454386
   MALICIOUS       1.00      1.00      1.00    454142

    accuracy                           1.00    908528
   macro avg       1.00      1.00      1.00    908528
weighted avg       1.00      1.00      1.00    908528



In [26]:
rf_model.classes_

array(['BENIGN', 'MALICIOUS'], dtype=object)