# *Classification*

## Imports 

In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import warnings
from sklearn.calibration import LabelEncoder
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
warnings.filterwarnings('ignore')

## Import Dataset

In [2]:
DATASET_DIRECTORY = '../CICIoT2023/'

In [3]:
df_sets = [k for k in os.listdir(DATASET_DIRECTORY) if k.endswith('.csv')]
df_sets.sort()
training_set = df_sets[0]

In [4]:
data = pd.read_csv(DATASET_DIRECTORY + training_set)

## Pre-process

In [5]:
X_columns = [
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
       'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
       'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
       'ece_flag_number', 'cwr_flag_number', 'ack_count',
       'syn_count', 'fin_count', 'urg_count', 'rst_count', 
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP',
       'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min',
       'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue',
       'Radius', 'Covariance', 'Variance', 'Weight', 
]
y_column = 'label'

### Label Encoding

In [6]:
le = LabelEncoder()
data[y_column] = le.fit_transform(data[y_column])

### Scale

In [7]:
scaler = MinMaxScaler()

In [8]:
data[X_columns] = scaler.fit_transform(data[X_columns])

## Feature Selection

### Pearson Coef

In [9]:
all_cols = X_columns.copy()
all_cols.append(y_column)

In [10]:
correlation_matrix = data[all_cols].corr()

In [11]:
corr_with_label = abs(correlation_matrix[y_column])

In [12]:
selected_features = [lable for lable, value in corr_with_label.items() if value > 0.3]
selected_features.remove('label')
selected_features

['Protocol Type', 'UDP', 'ICMP', 'Min']

### RFE

In [19]:
x_train, x_test, y_train, y_test = train_test_split(data[X_columns], data[y_column], test_size=0.2, random_state=42)

In [21]:
estimator = RandomForestClassifier(random_state = 42)
selector = RFE(estimator, n_features_to_select=7, step=3)
selector = selector.fit(x_train, y_train)

In [22]:
rfe_mask = selector.get_support() #list of booleans for selected features
selected_features = [] 
for bool, feature in zip(rfe_mask, x_train.columns):
 if bool:
    selected_features.append(feature)
selected_features

['fin_flag_number',
 'psh_flag_number',
 'syn_count',
 'Min',
 'Tot size',
 'IAT',
 'Magnitue']

## Train Test Split

In [23]:
X_train, X_test, y_train, y_test = train_test_split(data[X_columns], data[y_column], test_size=0.2, random_state=42)

In [30]:
len(selected_features)

7

In [36]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [37]:
y_pred = model.predict(X_test)

In [40]:
print('accuracy_score: ', accuracy_score(y_pred, y_test))
print('recall_score: ', recall_score(y_pred, y_test, average='macro'))
print('precision_score: ', precision_score(y_pred, y_test, average='macro'))
print('f1_score: ', f1_score(y_pred, y_test, average='macro'))

accuracy_score:  0.9905525996061838
recall_score:  0.7429936565257733
precision_score:  0.702364330441092
f1_score:  0.7120174341254635
