## Imports 

In [1]:
import pandas as pd
import os
from tqdm import tqdm
from sklearn.calibration import LabelEncoder
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pickle

from Constants import dict_2classes, dict_7classes, DATASET_DIRECTORY, X_columns, y_columns

In [2]:
5/len(X_columns)*100

10.869565217391305

## Import Dataset

In [3]:
df_sets = [k for k in os.listdir(DATASET_DIRECTORY) if k.endswith('.csv')]
df_sets.sort()
training_set = df_sets[0]

In [4]:
data = pd.read_csv(DATASET_DIRECTORY + training_set)

## Pre-process

In [5]:
new_y = [dict_7classes[k] for k in data[y_columns[0]]]
data[y_columns[1]] = new_y
new_y = [dict_2classes[k] for k in data[y_columns[0]]]
data[y_columns[2]] = new_y


### Label Encoding

In [6]:
le : list[LabelEncoder] = []

for column in y_columns:
    le1 = LabelEncoder()
    data[column] = le1.fit_transform(data[column])
    le.append(le1)

### Scale

In [7]:
scaler = MinMaxScaler()

In [8]:
data[X_columns] = scaler.fit_transform(data[X_columns])

## Train Test Split

In [9]:
x_train, x_test, y_train, y_test = train_test_split(data[X_columns], data[y_columns], test_size=0.2, random_state=42)

## Feature Selection

In [10]:
feature_sets = [[], [], []]

### RFE

In [13]:
for column in [0]:
    for i in tqdm(range(3,40)):
        estimator = RandomForestClassifier()
        selector = RFE(estimator, n_features_to_select=i, step=3)
        selector = selector.fit(x_train, y_train[y_columns[column]])
        rfe_mask = selector.get_support() #list of booleans for selected features
        selected_features = [] 
        for bool, feature in zip(rfe_mask, x_train.columns):
            if bool:
                selected_features.append(feature)
        
        feature_sets[column].append(('RFE', selected_features))
        # selected_features

100%|██████████| 37/37 [7:00:59<00:00, 682.70s/it]   


## Processing

In [14]:
feature_sets

[[('RFE', ['Min', 'IAT', 'Magnitue']),
  ('RFE', ['syn_count', 'Min', 'IAT', 'Magnitue']),
  ('RFE', ['syn_count', 'AVG', 'Tot size', 'IAT', 'Magnitue']),
  ('RFE',
   ['fin_flag_number', 'syn_count', 'Min', 'Tot size', 'IAT', 'Magnitue']),
  ('RFE',
   ['fin_flag_number',
    'psh_flag_number',
    'Min',
    'AVG',
    'Tot size',
    'IAT',
    'Magnitue']),
  ('RFE',
   ['Protocol Type',
    'fin_flag_number',
    'psh_flag_number',
    'Min',
    'AVG',
    'Tot size',
    'IAT',
    'Magnitue']),
  ('RFE',
   ['Protocol Type',
    'fin_flag_number',
    'psh_flag_number',
    'syn_count',
    'Min',
    'AVG',
    'Tot size',
    'IAT',
    'Magnitue']),
  ('RFE',
   ['Protocol Type',
    'psh_flag_number',
    'ack_count',
    'syn_count',
    'Tot sum',
    'Min',
    'AVG',
    'Tot size',
    'IAT',
    'Magnitue']),
  ('RFE',
   ['Protocol Type',
    'fin_flag_number',
    'syn_flag_number',
    'psh_flag_number',
    'ack_count',
    'syn_count',
    'Min',
    'AVG',
    '

In [15]:
feature_set_dict = [[], [], []]

for index, sets in enumerate(feature_sets):
    for set_name, set in sets:
        feature_set_dict[index].append({
            'Method' : set_name,
            'features' : set,
        })


In [16]:
feature_sets = {
    'multi34' : feature_set_dict[0],
    'multi8' : feature_set_dict[1],
    'binary' : feature_set_dict[2],
}

In [17]:
feature_sets

{'multi34': [{'Method': 'RFE', 'features': ['Min', 'IAT', 'Magnitue']},
  {'Method': 'RFE', 'features': ['syn_count', 'Min', 'IAT', 'Magnitue']},
  {'Method': 'RFE',
   'features': ['syn_count', 'AVG', 'Tot size', 'IAT', 'Magnitue']},
  {'Method': 'RFE',
   'features': ['fin_flag_number',
    'syn_count',
    'Min',
    'Tot size',
    'IAT',
    'Magnitue']},
  {'Method': 'RFE',
   'features': ['fin_flag_number',
    'psh_flag_number',
    'Min',
    'AVG',
    'Tot size',
    'IAT',
    'Magnitue']},
  {'Method': 'RFE',
   'features': ['Protocol Type',
    'fin_flag_number',
    'psh_flag_number',
    'Min',
    'AVG',
    'Tot size',
    'IAT',
    'Magnitue']},
  {'Method': 'RFE',
   'features': ['Protocol Type',
    'fin_flag_number',
    'psh_flag_number',
    'syn_count',
    'Min',
    'AVG',
    'Tot size',
    'IAT',
    'Magnitue']},
  {'Method': 'RFE',
   'features': ['Protocol Type',
    'psh_flag_number',
    'ack_count',
    'syn_count',
    'Tot sum',
    'Min',
    'AV

## Read Previous Data 

In [26]:
feature_set_path = 'store/features.pkl'

In [27]:
with open(feature_set_path, 'rb') as file:
    prev_feature_sets : dict = pickle.load(file)

prev_feature_sets

{'multi34': [{'Method': 'Pearson 0.3',
   'features': ['Protocol Type', 'UDP', 'ICMP', 'Min']},
  {'Method': 'Pearson 0.1',
   'features': ['Header_Length',
    'Protocol Type',
    'fin_flag_number',
    'rst_flag_number',
    'psh_flag_number',
    'ack_flag_number',
    'ack_count',
    'fin_count',
    'TCP',
    'UDP',
    'ICMP',
    'Tot sum',
    'Min',
    'AVG',
    'Std',
    'Tot size',
    'Magnitue',
    'Radius']},
  {'Method': 'RFE 4 DT',
   'features': ['fin_flag_number', 'Min', 'IAT', 'Magnitue']},
  {'Method': 'RFE 4', 'features': ['Min', 'Tot size', 'IAT', 'Magnitue']}],
 'multi8': [{'Method': 'Pearson 0.3',
   'features': ['Protocol Type', 'Min', 'Magnitue']},
  {'Method': 'Pearson 0.1',
   'features': ['Header_Length',
    'Protocol Type',
    'fin_flag_number',
    'rst_flag_number',
    'psh_flag_number',
    'ack_flag_number',
    'ack_count',
    'TCP',
    'UDP',
    'ICMP',
    'Tot sum',
    'Min',
    'AVG',
    'Tot size',
    'Magnitue']},
  {'Method': '

In [30]:
for key, value in prev_feature_sets.items():
    prev_feature_sets[key] = value + feature_sets[key]

In [31]:
prev_feature_sets

{'multi34': [{'Method': 'Pearson 0.3',
   'features': ['Protocol Type', 'UDP', 'ICMP', 'Min']},
  {'Method': 'Pearson 0.1',
   'features': ['Header_Length',
    'Protocol Type',
    'fin_flag_number',
    'rst_flag_number',
    'psh_flag_number',
    'ack_flag_number',
    'ack_count',
    'fin_count',
    'TCP',
    'UDP',
    'ICMP',
    'Tot sum',
    'Min',
    'AVG',
    'Std',
    'Tot size',
    'Magnitue',
    'Radius']},
  {'Method': 'RFE 4 DT',
   'features': ['fin_flag_number', 'Min', 'IAT', 'Magnitue']},
  {'Method': 'RFE 4', 'features': ['Min', 'Tot size', 'IAT', 'Magnitue']},
  {'Method': 'RFE 5',
   'features': ['syn_count', 'Min', 'AVG', 'IAT', 'Magnitue']}],
 'multi8': [{'Method': 'Pearson 0.3',
   'features': ['Protocol Type', 'Min', 'Magnitue']},
  {'Method': 'Pearson 0.1',
   'features': ['Header_Length',
    'Protocol Type',
    'fin_flag_number',
    'rst_flag_number',
    'psh_flag_number',
    'ack_flag_number',
    'ack_count',
    'TCP',
    'UDP',
    'ICMP',

## Save File

In [18]:
with open('store/rfe_features3-39.pkl', 'wb') as file:
    pickle.dump(feature_sets, file)