In [1]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import chi2, mutual_info_classif, RFE, SelectFromModel, SelectKBest, VarianceThreshold
from sklearn.linear_model import LogisticRegression
import warnings

In [2]:
pd.set_option('display.max_columns', None)
pd.options.mode.use_inf_as_na = True
warnings.filterwarnings('ignore')

In [3]:
data = pd.read_csv('/SSD/p76111262/all_filtered_data.csv')

In [4]:
columns_to_delete = ['Flow ID', 'Src IP', 'Src Port', 'Dst IP']
data.drop(columns_to_delete, axis=1, inplace=True)

# 数据预处理

In [5]:
numerical_fea = list(data.select_dtypes(exclude=['object']).columns)
category_fea = list(filter(lambda x: x not in numerical_fea, list(data.columns)))

In [6]:
def get_numerical_serial_fea(data, feas):
    numerical_serial_fea = []
    numerical_noserial_fea = []
    for fea in feas:
        temp = data[fea].nunique()
        if temp <= 10:
            numerical_noserial_fea.append(fea)
        else:
            numerical_serial_fea.append(fea)
    return numerical_serial_fea, numerical_noserial_fea
numerical_serial_fea, numerical_noserial_fea = get_numerical_serial_fea(data, numerical_fea)

In [7]:
data.isnull().sum().to_dict()

{'Dst Port': 0,
 'Protocol': 0,
 'Timestamp': 0,
 'Flow Duration': 0,
 'Tot Fwd Pkts': 0,
 'Tot Bwd Pkts': 0,
 'TotLen Fwd Pkts': 0,
 'TotLen Bwd Pkts': 0,
 'Fwd Pkt Len Max': 0,
 'Fwd Pkt Len Min': 0,
 'Fwd Pkt Len Mean': 0,
 'Fwd Pkt Len Std': 0,
 'Bwd Pkt Len Max': 0,
 'Bwd Pkt Len Min': 0,
 'Bwd Pkt Len Mean': 0,
 'Bwd Pkt Len Std': 0,
 'Flow Byts/s': 19,
 'Flow Pkts/s': 19,
 'Flow IAT Mean': 0,
 'Flow IAT Std': 0,
 'Flow IAT Max': 0,
 'Flow IAT Min': 0,
 'Fwd IAT Tot': 0,
 'Fwd IAT Mean': 0,
 'Fwd IAT Std': 0,
 'Fwd IAT Max': 0,
 'Fwd IAT Min': 0,
 'Bwd IAT Tot': 0,
 'Bwd IAT Mean': 0,
 'Bwd IAT Std': 0,
 'Bwd IAT Max': 0,
 'Bwd IAT Min': 0,
 'Fwd PSH Flags': 0,
 'Bwd PSH Flags': 0,
 'Fwd URG Flags': 0,
 'Bwd URG Flags': 0,
 'Fwd Header Len': 0,
 'Bwd Header Len': 0,
 'Fwd Pkts/s': 0,
 'Bwd Pkts/s': 0,
 'Pkt Len Min': 0,
 'Pkt Len Max': 0,
 'Pkt Len Mean': 0,
 'Pkt Len Std': 0,
 'Pkt Len Var': 0,
 'FIN Flag Cnt': 0,
 'SYN Flag Cnt': 0,
 'RST Flag Cnt': 0,
 'PSH Flag Cnt': 0,
 'ACK

In [8]:
data[numerical_serial_fea] = data[numerical_serial_fea].fillna(data[numerical_serial_fea].mean())

In [9]:
data.isnull().sum().to_dict()

{'Dst Port': 0,
 'Protocol': 0,
 'Timestamp': 0,
 'Flow Duration': 0,
 'Tot Fwd Pkts': 0,
 'Tot Bwd Pkts': 0,
 'TotLen Fwd Pkts': 0,
 'TotLen Bwd Pkts': 0,
 'Fwd Pkt Len Max': 0,
 'Fwd Pkt Len Min': 0,
 'Fwd Pkt Len Mean': 0,
 'Fwd Pkt Len Std': 0,
 'Bwd Pkt Len Max': 0,
 'Bwd Pkt Len Min': 0,
 'Bwd Pkt Len Mean': 0,
 'Bwd Pkt Len Std': 0,
 'Flow Byts/s': 0,
 'Flow Pkts/s': 0,
 'Flow IAT Mean': 0,
 'Flow IAT Std': 0,
 'Flow IAT Max': 0,
 'Flow IAT Min': 0,
 'Fwd IAT Tot': 0,
 'Fwd IAT Mean': 0,
 'Fwd IAT Std': 0,
 'Fwd IAT Max': 0,
 'Fwd IAT Min': 0,
 'Bwd IAT Tot': 0,
 'Bwd IAT Mean': 0,
 'Bwd IAT Std': 0,
 'Bwd IAT Max': 0,
 'Bwd IAT Min': 0,
 'Fwd PSH Flags': 0,
 'Bwd PSH Flags': 0,
 'Fwd URG Flags': 0,
 'Bwd URG Flags': 0,
 'Fwd Header Len': 0,
 'Bwd Header Len': 0,
 'Fwd Pkts/s': 0,
 'Bwd Pkts/s': 0,
 'Pkt Len Min': 0,
 'Pkt Len Max': 0,
 'Pkt Len Mean': 0,
 'Pkt Len Std': 0,
 'Pkt Len Var': 0,
 'FIN Flag Cnt': 0,
 'SYN Flag Cnt': 0,
 'RST Flag Cnt': 0,
 'PSH Flag Cnt': 0,
 'ACK F

In [10]:
data['Timestamp'] = pd.to_datetime(data['Timestamp'],format='%d/%m/%Y %H:%M:%S')

In [11]:
from sklearn.preprocessing import LabelEncoder

# 初始化LabelEncoder
label_encoder = LabelEncoder()

# 将标签编码为整数
data['Label'] = label_encoder.fit_transform(data['Label'])

# 异常值处理

In [12]:
def find_outliers_by_3segama(data, fea):
    data_std = np.std(data[fea])
    data_mean = np.mean(data[fea])
    outliers_cut_off = 3 * data_std
    lower_rule = data_mean - outliers_cut_off
    upper_rule = data_mean + outliers_cut_off
    data[fea+'_outliers'] = data[fea].apply(lambda x: int(1) if x > upper_rule or x < lower_rule else int(0))
    return data

In [13]:
for fea in numerical_serial_fea:
    data = find_outliers_by_3segama(data, fea)
    print(data[fea+'_outliers'].value_counts())
    print(data.groupby(fea+'_outliers')['Label'].sum())
    print('*'*10)

0    636463
1      3537
Name: Dst Port_outliers, dtype: int64
Dst Port_outliers
0    4040640
1       8272
Name: Label, dtype: int64
**********
0    629958
1     10042
Name: Flow Duration_outliers, dtype: int64
Flow Duration_outliers
0    3991551
1      57361
Name: Label, dtype: int64
**********
0    639631
1       369
Name: Tot Fwd Pkts_outliers, dtype: int64
Tot Fwd Pkts_outliers
0    4047402
1       1510
Name: Label, dtype: int64
**********
0    636285
1      3715
Name: Tot Bwd Pkts_outliers, dtype: int64
Tot Bwd Pkts_outliers
0    4002746
1      46166
Name: Label, dtype: int64
**********
0    639630
1       370
Name: TotLen Fwd Pkts_outliers, dtype: int64
TotLen Fwd Pkts_outliers
0    4047391
1       1521
Name: Label, dtype: int64
**********
0    639803
1       197
Name: TotLen Bwd Pkts_outliers, dtype: int64
TotLen Bwd Pkts_outliers
0    4048252
1        660
Name: Label, dtype: int64
**********
0    626852
1     13148
Name: Fwd Pkt Len Max_outliers, dtype: int64
Fwd Pkt Len Max_out

# 特征选择

In [14]:
selector = VarianceThreshold(threshold=3)
selector = selector.fit(data[numerical_serial_fea])
features_mask = selector.get_support(indices=True)
selected_features = np.array(numerical_serial_fea)[features_mask]
print('Selected:', selected_features)
print('Deleted: ', [fea for fea in numerical_serial_fea if fea not in selected_features])

Selected: ['Dst Port' 'Flow Duration' 'Tot Fwd Pkts' 'Tot Bwd Pkts'
 'TotLen Fwd Pkts' 'TotLen Bwd Pkts' 'Fwd Pkt Len Max' 'Fwd Pkt Len Min'
 'Fwd Pkt Len Mean' 'Fwd Pkt Len Std' 'Bwd Pkt Len Max' 'Bwd Pkt Len Min'
 'Bwd Pkt Len Mean' 'Bwd Pkt Len Std' 'Flow Byts/s' 'Flow Pkts/s'
 'Flow IAT Mean' 'Flow IAT Std' 'Flow IAT Max' 'Flow IAT Min'
 'Fwd IAT Tot' 'Fwd IAT Mean' 'Fwd IAT Std' 'Fwd IAT Max' 'Fwd IAT Min'
 'Bwd IAT Tot' 'Bwd IAT Mean' 'Bwd IAT Std' 'Bwd IAT Max' 'Bwd IAT Min'
 'Fwd Header Len' 'Bwd Header Len' 'Fwd Pkts/s' 'Bwd Pkts/s' 'Pkt Len Min'
 'Pkt Len Max' 'Pkt Len Mean' 'Pkt Len Std' 'Pkt Len Var' 'Pkt Size Avg'
 'Fwd Seg Size Avg' 'Bwd Seg Size Avg' 'Subflow Fwd Pkts'
 'Subflow Fwd Byts' 'Subflow Bwd Pkts' 'Subflow Bwd Byts'
 'Init Fwd Win Byts' 'Init Bwd Win Byts' 'Fwd Act Data Pkts' 'Active Mean'
 'Active Std' 'Active Max' 'Active Min' 'Idle Mean' 'Idle Std' 'Idle Max'
 'Idle Min']
Deleted:  ['Down/Up Ratio']


In [15]:
pearsonr_result = []
for fea in numerical_serial_fea:
    pearsonr_result.append((fea, pearsonr(data[fea], data['Label'])))
sorted(pearsonr_result, key=lambda x: abs(x[1][0]), reverse=True)[:10]

[('Init Fwd Win Byts', (-0.6583273135868122, 0.0)),
 ('Fwd Pkts/s', (0.25230978911459334, 0.0)),
 ('Flow Pkts/s', (0.25228862335713786, 0.0)),
 ('Bwd Pkts/s', (0.25091529186485223, 0.0)),
 ('Pkt Len Var', (-0.2332686044709389, 0.0)),
 ('Pkt Size Avg', (-0.22406734202303166, 0.0)),
 ('Pkt Len Std', (-0.22405923451113813, 0.0)),
 ('Bwd Pkt Len Std', (-0.2223618515373867, 0.0)),
 ('Bwd Pkt Len Mean', (-0.21988225186136623, 0.0)),
 ('Bwd Seg Size Avg', (-0.21988225186136623, 0.0))]

In [16]:
# find out the features can not be applied by chi2
for fea in [fea for fea in numerical_serial_fea if fea not in ['Init Fwd Win Byts', 'Init Bwd Win Byts']]:
    print(fea)
    print(chi2(np.array(data[fea]).reshape(-1, 1), np.array(data['Label']).reshape(-1, 1)))

Dst Port
(array([2.70721692e+09]), array([0.]))
Flow Duration
(array([1.38375675e+13]), array([0.]))
Tot Fwd Pkts
(array([6.64963453e+10]), array([0.]))
Tot Bwd Pkts
(array([2384510.98170183]), array([0.]))
TotLen Fwd Pkts
(array([2.13320013e+12]), array([0.]))
TotLen Bwd Pkts
(array([1.1444004e+10]), array([0.]))
Fwd Pkt Len Max
(array([44979368.88079263]), array([0.]))
Fwd Pkt Len Min
(array([73241373.86480519]), array([0.]))
Fwd Pkt Len Mean
(array([11079204.52930162]), array([0.]))
Fwd Pkt Len Std
(array([16107228.00942219]), array([0.]))
Bwd Pkt Len Max
(array([93907824.32482651]), array([0.]))
Bwd Pkt Len Min
(array([11987019.23828791]), array([0.]))
Bwd Pkt Len Mean
(array([29950309.47130034]), array([0.]))
Bwd Pkt Len Std
(array([46063889.9574808]), array([0.]))
Flow Byts/s
(array([3.28014824e+10]), array([0.]))
Flow Pkts/s
(array([5.40274167e+11]), array([0.]))
Flow IAT Mean
(array([5.67402945e+12]), array([0.]))
Flow IAT Std
(array([3.94717593e+12]), array([0.]))
Flow IAT Max

In [17]:
chi2_test_fea = [fea for fea in numerical_serial_fea if fea not in ['Init Fwd Win Byts', 'Init Bwd Win Byts']]
selector = SelectKBest(chi2, k=5)
selector = selector.fit(data[chi2_test_fea], data['Label'])
features_mask = selector.get_support(indices=True)
selected_features = np.array(chi2_test_fea)[features_mask]
print('Selected:', selected_features)

Selected: ['Flow Duration' 'Fwd IAT Tot' 'Bwd IAT Tot' 'Bwd IAT Mean' 'Bwd IAT Min']


In [18]:
selector = SelectKBest(mutual_info_classif, k=5)
selector = selector.fit(data[numerical_serial_fea], data['Label'])
features_mask = selector.get_support(indices=True)
selected_features = np.array(numerical_serial_fea)[features_mask]
print('Selected:', selected_features)

Selected: ['Flow Duration' 'Flow Pkts/s' 'Flow IAT Mean' 'Fwd Header Len'
 'Init Fwd Win Byts']


In [19]:
features = [fea for fea in data.columns if fea not in ['Timestamp', 'Label']]
selector = RFE(DecisionTreeClassifier(), n_features_to_select=5, step=1)
selector = selector.fit(data[features], data['Label'])
features_mask = selector.get_support(indices=True)
selected_features = np.array(features)[features_mask]
print('Selected:', selected_features)

Selected: ['Flow Byts/s' 'Bwd Pkts/s' 'Init Fwd Win Byts' 'Init Bwd Win Byts'
 'Fwd Seg Size Min']


In [20]:
selector = SelectFromModel(LogisticRegression(penalty='l2', C=10))
selector = selector.fit(data[features], data['Label'])
features_mask = selector.get_support(indices=True)
selected_features = np.array(features)[features_mask]
print('Selected:', selected_features)

Selected: ['Flow Duration' 'Flow Byts/s' 'Flow Pkts/s' 'Flow IAT Mean'
 'Flow IAT Std' 'Flow IAT Max' 'Flow IAT Min' 'Fwd IAT Tot' 'Fwd IAT Mean'
 'Fwd IAT Std' 'Fwd IAT Max' 'Fwd IAT Min' 'Bwd IAT Tot' 'Bwd IAT Mean'
 'Bwd IAT Std' 'Bwd IAT Max' 'Bwd IAT Min' 'Fwd Pkts/s' 'Bwd Pkts/s'
 'Pkt Len Var' 'Init Fwd Win Byts' 'Idle Mean' 'Idle Std' 'Idle Max'
 'Idle Min']


In [21]:
selector = SelectFromModel(DecisionTreeClassifier())
selector = selector.fit(data[features], data['Label'])
features_mask = selector.get_support(indices=True)
selected_features = np.array(features)[features_mask]
print('Selected:', selected_features)

Selected: ['Dst Port' 'Flow Byts/s' 'Fwd IAT Min' 'Bwd Pkts/s' 'Init Fwd Win Byts'
 'Init Bwd Win Byts' 'Fwd Seg Size Min']
