In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

### Reading and scaling the data

Three datasets are tested: the clean dataset with all 79 columns, and two final datasets, each containing a different selection of the 15 most important features.

In [43]:
# Read each parquet export and drop its label column in a single step.
# Note the differing column names: the clean export uses ' Label' (with a
# leading space), the two final exports use 'Label'.
data_c = pd.read_parquet('./Local/2017_Clean/Combined.parquet').drop(columns=[' Label'])
data_f1 = pd.read_parquet('./Local/2017_Final/Combined_1.parquet').drop(columns=['Label'])
data_f2 = pd.read_parquet('./Local/2017_Final/Combined_2.parquet').drop(columns=['Label'])

### Reducing the dimensions using PCA

In [47]:
# Display name -> frame; the analysis loops iterate over this mapping and use
# the key as the printed section title.
datasets = {
    'Clean data': data_c,
    'Final data 1': data_f1,
    'Final data 2': data_f2,
}

In [48]:
# Fit a 2-component PCA on every dataset and report how much of the variance
# those two components capture, individually and combined.
for name, data in datasets.items():
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(data)
    variance_ratio = pca.explained_variance_ratio_

    print(f'--- {name} ---')
    print(f'Explained variation per principal component (PC): {variance_ratio}')
    print(f'Cumulative variance explained by 2 principal components: {np.sum(variance_ratio):.2%}\n')

--- Clean data ---
Explained variation per principal component (PC): [0.62860641 0.11010856]
Cumulative variance explained by 2 principal components: 73.87%

--- Final data 1 ---
Explained variation per principal component (PC): [0.79193891 0.20800771]
Cumulative variance explained by 2 principal components: 99.99%

--- Final data 2 ---
Explained variation per principal component (PC): [0.79197042 0.20801583]
Cumulative variance explained by 2 principal components: 100.00%



### Feature importance

In [49]:
# For every dataset, list the features whose absolute loading on each of the
# two principal components exceeds 0.3.
for name, data in datasets.items():
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(data)
    # One row per component, one column per feature; the sign of each loading
    # is discarded so only its magnitude is compared against the threshold.
    data_pca = pd.DataFrame(np.abs(pca.components_), index=['PC_1', 'PC_2'], columns=data.columns)

    print(f'--- {name} ---')
    print('Most important features:\n')
    for position, heading in enumerate(('PC 1:\n', 'PC 2:\n')):
        loadings = data_pca.iloc[position]
        print(heading, loadings[loadings > 0.3])
    print('\n')

--- Clean data ---
Most important features:

PC 1:
  Flow Duration    0.435160
 Flow IAT Max     0.313612
Fwd IAT Total     0.434277
 Fwd IAT Max      0.314557
Idle Mean         0.301654
 Idle Max         0.312721
Name: PC_1, dtype: float64
PC 2:
  Flow Duration    0.309949
Fwd IAT Total     0.306129
Bwd IAT Total     0.624901
Name: PC_2, dtype: float64


--- Final data 1 ---
Most important features:

PC 1:
 Total Length of Bwd Packets    0.707105
Subflow Bwd Bytes              0.707095
Name: PC_1, dtype: float64
PC 2:
 Packet Length Variance    0.99999
Name: PC_2, dtype: float64


--- Final data 2 ---
Most important features:

PC 1:
 Total Length of Bwd Packets    0.707105
Subflow Bwd Bytes              0.707095
Name: PC_1, dtype: float64
PC 2:
 Packet Length Variance    0.999991
Name: PC_2, dtype: float64




### Hyperparameter tuning using the silhouette method

In [52]:
# 2 for Benign/Attack recognition
# 15 for Attack classification (14 Attack types + Benign)
parameters = [2]
parameter_grid = ParameterGrid({'n_clusters': parameters})

# Dataset to cluster (data_c, data_f1 or data_f2). It is bound once and used
# for BOTH fitting and scoring: the labels returned by KMeans are only
# meaningful for the rows they were fitted on, so scoring any other frame
# (e.g. a loop variable left over from an earlier cell) gives a wrong score.
cluster_data = data_f1

# Fixed seed so the KMeans initialisation and the silhouette subsample are
# repeatable across runs.
SEED = 42

# The silhouette coefficient needs pairwise distances between the scored rows,
# so its cost grows quadratically with the row count. Scoring a random
# subsample keeps the cell runnable on large frames. Set to None to score
# every row.
# NOTE: if a cluster is so small that the subsample contains a single label,
# silhouette_score raises ValueError; increase the sample size in that case.
SILHOUETTE_SAMPLE_SIZE = 10_000

best_score = -1
best_grid = None
silhouette_scores = []

for p in parameter_grid:
    kmeans_model = KMeans(random_state=SEED, **p).fit(cluster_data)

    current_score = silhouette_score(
        cluster_data,
        kmeans_model.labels_,
        sample_size=SILHOUETTE_SAMPLE_SIZE,
        random_state=SEED,
    )
    silhouette_scores.append(current_score)
    print('Parameter:', p, 'Score', current_score)

    # Keep the parameter set with the highest silhouette score seen so far.
    if current_score > best_score:
        best_score = current_score
        best_grid = p

KeyboardInterrupt: 

In [None]:
# Bar chart of the silhouette score obtained for each candidate cluster count;
# bars are placed at 0..n-1 and labelled with the corresponding n_clusters value.
fig, ax = plt.subplots()
positions = range(len(silhouette_scores))
ax.bar(positions, silhouette_scores, align='center', color='#722f59', width=0.5)
ax.set_xticks(positions)
ax.set_xticklabels(parameters)
ax.set_title('Silhouette Score', fontweight='bold')
ax.set_xlabel('Number of Clusters')
plt.show()