In [1]:
'''
all written function are here 

'''

'\nall written function are here \n\n'

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from tensorflow.keras import Model, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Flatten, Dense, MaxPooling2D, Dropout
from tensorflow.keras.utils import to_categorical

In [32]:
# cleaning and pre processing data

dataPath = "data.csv"
data = pd.read_csv(dataPath)

# seperating features
label_column = 'Label' 
data_features = data.drop(columns=[label_column]).select_dtypes(include=['float64', 'int64'])
Labels = data[label_column]
# Add this to see excluded columns
excluded_columns = set(data.columns) - set(data_features.columns)
print("Excluded columns (including label):", excluded_columns)

#standardize
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_features)

top_10_features_list_RF = ['DIntPkt', 'Load', 'DstJitter', 'Dur', 'SrcLoad', 
                     'Rate', 'Packet_num', 'SIntPkt', 'DstLoad', 'SrcJitter']
top_10_features_rf = pd.DataFrame(data_features, columns=top_10_features_list_RF)
top_5_features_rf = pd.DataFrame(data_features, columns=top_10_features_list_RF[:5])

top_10_feaures_list_LDA =  ['pLoss','pDstLoss','Loss','SIntPktAct','pSrcLoss','SrcJitter', 'DstBytes','DstJitter',
                            'Rate','SrcBytes']
top_10_features_LDA = pd.DataFrame(data_features, columns=top_10_feaures_list_LDA)
top_5_features_LDA = pd.DataFrame(data_features, columns=top_10_feaures_list_LDA[:5])
                                   

print(top_10_features_LDA.columns)

Excluded columns (including label): {'Sport', 'Dir', 'SrcAddr', 'DstMac', 'Label', 'Flgs', 'SrcMac', 'Attack Category', 'DstAddr'}
Index(['pLoss', 'pDstLoss', 'Loss', 'SIntPktAct', 'pSrcLoss', 'SrcJitter',
       'DstBytes', 'DstJitter', 'Rate', 'SrcBytes'],
      dtype='object')


In [4]:
def print_classification_results(confusion_matrix, classification_report, accuracy):
    #prints the cm results, classification report, and accuracy
    tn, fp, fn, tp = confusion_matrix.ravel()
    total = tn + fp + fn + tp
    
    # Calculate percentages
    tn_pct = (tn/total) * 100
    fp_pct = (fp/total) * 100
    fn_pct = (fn/total) * 100
    tp_pct = (tp/total) * 100
    
    # Print confusion matrix results
    print("\nConfusion Matrix Results:")
    print(f"""
    True Negatives: {tn} ({tn_pct:.2f}%)
    False Positives: {fp} ({fp_pct:.2f}%)
    False Negatives: {fn} ({fn_pct:.2f}%)
    True Positives: {tp} ({tp_pct:.2f}%)
    """)
    
    # Print classification report
    print("\nClassification Report:")
    print(classification_report)
    
    # Print accuracy
    print(f"\nModel Accuracy: {accuracy:.2%}")


In [5]:
def find_top_n_features(data_scaled, features, n):
    pca = PCA(n_components=n)
    pca.fit(data_scaled)
    
    components = pca.components_
    explained_variance = pca.explained_variance_ratio_
    
    feature_importance = pd.DataFrame()
    for i in range(n):
        feature_importance[f'PC{i+1}'] = abs(components[i])
    
    feature_importance.index = features.columns
    feature_importance['Overall_Importance'] = feature_importance.sum(axis=1)
    feature_importance = feature_importance.sort_values('Overall_Importance', ascending=False)

    print(f"Total explained variance ratio: {sum(explained_variance):.2%}")
    print("\nExplained variance ratio by component:")
    for i, var in enumerate(explained_variance):
        print(f"PC{i+1}: {var:.2%}")
    
    print(f"\nTop {n} most important features:")
    top_n_features = feature_importance['Overall_Importance'].head(n)
    print(top_n_features)
    
    # Add semicolon to suppress output
    return feature_importance, top_n_features;

In [6]:
featureImportance, top_n = find_top_n_features(data_scaled, data_features, 10)

Total explained variance ratio: 87.71%

Explained variance ratio by component:
PC1: 24.29%
PC2: 14.41%
PC3: 12.17%
PC4: 8.41%
PC5: 8.28%
PC6: 5.97%
PC7: 3.95%
PC8: 3.71%
PC9: 3.33%
PC10: 3.18%

Top 10 most important features:
Temp          1.598481
SrcBytes      1.539131
ST            1.538129
Pulse_Rate    1.483176
Heart_rate    1.481323
SYS           1.459602
sMinPktSz     1.453595
SIntPkt       1.451182
SrcLoad       1.441597
DIA           1.384256
Name: Overall_Importance, dtype: float64


In [42]:
def perform_kmeans_clustering(data_scaled, features, labels, n_components=2):
    """
    Perform K-means clustering on PCA components using selected features
    
    Args:
        data_scaled: scaled numpy array from StandardScaler
        features: DataFrame with only the features to analyze
        labels: target labels
        n_components: number of PCA components to use
    """
    print("\nperformin k means on this many features: ", len(features.columns))
    
    
    # Convert data_scaled to DataFrame with feature names
    scaled_df = pd.DataFrame(data_scaled, columns=features.columns)
    
    # Select only the specified features
    selected_features = scaled_df[features.columns]
    
    # PCA on selected features
    pca = PCA(n_components=n_components)
    pc_features = pca.fit_transform(selected_features)
    
    pca_df = pd.DataFrame(
        data=pc_features,
        columns=[f'PC{i+1}' for i in range(n_components)]
    )
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        pca_df, labels, test_size=0.3, random_state=42
    )
    
    # K-means clustering
    kmeans = KMeans(n_clusters=2, random_state=42)
    kmeans.fit(X_train)
    y_pred = kmeans.predict(X_test)
    
    # Evaluation metrics
    cm = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)    
    accuracy = accuracy_score(y_test, y_pred)    
    return cm, class_report, accuracy

In [8]:
cm, class_report, accuracy = perform_kmeans_clustering(data_scaled, data_features, data[label_column], n_components=3)

print_classification_results(cm, class_report, accuracy)


performin k means on this many features:  36

Confusion Matrix Results:

    True Negatives: 4250 (86.81%)
    False Positives: 2 (0.04%)
    False Negatives: 644 (13.15%)
    True Positives: 0 (0.00%)
    

Classification Report:
              precision    recall  f1-score   support

           0       0.87      1.00      0.93      4252
           1       0.00      0.00      0.00       644

    accuracy                           0.87      4896
   macro avg       0.43      0.50      0.46      4896
weighted avg       0.75      0.87      0.81      4896


Model Accuracy: 86.81%


In [9]:
# feature_importance, top_features = find_top_n_features(data_scaled, features, 10)


In [35]:
def train_random_forest(data_scaled, Labels, features, n_estimators):
    """
    Train Random Forest on selected features
    
    Args:
        data_scaled: scaled numpy array from StandardScaler
        Labels: target labels 
        features: DataFrame with features to analyze
        n_estimators: number of trees in forest
    Returns:
        tuple: (confusion_matrix, classification_report, accuracy)
    """
    print(f"Training Random Forest with {n_estimators} estimators")
    print(f"Number of features used: {len(features.columns)}")
    
    # Convert scaled data to DataFrame with feature names
    scaled_df = pd.DataFrame(data_scaled, columns=features.columns)
    
    # Select only specified features
    selected_features = scaled_df[features.columns]
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        selected_features, Labels, test_size=0.3, random_state=42
    )

    # Train model
    rf_classifier = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
    rf_classifier.fit(X_train, y_train)

    # Make predictions
    y_pred = rf_classifier.predict(X_test)

    # Calculate metrics
    cm = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    # Print feature importance
    feature_importance = pd.DataFrame({
        'feature': features.columns,
        'importance': rf_classifier.feature_importances_
    })
    feature_importance = feature_importance.sort_values('importance', ascending=False)

    print("\nTop 10 Most Important Features:")
    print(feature_importance.head(10))
    
    return cm, class_report, accuracy

In [25]:
forest_cm, forest_report, forest_accuracy = train_random_forest(data_scaled, Labels, data_features, 100)

print_classification_results(forest_cm, forest_report, forest_accuracy)

Training Random Forest with 100 estimators
Number of features used: 36

Top 10 Most Important Features:
       feature  importance
8      DIntPkt    0.133990
21        Load    0.096434
12   DstJitter    0.093912
17         Dur    0.086107
3      SrcLoad    0.082480
26        Rate    0.081926
27  Packet_num    0.071981
7      SIntPkt    0.067164
4      DstLoad    0.056263
11   SrcJitter    0.050777

Confusion Matrix Results:

    True Negatives: 4241 (86.62%)
    False Positives: 11 (0.22%)
    False Negatives: 313 (6.39%)
    True Positives: 331 (6.76%)
    

Classification Report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      4252
           1       0.97      0.51      0.67       644

    accuracy                           0.93      4896
   macro avg       0.95      0.76      0.82      4896
weighted avg       0.94      0.93      0.92      4896


Model Accuracy: 93.38%


In [12]:
def perform_lda(data_scaled, labels, features, top_n=10):
    """
    Perform Linear Discriminant Analysis on selected features and optionally find top n important features
    
    Args:
        data_scaled: scaled numpy array from StandardScaler
        labels: target labels
        features: DataFrame with features to analyze
        top_n: number of top features to return based on LDA coefficients
    Returns:
        tuple: (confusion_matrix, classification_report, accuracy, top_features)
    """
    print(f"\nPerforming LDA on {len(features.columns)} features")
    
    # Convert scaled data to DataFrame with feature names
    scaled_df = pd.DataFrame(data_scaled, columns=features.columns)
    
    # Select only specified features
    selected_features = scaled_df[features.columns].values
    
    # Split using the features DataFrame
    X_train, X_test, y_train, y_test = train_test_split(
        selected_features, labels, test_size=0.3, random_state=42
    )

    # Initialize and fit LDA model
    lda = LDA()
    lda.fit(X_train, y_train)

    # Print LDA components information
    print("\nLDA Components Information:")
    print(f"Number of components: {lda.n_components}")
    print("\nExplained variance ratio:")
    print(lda.explained_variance_ratio_)
    
    # Print component coefficients
    print("\nLinear Discriminant Coefficients:")
    for i, component in enumerate(lda.coef_):
        print(f"\nLD{i+1} coefficients:")
        for feat, coef in zip(features.columns, component):
            print(f"{feat}: {coef:.4f}")

    # Make predictions and calculate metrics
    y_pred = lda.predict(X_test)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    
    # Find top n important features if requested
    top_features = None
    if top_n:
        # Sum absolute coefficients across all discriminant functions
        feature_importance = pd.Series(np.sum(np.abs(lda.coef_), axis=0), index=features.columns)
        top_features = feature_importance.nlargest(top_n)
        print(f"\nTop {top_n} most important features:")
        print(top_features)
    
    return conf_matrix, class_report, accuracy, top_features


In [13]:
cm, report, accuracy, topFeatures = perform_lda(data_scaled, Labels, data_features, top_n=10)

print_classification_results(cm, report, accuracy)


Performing LDA on 36 features

LDA Components Information:
Number of components: None

Explained variance ratio:
[1.]

Linear Discriminant Coefficients:

LD1 coefficients:
Dport: 0.0000
SrcBytes: 4.5746
DstBytes: -9.3746
SrcLoad: -0.4146
DstLoad: -3.5580
SrcGap: -0.0000
DstGap: 0.0000
SIntPkt: -2.3615
DIntPkt: -0.4477
SIntPktAct: 453.4666
DIntPktAct: -0.0000
SrcJitter: -153.6353
DstJitter: 7.8913
sMaxPktSz: 0.0185
dMaxPktSz: -0.9048
sMinPktSz: -0.7674
dMinPktSz: 0.0000
Dur: 2.0393
Trans: 0.0000
TotPkts: 2.6564
TotBytes: -1.0525
Load: -2.3268
Loss: -552.0637
pLoss: 1173.2676
pSrcLoss: -384.6796
pDstLoss: -600.8920
Rate: 4.7021
Packet_num: -0.0375
Temp: -0.0457
SpO2: 0.0027
Pulse_Rate: 0.2497
SYS: -0.0287
DIA: 0.1639
Heart_rate: 0.0998
Resp_Rate: 0.0286
ST: 0.2382

Top 10 most important features:
pLoss         1173.267597
pDstLoss       600.891950
Loss           552.063711
SIntPktAct     453.466610
pSrcLoss       384.679565
SrcJitter      153.635281
DstBytes         9.374628
DstJitter  

In [36]:
def gaussian_naive_bayes(data_scaled, labels, features):
    """
    Perform Gaussian Naive Bayes on selected features
    
    Args:
        data_scaled: scaled numpy array from StandardScaler
        labels: target labels
        features: DataFrame with features to analyze
    Returns:
        tuple: (confusion_matrix, classification_report, accuracy)
    """
    print(f"\nPerforming Gaussian Naive Bayes on {len(features.columns)} features")
    
    # Convert scaled data to DataFrame with feature names
    scaled_df = pd.DataFrame(data_scaled, columns=features.columns)
    
    # Select only specified features
    selected_features = scaled_df[features.columns]
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        selected_features, labels, test_size=0.3, random_state=42
    )
    
    # Initialize and fit GNB model
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    
    # Make predictions and calculate metrics
    y_pred = gnb.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    
    return cm, class_report, accuracy

In [37]:
# Usage:
cm, report, acc = gaussian_naive_bayes(data_scaled, Labels, data_features)
print_classification_results(cm, report, acc)


Performing Gaussian Naive Bayes on 36 features

Confusion Matrix Results:

    True Negatives: 3843 (78.49%)
    False Positives: 409 (8.35%)
    False Negatives: 338 (6.90%)
    True Positives: 306 (6.25%)
    

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.90      0.91      4252
           1       0.43      0.48      0.45       644

    accuracy                           0.85      4896
   macro avg       0.67      0.69      0.68      4896
weighted avg       0.85      0.85      0.85      4896


Model Accuracy: 84.74%


In [38]:
def train_SVC(data_scaled, features, labels):
    """
    Train SVC models with different kernels using selected features
    
    Args:
        data_scaled: scaled numpy array from StandardScaler
        features: DataFrame with features to analyze
        labels: target labels
    Returns:
        tuple: (confusion_matrix, classification_report, accuracy) for best kernel
    """
    print("\nPerforming SVC on this many features: ", len(features.columns))
    
    # Convert data_scaled to DataFrame with feature names
    scaled_df = pd.DataFrame(data_scaled, columns=features.columns)
    
    # Select only specified features
    selected_features = scaled_df[features.columns]
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        selected_features, labels, test_size=0.3, random_state=42
    )

    kernelTypes = ['linear', 'rbf', 'poly']
    best_accuracy = 0
    best_metrics = None
    
    print("\nTraining SVC models with different kernels...")
    for kernelType in tqdm(kernelTypes, desc='Training'):
        model = SVC(kernel=kernelType, C=1)
        model.fit(X_train, y_train)
        
        y_pred = model.predict(X_test)
        
        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        
        print(f"\nKernel: {kernelType}")
        print(f"Accuracy: {accuracy:.5f}")
        
    return cm, report, accuracy
    



In [31]:
# Usage:
cm, report, acc = train_SVC(data_scaled, data_features, Labels)
print_classification_results(cm, report, acc)


Performing SVC on this many features:  36

Training SVC models with different kernels...


Training:   0%|          | 0/3 [00:00<?, ?it/s]

Training:  33%|███▎      | 1/3 [00:06<00:13,  6.62s/it]


Kernel: linear
Accuracy: 0.92586


Training:  67%|██████▋   | 2/3 [00:08<00:03,  3.90s/it]


Kernel: rbf
Accuracy: 0.92933


Training: 100%|██████████| 3/3 [00:10<00:00,  3.35s/it]


Kernel: poly
Accuracy: 0.92851

Confusion Matrix Results:

    True Negatives: 4244 (86.68%)
    False Positives: 8 (0.16%)
    False Negatives: 342 (6.99%)
    True Positives: 302 (6.17%)
    

Classification Report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      4252
           1       0.97      0.47      0.63       644

    accuracy                           0.93      4896
   macro avg       0.95      0.73      0.80      4896
weighted avg       0.93      0.93      0.92      4896


Model Accuracy: 92.85%





In [46]:
def CNN(data_scaled, labels, features):
    """
    Perform CNN classification on selected features with adaptive architecture
    """
    print(f"\nPerforming CNN on {len(features.columns)} features")
    
    # Data preparation (keep existing code until reshape)
    scaled_df = pd.DataFrame(data_scaled, columns=features.columns)
    selected_features = scaled_df[features.columns].values
    
    le = LabelEncoder()
    y_encoded = le.fit_transform(labels)
    y_categorical = to_categorical(y_encoded)

    # Reshape data
    num_features = selected_features.shape[1]
    grid_size = int(np.ceil(np.sqrt(num_features)))
    X_padded = np.zeros((selected_features.shape[0], grid_size * grid_size))
    X_padded[:, :num_features] = selected_features
    X_reshaped = X_padded.reshape(-1, grid_size, grid_size, 1)

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X_reshaped, y_categorical, test_size=0.3, random_state=42
    )

    # Build adaptive CNN model based on input size
    model = Sequential()
    
    # First Conv2D layer with smaller kernel and no pooling for small inputs
    if grid_size < 5:
        model.add(Conv2D(32, kernel_size=(2,2), activation='relu', 
                        input_shape=(grid_size, grid_size, 1),
                        padding='same'))
    else:
        model.add(Conv2D(32, kernel_size=(3,3), activation='relu', 
                        input_shape=(grid_size, grid_size, 1),
                        padding='same'))
        model.add(MaxPooling2D(pool_size=(2,2)))
        
        # Add second conv layer only for larger inputs
        if grid_size >= 7:
            model.add(Conv2D(64, kernel_size=(2,2), activation='relu', padding='same'))
    
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))  # Reduced dense layer size
    model.add(Dropout(0.3))  # Reduced dropout
    model.add(Dense(y_categorical.shape[1], activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    
    # Train model
    history = model.fit(X_train, y_train, epochs=20, batch_size=32, 
                       validation_data=(X_test, y_test), verbose=0)

    # Predictions and metrics (keep existing code)
    y_pred_proba = model.predict(X_test, verbose=0)
    y_pred = np.argmax(y_pred_proba, axis=1)
    y_test_labels = np.argmax(y_test, axis=1)

    cm = confusion_matrix(y_test_labels, y_pred)
    class_report = classification_report(y_test_labels, y_pred)
    accuracy = accuracy_score(y_test_labels, y_pred)
    
    return cm, class_report, accuracy

In [40]:
# Run the CNN model
cnn_cm, cnn_report, cnn_accuracy = CNN(data_scaled, Labels, data_features)

# Print the results
print_classification_results(cnn_cm, cnn_report, cnn_accuracy)


Performing CNN on 36 features


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Confusion Matrix Results:

    True Negatives: 4237 (86.54%)
    False Positives: 15 (0.31%)
    False Negatives: 312 (6.37%)
    True Positives: 332 (6.78%)
    

Classification Report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      4252
           1       0.96      0.52      0.67       644

    accuracy                           0.93      4896
   macro avg       0.94      0.76      0.82      4896
weighted avg       0.93      0.93      0.92      4896


Model Accuracy: 93.32%


In [47]:


feature_sets = {
    'All Features': data_features,
    'Top 5 LDA': top_5_features_LDA,
    'Top 10 LDA': top_10_features_LDA,
    'Top 5 RF': top_5_features_rf,
    'Top 10 RF': top_10_features_rf
}

# Initialize results DataFrame
results_df = pd.DataFrame(index=feature_sets.keys(), columns=['LDA', 'Random Forest', 'K-Means Clustering', 'SVM', 'Gaussian Naive Bayes', 'CNN'])

# Run each model with each feature set
for feature_set_name, feature_set in feature_sets.items():
    # Select the features
    selected_features = data_features[feature_set.columns]
    selected_data_scaled = scaler.fit_transform(selected_features)
    
    # Split the data
    
    # LDA
    cm, report, accuracy, _ = perform_lda(selected_data_scaled, Labels, selected_features, top_n=10)
    results_df.loc[feature_set_name, 'LDA'] = accuracy
    
    # Random Forest
    cm, report, accuracy = train_random_forest(selected_data_scaled, Labels, selected_features, 100)
    results_df.loc[feature_set_name, 'Random Forest'] = accuracy
    
    # K-Means Clustering
    cm, report, accuracy = perform_kmeans_clustering(selected_data_scaled, selected_features, Labels, n_components=3)
    results_df.loc[feature_set_name, 'K-Means Clustering'] = accuracy
    
    # SVM
    cm, report, accuracy = train_SVC(selected_data_scaled, selected_features, Labels)
    results_df.loc[feature_set_name, 'SVM'] = accuracy
    
    # Gaussian Naive Bayes
    cm, report, accuracy = gaussian_naive_bayes(selected_data_scaled, Labels, selected_features)
    results_df.loc[feature_set_name, 'Gaussian Naive Bayes'] = accuracy
    
    # CNN
    cm, report, accuracy = CNN(selected_data_scaled, Labels, selected_features)
    results_df.loc[feature_set_name, 'CNN'] = accuracy

# Display the results
print(results_df)


Performing LDA on 36 features

LDA Components Information:
Number of components: None

Explained variance ratio:
[1.]

Linear Discriminant Coefficients:

LD1 coefficients:
Dport: 0.0000
SrcBytes: 4.5746
DstBytes: -9.3746
SrcLoad: -0.4146
DstLoad: -3.5580
SrcGap: -0.0000
DstGap: 0.0000
SIntPkt: -2.3615
DIntPkt: -0.4477
SIntPktAct: 453.4666
DIntPktAct: -0.0000
SrcJitter: -153.6353
DstJitter: 7.8913
sMaxPktSz: 0.0185
dMaxPktSz: -0.9048
sMinPktSz: -0.7674
dMinPktSz: 0.0000
Dur: 2.0393
Trans: 0.0000
TotPkts: 2.6564
TotBytes: -1.0525
Load: -2.3268
Loss: -552.0637
pLoss: 1173.2676
pSrcLoss: -384.6796
pDstLoss: -600.8920
Rate: 4.7021
Packet_num: -0.0375
Temp: -0.0457
SpO2: 0.0027
Pulse_Rate: 0.2497
SYS: -0.0287
DIA: 0.1639
Heart_rate: 0.0998
Resp_Rate: 0.0286
ST: 0.2382

Top 10 most important features:
pLoss         1173.267597
pDstLoss       600.891950
Loss           552.063711
SIntPktAct     453.466610
pSrcLoss       384.679565
SrcJitter      153.635281
DstBytes         9.374628
DstJitter  

Training:  33%|███▎      | 1/3 [00:06<00:13,  6.71s/it]


Kernel: linear
Accuracy: 0.92586


Training:  67%|██████▋   | 2/3 [00:08<00:03,  3.92s/it]


Kernel: rbf
Accuracy: 0.92933


Training: 100%|██████████| 3/3 [00:10<00:00,  3.34s/it]


Kernel: poly
Accuracy: 0.92851

Performing Gaussian Naive Bayes on 36 features

Performing CNN on 36 features



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Performing LDA on 5 features

LDA Components Information:
Number of components: None

Explained variance ratio:
[1.]

Linear Discriminant Coefficients:

LD1 coefficients:
pLoss: 3.4085
pDstLoss: -2.2251
Loss: -0.3130
SIntPktAct: 0.4211
pSrcLoss: -1.6207

Top 10 most important features:
pLoss         3.408538
pDstLoss      2.225052
pSrcLoss      1.620706
SIntPktAct    0.421074
Loss          0.313039
dtype: float64
Training Random Forest with 100 estimators
Number of features used: 5

Top 10 Most Important Features:
      feature  importance
1    pDstLoss    0.326323
0       pLoss    0.310721
2        Loss    0.258536
4    pSrcLoss    0.088448
3  SIntPktAct    0.015973

performin k means on this many features:  5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Performing SVC on this many features:  5

Training SVC models with different kernels...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training:  33%|███▎      | 1/3 [00:00<00:01,  1.45it/s]


Kernel: linear
Accuracy: 0.86846


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training:  67%|██████▋   | 2/3 [00:02<00:01,  1.06s/it]


Kernel: rbf
Accuracy: 0.86846


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Training: 100%|██████████| 3/3 [00:02<00:00,  1.08it/s]
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Kernel: poly
Accuracy: 0.86846

Performing Gaussian Naive Bayes on 5 features

Performing CNN on 5 features


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Performing LDA on 10 features

LDA Components Information:
Number of components: None

Explained variance ratio:
[1.]

Linear Discriminant Coefficients:

LD1 coefficients:
pLoss: 393.8038
pDstLoss: -201.8607
Loss: -185.6725
SIntPktAct: 153.4113
pSrcLoss: -129.0672
SrcJitter: -52.4839
DstBytes: -1.7181
DstJitter: 2.5665
Rate: -1.5558
SrcBytes: 1.2818

Top 10 most important features:
pLoss         393.803799
pDstLoss      201.860726
Loss          185.672451
SIntPktAct    153.411333
pSrcLoss      129.067201
SrcJitter      52.483944
DstJitter       2.566473
DstBytes        1.718067
Rate            1.555772
SrcBytes        1.281833
dtype: float64
Training Random Forest with 100 estimators
Number of features used: 10

Top 10 Most Important Features:
      feature  importance
7   DstJitter    0.440178
8        Rate    0.337131
5   SrcJitter    0.208638
6    DstBytes    0.006746
9    SrcBytes    0.006033
0       pLoss    0.000368
4    pSrcLoss    0.000365
2        Loss    0.000298
1    pDstLo

Training:  33%|███▎      | 1/3 [00:01<00:02,  1.35s/it]


Kernel: linear
Accuracy: 0.92688


Training:  67%|██████▋   | 2/3 [00:02<00:01,  1.32s/it]


Kernel: rbf
Accuracy: 0.92892


Training: 100%|██████████| 3/3 [00:03<00:00,  1.14s/it]


Kernel: poly
Accuracy: 0.92729

Performing Gaussian Naive Bayes on 10 features

Performing CNN on 10 features



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Performing LDA on 5 features

LDA Components Information:
Number of components: None

Explained variance ratio:
[1.]

Linear Discriminant Coefficients:

LD1 coefficients:
DIntPkt: -0.6907
Load: -1.0650
DstJitter: 1.8382
Dur: -0.6179
SrcLoad: -0.2895

Top 10 most important features:
DstJitter    1.838189
Load         1.064962
DIntPkt      0.690748
Dur          0.617912
SrcLoad      0.289481
dtype: float64
Training Random Forest with 100 estimators
Number of features used: 5

Top 10 Most Important Features:
     feature  importance
0    DIntPkt    0.250659
2  DstJitter    0.249182
4    SrcLoad    0.214627
3        Dur    0.151189
1       Load    0.134344

performin k means on this many features:  5

Performing SVC on this many features:  5

Training SVC models with different kernels...


Training:  33%|███▎      | 1/3 [00:02<00:04,  2.41s/it]


Kernel: linear
Accuracy: 0.92586


Training:  67%|██████▋   | 2/3 [00:04<00:01,  1.97s/it]


Kernel: rbf
Accuracy: 0.92729


Training: 100%|██████████| 3/3 [00:06<00:00,  2.11s/it]


Kernel: poly
Accuracy: 0.92872

Performing Gaussian Naive Bayes on 5 features

Performing CNN on 5 features



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Performing LDA on 10 features

LDA Components Information:
Number of components: None

Explained variance ratio:
[1.]

Linear Discriminant Coefficients:

LD1 coefficients:
DIntPkt: 0.0939
Load: -22.3502
DstJitter: 2.3100
Dur: -1.2466
SrcLoad: 0.0074
Rate: 57.9967
Packet_num: -0.1367
SIntPkt: -0.9139
DstLoad: -39.6188
SrcJitter: -0.2130

Top 10 most important features:
Rate          57.996661
DstLoad       39.618788
Load          22.350241
DstJitter      2.309951
Dur            1.246617
SIntPkt        0.913868
SrcJitter      0.213015
Packet_num     0.136689
DIntPkt        0.093929
SrcLoad        0.007363
dtype: float64
Training Random Forest with 100 estimators
Number of features used: 10


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Top 10 Most Important Features:
      feature  importance
0     DIntPkt    0.179262
2   DstJitter    0.167164
4     SrcLoad    0.122655
6  Packet_num    0.111005
9   SrcJitter    0.078561
3         Dur    0.075984
7     SIntPkt    0.068331
5        Rate    0.068188
1        Load    0.064495
8     DstLoad    0.064356

performin k means on this many features:  10

Performing SVC on this many features:  10

Training SVC models with different kernels...


Training:  33%|███▎      | 1/3 [00:01<00:03,  1.80s/it]


Kernel: linear
Accuracy: 0.92525


Training:  67%|██████▋   | 2/3 [00:03<00:01,  1.82s/it]


Kernel: rbf
Accuracy: 0.92708


Training: 100%|██████████| 3/3 [00:04<00:00,  1.65s/it]


Kernel: poly
Accuracy: 0.92708

Performing Gaussian Naive Bayes on 10 features

Performing CNN on 10 features



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


                   LDA Random Forest K-Means Clustering       SVM  \
All Features  0.924428      0.933824           0.868056  0.928513   
Top 5 LDA     0.868464      0.868464           0.868056  0.868464   
Top 10 LDA    0.925858      0.919322           0.868056  0.927288   
Top 5 RF      0.910948      0.921773           0.831291  0.928717   
Top 10 RF     0.915237        0.9279           0.868464  0.927083   

             Gaussian Naive Bayes       CNN  
All Features             0.847426  0.931781  
Top 5 LDA                0.132966  0.868464  
Top 10 LDA               0.132966  0.928922  
Top 5 RF                 0.917688  0.926879  
Top 10 RF                0.866013  0.929126  
