In [None]:
'''
All written functions are defined here.

'''

In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from tensorflow.keras import Model, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Flatten, Dense, MaxPooling2D, Dropout
from tensorflow.keras.utils import to_categorical

In [22]:
# Cleaning and pre-processing: load the CSV and derive the feature matrices
# that every model cell below relies on.

dataPath = "data.csv"
data = pd.read_csv(dataPath)

# Separate the target column from the predictors; only float64/int64 columns
# are kept as features, everything else is dropped by the dtype filter.
label_column = 'Label' 
data_features = data.drop(columns=[label_column]).select_dtypes(include=['float64', 'int64'])
Labels = data[label_column]
# Show which columns were left out so silent column loss is visible
excluded_columns = set(data.columns) - set(data_features.columns)
print("Excluded columns (including label):", excluded_columns)

# Standardize the numeric features (zero mean, unit variance per column)
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_features)

top_10_features_list = ['DIntPkt', 'Load', 'DstJitter', 'Dur', 'SrcLoad', 
                     'Rate', 'Packet_num', 'SIntPkt', 'DstLoad', 'SrcJitter']

# Select the subsets by explicit column indexing. The previous
# pd.DataFrame(data_features, columns=...) form reindexes, which silently
# creates all-NaN columns for any name that is misspelled or was filtered out
# as non-numeric; indexing raises KeyError immediately instead.
top_10_features = data_features[top_10_features_list]

top_5_features = data_features[top_10_features_list[:5]]


print(top_10_features.columns)

Excluded columns (including label): {'Flgs', 'SrcMac', 'Dir', 'SrcAddr', 'Sport', 'DstAddr', 'Attack Category', 'Label', 'DstMac'}
Index(['DIntPkt', 'Load', 'DstJitter', 'Dur', 'SrcLoad', 'Rate', 'Packet_num',
       'SIntPkt', 'DstLoad', 'SrcJitter'],
      dtype='object')


In [23]:
def print_classification_results(confusion_matrix, classification_report, accuracy):
    """
    Print a confusion-matrix summary, the classification report and the accuracy.

    Args:
        confusion_matrix: array-like of counts. A 2x2 matrix is unpacked in the
            order tn, fp, fn, tp (row-major) and reported with percentages of
            the total; any other shape is printed as-is.
        classification_report: already formatted report; printed unchanged
        accuracy: fraction (e.g. 0.93); printed as a percentage
    """
    # NOTE: the parameter names shadow the sklearn functions of the same name
    # inside this function. They are kept because they are part of the call
    # signature (callers may pass them by keyword).
    matrix = np.asarray(confusion_matrix)
    total = matrix.sum()

    def pct(count):
        # Guard against an empty matrix so the summary never divides by zero
        return (count / total) * 100 if total else 0.0

    print("\nConfusion Matrix Results:")
    if matrix.shape == (2, 2):
        tn, fp, fn, tp = matrix.ravel()
        # Same text layout as before: leading newline, four indented rows,
        # trailing indented blank line.
        print(
            "\n"
            f"    True Negatives: {tn} ({pct(tn):.2f}%)\n"
            f"    False Positives: {fp} ({pct(fp):.2f}%)\n"
            f"    False Negatives: {fn} ({pct(fn):.2f}%)\n"
            f"    True Positives: {tp} ({pct(tp):.2f}%)\n"
            "    "
        )
    else:
        # Single-class or multi-class results cannot be described as
        # tn/fp/fn/tp; show the raw matrix instead of raising on the unpack.
        print(matrix)

    # Print classification report
    print("\nClassification Report:")
    print(classification_report)

    # Print accuracy
    print(f"\nModel Accuracy: {accuracy:.2%}")


In [49]:
def find_top_n_features(data_scaled, features, n):
    """
    Rank features by their summed absolute loadings on the first n principal components.

    Args:
        data_scaled: scaled data the PCA is fitted on
        features: DataFrame whose columns name the columns of data_scaled
        n: number of principal components to fit; also the number of top
           features that are printed and returned
    Returns:
        tuple: (feature_importance, top_n_features) - the per-component loading
        table sorted by 'Overall_Importance' (descending), and the first n
        entries of that column
    """
    model = PCA(n_components=n)
    model.fit(data_scaled)
    variance_ratios = model.explained_variance_ratio_

    # One column per component, holding the absolute loading of every feature
    loadings = {f'PC{k+1}': abs(model.components_[k]) for k in range(n)}
    ranking = pd.DataFrame(loadings)
    ranking.index = features.columns

    # Unweighted sum across components, then order from most to least important
    ranking['Overall_Importance'] = ranking.sum(axis=1)
    ranking = ranking.sort_values('Overall_Importance', ascending=False)

    print(f"Total explained variance ratio: {sum(variance_ratios):.2%}")
    print("\nExplained variance ratio by component:")
    for position, ratio in enumerate(variance_ratios, start=1):
        print(f"PC{position}: {ratio:.2%}")

    print(f"\nTop {n} most important features:")
    best = ranking['Overall_Importance'].head(n)
    print(best)

    return ranking, best

In [50]:
# Rank every numeric feature using the first 10 principal components
featureImportance, top_n = find_top_n_features(
    data_scaled=data_scaled, features=data_features, n=10
)

Total explained variance ratio: 87.71%

Explained variance ratio by component:
PC1: 24.29%
PC2: 14.41%
PC3: 12.17%
PC4: 8.41%
PC5: 8.28%
PC6: 5.97%
PC7: 3.95%
PC8: 3.71%
PC9: 3.33%
PC10: 3.18%

Top 10 most important features:
Temp          1.598481
SrcBytes      1.539131
ST            1.538129
Pulse_Rate    1.483176
Heart_rate    1.481323
SYS           1.459602
sMinPktSz     1.453595
SIntPkt       1.451182
SrcLoad       1.441597
DIA           1.384256
Name: Overall_Importance, dtype: float64


In [None]:
def perform_kmeans_clustering(data_scaled, features, labels, n_components=2):
    """
    Perform K-means clustering on PCA components using selected features
    and score the clusters against the true labels.

    K-means assigns arbitrary cluster ids (0/1), so the ids are not class
    predictions by themselves. Each cluster is therefore mapped to the most
    frequent training label among its members before the metrics are computed;
    comparing raw cluster ids with the labels could report inverted results.

    Args:
        data_scaled: scaled numpy array from StandardScaler; its columns are
            assumed to be in the same order as the notebook-level
            data_features.columns
        features: DataFrame with only the features to analyze
        labels: target labels
        n_components: number of PCA components to use
    Returns:
        tuple: (confusion_matrix, classification_report, accuracy) on the
        10% held-out split
    """
    print("\nperformin k means on this many features: ", len(features.columns))
    
    # Convert data_scaled to DataFrame with feature names
    # (relies on the notebook-level `data_features` for the column names)
    scaled_df = pd.DataFrame(data_scaled, columns=data_features.columns)
    
    # Select only the specified features
    selected_features = scaled_df[features.columns]
    
    # PCA on selected features
    pca = PCA(n_components=n_components)
    pc_features = pca.fit_transform(selected_features)
    
    pca_df = pd.DataFrame(
        data=pc_features,
        columns=[f'PC{i+1}' for i in range(n_components)]
    )
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        pca_df, labels, test_size=0.1, random_state=42
    )
    
    # K-means clustering
    kmeans = KMeans(n_clusters=2, random_state=42)
    kmeans.fit(X_train)

    # Map every cluster id to the majority training label inside that cluster
    y_train_values = np.asarray(y_train)
    all_values, all_counts = np.unique(y_train_values, return_counts=True)
    fallback_label = all_values[np.argmax(all_counts)]
    cluster_to_label = {}
    for cluster_id in range(kmeans.n_clusters):
        members = y_train_values[kmeans.labels_ == cluster_id]
        if members.size == 0:
            # An empty cluster has no majority; fall back to the overall one
            cluster_to_label[cluster_id] = fallback_label
            continue
        values, counts = np.unique(members, return_counts=True)
        cluster_to_label[cluster_id] = values[np.argmax(counts)]

    test_clusters = kmeans.predict(X_test)
    y_pred = np.array([cluster_to_label[cluster_id] for cluster_id in test_clusters])
    
    # Evaluation metrics
    cm = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)    
    accuracy = accuracy_score(y_test, y_pred)    
    return cm, class_report, accuracy

In [None]:
# K-means on the first 3 principal components of the full numeric feature set
cm, class_report, accuracy = perform_kmeans_clustering(
    data_scaled=data_scaled,
    features=data_features,
    labels=data[label_column],
    n_components=3,
)

print_classification_results(
    confusion_matrix=cm, classification_report=class_report, accuracy=accuracy
)

In [None]:
# feature_importance, top_features = find_top_n_features(data_scaled, features, 10)


In [None]:
def train_random_forest(data_scaled, Labels, features, n_estimators):
    """
    Fit a random forest on the chosen feature columns and evaluate it on a 30% hold-out.

    Args:
        data_scaled: scaled numpy array from StandardScaler (columns ordered
            like the notebook-level data_features)
        Labels: target labels
        features: DataFrame whose columns select the features to train on
        n_estimators: number of trees in the forest
    Returns:
        tuple: (confusion_matrix, classification_report, accuracy)
    """
    feature_names = features.columns
    print(f"Training Random Forest with {n_estimators} estimators")
    print(f"Number of features used: {len(feature_names)}")

    # Attach column names to the scaled array, then keep the requested columns
    X = pd.DataFrame(data_scaled, columns=data_features.columns)[feature_names]

    X_train, X_test, y_train, y_test = train_test_split(
        X, Labels, test_size=0.3, random_state=42
    )

    forest = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)

    # Hold-out metrics
    results = (
        confusion_matrix(y_test, predictions),
        classification_report(y_test, predictions),
        accuracy_score(y_test, predictions),
    )

    # Impurity-based importances, most important first
    importance_table = pd.DataFrame(
        {'feature': feature_names, 'importance': forest.feature_importances_}
    ).sort_values('importance', ascending=False)

    print("\nTop 5 Most Important Features:")
    print(importance_table.head(5))

    return results

In [None]:
# Random forest with 100 trees on the full numeric feature set
forest_cm, forest_report, forest_accuracy = train_random_forest(
    data_scaled=data_scaled, Labels=Labels, features=data_features, n_estimators=100
)

print_classification_results(
    confusion_matrix=forest_cm,
    classification_report=forest_report,
    accuracy=forest_accuracy,
)

In [None]:
def perform_lda(data_scaled, labels, features):
    """
    Perform Linear Discriminant Analysis on selected features
    
    Args:
        data_scaled: scaled numpy array from StandardScaler (columns ordered
            like the notebook-level data_features)
        labels: target labels
        features: DataFrame with features to analyze
    Returns:
        tuple: (confusion_matrix, classification_report, accuracy)
    """
    print(f"\nPerforming LDA on {len(features.columns)} features")
    
    # Convert scaled data to DataFrame with feature names
    scaled_df = pd.DataFrame(data_scaled, columns=data_features.columns)
    # Select only specified features
    selected_features = scaled_df[features.columns].values
    
    # Hold out 30% of the rows for evaluation
    X_train, X_test, y_train, y_test = train_test_split(
        selected_features, labels, test_size=0.3, random_state=42
    )

    # Initialize and fit LDA model
    lda = LDA()
    lda.fit(X_train, y_train)

    # Print LDA components information.
    # `lda.n_components` is only the constructor argument (None by default),
    # so the fitted number of discriminants is taken from the length of
    # explained_variance_ratio_ instead.
    fitted_components = len(lda.explained_variance_ratio_)
    print("\nLDA Components Information:")
    print(f"Number of components: {fitted_components}")
    print("\nExplained variance ratio:")
    print(lda.explained_variance_ratio_)
    
    # Print the coefficients, one row of lda.coef_ at a time
    print("\nLinear Discriminant Coefficients:")
    for i, component in enumerate(lda.coef_):
        print(f"\nLD{i+1} coefficients:")
        for feat, coef in zip(features.columns, component):
            print(f"{feat}: {coef:.4f}")

    # Make predictions and calculate metrics
    y_pred = lda.predict(X_test)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    
    return conf_matrix, class_report, accuracy

In [None]:
# LDA restricted to the hand-picked top-10 feature subset
cm, report, accuracy = perform_lda(
    data_scaled=data_scaled, labels=Labels, features=top_10_features
)

print_classification_results(
    confusion_matrix=cm, classification_report=report, accuracy=accuracy
)

In [None]:
def gaussian_naive_bayes(data_scaled, labels, features):
    """
    Fit Gaussian Naive Bayes on the chosen feature columns and evaluate it on a 30% hold-out.

    Args:
        data_scaled: scaled numpy array from StandardScaler (columns ordered
            like the notebook-level data_features)
        labels: target labels
        features: DataFrame whose columns select the features to use
    Returns:
        tuple: (confusion_matrix, classification_report, accuracy)
    """
    wanted = features.columns
    print(f"\nPerforming Gaussian Naive Bayes on {len(wanted)} features")

    # Name the scaled columns, then keep only the requested ones
    X = pd.DataFrame(data_scaled, columns=data_features.columns)[wanted]

    X_train, X_test, y_train, y_test = train_test_split(
        X, labels, test_size=0.3, random_state=42
    )

    # fit() returns the estimator, so training and prediction can be chained
    predictions = GaussianNB().fit(X_train, y_train).predict(X_test)

    return (
        confusion_matrix(y_test, predictions),
        classification_report(y_test, predictions),
        accuracy_score(y_test, predictions),
    )

In [None]:
# Gaussian Naive Bayes on the hand-picked top-10 feature subset
cm, report, acc = gaussian_naive_bayes(
    data_scaled=data_scaled, labels=Labels, features=top_10_features
)
print_classification_results(
    confusion_matrix=cm, classification_report=report, accuracy=acc
)

In [None]:
def train_SVC(data_scaled, features, labels):
    """
    Train SVC models with different kernels using selected features
    and return the metrics of the most accurate one.

    Args:
        data_scaled: scaled numpy array from StandardScaler (columns ordered
            like the notebook-level data_features)
        features: DataFrame with features to analyze
        labels: target labels
    Returns:
        tuple: (confusion_matrix, classification_report, accuracy) for best kernel
    """
    print("\nPerforming SVC on this many features: ", len(features.columns))
    
    # Convert data_scaled to DataFrame with feature names
    scaled_df = pd.DataFrame(data_scaled, columns=data_features.columns)
    
    # Select only specified features
    selected_features = scaled_df[features.columns]
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        selected_features, labels, test_size=0.3, random_state=42
    )

    kernelTypes = ['linear', 'rbf', 'poly']
    best_accuracy = 0
    best_metrics = None
    best_kernel = None
    
    print("\nTraining SVC models with different kernels...")
    for kernelType in tqdm(kernelTypes, desc='Training'):
        model = SVC(kernel=kernelType, C=1)
        model.fit(X_train, y_train)
        
        y_pred = model.predict(X_test)
        
        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        
        print(f"\nKernel: {kernelType}")
        print(f"Accuracy: {accuracy:.2f}")

        # Keep the metrics of the best kernel seen so far. Previously the
        # tracking variables were never updated and the last kernel's metrics
        # were returned regardless of accuracy. Ties keep the earlier kernel.
        if best_metrics is None or accuracy > best_accuracy:
            best_accuracy = accuracy
            best_metrics = (cm, report, accuracy)
            best_kernel = kernelType

    print(f"\nBest kernel: {best_kernel}")
    return best_metrics
    



In [None]:
# SVC kernel comparison on the full numeric feature set
cm, report, acc = train_SVC(
    data_scaled=data_scaled, features=data_features, labels=Labels
)
print_classification_results(
    confusion_matrix=cm, classification_report=report, accuracy=acc
)

In [None]:
def CNN(data_scaled, labels, features):
    """
    Perform CNN classification on selected features

    Each sample's feature vector is zero-padded and reshaped into a square
    single-channel "image" so it can be fed to 2D convolutions.
    
    Args:
        data_scaled: scaled numpy array from StandardScaler (columns ordered
            like the notebook-level data_features)
        labels: target labels
        features: DataFrame with features to analyze
    Returns:
        tuple: (confusion_matrix, classification_report, accuracy); the class
        ids in these metrics are the LabelEncoder-encoded labels
    """
    print(f"\nPerforming CNN on {len(features.columns)} features")
    
    # Convert scaled data to DataFrame with feature names
    scaled_df = pd.DataFrame(data_scaled, columns=data_features.columns)
    
    # Select only specified features
    selected_features = scaled_df[features.columns].values
    
    # Encode labels as integers, then one-hot for the softmax output
    le = LabelEncoder()
    y_encoded = le.fit_transform(labels)
    y_categorical = to_categorical(y_encoded)

    # Reshape data for CNN.
    # The layer stack below uses default ('valid') padding:
    #   Conv 3x3 -> size-2, MaxPool 2x2 -> floor(/2), Conv 2x2 -> size-1.
    # It therefore needs at least a 6x6 input (6 -> 4 -> 2 -> 1); smaller
    # grids end with a non-positive spatial size and Keras raises while
    # building the model. Enforcing the minimum lets small feature subsets
    # run; inputs with 26+ features already produce a grid >= 6 and are
    # unaffected.
    min_grid_size = 6
    num_features = selected_features.shape[1]
    grid_size = max(min_grid_size, int(np.ceil(np.sqrt(num_features))))
    X_padded = np.zeros((selected_features.shape[0], grid_size * grid_size))
    X_padded[:, :num_features] = selected_features
    X_reshaped = X_padded.reshape(-1, grid_size, grid_size, 1)

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X_reshaped, y_categorical, test_size=0.3, random_state=42
    )

    # Build CNN model
    model = Sequential()
    model.add(Conv2D(32, kernel_size=(3,3), activation='relu', input_shape=(grid_size, grid_size, 1)))
    model.add(MaxPooling2D(pool_size=(2,2)))
    model.add(Conv2D(64, kernel_size=(2, 2), activation='relu'))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(y_categorical.shape[1], activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    
    # Train model. The test split is passed as validation data for per-epoch
    # monitoring only; the returned training history is not used.
    model.fit(X_train, y_train, epochs=20, batch_size=32, 
              validation_data=(X_test, y_test), verbose=0)

    # Get predictions: most probable class per sample
    y_pred_proba = model.predict(X_test, verbose=0)
    y_pred = np.argmax(y_pred_proba, axis=1)
    y_test_labels = np.argmax(y_test, axis=1)

    # Calculate metrics
    cm = confusion_matrix(y_test_labels, y_pred)
    class_report = classification_report(y_test_labels, y_pred)
    accuracy = accuracy_score(y_test_labels, y_pred)
    
    return cm, class_report, accuracy

In [None]:
# Train and evaluate the CNN on the full numeric feature set
cnn_cm, cnn_report, cnn_accuracy = CNN(
    data_scaled=data_scaled, labels=Labels, features=data_features
)

# Report the hold-out metrics
print_classification_results(
    confusion_matrix=cnn_cm,
    classification_report=cnn_report,
    accuracy=cnn_accuracy,
)