# CLUSTER 6 CLASSES

This notebook analyzes network traffic data using clustering techniques to identify patterns in network attacks. We'll explore two different approaches:
1. Standard K-means clustering with 6 clusters
2. Modified approach using DBSCAN with merged DoS/DDoS categories

In [13]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.manifold import TSNE
from sklearn.utils import resample
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import matplotlib.pyplot as plt
import time
from rich import print
from tabulate import tabulate
import joblib

In [16]:
df = pd.read_csv('/kaggle/input/dataset/train_sel_hclust.csv')
#df = pd.read_csv('../dataset/train_sel_hclust.csv')

In [None]:
def save_model(model, scaler, label_encoder, base_filename):
    """
    Save model and associated transformers.
    
    Parameters:
    - model: trained clustering model
    - scaler: fitted scaler
    - label_encoder: fitted label encoder
    - base_filename: base name for the saved files
    """
    joblib.dump(model, f'{base_filename}_model.joblib')
    joblib.dump(scaler, f'{base_filename}_scaler.joblib')
    joblib.dump(label_encoder, f'{base_filename}_encoder.joblib')

def load_model(base_filename):
    """
    Load saved model and associated transformers.
    
    Parameters:
    - base_filename: base name for the saved files
    
    Returns:
    - model, scaler, label_encoder
    """
    model = joblib.load(f'{base_filename}_model.joblib')
    scaler = joblib.load(f'{base_filename}_scaler.joblib')
    label_encoder = joblib.load(f'{base_filename}_encoder.joblib')
    return model, scaler, label_encoder

## 2. Data Preprocessing Functions

In [19]:
def balance_dataset(df, method='median_multiplier', multiplier=2, column='category'):
    """
    Balance the dataset by downsampling majority classes.
    
    Parameters:
    - df: pandas DataFrame with 'category' or 'attack' column
    - method: str, approach for calculating threshold
    - multiplier: float, multiplier for threshold calculation
    
    Returns:
    - balanced DataFrame
    """
    counts = df[column].value_counts()

    # Calculate threshold
    if method == 'median_multiplier':
        threshold = np.median(counts) * multiplier
    elif method == 'mean_multiplier':
        threshold = np.mean(counts) * multiplier
    elif method == 'quantile':
        threshold = counts.quantile(0.75)
    else:
        raise ValueError("Method must be 'median_multiplier', 'mean_multiplier', or 'quantile'")
    
    # Balance categories
    balanced_dfs = []
    for value in counts.index:
        value_df = df[df[column] == value]
        if len(value_df) > threshold:
            value_df = resample(value_df, replace=False, n_samples=int(threshold), random_state=42)
        balanced_dfs.append(value_df)
    
    balanced_df = pd.concat(balanced_dfs)
    comparison_data = [
        [value, counts[value], balanced_df[column].value_counts().get(value, 0)]
        for value in sorted(counts.index)
    ]

    comparison_data.sort(key=lambda x: x[1], reverse=True)

    print(tabulate(
        comparison_data,
        headers=[column, 'Original', 'After Balance'],
        tablefmt='psql'
    ))
    
    return balanced_df.sample(frac=0.1, random_state=42)

def prepare_features(df):
    """Prepare features for clustering."""
    return df.drop(['category', 'attack', 'is_benign'], axis=1)

## 3. Clustering Functions

In [None]:
def calculate_clustering_metrics(X_scaled, clusters):
    """Calculate clustering quality metrics."""
    if len(set(clusters)) > 1:
        metrics = {
            'Silhouette Score': silhouette_score(X_scaled, clusters),
            'Calinski-Harabasz Score': calinski_harabasz_score(X_scaled, clusters),
            'Davies-Bouldin Score': davies_bouldin_score(X_scaled, clusters)
        }
        print("\nClustering Quality Metrics:")
        print(tabulate(
            metrics.items(),
            headers=['Metric', 'Score'],
            tablefmt='psql'
        ))
    
def visualize_clusters(X_scaled, clusters, model=None, title_suffix=""):
    """Visualize clusters using t-SNE."""
    tsne = TSNE(n_components=2, random_state=42)
    X_tsne = tsne.fit_transform(X_scaled)
    
    plt.figure(figsize=(12, 8))
    scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1],
                         c=clusters,
                         cmap='viridis',
                         alpha=0.6)
    
    if hasattr(model, 'cluster_centers_'):
        centers_tsne = tsne.fit_transform(model.cluster_centers_)
        plt.scatter(centers_tsne[:, 0], centers_tsne[:, 1],
                   c='red', marker='x', s=200, linewidths=3,
                   label='Cluster Centers')
        plt.legend()
    
    plt.xlabel('t-SNE dimension 1')
    plt.ylabel('t-SNE dimension 2')
    plt.title(f't-SNE Cluster Visualization {title_suffix}')
    plt.colorbar(scatter, label='Cluster')
    plt.show()

In [None]:
def save_model(model, scaler, base_filename):
    """Save model and associated transformers."""
    joblib.dump(model, f'{base_filename}_model.joblib')
    joblib.dump(scaler, f'{base_filename}_scaler.joblib')

def load_model(base_filename):
    """Load saved model and associated transformers."""
    model = joblib.load(f'{base_filename}_model.joblib')
    scaler = joblib.load(f'{base_filename}_scaler.joblib')
    return model, scaler

In [None]:
def run_clustering_analysis(df, model_type='kmeans', label_column=None, **kwargs):
    """
    Run clustering analysis with specified model type and parameters.
    
    Parameters:
    - df: pandas DataFrame containing the data
    - model_type: str, one of ['kmeans', 'dbscan', 'attack_based']
    - label_column: str, column name to use for labels (required for attack_based)
    - **kwargs: additional parameters for specific clustering methods
        For kmeans: n_clusters (default=6)
        For dbscan: eps (default=0.3), min_samples (default=10)
    
    Returns:
    - tuple of (model, scaler, clusters, label_encoder if applicable)
    """
    print(f"\n=== Running {model_type.upper()} Analysis ===")
    
    # Extract parameters from kwargs with defaults
    to_merge = kwargs.get('to_merge', False)
    scaler_type = kwargs.get('scaler_type', 'standard')
    
    if to_merge:
        df['category'] = df['category'].replace({
            'DDoS': 'DOS_DDOS',
            'DoS': 'DOS_DDOS'
        })
    
    df = balance_dataset(df)
    scaler = StandardScaler() if scaler_type == 'standard' else MinMaxScaler()

    if model_type == 'kmeans':
        n_clusters = df[label_column].nunique()
        X = prepare_features(df)
        X_scaled = scaler.fit_transform(X)
        
        start_time = time.time()
        model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        clusters = model.fit_predict(X_scaled)
        
        calculate_clustering_metrics(X_scaled, clusters)
        visualize_clusters(X_scaled, clusters, model, "K-Means")
        print(f"\nClustering time: {time.time() - start_time:.2f} seconds")
        
        return model, scaler, clusters
    
    elif model_type == 'dbscan':
        X = prepare_features(df)
        X_scaled = scaler.fit_transform(X)
        
        eps = kwargs.get('eps', 0.3)
        min_samples = kwargs.get('min_samples', 10)
        
        start_time = time.time()
        model = DBSCAN(eps=eps, min_samples=min_samples)
        clusters = model.fit_predict(X_scaled)
        
        calculate_clustering_metrics(X_scaled, clusters)
        visualize_clusters(X_scaled, clusters, title_suffix="DBSCAN")
        print(f"\nClustering time: {time.time() - start_time:.2f} seconds")
        
        return model, scaler, clusters
    
    else:
        raise ValueError("model_type must be one of ['kmeans', 'dbscan']")

## 4. Analysis: K-means Clustering

### Why K-means?

We start with K-means clustering because:
1. It's a simple and effective algorithm for our initial analysis
2. We have a reasonable assumption about the number of clusters (based on attack categories)
3. It works well with standardized numerical features

In [None]:
model, scaler, clusters = run_clustering_analysis(df, model_type='kmeans', label_column='category')
save_model(model, scaler, 'kmeans_model_category')

## 5. Modified Approach: DBSCAN with Merged Categories

### Why modify the approach?

Based on the K-means results, we observed:
1. Significant overlap between DDoS and DoS categories
2. Some clusters might not be spherical (a limitation of K-means)
3. StandardScaler might not be optimal for our feature distributions

Therefore, we implement these changes:
1. Merge DDoS and DoS into a single category
2. Use DBSCAN for non-spherical clusters
3. Switch to MinMaxScaler for better feature scaling

In [None]:
model, scaler, clusters = run_clustering_analysis(df, model_type='dbscan', label_column='category', to_merge=True)
save_model(model, scaler, 'dbscan_model_category')

Given the results, it is plausible to assert that without any external assistance, I have successfully identified 17 categories. It is not unreasonable to hypothesize that I could potentially identify all 19 categories in the context of network attacks.

In [None]:
model, scaler, clusters = run_clustering_analysis(df, model_type='kmeans', label_column='attack')
save_model(model, scaler, 'kmeans_model_attack')

model, scaler, clusters = run_clustering_analysis(df, model_type='dbscan', label_column='attack')
save_model(model, scaler, 'dbscan_model_attack')