# CLUSTER 6 CLASSES

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import seaborn as sns

In [None]:
# Load the training table used by every later cell. Those cells read the label
# columns 'category', 'attack' and 'is_benign' from it and treat the remaining
# columns as clustering features.
df = pd.read_csv('/kaggle/input/dataset/train_sel_hclust.csv')

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

def _scatter_panel(ax, points, labels, title, xlabel=None, ylabel=None):
    """Draw a 2-D embedding on ``ax``, coloured by cluster label, with a colorbar."""
    scatter = ax.scatter(points[:, 0], points[:, 1], c=labels, cmap='tab10', alpha=0.6)
    ax.set_title(title)
    if xlabel is not None:
        ax.set_xlabel(xlabel)
    if ylabel is not None:
        ax.set_ylabel(ylabel)
    plt.colorbar(scatter, ax=ax)


def _cluster_size_bars(ax, labels, title):
    """Plot the number of samples assigned to each cluster as bars on ``ax``."""
    pd.Series(labels).value_counts().sort_index().plot(kind='bar', ax=ax, title=title)
    ax.set_xlabel('Cluster')
    ax.set_ylabel('Number of Samples')


def visualize_clustering_comparison(X, n_clusters=6, random_state=42, max_tsne_samples=10000):
    """Cluster ``X`` both raw and standardized, then plot the two results side by side.

    MiniBatchKMeans is fitted once on the raw features and once on the
    StandardScaler-transformed features. Both labelings are shown on PCA and
    t-SNE projections (2x2 figure), followed by a second figure with the
    cluster sizes.

    Args:
        X: Numeric feature table (DataFrame or 2-D array), one row per sample.
        n_clusters: Number of clusters for both MiniBatchKMeans fits.
        random_state: Seed shared by the clustering, PCA, t-SNE and the
            t-SNE subsample so that repeated runs produce the same figures.
        max_tsne_samples: Upper bound on the number of rows fed to t-SNE;
            larger inputs are randomly subsampled without replacement.

    Returns:
        Tuple ``(clusters_raw, clusters_norm)`` with the cluster label of every
        row for the raw and the standardized fit respectively.
    """
    # Work on a plain float array: the t-SNE subsample below selects rows by
    # position, and ``DataFrame[int_array]`` would be treated as a column lookup.
    X_raw = np.asarray(X, dtype=float)
    n_samples = len(X_raw)

    # 1. Prepare data
    X_scaled = StandardScaler().fit_transform(X_raw)

    # 2. Perform clustering on both datasets
    clusters_raw = MiniBatchKMeans(
        n_clusters=n_clusters, random_state=random_state).fit_predict(X_raw)
    clusters_norm = MiniBatchKMeans(
        n_clusters=n_clusters, random_state=random_state).fit_predict(X_scaled)

    # 3. PCA projections (fit_transform refits, so one estimator serves both)
    pca = PCA(n_components=2, random_state=random_state)
    X_pca_raw = pca.fit_transform(X_raw)
    X_pca_norm = pca.fit_transform(X_scaled)

    # 4. t-SNE projections on a seeded subsample; a local Generator keeps the
    #    sample reproducible without touching the global NumPy RNG state.
    sample_size = min(max_tsne_samples, n_samples)
    rng = np.random.default_rng(random_state)
    sample_idx = rng.choice(n_samples, sample_size, replace=False)

    tsne = TSNE(n_components=2, random_state=random_state)
    X_tsne_raw = tsne.fit_transform(X_raw[sample_idx])
    X_tsne_norm = tsne.fit_transform(X_scaled[sample_idx])

    # Projection figure: PCA on the top row, t-SNE on the bottom row
    fig, axes = plt.subplots(2, 2, figsize=(20, 20))
    fig.suptitle('Clustering Comparison: Raw vs Normalized Data', fontsize=16)

    _scatter_panel(axes[0, 0], X_pca_raw, clusters_raw, 'PCA - Raw Data',
                   'First Principal Component', 'Second Principal Component')
    _scatter_panel(axes[0, 1], X_pca_norm, clusters_norm, 'PCA - Normalized Data',
                   'First Principal Component', 'Second Principal Component')
    _scatter_panel(axes[1, 0], X_tsne_raw, clusters_raw[sample_idx],
                   't-SNE - Raw Data (Sample)')
    _scatter_panel(axes[1, 1], X_tsne_norm, clusters_norm[sample_idx],
                   't-SNE - Normalized Data (Sample)')

    plt.tight_layout()
    plt.show()

    # 5. Additional visualization: cluster sizes
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    _cluster_size_bars(ax1, clusters_raw, 'Cluster Sizes - Raw Data')
    _cluster_size_bars(ax2, clusters_norm, 'Cluster Sizes - Normalized Data')

    plt.tight_layout()
    plt.show()

    return clusters_raw, clusters_norm

# Usage
# Strip whichever label columns are present in df (missing ones are ignored) and
# run the raw-vs-normalized comparison on what remains, with the default cluster count.
X = df.drop(columns=['category', 'attack', 'is_benign'], errors='ignore')
clusters_raw, clusters_norm = visualize_clustering_comparison(X)

In [None]:
def perform_clustering(X, y, n_clusters, silhouette_sample_size=None):
    """Fit MiniBatchKMeans on ``X``, report validation metrics and plot the result.

    Prints internal metrics (silhouette, Calinski-Harabasz, Davies-Bouldin) and
    external metrics against ``y`` (adjusted Rand index, normalized mutual
    information). Then shows a PCA scatter of the clusters next to the true
    labels, plus a bar chart of the cluster sizes.

    Args:
        X: Feature matrix, one row per sample.
        y: Reference labels aligned with the rows of ``X``; may be numeric or
            non-numeric (e.g. strings).
        n_clusters: Number of clusters to fit.
        silhouette_sample_size: If given, the silhouette score is computed on a
            seeded random subsample of this many rows instead of all rows. The
            default ``None`` uses every row, as before.

    Returns:
        Tuple ``(clusters, mbk)``: the cluster label of every row and the
        fitted MiniBatchKMeans model.
    """
    # Initialize clustering
    mbk = MiniBatchKMeans(
        n_clusters=n_clusters,
        batch_size=10000,
        random_state=42
    )

    # Fit and get clusters
    clusters = mbk.fit_predict(X)

    # Internal validation (using X)
    silhouette = silhouette_score(
        X, clusters, sample_size=silhouette_sample_size, random_state=42)
    print("\nInternal Validation Metrics:")
    print(f"Number of clusters: {n_clusters}")
    print(f"Silhouette Score: {silhouette:.3f}")
    print(f"Calinski-Harabasz Score: {calinski_harabasz_score(X, clusters):.3f}")
    print(f"Davies-Bouldin Score: {davies_bouldin_score(X, clusters):.3f}")

    # External validation (comparing with y)
    print("\nExternal Validation Metrics (compared to true labels):")
    print(f"Adjusted Rand Index: {adjusted_rand_score(y, clusters):.3f}")
    print(f"Normalized Mutual Information: {normalized_mutual_info_score(y, clusters):.3f}")

    # scatter() needs numbers for colormapping, so non-numeric labels are
    # encoded as sorted integer codes; numeric labels are passed through as-is.
    y_values = pd.Series(np.asarray(y))
    if pd.api.types.is_numeric_dtype(y_values):
        label_colors = y_values.to_numpy()
    else:
        label_colors, _ = pd.factorize(y_values, sort=True)

    # Visualization using PCA
    pca = PCA(n_components=2, random_state=42)
    X_pca = pca.fit_transform(X)

    # Create figure with subplots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

    # Plot 1: Clusters
    scatter1 = ax1.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap='tab10', alpha=0.6)
    ax1.set_title('Clustering Results')
    ax1.set_xlabel('First Principal Component')
    ax1.set_ylabel('Second Principal Component')
    plt.colorbar(scatter1, ax=ax1, label='Cluster Labels')

    # Plot 2: True Labels
    scatter2 = ax2.scatter(X_pca[:, 0], X_pca[:, 1], c=label_colors, cmap='tab10', alpha=0.6)
    ax2.set_title('True Labels')
    ax2.set_xlabel('First Principal Component')
    ax2.set_ylabel('Second Principal Component')
    plt.colorbar(scatter2, ax=ax2, label='True Labels')

    # Lay out the scatter figure now: plt.tight_layout() below only affects the
    # bar-chart figure, which becomes the current figure once it is created.
    fig.tight_layout()

    # Add a third plot for cluster sizes
    plt.figure(figsize=(10, 6))
    cluster_sizes = pd.Series(clusters).value_counts().sort_index()
    cluster_sizes.plot(kind='bar')
    plt.title('Cluster Sizes')
    plt.xlabel('Cluster')
    plt.ylabel('Number of Samples')

    plt.tight_layout()
    plt.show()

    return clusters, mbk

# Preprocess data
def prepare_data(df):
    """Return the standardized feature matrix of ``df``.

    The label columns 'category', 'attack' and 'is_benign' are removed and the
    remaining columns are transformed with a freshly fitted StandardScaler.
    """
    features = df.drop(columns=['category', 'attack', 'is_benign'])
    return StandardScaler().fit_transform(features)

# Main clustering process
def multi_level_clustering(df):
    """Cluster ``df`` at three label granularities and attach the assignments.

    The scaled features are clustered with 2, 6 and 19 clusters and compared
    against the 'is_benign', 'category' and 'attack' columns respectively.

    Returns:
        A copy of ``df`` with the added columns 'binary_cluster',
        'main_category_cluster' and 'attack_type_cluster'.
    """
    features = prepare_data(df)

    # (heading to print, reference label column, cluster count, output column)
    levels = (
        ("\nClustering for Binary Classification:", 'is_benign', 2, 'binary_cluster'),
        ("\nClustering for Main Categories:", 'category', 6, 'main_category_cluster'),
        ("\nClustering for Attack Types:", 'attack', 19, 'attack_type_cluster'),
    )

    assignments = {}
    for heading, label_col, k, out_col in levels:
        print(heading)
        labels, _model = perform_clustering(features, df[label_col], n_clusters=k)
        assignments[out_col] = labels

    # Add results to a copy so the caller's dataframe is left untouched
    results_df = df.copy()
    for out_col, labels in assignments.items():
        results_df[out_col] = labels

    return results_df

# Usage
# Run the binary, main-category and attack-type clusterings on the loaded
# dataframe `df`; the result is a copy of `df` with one cluster column per level.
clustered_df = multi_level_clustering(df)

# Analyze results
def analyze_cluster_distribution(df):
    """Print a cluster-vs-label contingency table for each clustering level.

    ``df`` must contain the cluster columns 'binary_cluster',
    'main_category_cluster' and 'attack_type_cluster' together with the label
    columns 'is_benign', 'category' and 'attack'.
    """
    # (heading to print, cluster column -> table rows, label column -> table columns)
    reports = (
        ("\nBinary Clustering Distribution:", 'binary_cluster', 'is_benign'),
        ("\nMain Categories Clustering Distribution:", 'main_category_cluster', 'category'),
        ("\nAttack Types Clustering Distribution:", 'attack_type_cluster', 'attack'),
    )
    for heading, cluster_col, label_col in reports:
        print(heading)
        print(pd.crosstab(df[cluster_col], df[label_col]))

# Print the cluster-vs-label crosstabs for the clustered dataframe
analyze_cluster_distribution(clustered_df)