# CLUSTER 6 CLASSES

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans  # Changed from MiniBatchKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.utils import resample
import matplotlib.pyplot as plt
import time
from rich import print

In [None]:
# Load the training table from the Kaggle input directory. The functions in the
# following cells read a 'category' column and drop 'category', 'attack' and
# 'is_benign' before scaling, so all three columns must be present in this file.
df = pd.read_csv('/kaggle/input/dataset/train_sel_hclust.csv')

In [None]:
def balanced_clustering(df, method='median_multiplier', multiplier=1,
                        n_clusters=6, random_state=42, n_init=10):
    """Down-sample over-represented categories, then cluster with KMeans.

    Categories whose row count exceeds a threshold are randomly down-sampled
    (without replacement) to that threshold; smaller categories are kept
    unchanged. The label columns 'category', 'attack' and 'is_benign' are
    dropped, the remaining columns are standardised, and KMeans is fitted.

    Args:
        df: DataFrame containing a 'category' column plus the 'attack' and
            'is_benign' columns; every other column is used as a feature.
        method: How the per-category cap is derived from the category counts:
            'median_multiplier' (median * multiplier), 'mean_multiplier'
            (mean * multiplier) or 'quantile' (75th percentile; `multiplier`
            is not used).
        multiplier: Scale factor for the median/mean based caps.
        n_clusters: Number of KMeans clusters.
        random_state: Seed used for both the down-sampling and KMeans.
        n_init: Number of KMeans initialisations.

    Returns:
        Tuple `(balanced_df, kmeans)`: the balanced frame with an added
        'cluster' column and the fitted KMeans estimator.

    Raises:
        ValueError: If `method` is unknown, if no categories are present, or
            if the resulting cap is below one sample per category.
    """
    # Count once and reuse for both the report and the threshold.
    category_counts = df['category'].value_counts()

    # Print original data size
    print(f"Original dataset size: {len(df)}")
    print("Original category distribution:")
    print(category_counts)

    # NOTE: the reported time covers balancing and scaling, not only KMeans.
    start_time = time.time()

    if category_counts.empty:
        raise ValueError("No categories found in df['category']")

    if method == 'median_multiplier':
        threshold = np.median(category_counts) * multiplier
    elif method == 'mean_multiplier':
        threshold = np.mean(category_counts) * multiplier
    elif method == 'quantile':
        threshold = category_counts.quantile(0.75)
    else:
        raise ValueError("Method must be 'median_multiplier', 'mean_multiplier', or 'quantile'")

    # The cap is truncated to an integer sample count. A cap of zero would
    # silently empty every category, so fail early with a clear message.
    max_per_category = int(threshold)
    if max_per_category < 1:
        raise ValueError(
            f"Threshold {threshold} leaves no samples per category; "
            "use a larger multiplier"
        )

    balanced_dfs = []
    for cat in category_counts.index:
        cat_df = df[df['category'] == cat]
        if len(cat_df) > threshold:
            cat_df = resample(cat_df, replace=False,
                              n_samples=max_per_category,
                              random_state=random_state)
        balanced_dfs.append(cat_df)

    # pd.concat returns a new frame, so adding 'cluster' below does not
    # modify the caller's DataFrame.
    balanced_df = pd.concat(balanced_dfs)

    # Print balanced data size
    print(f"\nBalanced dataset size: {len(balanced_df)}")
    print("Balanced category distribution:")
    print(balanced_df['category'].value_counts())

    X = balanced_df.drop(['category', 'attack', 'is_benign'], axis=1)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Using regular KMeans instead of MiniBatchKMeans
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=n_init)
    clusters = kmeans.fit_predict(X_scaled)
    balanced_df['cluster'] = clusters

    end_time = time.time()
    print(f"\nClustering time: {end_time - start_time:.2f} seconds")

    return balanced_df, kmeans

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.utils import resample
import time

def evaluate_clustering(df, model='median_multiplier', multiplier=1,
                        n_clusters=6, random_state=42, n_init=20,
                        silhouette_sample_size=None):
    """Balance the categories, cluster with KMeans and print quality metrics.

    Categories whose row count exceeds a threshold are randomly down-sampled
    (without replacement) to that threshold. The label columns 'category',
    'attack' and 'is_benign' are dropped, the remaining columns are
    standardised, KMeans is fitted, and the silhouette, Calinski-Harabasz and
    Davies-Bouldin scores are printed together with the cluster sizes and a
    cluster-by-category cross-tabulation.

    Args:
        df: DataFrame containing a 'category' column plus the 'attack' and
            'is_benign' columns; every other column is used as a feature.
        model: Name of the balancing rule (not an estimator):
            'median_multiplier' (median * multiplier), 'mean_multiplier'
            (mean * multiplier) or 'quantile' (75th percentile; `multiplier`
            is not used).
        multiplier: Scale factor for the median/mean based caps.
        n_clusters: Number of KMeans clusters.
        random_state: Seed used for the down-sampling, KMeans and the
            optional silhouette sub-sampling.
        n_init: Number of KMeans initialisations.
        silhouette_sample_size: If given, the silhouette score is computed on
            a random subset of this many samples. The score needs pairwise
            distances, so this keeps large datasets tractable. `None`
            (default) uses every sample.

    Returns:
        Tuple `(balanced_df, kmeans)`: the balanced frame with an added
        'cluster' column and the fitted KMeans estimator.

    Raises:
        ValueError: If `model` is unknown, if no categories are present, or
            if the resulting cap is below one sample per category.
    """
    # Count once and reuse for both the report and the threshold.
    category_counts = df['category'].value_counts()

    print(f"Original dataset size: {len(df)}")
    print("Original category distribution:")
    print(category_counts)

    # NOTE: the reported time covers balancing, scaling and metric
    # computation, not only the KMeans fit.
    start_time = time.time()

    if category_counts.empty:
        raise ValueError("No categories found in df['category']")

    if model == 'median_multiplier':
        threshold = np.median(category_counts) * multiplier
    elif model == 'mean_multiplier':
        threshold = np.mean(category_counts) * multiplier
    elif model == 'quantile':
        threshold = category_counts.quantile(0.75)
    else:
        # Without this branch an unknown name left `threshold` unbound and
        # surfaced later as an UnboundLocalError.
        raise ValueError("Model must be 'median_multiplier', 'mean_multiplier', or 'quantile'")

    # The cap is truncated to an integer sample count. A cap of zero would
    # silently empty every category, so fail early with a clear message.
    max_per_category = int(threshold)
    if max_per_category < 1:
        raise ValueError(
            f"Threshold {threshold} leaves no samples per category; "
            "use a larger multiplier"
        )

    balanced_dfs = []
    for cat in category_counts.index:
        cat_df = df[df['category'] == cat]
        if len(cat_df) > threshold:
            cat_df = resample(cat_df, replace=False,
                              n_samples=max_per_category,
                              random_state=random_state)
        balanced_dfs.append(cat_df)

    # pd.concat returns a new frame, so adding 'cluster' below does not
    # modify the caller's DataFrame.
    balanced_df = pd.concat(balanced_dfs)

    print(f"\nBalanced dataset size: {len(balanced_df)}")
    print("Balanced category distribution:")
    print(balanced_df['category'].value_counts())

    X = balanced_df.drop(['category', 'attack', 'is_benign'], axis=1)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=n_init)
    clusters = kmeans.fit_predict(X_scaled)
    balanced_df['cluster'] = clusters

    # Calculate clustering quality metrics
    silhouette = silhouette_score(
        X_scaled, clusters,
        sample_size=silhouette_sample_size,
        random_state=random_state,
    )
    calinski = calinski_harabasz_score(X_scaled, clusters)
    davies = davies_bouldin_score(X_scaled, clusters)

    print("\nClustering Quality Metrics:")
    print(f"Silhouette Score: {silhouette:.3f} (ranges from -1 to 1, higher is better)")
    print(f"Calinski-Harabasz Score: {calinski:.3f} (higher is better)")
    print(f"Davies-Bouldin Score: {davies:.3f} (lower is better)")

    # Analyze cluster sizes
    print("\nCluster Size Distribution:")
    print(pd.Series(clusters).value_counts())

    # Analyze category distribution within clusters
    print("\nCategory Distribution within Clusters:")
    print(pd.crosstab(balanced_df['cluster'], balanced_df['category']))

    end_time = time.time()
    print(f"\nClustering time: {end_time - start_time:.2f} seconds")

    return balanced_df, kmeans

# Run evaluation
# Caps each category at twice the median category size before clustering.
# The returned frame (with its 'cluster' column) and the fitted KMeans are kept
# as globals for the plotting cell. Note that `model` here names the returned
# KMeans estimator, whereas the `model=` keyword selects the balancing rule.
balanced_df, model = evaluate_clustering(df, model='median_multiplier', multiplier=2)

In [None]:
def visualize_clusters_2d(balanced_df, model, features):
    """Plot the clustered samples on their first two principal components.

    The selected feature columns are standardised, reduced to two dimensions
    with PCA, and drawn as a scatter plot coloured by the 'cluster' column.
    If `model` exposes `cluster_centers_`, the centers are projected with the
    same PCA and overlaid as red crosses. The explained variance of both
    components is printed before the figure is shown.

    Args:
        balanced_df: DataFrame holding the feature columns and a 'cluster'
            column with one label per row.
        model: Fitted clustering estimator; only `cluster_centers_` is read,
            and only when the attribute exists.
        features: Column labels of `balanced_df` to use as features.

    Returns:
        None. The figure is displayed with `plt.show()`.
    """
    # Scale the features.
    # NOTE(review): the centers are projected as-is, so this assumes `model`
    # was fitted on these same rows and columns after the same
    # standardisation - otherwise the centers will not line up with the
    # points (or pca.transform will reject their shape).
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(balanced_df[features])

    # Apply PCA
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)

    # Create 2D plot
    plt.figure(figsize=(12, 8))

    # Plot points colored by cluster
    scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1],
                          c=balanced_df['cluster'], cmap='viridis',
                          alpha=0.6)

    # Plot cluster centers
    has_centers = hasattr(model, 'cluster_centers_')
    if has_centers:
        centers_pca = pca.transform(model.cluster_centers_)
        plt.scatter(centers_pca[:, 0], centers_pca[:, 1],
                    c='red', marker='x', s=200, linewidths=3,
                    label='Cluster Centers')

    # Add labels and title
    plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
    plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
    plt.title('2D Cluster Visualization using PCA')

    # Add colorbar
    plt.colorbar(scatter, label='Cluster')
    # The centers are the only labelled artist; calling legend() without them
    # would only produce a "no artists with labels" warning and an empty box.
    if has_centers:
        plt.legend()

    # Print explained variance
    total_var = sum(pca.explained_variance_ratio_)
    print(f"\nTotal explained variance by two components: {total_var:.2%}")
    print("\nIndividual explained variance ratios:")
    for i, ratio in enumerate(pca.explained_variance_ratio_):
        print(f"PC{i+1}: {ratio:.2%}")

    plt.show()


In [None]:
# Plot using every column except the labels and the cluster assignment, i.e.
# the same columns evaluate_clustering standardised and passed to KMeans, so
# the fitted cluster centers have a matching number of dimensions.
feature_columns = balanced_df.drop(['category', 'attack', 'is_benign', 'cluster'], axis=1).columns
visualize_clusters_2d(balanced_df, model, feature_columns)