# Clustering Numerical Data: A Comprehensive Guide


This notebook provides a comprehensive guide to clustering numerical data using three clustering techniques: K-Means, Hierarchical Clustering, and DBSCAN. It covers the basic steps of the analysis (data cleaning, normalization, clustering, evaluation, and visualization) as well as methods for choosing each algorithm's parameters: the elbow method and silhouette analysis for K-Means, and the k-distance plot for DBSCAN.


In [None]:

# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import NearestNeighbors

# Setting up seaborn for better visualization: apply the 'whitegrid' style
# once here so that every plot produced later in the notebook uses it
sns.set(style='whitegrid')


## Step 1: Load the Dataset

In [None]:

# Step 1: Load the dataset
def load_data(file_path):
    """
    Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_path
        Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(file_path)

# Example usage: 'your_dataset.csv' is a placeholder and must be replaced with
# the path to the CSV file you want to cluster before this cell is run
file_path = 'your_dataset.csv'  # Replace with your dataset path
df = load_data(file_path)
df.head()  # Preview the first rows of the loaded data


## Step 2: Data Cleaning

In [None]:

# Step 2: Data Cleaning
def clean_data(df):
    """
    Clean the dataset by dropping irrelevant columns and filling missing values.

    The 'Unnamed: 0' column is dropped if it is present. Missing values in
    numeric columns are then replaced with that column's median; non-numeric
    columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        The raw dataset. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy of ``df``.
    """
    # Dropping columns that are not needed (customize as necessary)
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')

    # A median only exists for numeric columns. Without numeric_only=True,
    # pandas >= 2.0 raises TypeError as soon as the frame holds a text column.
    medians = df.median(numeric_only=True)

    # fillna with a Series fills each column from the entry with the same
    # label, so columns without a median are passed through unchanged.
    return df.fillna(medians)

# Clean the data: rebind df to the cleaned frame returned by clean_data
df = clean_data(df)
df.head()  # Preview the first rows after cleaning


## Step 3: Data Normalization

In [None]:

# Step 3: Data Normalization
def normalize_data(df):
    """
    Standardize each feature to zero mean and unit variance.

    A fresh StandardScaler is fitted on ``df`` and immediately applied to it;
    the scaled values are returned and the fitted scaler is discarded.
    """
    return StandardScaler().fit_transform(df)

# Choose features for clustering: keep only the numeric columns of df
features = df.select_dtypes(include=[np.number]).columns  # Use numerical columns only
df_features = df[features]

# Normalize the data; df_scaled is the input to every clustering, evaluation
# and plotting call in the rest of the notebook
df_scaled = normalize_data(df_features)


## Step 4: Clustering Methods

### 4.1 K-Means Clustering

In [None]:

# Step 4.1: K-Means Clustering
def kmeans_clustering(data, n_clusters):
    """
    Run K-Means on ``data`` and report the assignment and its inertia.

    The random seed is fixed at 42 so repeated runs give the same result.

    Returns
    -------
    tuple
        ``(labels, inertia)``: the cluster label of every sample, and the
        fitted model's ``inertia_``.
    """
    model = KMeans(random_state=42, n_clusters=n_clusters)
    cluster_ids = model.fit_predict(data)
    inertia = model.inertia_
    return cluster_ids, inertia

# Example K-Means Clustering. n_clusters=3 is an illustrative value; the elbow
# and silhouette plots in Step 7 help choose it for your data
kmeans_labels, kmeans_inertia = kmeans_clustering(df_scaled, n_clusters=3)


### 4.2 Hierarchical Clustering

In [None]:

# Step 4.2: Hierarchical Clustering
def hierarchical_clustering(data, n_clusters):
    """
    Cluster ``data`` with agglomerative (bottom-up) hierarchical clustering.

    Ward linkage is used, and ``n_clusters`` is passed to the estimator as the
    number of clusters to produce. Returns the cluster label of every sample.
    """
    model = AgglomerativeClustering(linkage='ward', n_clusters=n_clusters)
    return model.fit_predict(data)

# Example Hierarchical Clustering, using the same illustrative n_clusters=3 as
# the K-Means example so the two results can be compared
hc_labels = hierarchical_clustering(df_scaled, n_clusters=3)


### 4.3 DBSCAN Clustering

In [None]:

# Step 4.3: DBSCAN Clustering
def dbscan_clustering(data, eps, min_samples):
    """
    Cluster ``data`` with DBSCAN.

    ``eps`` and ``min_samples`` are forwarded unchanged to scikit-learn's
    DBSCAN. Returns the label of every sample (scikit-learn marks points it
    considers noise with the label -1).
    """
    model = DBSCAN(min_samples=min_samples, eps=eps)
    return model.fit_predict(data)

# Example DBSCAN Clustering. eps=0.5 and min_samples=5 are illustrative values;
# the k-distance plot in section 7.3 helps choose eps for your data
dbscan_labels = dbscan_clustering(df_scaled, eps=0.5, min_samples=5)


## Step 5: Evaluation Metrics

In [None]:

# Step 5: Evaluation Metrics
def evaluate_clustering(data, labels):
    """
    Evaluate the clustering results using Silhouette Score and Davies-Bouldin Index.

    Samples labelled -1 (the label scikit-learn's DBSCAN gives to noise) are
    excluded before scoring: noise is not a cluster, and counting it as one
    would let "one cluster plus noise" be scored as if it were two clusters.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        The samples that were clustered.
    labels : array-like of shape (n_samples,)
        Cluster label of every sample.

    Returns
    -------
    tuple
        ``(silhouette, db_score)``. Both are ``np.nan`` when the metrics are
        undefined, i.e. when fewer than two clusters remain or when every
        remaining sample forms its own cluster.
    """
    labels = np.asarray(labels)
    clustered = labels != -1  # True for samples that belong to a real cluster
    n_points = int(clustered.sum())
    n_clusters = np.unique(labels[clustered]).size

    # Both metrics are only defined for 2 <= n_clusters <= n_points - 1;
    # outside that range scikit-learn raises ValueError, so report NaN instead.
    if n_clusters < 2 or n_clusters >= n_points:
        return np.nan, np.nan

    # Only touch the inputs when there is noise to remove, so label sets
    # without noise (K-Means, hierarchical) are scored exactly as before.
    if not clustered.all():
        data = np.asarray(data)[clustered]
        labels = labels[clustered]

    silhouette = silhouette_score(data, labels)
    db_score = davies_bouldin_score(data, labels)
    return silhouette, db_score

# Evaluation Example: score each of the three label sets against the same
# scaled data so the results are directly comparable
kmeans_silhouette, kmeans_db_score = evaluate_clustering(df_scaled, kmeans_labels)
hc_silhouette, hc_db_score = evaluate_clustering(df_scaled, hc_labels)
dbscan_silhouette, dbscan_db_score = evaluate_clustering(df_scaled, dbscan_labels)

# Report both metrics for each method, rounded to two decimal places
print(f'K-Means: Silhouette Score = {kmeans_silhouette:.2f}, Davies-Bouldin Score = {kmeans_db_score:.2f}')
print(f'Hierarchical: Silhouette Score = {hc_silhouette:.2f}, Davies-Bouldin Score = {hc_db_score:.2f}')
print(f'DBSCAN: Silhouette Score = {dbscan_silhouette:.2f}, Davies-Bouldin Score = {dbscan_db_score:.2f}')


## Step 6: Visualization of Clustering Results

In [None]:

# Step 6: Visualization of Clustering Results
def plot_clusters(data, labels, title):
    """
    Draw a scatter plot of the clustering result, coloured by cluster label.

    Only the first two columns of ``data`` are plotted (column 0 on the x-axis,
    column 1 on the y-axis), so ``data`` must support ``data[:, i]`` indexing
    and have at least two columns. The figure is shown immediately.
    """
    plt.figure(figsize=(8, 6))

    first_feature = data[:, 0]
    second_feature = data[:, 1]
    sns.scatterplot(
        x=first_feature,
        y=second_feature,
        hue=labels,
        palette='viridis',
        legend='full',
    )

    plt.title(title)
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.legend()
    plt.show()

# Plotting the results: one figure per clustering method, all drawn from the
# same scaled data so the cluster assignments can be compared visually
plot_clusters(df_scaled, kmeans_labels, 'K-Means Clustering')
plot_clusters(df_scaled, hc_labels, 'Hierarchical Clustering')
plot_clusters(df_scaled, dbscan_labels, 'DBSCAN Clustering')


## Step 7: Methods for Choosing Parameters

### 7.1 Elbow Method for K-Means

In [None]:

def plot_elbow_method(data, max_clusters=10):
    """
    Plot K-Means inertia against the number of clusters (the Elbow Method).

    One K-Means model (seed fixed at 42) is fitted for every cluster count
    from 1 to ``max_clusters`` inclusive, and its inertia is plotted. The
    figure is shown immediately; nothing is returned.
    """
    cluster_counts = range(1, max_clusters + 1)
    inertias = [
        KMeans(n_clusters=k, random_state=42).fit(data).inertia_
        for k in cluster_counts
    ]

    plt.figure(figsize=(8, 6))
    plt.plot(cluster_counts, inertias, marker='o')
    plt.title('Elbow Method for K-Means')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Inertia')
    plt.show()

# Call the Elbow Method plot, trying 1 through 10 clusters on the scaled data
plot_elbow_method(df_scaled, max_clusters=10)


### 7.2 Silhouette Analysis for K-Means

In [None]:

def plot_silhouette_analysis(data, max_clusters=10):
    """
    Plot the silhouette score of K-Means for a range of cluster counts.

    One K-Means model (seed fixed at 42) is fitted for every cluster count
    from 2 to ``max_clusters`` inclusive, and the silhouette score of its
    labelling is plotted. The figure is shown immediately; nothing is returned.
    """
    cluster_counts = range(2, max_clusters + 1)
    silhouette_scores = [
        silhouette_score(
            data,
            KMeans(n_clusters=k, random_state=42).fit_predict(data),
        )
        for k in cluster_counts
    ]

    plt.figure(figsize=(8, 6))
    plt.plot(cluster_counts, silhouette_scores, marker='o')
    plt.title('Silhouette Analysis for K-Means')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Silhouette Score')
    plt.show()

# Call the Silhouette Analysis plot, trying 2 through 10 clusters on the scaled data
plot_silhouette_analysis(df_scaled, max_clusters=10)


### 7.3 k-Nearest Neighbors Distance Plot for DBSCAN

In [None]:

def plot_k_distance(data, min_samples):
    """
    Plot the sorted k-nearest-neighbour distances used to pick DBSCAN's eps.

    For every sample, the ``min_samples`` nearest neighbours within ``data``
    are looked up and the distance in the last returned column is kept. Those
    distances are sorted in ascending order and plotted. The figure is shown
    immediately; nothing is returned.
    """
    finder = NearestNeighbors(n_neighbors=min_samples).fit(data)
    distances, _ = finder.kneighbors(data)

    # NOTE(review): the same data is used for fitting and querying, so per the
    # scikit-learn kneighbors docs each sample is expected to come back as its
    # own nearest neighbour in column 0 - confirm if the indexing is changed.
    kth_distances = distances[:, min_samples - 1]
    ascending = np.sort(kth_distances)

    plt.figure(figsize=(8, 6))
    plt.plot(ascending)
    plt.title(f'k-Distance Plot for DBSCAN (min_samples={min_samples})')
    plt.xlabel('Data Points (sorted)')
    plt.ylabel(f'{min_samples}-Nearest Neighbor Distance')
    plt.show()

# Call the k-distance plot with the same min_samples (5) that the DBSCAN
# example above uses, so the plot matches that configuration
plot_k_distance(df_scaled, min_samples=5)
