In [2]:
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from DataMatrix import generate_data_matrix
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import confusion_matrix
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)

import warnings

warnings.filterwarnings("ignore")

In [3]:
X_train1, y_train1, X_test1, y_test1 = generate_data_matrix(method="mean")
# X_train2, y_train2, X_test2, y_test2 = generate_data_matrix(method="flatten")

n_clusters = 0
print("X_train shape: ", X_train1.shape)
print("y_train shape: ", y_train1.shape)
print("X_test shape: ", X_test1.shape)
print("y_test shape: ", y_test1.shape)

X_train shape:  (7296, 45)
y_train shape:  (7296,)
X_test shape:  (1824, 45)
y_test shape:  (1824,)


### DBSCAN Algorithm


In [4]:
def get_DBSCAN(X, min_samples, eps):
    # Initialize labels
    labels = np.zeros(X.shape[0])

    # C is the cluster counter
    C = 0
    for i in range(X.shape[0]):

        # Skip if already labeled
        if labels[i] != 0:
            continue

        # Find neighbors within eps
        neighbors = np.where(np.linalg.norm(X - X[i], axis=1) <= eps)[0]

        # Mark as noise
        if len(neighbors) < min_samples:
            labels[i] = -1
            continue

        # New cluster
        C += 1

        # Assign cluster label to point
        labels[i] = C

        # Set of points to expand
        S = list(neighbors)

        i = 0
        while i < len(S):
            j = S[i]
            if labels[j] == -1:
                labels[j] = C
            elif labels[j] == 0:
                labels[j] = C
                neighbors_j = np.where(np.linalg.norm(X - X[j], axis=1) <= eps)[0]
                if len(neighbors_j) >= min_samples:
                    S += list(set(neighbors_j) - set(S))
            i += 1
    n_clusters = C
    print("Number of Clusters: ", C)
    return labels

#### Labels Mapping Function with Ground Truth Labels


In [5]:
def map_labels(y_true, y_pred):
    # Create a confusion matrix
    D = confusion_matrix(y_true, y_pred)

    # Swap the rows and columns of the confusion matrix to match the labels
    rows, cols = linear_sum_assignment(D, maximize=True)

    # Create a dictionary to map the labels
    label_map = {col: row for row, col in zip(rows, cols)}

    # Map the labels in y_pred, skipping over any points that have a label of -1
    y_pred = np.array([label_map[label] if label != -1 else -1 for label in y_pred])

    return y_pred

### Hyperparamters


In [14]:
n_samples = 7
eps = 2.7

### Implemented DBSCAN


#### Method 1 (Mean)


In [15]:
y_pred = get_DBSCAN(X_train1, n_samples, eps)
y_pred_mapped = map_labels(y_train1, y_pred)

# Compute the accuracy
precision = precision_score(y_train1, y_pred_mapped, average="weighted")
recall = recall_score(y_train1, y_pred_mapped, average="weighted")
f1 = f1_score(y_train1, y_pred_mapped, average="weighted")

print("Number of Noise Points: ", np.sum(y_pred_mapped == -1))
print("Implemented DBSCAN Evaluation Metrics:")
print("Precision: {:.3f}%".format(precision * 100))
print("Recall:    {:.3f}%".format(recall * 100))
print("F1-Score:  {:.3f}%".format(f1 * 100))

Number of Clusters:  32
Number of Noise Points:  764
Implemented DBSCAN Evaluation Metrics:
Precision: 51.115%
Recall:    26.782%
F1-Score:  25.635%


#### Method 2 (Flatten)


In [9]:
# y_pred = get_DBSCAN(X_train2, n_samples, eps)
# y_pred_mapped = map_labels(y_train1, y_pred)

# # Compute the accuracy
# precision = precision_score(y_train2, y_pred_mapped, average="weighted")
# recall = recall_score(y_train2, y_pred_mapped, average="weighted")
# f1 = f1_score(y_train2, y_pred_mapped, average="weighted")

# print("Implemented DBSCAN Evaluation Metrics:")
# print("Precision: {:.3f}%".format(precision * 100))
# print("Recall:    {:.3f}%".format(recall * 100))
# print("F1-Score:  {:.3f}%".format(f1 * 100))

### DBSCAN in Scikit-Learn


#### Method 1 (Mean)


In [16]:
dbscan1 = DBSCAN(eps=eps, min_samples=n_samples)

# Fit the model to the data
dbscan1.fit(X_train1)

# Print the cluster labels for each data point
y_pred_sklearn = dbscan1.labels_
num_clusters = len(np.unique(y_pred_sklearn[y_pred_sklearn != -1]))
print("Number of clusters: ", num_clusters)
y_pred_sklearn_mapped = map_labels(y_train1, y_pred_sklearn)

percision = precision_score(y_train1, y_pred_sklearn_mapped, average="weighted")
recall = recall_score(y_train1, y_pred_sklearn_mapped, average="weighted")
f1 = f1_score(y_train1, y_pred_sklearn_mapped, average="weighted")

print("Number of Noise Points: ", (y_pred_sklearn == -1).sum())
print("Sklearn DBSCAN Evaluation Metrics:")
print("Precision: {:.3f}%".format(precision * 100))
print("Recall:    {:.3f}%".format(recall * 100))
print("F1-Score:  {:.3f}%".format(f1 * 100))

Number of clusters:  32
Number of Noise Points:  764
Sklearn DBSCAN Evaluation Metrics:
Precision: 51.115%
Recall:    8.525%
F1-Score:  6.134%


#### Method 2 (Flatten)


In [11]:
# dbscan2 = DBSCAN(eps=eps, min_samples=n_samples)

# # Fit the model to the data
# dbscan2.fit(X_train2)

# # Print the cluster labels for each data point
# y_pred_sklearn = dbscan2.labels_
# y_pred_sklearn_mapped = map_labels(y_train2, y_pred_sklearn)

# percision = precision_score(y_train2, y_pred_sklearn_mapped, average="weighted")
# recall = recall_score(y_train2, y_pred_sklearn_mapped, average="weighted")
# f1 = f1_score(y_train2, y_pred_sklearn_mapped, average="weighted")

# print("Sklearn DBSCAN Evaluation Metrics:")
# print("Precision: {:.3f}%".format(precision * 100))
# print("Recall:    {:.3f}%".format(recall * 100))
# print("F1-Score:  {:.3f}%".format(f1 * 100))