In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from DataMatrix import generate_data_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import pairwise_distances
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    accuracy_score,
    confusion_matrix,
)

import warnings

warnings.filterwarnings("ignore")

In [7]:
X_train, y_train, X_test, y_test = generate_data_matrix(method="mean")

print("X_train shape: ", X_train.shape)
print("y_train shape: ", y_train.shape)
print("X_test shape: ", X_test.shape)
print("y_test shape: ", y_test.shape)

X_train shape:  (7296, 45)
y_train shape:  (7296,)
X_test shape:  (1824, 45)
y_test shape:  (1824,)


In [12]:
def get_DBSCAN(X, min_samples, eps):
    # Initialize labels
    labels = np.zeros(X.shape[0])

    # C is the cluster counter
    C = 0
    for i in range(X.shape[0]):

        # Skip if already labeled
        if labels[i] != 0:
            continue

        # Find neighbors within eps
        neighbors = np.where(np.linalg.norm(X - X[i], axis=1) <= eps)[0]

        # Mark as noise
        if len(neighbors) < min_samples:
            labels[i] = -1
            continue

        # New cluster
        C += 1

        # Assign cluster label to point
        labels[i] = C

        # Set of points to expand
        S = list(neighbors)

        i = 0
        while i < len(S):
            j = S[i]
            if labels[j] == -1:
                labels[j] = C
            elif labels[j] == 0:
                labels[j] = C
                neighbors_j = np.where(np.linalg.norm(X - X[j], axis=1) <= eps)[0]
                if len(neighbors_j) >= min_samples:
                    S += list(set(neighbors_j) - set(S))
            i += 1

    return labels

In [20]:
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import confusion_matrix


def map_labels(y_true, y_pred):
    # Create a confusion matrix
    D = confusion_matrix(y_true, y_pred)

    # Swap the rows and columns of the confusion matrix to match the labels
    rows, cols = linear_sum_assignment(D, maximize=True)

    # Create a dictionary to map the labels
    label_map = {col: row for row, col in zip(rows, cols)}

    # Map the labels in y_pred, skipping over any points that have a label of -1
    y_pred = np.array([label_map[label] if label != -1 else -1 for label in y_pred])

    return y_pred


# Map the labels
n_samples = np.arange(0.1, 5, 0.1)
eps = np.arange(1, 10)
print(len(n_samples)* len(eps))

for sample in n_samples:
    for e in eps:
        y_pred = get_DBSCAN(X_train, sample, e)
        y_pred_mapped = map_labels(y_train, y_pred)

        # Compute the accuracy
        percision = precision_score(y_train, y_pred_mapped, average="weighted")
        recall = recall_score(y_train, y_pred_mapped, average="weighted")
        f1 = f1_score(y_train, y_pred_mapped, average="weighted")

        # Print the accuracy
        print(f"n_samples: {sample} eps: {e}")
        print(f"Percision: {percision * 100:.2f}%")
        print(f"Recall: {recall * 100:.2f}%")
        print(f"F1 Score: {f1 * 100:.2f}%")

441
