In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from DataMatrix import generate_data_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix

import warnings
warnings.filterwarnings("ignore")

# Build the data set once via the project-local DataMatrix helper and unpack
# the four returned values (named here as train/test features and labels).
# NOTE(review): method="mean" presumably selects how raw samples are reduced
# to feature vectors — confirm against DataMatrix.generate_data_matrix.
X_train, y_train, X_test, y_test = generate_data_matrix(method="mean")

In [ ]:
def dbscan(data, min_samples, eps):
    """Cluster the rows of ``data`` with DBSCAN using Euclidean distance.

    A point is a *core point* when at least ``min_samples`` points (itself
    included) lie within distance ``eps`` of it.  A cluster is grown from an
    unvisited core point by repeatedly absorbing the neighbourhoods of every
    core point reached, so all density-connected points end up with the same
    label.  Non-core points inside a core point's neighbourhood become border
    points of that cluster; everything else stays noise.

    Args:
        data: 2-D array-like of shape (n_points, n_features).
        min_samples: Minimum neighbourhood size (point itself included) for a
            point to count as a core point.
        eps: Neighbourhood radius; a neighbour satisfies ``distance <= eps``.

    Returns:
        Integer ``numpy.ndarray`` of length ``n_points``.  Noise points are
        labelled ``-1``; every other point carries the row index of the core
        point that seeded its cluster, so cluster ids are unique but not
        consecutive.

    Raises:
        ValueError: If ``data`` is not two-dimensional.
    """
    points = np.asarray(data, dtype=float)
    if points.ndim != 2:
        raise ValueError(
            "data must be 2-D (n_points, n_features), got %d-D" % points.ndim
        )

    n_points = points.shape[0]
    labels = np.full(n_points, -1, dtype=int)
    visited = np.zeros(n_points, dtype=bool)

    def neighbors_of(index):
        # One vectorised pass per query: avoids materialising the full
        # n x n distance matrix while still doing the arithmetic in NumPy.
        distances = np.linalg.norm(points - points[index], axis=1)
        return np.flatnonzero(distances <= eps)

    for seed in range(n_points):
        if visited[seed]:
            continue
        visited[seed] = True

        seed_neighbors = neighbors_of(seed)
        if seed_neighbors.size < min_samples:
            # Not a core point.  It stays noise unless a later cluster
            # reaches it, in which case it is claimed as a border point.
            continue

        # The seed's own index doubles as the cluster id.
        labels[seed] = seed

        # Claim points when they are queued so nothing is queued twice.
        frontier = seed_neighbors[labels[seed_neighbors] == -1]
        labels[frontier] = seed
        pending = frontier.tolist()

        while pending:
            candidate = pending.pop()
            if visited[candidate]:
                # Previously examined and found non-core: it is a border
                # point of this cluster and must not be expanded.
                continue
            visited[candidate] = True

            candidate_neighbors = neighbors_of(candidate)
            if candidate_neighbors.size >= min_samples:
                # Another core point: its still-unlabelled neighbours are
                # density-reachable and join the same cluster.
                reachable = candidate_neighbors[labels[candidate_neighbors] == -1]
                labels[reachable] = seed
                pending.extend(reachable.tolist())

    return labels

In [ ]:
# Testing the DBSCAN algorithm
# The training features are bound to `X_train` when the data is loaded; the
# lowercase `x_train` is never defined and raised a NameError.
labelsInNumbers = dbscan(X_train, 5, 0.5)

# Print number of noise points (label = -1)
print('Number of noise points: ', np.count_nonzero(labelsInNumbers == -1))

# Create array of strings to store the final labels
labelsInString = np.empty(labelsInNumbers.shape[0], dtype=object)

# `clusters` holds every distinct label, including -1 for noise, so that the
# noise group also receives a predicted class below.  The reported cluster
# count excludes that noise label.
clusters = np.unique(labelsInNumbers)
print('Number of clusters: ', np.count_nonzero(clusters != -1))

for cluster in clusters:

    # Finding the indices of the points in the cluster
    memberIndices = np.where(labelsInNumbers == cluster)

    # Counting how often each true label occurs among the cluster's members
    counterLabels = {}
    for trueLabel in y_train[memberIndices]:
        counterLabels[trueLabel] = counterLabels.get(trueLabel, 0) + 1

    # Finding the most common label (ties go to the label seen first)
    maxLabel = max(counterLabels, key=counterLabels.get)

    # Assigning the most common label to all the points in the cluster
    labelsInString[memberIndices] = maxLabel

In [ ]:
# Majority-vote labels per cluster serve as the predictions.
y_pred = labelsInString

# Report precision / recall / F1 under both averaging schemes; accuracy does
# not depend on the averaging scheme and is repeated in each section.
for heading, averaging in (("Macro: ", 'macro'), ("Weighted: ", 'weighted')):
    print(heading)
    print("Precision: ", precision_score(y_train, y_pred, average=averaging))
    print("Recall: ", recall_score(y_train, y_pred, average=averaging))
    print("F1 score: ", f1_score(y_train, y_pred, average=averaging))
    print("Accuracy: ", accuracy_score(y_train, y_pred))

    print("-" * 50)

# Per-class breakdown
print(classification_report(y_train, y_pred))

# Measure how mixed the true labels are inside each cluster: compute the
# Shannon entropy (base 2) of the true-label distribution per cluster, then
# report the plain (unweighted) mean over all distinct cluster labels.
totalEntropy = 0
for cluster in clusters:
    # True labels of the points assigned to this cluster
    memberLabels = y_train[np.where(labelsInNumbers == cluster)]

    # Occurrences of each true label, turned into proportions
    _, counts = np.unique(memberLabels, return_counts=True)
    proportions = counts / np.sum(counts)

    entropy = -np.sum(proportions * np.log2(proportions))
    totalEntropy += entropy

# Every cluster contributes equally, regardless of its size
totalEntropy /= clusters.shape[0]
print("Average conditional entropy: ", totalEntropy)