In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from DataMatrix import generate_data_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import pairwise_distances
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix

import warnings
warnings.filterwarnings("ignore")

In [2]:
X_train, y_train, X_test, y_test = generate_data_matrix(method="mean")

print("X_train shape: ", X_train.shape)
print("y_train shape: ", y_train.shape)
print("X_test shape: ", X_test.shape)
print("y_test shape: ", y_test.shape)

X_train shape:  (7296, 45)
y_train shape:  (7296,)
X_test shape:  (1824, 45)
y_test shape:  (1824,)


In [3]:
def get_DBSCAN(data, min_samples, eps):

    # Initializing labels to -1
    labels = np.full(data.shape[0], -1, dtype=int)
    visited = set()
    core_dict = {}
    distanceMatrix = pairwise_distances(data, metric='euclidean')

    for i in range(data.shape[0]):
        if i in visited:
            continue

        visited.add(i)

        # Finding neighbors with distance less than eps
        neighbors = [j for j in range(data.shape[0]) if distanceMatrix[i][j] <= eps]

        # If not a core point
        if len(neighbors) < min_samples:
            labels[i] = -1
            
        # If a core point
        else:
            # Assigning label to core point
            labels[i] = i

            # Adding core point to core_dict
            core_dict[i] = neighbors

            for j in neighbors:
                # Skipping if already visited
                if j in visited:
                    continue

                # Finding neighbors of neighbors
                neighbors2 = [k for k in range(data.shape[0]) if distanceMatrix[j][k] <= eps]

                # Visiting the point
                visited.add(j)

                # Assigning label of the neighbor to the same one as core point
                labels[j] = i

                # Adding neighbors of neighbors to core_dict which will be assigned the same label as core point later in the last loop
                if len(neighbors2) >= min_samples:
                    core_dict[j] = neighbors2

    # Assigning labels to non-core points based on their core point
    for label,neighborhood in core_dict.items():
        for neighbor in neighborhood:
            if labels[neighbor] == -1:
                labels[neighbor] = label

    # Convert labels to integers
    labels = labels.astype(int)
    return labels

In [9]:
# Testing the DBSCAN algorithm
labelsInNumbers = get_DBSCAN(X_train, 5, 0.5)

# Print number of noise points (label = -1)
print('Number of noise points: ', np.count_nonzero(labelsInNumbers == -1))

# Create array of strings to store the final labels
labelsInString = np.empty(labelsInNumbers.shape[0], dtype=object)

# The number of clusters will be equal to the number of unique labels
clusters = np.unique(labelsInNumbers)
print('Number of clusters: ', clusters.shape[0])

for i in range(clusters.shape[0]):

    # Finding the indices of the points in the cluster
    labels = np.where(labelsInNumbers == clusters[i])
    
    # Creating a dictionary to count the number of each label in the cluster
    counterLabels = {}
    for label in y_train[labels]:
        counterLabels[label] = counterLabels.get(label, 0) + 1

    # Finding the most common label
    maxLabel = max(counterLabels, key=counterLabels.get)

    # Assigning the most common label to all the points in the cluster
    labelsInString[labels] = maxLabel

Number of noise points:  5638
Number of clusters:  106


In [13]:
y_pred = labelsInString
mask = y_pred != -1
y_pred_no_noise = y_pred[mask]
y_train_no_noise = y_train[mask]

# Evaluating the algorithm
print("Macro: ")
print("Precision: ", precision_score(y_train_no_noise, y_pred_no_noise, average='macro'))
print("Recall: ", recall_score(y_train_no_noise, y_pred_no_noise, average='macro'))
print("F1 score: ", f1_score(y_train_no_noise, y_pred_no_noise, average='macro'))
print("Accuracy: ", accuracy_score(y_train_no_noise, y_pred_no_noise))

print("-" * 50)
print("Weighted: ")
print("Precision: ", precision_score(y_train_no_noise, y_pred_no_noise, average='weighted'))
print("Recall: ", recall_score(y_train_no_noise, y_pred_no_noise, average='weighted'))
print("F1 score: ", f1_score(y_train_no_noise, y_pred_no_noise, average='weighted'))
print("Accuracy: ", accuracy_score(y_train_no_noise, y_pred_no_noise))

Macro: 


ValueError: Classification metrics can't handle a mix of multiclass and unknown targets