In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score
from sklearn.cluster import DBSCAN
import pandas as pd
import numpy as np
import math

In [2]:
class DBSCAN_mn :
    # initialize
    def __init__(self, eps, min_samples) :
        self.eps = eps
        self.min_samples = min_samples
        self.visit = []
        self.clusters = []
        self.list_of_cluster = []
        self.noise = []
        self.core = [] # for predict
        self.dataset = []
        
    def fit(self, dataset) : # list
        self.visit = dataset[:].tolist() # create label for visited data
        self.dataset = dataset[:].tolist()
        C = -1 # create cluster
        for data in self.dataset :
            if data in self.visit :
                self.visit.remove(data)
                # find all neighbor for sample data
                data_neighbor = self.find_neighbors(data)
                if len(data_neighbor) < self.min_samples : self.noise.append(data)
                else :
                    C += 1
                    self.expand_cluster(data, data_neighbor, C)
        self.create_cluster()
                    
    def expand_cluster(self, sample, sample_neighbor, C) :
        # first delete clustered element before because it's not core
        for inst in self.clusters :
                if sample in inst : self.clusters[self.clusters.index(inst)].remove(sample)
        self.clusters.insert(C, [sample])
        self.core.append(sample)
        
        for data in sample_neighbor :
            if data in self.visit : # is not visited yet
                self.visit.remove(data)
                data_neighbor = self.find_neighbors(data)
                if len(data_neighbor) >= self.min_samples :
                    self.core.append(data)
                    for elmt in data_neighbor :
                        if elmt not in sample_neighbor : sample_neighbor.append(elmt)
            cluster = False
            for inst in self.clusters :
                if data in inst :
                    cluster = True
                    break
            if cluster == False : 
                self.clusters[C].append(data)
                if data in self.noise : self.noise.remove(data)
    
    def find_neighbors(self, sample) :
        neighbor = []
        for data in self.dataset :
            distance = math.sqrt(sum([(a - b) ** 2 for a, b in zip(sample, data)])) # calculate euclidian distance
            if distance <= self.eps : neighbor.append(data)        
        return neighbor
    
    def create_cluster(self) :
        for data in self.dataset :
            for i in range(len(self.clusters)) :
                if data in self.clusters[i] : 
                    self.list_of_cluster.append(i)
            if data in self.noise : 
                self.list_of_cluster.append(-1)
        self.list_of_cluster = np.array(self.list_of_cluster)
        
    def predict(self, datatest) :
        pred = []
        for data in datatest.tolist() :
            appended = False
            for core_ in self.core :
                distance = math.sqrt(sum([(a - b) ** 2 for a, b in zip(core_, data)])) # calculate euclidian distance
                #print("distance between ", data, " and ", core_, " is ", distance)
                if distance <= self.eps : 
                    #print("it is in cluster ", self.dataset.index(core_))
                    pred.append(self.list_of_cluster[self.dataset.index(core_)])
                    appended = True
                    break
            if appended == False : 
                pred.append(-1)
        return np.array(pred)

### Iris Dataset

In [3]:
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [4]:
# Split data into test and train, but sort ordinal first to reduce mislabel
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
np.size(X_train,0)
X_temp = np.concatenate((X_train, np.array([y_train]).T), axis=1)
X_temp = X_temp[X_temp[:, np.size(X,1)-1].argsort()]
X_train = np.array([i[:-1] for i in X_temp.tolist()])
y_train = np.array([i[-1] for i in X_temp.tolist()]).astype(int)

In [9]:
print("======== minpts : 2 && eps : 0.3 ========")
print("WITH SPLIT ", len(y_train), ":", len(y_test))
print("\nDBSCAN Clustering from Scratch")
model1_mn = DBSCAN_mn(0.3,2)
model1_mn.fit(X_train)
y_predict = model1_mn.predict(X_test)
print("Confusion Matrix :")
print(confusion_matrix(y_test, y_predict))
print("Accuracy Score :")
print(accuracy_score(y_test, y_predict))

print("______________________________")
print("\nWITHOUT SPLIT")
print("\nDBSCAN Clustering from scratch")
model1 = DBSCAN_mn(0.3, 2)
model1.fit(X)
pred1_mn = model1.predict(X)
print("Confusion Matrix :")
print(confusion_matrix(y, pred1_mn))
print("Accuracy Score :")
print(accuracy_score(y, pred1_mn))

print("\nDBSCAN Clustering from sklearn")
pred1_sk = DBSCAN(eps=0.3, min_samples=2).fit_predict(X)
print("Confusion Matrix :")
print(confusion_matrix(y, pred1_sk))
print("Accuracy Score :")
print(accuracy_score(y, pred1_sk))

print("\n\n")

print("======== minpts : 2 && eps : 0.7 ========")
print("WITH SPLIT ", len(y_train), ":", len(y_test))
print("\nDBSCAN Clustering from Scratch")
model2_mn = DBSCAN_mn(0.7,2)
model2_mn.fit(X_train)
y_predict2 = model2_mn.predict(X_test)
print("Confusion Matrix :")
print(confusion_matrix(y_test, y_predict2))
print("Accuracy Score :")
print(accuracy_score(y_test, y_predict2))

print("______________________________")
print("\nWITHOUT SPLIT")
print("\nDBSCAN Clustering from scratch")
model2 = DBSCAN_mn(0.7, 2)
model2.fit(X)
pred2_mn = model2.predict(X)
print("Confusion Matrix :")
print(confusion_matrix(y, pred2_mn))
print("Accuracy Score :")
print(accuracy_score(y, pred2_mn))

print("\nDBSCAN Clustering from sklearn")
pred2_sk = DBSCAN(eps=0.7, min_samples=2).fit_predict(X)
print("Confusion Matrix :")
print(confusion_matrix(y, pred2_sk))
print("Accuracy Score :")
print(accuracy_score(y, pred2_sk))

print("\n\n")

print("======== minpts : 2 && eps : 3 ========")
print("WITH SPLIT ", len(y_train), ":", len(y_test))
print("\nDBSCAN Clustering from Scratch")
model3_mn = DBSCAN_mn(3,2)
model3_mn.fit(X_train)
y_predict3 = model3_mn.predict(X_test)
print("Confusion Matrix :")
print(confusion_matrix(y_test, y_predict3))
print("Accuracy Score :")
print(accuracy_score(y_test, y_predict3))

print("______________________________")
print("\nWITHOUT SPLIT")
print("\nDBSCAN Clustering from scratch")
model3 = DBSCAN_mn(3, 2)
model3.fit(X)
pred3_mn = model3.predict(X)
print("Confusion Matrix :")
print(confusion_matrix(y, pred3_mn))
print("Accuracy Score :")
print(accuracy_score(y, pred3_mn))

print("\nDBSCAN Clustering from sklearn")
pred3_sk = DBSCAN(eps=3, min_samples=2).fit_predict(X)
print("Confusion Matrix :")
print(confusion_matrix(y, pred3_sk))
print("Accuracy Score :")
print(accuracy_score(y, pred3_sk))

print("\n\n")

print("======== minpts : 20 && eps : 0.75 ========")
print("WITH SPLIT ", len(y_train), ":", len(y_test))
print("\nDBSCAN Clustering from Scratch")
model4_mn = DBSCAN_mn(0.75,20)
model4_mn.fit(X_train)
y_predict4 = model4_mn.predict(X_test)
print("Confusion Matrix :")
print(confusion_matrix(y_test, y_predict4))
print("Accuracy Score :")
print(accuracy_score(y_test, y_predict4))

print("______________________________")
print("\nWITHOUT SPLIT")
print("\nDBSCAN Clustering from scratch")
model4 = DBSCAN_mn(0.75, 20)
model4.fit(X)
pred4_mn = model4.predict(X)
print("Confusion Matrix :")
print(confusion_matrix(y, pred4_mn))
print("Accuracy Score :")
print(accuracy_score(y, pred4_mn))

print("\nDBSCAN Clustering from sklearn")
pred4_sk = DBSCAN(eps=0.75, min_samples=20).fit_predict(X)
print("Confusion Matrix :")
print(confusion_matrix(y, pred4_sk))
print("Accuracy Score :")
print(accuracy_score(y, pred4_sk))

print("\n\n")

WITH SPLIT  135 : 15

DBSCAN Clustering from Scratch
Confusion Matrix :
[[0 0 0 0 0 0]
 [1 2 0 0 0 0]
 [6 0 2 0 0 0]
 [2 0 0 0 1 1]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]]
Accuracy Score :
0.26666666666666666
______________________________

WITHOUT SPLIT

DBSCAN Clustering from scratch
Confusion Matrix :
[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [10 38  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [14  0  0  3  2 14  6  2  4  2  2  1  0  0  0  0  0  0  0  0  0  0]
 [19  0  0  0  0  0  0  0  0  0  0  6  3  3  2  2  2  2  5  2  2  2]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0