In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
import matplotlib.cm as cm
from sklearn.cluster import DBSCAN,KMeans
from sklearn.datasets.samples_generator import make_blobs
from sklearn.datasets import load_breast_cancer,fetch_lfw_people
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

%matplotlib inline

In [None]:
# Draw 300 synthetic samples around 4 blob centers; the fixed seed keeps the
# sample identical from run to run.
X, y = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)

# Plot the first feature column against the second.
plt.scatter(X.T[0], X.T[1], s=50)

In [None]:
# Build a DBSCAN estimator using only its default constructor arguments.
scan = DBSCAN()
# Bare expression on the last line of the cell so the notebook displays the estimator.
scan

In [None]:
# Fit the estimator on X and keep the array of labels that fit_predict returns.
y_scan = scan.fit_predict(X)

In [None]:
# Display the label array computed for X.
y_scan

In [None]:
# Count the clusters found in y_scan. The label -1 marks noise (this notebook
# selects on it later to plot the noise images), so it is removed before
# counting; counting every unique label would report one cluster too many
# whenever at least one point is noise.
num_clusters = len(np.setdiff1d(y_scan, [-1]))
num_clusters

In [None]:
# Tally the points per label. np.bincount only accepts non-negative integers,
# so every label is shifted by +1: the -1 label (treated as noise elsewhere in
# this notebook) lands in bin 0 and label k lands in bin k + 1.
np.bincount(y_scan + 1)

In [None]:
# Load the breast-cancer dataset; later cells read its `.data` attribute.
cancer_data = load_breast_cancer()

In [None]:
# Standardize the cancer features. `scaler` stays bound to the fitted
# transformer and `X_cancer` holds the transformed copy of the data.
scaler = StandardScaler()
X_cancer = scaler.fit_transform(cancer_data.data)

In [None]:
# Cluster the standardized cancer features with a default-configured DBSCAN
# and keep the resulting labels.
scan = DBSCAN().fit(X_cancer)
scan_cancer = scan.labels_

In [None]:
# Number of bins in the +1-shifted label histogram, i.e. (largest label + 2).
# Bin 0 corresponds to label -1, so it is part of this count even when empty.
len(np.bincount(scan_cancer + 1))

In [None]:
def apply_dbscan(data, min_clusters, min_samples=(1, 2, 3, 4, 5), eps=(0.5, 1.0, 1.5, 2.0)):
    """Try every (min_samples, eps) pair with DBSCAN and print the promising ones.

    For each combination, DBSCAN is fitted on ``data`` and the labels are
    histogrammed with ``np.bincount(labels + 1)``. The +1 shift puts the -1
    label into bin 0, so the histogram length equals ``largest label + 2`` and
    therefore includes the bin for label -1 even when that bin is empty.

    Parameters
    ----------
    data : array-like
        Samples handed unchanged to ``DBSCAN.fit_predict``.
    min_clusters : int
        A setting is reported only when the histogram has strictly more than
        this many bins (bin 0 included).
    min_samples : sequence of int, optional
        Candidate ``min_samples`` values; iterated in the outer loop.
    eps : sequence of float, optional
        Candidate ``eps`` values; iterated in the inner loop.

    Returns
    -------
    None
        Results are only printed, one line per reported setting.
    """
    # The defaults are tuples rather than lists so no mutable object is shared
    # between calls; callers may still pass lists.
    for samples in min_samples:
        for radius in eps:
            labels = DBSCAN(eps=radius, min_samples=samples).fit_predict(data)
            # Build the histogram once and reuse it for the test and the report.
            counts = np.bincount(labels + 1)
            if len(counts) > min_clusters:
                print(
                    'eps = ',
                    radius,
                    'min_samples = ',
                    samples,
                    'num_clusters =',
                    len(counts),
                    'clusters=',
                    counts
                )
    
    


In [None]:
# Sweep DBSCAN settings over the standardized cancer features, using larger
# eps values than the function's defaults.
apply_dbscan(
    data=X_cancer,
    min_clusters=2,
    min_samples=list(range(1, 6)),
    eps=[3.0, 3.5, 4.0, 4.5, 5, 5.5, 6, 10, 11, 12, 13, 14, 15],
)

In [None]:
# Refit with one specific setting (eps 5.5, min_samples 2) and print the
# +1-shifted label histogram (bin 0 holds the count for label -1).
scan = DBSCAN(min_samples=2, eps=5.5).fit(X_cancer)
scan_cancer = scan.labels_
print(np.bincount(scan_cancer + 1))

In [None]:
# Positions of the samples whose label equals 1; np.argwhere returns one row per match.
np.argwhere(scan_cancer == 1)

In [None]:
# Show the original (unscaled) feature rows of the samples labelled 1 -- the
# same samples whose positions np.argwhere(scan_cancer == 1) lists. The empty
# subscript that stood here was a SyntaxError.
cancer_data.data[scan_cancer == 1]

In [None]:
def display_dbscan_images(labels, data, targets, max_images=10):
    """Draw the member images of each small cluster, one figure per cluster.

    Labels ``0 .. max(labels)`` are visited in order, so the -1 label is never
    drawn. A cluster is skipped when it has more than ``max_images`` members
    or no members at all.

    Relies on two notebook-level globals that must exist when this function is
    called: ``image_shape`` (shape each flat image is reshaped to) and
    ``people`` (its ``target_names`` supplies the titles).

    Parameters
    ----------
    labels : numpy.ndarray
        One cluster label per sample.
    data : numpy.ndarray
        Flattened images, indexed the same way as ``labels``.
    targets : numpy.ndarray
        Integer indices into ``people.target_names``, aligned with ``data``.
    max_images : int, optional
        Largest cluster size that is still drawn.

    Returns
    -------
    None
    """
    for cluster in range(max(labels) + 1):
        members = labels == cluster
        num_images = int(np.sum(members))
        # plt.subplots rejects a zero-column grid, so unused labels are skipped
        # together with clusters that are too large to show.
        if num_images == 0 or num_images > max_images:
            continue
        # squeeze=False always yields a 2-D array of Axes. Without it a
        # single-image cluster returns a bare Axes object that cannot be zipped.
        fig, axes = plt.subplots(
            1,
            num_images,
            figsize=(num_images * 1.5, 4),
            subplot_kw={'xticks': (), 'yticks': ()},
            squeeze=False,
        )
        for image, label, ax in zip(data[members], targets[members], axes.ravel()):
            ax.imshow(image.reshape(image_shape), cmap='gray')
            # Title each image with the last word of the person's name.
            ax.set_title(people.target_names[label].split()[-1])

In [None]:
# Fetch the LFW faces, restricted to people with at least 20 pictures and
# resized by a factor of 0.7.
people = fetch_lfw_people(min_faces_per_person=20, resize=0.7)

# Keep at most the first 50 images of every person so that no single identity
# dominates the data. The builtin `bool` replaces the `np.bool` alias, which
# NumPy deprecated in 1.20 and removed in 1.24.
mask = np.zeros(people.target.shape, dtype=bool)
for target in np.unique(people.target):
    mask[np.where(people.target == target)[0][:50]] = True

people_data_filtered = people.data[mask]
people_target_filtered = people.target[mask]
# Shape of one unflattened image, used when reshaping rows for display.
image_shape = people.images[0].shape

In [None]:
# Reduce the face vectors to 100 whitened principal components (fixed seed).
pca = PCA(n_components = 100, whiten = True, random_state = 314)
# Divide the pixel values by 255 before fitting.
# NOTE(review): this assumes the raw values are on a 0-255 scale -- confirm
# against what the installed scikit-learn's fetch_lfw_people returns.
X_scaled = people_data_filtered / 255.
X_pca = pca.fit_transform(X_scaled)
print("X_pca.shape: {}".format(X_pca.shape))

In [None]:
# Sweep DBSCAN settings over the PCA-reduced faces.
apply_dbscan(
    data=X_pca,
    min_clusters=2,
    min_samples=list(range(1, 6)),
    eps=[3.0, 3.5, 4.0, 4.5, 5, 5.5, 6, 7, 10, 11, 12],
)

In [None]:
# Cluster the PCA features with eps 7 / min_samples 3, then report which
# labels occur and how many points each one has (bin 0 is label -1).
dbscan = DBSCAN(eps=7, min_samples=3).fit(X_pca)
clusters = dbscan.labels_
print("unique labels: {}".format(np.unique(clusters)))
print("number of points per cluster: {}".format(np.bincount(clusters + 1)))

In [None]:
# Draw the faces belonging to each small cluster found on the PCA features.
display_dbscan_images(
    labels=clusters,
    data=people_data_filtered,
    targets=people_target_filtered,
)

In [None]:
# Show up to 30 of the faces that the PCA-based clustering labelled -1 (noise)
# in a 3 x 10 grid without axis ticks.
image_shape = people.images[0].shape
fig, axes = plt.subplots(
    nrows=3,
    ncols=10,
    figsize=(15, 8),
    subplot_kw={'xticks': [], 'yticks': []},
)
noise = people_data_filtered[clusters == -1]
# zip stops at the shorter sequence, so surplus images or axes are ignored.
for ax, image in zip(axes.flat, noise):
    ax.imshow(image.reshape(image_shape), cmap='gray')

In [None]:
# Embed the scaled face vectors with t-SNE, leaving every option except the
# seed at its default, and report the shape of the embedding.
tsne = TSNE(random_state = 42)
X_tsne = tsne.fit_transform(X_scaled)
print("X_tsne.shape: {}".format(X_tsne.shape))

In [None]:
# Sweep DBSCAN settings over the t-SNE embedding.
apply_dbscan(
    data=X_tsne,
    min_clusters=2,
    min_samples=list(range(1, 6)),
    eps=[3.0, 3.5, 4.0, 4.5, 5, 5.5, 6, 7, 10, 11, 12],
)

In [None]:
# Cluster the t-SNE embedding with eps 3 / min_samples 3 and keep the labels.
dbscan = DBSCAN(eps=3, min_samples=3).fit(X_tsne)
labels = dbscan.labels_

In [None]:
# Report which labels occur in the t-SNE clustering and the size of each one.
# The histogram is shifted by +1, so its first entry is the count for label -1.
print("unique labels: {}".format(np.unique(labels)))
print("number of points per cluster: {}".format(np.bincount(labels + 1)))

In [None]:
# Draw the faces belonging to each small cluster found on the t-SNE embedding.
display_dbscan_images(
    labels=labels,
    data=people_data_filtered,
    targets=people_target_filtered,
)

In [None]:
# Show up to 30 of the faces that the t-SNE-based clustering labelled -1
# (noise) in a 3 x 10 grid without axis ticks.
image_shape = people.images[0].shape
fig, axes = plt.subplots(
    nrows=3,
    ncols=10,
    figsize=(15, 8),
    subplot_kw={'xticks': [], 'yticks': []},
)
noise = people_data_filtered[labels == -1]
# zip stops at the shorter sequence, so surplus images or axes are ignored.
for ax, image in zip(axes.flat, noise):
    ax.imshow(image.reshape(image_shape), cmap='gray')