In [7]:
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

In [2]:
postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'heartdisease'

In [4]:
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(postgres_user, postgres_pw, postgres_host,
                                                           postgres_port, postgres_db))

df = pd.read_sql_query('SELECT * FROM heartdisease', con=engine)

engine.dispose()

# Define the features and the outcome
X = df.iloc[:, :13]
y = df.iloc[:, 13]

# Replace missing values (marked by ?) with a 0
X = X.replace(to_replace='?', value=0)

# Binarize y so that 1 means heart disease diagnosis and 0 means no diagnosis
y = np.where(y > 0, 0, 1)

Apply DBSCAN to the heart disease data by trying different values for eps and min_samples parameters. You'll realize that it's really hard to get a two cluster solution using DBSCAN if not impossible.

In [26]:
#standardizing features
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

eps_val = [2, 3, 4]
min_samp = [3, 4, 5, 6, 7]

for i in eps_val:
    for j in min_samp:
        #define DBSCAN cluster
        dbscan_cluster = DBSCAN(eps=i, min_samples=j)

        clusters = dbscan_cluster.fit_predict(X_std)

        #calculating metrics
        ari = metrics.adjusted_rand_score(y, clusters)
        ss = metrics.silhouette_score(X_std, clusters, metric='euclidean')
        
        print('for eps = {} and min_samples = {}'.format(i, j))
        print('Adj. Rand Index: {}'.format(ari))
        print('Silhouette Score: {}'.format(ss))
        print('\n')


for eps = 2 and min_samples = 3
Adj. Rand Index: 0.02347593686222429
Silhouette Score: -0.08928343209927057


for eps = 2 and min_samples = 4
Adj. Rand Index: 0.018230172604469658
Silhouette Score: -0.10841712711520761


for eps = 2 and min_samples = 5
Adj. Rand Index: 0.014409669379190209
Silhouette Score: -0.11852766126558806


for eps = 2 and min_samples = 6
Adj. Rand Index: 0.014947738155427543
Silhouette Score: -0.10960107669691291


for eps = 2 and min_samples = 7
Adj. Rand Index: 0.013514854268594646
Silhouette Score: -0.11140886429253431


for eps = 3 and min_samples = 3
Adj. Rand Index: 0.020646688946500475
Silhouette Score: 0.07069485137365351


for eps = 3 and min_samples = 4
Adj. Rand Index: 0.018784313764717165
Silhouette Score: 0.06913933996869988


for eps = 3 and min_samples = 5
Adj. Rand Index: 0.01705867403503028
Silhouette Score: 0.0035947969725671717


for eps = 3 and min_samples = 6
Adj. Rand Index: 0.013859023145093265
Silhouette Score: 0.045665635045545584


for 

Apply DBSCAN by setting parameters eps=1, min_samples=1, metric="euclidean". Then, increase the value of min_samples. What's the effect of increasing min_samples on the number of clusters DBSCAN identifies?

In [40]:
min_samp = [1, 2, 3, 4, 5]

for i in min_samp:
    #define DBSCAN cluster
    dbscan_cluster = DBSCAN(eps=1, min_samples=i, metric='euclidean')
    clusters = dbscan_cluster.fit_predict(X_std)

    print('for eps = {} and min_samples = {}'.format(1, i))
    print('unique clusters: {}'.format(len(np.unique(clusters))))
    print('\n')


for eps = 1 and min_samples = 1
unique clusters: 294


for eps = 1 and min_samples = 2
unique clusters: 9


for eps = 1 and min_samples = 3
unique clusters: 2


for eps = 1 and min_samples = 4
unique clusters: 1


for eps = 1 and min_samples = 5
unique clusters: 1




By increasing the value of min_samples we can see that the value of clusters shrinks. This is because we are increasing the density requirement. When we have min_samples = 1 then each sample becomes a cluster. As this threshold increases the number of unique clusters drops quickly as we are using more strict definitions of what a 'cluster' is. 

Apply DBSCAN by setting parameters eps=1, min_samples=1, metric="euclidean". Then, increase the value of eps. What's the effect of increasing eps on the number of clusters DBSCAN identifies?

In [42]:
eps_val = [1, 2, 3, 4, 5]

for i in eps_val:
    #define DBSCAN cluster
    dbscan_cluster = DBSCAN(eps=i, min_samples=1, metric='euclidean')
    clusters = dbscan_cluster.fit_predict(X_std)

    print('for eps = {} and min_samples = {}'.format(i, 1))
    print('unique clusters: {}'.format(len(np.unique(clusters))))
    print('\n')

for eps = 1 and min_samples = 1
unique clusters: 294


for eps = 2 and min_samples = 1
unique clusters: 178


for eps = 3 and min_samples = 1
unique clusters: 34


for eps = 4 and min_samples = 1
unique clusters: 3


for eps = 5 and min_samples = 1
unique clusters: 1




By increasing the radius around the points of interest (epsilon) we decrease the number of clusters that are identified because we are increasing the area that each cluster covers. 