In [11]:
import os

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine

In [2]:
# Database connection settings.
# Each value can be overridden through an environment variable so credentials
# do not have to be edited into (or committed with) the notebook. The literals
# are kept only as fallbacks so the notebook still runs unchanged.
# NOTE(review): prefer supplying the password via HEARTDISEASE_DB_PASSWORD and
# dropping the literal fallback once the environment is configured.
postgres_user = os.environ.get('HEARTDISEASE_DB_USER', 'dsbc_student')
postgres_pw = os.environ.get('HEARTDISEASE_DB_PASSWORD', '7*.8G9QH21')
postgres_host = os.environ.get('HEARTDISEASE_DB_HOST', '142.93.121.174')
postgres_port = os.environ.get('HEARTDISEASE_DB_PORT', '5432')
postgres_db = os.environ.get('HEARTDISEASE_DB_NAME', 'heartdisease')

In [5]:
# Connect to the PostgreSQL database using the settings from the configuration cell.
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(postgres_user, postgres_pw, postgres_host,
                                                           postgres_port, postgres_db))

try:
    # Pull the whole heartdisease table into a DataFrame.
    df = pd.read_sql_query('SELECT * FROM heartdisease', con=engine)
finally:
    # Release the engine's connection pool even when the query raises,
    # so a failed load does not leave connections open in the kernel.
    engine.dispose()

# Define the features and the outcome:
# the first 13 columns are the features, the 14th column is the outcome.
X = df.iloc[:, :13]
y = df.iloc[:, 13]

# Replace missing values (marked by ?) with a 0
X = X.replace(to_replace='?', value=0)

# Binarize y so that 1 means heart disease diagnosis and 0 means no diagnosis.
# The previous expression, np.where(y > 0, 0, 1), produced the opposite
# encoding (positive outcomes became 0). Label-permutation-invariant metrics
# such as the adjusted Rand index are unaffected by this correction.
y = np.where(y > 0, 1, 0)

Apply GMM to the heart disease data by setting n_components=2. Get the ARI and silhouette scores for your solution and compare them with those of the k-means and hierarchical clustering solutions that you implemented in the assignments of the previous checkpoints. Which algorithm performs better?

In [24]:
# Standardize the features before clustering (StandardScaler: zero mean and
# unit variance per column).
scaler = StandardScaler()
X_std = scaler.fit(X).transform(X)

# Gaussian mixture model with two components; the fixed random_state keeps
# the initialization reproducible across runs.
gmm_cluster = GaussianMixture(random_state=123, n_components=2)

# Fit the mixture and label each row with its most probable component.
clusters = gmm_cluster.fit_predict(X_std)

In [25]:
# Agreement between the cluster assignments and the binarized outcome.
ari = metrics.adjusted_rand_score(labels_true=y, labels_pred=clusters)
# Silhouette score of the assignments, computed on the standardized features.
sil_score = metrics.silhouette_score(X_std, labels=clusters, metric='euclidean')

# Report both scores, one per line.
print('adjusted rand index: {}'.format(ari),
      'silhouette score: {}'.format(sil_score),
      sep='\n')

adjusted rand index: 0.18389186035089963
silhouette score: 0.13628813153331445


GMM scores lower than both k-means and hierarchical clustering in terms of ARI and silhouette scores. 

The scikit-learn implementation of GMM has a parameter called covariance_type, which determines the type of covariance parameters to use. Specifically, there are four types you can specify:

    full: This is the default. Each component has its own general covariance matrix.
    tied: All components share the same general covariance matrix.
    diag: Each component has its own diagonal covariance matrix.
    spherical: Each component has its own single variance.

Try all of these. Which one performs better in terms of ARI and silhouette scores?

In [26]:
# Covariance structures supported by GaussianMixture, tried one at a time.
covariance_types = ['full', 'tied', 'diag', 'spherical']

for cov_type in covariance_types:
    # Two-component Gaussian mixture restricted to the current covariance type.
    gmm_cluster = GaussianMixture(covariance_type=cov_type, n_components=2, random_state=123)

    # Fit the mixture and label each row with its most probable component.
    clusters = gmm_cluster.fit_predict(X_std)

    # Score the assignments against the outcome (ARI) and on the
    # standardized features themselves (silhouette).
    ari = metrics.adjusted_rand_score(labels_true=y, labels_pred=clusters)
    sil_score = metrics.silhouette_score(X_std, labels=clusters, metric='euclidean')

    # One report per covariance type, followed by blank lines as a separator.
    print('covariance type: {}'.format(cov_type),
          'adjusted rand index: {}'.format(ari),
          'silhouette score: {}'.format(sil_score),
          '\n',
          sep='\n')

covariance type: full
adjusted rand index: 0.18389186035089963
silhouette score: 0.13628813153331445


covariance type: tied
adjusted rand index: 0.18389186035089963
silhouette score: 0.13628813153331445


covariance type: diag
adjusted rand index: 0.18389186035089963
silhouette score: 0.13628813153331445


covariance type: spherical
adjusted rand index: 0.20765243525722465
silhouette score: 0.12468753110276873




The spherical covariance type achieves a higher ARI (about 0.208 versus 0.184) but a lower silhouette score (about 0.125 versus 0.136) than the other three. The full, tied, and diag covariance types produce identical scores.