In [2]:
!pip install ucimlrepo &> null
print("ucimlrepo installed successfully")

!pip install pycaret &> /dev/null
print ("Pycaret installed sucessfully!!")

ucimlrepo installed successfully
Pycaret installed sucessfully!!


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [4]:
from ucimlrepo import fetch_ucirepo

# Fetch the UCI repository dataset with id 45 and keep the returned object
# so its feature table, target table and variable listing can be read from it.
heart_disease = fetch_ucirepo(id=45)

# X: feature table, y: target table (both taken from the fetched dataset).
X, y = heart_disease.data.features, heart_disease.data.targets

# Show the variable listing that ships with the dataset.
print(heart_disease.variables)

{'uci_id': 45, 'name': 'Heart Disease', 'repository_url': 'https://archive.ics.uci.edu/dataset/45/heart+disease', 'data_url': 'https://archive.ics.uci.edu/static/public/45/data.csv', 'abstract': '4 databases: Cleveland, Hungary, Switzerland, and the VA Long Beach', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 303, 'num_features': 13, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': ['Age', 'Sex'], 'target_col': ['num'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1989, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C52P4X', 'creators': ['Andras Janosi', 'William Steinbrunn', 'Matthias Pfisterer', 'Robert Detrano'], 'intro_paper': {'title': 'International application of a new probability algorithm for the diagnosis of coronary artery disease.', 'authors': 'R. Detrano, A. Jánosi, W. Steinbrunn, M. Pfisterer, J. Schmid, S. Sa

In [7]:
# The star import makes PyCaret's clustering functions (setup, create_model, ...)
# available as bare names for the cells below.
from pycaret.clustering import *

# Initialise a clustering experiment on the feature table with verbose output off.
model = setup(data=X, verbose=False)

In [11]:
# The index of the model table holds the estimator IDs; read it off the
# "Name" column of the table returned by the experiment.
models_list = model.models()['Name'].index
print(models_list)

Index(['kmeans', 'ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics',
       'birch'],
      dtype='object', name='ID')


In [12]:
# Estimator IDs compared in this notebook.
SELECTED_MODEL_IDS = ['kmeans', 'meanshift', 'hclust']

# Select by ID rather than by position: the filter does not depend on the order
# of the model registry, and re-running this cell is harmless (positional
# indexing with [0, 2, 4] raised IndexError on the already-filtered index).
models_list = models_list[models_list.isin(SELECTED_MODEL_IDS)]
print('Clustering Models Taken: ', models_list)

Clustering Models Taken:  Index(['kmeans', 'meanshift', 'hclust'], dtype='object', name='ID')


In [14]:
# Preprocessing scenarios to compare. Each entry maps a human-readable label to
# the keyword arguments that are forwarded to setup() for that scenario.
parameters = {
    'No Data Processing': dict(transformation=False, normalize=False, pca=False),
    'Using Normalisation': dict(transformation=False, normalize=True, pca=False),
    'Using Transform': dict(transformation=True, normalize=False, pca=False),
    'Using PCA': dict(transformation=False, normalize=False, pca=True),
    'T+N': dict(transformation=True, normalize=True, pca=False),
    'T+N+PCA': dict(transformation=True, normalize=True, pca=True),
}

In [15]:
# Fixed seed passed to every setup() call so that repeated runs of this cell
# produce the same scores (without it each run uses a different random state).
SEED = 42

# Metric rows that are both saved to CSV and displayed for each estimator.
REPORTED_METRICS = ['Silhouette', 'Calinski-Harabasz', 'Davies-Bouldin']

# NOTE(review): `results` is never filled in this cell; kept in case a later
# cell reads it -- confirm and remove if unused.
results = []

# For every selected estimator, score each preprocessing scenario at 3, 4 and 5
# clusters, then save and show one table per estimator.
# The loop variable is deliberately NOT called `model`: that name holds the
# experiment object created earlier and must survive this cell.
for model_name in models_list:
    # One metrics frame per (cluster size, scenario); concatenated once below
    # instead of growing a DataFrame inside the loop.
    metric_frames = []

    for size in range(3, 6):
        for name, args in parameters.items():
            # Fresh experiment for this preprocessing scenario.
            exp = setup(X, verbose=False, session_id=SEED, **args)
            # NOTE(review): the recorded meanshift scores are identical for all
            # cluster sizes, which suggests num_clusters is not used by that
            # estimator -- confirm against the PyCaret documentation.
            exp.create_model(model_name, num_clusters=size, verbose=False)
            # pull() returns the score grid of the model just created; tag it
            # with the scenario label and the requested cluster count.
            metric_frames.append(exp.pull().assign(name=name, cluster_size=size))

    model_results = (
        pd.concat(metric_frames, ignore_index=True)
        .set_index(['name', 'cluster_size'])
    )

    # Metrics as rows, (scenario, cluster size) pairs as sorted columns.
    model_results_transposed = model_results.sort_index().T

    # Save exactly the rows that are displayed (the CSV previously dropped
    # the Davies-Bouldin row that the on-screen table included).
    reported = model_results_transposed.loc[REPORTED_METRICS]
    reported.to_csv(model_name + '.csv')

    print(model_name)
    display(reported)


kmeans


name,No Data Processing,No Data Processing,No Data Processing,T+N,T+N,T+N,T+N+PCA,T+N+PCA,T+N+PCA,Using Normalisation,Using Normalisation,Using Normalisation,Using PCA,Using PCA,Using PCA,Using Transform,Using Transform,Using Transform
cluster_size,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5
Silhouette,0.2812,0.2799,0.2768,0.1335,0.1356,0.1158,0.1353,0.1333,0.1193,0.1296,0.1333,0.1126,0.2821,0.2799,0.2778,0.5602,0.5469,0.5384
Calinski-Harabasz,195.4521,177.0044,169.3541,48.7306,41.709,37.1282,48.7772,41.677,36.5903,47.471,40.9062,35.7,195.5082,177.0044,169.3746,801.8385,922.5416,1017.3824
Davies-Bouldin,1.1562,1.0647,1.0583,2.3147,2.1029,2.2122,2.2734,2.1312,2.1511,2.2686,2.0932,2.1009,1.1533,1.0647,1.0674,0.5415,0.5311,0.5332


meanshift


name,No Data Processing,No Data Processing,No Data Processing,T+N,T+N,T+N,T+N+PCA,T+N+PCA,T+N+PCA,Using Normalisation,Using Normalisation,Using Normalisation,Using PCA,Using PCA,Using PCA,Using Transform,Using Transform,Using Transform
cluster_size,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5
Silhouette,0.6378,0.6378,0.6378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6378,0.6378,0.6378,0.0,0.0,0.0
Calinski-Harabasz,54.463,54.463,54.463,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,54.463,54.463,54.463,0.0,0.0,0.0
Davies-Bouldin,0.5276,0.5276,0.5276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5276,0.5276,0.5276,0.0,0.0,0.0


hclust


name,No Data Processing,No Data Processing,No Data Processing,T+N,T+N,T+N,T+N+PCA,T+N+PCA,T+N+PCA,Using Normalisation,Using Normalisation,Using Normalisation,Using PCA,Using PCA,Using PCA,Using Transform,Using Transform,Using Transform
cluster_size,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5
Silhouette,0.2443,0.2537,0.1995,0.1426,0.1123,0.0921,0.1426,0.1123,0.0921,0.1624,0.1193,0.1062,0.2443,0.2537,0.1995,0.5585,0.5403,0.5543
Calinski-Harabasz,174.0719,154.4552,145.7324,39.0832,35.3045,31.181,39.0832,35.3045,31.181,40.7073,37.7609,32.7531,174.0719,154.4552,145.7324,781.6421,877.0004,951.6959
Davies-Bouldin,1.2725,1.1425,1.2998,2.1858,2.231,2.4554,2.1858,2.231,2.4554,2.0277,2.1313,2.285,1.2725,1.1425,1.2998,0.5343,0.5204,0.5235


