In [23]:
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

In [24]:
# Location of the NHANES age-prediction CSV; the path is relative, so it is
# resolved against the notebook's working directory.
dataset_path = "NHANES_age_prediction.csv"

# Parse the CSV into a DataFrame that the following cells select columns from.
df = pd.read_csv(filepath_or_buffer=dataset_path)

# Bare trailing expression so the notebook renders the loaded frame.
df

Unnamed: 0,SEQN,age_group,RIDAGEYR,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN
0,73564.0,Adult,61.0,2.0,2.0,35.7,110.0,2.0,150.0,14.91
1,73568.0,Adult,26.0,2.0,2.0,20.3,89.0,2.0,80.0,3.85
2,73576.0,Adult,16.0,1.0,2.0,23.2,89.0,2.0,68.0,6.14
3,73577.0,Adult,32.0,1.0,2.0,28.9,104.0,2.0,84.0,16.15
4,73580.0,Adult,38.0,2.0,1.0,35.9,103.0,2.0,81.0,10.92
...,...,...,...,...,...,...,...,...,...,...
2273,83711.0,Adult,38.0,2.0,2.0,33.5,100.0,2.0,73.0,6.53
2274,83712.0,Adult,61.0,1.0,2.0,30.0,93.0,2.0,208.0,13.02
2275,83713.0,Adult,34.0,1.0,2.0,23.7,103.0,2.0,124.0,21.41
2276,83718.0,Adult,60.0,2.0,2.0,27.4,90.0,2.0,108.0,4.99


In [25]:
# Columns fed to the clustering algorithms: RIDAGEYR and RIAGENDR.
features = df.loc[:, ['RIDAGEYR', 'RIAGENDR']]

# The age_group column is kept aside under its own name; the clustering cells
# shown in this notebook do not read it.
target = df.loc[:, 'age_group']

# Bare trailing expression so the notebook renders the selected columns.
features

Unnamed: 0,RIDAGEYR,RIAGENDR
0,61.0,2.0
1,26.0,2.0
2,16.0,1.0
3,32.0,1.0
4,38.0,2.0
...,...,...
2273,38.0,2.0
2274,61.0,1.0
2275,34.0,1.0
2276,60.0,2.0


In [26]:
# Shared preprocessing objects reused by the comparison loop further down:
# a standardiser, a min-max rescaler, and a two-component PCA projection.
scaler, min_max_scaler = StandardScaler(), MinMaxScaler()
pca = PCA(n_components=2)

In [27]:
def perform_clustering_and_evaluate(data, method):
    """Cluster ``data`` with the requested algorithm and score the partition.

    Parameters
    ----------
    data : array-like
        Feature matrix passed unchanged to the estimator's ``fit_predict`` and
        to the three scikit-learn metric functions.
    method : str
        Which algorithm to run: ``'kmeans'`` (2 clusters, ``random_state=42``),
        ``'dbscan'`` (``eps=0.5``, ``min_samples=5``) or ``'hierarchical'``
        (agglomerative, 2 clusters).

    Returns
    -------
    tuple of (float, float, float)
        ``(silhouette, calinski_harabasz, davies_bouldin)`` for the labelling.
        All three are NaN when the labelling is degenerate (fewer than two
        distinct labels, or one label per sample), because none of the metrics
        is defined in that case.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    # Lazy factories: only the requested estimator is constructed, and an
    # unsupported name is rejected before any estimator is touched.
    estimator_factories = {
        # n_init is pinned to 10 (the long-standing default) so results do not
        # drift when the library default changes, and so the per-fit warning
        # about the changing default is no longer emitted.
        'kmeans': lambda: KMeans(n_clusters=2, n_init=10, random_state=42),
        'dbscan': lambda: DBSCAN(eps=0.5, min_samples=5),
        'hierarchical': lambda: AgglomerativeClustering(n_clusters=2),
    }
    if method not in estimator_factories:
        supported = ', '.join(sorted(estimator_factories))
        raise ValueError(
            f"Unknown clustering method {method!r}; expected one of: {supported}"
        )

    labels = estimator_factories[method]().fit_predict(data)

    # The metric functions raise unless 2 <= n_labels <= n_samples - 1.
    # DBSCAN in particular can collapse everything into a single label, so
    # report NaN instead of aborting the caller's whole comparison run.
    # NOTE(review): DBSCAN's noise label (-1) is scored like any other cluster
    # here, as in the original code - confirm that this is intended.
    distinct_label_count = len(set(labels))
    if not 1 < distinct_label_count < len(labels):
        undefined = float('nan')
        return undefined, undefined, undefined

    silhouette = silhouette_score(data, labels)
    calinski_harabasz = calinski_harabasz_score(data, labels)
    davies_bouldin = davies_bouldin_score(data, labels)

    return silhouette, calinski_harabasz, davies_bouldin


In [28]:
# One accumulator list per clustering method; the comparison loop appends one
# score record per preprocessing variant to each of them.
results_kmeans, results_dbscan, results_hierarchical = [], [], []

# Names of the clustering methods compared in this notebook.
methods = ['kmeans', 'dbscan', 'hierarchical']

In [29]:
# Start from empty accumulators every time this cell runs, so re-executing it
# does not append a second copy of every row to the result tables.
results_kmeans, results_dbscan, results_hierarchical = [], [], []
results_by_method = {
    'kmeans': results_kmeans,
    'dbscan': results_dbscan,
    'hierarchical': results_hierarchical,
}

# Build each preprocessing variant once, naming the intermediates instead of
# re-fitting the same scalers inside nested calls.
features_standardized = scaler.fit_transform(features)
features_min_max = min_max_scaler.fit_transform(features)
features_standardized_min_max = min_max_scaler.fit_transform(features_standardized)
features_standardized_min_max_pca = pca.fit_transform(features_standardized_min_max)

# In the labels below, 'Normalized' is the StandardScaler output and
# 'Transformed' is the MinMaxScaler output. The strings end up in the exported
# CSV files, so they are kept exactly as they were.
preprocessing_variants = [
    ('Original', features),
    ('Normalized', features_standardized),
    ('Transformed', features_min_max),
    ('Normalized + Transformed', features_standardized_min_max),
    ('Normalized + Transformed + PCA', features_standardized_min_max_pca),
]

for description, data in preprocessing_variants:
    # Dict order is kmeans, dbscan, hierarchical - the same order the
    # methods were evaluated in before.
    for method, method_results in results_by_method.items():
        silhouette, calinski_harabasz, davies_bouldin = perform_clustering_and_evaluate(data, method)
        method_results.append({
            'Preprocessing': description,
            'Silhouette': silhouette,
            'Calinski-Harabasz': calinski_harabasz,
            'Davies-Bouldin': davies_bouldin
        })

# One table per method, one row per preprocessing variant.
kmeans = pd.DataFrame(results_kmeans)
dbscan = pd.DataFrame(results_dbscan)
hierarchy = pd.DataFrame(results_hierarchical)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [30]:
# Write each method's score table to its own CSV file, without the row index.
for score_table, csv_name in (
    (kmeans, 'results_kmeans_102103080.csv'),
    (dbscan, 'results_dbscan_102103080.csv'),
    (hierarchy, 'results_hierarchical_102103080.csv'),
):
    score_table.to_csv(csv_name, index=False)