Instructions:
-

1. Read the article: https://www.sciencedirect.com/science/article/abs/pii/S0031320322001753
2. Replicate the study using the same dataset.


In [1]:
from ucimlrepo import fetch_ucirepo
import pandas as pd

# Fetching the datasets
soybean_data = fetch_ucirepo('soybean')
zoo_data = fetch_ucirepo('zoo')
heart_disease_data = fetch_ucirepo('heart disease')
breast_cancer_data = fetch_ucirepo('breast cancer')
dermatology_data = fetch_ucirepo('dermatology')
mushroom_data = fetch_ucirepo('mushroom')


In [2]:
datasets = [
    ('soybean_data', 'soybean_df'),
    ('zoo_data', 'zoo_df'),
    ('heart_disease_data', 'heart_disease_df'),
    ('breast_cancer_data', 'breast_cancer_df'),
    ('dermatology_data', 'dermatology_df'),
    ('mushroom_data', 'mushroom_df')
]

# Loop over datasets to fetch and create dataframes
for dataset_name, dataframe_name in datasets:
    data = fetch_ucirepo(dataset_name.split('_')[0])
    X = data.data.features
    y = data.data.targets
    df = pd.merge(X, y, left_index=True, right_index=True)
    df = df.dropna()
    globals()[dataframe_name] = df

In [3]:
soybean_df

Unnamed: 0,date,plant-stand,precip,temp,hail,crop-hist,area-damaged,severity,seed-tmt,germination,...,sclerotia,fruit-pods,fruit-spots,seed,mold-growth,seed-discolor,seed-size,shriveling,roots,class
0,6.0,0.0,2.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,diaporthe-stem-canker
1,4.0,0.0,2.0,1.0,0.0,2.0,0.0,2.0,1.0,1.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,diaporthe-stem-canker
2,3.0,0.0,2.0,1.0,0.0,1.0,0.0,2.0,1.0,2.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,diaporthe-stem-canker
3,3.0,0.0,2.0,1.0,0.0,1.0,0.0,2.0,0.0,1.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,diaporthe-stem-canker
4,6.0,0.0,2.0,1.0,0.0,2.0,0.0,1.0,0.0,2.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,diaporthe-stem-canker
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
285,5.0,1.0,2.0,1.0,0.0,1.0,2.0,1.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,frog-eye-leaf-spot
286,4.0,0.0,2.0,2.0,0.0,1.0,3.0,1.0,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,frog-eye-leaf-spot
287,5.0,0.0,2.0,1.0,0.0,1.0,2.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,frog-eye-leaf-spot
288,5.0,0.0,2.0,2.0,0.0,2.0,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,frog-eye-leaf-spot


In [4]:
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import fowlkes_mallows_score, adjusted_rand_score, normalized_mutual_info_score
import numpy as np

datasets = ["soybean_df", "zoo_df", "heart_disease_df", "dermatology_df", "breast_cancer_df", "mushroom_df"]

results = []

for dataset_name in datasets:
    # Fetch dataset
    df = globals()[dataset_name]
    
    # Assuming the last column is the target/label column
    X = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values
    
    # Label encode the target variable if it's categorical
    le = LabelEncoder()
    y = le.fit_transform(y)
    
    # Handle categorical features using one-hot encoding if present
    if np.any(pd.DataFrame(X).dtypes == 'object'):
        # One-hot encode categorical features
        categorical_columns = np.where(pd.DataFrame(X).dtypes == 'object')[0]
        onehot_encoder = OneHotEncoder(categories='auto')
        X_categorical = onehot_encoder.fit_transform(X[:, categorical_columns])
        
        # Reshape one-hot encoded features to match the number of dimensions of numerical features
        X_categorical = X_categorical.toarray().reshape(-1, X_categorical.shape[1])
        
        # Concatenate one-hot encoded features with numerical features
        X_numeric = np.delete(X, categorical_columns, axis=1)
        X = np.hstack((X_numeric, X_categorical))
    
    # Perform any necessary preprocessing (e.g., scaling)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Perform clustering (example using KMeans)
    kmeans = KMeans(n_clusters=len(np.unique(y)))
    predicted_labels = kmeans.fit_predict(X_scaled)
    
    # Compute performance metrics
    ari = adjusted_rand_score(y, predicted_labels)
    nmi = normalized_mutual_info_score(y, predicted_labels)
    fmi = fowlkes_mallows_score(y, predicted_labels)
    
    # Append results to the results list
    results.append([dataset_name, ari, nmi, fmi])

# Convert results to DataFrame
results_df = pd.DataFrame(results, columns=["Dataset", "ARI", "NMI", "FMI"])
print(results_df)


            Dataset       ARI       NMI       FMI
0        soybean_df  0.464300  0.747133  0.520112
1            zoo_df  0.770854  0.832474  0.822987
2  heart_disease_df  0.151906  0.229268  0.386662
3    dermatology_df  0.711411  0.875836  0.768963
4  breast_cancer_df  0.181071  0.086418  0.683552
5       mushroom_df -0.030095  0.063189  0.622972


3. Read articles about Adjusted Rand Index, Normalized Mutual Information, and Folkes-Mallows Index (only use paper published in IEEE, sciencedirect, springerlink, Taylor Francis).
4. Aside from the Adjusted Rand Index (ARI), and Normalized Mutual Information (NMI), use the Folkes-Mallows Index (FMI), and compare the result of each performance index.
5. Compare and contrast each performance index, what are the advantages and disadvantages of ARI, NMI, and FMI, and when to use each?
6. Using Kmodes and Hierarchical Clustering, use the same dataset and perform categorical data clustering, use FMI, ARI, and NMI for the comparison of performance.
7. Write your report using Latex. Your report should be focused on the "why's and the what's" of each performance metrices (i.e. why is FMI always greater than ARI and NMI? What's the problem with ARI and NMI?).

In [6]:
import warnings

results_categorical = []

for dataset_name in datasets:
    # Fetch dataset
    df = globals()[dataset_name]
    
    # Assuming the last column is the target/label column
    X = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values
    
    # Perform clustering using K-Modes
    kmodes = KModes(n_clusters=len(np.unique(y)), init='Huang', n_init=5, verbose=0)
    predicted_labels_kmodes = kmodes.fit_predict(X)
    
    # Perform clustering using Hierarchical Clustering
    hierarchical = AgglomerativeClustering(n_clusters=len(np.unique(y)))
    predicted_labels_hierarchical = hierarchical.fit_predict(X)
    
    # Compute performance metrics for K-Modes
    ari_kmodes = adjusted_rand_score(y, predicted_labels_kmodes)
    nmi_kmodes = normalized_mutual_info_score(y, predicted_labels_kmodes)
    fmi_kmodes = fowlkes_mallows_score(y, predicted_labels_kmodes)
    
    # Compute performance metrics for Hierarchical Clustering
    ari_hierarchical = adjusted_rand_score(y, predicted_labels_hierarchical)
    nmi_hierarchical = normalized_mutual_info_score(y, predicted_labels_hierarchical)
    fmi_hierarchical = fowlkes_mallows_score(y, predicted_labels_hierarchical)
    
    # Append results to the results list
    results_categorical.append([dataset_name + " (K-Modes)", ari_kmodes, nmi_kmodes, fmi_kmodes])
    results_categorical.append([dataset_name + " (Hierarchical)", ari_hierarchical, nmi_hierarchical, fmi_hierarchical])

# Convert results to DataFrame
results_categorical_df = pd.DataFrame(results_categorical, columns=["Dataset", "ARI", "NMI", "FMI"])
print(results_categorical_df)

NameError: name 'KModes' is not defined