In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.metrics.cluster import adjusted_rand_score

In [2]:
# Read the Wisconsin breast-cancer CSV from the working directory into `data`
data = pd.read_csv(filepath_or_buffer="wisc_bc_ContinuousVar.csv")



In [3]:
# Print a summary of the frame: column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [4]:
# Remove, in place, every row that contains at least one missing value
data.dropna(axis=0, how="any", inplace=True)

In [5]:
# Separate the inputs from the label: every column except the identifier and
# the diagnosis becomes a feature; the diagnosis column is kept as the target
X = data.drop(columns=["id", "diagnosis"])
y = data.loc[:, "diagnosis"]

In [7]:
# Rescale the feature variables with min-max scaling (MinMaxScaler's default
# output range is [0, 1]); this is range normalisation, not z-score
# standardisation. The scaler is fitted on X and then applied to that same X.
scaler = MinMaxScaler()
X = scaler.fit(X).transform(X)


In [9]:
# Applying hierarchical clustering (hclust) with two clusters.
# Ward linkage only supports Euclidean distance, which is already the default,
# so the distance argument is omitted. The old `affinity` keyword was
# deprecated in scikit-learn 1.2 and removed in 1.4 (renamed to `metric`);
# passing it raises a TypeError on current releases, while omitting it works
# on both old and new versions and yields the same clustering.
hclust = AgglomerativeClustering(n_clusters=2, linkage='ward')
hclust.fit(X)

AgglomerativeClustering()

In [10]:
# Adding cluster labels to the dataset: store the per-row cluster index found
# by the fitted hierarchical model as a new `hclust_labels` column.
# The labels are a plain array, so they are matched to rows by position;
# X (what the model was fitted on) was derived from this same `data` frame.
data['hclust_labels'] = hclust.labels_

In [11]:
# Contingency table: diagnosis (rows) versus hierarchical cluster id (columns)
hclust_vs_diagnosis = pd.crosstab(data['diagnosis'], data['hclust_labels'])
print(hclust_vs_diagnosis)


hclust_labels    0    1
diagnosis              
B               27  330
M              164   48


In [17]:
# Adjusted Rand Index between the true diagnosis and the hierarchical clusters
hclust_ari = adjusted_rand_score(
    labels_true=data['diagnosis'],
    labels_pred=data['hclust_labels'],
)

# Report the score
print('ARI Score for hclust:', hclust_ari)

ARI Score for hclust: 0.5383388964592652


In [12]:
# Applying k-means clustering with two clusters.
# k-means starts from randomly chosen centroids, so without a fixed seed the
# cluster numbering (and potentially the assignment) can change between runs.
# random_state pins the initialisation so the crosstab and ARI computed from
# these labels are reproducible on Restart & Run All. n_init is set explicitly
# to 10 because its default changed to 'auto' in newer scikit-learn releases
# (older releases emit a FutureWarning when it is left unset).
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans.fit(X)

KMeans(n_clusters=2)

In [13]:
# Adding cluster labels to the dataset: store the per-row cluster index found
# by the fitted k-means model as a new `kmeans_labels` column.
# The labels are a plain array, so they are matched to rows by position;
# X (what the model was fitted on) was derived from this same `data` frame.
data['kmeans_labels'] = kmeans.labels_

In [14]:
# Contingency table: diagnosis (rows) versus k-means cluster id (columns)
kmeans_vs_diagnosis = pd.crosstab(data['diagnosis'], data['kmeans_labels'])
print(kmeans_vs_diagnosis)


kmeans_labels    0    1
diagnosis              
B                9  348
M              180   32


In [16]:
# Adjusted Rand Index between the true diagnosis and the k-means clusters
kmeans_ari = adjusted_rand_score(
    labels_true=data['diagnosis'],
    labels_pred=data['kmeans_labels'],
)

# Report the score
print('ARI Score for k-means:', kmeans_ari)

ARI Score for k-means: 0.7301749027614344
