In [70]:
import pandas as pd
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

In [71]:
# Read the Auto MPG table from the UCI repository into `df`.
# `names=` supplies the column labels, fields are split on runs of whitespace,
# and '?' entries are parsed as NaN.
column_names = [
    "mpg",
    "cylinders",
    "displacement",
    "horsepower",
    "weight",
    "acceleration",
    "model_year",
    "origin",
    "car_name",
]
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
df = pd.read_csv(
    url,
    sep=r'\s+',
    names=column_names,
    na_values='?',
    skipinitialspace=True,
)

In [72]:
# Select the continuous numeric columns into their own frame (`data`).
continuous_features = [
    "mpg",
    "displacement",
    "horsepower",
    "weight",
    "acceleration",
]
data = df.loc[:, continuous_features]

In [73]:
# Mean imputation: every NaN is replaced by the mean of its own column.
feature_means = data.mean()
data = data.fillna(feature_means)

In [74]:
# Standardize the features: learn the scaling parameters from `data`,
# then apply them to the same frame.
scaler = StandardScaler()
scaler.fit(data)
data_scaled = scaler.transform(data)

In [75]:
# Agglomerative (hierarchical) clustering of the standardized features into
# three groups, using average linkage on Euclidean distance.
# The keyword is `metric` because scikit-learn >= 1.2 renamed `affinity` to `metric`.
clustering = AgglomerativeClustering(metric='euclidean', linkage='average', n_clusters=3)
clustering.fit(data_scaled)

# Store each row's cluster id alongside the original columns.
df['cluster'] = clustering.labels_

In [76]:
# Per-cluster mean and variance of the continuous features, treating the
# cluster id as the class label.
# NOTE: the statistics are taken from `df`, not from the imputed `data` frame,
# so any NaN still present in `df` is skipped by mean()/var().
by_cluster = df.groupby('cluster')[continuous_features]
cluster_mean = by_cluster.mean()
cluster_var = by_cluster.var()

for heading, table in (
    ("\nMean of each cluster:\n", cluster_mean),
    ("\nVariance of each cluster:\n", cluster_var),
):
    print(heading, table)


Mean of each cluster:
                mpg  displacement  horsepower       weight  acceleration
cluster                                                                
0        26.177441    144.304714   86.120275  2598.414141     16.425589
1        14.528866    348.020619  161.804124  4143.969072     12.641237
2        43.700000     91.750000   49.000000  2133.750000     22.875000

Variance of each cluster:
                mpg  displacement  horsepower         weight  acceleration
cluster                                                                  
0        41.303375   3511.485383  294.554450  299118.709664      4.875221
1         4.771033   2089.499570  674.075816  193847.051117      3.189948
2         0.300000     12.250000    4.000000   21672.916667      2.309167


In [77]:
# Per-origin mean and variance of the continuous features, treating the
# origin column as the class label.
# NOTE: the statistics are taken from `df`, not from the imputed `data` frame,
# so any NaN still present in `df` is skipped by mean()/var().
by_origin = df.groupby('origin')[continuous_features]
origin_mean = by_origin.mean()
origin_var = by_origin.var()

for heading, table in (
    ("\nMean under the origin category:\n", origin_mean),
    ("\nVariance under the origin category:\n", origin_var),
):
    print(heading, table)


Mean under the origin category:
               mpg  displacement  horsepower       weight  acceleration
origin                                                                
1       20.083534    245.901606  119.048980  3361.931727     15.033735
2       27.891429    109.142857   80.558824  2423.300000     16.787143
3       30.450633    102.708861   79.835443  2221.227848     16.172152

Variance under the origin category:
               mpg  displacement   horsepower         weight  acceleration
origin                                                                   
1       40.997026   9702.612255  1591.833657  631695.128385      7.568615
2       45.211230    509.950311   406.339772  240142.328986      9.276209
3       37.088685    535.465433   317.523856  102718.485881      3.821779


In [78]:
# How do the cluster assignments line up with the origin label?
# Contingency table: one row per cluster id, one column per origin code.
cross_tab = pd.crosstab(index=df['cluster'], columns=df['origin'])
print("\nCross-tabulation of Cluster and Origin:\n", cross_tab)

# Hand-written commentary on the table; the text is fixed and is not derived
# from the values in `cross_tab`.
commentary = (
    "\nExpanation:",
    "Based on the cross-tabulation, it can be inferred that there might be a certain relationship between the clustering and the origin label.",
    "Especially for clusters 1 and 2, whose samples are almost entirely composed of a single origin category.",
    "This suggests that the clustering results may be influenced by the origin category.\n",
)
print(*commentary, sep="\n")



Cross-tabulation of Cluster and Origin:
 origin     1   2   3
cluster             
0        152  66  79
1         97   0   0
2          0   4   0

Expanation:
Based on the cross-tabulation, it can be inferred that there might be a certain relationship between the clustering and the origin label.
Especially for clusters 1 and 2, whose samples are almost entirely composed of a single origin category.
This suggests that the clustering results may be influenced by the origin category.



In [79]:
# Clustering effect score
# The silhouette score lies in [-1, 1]; higher values mean tighter, better
# separated clusters. It is computed on the same standardized features that
# were passed to the clustering step.
sil_score = silhouette_score(data_scaled, df['cluster'])
print("\nCluster Silhouette Score: ", sil_score)
print("\nExpanation:")

# Derive the verdict from the computed score instead of hard-coding it, so the
# printed interpretation cannot contradict the number when the data or the
# clustering settings change. The cut-offs are rules of thumb, not a formal test.
if sil_score >= 0.5:
    sil_effect = "strong"
    sil_detail = (
        "This score suggests that the clustering results distinguish different samples well,",
        "and the separation between clusters is distinct.",
    )
elif sil_score >= 0.25:
    sil_effect = "moderate"
    sil_detail = (
        "This score suggests that while the clustering results can distinguish different samples to some extent,",
        "the separation between clusters is not very distinct.",
    )
else:
    sil_effect = "weak"
    sil_detail = (
        "This score suggests that the clustering results barely distinguish different samples,",
        "and the clusters overlap substantially.",
    )

print("The Silhouette Score is {:.2f}, indicating a {} clustering effect.".format(sil_score, sil_effect))
print(*sil_detail, sep="\n")


Cluster Silhouette Score:  0.42120821530523345

Expanation:
The Silhouette Score is 0.42, indicating a moderate clustering effect.
This score suggests that while the clustering results can distinguish different samples to some extent,
the separation between clusters is not very distinct.
