# Problem 1


In [25]:
import pandas as pd
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [26]:
# Load the auto-mpg dataset.
# The file has no header row, so column names are supplied explicitly,
# and '?' entries are read as NaN.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
column_names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin', 'car_name']
# sep=r'\s+' replaces delim_whitespace=True, which pandas has deprecated
# (it emits a FutureWarning); both split fields on runs of whitespace.
df = pd.read_csv(url, names=column_names, sep=r'\s+', na_values='?')

  df = pd.read_csv(url, names=column_names, delim_whitespace=True, na_values='?')


In [27]:
# Features used for clustering: the continuous-valued columns only
continuous_features = ['mpg', 'displacement', 'horsepower', 'weight', 'acceleration']
X = df[continuous_features]

# Fill missing entries with the per-column mean
imputer = SimpleImputer(strategy='mean')
X_imputed = pd.DataFrame(
    data=imputer.fit_transform(X),
    columns=continuous_features,
)

# Put every feature on a comparable scale before computing distances
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    data=scaler.fit_transform(X_imputed),
    columns=continuous_features,
)

# Agglomerative (hierarchical) clustering: 3 clusters, average linkage, Euclidean metric
clustering = AgglomerativeClustering(
    n_clusters=3,
    metric='euclidean',
    linkage='average',
)
cluster_labels = clustering.fit_predict(X_scaled)

# Store each row's cluster id alongside the original columns
df['cluster'] = cluster_labels

In [28]:
# Aggregations reported for every grouping below
summary_funcs = ['mean', 'var']

# Per-cluster mean and variance, taken from the unscaled columns so the numbers stay interpretable
cluster_groups = df.groupby('cluster')[continuous_features]
cluster_stats = cluster_groups.agg(summary_funcs).round(2)
print("\nCluster Statistics (Mean and Variance):", cluster_stats, sep="\n")

# The same summary, grouped by the origin class instead of the cluster id
origin_groups = df.groupby('origin')[continuous_features]
origin_stats = origin_groups.agg(summary_funcs).round(2)
print("\nOrigin Class Statistics (Mean and Variance):", origin_stats, sep="\n")

# Count how many rows fall into each (cluster, origin) combination
crosstab = pd.crosstab(index=df['cluster'], columns=df['origin'])
print("\nContingency Table (Cluster vs. Origin):", crosstab, sep="\n")


Cluster Statistics (Mean and Variance):
           mpg        displacement          horsepower           weight  \
          mean    var         mean      var       mean     var     mean   
cluster                                                                   
0        26.18  41.30       144.30  3511.49      86.12  294.55  2598.41   
1        14.53   4.77       348.02  2089.50     161.80  674.08  4143.97   
2        43.70   0.30        91.75    12.25      49.00    4.00  2133.75   

                   acceleration        
               var         mean   var  
cluster                                
0        299118.71        16.43  4.88  
1        193847.05        12.64  3.19  
2         21672.92        22.88  2.31  

Origin Class Statistics (Mean and Variance):
          mpg        displacement          horsepower            weight  \
         mean    var         mean      var       mean      var     mean   
origin                                                                  

## Result

There is no clean one-to-one relationship between cluster assignment and the `origin` class label. Cluster 1 consists entirely of American cars (origin 1) and Cluster 2 contains only a small fraction of the European cars (origin 2), but the largest cluster (Cluster 0, about 74% of the data) mixes all three origins (152 American, 66 European, and 79 Japanese cars), so the clusters cannot be mapped one-to-one onto origins. This indicates that the continuous features alone cannot fully distinguish between origins, especially between European and Japanese cars.

# Problem 2


In [29]:
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import pandas as pd


In [30]:
# Load the Boston data and force every column to a numeric dtype
# (errors='coerce' turns any value that cannot be parsed into NaN)
boston = fetch_openml(name='boston', version=1, parser='auto')
df = pd.DataFrame(boston.data, columns=boston.feature_names).apply(
    pd.to_numeric, errors='coerce'
)

# Standardize the features; the fitted scaler is kept so results can be mapped back later
scaler = StandardScaler().fit(df)
scaled_data = scaler.transform(df)

In [31]:
# Fit K-Means for each candidate k and record its silhouette score
candidate_ks = range(2, 7)
silhouette_scores = []
for k in candidate_ks:
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(scaled_data)
    score = silhouette_score(scaled_data, labels)
    print(f"k={k}: Silhouette Score: {score:.4f}")
    silhouette_scores.append(score)

# Pick the k with the highest silhouette score (the smallest such k wins on ties)
best_k, _ = max(zip(candidate_ks, silhouette_scores), key=lambda pair: pair[1])
print(f"\nOptimal k value: {best_k}")

k=2: Silhouette Score: 0.3601
k=3: Silhouette Score: 0.2448
k=4: Silhouette Score: 0.2275
k=5: Silhouette Score: 0.2389
k=6: Silhouette Score: 0.2291

Optimal k value: 2


In [32]:
# Refit K-Means using the selected number of clusters
best_kmeans = KMeans(n_clusters=best_k, random_state=42)
best_kmeans.fit(scaled_data)

# Method 1: average the original (unscaled) rows within each cluster
cluster_means = df.groupby(best_kmeans.labels_).mean()

# Method 2: map the centroids from standardized space back to the original scale
centroids_original = scaler.inverse_transform(best_kmeans.cluster_centers_)
centroids_frame = pd.DataFrame(centroids_original, columns=df.columns)

print("\nMethod 1: Cluster means from original data:", cluster_means, sep="\n")
print("\nMethod 2: Inverse-transformed centroids:", centroids_frame, sep="\n")


Method 1: Cluster means from original data:
       CRIM         ZN      INDUS      CHAS       NOX        RM        AGE  \
0  0.261172  17.477204   6.885046  0.069909  0.487011  6.455422  56.339210   
1  9.844730   0.000000  19.039718  0.067797  0.680503  5.967181  91.318079   

        DIS        RAD         TAX    PTRATIO           B      LSTAT  
0  4.756868   4.471125  301.917933  17.837386  386.447872   9.468298  
1  2.007242  18.988701  605.858757  19.604520  301.331695  18.572768  

Method 2: Inverse-transformed centroids:
       CRIM            ZN      INDUS      CHAS       NOX        RM        AGE  \
0  0.261172  1.747720e+01   6.885046  0.069909  0.487011  6.455422  56.339210   
1  9.844730  1.243450e-14  19.039718  0.067797  0.680503  5.967181  91.318079   

        DIS        RAD         TAX    PTRATIO           B      LSTAT  
0  4.756868   4.471125  301.917933  17.837386  386.447872   9.468298  
1  2.007242  18.988701  605.858757  19.604520  301.331695  18.572768  


## Result

The optimal k value is 2.

From the above results, we can see that the group means agree with the inverse-transformed centroids; the only difference is floating-point rounding error (e.g., ZN for cluster 1 is 1.24e-14 instead of exactly 0).

# Problem 3

In [33]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import homogeneity_score, completeness_score

In [34]:
# Build a feature dataframe from the wine dataset and keep its class labels for evaluation
wine = load_wine()
true_labels = wine.target
df = pd.DataFrame(wine.data, columns=wine.feature_names)

# Standardize the features
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)

In [35]:
# Cluster the standardized wine data into 3 groups
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(scaled_data)
cluster_labels = kmeans.labels_

# Compare the cluster assignment against the true classes
homogeneity = homogeneity_score(labels_true=true_labels, labels_pred=cluster_labels)
completeness = completeness_score(labels_true=true_labels, labels_pred=cluster_labels)


In [36]:
# Report both scores to four decimal places, one per line
print(
    f"Homogeneity Score: {homogeneity:.4f}",
    f"Completeness Score: {completeness:.4f}",
    sep="\n",
)

Homogeneity Score: 0.8788
Completeness Score: 0.8730


## Result

Homogeneity Score (0.8788):

- Meaning: This score measures whether each cluster contains only data points from a single true category. A score of 0.8788 (close to 1) indicates high cluster purity, meaning most data points within each cluster belong to the same true category.
- Information Provided: Homogeneity tells us about the purity of clusters. A high value indicates that the K-Means algorithm successfully grouped data points such that the mixing of different true categories within each cluster is minimized. Note that the score is entropy-based rather than a percentage of data points: a value of 0.8788 means that knowing a point's cluster removes about 87.88% of the uncertainty (entropy) about its true category.

Completeness Score (0.8730):

- Meaning: This score measures whether all data points from a given true category are assigned to the same cluster. A score of 0.8730 indicates that most data points from each true category are grouped into a single cluster.
- Information Provided: Completeness reflects the extent to which clustering captures the entirety of each true category. A high value indicates the algorithm avoids splitting a true category across multiple clusters. Note that the score is entropy-based rather than a percentage of data points: a value of 0.8730 means that knowing a point's true category removes about 87.30% of the uncertainty (entropy) about its cluster assignment.