In [7]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [8]:
# Location of the US arrests dataset, relative to this notebook.
usa_arrests_csv = "../data/USArrests.csv"

# Hand the path straight to pandas: read_csv opens and closes the file itself
# and decodes it as UTF-8 by default, instead of depending on the
# platform-specific default encoding of a text-mode open().
usa_arrests_df = pd.read_csv(usa_arrests_csv)

In [9]:
# Preview the first five rows as a quick sanity check of the loaded frame.
usa_arrests_df.head(n=5)

Unnamed: 0,State,Murder,Assault,UrbanPop,Rape
0,Alabama,13.2,236,58,21.2
1,Alaska,10.0,263,48,44.5
2,Arizona,8.1,294,80,31.0
3,Arkansas,8.8,190,50,19.5
4,California,9.0,276,91,40.6


In [10]:
# Columns used as clustering inputs; the State name and the unnamed leading
# column are deliberately left out.
feature_columns = ['Murder', 'Assault', 'UrbanPop', 'Rape']
X = usa_arrests_df[feature_columns]

In [11]:
# Standardise the features: fit the scaler on X, then transform that same data.
# The fitted scaler is kept so the transformation can be inverted later.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [12]:
# Perform K-means clustering with K=4
# n_init is passed explicitly: its library default changed between scikit-learn
# releases (10 -> 'auto'), so relying on it makes the cluster assignments depend
# on the installed version. random_state fixes the seed used for initialisation.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)

# Store each row's cluster index (0-based) in a new 'Cluster' column.
usa_arrests_df['Cluster'] = kmeans.fit_predict(X_scaled)

In [13]:
# Qualitative description of each cluster
# Map the centroids back from standardised units to the original feature scale.
cluster_centers = scaler.inverse_transform(kmeans.cluster_centers_)

# Build one label per centroid instead of hard-coding four, so this cell keeps
# working if the number of clusters changes. Labels are 1-based for display:
# "Cluster k" is the centroid at position k - 1 in cluster_centers.
cluster_labels = [f"Cluster {i + 1}" for i in range(len(cluster_centers))]

for label, center in zip(cluster_labels, cluster_centers):
    print(f"{label}:")
    # Pair every centroid coordinate with its column name from X, so the
    # printed names cannot drift from the column order used for fitting.
    for feature, value in zip(X.columns, center):
        print(f"  - {feature}: {value:.2f}")
    print()

Cluster 1:
  - Murder: 3.60
  - Assault: 78.54
  - UrbanPop: 52.08
  - Rape: 12.18

Cluster 2:
  - Murder: 13.94
  - Assault: 243.62
  - UrbanPop: 53.75
  - Rape: 21.41

Cluster 3:
  - Murder: 10.97
  - Assault: 264.00
  - UrbanPop: 76.50
  - Rape: 33.61

Cluster 4:
  - Murder: 5.85
  - Assault: 141.18
  - UrbanPop: 73.65
  - Rape: 19.34

