# 第4课：聚类算法 K-Means

## 学习目标
- 理解无监督学习的概念
- 掌握 K-Means 聚类算法
- 学会选择最佳聚类数
- 了解其他聚类算法

## 1. 无监督学习简介

无监督学习是在没有标签的数据上发现模式和结构。

**主要任务**：
- 聚类：将相似的数据分组
- 降维：减少特征数量
- 异常检测：发现异常数据点

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs, make_moons
from sklearn.metrics import silhouette_score, silhouette_samples

from matplotlib import font_manager

plt.rcParams['figure.figsize'] = (10, 6)

# Every plot in this lesson uses Chinese titles and axis labels. Matplotlib's
# default font has no CJK glyphs, so prefer a CJK-capable font when one is
# installed. Only installed fonts are listed, so no "font not found" warnings
# are triggered for the missing candidates.
_cjk_candidates = [
    'SimHei', 'Microsoft YaHei', 'PingFang SC', 'Heiti TC',
    'Noto Sans CJK SC', 'WenQuanYi Micro Hei', 'Arial Unicode MS',
]
_installed_fonts = {font.name for font in font_manager.fontManager.ttflist}
_cjk_fonts = [name for name in _cjk_candidates if name in _installed_fonts]
if _cjk_fonts:
    plt.rcParams['font.sans-serif'] = _cjk_fonts + list(plt.rcParams['font.sans-serif'])
    # Many CJK fonts lack the Unicode minus sign (U+2212); fall back to the
    # ASCII hyphen so negative tick labels still render.
    plt.rcParams['axes.unicode_minus'] = False

np.random.seed(42)

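## 2. K-Means 算法原理

K-Means 通过迭代把数据划分为 K 个簇，目标是最小化簇内平方和（即惯性 Inertia）：

$$J = \sum_{k=1}^{K} \sum_{x_i \in C_k} \lVert x_i - \mu_k \rVert^2$$

其中 $C_k$ 是第 $k$ 个簇，$\mu_k$ 是该簇的质心。算法步骤：

1. 初始化 K 个质心；
2. 将每个样本分配给最近的质心；
3. 用簇内样本的均值更新质心；
4. 重复步骤 2–3，直到质心不再变化或达到最大迭代次数。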

In [None]:
# Generate the sample data: 300 points drawn around 4 blob centres.
# y_true is kept only so the clustering can be compared against it later.
X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.6, random_state=42)

# Plot the points without any colouring, i.e. as the algorithm sees them.
first_feature, second_feature = X[:, 0], X[:, 1]
plt.scatter(first_feature, second_feature, s=50, alpha=0.6)
plt.gca().set(title='原始数据（无标签）', xlabel='特征1', ylabel='特征2')
plt.show()

In [None]:
# K-Means clustering: fit 4 clusters and read back the label of every sample.
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
y_pred = kmeans.fit(X).labels_

# Visualise the result: predicted clusters on the left, generating labels on the right.
plt.figure(figsize=(12, 5))

predicted_ax = plt.subplot(1, 2, 1)
predicted_ax.scatter(X[:, 0], X[:, 1], c=y_pred, cmap='viridis', s=50, alpha=0.6)
# Overlay the fitted centroids as red crosses.
predicted_ax.scatter(
    kmeans.cluster_centers_[:, 0],
    kmeans.cluster_centers_[:, 1],
    c='red', marker='X', s=200, edgecolors='black', label='质心',
)
predicted_ax.set_title('K-Means 聚类结果')
predicted_ax.legend()

truth_ax = plt.subplot(1, 2, 2)
truth_ax.scatter(X[:, 0], X[:, 1], c=y_true, cmap='viridis', s=50, alpha=0.6)
truth_ax.set_title('真实标签')

plt.tight_layout()
plt.show()

# Report the fitted centroids and the inertia of the fitted model.
print(f"聚类质心:\n{kmeans.cluster_centers_}")
print(f"\n惯性（Inertia）: {kmeans.inertia_:.2f}")

## 3. K-Means 算法步骤可视化

In [None]:
# 可视化 K-Means 迭代过程
def plot_kmeans_steps(X, n_clusters=4, max_iter=5):
    """Visualise the first iterations of K-Means on a 2x3 grid of scatter plots.

    The first panel shows the assignment for the randomly chosen initial
    centroids; each following panel shows the assignment after one more
    centroid update. Drawing stops once the centroids stop moving or the
    grid is full, and any panels left over are hidden.

    Args:
        X: 2-D array of samples; columns 0 and 1 are plotted.
        n_clusters: Number of centroids. Must not exceed ``len(X)`` because
            the initial centroids are sampled without replacement.
        max_iter: Maximum number of centroid updates to show. The grid holds
            at most 6 panels, including the initial state.
    """
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    axes = axes.flatten()

    # Random centroid initialisation. A private generator seeded with 42 picks
    # the same samples as seeding the global generator would, without
    # resetting NumPy's global random state for the rest of the notebook.
    rng = np.random.RandomState(42)
    # Float copy so mean updates are never truncated for integer-valued data.
    centers = X[rng.choice(len(X), n_clusters, replace=False)].astype(float)

    panels_used = 0
    for i in range(min(max_iter + 1, len(axes))):
        # Assign every point to its nearest centroid.
        # distances has shape (n_clusters, n_samples).
        distances = np.sqrt(((X - centers[:, np.newaxis]) ** 2).sum(axis=2))
        labels = np.argmin(distances, axis=0)

        # Draw the current assignment and centroids.
        axes[i].scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', s=50, alpha=0.6)
        axes[i].scatter(centers[:, 0], centers[:, 1], c='red', marker='X', s=200, edgecolors='black')
        axes[i].set_title(f'迭代 {i}' if i > 0 else '初始化')
        panels_used = i + 1

        # Update centroids. A cluster that received no points keeps its
        # previous centroid instead of becoming NaN (mean of an empty slice).
        new_centers = centers.copy()
        for k in range(n_clusters):
            members = X[labels == k]
            if len(members):
                new_centers[k] = members.mean(axis=0)

        # Converged: the centroids did not move.
        if np.allclose(centers, new_centers):
            break
        centers = new_centers

    # Hide panels that were never drawn (early convergence or small max_iter).
    for unused_ax in axes[panels_used:]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

# Run the step-by-step demo on X with the default n_clusters=4 and max_iter=5.
plot_kmeans_steps(X)

## 4. 选择最佳聚类数 K

In [None]:
# Method 1: elbow method.
# Fit one model per candidate K and record its inertia.
K_range = range(1, 11)
inertias = []

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X)
    inertias.append(kmeans.inertia_)

# Plot inertia against K; the dashed line marks the chosen K=4.
plt.figure(figsize=(10, 5))
elbow_ax = plt.gca()
elbow_ax.plot(K_range, inertias, 'bo-')
elbow_ax.set_xlabel('聚类数 K')
elbow_ax.set_ylabel('惯性（Inertia）')
elbow_ax.set_title('肘部法则')
elbow_ax.axvline(x=4, color='r', linestyle='--', label='最佳 K=4')
elbow_ax.legend()
plt.show()

In [None]:
# Method 2: silhouette score.
# silhouette_score needs at least two clusters, so the scan starts at K=2.
k_candidates = range(2, 11)
silhouette_scores = []

for k in k_candidates:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X)
    score = silhouette_score(X, labels)
    silhouette_scores.append(score)
    print(f"K={k}: 轮廓系数 = {score:.3f}")

# Mark the K that actually maximises the score rather than a hard-coded value,
# so the annotation stays correct if the data or the scanned range changes.
best_k = k_candidates[int(np.argmax(silhouette_scores))]

plt.figure(figsize=(10, 5))
plt.plot(k_candidates, silhouette_scores, 'go-')
plt.xlabel('聚类数 K')
plt.ylabel('轮廓系数')
plt.title('轮廓系数法')
plt.axvline(x=best_k, color='r', linestyle='--', label=f'最佳 K={best_k}')
plt.legend()
plt.show()

In [None]:
# 轮廓系数可视化
from sklearn.metrics import silhouette_samples

def plot_silhouette(X, n_clusters):
    """Draw a silhouette plot for a K-Means clustering of X.

    Fits K-Means with ``n_clusters`` clusters, then draws one horizontal band
    per cluster made of that cluster's per-sample silhouette values in
    ascending order. A dashed red line marks the mean silhouette score.

    Args:
        X: Samples to cluster.
        n_clusters: Number of clusters to fit and to draw bands for.
    """
    model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    assignments = model.fit_predict(X)

    mean_score = silhouette_score(X, assignments)
    per_sample_scores = silhouette_samples(X, assignments)

    fig, ax = plt.subplots(figsize=(10, 6))

    band_gap = 10  # vertical space left between consecutive cluster bands
    band_start = band_gap
    for cluster_id in range(n_clusters):
        # Sorted copy of this cluster's silhouette values.
        band_values = np.sort(per_sample_scores[assignments == cluster_id])
        band_size = band_values.shape[0]
        band_end = band_start + band_size

        ax.fill_betweenx(
            np.arange(band_start, band_end),
            0,
            band_values,
            alpha=0.7,
        )
        # Put the cluster id next to the middle of its band.
        ax.text(-0.05, band_start + 0.5 * band_size, str(cluster_id))
        band_start = band_end + band_gap

    ax.axvline(
        x=mean_score,
        color="red",
        linestyle="--",
        label=f"平均轮廓系数: {mean_score:.3f}",
    )
    ax.set_xlabel("轮廓系数")
    ax.set_ylabel("聚类标签")
    ax.set_title(f"K={n_clusters} 的轮廓图")
    ax.legend()
    plt.show()

# Draw the silhouette plot for X with 4 clusters.
plot_silhouette(X, 4)

## 5. 实际应用：客户分群

In [None]:
# Create simulated customer data.
np.random.seed(42)
n_customers = 500

# Draw the columns one after another, in this order, so the values produced
# by the seeded generator stay the same.
ages = np.random.randint(18, 70, n_customers)
annual_incomes = np.random.randint(20000, 150000, n_customers)
spending_scores = np.random.randint(1, 100, n_customers)

customers = pd.DataFrame({
    'customer_id': range(1, n_customers + 1),
    'age': ages,
    'annual_income': annual_incomes,
    'spending_score': spending_scores,
})

# Show the first rows and the overall shape.
print("客户数据预览:")
print(customers.head())
print(f"\n数据形状: {customers.shape}")

In [None]:
# Preprocessing: cluster on income and spending score only.
features = ['annual_income', 'spending_score']
X_customers = customers[features].to_numpy()

# Standardise the selected features.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_customers)

# Determine the best K: record inertia and silhouette score for K = 2..10.
k_values = range(2, 11)
inertias = []
silhouettes = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled)
    labels = kmeans.labels_
    inertias.append(kmeans.inertia_)
    silhouettes.append(silhouette_score(X_scaled, labels))

# One panel per criterion: (values, line format, y-axis label, title).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
criteria = [
    (inertias, 'bo-', '惯性', '肘部法则'),
    (silhouettes, 'go-', '轮廓系数', '轮廓系数'),
]
for panel, (values, line_format, y_label, panel_title) in zip(axes, criteria):
    panel.plot(k_values, values, line_format)
    panel.set_xlabel('K')
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Apply K-Means (K=5) to the standardised features and store each
# customer's cluster id.
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
customers['cluster'] = kmeans.fit(X_scaled).labels_

# Visualise the clusters in the original (unscaled) income/spending units.
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    customers['annual_income'],
    customers['spending_score'],
    c=customers['cluster'],
    cmap='viridis',
    s=50,
    alpha=0.6,
)
plt.colorbar(scatter, label='聚类')
plt.gca().set(xlabel='年收入', ylabel='消费评分', title='客户分群结果')
plt.show()

In [None]:
# Analyse the characteristics of each cluster: mean and standard deviation of
# the numeric columns, plus the number of customers per cluster.
summary_stats = ['mean', 'std']
aggregations = {
    column: summary_stats
    for column in ('age', 'annual_income', 'spending_score')
}
aggregations['customer_id'] = 'count'

cluster_analysis = customers.groupby('cluster').agg(aggregations).round(2)

# Flatten the two-level column index into names such as "age_mean".
cluster_analysis.columns = cluster_analysis.columns.map('_'.join)
print("各聚类特征分析:")
print(cluster_analysis)

In [None]:
# Name each cluster.
# K-Means cluster ids carry no meaning of their own, so a fixed id -> name
# table can attach the wrong description to a cluster. Derive the names from
# each cluster's average income and spending instead.
segment_features = ['annual_income', 'spending_score']
overall_mean = customers[segment_features].mean()
overall_std = customers[segment_features].std(ddof=0)
# Per-cluster mean of each feature, expressed as a z-score against all customers.
cluster_profile = (
    customers.groupby('cluster')[segment_features].mean() - overall_mean
) / overall_std

# Reference position of every segment in z-score space: (income, spending).
# NOTE(review): '潜力客户' is taken to mean low income / low spending, the only
# quadrant not claimed by the other names -- confirm the intended meaning.
segment_archetypes = {
    '普通客户': (0.0, 0.0),
    '高收入低消费': (1.0, -1.0),
    '低收入高消费': (-1.0, 1.0),
    '高价值客户': (1.0, 1.0),
    '潜力客户': (-1.0, -1.0),
}

# Rows are cluster ids, columns are segment names.
distance_to_archetype = pd.DataFrame({
    name: np.hypot(
        cluster_profile['annual_income'] - income_z,
        cluster_profile['spending_score'] - spending_z,
    )
    for name, (income_z, spending_z) in segment_archetypes.items()
})

# Greedy one-to-one matching: repeatedly pair the closest remaining
# (cluster, segment) so that no two clusters end up with the same name.
cluster_names = {}
remaining = distance_to_archetype
while remaining.shape[0] and remaining.shape[1]:
    row, col = np.unravel_index(np.argmin(remaining.to_numpy()), remaining.shape)
    cluster_id, segment_name = remaining.index[row], remaining.columns[col]
    cluster_names[cluster_id] = segment_name
    remaining = remaining.drop(index=cluster_id, columns=segment_name)

# If there are more clusters than named segments, fall back to a generic name.
for cluster_id in cluster_profile.index:
    cluster_names.setdefault(cluster_id, f'群体 {cluster_id}')

customers['segment'] = customers['cluster'].map(cluster_names)

# Visualise the segments.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Number of customers per segment.
customers['segment'].value_counts().plot(kind='bar', ax=axes[0])
axes[0].set_title('各客户群体人数')
axes[0].set_xlabel('客户群体')
axes[0].set_ylabel('人数')

# Income vs. spending for each segment, one scatter series per segment.
for segment in cluster_names.values():
    data = customers[customers['segment'] == segment]
    axes[1].scatter(data['annual_income'], data['spending_score'],
                    label=segment, s=50, alpha=0.6)

axes[1].set_xlabel('年收入')
axes[1].set_ylabel('消费评分')
axes[1].set_title('客户分群')
axes[1].legend()

plt.tight_layout()
plt.show()

## 6. 其他聚类算法

In [None]:
# Generate crescent-shaped (two moons) data.
X_moons, y_moons = make_moons(n_samples=300, noise=0.05, random_state=42)

# Fit the three algorithms on the same data.
# K-Means
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
labels_kmeans = kmeans.fit_predict(X_moons)

# DBSCAN
dbscan = DBSCAN(eps=0.2, min_samples=5)
labels_dbscan = dbscan.fit_predict(X_moons)

# Hierarchical (agglomerative) clustering
agg = AgglomerativeClustering(n_clusters=2)
labels_agg = agg.fit_predict(X_moons)

# 2x2 grid in row-major order: raw data, K-Means, DBSCAN, hierarchical.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
moon_panels = [
    ('原始数据', None),
    ('K-Means（效果差）', labels_kmeans),
    ('DBSCAN（效果好）', labels_dbscan),
    ('层次聚类', labels_agg),
]
for panel, (panel_title, panel_labels) in zip(axes.flat, moon_panels):
    if panel_labels is None:
        # Raw data: no colouring.
        panel.scatter(X_moons[:, 0], X_moons[:, 1], s=50)
    else:
        panel.scatter(X_moons[:, 0], X_moons[:, 1], c=panel_labels, cmap='viridis', s=50)
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# DBSCAN parameter notes: print the two parameters and the listed advantages,
# one entry per output line.
dbscan_notes = [
    "DBSCAN 参数:",
    "- eps: 邻域半径",
    "- min_samples: 核心点所需的最小样本数",
    "\n优点:",
    "- 不需要预先指定聚类数",
    "- 可以发现任意形状的聚类",
    "- 能识别噪声点",
]
print("\n".join(dbscan_notes))

## 7. 聚类算法比较

In [None]:
# Side-by-side summary of the three algorithms used in this lesson.
comparison = pd.DataFrame({
    '算法': ['K-Means', 'DBSCAN', '层次聚类'],
    '是否需要K': ['是', '否', '是'],
    '聚类形状': ['球形', '任意', '任意'],
    '处理噪声': ['差', '好', '一般'],
    '计算复杂度': ['O(n)', 'O(n²)', 'O(n²)'],
    # DBSCAN uses a single global eps, so clusters of very different density
    # are its weak spot, not its use case. Its strengths are arbitrary shapes
    # and explicit noise points.
    '适用场景': ['大规模数据', '任意形状、含噪声数据', '小规模数据']
})

print("聚类算法比较:")
print(comparison.to_string(index=False))

## 8. 练习题

### 练习：对 Iris 数据集进行聚类分析

In [None]:
from sklearn.datasets import load_iris

# Load the Iris dataset and keep its feature matrix and target labels.
iris = load_iris()
X_iris = iris.data
y_iris = iris.target

# Write your code here
# 1. Use the elbow method to determine the best K
# 2. Apply K-Means clustering
# 3. Compare the clustering result with the true labels
# 4. Visualise the result


## 9. 本课小结

1. **K-Means**：基于质心的聚类算法
2. **选择 K**：肘部法则、轮廓系数
3. **数据预处理**：K-Means 基于距离计算，特征量纲不同时应在聚类前先做标准化
4. **其他算法**：DBSCAN（密度）、层次聚类
5. **应用场景**：客户分群、图像分割、异常检测