# 第3课：分类算法

## 学习目标
- 掌握逻辑回归
- 了解决策树分类
- 学习随机森林
- 掌握模型评估方法

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification, load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
import seaborn as sns

## 1. 数据准备

In [None]:
# Build a synthetic binary-classification dataset: 1000 samples with 20
# features (10 informative, 5 redundant), reproducible via the fixed seed.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    n_redundant=5,
    n_classes=2,
    random_state=42,
)

# Hold out 20% of the samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize: the scaler is fitted on the training split only and is then
# applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"训练集: {X_train.shape}")
print(f"测试集: {X_test.shape}")
print(f"类别分布: {np.bincount(y_train)}")

## 2. 逻辑回归

In [None]:
# Fit logistic regression on the standardized training features.
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train_scaled, y_train)

# Predict hard labels for the test set, and keep the second column of the
# predicted probabilities as the score for the positive class.
y_pred_lr = lr.predict(X_test_scaled)
lr_test_proba = lr.predict_proba(X_test_scaled)
y_prob_lr = lr_test_proba[:, 1]

# Report accuracy and the per-class precision/recall/F1 table.
print("逻辑回归结果:")
print(f"准确率: {accuracy_score(y_test, y_pred_lr):.4f}")
print(f"\n分类报告:\n{classification_report(y_test, y_pred_lr)}")

## 3. 决策树

In [None]:
# Fit a depth-limited decision tree on the unscaled features
# (decision trees do not need standardized inputs).
dt = DecisionTreeClassifier(random_state=42, max_depth=5).fit(X_train, y_train)

# Predict labels for the held-out test set.
y_pred_dt = dt.predict(X_test)

print("决策树结果:")
print(f"准确率: {accuracy_score(y_test, y_pred_dt):.4f}")

In [None]:
# Visualize a decision tree on a simple dataset: iris, first two features only,
# with a shallow tree so the plot stays readable.
iris = load_iris()
dt_iris = DecisionTreeClassifier(max_depth=3, random_state=42)
dt_iris.fit(iris.data[:, :2], iris.target)

plt.figure(figsize=(15, 10))
# Pass plain Python lists for the names: iris.target_names is a NumPy array,
# and some scikit-learn releases reject non-list values for class_names /
# feature_names during parameter validation.
plot_tree(
    dt_iris,
    feature_names=list(iris.feature_names[:2]),
    class_names=iris.target_names.tolist(),
    filled=True,
    rounded=True,
)
plt.title('Decision Tree Visualization')
plt.show()

## 4. 随机森林

In [None]:
# Fit a random forest of 100 trees (each limited to depth 10) on the
# unscaled training features.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
).fit(X_train, y_train)

# Predict hard labels, and keep the second column of the predicted
# probabilities as the score for the positive class.
y_pred_rf = rf.predict(X_test)
rf_test_proba = rf.predict_proba(X_test)
y_prob_rf = rf_test_proba[:, 1]

print("随机森林结果:")
print(f"准确率: {accuracy_score(y_test, y_pred_rf):.4f}")

In [None]:
# Feature importances of the fitted random forest, sorted from most to least
# important.
# The labels are sized from the forest's own importances rather than from the
# global X: X is rebound to the iris data in the exercise cell, which would
# otherwise cause a length mismatch if this cell were re-run afterwards.
importances = rf.feature_importances_
feature_importance = pd.DataFrame({
    'feature': [f'Feature {i}' for i in range(len(importances))],
    'importance': importances,
}).sort_values('importance', ascending=False)

# Plot only the ten most important features.
top_features = feature_importance.head(10)

plt.figure(figsize=(10, 6))
plt.barh(top_features['feature'], top_features['importance'])
plt.xlabel('Importance')
plt.title('Top 10 Feature Importances')
# barh draws the first row at the bottom; flip so the top feature is on top.
plt.gca().invert_yaxis()
plt.show()

## 5. 模型比较

In [None]:
# Compare test-set accuracy across the three fitted models.
# Each entry pairs a model with the test features it expects: logistic
# regression was trained on standardized data, the tree models on raw data.
models = {
    'Logistic Regression': (lr, X_test_scaled),
    'Decision Tree': (dt, X_test),
    'Random Forest': (rf, X_test),
}

# One row per model: its name and its accuracy on the held-out test set.
results = [
    {'Model': label, 'Accuracy': accuracy_score(y_test, estimator.predict(features))}
    for label, (estimator, features) in models.items()
]

results_df = pd.DataFrame(results)
print(results_df)

In [None]:
# ROC curves for the two models whose positive-class probabilities were kept.
fig, ax = plt.subplots(figsize=(10, 6))

# Logistic regression
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_prob_lr)
auc_lr = auc(fpr_lr, tpr_lr)
ax.plot(fpr_lr, tpr_lr, label=f'Logistic Regression (AUC = {auc_lr:.3f})')

# Random forest
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_prob_rf)
auc_rf = auc(fpr_rf, tpr_rf)
ax.plot(fpr_rf, tpr_rf, label=f'Random Forest (AUC = {auc_rf:.3f})')

# Diagonal reference line for a random classifier.
ax.plot([0, 1], [0, 1], 'k--', label='Random')

ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

## 6. 交叉验证

In [None]:
# 5-fold cross-validation on the training split.
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Logistic regression needs standardized inputs. Scaling inside a pipeline
# makes each fold fit its scaler on that fold's training part only; scoring on
# X_train_scaled instead would leak validation-fold statistics, because that
# array was standardized using the whole training split.
# cross_val_score clones the estimator for every fold, so the fitted `lr`
# itself is left untouched.
lr_cv_pipeline = make_pipeline(StandardScaler(), lr)

cv_scores_lr = cross_val_score(lr_cv_pipeline, X_train, y_train, cv=5)
cv_scores_rf = cross_val_score(rf, X_train, y_train, cv=5)

# Report mean accuracy and its standard deviation across the folds.
print("交叉验证结果:")
print(f"逻辑回归: {cv_scores_lr.mean():.4f} (+/- {cv_scores_lr.std():.4f})")
print(f"随机森林: {cv_scores_rf.mean():.4f} (+/- {cv_scores_rf.std():.4f})")

## 7. 练习题

### 练习：多分类问题
使用鸢尾花数据集训练一个三分类模型（例如逻辑回归、决策树或随机森林），并用准确率和分类报告评估其效果。

In [None]:
# Exercise scaffold: load the iris dataset (three target classes).
# NOTE: this rebinds X and y, replacing the synthetic dataset used earlier.
iris = load_iris()
X = iris.data
y = iris.target

# Write your code here


## 8. 本课小结

1. **逻辑回归**：线性模型，常用于二分类，也可扩展到多分类
2. **决策树**：可解释性强，容易过拟合
3. **随机森林**：集成学习，性能稳定
4. **评估指标**：准确率、精确率、召回率、F1、AUC
5. **交叉验证**：更可靠的模型评估方法