Problem 1

In [None]:
from sklearn.datasets import load_iris
import pandas as pd

# Load the Iris bunch and wrap it in labelled pandas containers:
# one feature column per measurement, and the class codes as a named Series.
iris = load_iris()
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
y = pd.Series(data=iris.target, name="target")

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score


# Hold out 30% of the samples for testing. The split is stratified on y and
# uses a fixed seed, so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Fit one decision tree per depth (1 through 5) and collect macro-averaged
# precision, recall and F1 on the held-out test set.
results = []
for max_depth in range(1, 6):
    clf = DecisionTreeClassifier(
        max_depth=max_depth,
        min_samples_leaf=2,
        min_samples_split=5,
        random_state=42,
    )
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # A depth-1 tree has only two leaves, so at most two of the three classes
    # can ever be predicted. Precision for the never-predicted class is 0/0;
    # zero_division=0 states explicitly that it counts as 0 in the macro
    # average (the same value the default produces) without the
    # UndefinedMetricWarning.
    precision = precision_score(
        y_test, y_pred, average="macro", zero_division=0
    )
    recall = recall_score(y_test, y_pred, average="macro")
    f1 = f1_score(y_test, y_pred, average="macro")

    # Scores are rounded to two decimals for the summary table.
    results.append({
        "max_depth": max_depth,
        "precision": round(precision, 2),
        "recall": round(recall, 2),
        "f1": round(f1, 2),
    })

results_df = pd.DataFrame(results)
print(results_df)

1.
Highest Recall: Depth = 3 (Recall = 0.98).
Why: The tree depth of 3 allows sufficient complexity to capture class boundaries without overfitting, reducing false negatives.

Lowest Precision: Depth = 1 (Precision = 0.70).
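Why: A depth-1 tree underfits: with a single split it has only two leaves, so it can predict at most two of the three classes. Every sample of the remaining class is assigned to another class, which produces many false positives.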

Best F1 Score: Depth = 3 (F1 = 0.97).
Why: F1 is the harmonic mean of precision and recall, so it peaks at the depth where both are high at the same time.

2.
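Micro: Sum the TP, FP, and FN counts over all classes, then compute the metric once from these global counts. Every sample carries equal weight, so this is suitable for imbalanced classes when overall per-sample performance matters.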

Macro: Compute the metric separately for each class, then take the unweighted mean. Suitable when every class is equally important, regardless of its size.

Weighted: Like macro, but each class's metric is weighted by its support (the number of true samples of that class). Suitable for imbalanced classes when the larger classes should count more.

Problem 2

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import entropy

# Column names for the header-less breast-cancer-wisconsin data file.
columns = [
    'id',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]

# Read the file ('?' marks a missing value), drop every row that contains a
# missing value, and recode the target as binary: 2 -> 0 (benign),
# 4 -> 1 (malignant).
df = (
    pd.read_csv(
        'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data',
        names=columns,
        na_values='?',
    )
    .dropna()
    .assign(Class=lambda frame: frame['Class'].replace({2: 0, 4: 1}))
)

# Features are every column except the sample id and the target.
X = df.drop(columns=['id', 'Class'])
y = df['Class']

# Fit a depth-2 Gini tree on the full cleaned data set (fit returns the
# estimator itself, so construction and fitting are chained).
clf = DecisionTreeClassifier(
    criterion='gini',
    max_depth=2,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
).fit(X, y)

# Node 0 of the fitted tree arrays is the root; read its split feature
# and threshold.
root = 0
fitted_tree = clf.tree_
feature_idx = fitted_tree.feature[root]
threshold = fitted_tree.threshold[root]
feature_name = X.columns[feature_idx]
print(f"Feature of first spilt: {feature_name}, Threshold: {threshold:.2f}")

def _binary_impurities(p):
    """Return (Gini, entropy in bits, misclassification error) for a node
    whose positive-class fraction is ``p``."""
    q = 1 - p
    return 2 * p * q, entropy([p, q], base=2), 1 - max(p, q)


# Parent (root) node: sample count from the fitted tree, positive-class
# fraction from the labels.
n_samples = clf.tree_.n_node_samples[0]
p_parent = y.mean()
gini_parent, entropy_parent, misclassification_parent = _binary_impurities(p_parent)

print(f"Gini parent: {gini_parent:.3f}")
print(f"Entropy parent: {entropy_parent:.3f}")
print(f"Misclassification parent: {misclassification_parent:.3f}")

# Reproduce the root split by hand: rows with feature value <= threshold go
# to the left child, all others to the right child.
left_mask = X.iloc[:, feature_idx] <= threshold
y_left = y[left_mask]
y_right = y[~left_mask]

# Impurity measures of the left child.
p_left = y_left.mean()
gini_left, entropy_left, misclassification_left = _binary_impurities(p_left)

# Impurity measures of the right child.
p_right = y_right.mean()
gini_right, entropy_right, misclassification_right = _binary_impurities(p_right)

# Child impurities averaged with weights equal to each child's share of the
# parent's samples.
n_left = len(y_left)
n_right = len(y_right)
share_left = n_left / n_samples
share_right = n_right / n_samples
weighted_gini = share_left * gini_left + share_right * gini_right
weighted_entropy = share_left * entropy_left + share_right * entropy_right
weighted_misclassification = (
    share_left * misclassification_left + share_right * misclassification_right
)

# Information gain = parent entropy minus the weighted child entropy.
information_gain = entropy_parent - weighted_entropy

print(f"Information gain: {information_gain:.3f}")

Feature of first spilt: Uniformity of Cell Size, Threshold: 2.50
Gini parent: 0.455
Entropy parent: 0.934
Misclassification parent: 0.350
Information gain: 0.589


1. Information gain:
The information gain is 0.589 (out of a parent entropy of 0.934), indicating that splitting on this feature substantially reduces uncertainty.

2. Selected features and thresholds:
The feature selected for the first split is Uniformity of Cell Size, with a threshold of 2.50.

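3. Value that determines the decision boundary:
The decision boundary is the threshold 2.50: samples with Uniformity of Cell Size <= 2.50 go to the left child, and all others go to the right child.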

Problem 3

In [16]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

# Load the header-less WDBC file: a sample id, the diagnosis letter, and
# 30 numeric features named feature_1 .. feature_30.
feature_names = [f'feature_{i}' for i in range(1, 31)]
columns = ['id', 'diagnosis', *feature_names]
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data',
    names=columns,
    header=None,
)

# Encode the target as integers: M -> 1, B -> 0.
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})

# Features are every column except the id and the target.
X = df.drop(columns=['id', 'diagnosis'])
y = df['diagnosis']

# Standardize each feature.
# NOTE(review): the scaler is fitted on all rows, i.e. before any
# train/test split is made further down - confirm this is intended.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


def train_and_evaluate(X_data, y_data, model_name="Model"):
    """Train a depth-3 decision tree on a stratified 70/30 split and score it.

    Parameters
    ----------
    X_data : array-like
        Feature matrix, one row per sample.
    y_data : array-like
        Binary target aligned with ``X_data``; the positive class is
        expected to be encoded as 1 and the negative class as 0.
    model_name : str, default "Model"
        Label stored under the "Model" key of the returned dict.

    Returns
    -------
    dict
        Precision, recall and F1 for the positive class, the false-positive
        and true-positive counts, and the false/true positive rates, all
        measured on the held-out 30% test split.
    """
    # Stratify on the target that was passed in, not on a notebook-level
    # global, so the function is correct for any (X_data, y_data) pair.
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, test_size=0.3, stratify=y_data, random_state=42
    )

    # Train the decision tree.
    clf = DecisionTreeClassifier(
        max_depth=3, min_samples_leaf=2, min_samples_split=5, criterion='gini', random_state=42
    )
    clf.fit(X_train, y_train)

    # Predict on the held-out split.
    y_pred = clf.predict(X_test)

    # Positive-class metrics (scikit-learn's default pos_label is 1).
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Pin the label order so the matrix is always 2x2 and ravel() always
    # unpacks as (tn, fp, fn, tp), even if a class is absent from the
    # test labels and predictions.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    fpr = fp / (fp + tn)  # false positive rate
    tpr = tp / (tp + fn)  # true positive rate

    return {
        "Model": model_name,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "False Positives (FP)": fp,
        "True Positives (TP)": tp,
        "False Positive Rate (FPR)": fpr,
        "True Positive Rate (TPR)": tpr
    }


# 训练和评估不同模型
results = []
results.append(train_and_evaluate(X_scaled, y, "Original Data"))  # 原始数据

# PCA 1D
pca1 = PCA(n_components=1)
X_pca1 = pca1.fit_transform(X_scaled)
results.append(train_and_evaluate(X_pca1, y, "PCA-1D"))

# PCA 2D
pca2 = PCA(n_components=2)
X_pca2 = pca2.fit_transform(X_scaled)
results.append(train_and_evaluate(X_pca2, y, "PCA-2D"))

# 以 Pandas DataFrame 格式输出结果
results_df = pd.DataFrame(results)
print(results_df)


           Model  Precision    Recall  F1 Score  False Positives (FP)  \
0  Original Data   1.000000  0.781250  0.877193                     0   
1         PCA-1D   0.865672  0.906250  0.885496                     9   
2         PCA-2D   0.948276  0.859375  0.901639                     3   

   True Positives (TP)  False Positive Rate (FPR)  True Positive Rate (TPR)  
0                   50                   0.000000                  0.781250  
1                   58                   0.084112                  0.906250  
2                   55                   0.028037                  0.859375  


1. F1 Score, Precision, and Recall:
Original Data:
Precision: 1.000, Recall: 0.781, F1 Score: 0.877
PCA-1D:
Precision: 0.866, Recall: 0.906, F1 Score: 0.885
PCA-2D:
Precision: 0.948, Recall: 0.859, F1 Score: 0.902


3. Confusion Matrix:
Original Data: FP = 0, TP = 50, FPR = 0.000, TPR = 0.781
PCA-1D: FP = 9, TP = 58, FPR = 0.084, TPR = 0.906
PCA-2D: FP = 3, TP = 55, FPR = 0.028, TPR = 0.859


4. Is Continuous Data Beneficial?
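Using the original continuous features is beneficial for minimizing false positives (FP = 0, precision = 1.000), whereas the PCA-reduced features improve recall (0.906 for PCA-1D and 0.859 for PCA-2D versus 0.781), i.e. they detect more of the malignant cases.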

If precision is more important, use Original Data; if recall is critical, use PCA.