In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE

# 1. 加载数据集
df = pd.read_csv('normalized_dataset.csv')

#2. 对 Section 列进行数值化处理
# 这里采用 one-hot 编码方法，将 Section 列转换为多个虚拟变量
df_encoded = pd.get_dummies(df, columns=['Section'], drop_first=True)

# 保留除 'Unnamed: 0' 和 'FailureMode' 之外的所有特征
X = df_encoded.drop(['Unnamed: 0', 'FailureMode'], axis=1)
y = df_encoded['FailureMode']

# 3. 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. 对训练集应用 SMOTE 进行过采样（平衡类别）
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# 5. 使用 PCA 降维
# 保留 95% 的方差，可以根据需要调整 n_components
pca = PCA(n_components=0.95, random_state=42)
X_train_pca = pca.fit_transform(X_train_smote)
X_test_pca = pca.transform(X_test)

# 6. 定义 SVM 模型的参数网格
param_grid = {
    'C': [20, 15, 10, 100],         # 惩罚参数
    'gamma': [1, 0.1, 1.01, 1.5],   # 核函数参数
    'kernel': ['rbf']               # 使用径向基核函数
}

# 7. 使用 GridSearchCV 结合 5 折交叉验证训练 SVM 模型
grid = GridSearchCV(SVC(), param_grid, cv=10, verbose=2, n_jobs=-1)
grid.fit(X_train_pca, y_train_smote)

# 输出最佳参数和模型
print("最佳参数配置:", grid.best_params_)
print("最佳模型:", grid.best_estimator_)

# 8. 在测试集上预测，并输出评估指标
y_pred = grid.predict(X_test_pca)

print("\n混淆矩阵:")
print(confusion_matrix(y_test, y_pred))

print("\n分类报告:")
print(classification_report(y_test, y_pred))

print("\n准确率:", accuracy_score(y_test, y_pred))


Fitting 10 folds for each of 16 candidates, totalling 160 fits
最佳参数配置: {'C': 100, 'gamma': 1, 'kernel': 'rbf'}
最佳模型: SVC(C=100, gamma=1)

混淆矩阵:
[[34  3  0  0]
 [ 2  4  3  0]
 [ 0  1 33  0]
 [ 0  1  1  4]]

分类报告:
              precision    recall  f1-score   support

           1       0.94      0.92      0.93        37
           2       0.44      0.44      0.44         9
           3       0.89      0.97      0.93        34
           4       1.00      0.67      0.80         6

    accuracy                           0.87        86
   macro avg       0.82      0.75      0.78        86
weighted avg       0.88      0.87      0.87        86


准确率: 0.872093023255814


In [3]:
from xgboost import XGBClassifier

df_encoded = pd.get_dummies(df, columns=['Section'], drop_first=True)
df_encoded['FailureMode'] = df_encoded['FailureMode'] - 1  # 将 [1,2,3,4] 转换为 [0,1,2,3]

# 3. 分离特征和目标变量
X = df_encoded.drop(['Unnamed: 0', 'FailureMode'], axis=1)
y = df_encoded['FailureMode']

# 4. 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. 对训练集应用 SMOTE 进行过采样（平衡类别）
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# 6. 使用 PCA 降维，保留 95% 的方差
pca = PCA(n_components=0.95, random_state=42)
X_train_pca = pca.fit_transform(X_train_smote)
X_test_pca = pca.transform(X_test)


# 定义 XGBoost 模型的参数网格
param_grid_xgb = {
    'n_estimators': [50, 100, 150],         # 树的数量
    'max_depth': [3, 4, 5],                 # 树的最大深度
    'learning_rate': [0.1, 0.05, 0.01],       # 学习率
    'subsample': [0.8, 1]                    # 子样本比例
}

# 初始化 XGBClassifier
# 设置 use_label_encoder=False 避免警告，并通过 eval_metric 指定评价指标
xgb_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')

# 使用 GridSearchCV 结合 10 折交叉验证训练 XGBoost 模型
grid_xgb = GridSearchCV(xgb_model, param_grid_xgb, cv=10, verbose=2, n_jobs=-1)
grid_xgb.fit(X_train_pca, y_train_smote)

# 输出最佳参数配置和最佳模型
print("最佳参数配置:", grid_xgb.best_params_)
print("最佳模型:", grid_xgb.best_estimator_)

# 在测试集上预测，并输出评估指标
y_pred_xgb = grid_xgb.predict(X_test_pca)

print("\nClassification Report:")
print(classification_report(y_test, y_pred_xgb))

print("\n准确率:", accuracy_score(y_test, y_pred_xgb))

Fitting 10 folds for each of 54 candidates, totalling 540 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


最佳参数配置: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
最佳模型: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, feature_weights=None, gamma=None,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=4, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=150, n_jobs=None,
              num_parallel_tree=None, ...)

混淆矩阵:
[[28  6  2  1]
 [ 2  6  1  0]
 [ 0  3 31  0]
 [ 1  1  1  3]]

分类报告:
              precision    recall  f1-score   support

  