# 引入必要库

In [1]:
# 引入必要库
import pandas as pd
import numpy as np
import torch
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.manifold import TSNE
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from imblearn.over_sampling import RandomOverSampler, SMOTE
from model.vae_pipeline import train_vae
from model.anomaly_detection_pipeline import train_vae_anomaly_detection

# 选择设备
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# 数据准备

In [2]:
df_application_record = pd.read_csv("application_record.csv")
df_credit_record = pd.read_csv("credit_record.csv")

In [3]:
# 丢弃重复ID数据
df_application_record = df_application_record.drop_duplicates(subset = 'ID', keep = False)

In [4]:
# 调整数据框，以便仅使用一致的ID进行处理
df_application_record = df_application_record[df_application_record['ID'].isin(df_credit_record['ID'])]
df_credit_record = df_credit_record[df_credit_record['ID'].isin(df_application_record['ID'])]

# 数据清洗

In [5]:
# 生成标签用于柱状图
label_dict = {'1': 0, '2': 0, '3': 0, '4': 0, '5': 0, 'X': -1, 'C': 1, '0': 1}
df_credit_record['APPROVED'] = df_credit_record['STATUS'].map(label_dict)
df_credit_record = df_credit_record[df_credit_record['APPROVED'] != -1]

In [6]:
# 合并数据
df_application_record = df_application_record.merge(df_credit_record, on='ID')

In [7]:
df_application_record = df_application_record[df_application_record['MONTHS_BALANCE']==-4]
df_application_record.loc[df_application_record["NAME_INCOME_TYPE"]=="Pensioner","OCCUPATION_TYPE"] = "Pension"
df_application_record.loc[df_application_record["NAME_INCOME_TYPE"]=="Commercial associate","OCCUPATION_TYPE"] = "Commercial associate"
df_application_record.loc[df_application_record["NAME_INCOME_TYPE"]=="State servant","OCCUPATION_TYPE"] = "State servant"
df_application_record.loc[df_application_record["NAME_INCOME_TYPE"]=="Student","OCCUPATION_TYPE"] = "Student"
df_application_record = df_application_record.dropna()

In [8]:
df_application_record['Work_Time'] = -(df_application_record['DAYS_EMPLOYED'])//365

df_application_record = df_application_record.drop(df_application_record[df_application_record['Work_Time']>50].index)
df_application_record = df_application_record.drop(df_application_record[df_application_record['Work_Time']<0].index)
df_application_record = df_application_record.drop(columns=['STATUS'])
df_application_record.drop(['DAYS_EMPLOYED'],axis=1,inplace=True)


In [9]:
baseline_date = pd.to_datetime('2023-01-01')
df_application_record['BIRTH_DATE'] = baseline_date + pd.to_timedelta(df_application_record['DAYS_BIRTH'], unit='D')
df_application_record['AGE'] = (baseline_date - df_application_record['BIRTH_DATE']).dt.days // 365
df_application_record = df_application_record.drop(columns=['DAYS_BIRTH','BIRTH_DATE'])

In [10]:
onehot = False
if onehot:
    categorical_columns = ['CODE_GENDER', 'NAME_FAMILY_STATUS', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']
    dummy_columns = ['NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE']
else:
    categorical_columns = ['CODE_GENDER', 'NAME_FAMILY_STATUS', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE']
    dummy_columns = []
categorical_df = df_application_record[categorical_columns]
categorical_df = categorical_df.apply(lambda x: pd.factorize(x)[0])
categorical_df = pd.DataFrame(categorical_df)
df_application_record[categorical_columns] = categorical_df
df_application_record = pd.get_dummies(df_application_record, columns=dummy_columns)
df_application_record.to_csv('dataset.csv', index=False)

# 创建数据集

In [11]:
# scaler = MinMaxScaler()
# df_application_record['AMT_INCOME_TOTAL']=scaler.fit_transform(df_application_record['AMT_INCOME_TOTAL'].values.reshape(-1, 1))
# df_application_record['DAYS_EMPLOYED']=scaler.fit_transform(df_application_record['DAYS_EMPLOYED'].values.reshape(-1, 1))
# df_application_record['MONTHS_BALANCE']=scaler.fit_transform(df_application_record['MONTHS_BALANCE'].values.reshape(-1, 1))
# scaler = StandardScaler()
# df_application_record['CNT_FAM_MEMBERS']=scaler.fit_transform(df_application_record['CNT_FAM_MEMBERS'].values.reshape(-1, 1))
# df_application_record['AGE']=scaler.fit_transform(df_application_record['AGE'].values.reshape(-1, 1))

negative_data_orgin = df_application_record[df_application_record['APPROVED']==0]
negative_data = negative_data_orgin.drop(['APPROVED', 'ID','CODE_GENDER'], axis = 1)

X = df_application_record.drop(['APPROVED', 'ID','CODE_GENDER'], axis = 1)
y = df_application_record['APPROVED']
X = np.array(X,dtype=float)
y = np.array(y, dtype=int)

X_train, X_test, y_train, y_test = train_test_split(X, y)

scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)
X_test = scalar.fit_transform(X_test)

negative_data = scalar.fit_transform(np.array(negative_data,dtype=float))
negative_label_list = np.zeros(len(negative_data))


# 创建 RandomUnderSampler 对象
undersampler = RandomUnderSampler(sampling_strategy='majority')

# 使用 RandomUnderSampler 来生成平衡的训练集
X_train_under_random, y_train_under_random = undersampler.fit_resample(X_train, y_train)

# 创建RandomOverSampler对象
oversampler = RandomOverSampler(sampling_strategy='minority')

# 使用RandomOverSampler来生成平衡的训练集
X_train_over_random, y_train_over_random = oversampler.fit_resample(X_train, y_train)

# 创建TomekLinks对象
undersampler = TomekLinks()

# 使用TomekLinks来生成平衡的训练集
X_train_under_tomelinks, y_train_under_tomelinks = undersampler.fit_resample(X_train, y_train)

# 创建SMOTE对象
smote = SMOTE(sampling_strategy='minority',random_state=42)

# 使用SMOTE来生成平衡的训练集
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [12]:
data_list = [
             (X_train, y_train, "Original Data"),
             (X_train_over_random,y_train_over_random, "Over-sampled Data"),
             (X_train_under_random,y_train_under_random, "Under-sampled Data"),
             (X_train_under_tomelinks,y_train_under_tomelinks, "Tomelinks Data"),
             (X_train_smote,y_train_smote, "SMOTE Data")
            ]


# 分类

In [13]:
performance_data= []

## LightGBM

In [14]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'learning_rate': 0.05,
    'reg_lambda': 1.,
    'reg_alpha': .1,
    'max_depth': 5,
    'n_estimators': 100,
    'colsample_bytree': .5,
    'min_child_samples': 100,
    'subsample': .9,
    'importance_type': 'gain',
    'random_state': 71,
    'num_leaves': 32,
    'force_col_wise': True,
    'scale_pos_weight': 1,
    'bagging_freq': 5,
}


In [15]:
index = 0

performance_data = []
for X_train_processed, y_train_processed, method_name in data_list:
    lgb_model = lgb.LGBMClassifier(**params, verbose=-1)

    lgb_model.fit(X_train_processed, y_train_processed)

    print(len(negative_data))
    y_pred = lgb_model.predict(X_test)
    y_prob = lgb_model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(classification_report(y_test, y_pred))
    negative_data_pred = lgb_model.predict(negative_data)
    negative_accuracy = accuracy_score(negative_label_list, negative_data_pred)

    print(
        classification_report(negative_label_list,
                              negative_data_pred,
                              zero_division=1))

    performance_data.append({
        'Classification Method' : 'LightGBM',
        'Data Process Method': method_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Negative Accuracy': negative_accuracy,
    })
    index += 1


300
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        86
           1       0.98      1.00      0.99      3663

    accuracy                           0.98      3749
   macro avg       0.49      0.50      0.49      3749
weighted avg       0.95      0.98      0.97      3749

              precision    recall  f1-score   support

         0.0       1.00      0.00      0.00     300.0
         1.0       0.00      1.00      0.00       0.0

    accuracy                           1.00     300.0
   macro avg       0.50      0.50      0.00     300.0
weighted avg       1.00      0.00      0.00     300.0

300
              precision    recall  f1-score   support

           0       0.03      0.24      0.05        86
           1       0.98      0.82      0.89      3663

    accuracy                           0.81      3749
   macro avg       0.50      0.53      0.47      3749
weighted avg       0.96      0.81      0.87      3749

           

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


300
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        86
           1       0.98      1.00      0.99      3663

    accuracy                           0.98      3749
   macro avg       0.49      0.50      0.49      3749
weighted avg       0.95      0.98      0.97      3749

              precision    recall  f1-score   support

         0.0       1.00      0.00      0.00     300.0
         1.0       0.00      1.00      0.00       0.0

    accuracy                           1.00     300.0
   macro avg       0.50      0.50      0.00     300.0
weighted avg       1.00      0.00      0.00     300.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


300
              precision    recall  f1-score   support

           0       0.02      0.99      0.04        86
           1       0.97      0.01      0.02      3663

    accuracy                           0.03      3749
   macro avg       0.50      0.50      0.03      3749
weighted avg       0.95      0.03      0.02      3749

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       300

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300



## RandomForest

In [16]:
for X_train_processed, y_train_processed, method_name in data_list:
    rfc = RandomForestClassifier(n_estimators=1000, max_features=12)
    rfc.fit(X_train_processed, y_train_processed)
    predictions = rfc.predict(X_test)
    print(f"Classification Report for {method_name} on Test Data:")
    print(classification_report(y_test, predictions))
    negative_predictions = rfc.predict(negative_data)
    print(f"Classification Report for {method_name} on Negative Data:")
    print(
        classification_report(negative_label_list,
                              negative_predictions,
                              zero_division=1))
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    performance_data.append({
        'Classification Method': 'Random Forest',
        'Data Process Method': method_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Negative Accuracy': negative_accuracy,
    })

Classification Report for Original Data on Test Data:
              precision    recall  f1-score   support

           0       0.13      0.03      0.06        86
           1       0.98      0.99      0.99      3663

    accuracy                           0.97      3749
   macro avg       0.55      0.51      0.52      3749
weighted avg       0.96      0.97      0.96      3749

Classification Report for Original Data on Negative Data:
              precision    recall  f1-score   support

         0.0       1.00      0.01      0.01       300
         1.0       0.00      1.00      0.00         0

    accuracy                           0.01       300
   macro avg       0.50      0.50      0.01       300
weighted avg       1.00      0.01      0.01       300

Classification Report for Over-sampled Data on Test Data:
              precision    recall  f1-score   support

           0       0.10      0.15      0.12        86
           1       0.98      0.97      0.97      3663

    accuracy

## AdaBoost for Desicion Tree

In [17]:
for X_train_processed, y_train_processed, method_name in data_list:
    # 使用决策树桩作为弱分类器，也可以选择其他弱分类器
    base_classifier = DecisionTreeClassifier(max_depth=1)

    # 使用AdaBoost分类器
    adaboost = AdaBoostClassifier(base_classifier,
                                  n_estimators=1000,
                                  algorithm='SAMME',
                                  random_state=42)

    # 训练模型
    adaboost.fit(X_train_processed, y_train_processed)

    # 在测试集上进行预测和评估
    predictions_test = adaboost.predict(X_test)
    print(f"Classification Report for {method_name} on Test Data:")
    print(classification_report(y_test, predictions_test))

    # 在负样本数据上进行预测和评估
    predictions_negative = adaboost.predict(negative_data)
    print(f"Classification Report for {method_name} on Negative Data:")
    print(
        classification_report(negative_label_list,
                              predictions_negative,
                              zero_division=1))
    accuracy = accuracy_score(y_test, predictions_test)
    precision = precision_score(y_test, predictions_test)
    recall = recall_score(y_test, predictions_test)
    f1 = f1_score(y_test, predictions_test)
    performance_data.append({
        'Classification Method': 'AdaBoost for Decision Tree',
        'Data Process Method': method_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Negative Accuracy': negative_accuracy,
    })

Classification Report for Original Data on Test Data:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        86
           1       0.98      1.00      0.99      3663

    accuracy                           0.98      3749
   macro avg       0.49      0.50      0.49      3749
weighted avg       0.95      0.98      0.97      3749

Classification Report for Original Data on Negative Data:
              precision    recall  f1-score   support

         0.0       1.00      0.00      0.00     300.0
         1.0       0.00      1.00      0.00       0.0

    accuracy                           1.00     300.0
   macro avg       0.50      0.50      0.00     300.0
weighted avg       1.00      0.00      0.00     300.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report for Over-sampled Data on Test Data:
              precision    recall  f1-score   support

           0       0.03      0.38      0.05        86
           1       0.98      0.66      0.79      3663

    accuracy                           0.65      3749
   macro avg       0.50      0.52      0.42      3749
weighted avg       0.96      0.65      0.77      3749

Classification Report for Over-sampled Data on Negative Data:
              precision    recall  f1-score   support

         0.0       1.00      0.40      0.57       300
         1.0       0.00      1.00      0.00         0

    accuracy                           0.40       300
   macro avg       0.50      0.70      0.29       300
weighted avg       1.00      0.40      0.57       300

Classification Report for Under-sampled Data on Test Data:
              precision    recall  f1-score   support

           0       0.03      0.55      0.05        86
           1       0.98      0.54      0.69      3663

   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report for SMOTE Data on Test Data:
              precision    recall  f1-score   support

           0       0.02      1.00      0.04        86
           1       1.00      0.00      0.00      3663

    accuracy                           0.02      3749
   macro avg       0.51      0.50      0.02      3749
weighted avg       0.98      0.02      0.00      3749

Classification Report for SMOTE Data on Negative Data:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       300

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300



## SVM

In [18]:
for X_train_processed, y_train_processed, method_name in data_list:
   # 使用 SVM 替代 RandomForestClassifier
   svm_model = SVC()
   svm_model.fit(X_train_processed, y_train_processed)

   # SVM 在测试集上的分类报告
   predictions_svm_test = svm_model.predict(X_test)
   print(f"Method: {method_name} - SVM Classification Report on Test Data:")
   print(classification_report(y_test, predictions_svm_test))

   # SVM 在负样本上的分类报告
   predictions_svm_negative = svm_model.predict(negative_data)
   print(
      f"Method: {method_name} - SVM Classification Report on Negative Data:")
   print(
      classification_report(negative_label_list,
                           predictions_negative,
                           zero_division=1))
   accuracy = accuracy_score(y_test, predictions_svm_test)
   precision = precision_score(y_test, predictions_svm_test)
   recall = recall_score(y_test, predictions_svm_test)
   f1 = f1_score(y_test, predictions_svm_test)
   performance_data.append({
       'Classification Method': 'SVM',
       'Data Process Method': method_name,
       'Accuracy': accuracy,
       'Precision': precision,
       'Recall': recall,
       'F1 Score': f1,
       'Negative Accuracy': negative_accuracy,
   })


Method: Original Data - SVM Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        86
           1       0.98      1.00      0.99      3663

    accuracy                           0.98      3749
   macro avg       0.49      0.50      0.49      3749
weighted avg       0.95      0.98      0.97      3749

Method: Original Data - SVM Classification Report on Negative Data:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       300

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Method: Over-sampled Data - SVM Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.03      0.26      0.06        86
           1       0.98      0.82      0.89      3663

    accuracy                           0.81      3749
   macro avg       0.51      0.54      0.48      3749
weighted avg       0.96      0.81      0.88      3749

Method: Over-sampled Data - SVM Classification Report on Negative Data:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       300

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300

Method: Under-sampled Data - SVM Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.03      0.59      0.05        86
           1       0.98      0.52      0.68      3663

    accuracy               

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Method: SMOTE Data - SVM Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.03      0.20      0.06        86
           1       0.98      0.86      0.92      3663

    accuracy                           0.84      3749
   macro avg       0.51      0.53      0.49      3749
weighted avg       0.96      0.84      0.90      3749

Method: SMOTE Data - SVM Classification Report on Negative Data:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       300

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300



## AdaBoost for SVM

In [19]:
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.svm import SVC
# from sklearn.metrics import classification_report

# for X_train_processed, y_train_processed, method_name in data_list:
#     # 使用支持向量机作为弱分类器
#     base_classifier = SVC(kernel='linear', C=1.0)

#     # 使用AdaBoost分类器
#     adaboost = AdaBoostClassifier(base_classifier,
#                                   n_estimators=1000,
#                                   algorithm='SAMME',
#                                   random_state=42)

#     # 训练模型
#     adaboost.fit(X_train_processed, y_train_processed)

#     # 在测试集上进行预测和评估
#     predictions_test = adaboost.predict(X_test)
#     print(f"Classification Report for {method_name} on Test Data:")
#     print(classification_report(y_test, predictions_test))

#     # 在负样本数据上进行预测和评估
#     predictions_negative = adaboost.predict(negative_data)
#     print(f"Classification Report for {method_name} on Negative Data:")
#     print(
#         classification_report(negative_label_list,
#                               predictions_negative,
#                               zero_division=1))
#     accuracy = accuracy_score(y_test, predictions_test)
#     precision = precision_score(y_test, predictions_test)
#     recall = recall_score(y_test, predictions_test)
#     f1 = f1_score(y_test, predictions_test)
#     performance_data.append({
#         'Classification Method': 'AdaBoost for SVM',
#         'Data Process Method': method_name,
#         'Accuracy': accuracy,
#         'Precision': precision,
#         'Recall': recall,
#         'F1 Score': f1,
#         'Negative Accuracy': negative_accuracy,
#     })

In [20]:
# index = 0
# for X_train_processed, y_train_processed, method_name in data_list:
#     svm_model = SVC()
#     train_feature = train_features[index]
#     test_feature = test_features[index]
#     svm_model.fit(train_feature, y_train_processed)
#     predictions = svm_model.predict(test_feature)
#     print(classification_report(y_test,predictions))
#     predictions = svm_model.predict(negative_data_list[index])
#    print(classification_report(negative_label_list, predictions_negative, zero_division=1))
#     index = index + 1

# 特征提取

In [21]:
model_list = []
for X_train_processed, _, method_name in data_list:
   print("Training for the method: " + method_name)
   model = train_vae(X_train=X_train_processed,X_test=X_test, progress=False,num_epoch=250).eval()
   model_list.append(model)

total_list = [ t+ (data,)  for t, data in zip(data_list, model_list)]

index = 0
train_features = []
test_features = []
negative_data_list = []
for X_train_processed, _, method_name, __ in total_list:
    train_features.append(model_list[index].encoder(torch.Tensor(X_train_processed).to(device)).cpu().detach().numpy())
    test_features.append(model_list[index].encoder(torch.Tensor(X_test).to(device)).detach().cpu().numpy())
    negative_data_list.append(model_list[index].encoder(torch.Tensor(negative_data).to(device)).detach().cpu().numpy())
    index+=1

   
# for X_train_processed, _, method_name, model in model_list:
#     model.eval()
#     with torch.no_grad():
#         encoded_data = model.encoder(torch.Tensor(X_train_processed).to(device))
#         encoded_data = encoded_data.cpu().numpy()
#         tsne = TSNE(n_components=2)
#         reduced_data = tsne.fit_transform(encoded_data)

# plt.scatter(reduced_data[:, 0], reduced_data[:, 1])
# plt.title("VAE Visualization")
# plt.show()

Training for the method: Original Data


                                                            

Best epoch: 249
Training for the method: Over-sampled Data


                                                            

Best epoch: 249
Training for the method: Under-sampled Data


                                                    

Best epoch: 249
Training for the method: Tomelinks Data


                                                            

Best epoch: 249
Training for the method: SMOTE Data


                                                            

Best epoch: 249


## LightGBM for VAE data

In [22]:
index = 0

performance_data = []
data_df = pd.DataFrame(columns=['Method', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'Negative Accuracy'])

for X_train_processed, y_train_processed, method_name, _ in total_list:
    lgb_model = lgb.LGBMClassifier(**params, verbose=-1)

    lgb_model.fit(train_features[index], y_train_processed)

    print(len(negative_data))
    y_pred = lgb_model.predict(test_features[index])
    y_prob = lgb_model.predict_proba(test_features[index])[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(classification_report(y_test, y_pred))
    negative_data_pred = lgb_model.predict(negative_data_list[index])
    negative_accuracy = accuracy_score(negative_label_list, negative_data_pred)

    print(
        classification_report(negative_label_list,
                              negative_data_pred,
                              zero_division=1))

    performance_data.append({
        'Classification Method': 'LightGBM for VAE data',
        'Data Process Method': method_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Negative Accuracy': negative_accuracy,
    })
    index += 1

data_df = pd.DataFrame(performance_data)
print(data_df)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


300
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        86
           1       0.98      1.00      0.99      3663

    accuracy                           0.98      3749
   macro avg       0.49      0.50      0.49      3749
weighted avg       0.95      0.98      0.97      3749

              precision    recall  f1-score   support

         0.0       1.00      0.00      0.00     300.0
         1.0       0.00      1.00      0.00       0.0

    accuracy                           1.00     300.0
   macro avg       0.50      0.50      0.00     300.0
weighted avg       1.00      0.00      0.00     300.0

300
              precision    recall  f1-score   support

           0       0.03      0.26      0.05        86
           1       0.98      0.81      0.89      3663

    accuracy                           0.80      3749
   macro avg       0.50      0.53      0.47      3749
weighted avg       0.96      0.80      0.87      3749

           

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        86
           1       0.98      1.00      0.99      3663

    accuracy                           0.98      3749
   macro avg       0.49      0.50      0.49      3749
weighted avg       0.95      0.98      0.97      3749

              precision    recall  f1-score   support

         0.0       1.00      0.00      0.00     300.0
         1.0       0.00      1.00      0.00       0.0

    accuracy                           1.00     300.0
   macro avg       0.50      0.50      0.00     300.0
weighted avg       1.00      0.00      0.00     300.0

300
              precision    recall  f1-score   support

           0       0.02      0.38      0.05        86
           1       0.98      0.64      0.77      3663

    accuracy                           0.64      3749
   macro avg       0.50      0.51      0.41      3749
weighted avg       0.96      0.64      0.76      3749

              p

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## SVM for VAE data

In [23]:
index = 0
for X_train_processed, y_train_processed, method_name in data_list:
    svm_model = SVC()
    train_feature = train_features[index]
    test_feature = test_features[index]
    svm_model.fit(train_feature, y_train_processed)
    predictions = svm_model.predict(test_feature)
    print(classification_report(y_test, predictions))
    negative_predictions= svm_model.predict(negative_data_list[index])
    print(
        classification_report(negative_label_list,
                              negative_predictions,
                              zero_division=1))
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    negative_accuracy = accuracy_score(negative_label_list, negative_predictions)
    performance_data.append({
        'Classification Method':
        'SVM for VAE data',
        'Data Process Method':
        method_name,
        'Accuracy':
        accuracy,
        'Precision':
        precision,
        'Recall':
        recall,
        'F1 Score':
        f1,
        'Negative Accuracy':
        negative_accuracy,
    })
    index = index + 1


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        86
           1       0.98      1.00      0.99      3663

    accuracy                           0.98      3749
   macro avg       0.49      0.50      0.49      3749
weighted avg       0.95      0.98      0.97      3749

              precision    recall  f1-score   support

         0.0       1.00      0.00      0.00     300.0
         1.0       0.00      1.00      0.00       0.0

    accuracy                           1.00     300.0
   macro avg       0.50      0.50      0.00     300.0
weighted avg       1.00      0.00      0.00     300.0

              precision    recall  f1-score   support

           0       0.04      0.36      0.07        86
           1       0.98      0.78      0.87      3663

    accuracy                           0.77      3749
   macro avg       0.51      0.57      0.47      3749
weighted avg       0.96      0.77      0.85      3749

              preci

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        86
           1       0.98      1.00      0.99      3663

    accuracy                           0.98      3749
   macro avg       0.49      0.50      0.49      3749
weighted avg       0.95      0.98      0.97      3749

              precision    recall  f1-score   support

         0.0       1.00      0.00      0.00     300.0
         1.0       0.00      1.00      0.00       0.0

    accuracy                           1.00     300.0
   macro avg       0.50      0.50      0.00     300.0
weighted avg       1.00      0.00      0.00     300.0

              precision    recall  f1-score   support

           0       0.02      0.33      0.04        86
           1       0.98      0.69      0.81      3663

    accuracy                           0.68      3749
   macro avg       0.50      0.51      0.43      3749
weighted avg       0.96      0.68      0.79      3749

              preci

## AdaBoost for Desicion Tree for VAE data

In [24]:
index = 0
for X_train_processed, y_train_processed, method_name in data_list:

    train_feature = train_features[index]
    test_feature = test_features[index]
    # 使用决策树桩作为弱分类器，也可以选择其他弱分类器
    base_classifier = DecisionTreeClassifier(max_depth=1)

    # 使用AdaBoost分类器
    adaboost = AdaBoostClassifier(base_classifier,
                                  n_estimators=1000,
                                  algorithm='SAMME',
                                  random_state=42)

    # 训练模型
    adaboost.fit(train_feature, y_train_processed)

    # 在测试集上进行预测和评估
    predictions_test = adaboost.predict(test_feature)
    print(f"Classification Report for {method_name} on Test Data:")
    print(classification_report(y_test, predictions_test))

    # 在负样本数据上进行预测和评估
    predictions_negative = adaboost.predict(negative_data_list[index])
    print(f"Classification Report for {method_name} on Negative Data:")
    print(
        classification_report(negative_label_list,
                              predictions_negative,
                              zero_division=1))
    accuracy = accuracy_score(y_test, predictions_test)
    precision = precision_score(y_test, predictions_test)
    recall = recall_score(y_test, predictions_test)
    f1 = f1_score(y_test, predictions_test)
    negative_accuracy = accuracy_score(negative_label_list, predictions_negative)
    performance_data.append({
        'Classification Method': 'AdaBoost for Decision Tree',
        'Data Process Method': method_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Negative Accuracy': negative_accuracy,
    })
    index += 1

Classification Report for Original Data on Test Data:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        86
           1       0.98      1.00      0.99      3663

    accuracy                           0.98      3749
   macro avg       0.49      0.50      0.49      3749
weighted avg       0.95      0.98      0.97      3749

Classification Report for Original Data on Negative Data:
              precision    recall  f1-score   support

         0.0       1.00      0.00      0.00     300.0
         1.0       0.00      1.00      0.00       0.0

    accuracy                           1.00     300.0
   macro avg       0.50      0.50      0.00     300.0
weighted avg       1.00      0.00      0.00     300.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report for Over-sampled Data on Test Data:
              precision    recall  f1-score   support

           0       0.02      0.34      0.04        86
           1       0.98      0.67      0.79      3663

    accuracy                           0.66      3749
   macro avg       0.50      0.50      0.42      3749
weighted avg       0.96      0.66      0.78      3749

Classification Report for Over-sampled Data on Negative Data:
              precision    recall  f1-score   support

         0.0       1.00      0.47      0.64       300
         1.0       0.00      1.00      0.00         0

    accuracy                           0.47       300
   macro avg       0.50      0.73      0.32       300
weighted avg       1.00      0.47      0.64       300

Classification Report for Under-sampled Data on Test Data:
              precision    recall  f1-score   support

           0       0.03      0.63      0.06        86
           1       0.98      0.52      0.68      3663

   

# Ensemble

In [25]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metrics': 'binary_logloss',
    'learning_rate': 0.05,
    'reg_lambda': 1.,
    'reg_alpha': .1,
    'max_depth': 5,
    'n_estimators': 100,
    'colsample_bytree': .5,
    'min_child_samples': 100,
    'subsample': .9,
    'importance_type': 'gain',
    'random_state': 71,
    'num_leaves': 32,
    'force_col_wise': True,
    'scale_pos_weight': 1,
    'bagging_freq': 5,
}

## Stacking

In [26]:
# 使用决策树桩作为弱分类器，也可以选择其他弱分类器
base_classifier = DecisionTreeClassifier(max_depth=1)

# 使用AdaBoost分类器
adaboost = AdaBoostClassifier(base_classifier,
                              n_estimators=1000,
                              algorithm='SAMME',
                              random_state=42)

# 训练模型
adaboost.fit(X_train_under_random, y_train_under_random)

# 在测试集上进行预测和评估
predictions_ada = adaboost.predict(X_train_under_random)
negative_accuracy_1 = adaboost.predict(negative_data)

In [27]:
# 使用LightGBM分类器
lgb_model = lgb.LGBMClassifier(**params, verbose=-1)

# 训练模型
lgb_model.fit(X_train_under_random, y_train_under_random)

# 在测试集上进行预测和评估
predictions_lgbm = lgb_model.predict(X_train_under_random)
negative_accuracy_2 = lgb_model.predict(negative_data)

## Voting

In [28]:
# 创建元模型
meta_model = RandomForestClassifier(n_estimators=100, random_state=42)

# 将基模型的预测结果作为特征
X_ensemble = np.column_stack((predictions_ada, predictions_lgbm))
negative_predictions = np.column_stack((negative_accuracy_1,negative_accuracy_2))

# 划分数据集
X_train_ensemble, X_test_ensemble, y_train_ensemble, y_test_ensemble = train_test_split(X_ensemble, y_train_under_random, test_size=0.2, random_state=42)

# 训练元模型
meta_model.fit(X_train_ensemble, y_train_ensemble)

# 预测
ensemble_predictions = meta_model.predict(X_test_ensemble)
negative_predictions = meta_model.predict(negative_predictions)

# 计算新的指标
accuracy_ensemble = accuracy_score(y_test_ensemble, ensemble_predictions)
negative_accuracy_ensemble = accuracy_score(negative_label_list, negative_predictions)
precision_ensemble = precision_score(y_test_ensemble, ensemble_predictions)
recall_ensemble = recall_score(y_test_ensemble, ensemble_predictions)
f1_ensemble = f1_score(y_test_ensemble, ensemble_predictions)

# 打印新的指标
print("Ensemble Model Metrics:")
print(f"Accuracy: {accuracy_ensemble:.4f}")
print(f"Precision: {precision_ensemble:.4f}")
print(f"Recall: {recall_ensemble:.4f}")
print(f"F1 Score: {f1_ensemble:.4f}")
print(f"Negative Accuracy: {negative_accuracy_ensemble:.4f}")

# 打印分类报告
print("Classification Report for Ensemble Model:")
print(classification_report(y_test_ensemble, ensemble_predictions))
performance_data.append({
    'Classification Method': 'Stacking Ensemble Model for AdaBoost and LightGBM',
    'Data Process Method': 'Under-sampled Data',
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1,
    'Negative Accuracy': negative_accuracy_ensemble,
})

Ensemble Model Metrics:
Accuracy: 0.8256
Precision: 0.7500
Recall: 0.8571
F1 Score: 0.8000
Negative Accuracy: 0.5233
Classification Report for Ensemble Model:
              precision    recall  f1-score   support

           0       0.89      0.80      0.85        51
           1       0.75      0.86      0.80        35

    accuracy                           0.83        86
   macro avg       0.82      0.83      0.82        86
weighted avg       0.83      0.83      0.83        86



In [29]:
# 创建投票分类器
voting_model = VotingClassifier(estimators=[('adaboost', adaboost), ('lgbm', lgb_model)], voting='hard')

# 训练投票模型
voting_model.fit(X_train_under_random, y_train_under_random)

# 预测
ensemble_predictions = voting_model.predict(X_test)
negative_predictions = voting_model.predict(negative_data)

# 计算新的指标
accuracy_ensemble = accuracy_score(y_test, ensemble_predictions)
negative_accuracy_ensemble = accuracy_score(negative_label_list, negative_predictions)
precision_ensemble = precision_score(y_test, ensemble_predictions)
recall_ensemble = recall_score(y_test, ensemble_predictions)
f1_ensemble = f1_score(y_test, ensemble_predictions)

# 打印新的指标
print("Ensemble Model Metrics:")
print(f"Accuracy: {accuracy_ensemble:.4f}")
print(f"Precision: {precision_ensemble:.4f}")
print(f"Recall: {recall_ensemble:.4f}")
print(f"F1 Score: {f1_ensemble:.4f}")
print(f"Negative Accuracy: {negative_accuracy_ensemble:.4f}")
performance_data.append({
    'Classification Method': 'Voting Ensemble Model for AdaBoost and LightGBM',
    'Data Process Method': 'Under-sampled Data',
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1,
    'Negative Accuracy': negative_accuracy_ensemble,
})

# 打印分类报告
print("Classification Report for Ensemble Model:")
print(classification_report(y_test, ensemble_predictions))


Ensemble Model Metrics:
Accuracy: 0.4425
Precision: 0.9828
Recall: 0.4371
F1 Score: 0.6051
Negative Accuracy: 0.6033
Classification Report for Ensemble Model:
              precision    recall  f1-score   support

           0       0.03      0.67      0.05        86
           1       0.98      0.44      0.61      3663

    accuracy                           0.44      3749
   macro avg       0.51      0.56      0.33      3749
weighted avg       0.96      0.44      0.59      3749



# Anomaly Detection

## VAE

In [35]:
Anomal_model_list = []

for X_train_processed, _, method_name in data_list:
   print("Training for the method: " + method_name)
   model = train_vae_anomaly_detection(X_train=X_train_processed,X_test=X_test, progress=True,num_epoch=250).eval()
   Anomal_model_list.append(model)

Anomal_list = [ t+ (data,)  for t, data in zip(data_list, Anomal_model_list)]


Training for the method: Original Data


                                                            

Best epoch: 249
Training for the method: Over-sampled Data


                                                            

Best epoch: 249
Training for the method: Under-sampled Data


                                                    

Best epoch: 249
Training for the method: Tomelinks Data


                                                            

Best epoch: 249
Training for the method: SMOTE Data


                                                            

Best epoch: 249




In [36]:
index = 0
print(len(Anomal_list))
for X_train_processed, _, method_name, model in Anomal_list:
    predictions = model.predict_anomaly(
        torch.Tensor(X_test).to(device),
        threshold=0.05).detach().cpu().numpy()
    negative_data_pred = model.predict_anomaly(
        torch.Tensor(negative_data).to(device),
        threshold=0.05).detach().cpu().numpy()

    # 计算评估指标
    report_test = classification_report(y_test, predictions, output_dict=True)
    report_negative = classification_report(negative_label_list, negative_data_pred, output_dict=True, zero_division=1)

    # 提取指标
    accuracy = accuracy_score(y_test, predictions)
    precision = report_test['1']['precision']
    recall = report_test['1']['recall']
    f1 = report_test['1']['f1-score']
    negative_accuracy = accuracy_score(negative_label_list, negative_data_pred)

    print(classification_report(y_test, predictions))
    print(
        classification_report(negative_label_list,
                              negative_data_pred,
                              zero_division=1))
    # 添加到 performance_data 列表中
    performance_data.append({
        'Classification Method': 'Anomaly Detection with VAE',
        'Data Process Method': method_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Negative Accuracy': negative_accuracy,
    })

    index += 1

# 将列表转换为 DataFrame
df_performance = pd.DataFrame(performance_data)
df_performance.to_csv('performance_data.csv', index=False)

5
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        86
           1       0.98      1.00      0.99      3663

    accuracy                           0.98      3749
   macro avg       0.49      0.50      0.49      3749
weighted avg       0.95      0.98      0.97      3749

              precision    recall  f1-score   support

         0.0       1.00      0.00      0.00     300.0
         1.0       0.00      1.00      0.00       0.0

    accuracy                           1.00     300.0
   macro avg       0.50      0.50      0.00     300.0
weighted avg       1.00      0.00      0.00     300.0

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        86
           1       0.98      1.00      0.99      3663

    accuracy                           0.98      3749
   macro avg       0.49      0.50      0.49      3749
weighted avg       0.95      0.98      0.97      3749

              pre

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [37]:
data_df = pd.DataFrame(performance_data)
data_df = data_df.round({'Accuracy': 3, 'Precision': 3, 'Recall': 3, 'F1 Score': 3, 'Negative Accuracy': 3})

# 找到最佳性能数据的行
best_accuracy_row = data_df[data_df['Accuracy'] == data_df['Accuracy'].max()]
best_precision_row = data_df[data_df['Precision'] == data_df['Precision'].max()]
best_recall_row = data_df[data_df['Recall'] == data_df['Recall'].max()]
best_f1_score_row = data_df[data_df['F1 Score'] == data_df['F1 Score'].max()]
best_negative_accuracy_row = data_df[data_df['Negative Accuracy'] == data_df['Negative Accuracy'].max()]

# 输出最佳性能数据
print("best Accuracy Classification Method:", best_accuracy_row['Classification Method'].values[0])
print("Best Accuracy Data Process Method:", best_accuracy_row['Data Process Method'].values[0])
print("Best Accuracy Value:", best_accuracy_row['Accuracy'].values[0])

print("best Accuracy Classification Method:", best_precision_row['Classification Method'].values[0])
print("Best Precision Data Process Method:", best_precision_row['Data Process Method'].values[0])
print("Best Precision Value:", best_precision_row['Precision'].values[0])

print("best Accuracy Classification Method:", best_recall_row['Classification Method'].values[0])
print("Best Recall Data Process Method:", best_recall_row['Data Process Method'].values[0])
print("Best Recall Value:", best_recall_row['Recall'].values[0])

print("best Accuracy Classification Method:", best_f1_score_row['Classification Method'].values[0])
print("Best F1 Score Data Process Method:", best_f1_score_row['Data Process Method'].values[0])
print("Best F1 Score Value:", best_f1_score_row['F1 Score'].values[0])


print("best Accuracy Classification Method:", best_negative_accuracy_row['Classification Method'].values[0])
print("Best Negative Accuracy Data Process Method:", best_negative_accuracy_row['Data Process Method'].values[0])
print("Best Negative Accuracy Value:", best_negative_accuracy_row['Negative Accuracy'].values[0])
data_df.to_csv('output/performance_data.csv', index=False)

[{'Classification Method': 'LightGBM for VAE data', 'Data Process Method': 'Original Data', 'Accuracy': 0.9770605494798613, 'Precision': 0.9770605494798613, 'Recall': 1.0, 'F1 Score': 0.9883971937398812, 'Negative Accuracy': 0.0}, {'Classification Method': 'LightGBM for VAE data', 'Data Process Method': 'Over-sampled Data', 'Accuracy': 0.7951453720992264, 'Precision': 0.9788289778365862, 'Recall': 0.8078078078078078, 'F1 Score': 0.8851331139694885, 'Negative Accuracy': 0.33666666666666667}, {'Classification Method': 'LightGBM for VAE data', 'Data Process Method': 'Under-sampled Data', 'Accuracy': 0.48306214990664176, 'Precision': 0.9826524902070509, 'Recall': 0.4793884793884794, 'F1 Score': 0.6444036697247707, 'Negative Accuracy': 0.5666666666666667}, {'Classification Method': 'LightGBM for VAE data', 'Data Process Method': 'Tomelinks Data', 'Accuracy': 0.9770605494798613, 'Precision': 0.9770605494798613, 'Recall': 1.0, 'F1 Score': 0.9883971937398812, 'Negative Accuracy': 0.0}, {'Class