# 引入必要库

In [68]:
#import needed libraries、
import pandas as pd
import numpy as np
import torch
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.manifold import TSNE
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from imblearn.over_sampling import RandomOverSampler, SMOTE
import matplotlib.pyplot as plt
from VAE_pipeline import train_vae

# 选择设备
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# 数据准备

In [69]:
df_application_record = pd.read_csv("application_record.csv")
df_credit_record = pd.read_csv("credit_record.csv")

print(df_application_record.info())
print(df_application_record.nunique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438557 entries, 0 to 438556
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ID                   438557 non-null  int64  
 1   CODE_GENDER          438557 non-null  object 
 2   FLAG_OWN_CAR         438557 non-null  object 
 3   FLAG_OWN_REALTY      438557 non-null  object 
 4   CNT_CHILDREN         438557 non-null  int64  
 5   AMT_INCOME_TOTAL     438557 non-null  float64
 6   NAME_INCOME_TYPE     438557 non-null  object 
 7   NAME_EDUCATION_TYPE  438557 non-null  object 
 8   NAME_FAMILY_STATUS   438557 non-null  object 
 9   NAME_HOUSING_TYPE    438557 non-null  object 
 10  DAYS_BIRTH           438557 non-null  int64  
 11  DAYS_EMPLOYED        438557 non-null  int64  
 12  FLAG_MOBIL           438557 non-null  int64  
 13  FLAG_WORK_PHONE      438557 non-null  int64  
 14  FLAG_PHONE           438557 non-null  int64  
 15  FLAG_EMAIL       

In [70]:
#For each set of duplicate ID's drop both of them
df_application_record = df_application_record.drop_duplicates(subset = 'ID', keep = False)
print(df_application_record.info())

<class 'pandas.core.frame.DataFrame'>
Index: 438463 entries, 0 to 438556
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ID                   438463 non-null  int64  
 1   CODE_GENDER          438463 non-null  object 
 2   FLAG_OWN_CAR         438463 non-null  object 
 3   FLAG_OWN_REALTY      438463 non-null  object 
 4   CNT_CHILDREN         438463 non-null  int64  
 5   AMT_INCOME_TOTAL     438463 non-null  float64
 6   NAME_INCOME_TYPE     438463 non-null  object 
 7   NAME_EDUCATION_TYPE  438463 non-null  object 
 8   NAME_FAMILY_STATUS   438463 non-null  object 
 9   NAME_HOUSING_TYPE    438463 non-null  object 
 10  DAYS_BIRTH           438463 non-null  int64  
 11  DAYS_EMPLOYED        438463 non-null  int64  
 12  FLAG_MOBIL           438463 non-null  int64  
 13  FLAG_WORK_PHONE      438463 non-null  int64  
 14  FLAG_PHONE           438463 non-null  int64  
 15  FLAG_EMAIL           4

In [71]:
print(df_credit_record.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 3 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   ID              1048575 non-null  int64 
 1   MONTHS_BALANCE  1048575 non-null  int64 
 2   STATUS          1048575 non-null  object
dtypes: int64(2), object(1)
memory usage: 24.0+ MB
None


In [72]:
#show how many unique IDs we will be able to work with in the dataframes
print("# of unique IDs that are consistent between both datasets", df_application_record[df_application_record['ID'].isin(df_credit_record['ID'])]['ID'].nunique())

#adjust the dataframes so that we only work with the consistent IDs
df_application_record = df_application_record[df_application_record['ID'].isin(df_credit_record['ID'])]
df_credit_record = df_credit_record[df_credit_record['ID'].isin(df_application_record['ID'])]
print("New # of IDs in application_record", df_application_record['ID'].nunique())
print("New # of IDs in credit_record", df_credit_record['ID'].nunique())

# of unique IDs that are consistent between both datasets 36457
New # of IDs in application_record 36457
New # of IDs in credit_record 36457


# 数据清洗

In [73]:
df_credit_record['APPROVED'] = df_credit_record['STATUS'].map({'1':0,'2':0,'3':0,'4':0,'5':0,'X':-1,'C':1,'0':1})
df_credit_record = df_credit_record[df_credit_record['APPROVED']!=-1]
print(df_credit_record['STATUS'].value_counts())

STATUS
C    329536
0    290654
1      8747
5      1527
2       801
3       286
4       214
Name: count, dtype: int64


In [74]:
df_application_record = df_application_record.merge(df_credit_record, on='ID')
print(df_application_record.head())

        ID CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY  CNT_CHILDREN  \
0  5008804           M            Y               Y             0   
1  5008804           M            Y               Y             0   
2  5008804           M            Y               Y             0   
3  5008804           M            Y               Y             0   
4  5008804           M            Y               Y             0   

   AMT_INCOME_TOTAL NAME_INCOME_TYPE NAME_EDUCATION_TYPE NAME_FAMILY_STATUS  \
0          427500.0          Working    Higher education     Civil marriage   
1          427500.0          Working    Higher education     Civil marriage   
2          427500.0          Working    Higher education     Civil marriage   
3          427500.0          Working    Higher education     Civil marriage   
4          427500.0          Working    Higher education     Civil marriage   

  NAME_HOUSING_TYPE  ...  DAYS_EMPLOYED  FLAG_MOBIL  FLAG_WORK_PHONE  \
0  Rented apartment  ...          -454

In [75]:
df_application_record = df_application_record[df_application_record['MONTHS_BALANCE']==-4]
df_application_record.loc[df_application_record["NAME_INCOME_TYPE"]=="Pensioner","OCCUPATION_TYPE"] = "Pension"
df_application_record.loc[df_application_record["NAME_INCOME_TYPE"]=="Commercial associate","OCCUPATION_TYPE"] = "Commercial associate"
df_application_record.loc[df_application_record["NAME_INCOME_TYPE"]=="State servant","OCCUPATION_TYPE"] = "State servant"
df_application_record.loc[df_application_record["NAME_INCOME_TYPE"]=="Student","OCCUPATION_TYPE"] = "Student"
df_application_record = df_application_record.dropna()
print(df_application_record.isna().sum())

ID                     0
CODE_GENDER            0
FLAG_OWN_CAR           0
FLAG_OWN_REALTY        0
CNT_CHILDREN           0
AMT_INCOME_TOTAL       0
NAME_INCOME_TYPE       0
NAME_EDUCATION_TYPE    0
NAME_FAMILY_STATUS     0
NAME_HOUSING_TYPE      0
DAYS_BIRTH             0
DAYS_EMPLOYED          0
FLAG_MOBIL             0
FLAG_WORK_PHONE        0
FLAG_PHONE             0
FLAG_EMAIL             0
OCCUPATION_TYPE        0
CNT_FAM_MEMBERS        0
MONTHS_BALANCE         0
STATUS                 0
APPROVED               0
dtype: int64


In [76]:
df_application_record['Work_Time'] = -(df_application_record['DAYS_EMPLOYED'])//365

df_application_record = df_application_record.drop(df_application_record[df_application_record['Work_Time']>50].index)
df_application_record = df_application_record.drop(df_application_record[df_application_record['Work_Time']<0].index)
# df_application_record['Work_Time'].plot(kind='hist',bins=20,density=True)
df_application_record = df_application_record.drop(columns=['STATUS'])
df_application_record.drop(['DAYS_EMPLOYED'],axis=1,inplace=True)
print(df_application_record.head())


          ID CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY  CNT_CHILDREN  \
33   5008806           M            Y               Y             0   
49   5008810           F            N               Y             0   
70   5008811           F            N               Y             0   
145  5008815           M            Y               Y             0   
148  5112956           M            Y               Y             0   

     AMT_INCOME_TOTAL      NAME_INCOME_TYPE            NAME_EDUCATION_TYPE  \
33           112500.0               Working  Secondary / secondary special   
49           270000.0  Commercial associate  Secondary / secondary special   
70           270000.0  Commercial associate  Secondary / secondary special   
145          270000.0               Working               Higher education   
148          270000.0               Working               Higher education   

       NAME_FAMILY_STATUS  NAME_HOUSING_TYPE  DAYS_BIRTH  FLAG_MOBIL  \
33                Married  House

In [77]:
baseline_date = pd.to_datetime('2023-01-01')
df_application_record['BIRTH_DATE'] = baseline_date + pd.to_timedelta(df_application_record['DAYS_BIRTH'], unit='D')
df_application_record['AGE'] = (baseline_date - df_application_record['BIRTH_DATE']).dt.days // 365
df_application_record = df_application_record.drop(columns=['DAYS_BIRTH','BIRTH_DATE'])
print(df_application_record.isna().sum())

ID                     0
CODE_GENDER            0
FLAG_OWN_CAR           0
FLAG_OWN_REALTY        0
CNT_CHILDREN           0
AMT_INCOME_TOTAL       0
NAME_INCOME_TYPE       0
NAME_EDUCATION_TYPE    0
NAME_FAMILY_STATUS     0
NAME_HOUSING_TYPE      0
FLAG_MOBIL             0
FLAG_WORK_PHONE        0
FLAG_PHONE             0
FLAG_EMAIL             0
OCCUPATION_TYPE        0
CNT_FAM_MEMBERS        0
MONTHS_BALANCE         0
APPROVED               0
Work_Time              0
AGE                    0
dtype: int64


In [78]:
categorical_columns = ['CODE_GENDER', 'NAME_FAMILY_STATUS', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']
dummy_columns = ['NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE']
categorical_df = df_application_record[categorical_columns]
categorical_df = categorical_df.apply(lambda x: pd.factorize(x)[0])
categorical_df = pd.DataFrame(categorical_df)
df_application_record[categorical_columns] = categorical_df
df_application_record = pd.get_dummies(df_application_record, columns=dummy_columns)
print(df_application_record['APPROVED'].value_counts())
df_application_record.to_csv('dataset.csv', index=False)
print(df_application_record.columns)
print(df_application_record.head())


APPROVED
1    14693
0      300
Name: count, dtype: int64
Index(['ID', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'NAME_FAMILY_STATUS', 'FLAG_MOBIL',
       'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL', 'CNT_FAM_MEMBERS',
       'MONTHS_BALANCE', 'APPROVED', 'Work_Time', 'AGE',
       'NAME_INCOME_TYPE_Commercial associate', 'NAME_INCOME_TYPE_Pensioner',
       'NAME_INCOME_TYPE_State servant', 'NAME_INCOME_TYPE_Student',
       'NAME_INCOME_TYPE_Working', 'NAME_EDUCATION_TYPE_Academic degree',
       'NAME_EDUCATION_TYPE_Higher education',
       'NAME_EDUCATION_TYPE_Incomplete higher',
       'NAME_EDUCATION_TYPE_Lower secondary',
       'NAME_EDUCATION_TYPE_Secondary / secondary special',
       'NAME_HOUSING_TYPE_Co-op apartment',
       'NAME_HOUSING_TYPE_House / apartment',
       'NAME_HOUSING_TYPE_Municipal apartment',
       'NAME_HOUSING_TYPE_Office apartment',
       'NAME_HOUSING_TYPE_Rented apartment', 'NAME_HOUSING_TYPE_Wi

# 创建数据集

In [79]:
# scaler = MinMaxScaler()
# df_application_record['AMT_INCOME_TOTAL']=scaler.fit_transform(df_application_record['AMT_INCOME_TOTAL'].values.reshape(-1, 1))
# df_application_record['DAYS_EMPLOYED']=scaler.fit_transform(df_application_record['DAYS_EMPLOYED'].values.reshape(-1, 1))
# df_application_record['MONTHS_BALANCE']=scaler.fit_transform(df_application_record['MONTHS_BALANCE'].values.reshape(-1, 1))
# scaler = StandardScaler()
# df_application_record['CNT_FAM_MEMBERS']=scaler.fit_transform(df_application_record['CNT_FAM_MEMBERS'].values.reshape(-1, 1))
# df_application_record['AGE']=scaler.fit_transform(df_application_record['AGE'].values.reshape(-1, 1))

negative_data_orgin = df_application_record[df_application_record['APPROVED']==0]
negative_data = negative_data_orgin.drop(['APPROVED', 'ID','CODE_GENDER'], axis = 1)

X = df_application_record.drop(['APPROVED', 'ID','CODE_GENDER'], axis = 1)
y = df_application_record['APPROVED']
X = np.array(X,dtype=float)
y = np.array(y, dtype=int)

X_train, X_test, y_train, y_test = train_test_split(X, y)

scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)
X_test = scalar.fit_transform(X_test)

negative_data = scalar.fit_transform(np.array(negative_data,dtype=float))
negative_label_list = np.zeros(len(negative_data))


# 创建 RandomUnderSampler 对象
undersampler = RandomUnderSampler(sampling_strategy='majority')

# 使用 RandomUnderSampler 来生成平衡的训练集
X_train_under_random, y_train_under_random = undersampler.fit_resample(X_train, y_train)

# 创建RandomOverSampler对象
oversampler = RandomOverSampler(sampling_strategy='minority')

# 使用RandomOverSampler来生成平衡的训练集
X_train_over_random, y_train_over_random = oversampler.fit_resample(X_train, y_train)

# 创建TomekLinks对象
undersampler = TomekLinks()

# 使用TomekLinks来生成平衡的训练集
X_train_under_tomelinks, y_train_under_tomelinks = undersampler.fit_resample(X_train, y_train)

# 创建SMOTE对象
smote = SMOTE(sampling_strategy='minority',random_state=42)

# 使用SMOTE来生成平衡的训练集
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [80]:
data_list = [
             (X_train, y_train, "Original Data"),
            #  (X_train_over_random,y_train_over_random, "Over-sampled Data"),
             (X_train_under_random,y_train_under_random, "Under-sampled Data"),
            #  (X_train_under_tomelinks,y_train_under_tomelinks, "Tomelinks Data"),
             (X_train_smote,y_train_smote, "SMOTE Data")
            ]


# 分类

In [81]:
performance_data= []

## LightGBM

In [82]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metrics': 'binary_logloss',
    'learning_rate': 0.05,
    'reg_lambda': 1.,
    'reg_alpha': .1,
    'max_depth': 5,
    'n_estimators': 100,
    'colsample_bytree': .5,
    'min_child_samples': 100,
    'subsample': .9,
    'importance_type': 'gain',
    'random_state': 71,
    'num_leaves': 32,
    'force_col_wise': True,
    'scale_pos_weight': 1,
    'bagging_freq': 5,
}


In [83]:
index = 0

performance_data = []
for X_train_processed, y_train_processed, method_name in data_list:
    lgb_model = lgb.LGBMClassifier(**params, verbose=-1)

    lgb_model.fit(X_train_processed, y_train_processed)

    print(len(negative_data))
    y_pred = lgb_model.predict(X_test)
    y_prob = lgb_model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(classification_report(y_test, y_pred))
    negative_data_pred = lgb_model.predict(negative_data)
    negative_accuracy = accuracy_score(negative_label_list, negative_data_pred)

    print(
        classification_report(negative_label_list,
                              negative_data_pred,
                              zero_division=1))

    performance_data.append({
        'Classification Method' : 'LightGBM',
        'Data Process Method': method_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Negative Accuracy': negative_accuracy,
    })
    index += 1


300
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        62
           1       0.98      1.00      0.99      3687

    accuracy                           0.98      3749
   macro avg       0.49      0.50      0.50      3749
weighted avg       0.97      0.98      0.98      3749

              precision    recall  f1-score   support

         0.0       1.00      0.00      0.00     300.0
         1.0       0.00      1.00      0.00       0.0

    accuracy                           1.00     300.0
   macro avg       0.50      0.50      0.00     300.0
weighted avg       1.00      0.00      0.00     300.0

300
              precision    recall  f1-score   support

           0       0.02      0.61      0.04        62
           1       0.99      0.51      0.68      3687

    accuracy                           0.52      3749
   macro avg       0.50      0.56      0.36      3749
weighted avg       0.97      0.52      0.67      3749

           

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## RandomForest

In [84]:
for X_train_processed, y_train_processed, method_name in data_list:
    rfc = RandomForestClassifier(n_estimators=1000, max_features=12)
    rfc.fit(X_train_processed, y_train_processed)
    predictions = rfc.predict(X_test)
    print(f"Classification Report for {method_name} on Test Data:")
    print(classification_report(y_test, predictions))
    negative_predictions = rfc.predict(negative_data)
    print(f"Classification Report for {method_name} on Negative Data:")
    print(
        classification_report(negative_label_list,
                              negative_predictions,
                              zero_division=1))
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    performance_data.append({
        'Classification Method': 'Random Forest',
        'Data Process Method': method_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Negative Accuracy': negative_accuracy,
    })

Classification Report for Original Data on Test Data:
              precision    recall  f1-score   support

           0       0.06      0.02      0.03        62
           1       0.98      1.00      0.99      3687

    accuracy                           0.98      3749
   macro avg       0.52      0.51      0.51      3749
weighted avg       0.97      0.98      0.97      3749

Classification Report for Original Data on Negative Data:
              precision    recall  f1-score   support

         0.0       1.00      0.02      0.04       300
         1.0       0.00      1.00      0.00         0

    accuracy                           0.02       300
   macro avg       0.50      0.51      0.02       300
weighted avg       1.00      0.02      0.04       300

Classification Report for Under-sampled Data on Test Data:
              precision    recall  f1-score   support

           0       0.02      0.58      0.04        62
           1       0.99      0.56      0.71      3687

    accurac

## AdaBoost for Desicion Tree

In [85]:
for X_train_processed, y_train_processed, method_name in data_list:
    # 使用决策树桩作为弱分类器，也可以选择其他弱分类器
    base_classifier = DecisionTreeClassifier(max_depth=1)

    # 使用AdaBoost分类器
    adaboost = AdaBoostClassifier(base_classifier,
                                  n_estimators=1000,
                                  algorithm='SAMME',
                                  random_state=42)

    # 训练模型
    adaboost.fit(X_train_processed, y_train_processed)

    # 在测试集上进行预测和评估
    predictions_test = adaboost.predict(X_test)
    print(f"Classification Report for {method_name} on Test Data:")
    print(classification_report(y_test, predictions_test))

    # 在负样本数据上进行预测和评估
    predictions_negative = adaboost.predict(negative_data)
    print(f"Classification Report for {method_name} on Negative Data:")
    print(
        classification_report(negative_label_list,
                              predictions_negative,
                              zero_division=1))
    accuracy = accuracy_score(y_test, predictions_test)
    precision = precision_score(y_test, predictions_test)
    recall = recall_score(y_test, predictions_test)
    f1 = f1_score(y_test, predictions_test)
    performance_data.append({
        'Classification Method': 'AdaBoost for Decision Tree',
        'Data Process Method': method_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Negative Accuracy': negative_accuracy,
    })

Classification Report for Original Data on Test Data:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        62
           1       0.98      1.00      0.99      3687

    accuracy                           0.98      3749
   macro avg       0.49      0.50      0.50      3749
weighted avg       0.97      0.98      0.98      3749

Classification Report for Original Data on Negative Data:
              precision    recall  f1-score   support

         0.0       1.00      0.00      0.00     300.0
         1.0       0.00      1.00      0.00       0.0

    accuracy                           1.00     300.0
   macro avg       0.50      0.50      0.00     300.0
weighted avg       1.00      0.00      0.00     300.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report for Under-sampled Data on Test Data:
              precision    recall  f1-score   support

           0       0.02      0.60      0.04        62
           1       0.99      0.53      0.69      3687

    accuracy                           0.53      3749
   macro avg       0.50      0.56      0.37      3749
weighted avg       0.97      0.53      0.68      3749

Classification Report for Under-sampled Data on Negative Data:
              precision    recall  f1-score   support

         0.0       1.00      0.55      0.71       300
         1.0       0.00      1.00      0.00         0

    accuracy                           0.55       300
   macro avg       0.50      0.77      0.35       300
weighted avg       1.00      0.55      0.71       300

Classification Report for SMOTE Data on Test Data:
              precision    recall  f1-score   support

           0       0.02      1.00      0.03        62
           1       1.00      0.00      0.00      3687

    accur

## SVM

In [86]:
for X_train_processed, y_train_processed, method_name in data_list:
   # 使用 SVM 替代 RandomForestClassifier
   svm_model = SVC()
   svm_model.fit(X_train_processed, y_train_processed)

   # SVM 在测试集上的分类报告
   predictions_svm_test = svm_model.predict(X_test)
   print(f"Method: {method_name} - SVM Classification Report on Test Data:")
   print(classification_report(y_test, predictions_svm_test))

   # SVM 在负样本上的分类报告
   predictions_svm_negative = svm_model.predict(negative_data)
   print(
      f"Method: {method_name} - SVM Classification Report on Negative Data:")
   print(
      classification_report(negative_label_list,
                           predictions_negative,
                           zero_division=1))
   accuracy = accuracy_score(y_test, predictions_svm_test)
   precision = precision_score(y_test, predictions_svm_test)
   recall = recall_score(y_test, predictions_svm_test)
   f1 = f1_score(y_test, predictions_svm_test)
   performance_data.append({
       'Classification Method': 'SVM',
       'Data Process Method': method_name,
       'Accuracy': accuracy,
       'Precision': precision,
       'Recall': recall,
       'F1 Score': f1,
       'Negative Accuracy': negative_accuracy,
   })


Method: Original Data - SVM Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        62
           1       0.98      1.00      0.99      3687

    accuracy                           0.98      3749
   macro avg       0.49      0.50      0.50      3749
weighted avg       0.97      0.98      0.98      3749

Method: Original Data - SVM Classification Report on Negative Data:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       300

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300

Method: Under-sampled Data - SVM Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.02      0.53      0.03        62
           1       0.98      0.50      0.67      3687

    accuracy                       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Method: SMOTE Data - SVM Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.02      0.18      0.03        62
           1       0.98      0.81      0.89      3687

    accuracy                           0.80      3749
   macro avg       0.50      0.49      0.46      3749
weighted avg       0.97      0.80      0.87      3749

Method: SMOTE Data - SVM Classification Report on Negative Data:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       300

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300



## AdaBoost for SVM

In [87]:
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.svm import SVC
# from sklearn.metrics import classification_report

# for X_train_processed, y_train_processed, method_name in data_list:
#     # 使用支持向量机作为弱分类器
#     base_classifier = SVC(kernel='linear', C=1.0)

#     # 使用AdaBoost分类器
#     adaboost = AdaBoostClassifier(base_classifier,
#                                   n_estimators=1000,
#                                   algorithm='SAMME',
#                                   random_state=42)

#     # 训练模型
#     adaboost.fit(X_train_processed, y_train_processed)

#     # 在测试集上进行预测和评估
#     predictions_test = adaboost.predict(X_test)
#     print(f"Classification Report for {method_name} on Test Data:")
#     print(classification_report(y_test, predictions_test))

#     # 在负样本数据上进行预测和评估
#     predictions_negative = adaboost.predict(negative_data)
#     print(f"Classification Report for {method_name} on Negative Data:")
#     print(
#         classification_report(negative_label_list,
#                               predictions_negative,
#                               zero_division=1))
#     accuracy = accuracy_score(y_test, predictions_test)
#     precision = precision_score(y_test, predictions_test)
#     recall = recall_score(y_test, predictions_test)
#     f1 = f1_score(y_test, predictions_test)
#     performance_data.append({
#         'Classification Method': 'AdaBoost for SVM',
#         'Data Process Method': method_name,
#         'Accuracy': accuracy,
#         'Precision': precision,
#         'Recall': recall,
#         'F1 Score': f1,
#         'Negative Accuracy': negative_accuracy,
#     })

In [88]:
# index = 0
# for X_train_processed, y_train_processed, method_name in data_list:
#     svm_model = SVC()
#     train_feature = train_features[index]
#     test_feature = test_features[index]
#     svm_model.fit(train_feature, y_train_processed)
#     predictions = svm_model.predict(test_feature)
#     print(classification_report(y_test,predictions))
#     predictions = svm_model.predict(negative_data_list[index])
#    print(classification_report(negative_label_list, predictions_negative, zero_division=1))
#     index = index + 1

# 特征提取

In [89]:
model_list = []
for X_train_processed, _, method_name in data_list:
   print("Training for the method: " + method_name)
   model = train_vae(X_train=X_train_processed,X_test=X_test, progress=False,num_epoch=100).eval()
   model_list.append(model)

total_list = [ t+ (data,)  for t, data in zip(data_list, model_list)]

index = 0
train_features = []
test_features = []
negative_data_list = []
for X_train_processed, _, method_name, __ in total_list:
    train_features.append(model_list[index].encoder(torch.Tensor(X_train_processed).to(device)).cpu().detach().numpy())
    test_features.append(model_list[index].encoder(torch.Tensor(X_test).to(device)).detach().cpu().numpy())
    negative_data_list.append(model_list[index].encoder(torch.Tensor(negative_data).to(device)).detach().cpu().numpy())
    index+=1

   
# for X_train_processed, _, method_name, model in model_list:
#     model.eval()
#     with torch.no_grad():
#         encoded_data = model.encoder(torch.Tensor(X_train_processed).to(device))
#         encoded_data = encoded_data.cpu().numpy()
#         tsne = TSNE(n_components=2)
#         reduced_data = tsne.fit_transform(encoded_data)

# plt.scatter(reduced_data[:, 0], reduced_data[:, 1])
# plt.title("VAE Visualization")
# plt.show()

Training for the method: Original Data


                                                            

Best epoch: 99
Training for the method: Under-sampled Data


                                                           

Best epoch: 99
Training for the method: SMOTE Data


                                                            

Best epoch: 99




## LightGBM for VAE data

In [90]:
index = 0

performance_data = []
data_df = pd.DataFrame(columns=['Method', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'Negative Accuracy'])

for X_train_processed, y_train_processed, method_name, _ in total_list:
    lgb_model = lgb.LGBMClassifier(**params, verbose=-1)

    lgb_model.fit(train_features[index], y_train_processed)

    print(len(negative_data))
    y_pred = lgb_model.predict(test_features[index])
    y_prob = lgb_model.predict_proba(test_features[index])[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(classification_report(y_test, y_pred))
    negative_data_pred = lgb_model.predict(negative_data_list[index])
    negative_accuracy = accuracy_score(negative_label_list, negative_data_pred)

    print(
        classification_report(negative_label_list,
                              negative_data_pred,
                              zero_division=1))

    performance_data.append({
        'Classification Method': 'LightGBM for VAE data',
        'Data Process Method': method_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Negative Accuracy': negative_accuracy,
    })
    index += 1

data_df = pd.DataFrame(performance_data)
print(data_df)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


300
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        62
           1       0.98      1.00      0.99      3687

    accuracy                           0.98      3749
   macro avg       0.49      0.50      0.50      3749
weighted avg       0.97      0.98      0.98      3749

              precision    recall  f1-score   support

         0.0       1.00      0.00      0.00     300.0
         1.0       0.00      1.00      0.00       0.0

    accuracy                           1.00     300.0
   macro avg       0.50      0.50      0.00     300.0
weighted avg       1.00      0.00      0.00     300.0

300
              precision    recall  f1-score   support

           0       0.02      0.50      0.03        62
           1       0.98      0.48      0.65      3687

    accuracy                           0.48      3749
   macro avg       0.50      0.49      0.34      3749
weighted avg       0.97      0.48      0.64      3749

           

## SVM for VAE data

In [92]:
index = 0
for X_train_processed, y_train_processed, method_name in data_list:
    svm_model = SVC()
    train_feature = train_features[index]
    test_feature = test_features[index]
    svm_model.fit(train_feature, y_train_processed)
    predictions = svm_model.predict(test_feature)
    print(classification_report(y_test, predictions))
    negative_predictions= svm_model.predict(negative_data_list[index])
    print(
        classification_report(negative_label_list,
                              negative_predictions,
                              zero_division=1))
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    negative_accuracy = accuracy_score(negative_label_list, negative_predictions)
    performance_data.append({
        'Classification Method':
        'SVM for VAE data',
        'Data Process Method':
        method_name,
        'Accuracy':
        accuracy,
        'Precision':
        precision,
        'Recall':
        recall,
        'F1 Score':
        f1,
        'Negative Accuracy':
        negative_accuracy,
    })
    index = index + 1


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        62
           1       0.98      1.00      0.99      3687

    accuracy                           0.98      3749
   macro avg       0.49      0.50      0.50      3749
weighted avg       0.97      0.98      0.98      3749

              precision    recall  f1-score   support

         0.0       1.00      0.00      0.00     300.0
         1.0       0.00      1.00      0.00       0.0

    accuracy                           1.00     300.0
   macro avg       0.50      0.50      0.00     300.0
weighted avg       1.00      0.00      0.00     300.0

              precision    recall  f1-score   support

           0       0.02      0.56      0.04        62
           1       0.99      0.60      0.75      3687

    accuracy                           0.60      3749
   macro avg       0.51      0.58      0.40      3749
weighted avg       0.97      0.60      0.74      3749

              preci

## AdaBoost for Desicion Tree for VAE data

In [93]:
index = 0
for X_train_processed, y_train_processed, method_name in data_list:

    train_feature = train_features[index]
    test_feature = test_features[index]
    # 使用决策树桩作为弱分类器，也可以选择其他弱分类器
    base_classifier = DecisionTreeClassifier(max_depth=1)

    # 使用AdaBoost分类器
    adaboost = AdaBoostClassifier(base_classifier,
                                  n_estimators=1000,
                                  algorithm='SAMME',
                                  random_state=42)

    # 训练模型
    adaboost.fit(train_feature, y_train_processed)

    # 在测试集上进行预测和评估
    predictions_test = adaboost.predict(test_feature)
    print(f"Classification Report for {method_name} on Test Data:")
    print(classification_report(y_test, predictions_test))

    # 在负样本数据上进行预测和评估
    predictions_negative = adaboost.predict(negative_data_list[index])
    print(f"Classification Report for {method_name} on Negative Data:")
    print(
        classification_report(negative_label_list,
                              predictions_negative,
                              zero_division=1))
    accuracy = accuracy_score(y_test, predictions_test)
    precision = precision_score(y_test, predictions_test)
    recall = recall_score(y_test, predictions_test)
    f1 = f1_score(y_test, predictions_test)
    negative_accuracy = accuracy_score(negative_label_list, predictions_negative)
    performance_data.append({
        'Classification Method': 'AdaBoost for Decision Tree',
        'Data Process Method': method_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Negative Accuracy': negative_accuracy,
    })
    index += 1

Classification Report for Original Data on Test Data:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        62
           1       0.98      1.00      0.99      3687

    accuracy                           0.98      3749
   macro avg       0.49      0.50      0.50      3749
weighted avg       0.97      0.98      0.98      3749

Classification Report for Original Data on Negative Data:
              precision    recall  f1-score   support

         0.0       1.00      0.00      0.01       300
         1.0       0.00      1.00      0.00         0

    accuracy                           0.00       300
   macro avg       0.50      0.50      0.00       300
weighted avg       1.00      0.00      0.01       300



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report for Under-sampled Data on Test Data:
              precision    recall  f1-score   support

           0       0.02      0.60      0.04        62
           1       0.99      0.52      0.68      3687

    accuracy                           0.52      3749
   macro avg       0.50      0.56      0.36      3749
weighted avg       0.97      0.52      0.67      3749

Classification Report for Under-sampled Data on Negative Data:
              precision    recall  f1-score   support

         0.0       1.00      0.59      0.74       300
         1.0       0.00      1.00      0.00         0

    accuracy                           0.59       300
   macro avg       0.50      0.79      0.37       300
weighted avg       1.00      0.59      0.74       300

Classification Report for SMOTE Data on Test Data:
              precision    recall  f1-score   support

           0       0.02      0.37      0.03        62
           1       0.98      0.62      0.76      3687

    accur

In [94]:
data_df = pd.DataFrame(performance_data)
# 找到最佳性能数据的行
best_accuracy_row = data_df[data_df['Accuracy'] == data_df['Accuracy'].max()]
best_precision_row = data_df[data_df['Precision'] == data_df['Precision'].max()]
best_recall_row = data_df[data_df['Recall'] == data_df['Recall'].max()]
best_f1_score_row = data_df[data_df['F1 Score'] == data_df['F1 Score'].max()]
best_negative_accuracy_row = data_df[data_df['Negative Accuracy'] == data_df['Negative Accuracy'].max()]

# 输出最佳性能数据
print("best Accuracy Classification Method:", best_accuracy_row['Classification Method'].values[0])
print("Best Accuracy Data Process Method:", best_accuracy_row['Data Process Method'].values[0])
print("Best Accuracy Value:", best_accuracy_row['Accuracy'].values[0])

print("best Accuracy Classification Method:", best_precision_row['Classification Method'].values[0])
print("Best Precision Data Process Method:", best_precision_row['Data Process Method'].values[0])
print("Best Precision Value:", best_precision_row['Precision'].values[0])

print("best Accuracy Classification Method:", best_recall_row['Classification Method'].values[0])
print("Best Recall Data Process Method:", best_recall_row['Data Process Method'].values[0])
print("Best Recall Value:", best_recall_row['Recall'].values[0])

print("best Accuracy Classification Method:", best_f1_score_row['Classification Method'].values[0])
print("Best F1 Score Data Process Method:", best_f1_score_row['Data Process Method'].values[0])
print("Best F1 Score Value:", best_f1_score_row['F1 Score'].values[0])


print("best Accuracy Classification Method:", best_negative_accuracy_row['Classification Method'].values[0])
print("Best Negative Accuracy Data Process Method:", best_negative_accuracy_row['Data Process Method'].values[0])
print("Best Negative Accuracy Value:", best_negative_accuracy_row['Negative Accuracy'].values[0])
data_df.to_csv('performance_data.csv', index=False)

best Accuracy Classification Method: LightGBM for VAE data
Best Accuracy Data Process Method: Original Data
Best Accuracy Value: 0.9834622566017605
best Accuracy Classification Method: SVM for VAE data
Best Precision Data Process Method: Under-sampled Data
Best Precision Value: 0.9879732739420936
best Accuracy Classification Method: LightGBM for VAE data
Best Recall Data Process Method: Original Data
Best Recall Value: 1.0
best Accuracy Classification Method: LightGBM for VAE data
Best F1 Score Data Process Method: Original Data
Best F1 Score Value: 0.9916621839698763
best Accuracy Classification Method: AdaBoost for Decision Tree
Best Negative Accuracy Data Process Method: Under-sampled Data
Best Negative Accuracy Value: 0.5866666666666667


# Ensemble