In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')

In [2]:
import os

# Build the dataset location by appending the project sub-path to the
# absolute form of the current working directory.
abs_path = os.path.abspath('.')
current_path = f"{abs_path}/tmp/pycharm_project_317/pytorch_dl/"
credits_data_path = f"{current_path}creditcard.csv"

In [4]:
# Load the data file into a DataFrame, then display its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer=credits_data_path)
df.shape

(284807, 31)

In [7]:
from sklearn.preprocessing import StandardScaler

# Per the original author's note, most columns in the table are already
# normalised; the remaining raw columns (Amount and Time) are scaled here.
# Each column gets its own freshly fitted StandardScaler.
for raw_col, scaled_col in (('Amount', 'scaled_amount'), ('Time', 'scaled_time')):
    column_2d = df[raw_col].values.reshape(-1, 1)  # the scaler expects a 2-D array
    df[scaled_col] = StandardScaler().fit_transform(column_2d)

# Remove the original, unscaled columns.
# NOTE: this mutates df in place, so re-running this cell without reloading
# the data fails because 'Amount' and 'Time' no longer exist.
df.drop(columns=['Amount', 'Time'], inplace=True)

In [8]:
# Count the rows of each class: Class == 0 is reported as normal
# transactions, Class == 1 as fraudulent ones.
normal_count = len(df[df['Class'] == 0])
fraud_count = len(df[df['Class'] == 1])

print("正常交易的数据量： ", normal_count)
print("欺诈交易的数据量： ", fraud_count)

正常交易的数据量：  284315
欺诈交易的数据量：  492


In [10]:
# Labels are the Class column; features are every other column.
y = df['Class']
X = df.drop(columns='Class')

print('X shape :', X.shape)
print('y shape : ', y.shape)

X shape : (284807, 30)
y shape :  (284807,)


### 方法1：利用SMOTE解决数据不平衡的问题

In [11]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class with SMOTE. A fixed random_state makes the
# synthetic samples reproducible across kernel restarts.
# NOTE(review): the resampling runs on the full dataset and the train/test
# split happens afterwards, so synthetic points generated from rows that end
# up in the test set can leak into training. Consider splitting first and
# resampling only the training portion.
X_new_1, y_new_1 = SMOTE(random_state=42).fit_resample(X, y)

In [13]:
# 新的类分布
print(y_new_1.value_counts())

0    284315
1    284315
Name: Class, dtype: int64


In [14]:
X_new_1.shape # 数据集已经扩充

(568630, 30)

### 方法2：利用SMOTE+undersampling解决数据不平衡

In [16]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Step 1 - oversampling: SMOTE grows the minority class until it reaches
# 10% of the majority class size (sampling_strategy=0.1).
# A fixed random_state keeps the synthetic samples reproducible.
over = SMOTE(sampling_strategy=0.1, random_state=42)

# Step 2 - undersampling: randomly drop majority rows until the
# minority/majority ratio is 0.5 (sampling_strategy=0.5).
# Seeded as well so the same rows are kept on every run.
under = RandomUnderSampler(sampling_strategy=0.5, random_state=42)

# Chain both samplers (oversample first, then undersample) and resample
# the full feature matrix and labels in one call.
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
X_new_2, y_new_2 = pipeline.fit_resample(X, y)

In [17]:
print(y_new_2.value_counts())

0    56862
1    28431
Name: Class, dtype: int64


In [18]:
X_new_2.shape

(85293, 30)

### 数据集分离：train和test

#### 方法1的数据集

In [23]:
from sklearn.model_selection import train_test_split

# Split the SMOTE-resampled data (method 1) into train and test parts using
# the default split proportions. A fixed random_state makes the split
# reproducible, so downstream scores can be compared between runs.
X_new_1_train, X_new_1_test, y_new_1_train, y_new_1_test = train_test_split(
    X_new_1, y_new_1, random_state=42
)

# Features: convert the DataFrames to plain NumPy arrays.
X_new_1_train = X_new_1_train.values
X_new_1_test = X_new_1_test.values

# Labels: convert the Series to plain NumPy arrays.
y_new_1_train = y_new_1_train.values
y_new_1_test = y_new_1_test.values

In [24]:
print(X_new_1_train.shape)
print(X_new_1_test.shape)

(426472, 30)
(142158, 30)


#### 方法2的数据集

In [26]:
# Split the SMOTE + undersampling data (method 2) into train and test parts
# using the default split proportions. A fixed random_state makes the split
# reproducible, so downstream scores can be compared between runs.
X_new_2_train, X_new_2_test, y_new_2_train, y_new_2_test = train_test_split(
    X_new_2, y_new_2, random_state=42
)

# Features: convert the DataFrames to plain NumPy arrays.
X_new_2_train = X_new_2_train.values
X_new_2_test = X_new_2_test.values

# Labels: convert the Series to plain NumPy arrays.
y_new_2_train = y_new_2_train.values
y_new_2_test = y_new_2_test.values

In [27]:
print(X_new_2_train.shape)
print(X_new_2_test.shape)

(63969, 30)
(21324, 30)


### 模型训练

In [32]:
# 简单分类器实现

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier

# Candidate models keyed by a short label, all built with library defaults.
# NOTE: the SVM explanation below must stay a `#` comment. Written as a
# triple-quoted string inside the dict literal, it is concatenated with the
# adjacent 'SVC' string literal and silently becomes part of that key.
classifiers = {
    'LogisticRegression': LogisticRegression(),  # logistic regression
    # SVM = Support Vector Machine
    # SVC = Support Vector Classification (SVM used for classification)
    # SVR = Support Vector Regression (SVM used for regression analysis)
    'SVC': SVC(),                                # support vector classifier
    'KNN': KNeighborsClassifier(),               # k-nearest neighbours
    'DT': DecisionTreeClassifier(),              # decision tree
    'RFC': RandomForestClassifier(),             # random forest
    'Bagging': BaggingClassifier(),              # bagging ensemble
    'SGD': SGDClassifier(),                      # stochastic gradient descent
    'GBC': GradientBoostingClassifier(),         # gradient boosting ensemble
    'xgb': XGBClassifier(),                      # extreme gradient boosting trees
}

In [33]:
from sklearn.model_selection import cross_val_score


def accurary_score(x_train, y_train):
    """Fit every model in ``classifiers`` and print its 5-fold CV score.

    For each estimator in the module-level ``classifiers`` dict, the model is
    fitted on the full training data, then scored with 5-fold
    cross-validation using the estimator's default scorer. The mean fold
    score is printed as a whole-number percentage.

    Args:
        x_train: Training features passed to each estimator.
        y_train: Training labels aligned with ``x_train``.

    Returns:
        None. One summary line per classifier is printed.
    """
    for classifier in classifiers.values():  # train and score each classifier in turn
        # Leaves the estimator fitted on the full training data.
        classifier.fit(x_train, y_train)
        fold_scores = cross_val_score(classifier, x_train, y_train, cv=5)  # 5-fold cross-validation
        # Scale to a percentage BEFORE rounding: rounding first and then
        # multiplying by 100 can print float artefacts such as 56.99999999999999.
        percent = round(fold_scores.mean() * 100, 0)
        print('Classifier Name : ', classifier.__class__.__name__, " Train Score : ", percent, '%')

#### 1 最简单的交叉验证

##### 方法1：训练交叉验证的结果

In [None]:
# 1.1 Cross-validate every classifier on the SMOTE-resampled training set.
accurary_score(x_train=X_new_1_train, y_train=y_new_1_train)

Classifier Name :  LogisticRegression  Train Score :  95.0 %


##### 方法2：训练交叉验证的结果

In [None]:
# 1.2 Cross-validate every classifier on the SMOTE + undersampling training set.
accurary_score(x_train=X_new_2_train, y_train=y_new_2_train)