# Lab 8: 随机森林实现信用卡欺诈预测 (Credit Card Fraud Prediction with a Random Forest)

## import environment

In [2]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split

## read data

In [3]:
# Path of the transactions CSV, resolved relative to the notebook's working directory.
data_path = 'creditcard.csv'
# Read the whole file into a DataFrame; later cells slice features and labels out of `df`.
df = pd.read_csv(filepath_or_buffer=data_path)

# Last expression of the cell: render per-column summary statistics.
df.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,3.918649e-15,5.682686e-16,-8.761736e-15,2.811118e-15,-1.552103e-15,2.04013e-15,-1.698953e-15,-1.893285e-16,-3.14764e-15,...,1.47312e-16,8.042109e-16,5.282512e-16,4.456271e-15,1.426896e-15,1.70164e-15,-3.662252e-16,-1.217809e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [4]:
# Build the model inputs from the raw table by column position:
#   - columns 1..29 form the feature matrix (column 0 is skipped),
#   - the last column is the label vector.
# NOTE(review): judging by the describe() header these are V1..V28 + Amount, with Time
# skipped and Class as the label -- confirm against the CSV's actual column order.
X = np.copy(df.to_numpy()[:, 1:30])
y = np.copy(df.to_numpy()[:, -1])

# Sanity check: display the shapes of the feature matrix and the label vector.
(X.shape, y.shape)

((284807, 29), (284807,))

## preprocessing

In [5]:
# Standardise every feature to zero mean / unit variance so that columns on a larger
# numeric scale do not dominate the principal components.
X_std = preprocessing.scale(X)

# Project onto the 20 leading principal components. Depending on the scikit-learn
# version, the default 'auto' solver may choose the randomized SVD for this shape, so the
# seed is pinned to keep the projection reproducible on a fresh kernel.
pca = PCA(n_components=20, random_state=10)
X_pca = pca.fit_transform(X_std)

# NOTE(review): the scaler and the PCA are fitted on the full dataset, i.e. before the
# train/test split, so test-set statistics leak into the features. When restructuring,
# fit both on the training fold only (for example inside a sklearn Pipeline).

## dataset split

In [6]:
# Hold out 20% of the samples for testing. `stratify=y` keeps the class ratio identical
# in both splits; `random_state` pins the shuffle so the split (and every metric derived
# from it) is reproducible after Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y, test_size=0.2, stratify=y, random_state=10
)

In [7]:
# Show how many samples of each class ended up in the train and test splits.
print(
    f'Distribution of y_train: {Counter(y_train)}',
    f'Distribution of y_test: {Counter(y_test)}',
    sep='\n',
)

Distribution of y_train: Counter({0.0: 227451, 1.0: 394})
Distribution of y_test: Counter({0.0: 56864, 1.0: 98})


## model training

In [8]:
# Random forest of 100 trees, each capped at depth 12; the fixed seed makes training
# repeatable for a given training set.
model = RandomForestClassifier(random_state=10, max_depth=12, n_estimators=100)
# Fit on the training split only. As the cell's last expression, this also displays the
# fitted estimator.
model.fit(X=X_train, y=y_train)

In [9]:
# Per-class probabilities and hard label predictions for the held-out split.
pred_score = model.predict_proba(X_test)
pred = model.predict(X_test)

# Accuracy = fraction of test samples whose predicted label equals the true label.
acc = np.mean(pred == y_test)
print(f'The accuracy on test: {acc*100:.2f}%')

The accuracy on test: 99.95%


## evaluation

In [10]:
# Per-class precision / recall / F1 on the held-out split (true labels first, predictions second).
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     56864
         1.0       0.95      0.76      0.84        98

    accuracy                           1.00     56962
   macro avg       0.97      0.88      0.92     56962
weighted avg       1.00      1.00      1.00     56962



In [35]:
# ROC AUC on the held-out split, scored with the second column of `pred_score`
# (presumably the probability of the positive class 1.0 -- confirm against model.classes_).
print(f'The ROC_AUC score is {roc_auc_score(y_test, pred_score[:,1]):.4f}')

The ROC_AUC score is 0.9871
