# 불균형 데이터 분류

# Imbalanced classification : credit card fraud detection  

한 클래스의 데이터 수가 다른 클래스보다 훨씬 많은 데이터를 __불균형 데이터 (imbalanced data)__ 라고 하고, 이런 데이터를 분류하는 문제를 __불균형 분류 (imbalanced classification)__ 라고 한다.  

이러한 불균형 데이터셋을 분류해보자.  
데이터는 kaggle의 [Credit Card Fraud Detection](https://www.kaggle.com/mlg-ulb/creditcardfraud) 을 불러와 사용한다.

[Tensorflow : 불균형 데이터 분류](https://www.tensorflow.org/tutorials/structured_data/imbalanced_data#%ED%9B%88%EB%A0%A8_%EC%9D%B4%EB%A0%A5_%EC%9E%AC%ED%99%95%EC%9D%B8)

![image](https://user-images.githubusercontent.com/84179578/127277084-5697d07b-a2df-444e-af5d-45553ed254b0.png)

## 필요한 모듈 불러오기

In [34]:
import csv
import numpy as np

from tensorflow import keras

## 데이터 불러오기

In [2]:
import csv
import numpy as np


fname = "data/creditcard.csv"

# Parse the CSV into two parallel lists: every column except the last is a
# numeric feature, and the last column is the integer class label.
all_features = []
all_targets = []
# newline="" lets the csv module handle line endings itself, as its
# documentation recommends for files passed to csv.reader.
with open(fname, newline="", encoding="utf-8") as f:
    # Echo the raw header line, then keep the column names out of the data.
    print("HEADER:", f.readline().strip())
    # csv.reader removes the surrounding double quotes and copes with quoted
    # fields that contain commas, which a plain split(",") cannot do.
    for fields in csv.reader(f):
        if not fields:
            continue  # tolerate blank lines, e.g. an extra newline at EOF
        all_features.append([float(v) for v in fields[:-1]])
        all_targets.append([int(fields[-1])])


features = np.array(all_features, dtype="float32")
targets = np.array(all_targets, dtype="uint8")
print("features.shape:", features.shape)
print("targets.shape:", targets.shape)

HEADER: "Time","V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11","V12","V13","V14","V15","V16","V17","V18","V19","V20","V21","V22","V23","V24","V25","V26","V27","V28","Amount","Class"
features.shape: (284807, 30)
targets.shape: (284807, 1)


## training set 과 validation set 분할

In [3]:
# Hold out the last 20% of the rows for validation. Nothing is shuffled, so
# the split follows the row order of `features` / `targets`.
num_val_samples = int(len(features) * 0.2)
# Slice at an explicit index instead of using negative offsets: when there are
# fewer than five rows num_val_samples is 0, and `features[:-0]` would yield
# an empty training set while `features[-0:]` would yield every row.
split_index = len(features) - num_val_samples
train_features = features[:split_index]
train_targets = targets[:split_index]
val_features = features[split_index:]
val_targets = targets[split_index:]

print("Number of training samples:", len(train_features))
print("Number of validation samples:", len(val_features))

Number of training samples: 227846
Number of validation samples: 56961


## 불균형 데이터 분석

In [4]:
# Count how many training labels fall into each class value
# (counts[0] -> label 0, counts[1] -> label 1).
counts = np.bincount(train_targets[:, 0])

# Report the size of the positive class, absolute and as a share of the split.
num_positive = counts[1]
positive_pct = 100 * float(num_positive) / len(train_targets)
print(
    f"Number of positive samples in training data: {num_positive} "
    f"({positive_pct:.2f}% of total)"
)

Number of positive samples in training data: 417 (0.18% of total)


## 정규화 Normalize

train set 의 평균과 표준편차를 이용해 train set 과 validation set 을 모두 정규화한다. (validation set 의 통계량은 사용하지 않는다.)

In [5]:
# Standardise every feature column using statistics computed from the
# training split only, and apply the same shift/scale to the validation split.
# The operations are in place, so the arrays are modified rather than copied.
mean = np.mean(train_features, axis=0)
train_features -= mean
val_features -= mean
std = np.std(train_features, axis=0)
# A column that is constant across the training split has std == 0, and
# dividing by it would fill that column with NaN/inf. Scale such columns by 1
# instead, which leaves the (already centred) values untouched.
std[std == 0] = 1.0
train_features /= std
val_features /= std

## binary classification 모델 생성

In [6]:
from tensorflow import keras

# Binary classifier: three 256-unit ReLU dense layers, with dropout (rate 0.3)
# after the second and third, followed by a single sigmoid output unit.
num_inputs = train_features.shape[-1]  # one input per feature column

model = keras.Sequential()
model.add(keras.layers.Dense(256, activation="relu", input_shape=(num_inputs,)))
model.add(keras.layers.Dense(256, activation="relu"))
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.Dense(256, activation="relu"))
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.Dense(1, activation="sigmoid"))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 256)               7936      
_________________________________________________________________
dense_1 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 257       
Total params: 139,777
Trainable params: 139,777
Non-trainable params: 0
__________________________________________________

## class_weight 인자를 이용해 모델 학습  --> KEY POINT

In [8]:
# Weight each class by the inverse of its sample count, so the class with
# fewer samples gets the larger weight.
# The formula used to set the weights is flexible and may vary with the data.
weight_for_0, weight_for_1 = (1.0 / counts[label] for label in (0, 1))

In [10]:
# Confusion-matrix counts plus precision and recall, each under a short name.
metrics = [
    metric_cls(name=short_name)
    for metric_cls, short_name in (
        (keras.metrics.FalseNegatives, "fn"),
        (keras.metrics.FalsePositives, "fp"),
        (keras.metrics.TrueNegatives, "tn"),
        (keras.metrics.TruePositives, "tp"),
        (keras.metrics.Precision, "precision"),
        (keras.metrics.Recall, "recall"),
    )
]

model.compile(
    loss="binary_crossentropy",
    optimizer=keras.optimizers.Adam(learning_rate=1e-2),
    metrics=metrics,
)

# The checkpoint file name carries an "{epoch}" placeholder.
checkpoint = keras.callbacks.ModelCheckpoint("fraud_model_at_epoch_{epoch}.h5")
callbacks = [checkpoint]
# Map each class label to the weight computed for it.
class_weight = {0: weight_for_0, 1: weight_for_1}

model.fit(
    x=train_features,
    y=train_targets,
    validation_data=(val_features, val_targets),
    class_weight=class_weight,
    callbacks=callbacks,
    epochs=30,
    batch_size=2048,
    verbose=2,
)

Epoch 1/30
112/112 - 3s - loss: 1.5875e-07 - fn: 2.0000 - fp: 1749.0000 - tn: 225680.0000 - tp: 415.0000 - precision: 0.1918 - recall: 0.9952 - val_loss: 0.0103 - val_fn: 13.0000 - val_fp: 193.0000 - val_tn: 56693.0000 - val_tp: 62.0000 - val_precision: 0.2431 - val_recall: 0.8267
Epoch 2/30
112/112 - 1s - loss: 1.0042e-07 - fn: 0.0000e+00 - fp: 1077.0000 - tn: 226352.0000 - tp: 417.0000 - precision: 0.2791 - recall: 1.0000 - val_loss: 0.0089 - val_fn: 14.0000 - val_fp: 139.0000 - val_tn: 56747.0000 - val_tp: 61.0000 - val_precision: 0.3050 - val_recall: 0.8133
Epoch 3/30
112/112 - 1s - loss: 1.3670e-07 - fn: 2.0000 - fp: 1151.0000 - tn: 226278.0000 - tp: 415.0000 - precision: 0.2650 - recall: 0.9952 - val_loss: 0.0257 - val_fn: 11.0000 - val_fp: 436.0000 - val_tn: 56450.0000 - val_tp: 64.0000 - val_precision: 0.1280 - val_recall: 0.8533
Epoch 4/30
112/112 - 1s - loss: 1.2207e-07 - fn: 0.0000e+00 - fp: 1443.0000 - tn: 225986.0000 - tp: 417.0000 - precision: 0.2242 - recall: 1.0000 - va

Epoch 30/30
112/112 - 1s - loss: 1.0274e-07 - fn: 1.0000 - fp: 745.0000 - tn: 226684.0000 - tp: 416.0000 - precision: 0.3583 - recall: 0.9976 - val_loss: 0.0249 - val_fn: 9.0000 - val_fp: 312.0000 - val_tn: 56574.0000 - val_tp: 66.0000 - val_precision: 0.1746 - val_recall: 0.8800


## 정리

불균형한 데이터를 다룰 때, 수가 적은 클래스의 데이터에 더 큰 중요도를 줘야 하는 경우가 있다. 이럴 때는 모델을 학습시킬 때 `class_weight` 인자를 이용하여 가중치를 줄 수 있다.  

또한 데이터 준비 단계에서 수가 적은 클래스의 샘플에 대해 __오버샘플링__ 과정을 진행함으로써 불균형한 데이터셋을 다룰 수도 있다.