### One Class SVM

일반적인 SVM과 달리 레이블 없이(비지도 학습) 학습하며, 주어진 데이터를 가장 잘 감싸는 경계(support vector로 정의되는 영역)를 구한 뒤 그 영역 밖에 놓이는 데이터를 outlier로 간주하는 방식  

### Import Module

In [1]:
from sklearn.svm import OneClassSVM
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides library deprecation/convergence warnings —
# consider narrowing the filter to the specific category that is noisy.
warnings.filterwarnings(action='ignore')

### Data Load

In [2]:
# Single place to change when the data lives somewhere else; the three splits
# are read from the same directory.
DATA_DIR = "/home/yjhwang/finance/CreditCard_Fraud/data"

train_df = pd.read_csv(f"{DATA_DIR}/train.csv")  # fitted on without labels
val_df = pd.read_csv(f"{DATA_DIR}/val.csv")      # has a 'Class' column used for scoring
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")    # used for the final predictions

In [3]:
# Preview the first rows of the training data to check the available columns.
train_df.head()

Unnamed: 0,ID,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30
0,3,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,4.983721,-0.994972
1,4,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,1.418291,-0.994972
2,6,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,...,-0.208254,-0.559825,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.08108,-0.256131,-0.99496
3,8,-0.644269,1.417964,1.07438,-0.492199,0.948934,0.428118,1.120631,-3.807864,0.615375,...,1.943465,-1.015455,0.057504,-0.649709,-0.415267,-0.051634,-1.206921,-1.085339,0.262698,-0.994901
4,9,-0.894286,0.286157,-0.113192,-0.271526,2.669599,3.721818,0.370145,0.851084,-0.392048,...,-0.073425,-0.268092,-0.204233,1.011592,0.373205,-0.384157,0.011747,0.142404,0.9949,-0.994901


In [4]:
# Model input: every column of the training frame except 'ID'.
train_x = train_df.drop(columns=['ID'])

### Define Model

In [6]:
# Hyperparameters are named so they can be tuned in one place.
# nu is an upper bound on the fraction of training points left outside the
# learned boundary; gamma is the RBF kernel coefficient.
SVM_GAMMA = 0.001
SVM_NU = 0.03
svm = OneClassSVM(kernel='rbf', gamma=SVM_GAMMA, nu=SVM_NU)

In [7]:
# Show the estimator's configuration to confirm the hyperparameters that were set.
print(svm)

OneClassSVM(gamma=0.001, nu=0.03)


In [8]:
# Fit on the training features only; no labels are passed (unsupervised).
svm.fit(train_x)

OneClassSVM(gamma=0.001, nu=0.03)

In [16]:
# Ground-truth labels come from 'Class'; the features are whatever remains
# once 'ID' and 'Class' are removed.
val_y = val_df['Class']
val_x = val_df.drop(columns=['ID', 'Class'])

In [10]:
# Raw OneClassSVM predictions on the validation features.
val_pred = svm.predict(val_x)

In [13]:
# Tally how many validation rows received each raw prediction value.
unique, counts = np.unique(val_pred, return_counts=True)
{value: n_rows for value, n_rows in zip(unique, counts)}

{-1: 864, 1: 27598}

In [14]:
def get_pred_label(model_pred):
    """Convert raw outlier-detector predictions into fraud labels.

    The OneClassSVM ``predict`` output uses 1 for inliers (normal) and -1 for
    outliers (fraud). This remaps them to 0 (normal) and 1 (fraud). Any other
    value is passed through unchanged.

    Parameters
    ----------
    model_pred : array-like
        Raw predictions containing 1 / -1. Lists and tuples are accepted as
        well as NumPy arrays.

    Returns
    -------
    numpy.ndarray
        Array of the same shape with 1 -> 0 and -1 -> 1.

    Notes
    -----
    Apply this only to raw predictions. In the converted output 1 means
    fraud, so passing converted labels through again would turn them into 0.
    """
    # Coerce first: comparing a plain list with ``== 1`` yields a single False
    # instead of an element-wise mask, which silently breaks the mapping.
    raw = np.asarray(model_pred)
    return np.where(raw == -1, 1, np.where(raw == 1, 0, raw))

In [15]:
# Convert straight from a fresh raw prediction so this cell can be re-run safely:
# feeding already-converted labels back into get_pred_label would map every
# fraud label (1) to 0.
val_pred = get_pred_label(svm.predict(val_x))

In [17]:
# Macro averaging is the unweighted mean of the per-class F1 scores, so the
# fraud class counts as much as the normal class.
val_score = f1_score(y_true=val_y, y_pred=val_pred, average='macro')
print(f'Validation F1 Score : [{val_score}]')
# Per-class precision / recall / F1 breakdown.
print(classification_report(y_true=val_y, y_pred=val_pred))

Validation F1 Score : [0.5204325263591213]
              precision    recall  f1-score   support

           0       1.00      0.97      0.98     28432
           1       0.03      0.83      0.06        30

    accuracy                           0.97     28462
   macro avg       0.51      0.90      0.52     28462
weighted avg       1.00      0.97      0.98     28462



### Inference

In [18]:
# Same feature selection as for training: drop only 'ID'.
test_x = test_df.drop(columns=['ID'])

In [19]:
# Raw OneClassSVM predictions on the test features.
test_pred = svm.predict(test_x)

In [20]:
# Convert straight from a fresh raw prediction so this cell can be re-run safely:
# feeding already-converted labels back into get_pred_label would map every
# fraud label (1) to 0.
test_pred = get_pred_label(svm.predict(test_x))

### Submission

In [21]:
# Load the submission template and preview its layout.
sample_submission_path = "/home/yjhwang/finance/CreditCard_Fraud/data/sample_submission.csv"
submit = pd.read_csv(sample_submission_path)
submit.head()

Unnamed: 0,ID,Class
0,AAAA0x1,1
1,AAAA0x2,1
2,AAAA0x5,1
3,AAAA0x7,1
4,AAAA0xc,1


In [22]:
# Overwrite the template's 'Class' column with the converted test predictions.
# NOTE(review): assumes the template rows are in the same order as test_df — confirm.
submit['Class'] = test_pred

In [23]:
# Distribution of predicted classes in the submission (sanity check on the flagged share).
submit['Class'].value_counts()

0    138043
1      4460
Name: Class, dtype: int64