In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings 

from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report


warnings.filterwarnings(action='ignore')

In [17]:
train_df = pd.read_csv('./Data/train.csv') # Train
train_df = train_df.drop(columns=['ID']) 
val_df = pd.read_csv('./Data/val.csv') # Validation

val_normal, val_fraud = val_df['Class'].value_counts()
val_contamination = val_fraud / val_normal

train_x = train_df.copy()

def get_pred_label(model_pred):
    model_pred = np.where(model_pred == 1, 0, model_pred)
    model_pred = np.where(model_pred == -1, 1, model_pred)
    return model_pred

val_x = val_df.drop(columns=['ID', 'Class']) # Input Data
val_y = val_df['Class'] # Label

In [18]:


from sklearn import preprocessing

def df_normal(norm_df):
    col = list(norm_df.columns[:])
    x = norm_df[col].values
    #최대값과 최소값의 값 가져오기
    min_max_scaler = preprocessing.MinMaxScaler()
    min_max_scaler
    # 정규화 시킬 최종 값은 비율로 계산되기 때문에 float설정
    x_scaled = min_max_scaler.fit_transform(x.astype(float))
    x_scaled

    norm_df_norm = pd.DataFrame(x_scaled, columns=col, index=norm_df.index)
    return norm_df_norm

In [20]:
train_df_norm = df_normal(train_df)
val_x_norm = df_normal(val_x)

In [26]:
train_df_norm.to_csv("./norm_Data/train_norm.csv",index=False)
val_x_norm.to_csv("./norm_Data/val_x_norm.csv",index=False)


In [28]:
val_x_norm.tail()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30
28457,0.907349,0.725133,0.868055,0.348965,0.481234,0.551452,0.561737,0.685828,0.48064,0.643668,...,0.388826,0.637336,0.568382,0.438913,0.523647,0.257375,0.545613,0.354415,0.003069,0.999826
28458,0.900517,0.709703,0.850779,0.27071,0.489491,0.544557,0.556383,0.689571,0.501823,0.609683,...,0.382689,0.554674,0.575542,0.525004,0.50467,0.399586,0.521043,0.3445,0.006724,0.999861
28459,0.910463,0.722728,0.87226,0.352196,0.482466,0.550039,0.564999,0.651923,0.519854,0.648162,...,0.405864,0.605004,0.568999,0.428226,0.52294,0.26452,0.5565,0.356753,0.005085,0.999896
28460,0.988342,0.695769,0.842372,0.308731,0.469669,0.545766,0.543473,0.675804,0.56885,0.653727,...,0.37527,0.535192,0.579067,0.378578,0.494755,0.385389,0.524941,0.344093,0.000225,0.999977
28461,0.916931,0.708646,0.89758,0.335219,0.461629,0.593477,0.531229,0.696403,0.56636,0.634312,...,0.389839,0.638785,0.568656,0.453265,0.472822,0.459472,0.535191,0.351548,0.00084,1.0


In [30]:
# 정규화된 데이터 테스트

collist01 = ['V3','V9','V11','V12','V16','V17']
train_x_norm =train_df_norm.loc[:,collist01]
model = IsolationForest(n_estimators=50, max_samples=len(train_x_norm), contamination=0.00105, random_state=42, verbose=0)
model.fit(train_x_norm)
## Evaluation : Validation set
def get_pred_label(model_pred):
    # IsolationForest 모델 출력 (1:정상, -1:불량(사기)) 이므로 (0:정상, 1:불량(사기))로 Label 변환
    model_pred = np.where(model_pred == 1, 0, model_pred)
    model_pred = np.where(model_pred == -1, 1, model_pred)
    return model_pred
val_x_norm2 = val_x_norm.loc[:,collist01]
val_y = val_df['Class'] # Label

val_pred = model.predict(val_x_norm2) # model prediction
val_pred = get_pred_label(val_pred)
val_score = f1_score(val_y, val_pred, average='macro')
print(f'Validation F1 Score : [{val_score}]')
print(classification_report(val_y, val_pred))

Validation F1 Score : [0.6061198984211713]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.13      0.63      0.21        30

    accuracy                           1.00     28462
   macro avg       0.56      0.81      0.61     28462
weighted avg       1.00      1.00      1.00     28462



In [43]:
# 일반 데이터 데스트
collist01 = ['V3','V9','V11','V12','V16']
train_x =train_df.loc[:,collist01]
model = IsolationForest(n_estimators=30, max_samples=len(train_x), contamination=0.00105, random_state=42, verbose=0)
model.fit(train_x)
## Evaluation : Validation set
def get_pred_label(model_pred):
    # IsolationForest 모델 출력 (1:정상, -1:불량(사기)) 이므로 (0:정상, 1:불량(사기))로 Label 변환
    model_pred = np.where(model_pred == 1, 0, model_pred)
    model_pred = np.where(model_pred == -1, 1, model_pred)
    return model_pred
val_x = val_df.loc[:,collist01]
val_y = val_df['Class'] # Label

val_pred = model.predict(val_x) # model prediction
val_pred = get_pred_label(val_pred)
val_score = f1_score(val_y, val_pred, average='macro')
print(f'Validation F1 Score : [{val_score}]')
print(classification_report(val_y, val_pred))

Validation F1 Score : [0.7928924258723169]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.61      0.57      0.59        30

    accuracy                           1.00     28462
   macro avg       0.80      0.78      0.79     28462
weighted avg       1.00      1.00      1.00     28462



In [33]:
#제출용
test_df = pd.read_csv('./Data/test.csv') # Train
test_df.head()
test_x = test_df.drop(columns=['ID'])
test_pred = model.predict(test_x) # model prediction
test_pred = get_pred_label(test_pred)
## Submission
submit = pd.read_csv('../Data/sample_submission.csv')
submit.head()
submit['Class'] = test_pred
submit.to_csv('./submit.csv', index=False)

ValueError: X has 30 features, but IsolationForest is expecting 2 features as input.