## **초기 세팅**
* Google Colab 환경

In [None]:
# Switch the working directory to the Drive folder that holds the archive.
# NOTE(review): assumes Google Drive is already mounted at /content/drive — confirm.
%cd "/content/drive/MyDrive/Colab Notebooks"
# Extract the archive into the current directory.
# NOTE(review): presumably provides the CSV files read by the later cells — confirm.
!unzip "open.zip"

## **결과**
* Validation F1 Score (macro) : 70%
* 점수 : 68

## **Code**
### **Import**

In [None]:
import pandas as pd
import numpy as np

from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action='ignore')

### **Data Load**

In [None]:
# Read the training split from the working directory and preview its first rows.
train_df = pd.read_csv('./train.csv')
train_df.head()

In [None]:
# Read the validation split (it carries a `Class` label column that later cells
# use) and preview its first rows.
val_df = pd.read_csv('./val.csv')
val_df.head()

### **Train / Validation Feature 분포 확인**

In [None]:
# One histogram per training column, leaving out the ID column.
train_df.drop('ID', axis=1).hist(figsize=(20, 20), bins=50)
plt.show()

In [None]:
# Same view for the validation split; the ID and Class columns are left out so
# the panels line up with the training histograms.
val_df.drop(['ID', 'Class'], axis=1).hist(figsize=(20, 20), bins=50)
plt.show()

### **Validation set 사기 거래 비율**
* **Validation set의 사기 거래 비율이 다른 데이터 집합에서도 비슷하게 발생할 것이라고 가정**

In [None]:
# Estimate the fraud share of the validation set. The value is later passed to
# IsolationForest as `contamination`, which scikit-learn defines as the
# proportion of outliers in the data set, i.e. fraud / total rather than the
# fraud-to-normal odds (fraud / normal).
# Counts are looked up by label instead of unpacking value_counts() positionally,
# so the result does not depend on which class happens to be more frequent.
# NOTE(review): assumes Class == 1 marks fraud and Class == 0 marks normal — confirm.
class_counts = val_df['Class'].value_counts()
val_normal = class_counts.get(0, 0)
val_fraud = class_counts.get(1, 0)
val_contamination = val_fraud / (val_normal + val_fraud)
print(f'Validation contamination : [{val_contamination}]')

### **Model Define & Fit**

In [None]:
# The training dataset has no label column.
train_x = train_df.drop(columns=['ID']) # model input (ID excluded)

In [None]:
# Hypothesis: fraud occurs in the training data at the same rate as in the
# validation data, so the validation-derived rate is used as `contamination`.
model = IsolationForest(
    n_estimators=125,
    max_samples=len(train_x),  # draw as many samples per tree as there are training rows
    contamination=val_contamination,
    random_state=42,
    verbose=0,
)
model.fit(train_x)

### **Evaluation : Validation set**

In [None]:
def get_pred_label(model_pred):
    """Convert outlier-detector output into 0/1 class labels.

    Values equal to 1 (inlier) become 0 and values equal to -1 (outlier)
    become 1; any other value is passed through unchanged.

    Parameters
    ----------
    model_pred : array-like
        Predictions made of +1 / -1 values, e.g. the result of a
        ``predict`` call. Lists and other sequences are accepted.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``model_pred`` holding the mapped labels.
    """
    # Coerce first: comparing a plain Python list with ``== 1`` yields a single
    # bool instead of an element-wise mask, which would mislabel every entry.
    model_pred = np.asarray(model_pred)
    return np.where(model_pred == -1, 1, np.where(model_pred == 1, 0, model_pred))

In [None]:
# Separate the validation frame into ground-truth labels and model inputs.
val_y = val_df['Class']
val_x = val_df.drop(columns=['ID', 'Class'])

# Predict and map the detector's +1 / -1 output onto the 0 / 1 labels of `Class`.
val_pred = get_pred_label(model.predict(val_x))

# Macro-averaged F1 plus the per-class breakdown.
val_score = f1_score(val_y, val_pred, average='macro')
print(f'Validation F1 Score : [{val_score}]')
print(classification_report(val_y, val_pred))

### **Inference : Test set**

In [None]:
# Read the test split from the working directory and preview its first rows.
test_df = pd.read_csv('./test.csv')
test_df.head()

In [None]:
# Model input for inference: every test column except the ID column.
test_x = test_df.drop(columns=['ID'])

In [None]:
# Feed the test features to the model as-is. The model was fitted on the raw,
# unscaled `train_x`, so rescaling only the test set (previously attempted
# through an undefined `Scaler`, which raised NameError on a fresh run) would
# put the inputs on a different scale from the one the trees were built on.
# Instead, just verify that the test features line up with the training features.
assert list(test_x.columns) == list(train_x.columns), 'test/train feature columns differ'

In [None]:
# Run the fitted model on the test features and map its +1 / -1 output to 0 / 1 labels.
test_pred = get_pred_label(model.predict(test_x))

### **Submission**

In [None]:
# Read the sample submission file, which serves as the template to fill in.
submit = pd.read_csv('./sample_submission.csv')
submit.head()

In [None]:
# Store the test predictions in the `Class` column and save the result
# without the DataFrame index.
submit['Class'] = test_pred
submit.to_csv('./submit.csv',index=False)

In [None]:
# Preview the first rows of the submission frame.
submit.head()