## 01. 라이브러리 불러오기

In [1]:
import numpy as np
import pandas as pd
import regex as re
import gc
import matplotlib.pyplot as plt
import pyarrow.feather as feather
import seaborn as sns
import datatable as dt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import os
# Print the full path of every file found under the Kaggle input directory.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

## 02. 데이터 불러오기

In [2]:
# Load the training features (parquet) and the training labels (CSV).
# The labels file is expected to contain a 'target' column, which is what
# the train/validation split reads later.
preData = pd.read_parquet('/kaggle/input/amex-data-integer-dtypes-parquet-format/train.parquet')
preLabel = pd.read_csv('/kaggle/input/amex-default-prediction/train_labels.csv', low_memory=True)

## 03. 데이터 준비

In [3]:
# Data cleaning: drop columns that are more than 50% missing, reduce the data
# to one row per customer, and impute the remaining gaps.
def process_data(data, missing_threshold=0.5):
    """Collapse per-statement rows into one feature row per customer.

    Steps, in order:
      1. Drop every column whose number of missing values is strictly
         greater than ``missing_threshold * len(data)``.
      2. Keep only the last row of each ``customer_ID`` (input row order).
      3. Insert ``numTx`` -- the number of input rows that customer had --
         at column position 2.
      4. Drop the ``S_2`` and ``customer_ID`` columns.
      5. Fill remaining NaNs in each column with that column's mean,
         computed over the reduced one-row-per-customer frame.

    NOTE: the set of dropped columns is derived from the frame passed in, so
    two different inputs (e.g. train and test) can end up with different
    feature columns.

    Args:
        data: DataFrame with ``customer_ID`` and ``S_2`` columns plus
            feature columns. The caller's frame is not modified.
        missing_threshold: Fraction of missing values above which a column
            is dropped. Defaults to 0.5, the previously hard-coded value.

    Returns:
        A new DataFrame with one row per customer, a fresh 0..n-1 index,
        the ``numTx`` column, and without ``S_2`` / ``customer_ID``.
    """
    # Vectorised missing-value count per column (one pass over the frame).
    missing_counts = data.isna().sum()
    too_sparse = missing_counts[missing_counts > missing_threshold * len(data.index)].index
    data = data.drop(columns=too_sparse)

    # Feature engineering: number of transactions per customer.
    # value_counts() is ordered by count (descending), NOT by the order in
    # which customers appear, so the counts must be looked up by customer_ID
    # rather than attached positionally.
    rows_per_customer = data['customer_ID'].value_counts()
    data = data.groupby('customer_ID').tail(1)
    data = data.reset_index(drop=True)
    num_tx = data['customer_ID'].map(rows_per_customer).to_numpy(dtype='int64')
    data.insert(2, "numTx", num_tx)
    data = data.drop(columns=["S_2", "customer_ID"])

    # Fill remaining gaps with the per-column mean.
    for col in data.columns:
        data[col] = data[col].fillna(data[col].mean())
    return data

In [4]:
# Replace the raw training rows with the processed frame returned by
# process_data (the raw frame is no longer referenced afterwards).
preData = process_data(preData)

## 04. Train Test Split

In [5]:
# Hold out 20% of the rows for validation; random_state fixes the shuffle.
# NOTE(review): features and labels are paired purely by row position here
# (customer_ID has already been dropped from preData) -- this assumes the
# label file lists customers in the same order as the processed feature
# rows. TODO confirm.
X_train, X_val, y_train, y_val = train_test_split(preData,preLabel['target'],test_size=0.2, random_state=42)

In [6]:
# The train/validation splits have been created, so drop the references to
# the full feature frame and the label table and run a garbage collection.
del preData, preLabel
gc.collect()

## 05. CATBOOST 모델 쌓기

In [7]:
import catboost as cat
from catboost import CatBoostClassifier
from catboost import CatBoostRegressor

# Configure a CatBoost classifier (5000 iterations, GPU task type, fixed
# seed) and fit it on the training split, passing the validation split as
# eval_set.
clf = CatBoostClassifier(iterations=5000, task_type="GPU",random_state=22, bagging_temperature = 0.2)
clf.fit(X_train,y_train,eval_set=(X_val,y_val),verbose=True)

In [8]:
# Score the validation split: take the most probable class per row, build the
# confusion matrix, and report accuracy as (correct predictions) / (all rows),
# i.e. the matrix diagonal sum divided by the total count.
prediction = clf.predict_proba(X_val)
rounded_predictions = prediction.argmax(axis=-1)
c_matrix = confusion_matrix(y_val, rounded_predictions)
dt_acc = np.trace(c_matrix) / np.sum(c_matrix)
print(c_matrix)
print(dt_acc)

In [9]:
# Cleanup: drop the references to the train/validation splits and run a
# garbage collection before the test data is loaded.
del X_train, X_val, y_train, y_val
gc.collect()

## 06. Test Data 준비

In [10]:
# Load the test features (parquet) and, from the sample submission, only the
# customer_ID column that the final submission frame is built from.
testData = pd.read_parquet('/kaggle/input/amex-data-integer-dtypes-parquet-format/test.parquet')
testCustomer  = pd.read_csv('/kaggle/input/amex-default-prediction/sample_submission.csv', usecols=['customer_ID'], low_memory=True)

In [11]:
# Run the test rows through the same preprocessing function as the training
# data.
# NOTE(review): process_data decides which columns to drop from the frame it
# is given, so the resulting test columns may not match the columns the
# model was trained on -- verify.
testData = process_data(testData)

In [12]:
# Predict class probabilities for the processed test rows and keep column 1
# (presumably the probability of target == 1 -- confirm against clf.classes_).
prediction = clf.predict_proba(testData)
final_predictions = prediction[:,1]

In [13]:
# Build the submission table and write it to submission.csv without the index.
# NOTE(review): IDs and predictions are paired by row position, which assumes
# the sample submission lists customers in the same order as the processed
# test rows -- TODO confirm.
output = pd.DataFrame({'customer_ID': testCustomer.customer_ID, 'prediction': final_predictions})
output.to_csv('submission.csv', index=False)

## 제출 score : 0.786