## 01. 라이브러리 불러오기

In [1]:
import numpy as np
import pandas as pd
import regex as re
import gc
import matplotlib.pyplot as plt
import pyarrow.feather as feather
import seaborn as sns
import datatable as dt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/amex-data-integer-dtypes-parquet-format/train.parquet
/kaggle/input/amex-data-integer-dtypes-parquet-format/test.parquet
/kaggle/input/amex-default-prediction/sample_submission.csv
/kaggle/input/amex-default-prediction/train_data.csv
/kaggle/input/amex-default-prediction/test_data.csv
/kaggle/input/amex-default-prediction/train_labels.csv


## 02. 데이터 불러오기

In [2]:
preData = pd.read_parquet('/kaggle/input/amex-data-integer-dtypes-parquet-format/train.parquet')
preLabel = pd.read_csv('/kaggle/input/amex-default-prediction/train_labels.csv', low_memory=True)

## 03. 데이터 준비

In [3]:
# 데이터 정제 : 50% 이상 missing data 날려주기
def process_data(data):
    missing_data = [col for col in data.columns if data[col].isna().sum() >  0.5*len(data.index)]
    data = data.drop(columns=missing_data)
    
    # Feature engineering: number of transactions
    numTx = data['customer_ID'].value_counts().tolist()
    data = data.groupby('customer_ID').tail(1)
    data = data.reset_index(drop=True)
    data.insert(2,"numTx",numTx)
    data = data.drop(columns=["S_2", "customer_ID"])

    # 데이터 채우기
    for i in data.columns:
        data[i] = data[i].fillna(data[i].mean())
    return data    

In [4]:
preData = process_data(preData)

## 04. 학습

In [5]:
X_train, X_val, y_train, y_val = train_test_split(preData,preLabel['target'],test_size=0.2, random_state=42)

RAM 용량 확보를 위해 사용했던 데이터를 지워준다.

In [6]:
del preData, preLabel
gc.collect()

42

## 05. CATBOOST 모델 쌓기

In [7]:
# iter = 100, 18 seconds, 3000+GPU = 2 mins
# 출처 : https://www.kaggle.com/code/alvinleenh/amex-prediction-with-cat-boost/notebook

import catboost as cat
from catboost import CatBoostClassifier
from catboost import CatBoostRegressor

In [8]:
clf = CatBoostClassifier(iterations=3000, task_type="GPU",bagging_temperature = 0.2)
clf.fit(X_train,y_train,eval_set=(X_val,y_val),verbose=True)

Learning rate set to 0.027352
0:	learn: 0.6593682	test: 0.6592802	best: 0.6592802 (0)	total: 22.1ms	remaining: 1m 6s
1:	learn: 0.6294119	test: 0.6293437	best: 0.6293437 (1)	total: 44.5ms	remaining: 1m 6s
2:	learn: 0.6003773	test: 0.6003030	best: 0.6003030 (2)	total: 58.8ms	remaining: 58.7s
3:	learn: 0.5752573	test: 0.5751637	best: 0.5751637 (3)	total: 72.8ms	remaining: 54.5s
4:	learn: 0.5528973	test: 0.5528186	best: 0.5528186 (4)	total: 87.1ms	remaining: 52.2s
5:	learn: 0.5321529	test: 0.5320079	best: 0.5320079 (5)	total: 101ms	remaining: 50.6s
6:	learn: 0.5129104	test: 0.5127251	best: 0.5127251 (6)	total: 116ms	remaining: 49.4s
7:	learn: 0.4931577	test: 0.4930029	best: 0.4930029 (7)	total: 130ms	remaining: 48.5s
8:	learn: 0.4766087	test: 0.4764671	best: 0.4764671 (8)	total: 144ms	remaining: 47.8s
9:	learn: 0.4612702	test: 0.4611056	best: 0.4611056 (9)	total: 158ms	remaining: 47.3s
10:	learn: 0.4477575	test: 0.4475591	best: 0.4475591 (10)	total: 172ms	remaining: 46.8s
11:	learn: 0.4342

<catboost.core.CatBoostClassifier at 0x7f022809e8d0>

In [9]:
prediction = clf.predict_proba(X_val)
rounded_predictions = np.argmax(prediction, axis=-1)
c_matrix = confusion_matrix(y_val,rounded_predictions)
dt_acc = c_matrix.trace()/c_matrix.sum()
print(c_matrix)
print(dt_acc)

[[63807  4433]
 [ 4595 18948]]
0.9016375581534707


RAM 용량 확보를 위해 사용한 데이터 처리하기

## 06. Test Data 준비

In [10]:
testData = pd.read_parquet('/kaggle/input/amex-data-integer-dtypes-parquet-format/test.parquet')
testCustomer  = pd.read_csv('/kaggle/input/amex-default-prediction/sample_submission.csv', usecols=['customer_ID'], low_memory=True)

In [11]:
testData = process_data(testData)

In [12]:
prediction = clf.predict_proba(testData)
final_predictions = prediction[:,1]

In [13]:
output = pd.DataFrame({'customer_ID': testCustomer.customer_ID, 'prediction': final_predictions})
output.to_csv('submission.csv', index=False)

## 제출 score : 0.786