In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/non-fulfillment/sample_submission.csv
/kaggle/input/non-fulfillment/train.csv
/kaggle/input/non-fulfillment/test.csv


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings(action='ignore')

In [3]:
# Load the training and evaluation data, then discard the 'UID' column
# from each frame.
# (For a local run, read './train.csv' and './test.csv' instead.)
train_df = pd.read_csv('/kaggle/input/non-fulfillment/train.csv')
train_df = train_df.drop(columns=['UID'])

test_df = pd.read_csv('/kaggle/input/non-fulfillment/test.csv')
test_df = test_df.drop(columns=['UID'])

In [4]:
# Categorical columns that are replaced by one-hot indicator columns.
categorical_col = [
    '주거 형태',
    '현재 직장 근속 연수',
    '대출 목적',
    '대출 상환 기간'
]

# Build an encoder that returns a dense array. scikit-learn 1.2 renamed the
# `sparse` keyword to `sparse_output` and 1.4 removed the old name, so try the
# current keyword first and fall back for older installations.
# handle_unknown='ignore' makes categories that were not seen during fit
# encode as an all-zero row instead of raising.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Learn the category vocabulary from the training data only.
encoder.fit(train_df[categorical_col])

# Apply the fitted encoder to both the training and the test data.
train_encoded = encoder.transform(train_df[categorical_col])
test_encoded = encoder.transform(test_df[categorical_col])

# Wrap the encoded arrays in DataFrames; both share the same generated
# column names, so compute them once.
encoded_columns = encoder.get_feature_names_out(categorical_col)
train_encoded_df = pd.DataFrame(train_encoded, columns=encoded_columns)
test_encoded_df = pd.DataFrame(test_encoded, columns=encoded_columns)

# Swap the raw categorical columns for their encoded counterparts. The index
# is reset so the column-wise concat lines up with the encoded frames'
# default RangeIndex.
# NOTE: this rebinds train_df/test_df, so re-running the cell without
# reloading the data fails because the categorical columns are already gone.
train_df = pd.concat([train_df.drop(columns=categorical_col).reset_index(drop=True), train_encoded_df], axis=1)
test_df = pd.concat([test_df.drop(columns=categorical_col).reset_index(drop=True), test_encoded_df], axis=1)

In [5]:
# Hold out 20% of the training rows for validation; the fixed random_state
# keeps the split repeatable across runs.
target_col = '채무 불이행 여부'
X_train, X_val, y_train, y_val = train_test_split(
    train_df.drop(columns=[target_col]),
    train_df[target_col],
    random_state=42,
    test_size=0.2,
)

In [6]:
# Train the XGBoost model.
# `early_stopping_rounds` is configured on the estimator: passing it to
# `fit()` has been deprecated since xgboost 1.6 and is no longer accepted by
# recent releases. `use_label_encoder` is omitted because it is deprecated and
# has no effect in the xgboost versions that support constructor-level early
# stopping.
model = XGBClassifier(
    n_estimators=100,          # maximum number of boosting rounds (trees)
    max_depth=5,               # maximum tree depth
    learning_rate=0.15,        # learning rate
    random_state=42,
    eval_metric="auc",         # metric reported per round and used for early stopping
    early_stopping_rounds=10,  # stop after 10 rounds without improvement
)

# Train while monitoring both the training and the validation split.
# Per the xgboost documentation, when eval_set has several entries the last
# one (here the validation split) drives early stopping.
eval_set = [(X_train, y_train), (X_val, y_val)]
model.fit(
    X_train, y_train,
    eval_set=eval_set,
    verbose=True,  # print the per-round evaluation log
)

[0]	validation_0-auc:0.73056	validation_1-auc:0.67897
[1]	validation_0-auc:0.74434	validation_1-auc:0.69985
[2]	validation_0-auc:0.75222	validation_1-auc:0.70013
[3]	validation_0-auc:0.75962	validation_1-auc:0.70342
[4]	validation_0-auc:0.76341	validation_1-auc:0.70646
[5]	validation_0-auc:0.76785	validation_1-auc:0.70622
[6]	validation_0-auc:0.77482	validation_1-auc:0.71070
[7]	validation_0-auc:0.78105	validation_1-auc:0.71547
[8]	validation_0-auc:0.78692	validation_1-auc:0.71940
[9]	validation_0-auc:0.78952	validation_1-auc:0.72130
[10]	validation_0-auc:0.79418	validation_1-auc:0.72203
[11]	validation_0-auc:0.79838	validation_1-auc:0.72385
[12]	validation_0-auc:0.80065	validation_1-auc:0.72424
[13]	validation_0-auc:0.80335	validation_1-auc:0.72626
[14]	validation_0-auc:0.80676	validation_1-auc:0.72740
[15]	validation_0-auc:0.80895	validation_1-auc:0.72852
[16]	validation_0-auc:0.81204	validation_1-auc:0.72941
[17]	validation_0-auc:0.81473	validation_1-auc:0.72946
[18]	validation_0-au

In [7]:
# Predict the default *probability* for every test row.
# predict_proba returns one column per class; column index 1 is kept
# (presumably the "default" class -- confirm against the target encoding).
class_proba = model.predict_proba(test_df)
preds = class_proba[:, 1]

In [8]:
# Load the sample submission file and fill in the predicted probabilities.
sample_path = '/kaggle/input/non-fulfillment/sample_submission.csv'
submit = pd.read_csv(sample_path)
submit['채무 불이행 확률'] = preds

# Save the result to the working directory without the index column.
# 'UTF-8-sig' writes a byte-order mark (presumably so the Korean header is
# read correctly by spreadsheet tools -- confirm if this matters).
submit.to_csv('./submission.csv', index=False, encoding='UTF-8-sig')