In [48]:
import warnings

# 오류 경고 무시하기
warnings.filterwarnings(action='ignore')

In [49]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import f1_score, make_scorer

In [50]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

display(train_df.head(3))
display(test_df.head(3))

Unnamed: 0,ID,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,대출등급
0,TRAIN_00000,12480000,36 months,6 years,RENT,72000000,18.9,15,부채 통합,0,0,0.0,0.0,0.0,C
1,TRAIN_00001,14400000,60 months,10+ years,MORTGAGE,130800000,22.33,21,주택 개선,0,373572,234060.0,0.0,0.0,B
2,TRAIN_00002,12000000,36 months,5 years,MORTGAGE,96000000,8.6,14,부채 통합,0,928644,151944.0,0.0,0.0,A


Unnamed: 0,ID,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수
0,TEST_00000,16800000,36 months,8 years,MORTGAGE,132000000,19.64,12,주택 개선,0,394692,146604.0,0.0,0.0
1,TEST_00001,8400000,36 months,5 years,RENT,89971200,15.84,25,부채 통합,0,0,0.0,0.0,0.0
2,TEST_00002,17280000,36 months,6 years,RENT,150000000,8.41,20,신용 카드,0,1786980,281820.0,0.0,0.0


In [51]:
train_df['대출목적'].value_counts()

대출목적
부채 통합     55150
신용 카드     24500
주택 개선      6160
기타         4725
주요 구매      1803
의료         1039
자동차         797
소규모 사업      787
이사          506
휴가          466
주택          301
재생 에너지       60
Name: count, dtype: int64

In [52]:
train_df[train_df['주택소유상태']=='ANY']

Unnamed: 0,ID,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,대출등급
28730,TRAIN_28730,41160000,36 months,10+ years,ANY,93600000,17.71,20,기타,0,0,0.0,0.0,0.0,C


In [53]:
train_df[train_df['근로기간']=='Unknown'].대출등급.value_counts()

대출등급
B    1742
C    1628
A     934
D     872
E     378
F      92
G      25
Name: count, dtype: int64

In [54]:
train_df.drop(train_df[train_df['주택소유상태']=='ANY'].index, inplace=True)
train_df.drop(train_df[train_df['근로기간']=='Unknown'].index, inplace=True)
train_df

Unnamed: 0,ID,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,대출등급
0,TRAIN_00000,12480000,36 months,6 years,RENT,72000000,18.90,15,부채 통합,0,0,0.0,0.0,0.0,C
1,TRAIN_00001,14400000,60 months,10+ years,MORTGAGE,130800000,22.33,21,주택 개선,0,373572,234060.0,0.0,0.0,B
2,TRAIN_00002,12000000,36 months,5 years,MORTGAGE,96000000,8.60,14,부채 통합,0,928644,151944.0,0.0,0.0,A
3,TRAIN_00003,14400000,36 months,8 years,MORTGAGE,132000000,15.09,15,부채 통합,0,325824,153108.0,0.0,0.0,C
5,TRAIN_00005,4800000,36 months,10+ years,RENT,84000000,13.78,30,휴가,0,240216,55428.0,0.0,0.0,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96289,TRAIN_96289,14400000,36 months,10+ years,MORTGAGE,210000000,9.33,33,신용 카드,0,974580,492168.0,0.0,0.0,C
96290,TRAIN_96290,28800000,60 months,10+ years,MORTGAGE,132000000,5.16,25,주택 개선,0,583728,855084.0,0.0,0.0,E
96291,TRAIN_96291,14400000,36 months,1 year,MORTGAGE,84000000,11.24,22,신용 카드,0,1489128,241236.0,0.0,0.0,A
96292,TRAIN_96292,15600000,36 months,5 years,MORTGAGE,66330000,17.30,21,부채 통합,2,1378368,818076.0,0.0,0.0,D


In [55]:
# x, y 분리
train_x = train_df.drop(columns=['ID', '대출등급'])
train_y = train_df['대출등급']

test_x = test_df.drop(columns=['ID'])

In [56]:
# 범주형 변수 카테고리형으로 변환
for col in ['대출기간', '주택소유상태', '대출목적', '근로기간']:
    train_x[col] = train_x[col].astype('category')
    test_x[col] = test_x[col].astype('category')

In [58]:
# 수치형 변수 스케일링
continuous_vars = [
    '대출금액', '연간소득', '부채_대비_소득_비율', '총계좌수', 
    '최근_2년간_연체_횟수', '총상환원금', '총상환이자', '총연체금액', '연체계좌수'
]

scaler = MinMaxScaler()

train_x[continuous_vars] = scaler.fit_transform(train_x[continuous_vars])
test_x[continuous_vars] = scaler.transform(test_x[continuous_vars])


display(train_x.head(3))
display(test_x.head(3))

Unnamed: 0,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수
0,0.276471,36 months,6 years,RENT,0.006075,0.18883,0.066667,부채 통합,0.0,0.0,0.0,0.0,0.0
1,0.323529,60 months,10+ years,MORTGAGE,0.011522,0.223099,0.10303,주택 개선,0.0,0.008904,0.041402,0.0,0.0
2,0.264706,36 months,5 years,MORTGAGE,0.008298,0.085923,0.060606,부채 통합,0.0,0.022134,0.026876,0.0,0.0


Unnamed: 0,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수
0,0.382353,36 months,8 years,MORTGAGE,0.011634,0.196223,0.048485,주택 개선,0.0,0.009407,0.025932,0.0,0.0
1,0.176471,36 months,5 years,RENT,0.00774,0.158258,0.127273,부채 통합,0.0,0.0,0.0,0.0,0.0
2,0.394118,36 months,6 years,RENT,0.013301,0.084024,0.09697,신용 카드,0.0,0.042592,0.04985,0.0,0.0


In [59]:
# LightGBM 분류기 초기화
model = lgb.LGBMClassifier(objective='multiclass', metric='multi_logloss', force_row_wise=True)

# 하이퍼파라미터 그리드 설정
param_grid = {
    'num_leaves': [31, 63, 127],  # 각 트리가 가질 수 있는 최대 리프의 수
    'max_depth': [10, 20, 30],  # 트리의 최대 깊이
    'learning_rate': [0.01, 0.1, 0.2],  # 학습률
    'n_estimators': [100, 200, 300],  # 부스팅 라운드 수
}

# f1_macro를 사용하기 위한 스코어러 생성
f1_macro_scorer = make_scorer(f1_score, average='macro')

# GridSearchCV 초기화
grid_search = GridSearchCV(estimator=model, param_grid=param_grid,
                           cv=3, scoring=f1_macro_scorer, verbose=2)

# 하이퍼파라미터 튜닝 실행
grid_search.fit(train_x, train_y)

# 최적의 하이퍼파라미터 출력
print('Best parameters found by grid search are:', grid_search.best_params_)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
[LightGBM] [Info] Total Bins 1451
[LightGBM] [Info] Number of data points in the train set: 60414, number of used features: 13
[LightGBM] [Info] Start training from score -1.744242
[LightGBM] [Info] Start training from score -1.208075
[LightGBM] [Info] Start training from score -1.248839
[LightGBM] [Info] Start training from score -1.982438
[LightGBM] [Info] Start training from score -2.564354
[LightGBM] [Info] Start training from score -3.885303
[LightGBM] [Info] Start training from score -5.433027
[CV] END learning_rate=0.01, max_depth=10, n_estimators=100, num_leaves=31; total time=   2.1s
[LightGBM] [Info] Total Bins 1447
[LightGBM] [Info] Number of data points in the train set: 60415, number of used features: 13
[LightGBM] [Info] Start training from score -1.744259
[LightGBM] [Info] Start training from score -1.208092
[LightGBM] [Info] Start training from score -1.248856
[LightGBM] [Info] Start training from score -1.98

In [60]:

# 최적의 모델로 예측
best_model = grid_search.best_estimator_
preds = best_model.predict(test_x)

# 예측 결과 확인
print(preds)

['B' 'B' 'A' ... 'D' 'C' 'A']


In [62]:
# 예측 DF 생성
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission['대출등급'] = preds.ravel()
sample_submission

Unnamed: 0,ID,대출등급
0,TEST_00000,B
1,TEST_00001,B
2,TEST_00002,A
3,TEST_00003,C
4,TEST_00004,C
...,...,...
64192,TEST_64192,D
64193,TEST_64193,D
64194,TEST_64194,D
64195,TEST_64195,C


In [64]:
# 예측 결과 저장
sample_submission.to_csv('baseline_submit.csv', index=False)