# 고객 대출등급 분류 해커톤 - Modeling

- 알고리즘 | 정형 | 분류 | 금융 | Macro F1 Score
- 경진대회 링크 : https://dacon.io/competitions/official/236214/codeshare
- [주제] 고객의 대출등급을 예측하는 AI 알고리즘 개발
- [설명] 대출 고객과 관련된 데이터 분석을 통해 고객의 대출등급 예측하는 AI 모델을 개발해야 합니다.


In [1]:
import random
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import f1_score

from sklearn.tree import DecisionTreeClassifier as dct
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

In [3]:
train_df = pd.read_csv('data/train_preprocessing.csv')
test_df = pd.read_csv('data/test_preprocessing.csv').set_index('ID')

In [4]:
y = train_df['대출등급']
X = train_df.drop('대출등급', axis=1).set_index('ID')

In [5]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)

## 1. 베이스라인 모델 비교

In [6]:
def train_and_evaluate(model_name, X, y):
    # 모델 선택 및 훈련
    if model_name == 'lightgbm':
        model = lgb.LGBMClassifier()
    elif model_name == 'xgboost':
        model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='')
    elif model_name == 'catboost':
        model = cb.CatBoostClassifier(verbose=0)
    elif model_name == 'decisiontree':
        model = dct(max_depth=16, min_samples_leaf= 3, min_samples_split= 7)
    else:
        raise ValueError("Model not supported.")

    # 데이터 분할
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    model.fit(X_train, y_train)
    
    # 예측 및 평가
    predictions = model.predict(X_test)
    f1 = f1_score(y_test, predictions, average='macro')
    
    return f1

In [7]:
def compare_models(X, y):
    models = ['decisiontree', 'lightgbm', 'catboost']#, 'xgboost']
    results = {}
    
    for model in models:
        results[model] = train_and_evaluate(model, X, y)

    return results

In [8]:
baseline_result = compare_models(X, y_encoded)
baseline_result

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001671 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2731
[LightGBM] [Info] Number of data points in the train set: 77035, number of used features: 18
[LightGBM] [Info] Start training from score -1.743795
[LightGBM] [Info] Start training from score -1.209157
[LightGBM] [Info] Start training from score -1.248004
[LightGBM] [Info] Start training from score -1.976168
[LightGBM] [Info] Start training from score -2.569816
[LightGBM] [Info] Start training from score -3.904071
[LightGBM] [Info] Start training from score -5.446880


{'decisiontree': 0.9248085438228377,
 'lightgbm': 0.9031323457170647,
 'catboost': 0.9025346325576863}

## 2. Decisiontree

In [9]:
def tune_decision_tree(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    dtc = dct()

    parameters = {
        'max_depth': np.arange(10, 23),  
        'min_samples_leaf': np.arange(1, 6),
        'min_samples_split': np.arange(4, 11)
    }

    random_search = RandomizedSearchCV(dtc, param_distributions=parameters, n_iter=20, cv=3, verbose=2, n_jobs=-1, scoring='f1_macro')
    random_search.fit(X_train, y_train)

    print("Best parameters found: ", random_search.best_params_)
    print("Best score: ", random_search.best_score_)

    best_model = random_search.best_estimator_
    predictions = best_model.predict(X_test)

    return best_model, predictions

In [10]:
dt_model, dt_pred = tune_decision_tree(X, y_encoded)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END max_depth=18, min_samples_leaf=5, min_samples_split=9; total time=   0.9s
[CV] END max_depth=18, min_samples_leaf=5, min_samples_split=9; total time=   0.9s
[CV] END max_depth=15, min_samples_leaf=2, min_samples_split=6; total time=   0.9s
[CV] END max_depth=18, min_samples_leaf=5, min_samples_split=9; total time=   0.9s
[CV] END max_depth=15, min_samples_leaf=2, min_samples_split=6; total time=   1.0s
[CV] END max_depth=15, min_samples_leaf=2, min_samples_split=10; total time=   0.9s
[CV] END max_depth=15, min_samples_leaf=2, min_samples_split=10; total time=   0.9s
[CV] END max_depth=14, min_samples_leaf=5, min_samples_split=4; total time=   0.9s
[CV] END max_depth=15, min_samples_leaf=2, min_samples_split=6; total time=   1.0s
[CV] END max_depth=15, min_samples_leaf=2, min_samples_split=10; total time=   1.0s
[CV] END max_depth=14, min_samples_leaf=5, min_samples_split=4; total time=   0.9s
[CV] END max_depth=14, 

- Best parameters found:  {'min_samples_split': 7, 'min_samples_leaf': 5, 'max_depth': 19}
- Best score:  0.9239847991801448

## 3. lightgbm

In [12]:
def tune_lightgbm(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    lgbm = lgb.LGBMClassifier(objective='multiclass', metric='multi_logloss')

    parameters = {
        'n_estimators': np.arange(50, 201, 70),
        'learning_rate': [0.01, 0.05],
        'max_depth': np.arange(5, 11),
        'num_leaves': np.arange(20, 60, 20),  
        'subsample': [0.7, 0.9],
        'colsample_bytree': [0.7, 0.9]
    }

    random_search = RandomizedSearchCV(lgbm, 
                                       param_distributions=parameters, 
                                       n_iter=20, 
                                       cv=3, 
                                       verbose=2, 
                                       n_jobs=-1, 
                                       scoring='f1_macro')
    random_search.fit(X_train, y_train)

    print("Best parameters found: ", random_search.best_params_)
    print("Best score: ", random_search.best_score_)

    best_model = random_search.best_estimator_
    predictions = best_model.predict(X_test)

    return best_model, predictions

In [13]:
lgbm_model, lgbm_pred = tune_lightgbm(X, y_encoded)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001748 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2711
[LightGBM] [Info] Number of data points in the train set: 51357, number of used features: 18
[LightGBM] [Info] Start training from score -1.746813
[LightGBM] [Info] Start training from score -1.208012
[LightGBM] [Info] Start training from score -1.246339
[LightGBM] [Info] Start training from score -1.980963
[LightGBM] [Info] Start training from score -2.569653
[LightGBM] [Info] Start training from score -3.886209
[LightGBM] [Info] Start training from score -5.448394
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.064333 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2711
[LightGBM] [

KeyboardInterrupt: 

## 3. Catboost

In [14]:
def tune_catboost(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    catboost = cb.CatBoostClassifier(loss_function='MultiClass', verbose=False)

    parameters = {
        'iterations': np.arange(50, 201, 50),
        'learning_rate': [0.01, 0.05, 0.1],
        'depth': np.arange(4, 10),
        'l2_leaf_reg': [1, 3, 5, 7, 9],
        'border_count': [32, 64, 128, 254],
        'bootstrap_type': ['Bayesian', 'Bernoulli', 'MVS']
    }
    
    # 조건부 하이퍼파라미터를 위한 특정 설정
    # bootstrap_type이 'Bayesian'일 때, bagging_temperature를 추가 (https://docs.aws.amazon.com/ko_kr/sagemaker/latest/dg/catboost-hyperparameters.html)
    if 'Bayesian' in parameters['bootstrap_type']:
        parameters['bagging_temperature'] = np.linspace(0, 1, 10)

    random_search = RandomizedSearchCV(catboost, 
                                       param_distributions=parameters, 
                                       n_iter=20, 
                                       cv=3, 
                                       verbose=2, 
                                       n_jobs=-1, 
                                       scoring='f1_macro')
    random_search.fit(X_train, y_train)

    print("Best parameters found: ", random_search.best_params_)
    print("Best score: ", random_search.best_score_)

    best_model = random_search.best_estimator_
    predictions = best_model.predict(X_test)

    return best_model, predictions

In [15]:
cat_model, cat_pred = tune_catboost(X, y_encoded)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END bagging_temperature=0.4444444444444444, bootstrap_type=Bernoulli, border_count=64, depth=7, iterations=200, l2_leaf_reg=5, learning_rate=0.01; total time=   0.0s
[CV] END bagging_temperature=0.4444444444444444, bootstrap_type=Bernoulli, border_count=64, depth=7, iterations=200, l2_leaf_reg=5, learning_rate=0.01; total time=   0.0s
[CV] END bagging_temperature=0.4444444444444444, bootstrap_type=Bernoulli, border_count=64, depth=7, iterations=200, l2_leaf_reg=5, learning_rate=0.01; total time=   0.0s
[CV] END bagging_temperature=0.0, bootstrap_type=Bernoulli, border_count=32, depth=4, iterations=50, l2_leaf_reg=9, learning_rate=0.1; total time=   0.0s
[CV] END bagging_temperature=0.0, bootstrap_type=Bernoulli, border_count=32, depth=4, iterations=50, l2_leaf_reg=9, learning_rate=0.1; total time=   0.0s
[CV] END bagging_temperature=0.0, bootstrap_type=Bernoulli, border_count=32, depth=4, iterations=50, l2_leaf_reg=9, le

24 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "/hdd_data1/annmunju/anaconda3/envs/ml/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/hdd_data1/annmunju/anaconda3/envs/ml/lib/python3.9/site-packages/catboost/core.py", line 5131, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline, use_best_model,
  File "/hdd_data1/annmunju/anaconda3/envs/ml/lib/python3.9/site-packages/catboost/core.py", line 2341, in _fit
    train_params = self._prepa

Best parameters found:  {'learning_rate': 0.1, 'l2_leaf_reg': 5, 'iterations': 200, 'depth': 5, 'border_count': 64, 'bootstrap_type': 'MVS', 'bagging_temperature': 0.4444444444444444}
Best score:  0.7953778885200116


- Best parameters found:  {'learning_rate': 0.1, 'l2_leaf_reg': 5, 'iterations': 200, 'depth': 5, 'border_count': 64, 'bootstrap_type': 'MVS', 'bagging_temperature': 0.4444444444444444}
- Best score:  0.7953778885200116

## 3. Xgboost

In [16]:
def tune_xgboost(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    xgbc = xgb.XGBClassifier() 

    parameters = {
        'n_estimators': np.arange(50, 201, 50),
        'learning_rate': [0.01, 0.05],  
        'max_depth': np.arange(5, 11),   
        'subsample': [0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.7, 0.8, 0.9, 1.0]
    }

    random_search = RandomizedSearchCV(xgbc, 
                                       param_distributions=parameters, 
                                       n_iter=20, 
                                       cv=3, 
                                       verbose=2, 
                                       n_jobs=-1, 
                                       scoring='f1_macro')
    random_search.fit(X_train, y_train)

    print("Best parameters found: ", random_search.best_params_)
    print("Best score: ", random_search.best_score_)

    best_model = random_search.best_estimator_
    predictions = best_model.predict(X_test)

    return best_model, predictions

In [30]:
X, test_df

((96294, 18), (64197, 19))

In [17]:
xgb_model, xgb_pred = tune_xgboost(X, y_encoded)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=7, n_estimators=50, subsample=0.7; total time=  35.6s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=7, n_estimators=50, subsample=0.7; total time=  36.2s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=7, n_estimators=50, subsample=0.7; total time=  36.7s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=7, n_estimators=50, subsample=1.0; total time=  39.4s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=7, n_estimators=50, subsample=1.0; total time=  38.9s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=7, n_estimators=50, subsample=1.0; total time=  39.6s
[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=6, n_estimators=50, subsample=0.7; total time=  41.5s
[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=6, n_estimators=50, subsample=0.7; total time=  41.9s
[CV] END colsample_

- Best parameters found:  {'subsample': 1.0, 'n_estimators': 100, 'max_depth': 10, 'learning_rate': 0.05, 'colsample_bytree': 0.7}
- Best score:  0.9401824838437207

## 최종 모델 선택 후 제출

In [15]:
def save_inference_df(model, test_df=test_df, save_fname='result/result.csv'):
    best_pred_result = model.predict(test_df)
    pd.DataFrame(index=test_df.index,columns=['대출등급'], data=le.inverse_transform(best_pred_result)).to_csv(save_fname)

In [None]:
save_inference_df(xgb_model)

![](result/dacon_result.png)

In [16]:
save_inference_df(dt_model, save_fname='result/second_model_result.csv')

![](result/second_result.png)