In [79]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb

In [63]:
main_df = pd.read_excel('./main_df_dropna_completed.xlsx',index_col=0)

In [64]:
main_df.drop(['K2_부실판단'],axis=1,inplace=True)

In [65]:
onehot_features = ['산업위험_경기민감도']
binary_features = ['주거래은행'] 

In [66]:
transformers = []

In [67]:
if onehot_features:
    transformers.append(('onehot', OneHotEncoder(handle_unknown='ignore'), onehot_features))


In [68]:
if binary_features:
    binary_transformer = Pipeline(steps=[
        ('binary_onehot', OneHotEncoder(sparse=False,handle_unknown='ignore'))  # 원-핫 인코딩 대신 바이너리 인코딩으로 변경
    ])
    transformers.append(('binary', binary_transformer, binary_features))



In [69]:
numeric_features = ['유동비율(%)', '당좌비율(%)', '비유동비율(%)', '부채비율(%)',
       '유동부채비율(%)', '차입금의존도(%)', '이익잉여금비율(%)', '순운전자본비율(%)', '비유동장기적합률(%)',
       '이자보상배율(이자비용)(배)', '외화포지션(배)', '총자본순이익률(%)', '자기자본순이익률(%)',
       '경영자본순이익률(%)', '매출액순이익률(%)', '총자본정상영업이익률(%)', '자기자본정상영업이익률(%)',
       '매출액정상영업이익률(%)', '금융비용부담률(%)', '총자산회전율(배)', '매출채권회전률(배)', '당좌자산회전률(배)',
       '재고자산회전률(배)', '유형자산회전율(배)', '매입채무회전률(배)', '자기자본회전률(배)', '경영자본회전률(배)',
       '유동자산증가율(%)', '매출액증가율(%)', '정상영업이익증가율(%)', '순이익증가율(%)', 'spread(%)',
       'PPI(2015기준)', '실질GDP성장률(%)', 'EV/EBITDA(배)', '52주베타(배)', 'PER', 'PBR',
       'PCR', 'CASH FLOW 대 부채비율(%)', '총자본투자효율(%)', 'log_평균총자산', 'K2_score']
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

In [70]:
transformers.append(('numeric_scaled', numeric_transformer, numeric_features))
preprocessor = ColumnTransformer(transformers=transformers)

In [71]:
df_train = main_df.loc[main_df['회계년도'] <= 2019]
df_test = main_df.loc[main_df['회계년도'] > 2019]

In [72]:
X_train = df_train.drop(['Target','회사명', '거래소코드', '회계년도'], axis=1)
y_train = df_train['Target']

X_test = df_test.drop(['Target','회사명', '거래소코드', '회계년도'],axis=1)
y_test = df_test['Target']


In [73]:
X_train_transformed = preprocessor.fit_transform(X_train)

In [74]:
X_test_transformed = preprocessor.transform(X_test)

In [80]:
model_dt = DecisionTreeClassifier()
model_rf = RandomForestClassifier()
model_lgb = lgb.LGBMClassifier()

In [81]:
pipeline_dt = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model_dt)
])

pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model_rf)
])

pipeline_lgb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model_lgb)
])

In [82]:
pipeline_dt.fit(X_train, y_train)
pipeline_rf.fit(X_train, y_train)
pipeline_lgb.fit(X_train, y_train)



[LightGBM] [Info] Number of positive: 672, number of negative: 6700
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10244
[LightGBM] [Info] Number of data points in the train set: 7372, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.091156 -> initscore=-2.299604
[LightGBM] [Info] Start training from score -2.299604


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['산업위험_경기민감도']),
                                                 ('binary',
                                                  Pipeline(steps=[('binary_onehot',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['주거래은행']),
                                                 ('numeric_scaled',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['유동비율(%)', '당좌비율(%)',
                                 

In [83]:
y_pred_dt = pipeline_dt.predict(X_test)
y_pred_rf = pipeline_rf.predict(X_test)
y_pred_lgb = pipeline_lgb.predict(X_test)

In [84]:
from sklearn.metrics import f1_score

# 각 모델별 성능 평가
f1_dt = f1_score(y_test, y_pred_dt)
f1_rf = f1_score(y_test, y_pred_rf)
f1_lgb = f1_score(y_test, y_pred_lgb)


print(f"Decision Tree 모델의 f1-score: {f1_dt}")
print(f"Random Forest 모델의 f1-score: {f1_rf}")
print(f"LightGBM 모델의 f1-score: {f1_lgb}")


Decision Tree 모델의 f1-score: 0.3290322580645161
Random Forest 모델의 f1-score: 0.4345991561181435
LightGBM 모델의 f1-score: 0.4921875
