<a href="https://colab.research.google.com/github/busiri/busil/blob/main/4%EB%8B%A8%EA%B3%84(Feature_selection).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [90]:
import warnings, sys, os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# pandas display: print all columns (no elision) and keep wide frames on a
# single line instead of wrapping them.
for _option, _value in (
    ("display.max_columns", None),
    ("display.expand_frame_repr", False),
):
    pd.set_option(_option, _value)

# matplotlib: "Malgun Gothic" is the default Korean font on Windows, and
# turning off axes.unicode_minus keeps the minus sign from rendering as a
# broken glyph.
# NOTE(review): this font is probably not installed on Colab/Linux — confirm.
plt.rcParams.update({
    "font.family": "Malgun Gothic",
    "axes.unicode_minus": False,
})

In [91]:
# Training CSV of each business life-cycle phase -> short phase label.
phase_files = {
    '도입기_train.csv': '도입',
    '성장기_train.csv': '성장',
    '성숙기_train.csv': '성숙',
    '쇠퇴기_train.csv': '쇠퇴',
}

# Load every CSV into a module-level DataFrame named df_<label>
# (df_도입, df_성장, df_성숙, df_쇠퇴); later cells refer to these names.
for _csv_name, _label in phase_files.items():
    globals()["df_" + _label] = pd.read_csv(_csv_name)

### 수치형 - RFE, t-test, VIF를 활용하여 선정

In [92]:
# Columns treated as categorical: every column of the 도입 frame (except the
# target '부실여부') that has fewer than 20 distinct values. The same column
# list is applied to all four phases.
obj_cols = [
    col
    for col in df_도입.columns
    if col != '부실여부' and len(df_도입[col].unique()) < 20
]

# For each phase: cast those columns to object dtype in place, then expose the
# object-dtype part as df_<phase>_obj and everything else as df_<phase>_num.
for _phase in ('도입', '성장', '성숙', '쇠퇴'):
    _frame = globals()[f"df_{_phase}"]
    _frame[obj_cols] = _frame[obj_cols].astype('object')
    globals()[f"df_{_phase}_obj"] = _frame.select_dtypes(include='object')
    globals()[f"df_{_phase}_num"] = _frame.select_dtypes(exclude='object')

### Balanced RF를 활용하여 Feature importance 상위 150개 피처 선정

### 도입기(나머지 세개의 국면에도 진행했음)

In [97]:
from imblearn.ensemble import BalancedRandomForestClassifier
def RFE(df: pd.DataFrame, top_k=150):
    """Keep the ``top_k`` most important features of a balanced random forest.

    Despite the name, this is not recursive feature elimination: a single
    ``BalancedRandomForestClassifier`` is fitted once on the standardized
    training split and the features are ranked by ``feature_importances_``.

    Args:
        df: Frame holding the target column '부실여부' plus the feature
            columns. The features go through ``StandardScaler``, so they
            are expected to be numeric.
        top_k: Maximum number of features to keep. When ``df`` has fewer
            feature columns than ``top_k``, all of them are kept.

    Returns:
        Tuple ``(X_train, X_valid, y_train, y_valid)``: an 80/20 split
        stratified on the target (``random_state=42``), with both X frames
        restricted to the selected features. The returned values are the
        original, unscaled ones.
    """
    # 0) Separate features/target, then make a stratified train/valid split.
    X = df.drop('부실여부', axis=1)
    y = df['부실여부']
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # 1) Standardize using training statistics only. The validation split is
    #    not scored here, so it is not transformed.
    X_scaled = StandardScaler().fit_transform(X_train)

    # 2) Fit the balanced random forest used for ranking.
    clf = BalancedRandomForestClassifier(
        n_estimators=200,
        random_state=42,
    )
    clf.fit(X_scaled, y_train)

    # 3) Importances as a Series indexed by column name, highest first.
    feat_imp = pd.Series(
        clf.feature_importances_,
        index=X_train.columns,
    ).sort_values(ascending=False)

    # 4) Keep the leading features; report how many were actually kept, which
    #    can be fewer than requested.
    top_imp = feat_imp.head(top_k)
    topK = top_imp.index.tolist()
    print(f"Top {len(topK)} features (requested {top_k}):\n", top_imp)
    return X_train[topK], X_valid[topK], y_train, y_valid

In [98]:
# Numeric frame of every phase, keyed by the phase label.
stages = {
    '도입': df_도입_num,
    '성장': df_성장_num,
    '성숙': df_성숙_num,
    '쇠퇴': df_쇠퇴_num,
}

# Run the importance-based selection per phase and publish the four returned
# pieces as <phase>_X_train, <phase>_X_valid, <phase>_y_train, <phase>_y_valid.
for stage, df in stages.items():
    _split = RFE(df)
    for _suffix, _part in zip(("X_train", "X_valid", "y_train", "y_valid"), _split):
        globals()[f"{stage}_{_suffix}"] = _part

Top 150 features:
 경영자본영업이익률         0.041440
/ 이익잉여금/총자산 비율    0.037133
총자본영업이익률          0.033449
/ 투하자본수익률         0.032626
자기자본영업이익률         0.029163
                    ...   
부가가치율             0.003484
재고자산회전률           0.003396
노동소득분배율           0.003211
매출채권비율            0.003140
자본분배율             0.002551
Length: 117, dtype: float64
Top 150 features:
 경영자본영업이익률        0.083722
총자본영업이익률         0.069442
/ 투하자본수익률        0.055510
총자본사업이익률         0.048667
자기자본영업이익률        0.041681
                   ...   
재고자산(천원)         0.001886
재고자산회전률          0.001858
매출채권 대 상,제품비율    0.001849
재고자산회전기간         0.001727
1회전기간            0.001421
Length: 127, dtype: float64
Top 150 features:
 경영자본영업이익률    0.075334
총자본영업이익률     0.074375
/ 투하자본수익률    0.070901
총자본순이익률      0.051831
총자본사업이익률     0.042715
               ...   
재고자산회전률      0.001685
감가상각률        0.001674
유동자산회전률      0.001672
상품,제품회전률     0.001602
1회전기간        0.000885
Length: 126, dtype: float64
Top 150 features:
 총자본영업이익률      0

### t-test를 활용하여 정상집단과 부실집단 간 평균 차이를 보이는 컬럼 선정(유의수준 : 0.05)

In [99]:
# t-test
import pandas as pd
from scipy.stats import ttest_ind
from scipy.stats import levene

def ttest_with_levene(df, alpha=0.05):
    """Keep the columns whose mean differs between the two target classes.

    For every column other than '부실여부', rows are split into the
    ``부실여부 == 0`` and ``부실여부 == 1`` groups (NaN values dropped per
    column). Levene's test decides whether the two-sample t-test assumes
    equal variances (``levene_p >= alpha``) or not. A column is kept when
    its t-test p-value is below ``alpha``.

    Args:
        df: Frame containing the target column '부실여부' and the columns to
            test.
        alpha: Significance level used both for Levene's test and for the
            final t-test filter.

    Returns:
        ``df`` restricted to the significant columns, ordered by ascending
        p-value, followed by '부실여부'. Columns with fewer than two
        observations in either group, or with an undefined p-value, are
        never selected.

    Raises:
        KeyError: If '부실여부' is not a column of ``df``.
    """
    target = '부실여부'
    result_columns = [
        '컬럼', '등분산성 p값', '등분산성 통과 여부',
        't통계량', 't검정 p값', 'n1', 'n2',
    ]

    group0 = df[df[target] == 0]
    group1 = df[df[target] == 1]

    results = []
    for col in (c for c in df.columns if c != target):
        data1 = group0[col].dropna()
        data2 = group1[col].dropna()

        if len(data1) > 1 and len(data2) > 1:
            # Equal-variance check (Levene's test) selects the t-test variant.
            _, levene_p = levene(data1, data2)
            equal_var = bool(levene_p >= alpha)

            t_stat, pval = ttest_ind(data1, data2, equal_var=equal_var)

            results.append({
                '컬럼': col,
                '등분산성 p값': levene_p,
                '등분산성 통과 여부': '통과' if equal_var else '불통과',
                't통계량': t_stat,
                't검정 p값': pval,
                'n1': len(data1),
                'n2': len(data2),
            })
        else:
            # Not enough observations in at least one group to run the tests.
            results.append({
                '컬럼': col,
                '등분산성 p값': None,
                '등분산성 통과 여부': '데이터 부족',
                't통계량': None,
                't검정 p값': None,
                'n1': len(data1),
                'n2': len(data2),
            })

    # Explicit columns keep the frame well-formed even when nothing was tested.
    result_df = pd.DataFrame(results, columns=result_columns)
    result_df['t검정 p값'] = result_df['t검정 p값'].astype(float)
    result_top = result_df.dropna(subset=['t검정 p값']).sort_values(by='t검정 p값')

    # Filter with the caller's alpha rather than a hard-coded 0.05.
    col_list = result_top.loc[result_top['t검정 p값'] < alpha, '컬럼'].tolist()
    col_list.append(target)
    return df[col_list]

# Re-attach the target to the selected training features of each phase,
# publishing the result as <phase>_RFE.
for _stage in ('도입', '성장', '성숙', '쇠퇴'):
    globals()[f"{_stage}_RFE"] = pd.concat(
        [globals()[f"{_stage}_X_train"], globals()[f"{_stage}_y_train"]],
        axis=1,
    )


# The t-test could also be applied to the initial frames; here it is run on
# the frames produced by the importance-based selection.
phases = {
    '도입': 도입_RFE,
    '성장': 성장_RFE,
    '성숙': 성숙_RFE,
    '쇠퇴': 쇠퇴_RFE,
}

# Store the t-test-filtered frame of each phase as ttest_<phase>.
for phase, df in phases.items():
    globals()["ttest_" + phase] = ttest_with_levene(df)

In [100]:
# Drop every row that has a missing value in any remaining column, for each
# phase's t-test-filtered frame.
for _phase in ('도입', '성장', '성숙', '쇠퇴'):
    _name = f"ttest_{_phase}"
    globals()[_name] = globals()[_name].dropna(axis=0, how='any')

### 다중공선성 제거를 위해 모든 컬럼의 VIF가 10 이하가 될 때까지 가장 높은 VIF를 갖는 컬럼 제거

In [101]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd
import warnings

def calculate_vif(df, target_col):
    """Return the variance inflation factor of every numeric feature column.

    ``target_col`` is excluded if present. Only numeric columns are used, and
    columns that are entirely NaN are dropped (columns with only some NaN
    values are kept). RuntimeWarnings raised during the computation are
    suppressed.

    NOTE(review): no intercept column is added before calling
    ``variance_inflation_factor`` — confirm this is the intended VIF variant.

    Args:
        df: Input frame.
        target_col: Name of the target column to leave out.

    Returns:
        DataFrame with a "feature" column and a "VIF" column, one row per
        evaluated feature, in the column order of ``df``.
    """
    features = (
        df.drop(columns=[target_col], errors='ignore')
        .select_dtypes(include=["number"])
        .dropna(axis=1, how='all')
    )
    matrix = features.values

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        vif_values = [
            variance_inflation_factor(matrix, position)
            for position, _ in enumerate(features.columns)
        ]

    return pd.DataFrame({"feature": features.columns, "VIF": vif_values})

def vif_removal_for_phase(df, target_col='부실여부', threshold=10, label="국면"):
    """Iteratively drop the highest-VIF column until all VIFs are <= threshold.

    On each step the VIF table is recomputed with ``calculate_vif`` and the
    column with the largest VIF is removed, until the largest VIF is at most
    ``threshold``. The loop also stops when no column is left to evaluate or
    when no VIF can be computed (all NaN). The surviving frame, with
    duplicated column names removed, is written to ``'<label>_VIF.csv'``.

    Args:
        df: Input frame; it is copied, not modified.
        target_col: Target column, excluded from the VIF computation but kept
            in the output.
        threshold: Largest VIF allowed to remain.
        label: Prefix for log messages and for the output file name. Pass a
            distinct label per data set, otherwise the files overwrite each
            other.

    Returns:
        Tuple ``(df_final, vif_df)``: the reduced frame and the VIF table
        computed on the last iteration.
    """
    df_vif = df.copy()

    # Repeated VIF-based removal.
    step = 0
    while True:
        vif_df = calculate_vif(df_vif, target_col)
        if vif_df.empty:
            # Nothing left to rank; indexing the first row below would fail.
            print(f"[{label}] VIF를 계산할 수치형 컬럼이 없습니다. 반복 종료.")
            break

        max_vif = vif_df["VIF"].max()
        if pd.isna(max_vif):
            # Every VIF is NaN, so there is no meaningful column to drop.
            print(f"[{label}] VIF를 계산할 수 없습니다(NaN). 반복 종료.")
            break

        max_vif_feature = vif_df.sort_values(
            by="VIF", ascending=False
        ).iloc[0]["feature"]
        print(f"[{label}] Step {step}: 최대 VIF = {max_vif:.3f} ({max_vif_feature})")
        if max_vif <= threshold:
            print(f"[{label}] 모든 VIF가 {threshold} 이하입니다. 반복 종료.")
            break
        print(
            f"[{label}] '{max_vif_feature}' 컬럼(VIF={max_vif:.3f})을 제거합니다."
        )
        df_vif = df_vif.drop(columns=[max_vif_feature])
        step += 1

    # Keep the first occurrence of any duplicated column name. The mask is
    # built from df_vif; the original read df_final before assigning it.
    df_final = df_vif.loc[:, ~df_vif.columns.duplicated()]

    # Persist the result.
    save_name = f'{label}_VIF.csv'
    df_final.to_csv(save_name, index=False)
    print(f"[{label}] 최종 파일 저장 완료: {save_name}")

    return df_final, vif_df

In [105]:
# t-test-filtered frame of every phase, keyed by the phase label.
phases = {'도입' : ttest_도입,
          '성장' : ttest_성장,
          '성숙' : ttest_성숙,
          '쇠퇴' : ttest_쇠퇴}

for phase, df in phases.items():
    # Pass the phase as the label so each phase gets its own log prefix and
    # its own '<phase>_VIF.csv'; with the default label every phase would
    # write to the same file.
    _reduced_df, _vif_table = vif_removal_for_phase(df, label=phase)
    # VIF_<phase> holds the reduced DataFrame (later cells read its .columns);
    # the final VIF table is kept separately as VIF_table_<phase>.
    globals()[f"VIF_{phase}"] = _reduced_df
    globals()[f"VIF_table_{phase}"] = _vif_table

KeyboardInterrupt: 

### 범주형 - 카이제곱 검정을 활용하여 선정

In [102]:
# Rebuild each categorical frame from its phase frame so that it also carries
# the target column '부실여부' as the last column.
obj_cols = [*df_도입_obj.columns, '부실여부']
df_도입_obj = df_도입[obj_cols]

obj_cols2 = [*df_성장_obj.columns, '부실여부']
df_성장_obj = df_성장[obj_cols2]

obj_cols3 = [*df_성숙_obj.columns, '부실여부']
df_성숙_obj = df_성숙[obj_cols3]

obj_cols4 = [*df_쇠퇴_obj.columns, '부실여부']
df_쇠퇴_obj = df_쇠퇴[obj_cols4]

In [87]:
# 회사명과 회계년도를 제외하고 각 범주형 데이터에 대해서 카이제곱검정 진행
from scipy.stats import chi2_contingency

def categorical_selection(df):
    """Keep the categorical columns associated with the target.

    The first two columns and the last column of ``df`` are skipped (the
    notebook passes company name, fiscal year, ..., '부실여부'). Every other
    column is cross-tabulated against '부실여부' and kept when the chi-square
    test of independence gives p < 0.05.

    Returns:
        ``df`` restricted to the selected columns, in their original order.
    """
    target = df['부실여부']
    candidates = list(df.columns)[2:-1]
    selected = [
        name
        for name in candidates
        if chi2_contingency(pd.crosstab(df[name], target))[1] < 0.05
    ]
    return df[selected]

# Replace each phase's categorical frame with its chi-square-selected subset.
for _phase in ('도입', '성장', '성숙', '쇠퇴'):
    _name = f"df_{_phase}_obj"
    globals()[_name] = categorical_selection(globals()[_name])

In [104]:
def _selected_columns(vif_result, obj_df):
    """Return the numeric (VIF) column names followed by the categorical ones.

    ``vif_result`` may be the reduced DataFrame itself or the
    ``(DataFrame, vif_table)`` tuple returned by ``vif_removal_for_phase``.
    Names already present are not added twice.
    """
    numeric_df = vif_result[0] if isinstance(vif_result, tuple) else vif_result
    columns = numeric_df.columns.tolist()
    # list.append returns None and would add the whole Index as one element,
    # so the list is extended in place with the individual names instead.
    columns.extend(col for col in obj_df.columns if col not in columns)
    return columns


도입_selected_col = _selected_columns(VIF_도입, df_도입_obj)
성장_selected_col = _selected_columns(VIF_성장, df_성장_obj)
성숙_selected_col = _selected_columns(VIF_성숙, df_성숙_obj)
쇠퇴_selected_col = _selected_columns(VIF_쇠퇴, df_쇠퇴_obj)

# Final data sets: all rows of each phase frame, restricted to the selected
# numeric and categorical columns.
df_final1 = df_도입[도입_selected_col]
df_final2 = df_성장[성장_selected_col]
df_final3 = df_성숙[성숙_selected_col]
df_final4 = df_쇠퇴[쇠퇴_selected_col]

NameError: name 'VIF_도입' is not defined

In [None]:
# Export the feature-selected frame of each phase to CSV (index omitted).
for _path, _frame in (
    ('도입기_feature_selected.csv', df_final1),
    ('성장기_feature_selected.csv', df_final2),
    ('성숙기_feature_selected.csv', df_final3),
    ('쇠퇴기_feature_selected.csv', df_final4),
):
    _frame.to_csv(_path, index=False)