In [None]:
import sys
import os
sys.path.append(os.path.abspath('..'))  # Add project root to Python path
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from src.utils import *
from src.features import *
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from lightgbm import LGBMClassifier
from sklearn.pipeline import make_pipeline

In [None]:
# Load both splits once; the *_df frames stay as loaded, while train / test
# are independent working copies without the `id` column.
train_df = load_train_data()
test_df = load_test_data()

train = train_df.copy().drop(columns='id')
test = test_df.copy().drop(columns='id')

In [None]:
# Lift the column cap so wide frames are rendered in full, then preview the data.
pd.options.display.max_columns = None
train.head(n=5)

In [None]:
def resumetable2(df, target_col, missing_value=-1, ignore_cols=None, verbose=True):
    """Build a one-row-per-column summary of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    target_col : hashable
        Name of the target column. Only the column whose name equals this
        value is labelled ``'Target'``.
    missing_value : scalar, default -1
        Sentinel counted as "missing". When it is ``None`` or NaN, real nulls
        are counted instead (a null never compares equal to anything).
    ignore_cols : str or iterable, optional
        Column name(s) to leave out of the summary. Names that are not in
        ``df`` are silently skipped.
    verbose : bool, default True
        Print the shape of ``df`` before summarising.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with columns ``'Data Type'``, ``'Missing'``,
        ``'Nunique'`` and ``'Feature Type'`` (one of ``'Target'``,
        ``'Binary'``, ``'Categorical'``, ``'Temp'``), sorted by
        ``'Feature Type'``.
    """
    if ignore_cols is None:
        ignore_cols = []
    elif isinstance(ignore_cols, str):
        # A bare string would otherwise be iterated character by character.
        ignore_cols = [ignore_cols]
    ignored = set(ignore_cols)

    if verbose:
        print(f'Data shape: {df.shape}')

    # Only slice when something is actually excluded, to avoid a needless copy.
    if ignored:
        frame = df.loc[:, [col for col in df.columns if col not in ignored]]
    else:
        frame = df

    if pd.api.types.is_scalar(missing_value) and pd.isna(missing_value):
        missing_counts = frame.isna().sum()
    else:
        missing_counts = (frame == missing_value).sum()

    def _feature_type(col):
        """Classify a column by its name; the target is matched exactly."""
        if col == target_col:
            return 'Target'
        name = str(col)
        if 'bin' in name:
            return 'Binary'
        if 'cat' in name:
            return 'Categorical'
        return 'Temp'

    summary = frame.dtypes.to_frame(name='Data Type')
    summary['Missing'] = missing_counts.values
    summary['Nunique'] = frame.nunique().values
    summary['Feature Type'] = [_feature_type(col) for col in frame.columns]

    # Stable sort: columns keep their original relative order within a type.
    return summary.sort_values(by='Feature Type', kind='stable')

In [None]:
# Per-column summary of the training frame: dtype, sentinel count, cardinality, feature type.
feature_table = resumetable2(df=train, target_col='target')

In [None]:
# Partition features using the summary table: explicitly typed categorical and
# binary columns, plus untyped ('Temp') columns with fewer than 30 distinct
# values, are treated as categorical; the remaining untyped columns are numeric.
feature_kind = feature_table['Feature Type']
is_untyped = feature_kind == 'Temp'
is_low_cardinality = feature_table['Nunique'] < 30

cat_cols = [
    *feature_table.index[feature_kind == 'Categorical'],
    *feature_table.index[feature_kind == 'Binary'],
    *feature_table.index[is_low_cardinality & is_untyped],
]
num_cols = feature_table.index[(feature_table['Nunique'] >= 30) & is_untyped].tolist()

In [None]:
# Number of rows per target class.
train.loc[:, 'target'].value_counts()

1. value_counts: stratified kfold 사용 필요, 타겟값 불균형
2. target 데이터부터 시각화 시작

In [None]:
# Bar chart of the row count in each target class.
fig, ax = plt.subplots()
sns.countplot(data=train, x='target', ax=ax)
ax.set_title('Target Distribution')
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

def gini_normalized(y_true, y_pred):
    """Return the normalized Gini score, computed as ``2 * ROC-AUC - 1``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, passed straight to ``roc_auc_score``.
    y_pred : array-like
        Predicted scores, passed straight to ``roc_auc_score``.

    Returns
    -------
    float
        ``2 * roc_auc_score(y_true, y_pred) - 1``.
    """
    auc = roc_auc_score(y_true, y_pred)
    return 2 * auc - 1

In [None]:
# Baseline model: feature generator + default LightGBM chained in one pipeline.

# Separate the label from the features.
y = train['target']
X = train.drop(columns='target')

# Feature-generation step and classifier.
interaction_fe = InteractionFeatureGenerator()
model = LGBMClassifier(random_state=42)

# Both steps are fitted together on each training fold.
pipeline = make_pipeline(interaction_fe, model)

# Stratified K-fold cross-validation, scored with the normalized Gini.
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
cv_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline.fit(X_train, y_train)

    # Column 1 of predict_proba holds the class-1 probability.
    y_pred_proba = pipeline.predict_proba(X_val)[:, 1]
    score = gini_normalized(y_val.values, y_pred_proba)
    cv_scores.append(score)

    print(f'Fold {fold+1} Gini Normalized: {score: .5f}')

# Summary over all folds.
mean_cv_score = np.mean(cv_scores)
std_cv_score = np.std(cv_scores)

print(f'Baseline CV Score: {mean_cv_score: .5f} +/- {std_cv_score: .5f}')

### model.predict_proba()
이진분류에서 predict_proba()는 2차원 배열을 반환 <br>
y_pred_proba = model.predict_proba(X_val)<br>
print(y_pred_proba.shape)  # (n_samples, 2)<br>
print(y_pred_proba[:3])    # 처음 3개 샘플 예시<br>

출력 예시:

[[0.8, 0.2],   첫 번째 샘플: 80% 확률로 class 0, 20% 확률로 class 1 <br>
 [0.3, 0.7],   두 번째 샘플: 30% 확률로 class 0, 70% 확률로 class 1 <br>
 [0.9, 0.1]]   세 번째 샘플: 90% 확률로 class 0, 10% 확률로 class 1 <br>
 

[:, 1]의 의미:<br>
첫 번째 차원(행): 모든 샘플 선택 (:)<br>
두 번째 차원(열): index 1 선택 (1) = class 1의 확률<br>
y_pred_proba[:, 1]  # class 1 확률 (사고 날 확률) <- 우리가 원하는 값

In [None]:
# param_sets = [
#     # Baseline (현재)
#     {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': -1},
    
#     # Set 1: 더 많은 트리, 낮은 학습률
#     {'n_estimators': 1000, 'learning_rate': 0.05, 'max_depth': 7, 'num_leaves': 64},
    
#     # Set 2: 더 깊은 트리
#     {'n_estimators': 1500, 'learning_rate': 0.03, 'max_depth': 8, 'num_leaves': 100},
    
#     # Set 3: 정규화 강화
#     {'n_estimators': 1000, 'learning_rate': 0.05, 'max_depth': 6, 'num_leaves': 50, 
#      'min_child_samples': 100, 'subsample': 0.8, 'colsample_bytree': 0.8}
# ]

# best_score = 0
# best_params = None

# for i, params in enumerate(param_sets):
#     print(f"\n=== Testing Parameter Set {i+1} ===")
#     print(f"Params: {params}")
    
#     # 모델 생성
#     model = LGBMClassifier(random_state=42, **params)
    
#     # 파이프라인 생성
#     pipeline = make_pipeline(interaction_fe, model)
    
#     # CV 실행
#     cv_scores = []
#     for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
#         X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
#         X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        
#         pipeline.fit(X_train, y_train)
#         y_pred_proba = pipeline.predict_proba(X_val)[:, 1]
#         score = gini_normalized(y_val.values, y_pred_proba)
#         cv_scores.append(score)
    
#     # 결과 출력
#     mean_score = np.mean(cv_scores)
#     std_score = np.std(cv_scores)
#     print(f"CV Score: {mean_score:.5f} +/- {std_score:.5f}")
    
#     # 최고 점수 업데이트
#     if mean_score > best_score:
#         best_score = mean_score
#         best_params = params
        
# print(f"\n=== BEST RESULT ===")
# print(f"Best Score: {best_score:.5f}")
# print(f"Best Params: {best_params}")


### Feature Engineering 

In [None]:
# Absolute correlation of every column with the target, strongest first.
# corrwith() correlates each column against the target directly, instead of
# building the full column-by-column matrix only to keep a single column of it.
corr_with_target = (
    train.corrwith(train['target'])
    .rename('target')  # keep the series name that train.corr()['target'] carried
    .abs()             # only the strength of the relationship matters, not its sign
    .sort_values(ascending=False)
)
print(corr_with_target.head(15))

In [None]:
# 원본으로 초기화
# Start again from the untouched training frame.
train_data = train.copy()

## Feature-engineering trial-and-error loop: edit train_data above, re-run, compare scores.

# Separate the label from the features.
y = train_data['target']
X = train_data.drop(columns='target')

# Feature generator intentionally disabled for this experiment.
# interaction_fe = InteractionFeatureGenerator()

# Bare model, no pipeline: keeps the loop as short as possible so each
# feature-engineering idea can be validated right away.
model = LGBMClassifier(random_state=42)

# Stratified K-fold cross-validation, scored with the normalized Gini.
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
cv_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model.fit(X_train, y_train)

    # Column 1 of predict_proba holds the class-1 probability.
    y_pred_proba = model.predict_proba(X_val)[:, 1]
    score = gini_normalized(y_val.values, y_pred_proba)
    cv_scores.append(score)

    print(f'Fold {fold+1} Gini Normalized: {score: .5f}')

# Summary over all folds.
mean_cv_score = np.mean(cv_scores)
std_cv_score = np.std(cv_scores)

print(f'Feature Engineered CV Score: {mean_cv_score: .5f} +/- {std_cv_score: .5f}')

### 분석
1. ps_ind_xx_bin: 거의 대부분의 feature가 의미 적음 


In [None]:
# X_train / y_train are whatever the preceding CV loop left behind (its last
# fold when the cells are run in order). Refit on them, then read the
# gain-based importance of every feature from the underlying booster.
model.fit(X_train, y_train)
gain_importance = model.booster_.feature_importance(importance_type='gain')

importance_records = {'feature': X_train.columns, 'importance': gain_importance}
feature_importance_df = pd.DataFrame(importance_records)
feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)

In [None]:
# The ten rows with the largest gain importance (the frame is already sorted descending).
top_features = feature_importance_df.head(n=10)
top_features

In [None]:
# Pairwise correlations among the ten most important features.
top_features_names = feature_importance_df.head(10)['feature'].tolist()
correlation_matrix = train[top_features_names].corr()
correlation_matrix

In [None]:
# 타겟 변수와의 관계
corr_with_target = train.corr()['target'].abs().sort_values(ascending=False)
# .abs(): 절댓값 제시, 타겟 예측에는 절댓값이 중요 
print(corr_with_target.head(15))

## Insights
1. reg_03/02/01은 서로 상관관계가 높음. 그러나 reg_03은 importance가 굉장히 높은 반면 01/02는 그렇지 않음. 한편 target과의 corr 값은 02가 가장 높고 03은 그보다 낮으며, 01은 상위에 포함되지도 않음.
2. 



In [None]:
from itertools import combinations

# Sketch of a feature-screening loop: generate candidate features from the
# top-importance columns and keep those whose quick CV score clears a threshold.
#
# NOTE(review): the return value of generate_features() is discarded here, while
# the loop below iterates `new_features`, which is not assigned anywhere in the
# visible notebook -- presumably it was meant to hold that return value; confirm
# against src.features.
# NOTE(review): `threshold` is likewise not defined in the visible notebook, and
# `quick_cv_test` / `generate_features` are not defined here either (presumably
# they arrive through the star imports) -- verify before running this cell.
# NOTE(review): `combinations` is imported in this cell but unused in these lines.
generate_features(train, top_features_names)
good_features = []
for feat in new_features:
    # Keep a candidate only when its quick CV score is strictly above the threshold.
    if quick_cv_test(feat) > threshold:
        good_features.append(feat)