In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 99.2/99.2 MB 8.8 MB/s eta 0:00:00
Installing collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
# ============================================================
# Top 500 피처 모델 학습 및 평가
# ============================================================

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix, precision_score, recall_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Number of top-ranked features to keep. Every banner, message and output file
# name below is derived from this constant so they cannot drift out of sync.
N_FEATURES = 500

print("=" * 70)
print(f"Top {N_FEATURES} 피처 모델 학습 및 평가")
print("=" * 70)

# ============================================================
# 1. Select the top-N features
# ============================================================
print(f"\n{'='*70}")
print(f"Top {N_FEATURES} 피처 선정")
print(f"{'='*70}")

# Load the feature-importance table.
# NOTE(review): head() takes the first N rows as-is, so this assumes the CSV
# is already sorted by descending importance -- confirm against its producer.
importance_df = pd.read_csv('feature_importance_ensemble_cv.csv')
top_importance = importance_df.head(N_FEATURES)
selected_features = top_importance['feature'].tolist()

print(f"\n[선정된 피처]")
print(f"  총 피처: {len(selected_features)}개")

# Split the selection by feature family: fingerprint bits (by name prefix)
# versus the four named molecular descriptors.
FP_PREFIXES = ('ecfp_', 'fcfp_', 'ptfp_')
DESCRIPTOR_NAMES = ('MolWt', 'clogp', 'sa_score', 'qed')
fp_cols = [f for f in selected_features if f.startswith(FP_PREFIXES)]
desc_cols = [f for f in selected_features if f in DESCRIPTOR_NAMES]

# Count per fingerprint family without building throwaway lists.
ecfp_count = sum(f.startswith('ecfp_') for f in fp_cols)
fcfp_count = sum(f.startswith('fcfp_') for f in fp_cols)
ptfp_count = sum(f.startswith('ptfp_') for f in fp_cols)

print(f"  - Descriptor: {len(desc_cols)}개")
print(f"  - Fingerprint: {len(fp_cols)}개")
print(f"    · ECFP: {ecfp_count}개")
print(f"    · FCFP: {fcfp_count}개")
print(f"    · PTFP: {ptfp_count}개")

# A selected feature that is neither a fingerprint nor a known descriptor is
# not routed to any transformer, and the ColumnTransformer defined later in
# this cell uses remainder='drop'. Surface that instead of losing it silently.
unclassified = set(selected_features) - set(fp_cols) - set(desc_cols)
if unclassified:
    print(f"  ⚠️  분류되지 않은 피처 {len(unclassified)}개 (fp_cols/desc_cols 에 포함되지 않음)")

# Share of the total importance mass captured by the selected features.
cumsum_importance = top_importance['ensemble_mean'].sum()
total_importance = importance_df['ensemble_mean'].sum()
cumsum_pct = cumsum_importance / total_importance * 100

print(f"\n[누적 중요도]")
print(f"  Top {N_FEATURES} 중요도 합: {cumsum_importance:.2f}")
print(f"  전체 중요도 합: {total_importance:.2f}")
print(f"  누적 비율: {cumsum_pct:.2f}%")

# Persist the selected feature names for later reuse.
features_path = f'selected_features_top{N_FEATURES}.csv'
pd.DataFrame({'feature': selected_features}).to_csv(features_path, index=False)
print(f"\n✓ 피처 리스트 저장: {features_path}")

# ============================================================
# 2. Data preparation
# ============================================================
section_rule = '=' * 70
print("\n" + section_rule)
print("데이터 로드")
print(section_rule)

# Training set: keep only the selected feature columns; the label is cast to int.
df_train = pd.read_csv('train.csv')
y_train = df_train['label'].astype(int)
X_train = df_train[selected_features]

# Per-class row counts for the summary line.
n_class0 = int((y_train == 0).sum())
n_class1 = int((y_train == 1).sum())

print("\n[Train 데이터]")
print(f"  Shape: {X_train.shape}")
print(f"  Label 분포: Class 0 = {n_class0}, Class 1 = {n_class1}")

# Test data is optional: when it cannot be loaded the script continues with
# cross-validation only and skips prediction/submission.
# Only the expected failure modes are caught (a bare `except:` would also
# swallow KeyboardInterrupt and unrelated bugs), and the cause is reported.
try:
    df_test = pd.read_csv('predict_input.csv')
    X_test = df_test[selected_features]
except (OSError, KeyError) as exc:
    # OSError: file missing or unreadable; KeyError: feature columns absent.
    print(f"\n⚠️  Test 데이터 없음")
    print(f"  ({type(exc).__name__}: {exc})")
    test_available = False
else:
    print(f"\n[Test 데이터]")
    print(f"  Shape: {X_test.shape}")
    test_available = True

# Preprocessing pipeline.
# Fingerprint columns: missing values are filled with the constant 0.
fp_transformer = SimpleImputer(strategy='constant', fill_value=0)

# Descriptor columns: median imputation followed by standardization.
desc_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Columns not routed to either transformer are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('fp', fp_transformer, fp_cols),
        ('desc', desc_transformer, desc_cols),
    ],
    remainder='drop',
)

# ============================================================
# 3. 5-fold cross-validated training
# ============================================================
cv_rule = '=' * 70
print("\n" + cv_rule)
print("5-Fold Cross-Validation 학습")
print(cv_rule)

RANDOM_STATE = 42
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Result store: per model, the fitted fold models and an out-of-fold
# probability vector aligned with the rows of X_train; plus per-fold metrics.
results = {
    model_name: {'models': [], 'oof_probabilities': np.zeros(len(X_train))}
    for model_name in ('lgbm', 'xgb', 'catboost')
}
results['fold_details'] = []

# Train LightGBM, XGBoost and CatBoost on every stratified fold, record their
# out-of-fold (OOF) probabilities and fitted models in `results`, and report a
# weighted-ensemble score per fold.
for fold, (tr_idx, va_idx) in enumerate(skf.split(X_train, y_train), 1):
    print(f"\n{'─'*70}")
    print(f"📊 Fold {fold}/5")
    print(f"{'─'*70}")

    # Positional split of features and labels for this fold.
    X_tr, X_va = X_train.iloc[tr_idx], X_train.iloc[va_idx]
    y_tr, y_va = y_train.iloc[tr_idx], y_train.iloc[va_idx]

    # The preprocessor is fitted on the training part only and then applied
    # to the validation part. The same object is refit on every iteration.
    Xt_tr = preprocessor.fit_transform(X_tr)
    Xt_va = preprocessor.transform(X_va)

    print(f"  학습: {Xt_tr.shape}, 검증: {Xt_va.shape}")

    # LightGBM
    print(f"  [1/3] LightGBM...", end=' ')
    lgbm_model = LGBMClassifier(
        n_estimators=1000, learning_rate=0.03, max_depth=8,
        num_leaves=63, min_child_samples=30, subsample=0.8,
        colsample_bytree=0.8, reg_alpha=0.3, reg_lambda=0.3,
        class_weight={0: 1.5, 1: 1.0},
        random_state=RANDOM_STATE, n_jobs=-1, verbose=-1
    )
    # Early stopping is monitored on the validation fold, which is the same
    # fold scored below.
    # NOTE(review): the reported fold/OOF metrics may therefore be slightly
    # optimistic -- confirm this is acceptable.
    lgbm_model.fit(Xt_tr, y_tr, eval_set=[(Xt_va, y_va)],
                   callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)])

    # Positive-class probability for the validation rows, stored at their
    # original positions in the OOF vector.
    lgbm_proba = lgbm_model.predict_proba(Xt_va)[:, 1]
    results['lgbm']['models'].append(lgbm_model)
    results['lgbm']['oof_probabilities'][va_idx] = lgbm_proba

    # Per-model F1 is reported at a fixed 0.39 cut-off.
    print(f"F1: {f1_score(y_va, (lgbm_proba >= 0.39).astype(int)):.4f}, Iter: {lgbm_model.best_iteration_}")

    # XGBoost
    print(f"  [2/3] XGBoost...", end=' ')
    # NOTE(review): scale_pos_weight=0.67 looks like the counterpart of the
    # 1.5:1 class-0 weighting given to the other two models (1/1.5 ~ 0.67) --
    # confirm.
    xgb_model = XGBClassifier(
        n_estimators=1000, learning_rate=0.03, max_depth=7,
        min_child_weight=3, subsample=0.8, colsample_bytree=0.8,
        gamma=0.1, reg_alpha=0.3, reg_lambda=0.3,
        scale_pos_weight=0.67,
        random_state=RANDOM_STATE, n_jobs=-1,
        early_stopping_rounds=100, eval_metric='logloss', verbosity=0
    )
    xgb_model.fit(Xt_tr, y_tr, eval_set=[(Xt_va, y_va)], verbose=False)

    xgb_proba = xgb_model.predict_proba(Xt_va)[:, 1]
    results['xgb']['models'].append(xgb_model)
    results['xgb']['oof_probabilities'][va_idx] = xgb_proba

    print(f"F1: {f1_score(y_va, (xgb_proba >= 0.39).astype(int)):.4f}, Iter: {xgb_model.best_iteration}")

    # CatBoost
    print(f"  [3/3] CatBoost...", end=' ')
    cat_model = CatBoostClassifier(
        iterations=1000, learning_rate=0.03, depth=7,
        l2_leaf_reg=3, class_weights=[1.5, 1.0],
        random_seed=RANDOM_STATE, verbose=0,
        early_stopping_rounds=100
    )
    cat_model.fit(Xt_tr, y_tr, eval_set=(Xt_va, y_va), verbose=False)

    cat_proba = cat_model.predict_proba(Xt_va)[:, 1]
    results['catboost']['models'].append(cat_model)
    results['catboost']['oof_probabilities'][va_idx] = cat_proba

    print(f"F1: {f1_score(y_va, (cat_proba >= 0.39).astype(int)):.4f}, Iter: {cat_model.best_iteration_}")

    # Ensemble: fixed weights 20% LightGBM, 60% XGBoost, 20% CatBoost.
    ensemble_proba = 0.20 * lgbm_proba + 0.60 * xgb_proba + 0.20 * cat_proba

    # Adaptive Threshold: the cut-off depends on the distance from 0.5
    # (< 0.05 -> 0.42, < 0.10 -> 0.40, otherwise 0.39).
    confidence = np.abs(ensemble_proba - 0.5)
    adaptive_thresholds = np.where(confidence < 0.05, 0.42,
                                    np.where(confidence < 0.10, 0.40, 0.39))
    ensemble_pred = (ensemble_proba >= adaptive_thresholds).astype(int)

    ensemble_f1 = f1_score(y_va, ensemble_pred)
    ensemble_auc = roc_auc_score(y_va, ensemble_proba)

    # False-positive rate from the confusion matrix: FP / (FP + TN).
    cm = confusion_matrix(y_va, ensemble_pred)
    tn, fp, fn, tp = cm.ravel()
    fpr = fp / (fp + tn)

    print(f"\n  [Ensemble] F1: {ensemble_f1:.4f}, AUC: {ensemble_auc:.4f}, FPR: {fpr:.4f}")

    # Keep the per-fold ensemble metrics.
    results['fold_details'].append({
        'fold': fold,
        'ensemble_f1': ensemble_f1,
        'ensemble_auc': ensemble_auc,
        'fpr': fpr
    })

# ============================================================
# 4. Out-of-fold (OOF) performance evaluation
# ============================================================
oof_rule = '=' * 70
print("\n" + oof_rule)
print("OOF 성능 평가")
print(oof_rule)

# Weighted ensemble of the three OOF probability vectors (20/60/20).
lgbm_oof = results['lgbm']['oof_probabilities']
xgb_oof = results['xgb']['oof_probabilities']
cat_oof = results['catboost']['oof_probabilities']
ensemble_oof_proba = 0.20 * lgbm_oof + 0.60 * xgb_oof + 0.20 * cat_oof

# Confidence is the distance of the ensemble probability from 0.5; it drives
# the adaptive threshold applied next.
confidence_oof = np.abs(ensemble_oof_proba - 0.5)

def get_adaptive_threshold(confidence):
    """Return the decision threshold for one sample's confidence.

    Args:
        confidence: scalar distance of the predicted probability from 0.5.

    Returns:
        0.42 when confidence is below 0.05, 0.40 when it is below 0.10,
        and 0.39 otherwise.
    """
    # (exclusive upper bound, threshold) pairs, checked in ascending order.
    for upper_bound, threshold in ((0.05, 0.42), (0.10, 0.40)):
        if confidence < upper_bound:
            return threshold
    return 0.39

# Per-sample decision thresholds, computed in one vectorized pass. This is the
# same rule the fold loop applies with np.where (< 0.05 -> 0.42,
# < 0.10 -> 0.40, otherwise 0.39), replacing a Python-level loop over samples.
adaptive_thresholds_oof = np.where(
    confidence_oof < 0.05, 0.42,
    np.where(confidence_oof < 0.10, 0.40, 0.39),
)
predictions_oof = (ensemble_oof_proba >= adaptive_thresholds_oof).astype(int)

# Metrics on the out-of-fold predictions.
oof_f1 = f1_score(y_train, predictions_oof)
oof_auc = roc_auc_score(y_train, ensemble_oof_proba)
# labels=[0, 1] pins the matrix to 2x2 so the four-way unpack below cannot
# fail when one class is absent from the labels or predictions.
oof_cm = confusion_matrix(y_train, predictions_oof, labels=[0, 1])
tn, fp, fn, tp = oof_cm.ravel()
oof_fpr = fp / (fp + tn)  # false-positive rate
oof_precision = precision_score(y_train, predictions_oof)
oof_recall = recall_score(y_train, predictions_oof)

print(f"\n[Top 500 OOF 성능]")
print(f"  F1 Score:  {oof_f1:.4f}")
print(f"  AUC Score: {oof_auc:.4f}")
print(f"  Precision: {oof_precision:.4f}")
print(f"  Recall:    {oof_recall:.4f}")
print(f"  FPR:       {oof_fpr:.4f} ({oof_fpr*100:.2f}%)")

print(f"\n[OOF 혼동 행렬]")
print(f"  TN: {tn}, FP: {fp}, FN: {fn}, TP: {tp}")

# Low-confidence samples: ensemble probability within 0.1 of 0.5.
low_conf_mask = confidence_oof < 0.1
n_low_conf = low_conf_mask.sum()
# Compare plain arrays so the result does not depend on the index of y_train;
# the accuracy is reported as 0 when there are no low-confidence samples.
y_true_low_conf = y_train.to_numpy()[low_conf_mask]
low_conf_acc = (predictions_oof[low_conf_mask] == y_true_low_conf).mean() if n_low_conf > 0 else 0

print(f"\n[Low Confidence]")
print(f"  개수: {n_low_conf}개 ({n_low_conf/len(y_train)*100:.2f}%)")
print(f"  정확도: {low_conf_acc:.4f}")

# ============================================================
# 5. Comparison against the Top 300 model
# ============================================================
print(f"\n{'='*70}")
print("Top 300 vs Top 500 비교")
print(f"{'='*70}")

# Reference scores of the Top 300 model (hard-coded here as the baseline).
baseline_f1 = 0.8300
baseline_fpr = 0.2467

# Signed deltas: positive F1 delta is better, negative FPR delta is better.
f1_improvement = oof_f1 - baseline_f1
fpr_improvement = oof_fpr - baseline_fpr

# Format the FPR cells with their '%' sign BEFORE padding. Padding the number
# first and appending '%' afterwards left the sign detached from the value
# ("24.67       %") and pushed the following columns one character right.
baseline_fpr_text = f"{baseline_fpr*100:.2f}%"
oof_fpr_text = f"{oof_fpr*100:.2f}%"

print(f"\n[성능 비교]")
print(f"{'지표':<15} {'Top 300':<12} {'Top 500':<12} {'변화':<15}")
print(f"{'-'*55}")
print(f"{'F1 Score':<15} {baseline_f1:<12.4f} {oof_f1:<12.4f} {f1_improvement:+.4f} ({f1_improvement/baseline_f1*100:+.2f}%)")
print(f"{'FPR':<15} {baseline_fpr_text:<12} {oof_fpr_text:<12} {fpr_improvement*100:+.2f}%p")
print(f"{'피처 수':<15} {'300':<12} {'500':<12} {'+200'}")

# ============================================================
# 6. Test prediction (only when test data is available)
# ============================================================
if test_available:
    print(f"\n{'='*70}")
    print("Test 데이터 예측")
    print(f"{'='*70}")

    # Each fold model was trained on features produced by a preprocessor
    # fitted on that fold's training rows only. Transforming the test set with
    # a preprocessor refit on the full training set would scale the descriptor
    # columns differently from what each model saw during training.
    # Re-create the per-fold fits instead: with an integer random_state,
    # StratifiedKFold.split() yields the same splits on every call, so fold i
    # here matches fold i of the training loop.
    n_folds = skf.get_n_splits()
    fold_test_inputs = []
    for tr_idx, _ in skf.split(X_train, y_train):
        preprocessor.fit(X_train.iloc[tr_idx])
        fold_test_inputs.append(preprocessor.transform(X_test))

    print(f"\n  Test shape: {fold_test_inputs[0].shape}")

    # One column of positive-class probabilities per fold model.
    test_predictions = {
        'lgbm': np.zeros((len(X_test), n_folds)),
        'xgb': np.zeros((len(X_test), n_folds)),
        'catboost': np.zeros((len(X_test), n_folds))
    }

    for fold, Xt_test in enumerate(fold_test_inputs):
        print(f"\r  Fold {fold+1}/{n_folds} 예측 중...", end='')
        for model_name, fold_probas in test_predictions.items():
            fold_model = results[model_name]['models'][fold]
            fold_probas[:, fold] = fold_model.predict_proba(Xt_test)[:, 1]

    print(f"\r  ✓ {n_folds}-Fold 예측 완료")

    # Average the fold models of each algorithm.
    lgbm_proba_test = test_predictions['lgbm'].mean(axis=1)
    xgb_proba_test = test_predictions['xgb'].mean(axis=1)
    cat_proba_test = test_predictions['catboost'].mean(axis=1)

    # Weighted ensemble, same 20/60/20 weights as in cross-validation.
    ensemble_proba_test = 0.20 * lgbm_proba_test + 0.60 * xgb_proba_test + 0.20 * cat_proba_test

    # Apply the adaptive threshold to each test sample.
    confidence_test = np.abs(ensemble_proba_test - 0.5)
    adaptive_thresholds_test = np.array([get_adaptive_threshold(c) for c in confidence_test])
    predictions_test = (ensemble_proba_test >= adaptive_thresholds_test).astype(int)

    print(f"\n[Test 예측 결과]")
    print(f"  예측 Class 0: {sum(predictions_test == 0)}개")
    print(f"  예측 Class 1: {sum(predictions_test == 1)}개")
    print(f"  평균 Confidence: {confidence_test.mean():.4f}")
    print(f"  Low Confidence (<0.1): {sum(confidence_test < 0.1)}개")

    # ============================================================
    # 7. Submission files
    # ============================================================
    print(f"\n{'='*70}")
    print("Submission 파일 생성")
    print(f"{'='*70}")

    # Locate the SMILES column; fall back to the first column of the test file.
    if 'SMILES' in df_test.columns:
        smiles_col = 'SMILES'
    elif 'smiles' in df_test.columns:
        smiles_col = 'smiles'
    else:
        smiles_col = df_test.columns[0]

    # Basic submission: identifier plus predicted class.
    submission = pd.DataFrame({
        'SMILES': df_test[smiles_col],
        'output': predictions_test
    })
    submission.to_csv('submission_top500.csv', index=False)
    print(f"\n✓ 기본 제출 파일: submission_top500.csv")

    # Detailed submission: per-model probabilities, confidence and threshold.
    submission_detailed = pd.DataFrame({
        'id': range(len(predictions_test)),
        'label': predictions_test,
        'probability': ensemble_proba_test,
        'confidence': confidence_test,
        'adaptive_threshold': adaptive_thresholds_test,
        'lgbm_proba': lgbm_proba_test,
        'xgb_proba': xgb_proba_test,
        'catboost_proba': cat_proba_test
    })
    submission_detailed.to_csv('submission_detailed_top500.csv', index=False)
    print(f"✓ 상세 제출 파일: submission_detailed_top500.csv")

# ============================================================
# 8. Visualization
# ============================================================
print(f"\n{'='*70}")
print("시각화 생성")
print(f"{'='*70}")

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. F1 by feature count (Top 300 baseline vs. this run)
ax = axes[0, 0]
feature_counts = [300, 500]
f1_scores = [baseline_f1, oof_f1]
colors = ['lightblue', 'darkgreen']

bars = ax.bar([str(x) for x in feature_counts], f1_scores,
              color=colors, alpha=0.8, edgecolor='black', linewidth=2)
ax.set_xlabel('Number of Features', fontsize=11)
ax.set_ylabel('F1 Score', fontsize=11)
ax.set_title('F1 Score by Feature Count', fontsize=12, fontweight='bold')
# Fit the y-range to the plotted scores. A fixed [0.825, 0.835] window clips
# any bar whose score falls outside it, which makes the chart misleading.
f1_margin = 0.005
ax.set_ylim([min(f1_scores) - f1_margin, max(f1_scores) + f1_margin])
ax.grid(True, alpha=0.3, axis='y')

# Value label on top of each bar.
for bar, val in zip(bars, f1_scores):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{val:.4f}',
            ha='center', va='bottom', fontweight='bold')

# 2. FPR by feature count, with the 25% target drawn as a reference line
ax = axes[0, 1]
fpr_values = [baseline_fpr * 100, oof_fpr * 100]
bars = ax.bar([str(x) for x in feature_counts], fpr_values,
              color=['lightcoral', 'lightgreen'], alpha=0.8, edgecolor='black', linewidth=2)
ax.axhline(25, color='r', linestyle='--', alpha=0.5, label='Target: 25%')
ax.set_xlabel('Number of Features', fontsize=11)
ax.set_ylabel('FPR (%)', fontsize=11)
ax.set_title('FPR by Feature Count', fontsize=12, fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

for bar, val in zip(bars, fpr_values):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{val:.2f}%',
            ha='center', va='bottom', fontweight='bold')

# 3. OOF confusion matrix
ax = axes[1, 0]
sns.heatmap(oof_cm, annot=True, fmt='d', cmap='Blues', ax=ax, annot_kws={'size': 14})
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix (Top 500)', fontsize=12, fontweight='bold')

# 4. Text panel with the headline metrics
ax = axes[1, 1]
ax.axis('off')

summary_text = f"""
═══════════════════════
 Top 500 성능 요약
═══════════════════════

F1 Score:  {oof_f1:.4f}
AUC:       {oof_auc:.4f}
Precision: {oof_precision:.4f}
Recall:    {oof_recall:.4f}
FPR:       {oof_fpr*100:.2f}%

─────────────────────
Top 300 대비
─────────────────────
F1:   {f1_improvement:+.4f} ({f1_improvement/baseline_f1*100:+.2f}%)
FPR:  {fpr_improvement*100:+.2f}%p

─────────────────────
피처: 500개 (16.3%)
압축률: 83.7%
"""

ax.text(0.5, 0.5, summary_text, ha='center', va='center',
        fontsize=11, family='monospace',
        bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

# Save the figure to disk and close it.
plt.tight_layout()
plt.savefig('top500_analysis.png', dpi=300, bbox_inches='tight')
plt.close()
print(f"\n✓ 시각화 저장: top500_analysis.png")

# ============================================================
# 9. Final report
# ============================================================
report_rule = '=' * 70
print("\n" + report_rule)
print("최종 리포트")
print(report_rule)

# Model specification.
print("\n[모델 사양]")
for spec_line in (
    "  피처: Top 500개 (압축률 83.7%)",
    "  Ensemble: LGBM(20%) + XGB(60%) + CAT(20%)",
    "  Threshold: Adaptive (0.42/0.40/0.39)",
):
    print(spec_line)

# Headline OOF metrics.
print("\n[성능]")
print(f"  F1 Score:  {oof_f1:.4f}")
print(f"  AUC Score: {oof_auc:.4f}")
print(f"  FPR:       {oof_fpr*100:.2f}%")

# Comparison with the Top 300 baseline: a gain is flagged only for a strictly
# positive F1 delta and a strictly negative FPR delta.
print("\n[Top 300 대비]")
f1_label = "✓✓ F1 향상" if f1_improvement > 0 else "△ F1 변화"
print(f"  {f1_label}: {f1_improvement:+.4f} ({f1_improvement/baseline_f1*100:+.2f}%)")

fpr_label = "✓✓ FPR 개선" if fpr_improvement < 0 else "△ FPR 변화"
print(f"  {fpr_label}: {fpr_improvement*100:+.2f}%p")

# Submission files are listed only when test predictions were produced.
if test_available:
    print("\n[제출 파일]")
    print("  메인: submission_top500.csv")
    print("  상세: submission_detailed_top500.csv")

print("\n" + report_rule)
print("✓ Top 500 모델 완성!")
print(report_rule)


Top 500 피처 모델 학습 및 평가

Top 500 피처 선정

[선정된 피처]
  총 피처: 500개
  - Descriptor: 4개
  - Fingerprint: 496개
    · ECFP: 158개
    · FCFP: 110개
    · PTFP: 228개

[누적 중요도]
  Top 500 중요도 합: 28620.44
  전체 중요도 합: 36487.29
  누적 비율: 78.44%

✓ 피처 리스트 저장: selected_features_top500.csv

데이터 로드

[Train 데이터]
  Shape: (8349, 500)
  Label 분포: Class 0 = 3807, Class 1 = 4542

[Test 데이터]
  Shape: (927, 500)

5-Fold Cross-Validation 학습

──────────────────────────────────────────────────────────────────────
📊 Fold 1/5
──────────────────────────────────────────────────────────────────────
  학습: (6679, 500), 검증: (1670, 500)
  [1/3] LightGBM... F1: 0.8387, Iter: 827
  [2/3] XGBoost... F1: 0.8470, Iter: 997
  [3/3] CatBoost... F1: 0.8376, Iter: 996

  [Ensemble] F1: 0.8429, AUC: 0.9072, FPR: 0.2378

──────────────────────────────────────────────────────────────────────
📊 Fold 2/5
──────────────────────────────────────────────────────────────────────
  학습: (6679, 500), 검증: (1670, 500)
  [1/3] LightGBM... F1: 0.8225, I

In [None]:
# ============================================================
# 최고 성능 모델: 전체 피처 + 2-Layer Stacking
# ============================================================

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix, precision_score, recall_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import warnings
warnings.filterwarnings('ignore')

stacking_rule = "=" * 70

# Title banner followed by a short description of the strategy.
for banner_line in (
    stacking_rule,
    "최고 성능 모델: 전체 피처 + 2-Layer Stacking",
    stacking_rule,
    "\n[전략]",
    "  Layer 1: LGBM, XGB, CAT (Base Models)",
    "  Layer 2: Logistic Regression (Meta-Learner)",
    "  피처: 전체 3076개 (압축 없음)",
    "  목표: F1 0.835+",
):
    print(banner_line)

# ============================================================
# 1. Data preparation (all features)
# ============================================================
print("\n" + stacking_rule)
print("데이터 로드 (전체 피처)")
print(stacking_rule)

# Training data.
df_train = pd.read_csv('train.csv')

# Every column except the label is treated as a feature.
feature_columns = [name for name in df_train.columns if name != 'label']
X_train = df_train[feature_columns]
y_train = df_train['label'].astype(int)

# Per-class row counts for the summary line.
n_class0 = int((y_train == 0).sum())
n_class1 = int((y_train == 1).sum())

print("\n[Train 데이터]")
print(f"  Shape: {X_train.shape}")
print(f"  피처: {len(feature_columns)}개 (전체)")
print(f"  Label 분포: Class 0 = {n_class0}, Class 1 = {n_class1}")

# Test data is optional: when it cannot be loaded the script continues without
# test predictions.
# Only the expected failure modes are caught (a bare `except:` would also
# swallow KeyboardInterrupt and unrelated bugs), and the cause is reported.
try:
    df_test = pd.read_csv('predict_input.csv')
    X_test = df_test[feature_columns]
except (OSError, KeyError) as exc:
    # OSError: file missing or unreadable; KeyError: feature columns absent.
    print(f"\n⚠️  Test 데이터 없음")
    print(f"  ({type(exc).__name__}: {exc})")
    test_available = False
else:
    print(f"\n[Test 데이터]")
    print(f"  Shape: {X_test.shape}")
    test_available = True

# Classify the features: fingerprint bits are recognised by name prefix, the
# descriptors are a fixed list of four column names.
fingerprint_prefixes = ('ecfp_', 'fcfp_', 'ptfp_')
fp_cols = [name for name in feature_columns if name.startswith(fingerprint_prefixes)]
desc_cols = ['MolWt', 'clogp', 'sa_score', 'qed']

print("\n[피처 구성]")
print(f"  Fingerprint: {len(fp_cols)}개")
print(f"  Descriptor: {len(desc_cols)}개")

# Preprocessing pipeline.
# Fingerprints: missing values are filled with the constant 0.
fp_transformer = SimpleImputer(strategy='constant', fill_value=0)
# Descriptors: median imputation followed by standardization.
desc_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Columns not routed to either transformer are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('fp', fp_transformer, fp_cols),
        ('desc', desc_transformer, desc_cols),
    ],
    remainder='drop',
)

# ============================================================
# 2. Layer 1: base models (5-fold OOF)
# ============================================================
layer1_rule = '=' * 70
print("\n" + layer1_rule)
print("Layer 1: Base Models 학습 (5-Fold CV)")
print(layer1_rule)

RANDOM_STATE = 42
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

base_model_names = ('lgbm', 'xgb', 'catboost')

# Out-of-fold probabilities, one vector per base model, aligned with X_train.
oof_probabilities = {name: np.zeros(len(X_train)) for name in base_model_names}

# Test predictions: one column per fold for each base model.
if test_available:
    test_predictions = {name: np.zeros((len(X_test), 5)) for name in base_model_names}

# Fitted models, collected fold by fold.
models = {name: [] for name in base_model_names}

# Local import: used to snapshot the fitted preprocessor of each fold.
import copy

# Train the three base models on every stratified fold, collecting
# out-of-fold probabilities, per-fold test predictions and the fitted
# (model, preprocessor) pairs.
for fold, (tr_idx, va_idx) in enumerate(skf.split(X_train, y_train), 1):
    print(f"\n{'─'*70}")
    print(f"📊 Fold {fold}/5")
    print(f"{'─'*70}")

    X_tr, X_va = X_train.iloc[tr_idx], X_train.iloc[va_idx]
    y_tr, y_va = y_train.iloc[tr_idx], y_train.iloc[va_idx]

    # Preprocessing: fitted on the training part of the fold only.
    Xt_tr = preprocessor.fit_transform(X_tr)
    Xt_va = preprocessor.transform(X_va)

    # `preprocessor` is a single object that is refit on every fold. Storing
    # it directly would make every saved (model, preprocessor) pair point at
    # the SAME object, fitted on the last fold. Keep an independent snapshot
    # of this fold's fitted state instead.
    fold_preprocessor = copy.deepcopy(preprocessor)

    # Transform the test set once per fold; all three models reuse it.
    if test_available:
        Xt_test = preprocessor.transform(X_test)

    print(f"  학습: {Xt_tr.shape}, 검증: {Xt_va.shape}")

    # ========================================
    # LightGBM
    # ========================================
    print(f"  [1/3] LightGBM...", end=' ')
    lgbm_model = LGBMClassifier(
        n_estimators=1000,
        learning_rate=0.03,
        max_depth=8,
        num_leaves=63,
        min_child_samples=30,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.3,
        reg_lambda=0.3,
        class_weight={0: 1.5, 1: 1.0},
        random_state=RANDOM_STATE,
        n_jobs=-1,
        verbose=-1
    )

    # Early stopping is monitored on this fold's validation part.
    lgbm_model.fit(
        Xt_tr, y_tr,
        eval_set=[(Xt_va, y_va)],
        callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)]
    )

    lgbm_proba_va = lgbm_model.predict_proba(Xt_va)[:, 1]
    oof_probabilities['lgbm'][va_idx] = lgbm_proba_va
    models['lgbm'].append((lgbm_model, fold_preprocessor))

    print(f"완료 (Iter: {lgbm_model.best_iteration_})")

    # Test prediction for this fold's model.
    if test_available:
        test_predictions['lgbm'][:, fold-1] = lgbm_model.predict_proba(Xt_test)[:, 1]

    # ========================================
    # XGBoost
    # ========================================
    print(f"  [2/3] XGBoost...", end=' ')
    xgb_model = XGBClassifier(
        n_estimators=1000,
        learning_rate=0.03,
        max_depth=7,
        min_child_weight=3,
        subsample=0.8,
        colsample_bytree=0.8,
        gamma=0.1,
        reg_alpha=0.3,
        reg_lambda=0.3,
        scale_pos_weight=0.67,
        random_state=RANDOM_STATE,
        n_jobs=-1,
        early_stopping_rounds=100,
        eval_metric='logloss',
        verbosity=0
    )

    xgb_model.fit(
        Xt_tr, y_tr,
        eval_set=[(Xt_va, y_va)],
        verbose=False
    )

    xgb_proba_va = xgb_model.predict_proba(Xt_va)[:, 1]
    oof_probabilities['xgb'][va_idx] = xgb_proba_va
    models['xgb'].append((xgb_model, fold_preprocessor))

    print(f"완료 (Iter: {xgb_model.best_iteration})")

    # Test prediction for this fold's model.
    if test_available:
        test_predictions['xgb'][:, fold-1] = xgb_model.predict_proba(Xt_test)[:, 1]

    # ========================================
    # CatBoost
    # ========================================
    print(f"  [3/3] CatBoost...", end=' ')
    cat_model = CatBoostClassifier(
        iterations=1000,
        learning_rate=0.03,
        depth=7,
        l2_leaf_reg=3,
        class_weights=[1.5, 1.0],
        random_seed=RANDOM_STATE,
        verbose=0,
        early_stopping_rounds=100
    )

    cat_model.fit(
        Xt_tr, y_tr,
        eval_set=(Xt_va, y_va),
        verbose=False
    )

    cat_proba_va = cat_model.predict_proba(Xt_va)[:, 1]
    oof_probabilities['catboost'][va_idx] = cat_proba_va
    models['catboost'].append((cat_model, fold_preprocessor))

    print(f"완료 (Iter: {cat_model.best_iteration_})")

    # Test prediction for this fold's model.
    if test_available:
        test_predictions['catboost'][:, fold-1] = cat_model.predict_proba(Xt_test)[:, 1]

print(f"\n✓ Layer 1 완료: 3개 Base Models × 5 Folds = 15개 모델 학습")

# ============================================================
# 3. Layer 1 base performance
# ============================================================
baseline_rule = '=' * 70
print("\n" + baseline_rule)
print("Layer 1 Base Models 성능 (Simple Ensemble)")
print(baseline_rule)

# Simple weighted ensemble (20/60/20), used as the baseline for stacking.
lgbm_oof = oof_probabilities['lgbm']
xgb_oof = oof_probabilities['xgb']
cat_oof = oof_probabilities['catboost']
simple_ensemble_proba = 0.20 * lgbm_oof + 0.60 * xgb_oof + 0.20 * cat_oof

# Adaptive threshold
def get_adaptive_threshold(confidence):
    """Map a confidence value to the decision threshold to apply.

    Returns 0.42 when ``confidence`` is below 0.05, 0.40 when it is below
    0.10, and 0.39 otherwise (a NaN fails both comparisons and therefore
    also yields 0.39).

    NOTE(review): the callers in this file pass ``abs(p - 0.5)`` and then
    test ``p >= threshold``. With these cut-offs that appears to reduce to a
    plain ``p >= 0.39`` rule -- confirm this is the intended behaviour.
    """
    # (exclusive upper bound on confidence, threshold), checked in order
    for upper_bound, threshold in ((0.05, 0.42), (0.10, 0.40)):
        if confidence < upper_bound:
            return threshold
    return 0.39

# Distance of each blended probability from 0.5 selects the per-sample threshold
confidence_simple = np.abs(simple_ensemble_proba - 0.5)
adaptive_thresholds_simple = np.array(
    list(map(get_adaptive_threshold, confidence_simple))
)
predictions_simple = (simple_ensemble_proba >= adaptive_thresholds_simple).astype(int)

# Out-of-fold metrics of the simple blend
simple_auc = roc_auc_score(y_train, simple_ensemble_proba)
simple_f1 = f1_score(y_train, predictions_simple)
simple_cm = confusion_matrix(y_train, predictions_simple)
# Rows of the confusion matrix are the actual classes, columns the predictions
(tn, fp), (fn, tp) = simple_cm
simple_fpr = fp / (fp + tn)

print(f"\n[Simple Ensemble (Weighted Average)]")
print(f"  F1 Score:  {simple_f1:.4f}")
print(f"  AUC Score: {simple_auc:.4f}")
print(f"  FPR:       {simple_fpr:.4f} ({simple_fpr*100:.2f}%)")

# ============================================================
# 4. Layer 2: Stacking (Meta-Learner)
# ============================================================
print(f"\n{'='*70}")
print("Layer 2: Stacking Meta-Learner 학습")
print(f"{'='*70}")

# Meta-features: one column of Layer 1 out-of-fold probabilities per base
# model, in the fixed order LGBM, XGBoost, CatBoost.
meta_features_train = np.column_stack(
    [oof_probabilities[model_key] for model_key in ('lgbm', 'xgb', 'catboost')]
)

print(f"\n[Meta-features]")
print(f"  Shape: {meta_features_train.shape}")
print(f"  Feature 1: LGBM 확률")
print(f"  Feature 2: XGBoost 확률")
print(f"  Feature 3: CatBoost 확률")

# Fit the logistic-regression meta-learner on the stacked OOF probabilities
print(f"\n[Meta-Learner 학습]")
meta_learner_params = dict(
    C=0.1,
    class_weight='balanced',
    random_state=RANDOM_STATE,
    max_iter=1000,
)
meta_model = LogisticRegression(**meta_learner_params)
meta_model.fit(meta_features_train, y_train)

print(f"✓ Meta-Learner 학습 완료")
print(f"\n[Meta-Learner 가중치]")
# Coefficients follow the column order of meta_features_train
coef_lgbm, coef_xgb, coef_cat = meta_model.coef_[0]
print(f"  LGBM:    {coef_lgbm:.4f}")
print(f"  XGBoost: {coef_xgb:.4f}")
print(f"  CatBoost: {coef_cat:.4f}")
print(f"  Intercept: {meta_model.intercept_[0]:.4f}")

# ============================================================
# 5. Stacking Performance (OOF)
# ============================================================
print(f"\n{'='*70}")
print("Stacking 성능 평가 (OOF)")
print(f"{'='*70}")

# Positive-class probability from the meta-learner
stacking_proba = meta_model.predict_proba(meta_features_train)[:, 1]

# Per-sample threshold chosen from the distance to 0.5
confidence_stacking = np.abs(stacking_proba - 0.5)
adaptive_thresholds_stacking = np.array(
    list(map(get_adaptive_threshold, confidence_stacking))
)
predictions_stacking = (stacking_proba >= adaptive_thresholds_stacking).astype(int)

# Metrics
stacking_auc = roc_auc_score(y_train, stacking_proba)
stacking_f1 = f1_score(y_train, predictions_stacking)
stacking_precision = precision_score(y_train, predictions_stacking)
stacking_recall = recall_score(y_train, predictions_stacking)
stacking_cm = confusion_matrix(y_train, predictions_stacking)
# Rows of the confusion matrix are the actual classes, columns the predictions
(tn_s, fp_s), (fn_s, tp_s) = stacking_cm
stacking_fpr = fp_s / (fp_s + tn_s)

print(f"\n[Stacking OOF 성능]")
print(f"  F1 Score:  {stacking_f1:.4f}")
print(f"  AUC Score: {stacking_auc:.4f}")
print(f"  Precision: {stacking_precision:.4f}")
print(f"  Recall:    {stacking_recall:.4f}")
print(f"  FPR:       {stacking_fpr:.4f} ({stacking_fpr*100:.2f}%)")

print(f"\n[OOF 혼동 행렬]")
print(f"  TN: {tn_s}, FP: {fp_s}, FN: {fn_s}, TP: {tp_s}")

# ============================================================
# 6. Performance comparison (Simple vs Stacking)
# ============================================================
print(f"\n{'='*70}")
print("Simple Ensemble vs Stacking 비교")
print(f"{'='*70}")

# Stacking minus simple blend, per metric (for FPR a negative value is better)
auc_improvement = stacking_auc - simple_auc
f1_improvement = stacking_f1 - simple_f1
fpr_improvement = stacking_fpr - simple_fpr

print(f"\n[성능 변화]")
print(f"{'지표':<15} {'Simple':<12} {'Stacking':<12} {'변화':<15}")
print(f"{'-'*55}")
print(f"{'F1 Score':<15} {simple_f1:<12.4f} {stacking_f1:<12.4f} {f1_improvement:+.4f} ({f1_improvement/simple_f1*100:+.2f}%)")
print(f"{'AUC':<15} {simple_auc:<12.4f} {stacking_auc:<12.4f} {auc_improvement:+.4f}")
print(f"{'FPR':<15} {simple_fpr*100:<12.2f}% {stacking_fpr*100:<12.2f}% {fpr_improvement*100:+.2f}%p")

# Hard-coded reference scores labelled "Top 300" in the report
baseline_f1, baseline_fpr = 0.8300, 0.2467

# Differences against the reference, in percentage points
f1_vs_baseline_pp = (stacking_f1-baseline_f1)*100
fpr_vs_baseline_pp = (stacking_fpr-baseline_fpr)*100

print(f"\n[Top 300 대비]")
print(f"  F1:  {baseline_f1:.4f} → {stacking_f1:.4f} ({f1_vs_baseline_pp:+.2f}%p)")
print(f"  FPR: {baseline_fpr*100:.2f}% → {stacking_fpr*100:.2f}% ({fpr_vs_baseline_pp:+.2f}%p)")

# ============================================================
# 7. Test prediction (Stacking)
# ============================================================
if test_available:
    print(f"\n{'='*70}")
    print("Test 데이터 예측 (Stacking)")
    print(f"{'='*70}")

    # Test meta-features: per-model mean over the fold predictions
    lgbm_test_mean = test_predictions['lgbm'].mean(axis=1)
    xgb_test_mean = test_predictions['xgb'].mean(axis=1)
    catboost_test_mean = test_predictions['catboost'].mean(axis=1)
    meta_features_test = np.column_stack(
        [lgbm_test_mean, xgb_test_mean, catboost_test_mean]
    )

    print(f"\n  Test Meta-features shape: {meta_features_test.shape}")

    # Meta-learner probability of the positive class
    stacking_proba_test = meta_model.predict_proba(meta_features_test)[:, 1]

    # Apply the per-sample adaptive threshold
    confidence_test = np.abs(stacking_proba_test - 0.5)
    adaptive_thresholds_test = np.array(
        list(map(get_adaptive_threshold, confidence_test))
    )
    predictions_test = (stacking_proba_test >= adaptive_thresholds_test).astype(int)

    n_test = len(predictions_test)
    n_class0 = np.count_nonzero(predictions_test == 0)
    n_class1 = np.count_nonzero(predictions_test == 1)
    n_low_confidence = np.count_nonzero(confidence_test < 0.1)

    print(f"\n[Test 예측 결과]")
    print(f"  예측 Class 0: {n_class0}개 ({n_class0/n_test*100:.2f}%)")
    print(f"  예측 Class 1: {n_class1}개 ({n_class1/n_test*100:.2f}%)")
    print(f"  평균 Confidence: {confidence_test.mean():.4f}")
    print(f"  Low Confidence (<0.1): {n_low_confidence}개")

    # ============================================================
    # 8. Submission files
    # ============================================================
    print(f"\n{'='*70}")
    print("Submission 파일 생성")
    print(f"{'='*70}")

    # Pick the SMILES column by name, falling back to the first column
    smiles_col = next(
        (name for name in ('SMILES', 'smiles') if name in df_test.columns),
        None,
    )
    if smiles_col is None:
        smiles_col = df_test.columns[0]

    # Basic submission: SMILES plus predicted label
    submission = pd.DataFrame({
        'SMILES': df_test[smiles_col],
        'output': predictions_test
    })
    submission.to_csv('submission_stacking_final.csv', index=False)
    print(f"\n✓ 기본 제출 파일: submission_stacking_final.csv")

    # Detailed submission: adds probabilities, thresholds and per-model means
    detailed_columns = {
        'id': range(n_test),
        'label': predictions_test,
        'probability': stacking_proba_test,
        'confidence': confidence_test,
        'adaptive_threshold': adaptive_thresholds_test,
        'lgbm_proba': lgbm_test_mean,
        'xgb_proba': xgb_test_mean,
        'catboost_proba': catboost_test_mean
    }
    submission_detailed = pd.DataFrame(detailed_columns)
    submission_detailed.to_csv('submission_detailed_stacking_final.csv', index=False)
    print(f"✓ 상세 제출 파일: submission_detailed_stacking_final.csv")

# ============================================================
# 9. Save the model
# ============================================================
print(f"\n{'='*70}")
print("모델 저장")
print(f"{'='*70}")

# Bundle both layers, the feature list and the OOF scores into one object
final_model_package = dict(
    layer1_models=models,
    meta_model=meta_model,
    feature_columns=feature_columns,
    oof_f1=stacking_f1,
    oof_auc=stacking_auc,
    oof_fpr=stacking_fpr,
)

with open('final_stacking_model.pkl', 'wb') as model_file:
    pickle.dump(final_model_package, model_file)

print(f"\n✓ 모델 저장: final_stacking_model.pkl")
print(f"  - Layer 1: 15개 Base Models")
print(f"  - Layer 2: Meta-Learner")
print(f"  - 전체 피처 리스트")

# ============================================================
# 10. Visualization
# ============================================================
print(f"\n{'='*70}")
print("시각화 생성")
print(f"{'='*70}")


def _annotate_bars(ax, bars, values, template, flip_nonpositive=False, **text_kwargs):
    """Write a formatted value label at the top edge of every bar.

    ``template`` is a ``str.format`` pattern applied to each value. When
    ``flip_nonpositive`` is set, labels of bars whose height is not positive
    hang below the bar edge instead of sitting above it. Extra keyword
    arguments are forwarded to ``ax.text``.
    """
    for bar, value in zip(bars, values):
        height = bar.get_height()
        valign = 'top' if flip_nonpositive and not height > 0 else 'bottom'
        ax.text(bar.get_x() + bar.get_width()/2., height,
                template.format(value),
                ha='center', va=valign, fontweight='bold', **text_kwargs)


fig, axes = plt.subplots(2, 3, figsize=(18, 10))

# 1. F1 Score Comparison
ax = axes[0, 0]
models_compare = ['Top 300\n(Adaptive)', 'Simple\nEnsemble\n(3076개)', 'Stacking\n(3076개)']
f1_scores = [baseline_f1, simple_f1, stacking_f1]
colors = ['lightblue', 'orange', 'darkgreen']

bars = ax.bar(models_compare, f1_scores, color=colors, alpha=0.8, edgecolor='black', linewidth=2)
ax.set_ylabel('F1 Score', fontsize=11)
ax.set_title('F1 Score Comparison', fontsize=12, fontweight='bold')
# Keep the zoomed 0.825-0.840 window when every score fits inside it, but
# widen it when a score falls outside so no bar or label is clipped away.
f1_axis_low = min(0.825, min(f1_scores) - 0.005)
f1_axis_high = max(0.840, max(f1_scores) + 0.002)
ax.set_ylim([f1_axis_low, f1_axis_high])
ax.grid(True, alpha=0.3, axis='y')

_annotate_bars(ax, bars, f1_scores, '{:.4f}', fontsize=10)

# 2. FPR Comparison
ax = axes[0, 1]
fpr_values = [baseline_fpr * 100, simple_fpr * 100, stacking_fpr * 100]
bars = ax.bar(models_compare, fpr_values, color=colors, alpha=0.8, edgecolor='black', linewidth=2)
ax.axhline(25, color='r', linestyle='--', alpha=0.5, label='Target: 25%')
ax.set_ylabel('FPR (%)', fontsize=11)
ax.set_title('FPR Comparison', fontsize=12, fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

_annotate_bars(ax, bars, fpr_values, '{:.2f}%', fontsize=10)

# 3. Confusion Matrix (Stacking)
ax = axes[0, 2]
sns.heatmap(stacking_cm, annot=True, fmt='d', cmap='Blues', ax=ax, annot_kws={'size': 14})
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix (Stacking)', fontsize=12, fontweight='bold')

# 4. Meta-Learner Weights
ax = axes[1, 0]
model_names = ['LGBM', 'XGBoost', 'CatBoost']
weights = meta_model.coef_[0]
colors_weights = ['steelblue', 'orange', 'green']

bars = ax.bar(model_names, weights, color=colors_weights, alpha=0.7, edgecolor='black')
ax.set_ylabel('Weight', fontsize=11)
ax.set_title('Meta-Learner Weights', fontsize=12, fontweight='bold')
ax.grid(True, alpha=0.3, axis='y')

# Coefficients can be negative, so flip the label below such bars
_annotate_bars(ax, bars, weights, '{:.3f}', flip_nonpositive=True)

# 5. Performance Improvement
ax = axes[1, 1]
metrics = ['F1\nScore', 'AUC', 'Precision', 'Recall']
simple_vals = [simple_f1, simple_auc,
               precision_score(y_train, predictions_simple),
               recall_score(y_train, predictions_simple)]
stacking_vals = [stacking_f1, stacking_auc, stacking_precision, stacking_recall]

x = np.arange(len(metrics))
width = 0.35

ax.bar(x - width/2, simple_vals, width, label='Simple', alpha=0.8)
ax.bar(x + width/2, stacking_vals, width, label='Stacking', alpha=0.8)

ax.set_ylabel('Score')
ax.set_title('All Metrics Comparison', fontsize=12, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

# 6. Summary (text-only panel)
ax = axes[1, 2]
ax.axis('off')

summary_text = f"""
═══════════════════════
  Stacking 최종 성능
═══════════════════════

F1 Score:  {stacking_f1:.4f}
AUC:       {stacking_auc:.4f}
Precision: {stacking_precision:.4f}
Recall:    {stacking_recall:.4f}
FPR:       {stacking_fpr*100:.2f}%

─────────────────────
Top 300 대비
─────────────────────
F1:   {(stacking_f1-baseline_f1)*100:+.2f}%p
FPR:  {(stacking_fpr-baseline_fpr)*100:+.2f}%p

─────────────────────
Simple 대비
─────────────────────
F1:   {f1_improvement:+.4f}
FPR:  {fpr_improvement*100:+.2f}%p

피처: 3076개 (전체)
Layer 1: 15개 모델
Layer 2: Meta-Learner
"""

ax.text(0.5, 0.5, summary_text, ha='center', va='center',
        fontsize=10, family='monospace',
        bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

plt.tight_layout()
plt.savefig('stacking_final_analysis.png', dpi=300, bbox_inches='tight')
plt.close()
print(f"\n✓ 시각화 저장: stacking_final_analysis.png")

# ============================================================
# 11. Final report
# ============================================================
print(f"\n{'='*70}")
print("최종 성능 리포트")
print(f"{'='*70}")

# Static description of the model configuration
spec_lines = (
    f"\n[모델 사양]",
    f"  Architecture: 2-Layer Stacking",
    f"    - Layer 1: LGBM + XGB + CAT",
    f"    - Layer 2: Logistic Regression",
    f"  피처: 전체 3076개",
    f"  Threshold: Adaptive (0.42/0.40/0.39)",
)
for spec_line in spec_lines:
    print(spec_line)

# Out-of-fold scores of the stacked model
print(f"\n[최종 성능]")
print(f"  F1 Score:  {stacking_f1:.4f}")
print(f"  AUC Score: {stacking_auc:.4f}")
print(f"  Precision: {stacking_precision:.4f}")
print(f"  Recall:    {stacking_recall:.4f}")
print(f"  FPR:       {stacking_fpr*100:.2f}%")

# Verdict against the hard-coded reference F1
verdict = (
    f"\n✓✓✓ 성공! Top 300 대비 F1 {(stacking_f1-baseline_f1)*100:+.2f}%p 향상"
    if stacking_f1 > baseline_f1
    else f"\n△ Top 300과 유사한 성능"
)
print(verdict)

if test_available:
    for file_line in (
        f"\n[제출 파일]",
        f"  메인: submission_stacking_final.csv",
        f"  상세: submission_detailed_stacking_final.csv",
    ):
        print(file_line)

print(f"\n{'='*70}")
print("✓ Stacking 모델 완성!")
print(f"{'='*70}")


최고 성능 모델: 전체 피처 + 2-Layer Stacking

[전략]
  Layer 1: LGBM, XGB, CAT (Base Models)
  Layer 2: Logistic Regression (Meta-Learner)
  피처: 전체 3076개 (압축 없음)
  목표: F1 0.835+

데이터 로드 (전체 피처)

[Train 데이터]
  Shape: (8349, 3077)
  피처: 3077개 (전체)
  Label 분포: Class 0 = 3807, Class 1 = 4542

[Test 데이터]
  Shape: (927, 3077)

[피처 구성]
  Fingerprint: 3072개
  Descriptor: 4개

Layer 1: Base Models 학습 (5-Fold CV)

──────────────────────────────────────────────────────────────────────
📊 Fold 1/5
──────────────────────────────────────────────────────────────────────
  학습: (6679, 3076), 검증: (1670, 3076)
  [1/3] LightGBM... 완료 (Iter: 874)
  [2/3] XGBoost... 완료 (Iter: 993)
  [3/3] CatBoost... 완료 (Iter: 999)

──────────────────────────────────────────────────────────────────────
📊 Fold 2/5
──────────────────────────────────────────────────────────────────────
  학습: (6679, 3076), 검증: (1670, 3076)
  [1/3] LightGBM... 완료 (Iter: 760)
  [2/3] XGBoost... 완료 (Iter: 731)
  [3/3] CatBoost... 완료 (Iter: 956)

───────────────

In [None]:
# ============================================================
# RDKit Descriptor 추가 피처 생성
# ============================================================

from rdkit import Chem
from rdkit.Chem import Descriptors
import pandas as pd
import numpy as np

# Banner announcing the start of the RDKit descriptor generation step
print("=" * 70)
print("RDKit Descriptor 피처 생성")
print("=" * 70)

def calculate_rdkit_descriptors(smiles):
    """Compute a fixed set of RDKit descriptors for one SMILES string.

    The set covers Lipinski-style drug-likeness values, ring/bond structure
    counts, topological/electronic indices, and surface-area terms.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        dict mapping descriptor name to value, or ``None`` when the SMILES
        cannot be parsed or any descriptor computation raises (the error is
        printed, not re-raised, so one bad molecule does not stop a batch).
    """
    # Function-scope import keeps this fix self-contained; the Calc* functions
    # in rdMolDescriptors are used for the two counts below because the
    # Descriptors-module aliases are not present in every RDKit release.
    from rdkit.Chem import rdMolDescriptors

    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None

        descriptors = {
            # Lipinski descriptors (drug-likeness)
            'NumHDonors': Descriptors.NumHDonors(mol),
            'NumHAcceptors': Descriptors.NumHAcceptors(mol),
            'MolLogP': Descriptors.MolLogP(mol),
            'TPSA': Descriptors.TPSA(mol),

            # Structural features
            'NumAromaticRings': Descriptors.NumAromaticRings(mol),
            'NumAliphaticRings': Descriptors.NumAliphaticRings(mol),
            'NumSaturatedRings': Descriptors.NumSaturatedRings(mol),
            'NumRotatableBonds': Descriptors.NumRotatableBonds(mol),
            'NumHeteroatoms': Descriptors.NumHeteroatoms(mol),

            # Electronic / topological properties.
            # RDKit spells this descriptor ``FractionCSP3``; the previous
            # ``FractionCsp3`` raised AttributeError, which the except below
            # swallowed, so every molecule came back as None. The dict key is
            # kept unchanged so the output column name stays the same.
            'FractionCsp3': Descriptors.FractionCSP3(mol),
            'Chi0v': Descriptors.Chi0v(mol),
            'HallKierAlpha': Descriptors.HallKierAlpha(mol),

            # Surface area and volume
            'LabuteASA': Descriptors.LabuteASA(mol),
            'PEOE_VSA1': Descriptors.PEOE_VSA1(mol),

            # Additional (optional)
            'BertzCT': Descriptors.BertzCT(mol),  # complexity
            # NOTE(review): Ipc is reported to grow extremely large for big
            # molecules; consider ``avg=True`` -- TODO confirm before changing.
            'Ipc': Descriptors.Ipc(mol),          # information content
            'RingCount': Descriptors.RingCount(mol),
            'NumBridgeheadAtoms': rdMolDescriptors.CalcNumBridgeheadAtoms(mol),
            'NumSpiroAtoms': rdMolDescriptors.CalcNumSpiroAtoms(mol)
        }

        return descriptors
    except Exception as e:
        # Deliberate best-effort: report the failing molecule and carry on
        print(f"Error processing {smiles}: {e}")
        return None

# ============================================================
# 1. Add RDKit descriptors to the train data
# ============================================================
print(f"\n{'='*70}")
print("Train 데이터 처리")
print(f"{'='*70}")

# Load the train data
df_train = pd.read_csv('train.csv')

# Locate the SMILES column (without one the descriptors cannot be computed)
if 'SMILES' in df_train.columns:
    train_smiles = df_train['SMILES']
elif 'smiles' in df_train.columns:
    train_smiles = df_train['smiles']
else:
    print("⚠️  Train 데이터에 SMILES 없음")
    print("   해결책:")
    print("   1. Train SMILES 파일 별도 제공 필요")
    print("   2. 또는 기존 SMILES로부터 재생성")
    train_smiles = None

if train_smiles is not None:
    print(f"\n  SMILES 발견: {len(train_smiles)}개")

    # Compute the RDKit descriptors, one row per molecule
    print(f"  RDKit Descriptor 계산 중...")
    rdkit_descriptors_list = []

    for idx, smiles in enumerate(train_smiles):
        if idx % 1000 == 0:
            print(f"\r    진행: {idx}/{len(train_smiles)}", end='')

        desc = calculate_rdkit_descriptors(smiles)
        # A failed molecule comes back as None. A None entry among dict rows
        # breaks the list-of-dicts DataFrame constructor, so store an empty
        # dict instead: it becomes an all-NaN row that is imputed below.
        rdkit_descriptors_list.append(desc if desc is not None else {})

    print(f"\r    ✓ 완료: {len(train_smiles)}개")

    # Convert to a DataFrame that shares df_train's index so the column-wise
    # concat below lines rows up one-to-one.
    rdkit_df = pd.DataFrame(rdkit_descriptors_list, index=df_train.index)

    print(f"\n  생성된 RDKit Descriptor: {len(rdkit_df.columns)}개")
    print(f"  컬럼: {list(rdkit_df.columns)}")

    # Report and impute missing values (rows of molecules that failed)
    missing_count = rdkit_df.isnull().sum().sum()
    if missing_count > 0:
        print(f"\n  ⚠️  결측치: {missing_count}개")
        print(f"     결측치 처리: median imputation")
        rdkit_df = rdkit_df.fillna(rdkit_df.median())

    # Append the new columns to the existing data
    df_train_enhanced = pd.concat([df_train, rdkit_df], axis=1)

    print(f"\n  최종 Train 데이터: {df_train_enhanced.shape}")
    print(f"    기존: {df_train.shape}")
    print(f"    추가: {rdkit_df.shape[1]}개 컬럼")

    # Save
    df_train_enhanced.to_csv('train_with_rdkit.csv', index=False)
    print(f"\n✓ 저장: train_with_rdkit.csv")

# ============================================================
# 2. Add RDKit descriptors to the test data
# ============================================================
print(f"\n{'='*70}")
print("Test 데이터 처리")
print(f"{'='*70}")

df_test = pd.read_csv('predict_input.csv')

# Locate the SMILES column, falling back to the first column
if 'SMILES' in df_test.columns:
    test_smiles = df_test['SMILES']
elif 'smiles' in df_test.columns:
    test_smiles = df_test['smiles']
else:
    test_smiles = df_test[df_test.columns[0]]  # try the first column

print(f"\n  SMILES: {len(test_smiles)}개")

# Compute the RDKit descriptors, one row per molecule
print(f"  RDKit Descriptor 계산 중...")
rdkit_descriptors_test_list = []

for idx, smiles in enumerate(test_smiles):
    if idx % 100 == 0:
        print(f"\r    진행: {idx}/{len(test_smiles)}", end='')

    desc = calculate_rdkit_descriptors(smiles)
    # A failed molecule comes back as None. A None entry among dict rows
    # breaks the list-of-dicts DataFrame constructor, so store an empty dict
    # instead: it becomes an all-NaN row that is imputed below.
    rdkit_descriptors_test_list.append(desc if desc is not None else {})

print(f"\r    ✓ 완료: {len(test_smiles)}개")

# Convert to a DataFrame that shares df_test's index so the column-wise
# concat below lines rows up one-to-one.
rdkit_test_df = pd.DataFrame(rdkit_descriptors_test_list, index=df_test.index)

# Impute missing values with the per-column median.
# NOTE(review): this uses medians of the test set itself rather than those of
# the train set -- confirm that is intended.
if rdkit_test_df.isnull().sum().sum() > 0:
    rdkit_test_df = rdkit_test_df.fillna(rdkit_test_df.median())

# Append the new columns to the existing data
df_test_enhanced = pd.concat([df_test, rdkit_test_df], axis=1)

print(f"\n  최종 Test 데이터: {df_test_enhanced.shape}")

# Save
df_test_enhanced.to_csv('predict_input_with_rdkit.csv', index=False)
print(f"\n✓ 저장: predict_input_with_rdkit.csv")

# ============================================================
# 3. Correlation analysis (redundancy check)
# ============================================================
print(f"\n{'='*70}")
print("기존 Descriptor와 상관관계 분석")
print(f"{'='*70}")

if train_smiles is not None:
    existing_desc = ['MolWt', 'clogp', 'sa_score', 'qed']

    print(f"\n[상관관계 매트릭스]")

    # Existing + new descriptors
    all_descriptors = existing_desc + list(rdkit_df.columns)
    correlation_df = df_train_enhanced[all_descriptors].corr()

    # Collect every unordered pair with |r| > 0.8
    high_corr_pairs = []

    for i, col1 in enumerate(all_descriptors):
        for col2 in all_descriptors[i+1:]:
            corr = correlation_df.loc[col1, col2]
            if abs(corr) > 0.8:
                high_corr_pairs.append((col1, col2, corr))

    # Strongest correlations first, so the 10 pairs printed below really are
    # the top 10 rather than the first 10 in column order.
    high_corr_pairs.sort(key=lambda pair: abs(pair[2]), reverse=True)

    if high_corr_pairs:
        print(f"\n  ⚠️  높은 상관관계 발견 (|r| > 0.8):")
        for col1, col2, corr in high_corr_pairs[:10]:  # top 10 by |r|
            print(f"    {col1} ↔ {col2}: {corr:.3f}")
        print(f"\n  총 {len(high_corr_pairs)}쌍")
        print(f"  권장: 한 쪽 제거하여 다중공선성 방지")
    else:
        print(f"\n  ✓ 높은 상관관계 없음 - 모두 독립적")

# ============================================================
# 4. Expected performance improvement
# ============================================================
print(f"\n{'='*70}")
print("예상 효과")
print(f"{'='*70}")

# rdkit_df only exists when the train data had a SMILES column; fall back to
# the test-side descriptor table (always built) to avoid a NameError here.
if train_smiles is not None:
    n_rdkit_features = len(rdkit_df.columns)
else:
    n_rdkit_features = len(rdkit_test_df.columns)

print(f"\n[RDKit Descriptor 추가 효과]")
print(f"  기존 피처: 3076개 (Fingerprint 3072 + Descriptor 4)")
print(f"  추가 피처: {n_rdkit_features}개 (RDKit Descriptors)")
print(f"  최종 피처: {3076 + n_rdkit_features}개")

print(f"\n[예상 성능 향상]")
print(f"  현재 F1: 0.8300~0.8306")
print(f"  예상 F1: 0.8350~0.8400 (+0.5~1.0%)")
print(f"  근거:")
print(f"    - Domain knowledge 활용")
print(f"    - 화학적 의미 있는 피처")
print(f"    - 독성과 직접 관련")

print(f"\n✓ RDKit Descriptor 생성 완료")
print(f"  다음: train_with_rdkit.csv로 모델 재학습")
