In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, roc_curve

In [2]:
df_train = pd.read_csv('./data/df_train.csv')
df_test = pd.read_csv('./data/df_test.csv')

In [3]:
df_train.columns

Index(['title', 'month', 'type_movie', 'type_tv', 'runtime', 'season_count',
       'episode_count', 'language', 'production_company_is_missing',
       'director_is_missing',
       ...
       'country_PL', 'country_RU', 'country_SE', 'country_TH', 'country_TR',
       'country_TW', 'country_US', 'country_US, GB', 'country_others',
       'success_label'],
      dtype='object', length=201)

In [4]:
print(df_train.shape)
print(df_test.shape)

(63278, 201)
(1477, 200)


In [5]:
# 1. X, y 준비 (title, language 제거한 상태)
X = df_train.drop(columns=['success_label', 'title', 'language'])
y = df_train['success_label']
X_test = df_test.copy()

# 2. 컬럼명 특수문자 처리
def clean_column_names(df):
    df.columns = df.columns.str.replace(r'[,\s/\\]', '_', regex=True)
    return df

X = clean_column_names(X)
X_test = clean_column_names(X_test)

# 3. train/valid split
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.25, random_state=26, stratify=y
)

# 4. LGBM 모델 학습
lgb_model = lgb.LGBMClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

lgb_model.fit(X_train, y_train)
y_valid_pred = lgb_model.predict(X_valid)
y_valid_prob_lgb = lgb_model.predict_proba(X_valid)[:, 1]


[LightGBM] [Info] Number of positive: 9802, number of negative: 37656
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017803 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26697
[LightGBM] [Info] Number of data points in the train set: 47458, number of used features: 197
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.206541 -> initscore=-1.345906
[LightGBM] [Info] Start training from score -1.345906


In [6]:
# 정확도
acc = accuracy_score(y_valid, y_valid_pred)
print(f"Validation Accuracy: {acc:.4f}")

# F1-score (binary 기준)
f1 = f1_score(y_valid, y_valid_pred)
print(f"Validation F1-score: {f1:.4f}")

# ROC-AUC (확률 기반 평가)
roc_auc = roc_auc_score(y_valid, y_valid_prob_lgb)
print(f"Validation ROC-AUC: {roc_auc:.4f}")

# 좀 더 자세한 리포트
print(classification_report(y_valid, y_valid_pred))


Validation Accuracy: 0.8446
Validation F1-score: 0.5208
Validation ROC-AUC: 0.8482
              precision    recall  f1-score   support

           0       0.86      0.96      0.91     12553
           1       0.72      0.41      0.52      3267

    accuracy                           0.84     15820
   macro avg       0.79      0.68      0.71     15820
weighted avg       0.83      0.84      0.83     15820



In [7]:
from sklearn.metrics import roc_curve

# 1) valid 데이터 예측 확률 얻기
y_valid_probs = lgb_model.predict_proba(X_valid)[:, 1]

# 2) ROC 커브에서 임계값, TPR, FPR 구하기
fpr, tpr, thresholds = roc_curve(y_valid, y_valid_probs)

# 3) 최적 임계값 찾기 (예: Youden’s J statistic 최대값)
optimal_idx = (tpr - fpr).argmax()
optimal_threshold = thresholds[optimal_idx]
print(f"Optimal Threshold: {optimal_threshold:.3f}")

# 4) 임계값 기준으로 예측 변경
y_valid_pred_new = (y_valid_probs >= optimal_threshold).astype(int)

# 5) 성능 평가 다시 하기
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print("Accuracy:", accuracy_score(y_valid, y_valid_pred_new))
print("Precision:", precision_score(y_valid, y_valid_pred_new))
print("Recall:", recall_score(y_valid, y_valid_pred_new))
print("F1 Score:", f1_score(y_valid, y_valid_pred_new))
print(classification_report(y_valid, y_valid_pred_new))


Optimal Threshold: 0.210
Accuracy: 0.7753476611883692
Precision: 0.47295004712535343
Recall: 0.7679828588919498
F1 Score: 0.5853943070461969
              precision    recall  f1-score   support

           0       0.93      0.78      0.85     12553
           1       0.47      0.77      0.59      3267

    accuracy                           0.78     15820
   macro avg       0.70      0.77      0.72     15820
weighted avg       0.83      0.78      0.79     15820



In [8]:
lgb_model_tuned = lgb.LGBMClassifier(
    n_estimators=500,   # 100 → 500으로 늘림
    learning_rate=0.01, # 0.1 → 0.01로 낮춤
    max_depth=8,        # 기존 6 → 8로 조절
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

lgb_model_tuned.fit(X_train, y_train)

y_valid_probs_tuned = lgb_model_tuned.predict_proba(X_valid)[:, 1]
y_valid_pred_tuned = (y_valid_probs_tuned >= 0.5).astype(int)  # 기본 threshold 0.5

print("Tuned Model Performance")
print("Accuracy:", accuracy_score(y_valid, y_valid_pred_tuned))
print("Precision:", precision_score(y_valid, y_valid_pred_tuned))
print("Recall:", recall_score(y_valid, y_valid_pred_tuned))
print(classification_report(y_valid, y_valid_pred_new))
print("F1 Score:", f1_score(y_valid, y_valid_pred_tuned))


[LightGBM] [Info] Number of positive: 9802, number of negative: 37656
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014098 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26697
[LightGBM] [Info] Number of data points in the train set: 47458, number of used features: 197
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.206541 -> initscore=-1.345906
[LightGBM] [Info] Start training from score -1.345906
Tuned Model Performance
Accuracy: 0.8421618204804046
Precision: 0.7267373380447585
Recall: 0.37771655953474137
              precision    recall  f1-score   support

           0       0.93      0.78      0.85     12553
           1       0.47      0.77      0.59      3267

    accuracy                           0.78     15820
   macro avg       0.70      0.77      0.72     15820
weighted avg       0.83      0.78      0.79     15820

F1 Score: 0.49707955689828803


In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, recall_score, precision_score, roc_auc_score
import lightgbm as lgb

# 1. 데이터 준비
X = df_train.drop(columns=['success_label', 'title', 'language'])
y = df_train['success_label']
X_test = df_test.copy()

# 2. 특수문자 컬럼명 처리
def clean_column_names(df):
    df.columns = df.columns.str.replace(r'[,\s/\\]', '_', regex=True)
    return df

X = clean_column_names(X)
X_test = clean_column_names(X_test)

# 3. train/validation 분할
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.25, random_state=26, stratify=y
)

# 4. 기본 모델 (Model A)
model_base = lgb.LGBMClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
model_base.fit(X_train, y_train)
proba_base = model_base.predict_proba(X_valid)[:, 1]

# 5. 튜닝 모델 (Model B)
model_tuned = lgb.LGBMClassifier(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.7,
    colsample_bytree=0.7,
    random_state=42
)
model_tuned.fit(X_train, y_train)
proba_tuned = model_tuned.predict_proba(X_valid)[:, 1]

# 6. Threshold 모델은 base 모델 기반으로 threshold만 조정 (Model C == Model A)
proba_thresh = proba_base.copy()  # 따로 훈련 안 함

# 7. 앙상블 (soft voting)
final_proba = (proba_base + proba_thresh + proba_tuned) / 3

# 8. Threshold 설정 및 예측
threshold = 0.21
final_preds = (final_proba >= threshold).astype(int)

# 9. 평가 출력
print("== Ensemble Results ==")
print(classification_report(y_valid, final_preds, digits=4))
print("Accuracy:", np.mean(final_preds == y_valid))
print("Precision:", precision_score(y_valid, final_preds))
print("Recall:", recall_score(y_valid, final_preds))
print("F1 Score:", f1_score(y_valid, final_preds))
print("ROC-AUC:", roc_auc_score(y_valid, final_proba))  # 확률 기반

# (선택) 추후 X_test에도 final_proba 적용 가능
# test_proba_base = model_base.predict_proba(X_test)[:, 1]
# test_proba_tuned = model_tuned.predict_proba(X_test)[:, 1]
# test_final_proba = (test_proba_base + test_proba_base + test_proba_tuned) / 3
# test_preds = (test_final_proba >= threshold).astype(int)


[LightGBM] [Info] Number of positive: 9802, number of negative: 37656
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016886 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26697
[LightGBM] [Info] Number of data points in the train set: 47458, number of used features: 197
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.206541 -> initscore=-1.345906
[LightGBM] [Info] Start training from score -1.345906
[LightGBM] [Info] Number of positive: 9802, number of negative: 37656
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016230 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26697
[LightGBM] [Info] Number of data points in the train set: 47458, number of used features: 197
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.206541 -> initscore=-1.345906
[LightGBM] [Info] Start training from score -1.345906
== Ensembl