In [2]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, roc_curve

In [3]:
# Read the train and test feature tables from the local data folder.
df_train, df_test = (
    pd.read_csv('./data/df_train.csv'),
    pd.read_csv('./data/df_test.csv'),
)

In [4]:
# Display the training frame's column index to see which features and label column are present.
df_train.columns

Index(['title', 'month', 'type_movie', 'type_tv', 'runtime', 'season_count',
       'episode_count', 'language', 'production_company_is_missing',
       'director_is_missing',
       ...
       'country_PL', 'country_RU', 'country_SE', 'country_TH', 'country_TR',
       'country_TW', 'country_US', 'country_US, GB', 'country_others',
       'success_label'],
      dtype='object', length=201)

In [5]:
# Show (rows, columns) for the train split, then the test split, one per line.
print(df_train.shape, df_test.shape, sep='\n')

(63278, 201)
(1477, 200)


In [8]:
# 1. Prepare X, y with 'title' and 'language' removed from the features.
#    The same columns are dropped from BOTH splits so that X_test has the same
#    feature schema as X; previously X_test was a raw copy of df_test and still
#    carried 'title' and 'language', which the model is never trained on.
X = df_train.drop(columns=['success_label', 'title', 'language'])
y = df_train['success_label']
# errors='ignore': the test table has one column fewer than the training table
# (presumably it ships without 'success_label'), so a missing label must not raise.
# DataFrame.drop returns a new frame, so df_test itself is left unmodified.
X_test = df_test.drop(columns=['success_label', 'title', 'language'], errors='ignore')

# 2. Sanitize special characters in column names
def clean_column_names(df):
    """Return a copy of ``df`` whose column labels have separator characters replaced.

    Every comma, whitespace character, forward slash and backslash in a column
    label is replaced by a single underscore, character by character
    (e.g. ``'country_US, GB'`` becomes ``'country_US__GB'``).

    The input frame is not modified: the relabelling is applied to a copy, so
    calling this on a frame that is displayed or reused elsewhere in the
    notebook cannot change that frame's columns behind the reader's back.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same data and the sanitized column labels.
    """
    cleaned = df.copy()
    cleaned.columns = cleaned.columns.str.replace(r'[,\s/\\]', '_', regex=True)
    return cleaned

# Run the column-name sanitizer over the training features and the test features alike.
X, X_test = (clean_column_names(frame) for frame in (X, X_test))

# 3. Train/validation split: hold out 25% of the rows for validation,
#    stratified on y so both folds keep the same label proportions.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=26,
)

In [9]:
# XGBoost classifier configuration, grouped by what each setting controls.
xgb_model = xgb.XGBClassifier(
    # Ensemble size and per-tree complexity
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    # Row / column subsampling applied when building each tree
    subsample=0.8,
    colsample_bytree=0.8,
    # Histogram-based tree construction
    tree_method="hist",
    # Evaluation metric; NOTE(review): no eval_set is passed to fit() in this
    # notebook, so confirm whether this setting has any visible effect here.
    eval_metric='logloss',
    # Fixed seed so the subsampling is reproducible
    random_state=42,
)

# Train on the training fold only; the validation fold stays unseen until scoring.
xgb_model.fit(X_train, y_train)

# Positive-class (label 1) probabilities come from column 1 of predict_proba;
# the hard 0/1 labels come from predict.
y_valid_prob_xgb = xgb_model.predict_proba(X_valid)[:, 1]
y_valid_pred = xgb_model.predict(X_valid)

In [10]:
# Compute the validation metrics first ...
acc = accuracy_score(y_valid, y_valid_pred)            # accuracy on hard labels
f1 = f1_score(y_valid, y_valid_pred)                   # F1-score (binary)
roc_auc = roc_auc_score(y_valid, y_valid_prob_xgb)     # ROC-AUC (probability-based)

# ... then report them in a fixed order.
print(f"Validation Accuracy: {acc:.4f}")
print(f"Validation F1-score: {f1:.4f}")
print(f"Validation ROC-AUC: {roc_auc:.4f}")

# More detailed per-class report (precision / recall / f1 / support).
print(classification_report(y_valid, y_valid_pred))


Validation Accuracy: 0.8442
Validation F1-score: 0.5205
Validation ROC-AUC: 0.8477
              precision    recall  f1-score   support

           0       0.86      0.96      0.91     12553
           1       0.71      0.41      0.52      3267

    accuracy                           0.84     15820
   macro avg       0.79      0.68      0.71     15820
weighted avg       0.83      0.84      0.83     15820

