In [1]:
import pandas as pd
import numpy as np

# Seed the legacy global NumPy RNG so the synthetic data set is reproducible.
np.random.seed(1234)

n_samples = 100

# The three draws below consume the global random stream in this exact order;
# reordering them would change the generated data.
study_hours = np.random.randint(low=1, high=11, size=n_samples)  # integer hours in 1..10
sleep_hours = np.random.randint(low=4, high=10, size=n_samples)  # integer hours in 4..9
exercise = np.random.randint(low=0, high=2, size=n_samples)      # 0 or 1

# Count, per student, how many of the three criteria are satisfied.
criteria_met = np.sum(
    [
        study_hours >= 6,
        sleep_hours >= 6,
        exercise == 1,
    ],
    axis=0,
)

# A student passes (1) when at least two of the three criteria hold, else fails (0).
passed = (criteria_met >= 2).astype(int)

# Assemble the features and the label into a single frame.
df = pd.DataFrame(
    {
        'study_hours': study_hours,
        'sleep_hours': sleep_hours,
        'exercise': exercise,
        'passed': passed,
    }
)

df.head()

Unnamed: 0,study_hours,sleep_hours,exercise,passed
0,4,8,1,1
1,7,9,1,1
2,6,5,0,0
3,5,9,0,0
4,9,7,1,1


In [2]:
# Split the frame into the model inputs (X) and the pass/fail target (y).
feature_columns = ['study_hours', 'sleep_hours', 'exercise']
X = df[feature_columns]
y = df['passed']

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# train/test partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [4]:
from sklearn.ensemble import AdaBoostClassifier

# Train the model.
# n_estimators = maximum number of boosted weak learners (trees, with the default
# base estimator); learning_rate scales each learner's contribution.
model = AdaBoostClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=1234,
)
# Kept as the last expression so the notebook displays the fitted estimator.
model.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.1,
                   n_estimators=100, random_state=1234)

In [5]:
# Predict the class label (0 = fail, 1 = pass) for every held-out sample.
y_pred = model.predict(X=X_test)
y_pred

array([1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 1])

In [6]:
# Evaluation
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Each entry: (printed label, metric function, extra keyword arguments).
# pos_label=1 => class 1 is treated as the positive class (this is the default).
metric_table = [
    ("Accuracy :", accuracy_score, {}),
    ("Recall :", recall_score, {}),
    ("Precision :", precision_score, {}),
    ("F1 Score:", f1_score, {"pos_label": 1}),
]

# Report every metric on the held-out test set, in the order listed above.
for label, metric_fn, extra_kwargs in metric_table:
    print(label, metric_fn(y_test, y_pred, **extra_kwargs))

Accuracy : 1.0
Recall : 1.0
Precision : 1.0
F1 Score: 1.0


In [7]:
# Feature importance: display the fitted model's importance score for each
# input feature (one value per feature column the model was trained on).
model.feature_importances_

array([0.34, 0.32, 0.34])

In [8]:
# Tabulate the fitted model's feature importances next to the feature names.
# The names are read from X_train.columns (the frame the model was fitted on)
# instead of being typed out again, so every importance stays paired with the
# right feature even if the feature list is later changed or reordered.
# reset_index() moves the feature names into a regular column named 'index'.
imp_df = pd.DataFrame(
    model.feature_importances_,
    index=X_train.columns,
    columns=['imp'],
).reset_index()
imp_df

Unnamed: 0,index,imp
0,study_hours,0.34
1,sleep_hours,0.32
2,exercise,0.34


In [9]:
# Show the features ordered from most to least important.
imp_df.sort_values(by='imp', ascending=False)

Unnamed: 0,index,imp
0,study_hours,0.34
2,exercise,0.34
1,sleep_hours,0.32


In [10]:
# Estimate, for every held-out sample, the probability of belonging to each class.
# One row per sample; columns follow the order of model.classes_.
probs = model.predict_proba(X=X_test)
probs

array([[0.27386158, 0.72613842],
       [0.71449502, 0.28550498],
       [0.56368172, 0.43631828],
       [0.41581761, 0.58418239],
       [0.57962454, 0.42037546],
       [0.42215843, 0.57784157],
       [0.27386158, 0.72613842],
       [0.56368172, 0.43631828],
       [0.56368172, 0.43631828],
       [0.41336848, 0.58663152],
       [0.43370542, 0.56629458],
       [0.7070663 , 0.2929337 ],
       [0.40636   , 0.59364   ],
       [0.41581761, 0.58418239],
       [0.71449502, 0.28550498],
       [0.27386158, 0.72613842],
       [0.27386158, 0.72613842],
       [0.42215843, 0.57784157],
       [0.29674776, 0.70325224],
       [0.42215843, 0.57784157],
       [0.57962454, 0.42037546],
       [0.27386158, 0.72613842],
       [0.56119831, 0.43880169],
       [0.40636   , 0.59364   ],
       [0.40636   , 0.59364   ]])

In [37]:
from sklearn.metrics import roc_auc_score

# ROC AUC is computed from the second probability column (probs[:, 1]),
# used here as the score for the positive class (label 1).
roc_auc_score(y_true=y_test, y_score=probs[:, 1])

1.0