In [45]:
import pandas as pd
import numpy as np

np.random.seed(1234)

n_samples = 100
study_hours = np.random.randint(1, 11, size=n_samples)       # 1~10시간 사이
sleep_hours = np.random.randint(4, 10, size=n_samples)        # 4~9시간 사이
exercise = np.random.randint(0, 2, size=n_samples)            # 0 또는 1

passed = (
    (study_hours >= 6).astype(int) +
    (sleep_hours >= 6).astype(int) +
    (exercise == 1).astype(int)
) >= 2  # 세 조건 중 2개 이상 만족 시 합격

passed = passed.astype(int)

df = pd.DataFrame({
    'study_hours': study_hours,
    'sleep_hours': sleep_hours,
    'exercise': exercise,
    'passed': passed
})

df['exercise'] = np.where(df['exercise'] == 0, 'NO', 'YES')

df.head()

Unnamed: 0,study_hours,sleep_hours,exercise,passed
0,4,8,YES,1
1,7,9,YES,1
2,6,5,NO,0
3,5,9,NO,0
4,9,7,YES,1


In [46]:
X = df[['study_hours', 'sleep_hours', 'exercise']]
y = df['passed']

In [47]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [48]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 75 entries, 15 to 51
Data columns (total 3 columns):
study_hours    75 non-null int32
sleep_hours    75 non-null int32
exercise       75 non-null object
dtypes: int32(2), object(1)
memory usage: 1.8+ KB


In [52]:
# categorize
X_train['exercise'] = X_train['exercise'].astype('category')
X_test['exercise'] = X_test['exercise'].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [53]:
from lightgbm import LGBMClassifier

# 모델 학습
# n_estimators = 랜덤 포레스트를 구성하는 트리의 수
model = LGBMClassifier(random_state=1234, verbose=-1)
model.fit(X_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=None, num_leaves=31, objective=None,
               random_state=1234, reg_alpha=0.0, reg_lambda=0.0, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0, verbose=-1)

In [54]:
# 예측
y_pred = model.predict(X_test)
y_pred

array([1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 1])

In [55]:
# 평가
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

print("Accuracy :", accuracy_score(y_test, y_pred))
print("Recall :", recall_score(y_test, y_pred))
print("Precision :", precision_score(y_test, y_pred))

# pos_label=1 => 1이 양성 클래스라고 간주 (default)
print("F1 Score:", f1_score(y_test, y_pred, pos_label=1))

Accuracy : 1.0
Recall : 1.0
Precision : 1.0
F1 Score: 1.0


In [56]:
# 변수 중요도 (Feature Importance)
model.feature_importances_

array([75, 47, 44])

In [57]:
imp_df = pd.DataFrame(model.feature_importances_, \
                      index=['study_hours', 'sleep_hours', 'exercise'], columns=['imp']).reset_index()
imp_df

Unnamed: 0,index,imp
0,study_hours,75
1,sleep_hours,47
2,exercise,44


In [58]:
# 중요한 변수 순서대로 정렬
imp_df.sort_values('imp', ascending=False)

Unnamed: 0,index,imp
0,study_hours,75
1,sleep_hours,47
2,exercise,44


In [59]:
# 각 샘플이 클래스에 속할 확률을 추정
probs = model.predict_proba(X_test)
probs

array([[2.95786008e-04, 9.99704214e-01],
       [9.95246942e-01, 4.75305767e-03],
       [8.95403256e-01, 1.04596744e-01],
       [9.39047597e-02, 9.06095240e-01],
       [9.36042524e-01, 6.39574757e-02],
       [1.47723974e-01, 8.52276026e-01],
       [3.36620227e-04, 9.99663380e-01],
       [8.95403256e-01, 1.04596744e-01],
       [8.95403256e-01, 1.04596744e-01],
       [4.39465598e-02, 9.56053440e-01],
       [1.31211109e-01, 8.68788891e-01],
       [9.95246942e-01, 4.75305767e-03],
       [1.09912615e-01, 8.90087385e-01],
       [9.39047597e-02, 9.06095240e-01],
       [9.95246942e-01, 4.75305767e-03],
       [2.95786008e-04, 9.99704214e-01],
       [3.60106751e-04, 9.99639893e-01],
       [1.47723974e-01, 8.52276026e-01],
       [4.40390392e-04, 9.99559610e-01],
       [1.39429841e-01, 8.60570159e-01],
       [9.36042524e-01, 6.39574757e-02],
       [3.36620227e-04, 9.99663380e-01],
       [8.11221108e-01, 1.88778892e-01],
       [1.09912615e-01, 8.90087385e-01],
       [1.099126

In [12]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_test, probs[:, 1])

1.0

In [13]:
roc_auc_score(probs[:, 1], y_test)

ValueError: continuous format is not supported