In [115]:
import numpy as np
import pandas as pd

np.random.seed(1234)

n_samples = 1000

df = pd.DataFrame({
    'temperature': np.random.normal(70, 10, n_samples),         # 연속형
    'pressure': np.random.normal(30, 5, n_samples),             # 연속형
    'vibration': np.random.normal(0.5, 0.1, n_samples),         # 연속형
})

# 실패 종류 (타겟): 온도와 진동이 높고 압력이 낮으면 실패 확률 증가
df['failure'] = np.where(
    (df['temperature'] > 65) & (df['vibration'] > 0.55) & (df['pressure'] < 38),
    "Fail",
    "Not Fail"
)

# failure가 Fail인 데이터 중 30%는 Unknown으로 변경 (noise)
zero_idx = df.index[df['failure'] == 'Fail']
n_change = int(len(zero_idx) * 0.3)  
change_idx = np.random.choice(zero_idx, size=n_change, replace=False)
df.loc[change_idx, 'failure'] = 'Unknown'

df.head()

Unnamed: 0,temperature,pressure,vibration,failure
0,74.714352,22.08896,0.381296,Not Fail
1,58.090243,21.899049,0.661708,Not Fail
2,84.32707,30.232809,0.495732,Not Fail
3,66.873481,21.600856,0.536798,Not Fail
4,62.794113,36.979462,0.680919,Not Fail


In [116]:
df['failure'].value_counts()

Not Fail    797
Fail        143
Unknown      60
Name: failure, dtype: int64

In [117]:
X = df.drop(['failure'], axis=1).copy()
y = df['failure'].copy()

In [118]:
from sklearn.metrics import recall_score

# 'binary' = 이진 분류 전용. pos_label(기본: 1) 클래스만 대상으로 재현율 계산
# 'micro' = 전체 TP, FN을 전부 합산해서 재현율 계산 (샘플 기준)
# 'macro' = 클래스별 재현율을 계산한 후 단순 평균 (클래스 간 가중치 없음)
# 'weighted' = 클래스별 재현율을 계산한 후, 샘플 수 기준 가중 평균
# 'samples' = 다중 레이블 문제(multi-label)에서 사용. 각 샘플에 대한 평균
# None = 클래스별 재현율을 리스트 형태로 반환

In [119]:
# XGB
from xgboost import XGBClassifier

# n_estimators = 결정 트리의 수
# max_depth = 최대 깊이
model1 = XGBClassifier(n_estimators=100, max_depth=4, random_state=1234)
model1.fit(X, y)
pred1 = model1.predict(X)
recall_score(y, pred1, average='macro')

0.9198912198912198

In [120]:
# RandomForest
from sklearn.ensemble import RandomForestClassifier

# n_estimators = 결정 트리의 수
# max_depth = 최대 깊이
model2 = RandomForestClassifier(n_estimators=100, max_depth=4, random_state=1234)
model2.fit(X, y)
pred2 = model2.predict(X)
recall_score(y, pred2, average='macro')

0.6722222222222222

In [121]:
# GB
from sklearn.ensemble import GradientBoostingClassifier

model3 = GradientBoostingClassifier(random_state=1234)
model3.fit(X, y)
pred3 = model3.predict(X)
recall_score(y, pred3, average='macro')

0.9777777777777779

In [122]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

model4 = KNeighborsClassifier(n_neighbors=5)
model4.fit(X, y)
pred4 = model4.predict(X)
recall_score(y, pred4, average='macro')

0.42435375860720903

In [123]:
# Voting
result = {'PRED1': pred1, 'PRED2': pred2, 'PRED3': pred3, 'PRED4': pred4}
pred = pd.DataFrame(result)
pred

Unnamed: 0,PRED1,PRED2,PRED3,PRED4
0,Not Fail,Not Fail,Not Fail,Not Fail
1,Not Fail,Not Fail,Not Fail,Not Fail
2,Not Fail,Not Fail,Not Fail,Not Fail
3,Not Fail,Not Fail,Not Fail,Not Fail
4,Not Fail,Not Fail,Not Fail,Not Fail
...,...,...,...,...
995,Unknown,Fail,Unknown,Not Fail
996,Not Fail,Not Fail,Not Fail,Not Fail
997,Not Fail,Not Fail,Not Fail,Not Fail
998,Not Fail,Not Fail,Not Fail,Not Fail


In [124]:
pred['PRED'] = pred[['PRED1', 'PRED2', 'PRED3', 'PRED4']].mode(axis=1).iloc[:, 0]
pred

Unnamed: 0,PRED1,PRED2,PRED3,PRED4,PRED
0,Not Fail,Not Fail,Not Fail,Not Fail,Not Fail
1,Not Fail,Not Fail,Not Fail,Not Fail,Not Fail
2,Not Fail,Not Fail,Not Fail,Not Fail,Not Fail
3,Not Fail,Not Fail,Not Fail,Not Fail,Not Fail
4,Not Fail,Not Fail,Not Fail,Not Fail,Not Fail
...,...,...,...,...,...
995,Unknown,Fail,Unknown,Not Fail,Unknown
996,Not Fail,Not Fail,Not Fail,Not Fail,Not Fail
997,Not Fail,Not Fail,Not Fail,Not Fail,Not Fail
998,Not Fail,Not Fail,Not Fail,Not Fail,Not Fail


In [126]:
recall_score(y, pred['PRED'], average='macro')

0.8722222222222222