## 랜덤포레스트 모델 학습 및 성능 평가 

##### 학습/평가 데이터 분리

In [None]:
# Separate the measurement data (features) from the label (answer) column.
# Features are the columns at positions 1..4; the label is the last column.
X = df.iloc[:, 1:5].values
# Encode the label as 1 where it equals 'OK' and 0 otherwise, then flatten
# the (n, 1) column into a 1-D vector.
y = np.where(df.iloc[:, -1:].values == 'OK', 1, 0).ravel()
print(X.shape, y.shape)

(9383, 4) (9383,)


In [None]:
# Split into a training set and a test set: 30% of the samples are held out
# for testing, and random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Report the class balance of each split, one count per line:
# train label 1, train label 0, test label 1, test label 0.
for split_labels in (y_train, y_test):
    for label_value in (1, 0):
        print(len(split_labels[split_labels == label_value]))

5586
982
2382
433


In [None]:
# Build a balanced label vector: keep at most as many label-1 entries as
# there are label-0 entries (taking the first ones), then append every
# label-0 entry after them.
zero_labels = y_train[y_train == 0]
one_labels = y_train[y_train == 1][:len(zero_labels)]
new_y_train = np.concatenate([one_labels, zero_labels])
new_y_train

array([1, 1, 1, ..., 0, 0, 0])

In [None]:
# Build the matching balanced feature matrix: the first label-1 rows (at most
# as many as there are label-0 rows), followed by every label-0 row.
zero_mask = y_train == 0
zero_rows = X_train[zero_mask]
one_rows = X_train[y_train == 1][:len(y_train[zero_mask])]
new_X_train = np.concatenate([one_rows, zero_rows])
new_X_train

array([[  0.,   0., 526., 584.],
       [  0.,   0., 571., 526.],
       [  0.,   0., 294., 314.],
       ...,
       [  0.,   0., 581., 580.],
       [  1.,   1., 521., 303.],
       [  0.,   0., 558., 573.]])

##### 모델 성능 평가 - 1) 분류성능평가지표

> 정량평가: 오차행렬, 정확도, 정밀도, 재현율, F1 score, AUC 살펴보기

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve

In [None]:
def get_clf_eval(y_test=None, pred=None, pred_proba=None):
    """Print the confusion matrix and the main binary-classification metrics.

    Args:
        y_test: Ground-truth labels.
        pred: Predicted class labels; used for the confusion matrix,
            accuracy, precision, recall and F1.
        pred_proba: Optional scores for the positive class (for example
            ``clf.predict_proba(X)[:, 1]``). When supplied, ROC AUC is
            computed from these scores. When omitted, AUC falls back to the
            hard labels in ``pred``, exactly as this function did before.

    Returns:
        None. The results are written to standard output.
    """
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    # ROC AUC measures how well the scores rank positives above negatives.
    # Feeding it thresholded 0/1 predictions collapses the curve to a single
    # operating point, so prefer continuous scores whenever the caller has them.
    auc_scores = pred if pred_proba is None else pred_proba
    roc_auc = roc_auc_score(y_test, auc_scores)
    print('오차 행렬')
    print(confusion)
    print('정확도: {0: .4f}, 정밀도: {1: .4f}, 재현율: {2: .4f}, F1: {3: .4f}, AUC:{4: .4f}'.format(accuracy, precision, recall, f1, roc_auc))

> 정성평가: 그래프 살펴보기

In [None]:
import matplotlib.pyplot as plt
def draw_graph(y_test=None, pred=None):
    """Plot the actual labels and the predictions on one wide figure.

    Args:
        y_test: Sequence of actual values, drawn in blue as 'actual'.
        pred: Sequence of predicted values, drawn in red as 'prediction'.
    """
    plt.figure(figsize=(40, 9))
    series_to_draw = (
        (y_test, 'b', 'actual'),
        (pred, 'r', 'prediction'),
    )
    for values, line_format, legend_name in series_to_draw:
        plt.plot(values, line_format, label=legend_name)
    plt.legend()
    plt.show()

    # Alternative: view each series on its own figure.
    # plt.figure(figsize = (30,6)) # Plotting
    # plt.plot(y_test, 'b', label = 'actual')
    # plt.legend()
    # plt.show()
    # plt.figure(figsize=(30, 6))
    # plt.plot(pred, 'r', label = 'prediction')
    # plt.legend()
    # plt.show()

##### 모델 성능 평가 - 2) ROC curve와 case별로 살펴보기

In [None]:
def get_eval_2(y_test=None, pred=None, pred_proba=None):
    """Print and return the per-class hit rates and the ROC AUC.

    Label 0 is reported as "defective" and label 1 as "good" in the printed
    messages, so the two rates are read off the confusion matrix as
    TN / (TN + FP) and TP / (FN + TP) respectively.

    Args:
        y_test: Ground-truth binary labels.
        pred: Predicted class labels; used for the confusion matrix.
        pred_proba: Optional scores for the positive class (for example
            ``clf.predict_proba(X)[:, 1]``). When supplied, ROC AUC is
            computed from these scores. When omitted, AUC falls back to the
            hard labels in ``pred``, exactly as this function did before.

    Returns:
        A 3-tuple ``(rate_label0, rate_label1, roc_auc)``: the fraction of
        actual label-0 samples predicted as 0, the fraction of actual
        label-1 samples predicted as 1, and the ROC AUC.
    """
    confusion = confusion_matrix(y_test, pred)
    tn = confusion[0][0]
    fp = confusion[0][1]
    fn = confusion[1][0]
    tp = confusion[1][1]

    # Compute each rate once so the printed value and the returned value
    # cannot drift apart.
    defect_hit_rate = tn / (tn + fp)
    good_hit_rate = tp / (fn + tp)

    print("📌실제 불량일 때 불량으로 판별할 확률\n= P(실제불량)∩P(에측불량)/P(실제불량): {:.4f}".format(defect_hit_rate))
    # Complementary miss rates, currently disabled:
    # print("📌실제 불량일 때 양품으로 판별할 확률\n= P(실제불량)∩P(에측양품)/P(실제불량): {:.4f}".format(fp/(tn+fp)))
    # print("📌실제 양품일 때 불량으로 판별할 확률\n= P(실제양품)∩P(에측불량)/P(실제양품): {:.4f}".format(fn/(fn+tp)))
    print("📌실제 양품일 때 양품으로 판별할 확률\n= P(실제양품)∩P(에측양품)/P(실제양품): {:.4f}".format(good_hit_rate))

    # ROC AUC is a ranking metric; thresholded 0/1 predictions reduce the
    # curve to a single point, so use continuous scores when available.
    auc_scores = pred if pred_proba is None else pred_proba
    roc_auc = roc_auc_score(y_test, auc_scores)
    print('ROC_AUC:{:.4f}'.format(roc_auc))
    return (defect_hit_rate, good_hit_rate, roc_auc)

##### 모델 학습 및 성능 평가

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Train a random forest (32 trees, depth capped at 15, fixed seed) on the
# training split and predict the held-out test split.
# NOTE(review): the balanced new_X_train / new_y_train built earlier are not
# used here; the model is fit on the original X_train / y_train. Confirm
# that this is intended.
fr_clf = RandomForestClassifier(
    n_estimators=32,
    max_depth=15,
    random_state=29,
).fit(X_train, y_train)
fr_prediction = fr_clf.predict(X_test)

# Evaluate: summary metrics first, then the per-class rates and ROC AUC.
get_clf_eval(y_test, fr_prediction)
get_eval_2(y_test, fr_prediction)

칼럼별 중요도 보기

In [None]:
# Rank the fitted forest's feature importances and draw them as a horizontal
# bar chart, most important feature first.
names = [
    'MIXA_PASTEUR_STATE',
    'MIXB_PASTEUR_STATE',
    'MIXA_PASTEUR_TEMP',
    'MIXB_PASTEUR_TEMP',
]
fr_importances_values = fr_clf.feature_importances_
fr_importances = pd.Series(fr_importances_values, index=names)
fr_top = fr_importances.sort_values(ascending=False)

plt.figure(figsize=(6, 3))
sns.barplot(x=fr_top, y=fr_top.index)
plt.show()
# However, there are currently too few columns to train on only the
# important ones.

In [None]:
# List every misclassified test sample: its feature row, then the actual
# label followed by the predicted label.
for sample, actual, predicted in zip(X_test, y_test, fr_prediction):
    if predicted != actual:
        print(sample)
        print(actual, predicted)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Refit the same forest configuration with random_state 2..99 and record, per
# seed, the three values returned by get_eval_2: the label-0 hit rate (Pa),
# the label-1 hit rate (Pd) and the ROC AUC (Proc).
Pa_arr, Pd_arr, Proc_arr = [], [], []
for d in range(2, 100):
    fr_clf = RandomForestClassifier(
        n_estimators=32,
        max_depth=15,
        random_state=d,
    ).fit(X_train, y_train)
    fr_prediction = fr_clf.predict(X_test)

    # Evaluate this seed's model.
    print("---------------------ramdom_state가", d, "일 때-----------------------")
    get_clf_eval(y_test, fr_prediction)
    Pa, Pd, Proc = get_eval_2(y_test, fr_prediction)

    Pa_arr.append(Pa)
    Pd_arr.append(Pd)
    Proc_arr.append(Proc)

In [None]:
# Overlay the three per-seed histories collected by the random_state sweep.
plt.figure(figsize=(20, 10))
sweep_curves = (
    (Pa_arr, 'P(a)'),
    (Pd_arr, 'P(d)'),
    (Proc_arr, 'P(roc)'),
)
for history, curve_label in sweep_curves:
    plt.plot(history, label=curve_label)
plt.legend()