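# 0. 코드 설명

prepared_data.csv를 불러와 범주형 컬럼을 정수로 인코딩한 뒤, RandomForestClassifier로 tool_condition, machining_finalized, passed_visual_inspection 세 가지 타깃을 각각 학습·평가하고 GridSearchCV로 하이퍼 파라미터를 튜닝합니다.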

# 1. 라이브러리 불러오기

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
import os

# 2. 데이터 불러오기

In [6]:
# Read the prepared dataset (the path is relative to the notebook's working
# directory), report its dimensions and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="../../data/processed/prepared_data.csv")
print(df.shape)
df.head(n=5)

(32048, 54)


Unnamed: 0,X_ActualPosition,X_ActualVelocity,X_ActualAcceleration,X_SetPosition,X_SetVelocity,X_SetAcceleration,X_CurrentFeedback,X_DCBusVoltage,X_OutputCurrent,X_OutputVoltage,...,M_CURRENT_PROGRAM_NUMBER,M_sequence_number,M_CURRENT_FEEDRATE,Machining_Process,exp_num,feedrate,clamp_pressure,tool_condition,machining_finalized,passed_visual_inspection
0,202.0,4.0,4.0,202.0,4.0,4.0,0.18,0.0207,329,2.77,...,1,0,50,Prep,1,6,4.0,unworn,yes,yes
1,202.0,-6.8,-346.0,202.0,-9.6,-354.0,-10.9,0.186,328,23.3,...,1,4,50,Prep,1,6,4.0,unworn,yes,yes
2,200.0,-13.8,-2.25,200.0,-13.9,3.999905,-8.59,0.14,328,30.6,...,1,7,50,Prep,1,6,4.0,unworn,yes,yes
3,198.0,-14.0,4.0,198.0,-13.9,3.999905,-6.11,0.13,327,30.3,...,1,7,50,Prep,1,6,4.0,unworn,yes,yes
4,197.0,-13.9,-14.8,196.0,-13.9,4.000095,-5.7,0.114,328,30.5,...,1,7,50,Prep,1,6,4.0,unworn,yes,yes


# 3. Feature Engineering

In [9]:
# Build the modelling table on a copy so the raw `df` stays untouched.
feature_df = df.copy()

# Integer-encode every string-valued column: one process feature plus the
# three prediction targets. LabelEncoder numbers the classes in sorted order.
# int8 assumes fewer than 128 distinct values per column - TODO confirm.
for col in ['Machining_Process', 'tool_condition',
            'machining_finalized', 'passed_visual_inspection']:
    feature_df[col] = LabelEncoder().fit_transform(feature_df[col]).astype(np.int8)

# exp_num is removed from the model inputs (presumably because it identifies
# the experiment a row came from).
feature_df.drop(['exp_num'], axis=1, inplace=True)

# 'Unnamed: 0' is presumably the row index saved by an earlier to_csv call
# (pandas gives a header-less first column this name). Left in place it is
# used as a model input and can leak row order, so drop it as well.
# errors='ignore' keeps the cell working if the CSV is regenerated without it.
feature_df.drop(['Unnamed: 0'], axis=1, inplace=True, errors='ignore')
feature_df.head(3)

Unnamed: 0,X_ActualPosition,X_ActualVelocity,X_ActualAcceleration,X_SetPosition,X_SetVelocity,X_SetAcceleration,X_CurrentFeedback,X_DCBusVoltage,X_OutputCurrent,X_OutputVoltage,...,S_SystemInertia,M_CURRENT_PROGRAM_NUMBER,M_sequence_number,M_CURRENT_FEEDRATE,Machining_Process,feedrate,clamp_pressure,tool_condition,machining_finalized,passed_visual_inspection
0,202.0,4.0,4.0,202.0,4.0,4.0,0.18,0.0207,329,2.77,...,16.0,1,0,50,7,6,4.0,0,1,1
1,202.0,-6.8,-346.0,202.0,-9.6,-354.0,-10.9,0.186,328,23.3,...,16.0,1,4,50,7,6,4.0,0,1,1
2,200.0,-13.8,-2.25,200.0,-13.9,3.999905,-8.59,0.14,328,30.6,...,16.0,1,7,50,7,6,4.0,0,1,1


# 4. Modeling


## Case1 : Tool Condition

In [48]:
# Case 1 split: the inputs are every column except the three label columns;
# the label to predict is tool_condition.
train_data = feature_df.drop(
    columns=['tool_condition', 'machining_finalized', 'passed_visual_inspection']
)
target_data = feature_df['tool_condition']

# 80/20 hold-out split, stratified on the label, with a fixed seed.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    train_data,
    target_data,
    test_size=0.2,
    random_state=0,
    stratify=target_data,
)

In [49]:
# Baseline: random forest trained with default hyper-parameters.
# random_state is pinned (as for the tuned models in this notebook) so the
# baseline accuracies are reproducible from run to run.
forest = RandomForestClassifier(random_state=0)
forest.fit(X_train, Y_train)

# "test set" below refers to the hold-out split X_valid / Y_valid.
print("training set Accuracy: ",forest.score(X_train, Y_train))
print("test set Accuracy: ",forest.score(X_valid, Y_valid))

training set Accuracy:  0.9406739995319447
test set Accuracy:  0.8731669266770671


### RF Parameter 조절
GridSearchCV를 통한 하이퍼 파라미터 튜닝

In [50]:
# Candidate hyper-parameter values: 2 * 4 * 3 * 3 = 72 combinations.
params = {
    'n_estimators': [10, 100],
    'max_depth': [6, 8, 10, 12],
    'min_samples_leaf': [8, 12, 18],
    'min_samples_split': [8, 16, 20],
}

# Exhaustive search with 3-fold cross-validation over a seeded random forest.
rf_clf = RandomForestClassifier(n_jobs=-1, random_state=0)
grid_cv = GridSearchCV(estimator=rf_clf, param_grid=params, n_jobs=-1, cv=3)
grid_cv.fit(X_train, Y_train)

# Report the winning combination and its mean cross-validated score.
print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터:  {'max_depth': 8, 'min_samples_leaf': 8, 'min_samples_split': 20, 'n_estimators': 100}
최고 예측 정확도: 0.9231


In [51]:
# Retrain with the best hyper-parameters found by the grid search above and
# measure accuracy on the hold-out split.
# The values are read from grid_cv.best_params_ instead of being copied by
# hand, so this cell cannot drift from the search result when the grid or the
# data change.
# NOTE(review): relies on grid_cv / X_train / Y_train still belonging to
# Case 1; these names are reused by every case, so run the cells in order.
rf_clf_tc = RandomForestClassifier(**grid_cv.best_params_,
                                   random_state = 0,
                                   n_jobs = -1)
rf_clf_tc.fit(X_train, Y_train)
pred = rf_clf_tc.predict(X_valid)
# accuracy_score(y_true, y_pred): fraction of predictions equal to the true labels.
print('예측 정확도: {:.4f}'.format(accuracy_score(Y_valid, pred)))

예측 정확도: 0.9250


In [52]:
# Accuracy of the tuned model rf_clf_tc on the training and hold-out splits.
train_acc = rf_clf_tc.score(X_train, Y_train)
valid_acc = rf_clf_tc.score(X_valid, Y_valid)
print("training set Accuracy: ", train_acc)
print("test set Accuracy: ", valid_acc)

training set Accuracy:  0.927100397846946
test set Accuracy:  0.9249609984399376


## Case2 : Machining Finalized

In [53]:
# Case 2 split: same inputs as the other cases (all columns except the three
# label columns); the label to predict is machining_finalized.
train_data = feature_df.drop(
    columns=['tool_condition', 'machining_finalized', 'passed_visual_inspection']
)
target_data = feature_df['machining_finalized']

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    train_data,
    target_data,
    test_size=0.2,
    random_state=0,
    stratify=target_data,
)

In [54]:
# Baseline: random forest trained with default hyper-parameters.
# random_state is pinned (as for the tuned models in this notebook) so the
# baseline result is reproducible from run to run.
forest = RandomForestClassifier(random_state=0)
forest.fit(X_train, Y_train)

RandomForestClassifier()

In [55]:
# Accuracy of the baseline forest on the training and hold-out splits.
train_acc = forest.score(X_train, Y_train)
valid_acc = forest.score(X_valid, Y_valid)
print("training set Accuracy: ", train_acc)
print("test set Accuracy: ", valid_acc)

training set Accuracy:  1.0
test set Accuracy:  0.9984399375975039


### RF Parameter 조절
GridSearchCV를 통한 하이퍼 파라미터 튜닝

In [20]:
# Hyper-parameter grid to search (72 combinations in total).
params = {
    'n_estimators': [10, 100],
    'max_depth': [6, 8, 10, 12],
    'min_samples_leaf': [8, 12, 18],
    'min_samples_split': [8, 16, 20],
}

# Seeded random forest evaluated on every grid point with 3-fold CV.
rf_clf = RandomForestClassifier(n_jobs=-1, random_state=0)
grid_cv = GridSearchCV(estimator=rf_clf, param_grid=params, n_jobs=-1, cv=3)
grid_cv.fit(X_train, Y_train)

# Best combination and its mean cross-validated score.
print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터:  {'max_depth': 12, 'min_samples_leaf': 8, 'min_samples_split': 8, 'n_estimators': 100}
최고 예측 정확도: 0.9957


In [29]:
# Retrain with the best hyper-parameters found by the grid search above and
# measure accuracy on the hold-out split.
# The values are read from grid_cv.best_params_ instead of being copied by
# hand, so this cell cannot drift from the search result when the grid or the
# data change.
# NOTE(review): X_train / Y_train / X_valid / Y_valid / grid_cv are shared by
# all three cases. Re-run the Case 2 split and grid-search cells immediately
# before this one; otherwise rf_clf_mf is fitted on another case's target.
rf_clf_mf = RandomForestClassifier(**grid_cv.best_params_,
                                   random_state = 0,
                                   n_jobs = -1)
rf_clf_mf.fit(X_train, Y_train)
pred = rf_clf_mf.predict(X_valid)
# accuracy_score(y_true, y_pred): fraction of predictions equal to the true labels.
print('예측 정확도: {:.4f}'.format(accuracy_score(Y_valid, pred)))

예측 정확도: 0.9991


In [22]:
# Accuracy of the tuned model rf_clf_mf on the training and hold-out splits.
train_acc = rf_clf_mf.score(X_train, Y_train)
valid_acc = rf_clf_mf.score(X_valid, Y_valid)
print("training set Accuracy: ", train_acc)
print("test set Accuracy: ", valid_acc)

training set Accuracy:  0.9888836882752164
test set Accuracy:  0.9859594383775351


## Case3 : Passed Visual Inspection

In [23]:
# Case 3 split: inputs are all columns except the three label columns;
# the label to predict is passed_visual_inspection.
train_data = feature_df.drop(
    columns=['tool_condition', 'machining_finalized', 'passed_visual_inspection']
)
target_data = feature_df['passed_visual_inspection']

# Hold out 20% of the rows, keeping the label proportions, with a fixed seed.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    train_data,
    target_data,
    test_size=0.2,
    random_state=0,
    stratify=target_data,
)

In [24]:
# Baseline: random forest trained with default hyper-parameters.
# random_state is pinned (as for the tuned models in this notebook) so the
# baseline result is reproducible from run to run.
forest = RandomForestClassifier(random_state=0)
forest.fit(X_train, Y_train)

RandomForestClassifier()

In [25]:
# Accuracy of the baseline forest on the training and hold-out splits.
train_acc = forest.score(X_train, Y_train)
valid_acc = forest.score(X_valid, Y_valid)
print("training set Accuracy: ", train_acc)
print("test set Accuracy: ", valid_acc)

training set Accuracy:  1.0
test set Accuracy:  0.9998439937597504


### RF Parameter 조절
GridSearchCV를 통한 하이퍼 파라미터 튜닝

In [26]:
# Grid of hyper-parameter values to try (2 * 4 * 3 * 3 = 72 combinations).
params = {
    'n_estimators': [10, 100],
    'max_depth': [6, 8, 10, 12],
    'min_samples_leaf': [8, 12, 18],
    'min_samples_split': [8, 16, 20],
}

# Create the seeded classifier, then run the 3-fold cross-validated search.
rf_clf = RandomForestClassifier(n_jobs=-1, random_state=0)
grid_cv = GridSearchCV(estimator=rf_clf, param_grid=params, n_jobs=-1, cv=3)
grid_cv.fit(X_train, Y_train)

# Best combination and its mean cross-validated score.
print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터:  {'max_depth': 12, 'min_samples_leaf': 8, 'min_samples_split': 20, 'n_estimators': 100}
최고 예측 정확도: 0.9984


In [30]:
# Retrain with the best hyper-parameters found by the grid search above and
# measure accuracy on the hold-out split.
# The values are read from grid_cv.best_params_ instead of being copied by
# hand, so this cell cannot drift from the search result when the grid or the
# data change.
# NOTE(review): relies on grid_cv / X_train / Y_train still belonging to
# Case 3; these names are reused by every case, so run the cells in order.
rf_clf_pvi = RandomForestClassifier(**grid_cv.best_params_,
                                    random_state = 0,
                                    n_jobs = -1)
rf_clf_pvi.fit(X_train, Y_train)
pred = rf_clf_pvi.predict(X_valid)
# accuracy_score(y_true, y_pred): fraction of predictions equal to the true labels.
print('예측 정확도: {:.4f}'.format(accuracy_score(Y_valid, pred)))

예측 정확도: 0.9991


In [28]:
# Accuracy of the tuned model rf_clf_pvi on the training and hold-out splits.
train_acc = rf_clf_pvi.score(X_train, Y_train)
valid_acc = rf_clf_pvi.score(X_valid, Y_valid)
print("training set Accuracy: ", train_acc)
print("test set Accuracy: ", valid_acc)

training set Accuracy:  0.9951244246821125
test set Accuracy:  0.9954758190327613
