In [1]:
import pandas as pd

In [2]:
moldset_labeled = pd.read_csv('./data/labeled_data.csv', index_col=False)

In [3]:
moldset_labeled.drop(columns=['_id', 'TimeStamp', 'PART_FACT_PLAN_DATE',
                             'PART_FACT_SERIAL', 'EQUIP_CD', 'EQUIP_NAME', 'Reason',
                             'Mold_Temperature_1', 'Mold_Temperature_2','Barrel_Temperature_7','Switch_Over_Position',
                             'Mold_Temperature_5', 'Mold_Temperature_6','Mold_Temperature_7', 'Mold_Temperature_8',
                             'Mold_Temperature_9', 'Mold_Temperature_10', 'Mold_Temperature_11',
                             'Mold_Temperature_12'], inplace=True)

In [4]:
moldset_labeled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7996 entries, 0 to 7995
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   PART_NAME                 7996 non-null   object 
 1   PassOrFail                7996 non-null   object 
 2   Injection_Time            7996 non-null   float64
 3   Filling_Time              7996 non-null   float64
 4   Plasticizing_Time         7996 non-null   float64
 5   Cycle_Time                7996 non-null   float64
 6   Clamp_Close_Time          7996 non-null   float64
 7   Cushion_Position          7996 non-null   float64
 8   Plasticizing_Position     7996 non-null   float64
 9   Clamp_Open_Position       7996 non-null   float64
 10  Max_Injection_Speed       7996 non-null   float64
 11  Max_Screw_RPM             7996 non-null   float64
 12  Average_Screw_RPM         7996 non-null   float64
 13  Max_Injection_Pressure    7996 non-null   float64
 14  Max_Swit

In [5]:
moldset_labeled["PassOrFail"] = moldset_labeled["PassOrFail"].replace(to_replace="Y",value=1).replace(to_replace="N",value=0)

  moldset_labeled["PassOrFail"] = moldset_labeled["PassOrFail"].replace(to_replace="Y",value=1).replace(to_replace="N",value=0)


In [6]:
moldset_labeled["PART_NAME"].value_counts()
rg3_r = moldset_labeled[moldset_labeled["PART_NAME"] == "RG3 MOLD'G W/SHLD, RH"].copy()
rg3_l = moldset_labeled[moldset_labeled["PART_NAME"] == "RG3 MOLD'G W/SHLD, LH"].copy()

In [7]:
rg3_r.drop(columns=['PART_NAME'], inplace=True)
rg3_l.drop(columns=['PART_NAME'], inplace=True)

In [8]:
y = rg3_r['PassOrFail'].copy()
X = rg3_r.drop(columns=['PassOrFail']).copy()
y_test = rg3_l['PassOrFail'].copy()
X_test = rg3_l.drop(columns=['PassOrFail']).copy()

In [9]:
from sklearn.model_selection import train_test_split, GridSearchCV

In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score

In [12]:
ss = StandardScaler()
ss.fit(X)
X = ss.transform(X)
X_test = ss.transform(X_test)

In [13]:
# 모델 생성
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

rf = RandomForestClassifier(random_state=42)
et = ExtraTreesClassifier(random_state=42)
gb = GradientBoostingClassifier(random_state=42)
hgb = HistGradientBoostingClassifier(random_state=42)
xgb = XGBClassifier(random_state=42)

model_list = [rf, et, gb, hgb, xgb]

In [14]:
grid_df = pd.DataFrame()
scores = ['accuracy', 'precision', 'recall', 'f1']
for score in scores:
    # GridSearchCV 이용해서 모든 모델 하이퍼파라미터 튜닝
    for model in model_list:
        model_name = model.__class__.__name__
        gridParams = dict()
        if model_name =="XGBClassifier" :
            gridParams["n_estimators"] = [20,50]
            gridParams["max_depth"] = [10,20]
            gridParams["min_child_weight"] = [1, 2,4]
        elif model_name == "HistGradientBoostingClassifier" :
            gridParams["max_iter"] = [20, 50]
            gridParams["max_depth"] = [10, 50]
            gridParams["min_samples_leaf"] = [1,2,4]
        else :
            gridParams["n_estimators"] = [20, 50]
            gridParams["max_depth"] = [10,20]
            gridParams["min_samples_split"] = [2,5,10]
            gridParams["min_samples_leaf"] = [1,2,4]

        grid_search_model = GridSearchCV(model, gridParams, scoring=score, cv=5, n_jobs=-1)
        grid_search_model.fit(X, y)
        model = grid_search_model.best_estimator_
        pred = model.predict(X)
        acc = accuracy_score(y, pred)
        pre = precision_score(y, pred)
        rec = recall_score(y, pred)
        f1 = f1_score(y, pred)
        df_temp = pd.DataFrame([[model_name,  acc, pre, rec, f1, str(grid_search_model.best_params_)]],
                                 columns=["모델명", "정확도", 
                                          "정밀도", "재현율", "f1-score", "파라미터"])
        grid_df = pd.concat([grid_df, df_temp], ignore_index=True)

In [15]:
grid_df

Unnamed: 0,모델명,정확도,정밀도,재현율,f1-score,파라미터
0,RandomForestClassifier,0.987261,0.986864,1.0,0.993388,"{'max_depth': 10, 'min_samples_leaf': 1, 'min_..."
1,ExtraTreesClassifier,0.968153,0.967794,1.0,0.983633,"{'max_depth': 10, 'min_samples_leaf': 1, 'min_..."
2,GradientBoostingClassifier,1.0,1.0,1.0,1.0,"{'max_depth': 20, 'min_samples_leaf': 2, 'min_..."
3,HistGradientBoostingClassifier,1.0,1.0,1.0,1.0,"{'max_depth': 50, 'max_iter': 20, 'min_samples..."
4,XGBClassifier,0.957006,0.957006,1.0,0.978031,"{'max_depth': 10, 'min_child_weight': 4, 'n_es..."
5,RandomForestClassifier,0.987261,0.986864,1.0,0.993388,"{'max_depth': 10, 'min_samples_leaf': 1, 'min_..."
6,ExtraTreesClassifier,0.968153,0.967794,1.0,0.983633,"{'max_depth': 10, 'min_samples_leaf': 1, 'min_..."
7,GradientBoostingClassifier,1.0,1.0,1.0,1.0,"{'max_depth': 10, 'min_samples_leaf': 1, 'min_..."
8,HistGradientBoostingClassifier,1.0,1.0,1.0,1.0,"{'max_depth': 50, 'max_iter': 20, 'min_samples..."
9,XGBClassifier,0.957006,0.957006,1.0,0.978031,"{'max_depth': 10, 'min_child_weight': 4, 'n_es..."


In [17]:
gridParams = dict()
gridParams["n_estimators"] = [20, 50]
gridParams["max_depth"] = [10,20]
gridParams["min_samples_split"] = [2,5,10]
gridParams["min_samples_leaf"] = [1,2,4]
grid_search_model = GridSearchCV(GradientBoostingClassifier(), gridParams, cv=5, n_jobs=-1)
grid_search_model.fit(X, y)
model = grid_search_model.best_estimator_
pred = model.predict(X_test)
acc = accuracy_score(y_test, pred)
pre = precision_score(y_test, pred)
rec = recall_score(y_test, pred)
f1 = f1_score(y_test, pred)
acc, pre, rec, f1

(0.9490445859872612, 0.9916805324459235, 0.956661316211878, 0.9738562091503268)