In [57]:
import pandas as pd

In [58]:
### 시각화를 위한 라이브러리
# - 파이썬 시각화에서 가장 기본적인 시각화 라이브러리
import matplotlib.pyplot as plt

### 한글처리
from matplotlib import font_manager, rc

### 운영체제 확인 라이브러리
import platform

### 시각화 시 마이너스(-, 음수) 기호 깨짐 방지
plt.rcParams["axes.unicode_minus"] = False

### OS별 한글처리
# - 윈도우 운영체게
if platform.system() == "Windows" :
    # path = "c:/Windows/Fonts/malgun.ttf"
    # font_name = font_manager.FontProperties(fname=path).get_name()
    # rc("font", family = font_name)
    
    ### 또는 아래처럼 한줄로도 가능 (아래 한글처리를 주로 사용합니다.)
    plt.rc("font", family = "Malgun Gothic")

# - Mac 운영체제
elif platform.system() == "Darwin" :
    rc("font", family = "AppleGothic")
    
else :
    print("넌 누구?")

In [59]:
df = pd.read_csv("./data/labeled_data.csv")

In [60]:
data = df.drop(["_id", "TimeStamp","PART_FACT_PLAN_DATE", "Reason", "PART_FACT_SERIAL",
                "EQUIP_NAME", "EQUIP_CD",'Switch_Over_Position',"PART_NAME",
                "Mold_Temperature_1","Mold_Temperature_2","Mold_Temperature_5","Mold_Temperature_6",
                "Mold_Temperature_7","Mold_Temperature_8","Mold_Temperature_9","Mold_Temperature_10",
                "Mold_Temperature_11","Mold_Temperature_12","Barrel_Temperature_7"],
                axis=1)

In [61]:
import seaborn as sns

In [62]:
data = data.replace("Y", 1).replace("N", 0)

  data = data.replace("Y", 1).replace("N", 0)


In [63]:
data.columns

Index(['PassOrFail', 'Injection_Time', 'Filling_Time', 'Plasticizing_Time',
       'Cycle_Time', 'Clamp_Close_Time', 'Cushion_Position',
       'Plasticizing_Position', 'Clamp_Open_Position', 'Max_Injection_Speed',
       'Max_Screw_RPM', 'Average_Screw_RPM', 'Max_Injection_Pressure',
       'Max_Switch_Over_Pressure', 'Max_Back_Pressure',
       'Average_Back_Pressure', 'Barrel_Temperature_1', 'Barrel_Temperature_2',
       'Barrel_Temperature_3', 'Barrel_Temperature_4', 'Barrel_Temperature_5',
       'Barrel_Temperature_6', 'Hopper_Temperature', 'Mold_Temperature_3',
       'Mold_Temperature_4'],
      dtype='object')

In [64]:
data = data.drop(columns=["Filling_Time", "Plasticizing_Time", "Cycle_Time", "Plasticizing_Position",
                           "Clamp_Open_Position", "Max_Injection_Speed", "Max_Switch_Over_Pressure", "Max_Back_Pressure", 
                           "Barrel_Temperature_2", "Barrel_Temperature_3","Barrel_Temperature_5","Barrel_Temperature_6"])

In [65]:
data = data.drop(columns=["Mold_Temperature_4"])

In [66]:
data = data.drop(columns=["Cushion_Position"])

In [67]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7996 entries, 0 to 7995
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   PassOrFail              7996 non-null   int64  
 1   Injection_Time          7996 non-null   float64
 2   Clamp_Close_Time        7996 non-null   float64
 3   Max_Screw_RPM           7996 non-null   float64
 4   Average_Screw_RPM       7996 non-null   float64
 5   Max_Injection_Pressure  7996 non-null   float64
 6   Average_Back_Pressure   7996 non-null   float64
 7   Barrel_Temperature_1    7996 non-null   float64
 8   Barrel_Temperature_4    7996 non-null   float64
 9   Hopper_Temperature      7996 non-null   float64
 10  Mold_Temperature_3      7996 non-null   float64
dtypes: float64(10), int64(1)
memory usage: 687.3 KB


In [68]:
data["PART_NAME"].unique()

KeyError: 'PART_NAME'

In [69]:
cn7 = data[(data["PART_NAME"] == "CN7 W/S SIDE MLD'G RH") | (data["PART_NAME"] == "CN7 W/S SIDE MLD'G LH")]
rg3 = data[(data["PART_NAME"] == "RG3 MOLD'G W/SHLD, LH") | (data["PART_NAME"] == "RG3 MOLD'G W/SHLD, RH")]

KeyError: 'PART_NAME'

In [15]:
cn7 = cn7.drop(columns=["PART_NAME"])
rg3 = rg3.drop(columns=["PART_NAME"])

In [16]:
y = cn7["PassOrFail"]
X = cn7.drop(columns=["PassOrFail"])
y_test = rg3["PassOrFail"]
X_test = rg3.drop(columns=["PassOrFail"])

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((4715, 10), (2021, 10), (4715,), (2021,))

In [19]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
ss.fit(X_train)
X_train_scaled = ss.transform(X_train)
X_val_scaled = ss.transform(X_val)
X_test_scaled = ss.transform(X_test)

In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

rf = RandomForestClassifier(random_state=42)
et = ExtraTreesClassifier(random_state=42)
gb = GradientBoostingClassifier(random_state=42)
hgb = HistGradientBoostingClassifier(random_state=42)
xgb = XGBClassifier(random_state=42)

model_list = [rf, et, gb, hgb, xgb]

grid_df = pd.DataFrame()
# GridSearchCV 이용해서 모든 모델 하이퍼파라미터 튜닝
for model in model_list:
    model_name = model.__class__.__name__
    gridParams = dict()
    if model_name =="XGBClassifier" :
        gridParams["n_estimators"] = [20,50]
        gridParams["max_depth"] = [10,20]
        gridParams["min_child_weight"] = [1, 2,4]
    elif model_name == "HistGradientBoostingClassifier" :
        gridParams["max_iter"] = [20, 50]
        gridParams["max_depth"] = [10, 50]
        gridParams["min_samples_leaf"] = [1,2,4]
    else :
        gridParams["n_estimators"] = [20, 50]
        gridParams["max_depth"] = [10,20]
        gridParams["min_samples_split"] = [2,5,10]
        gridParams["min_samples_leaf"] = [1,2,4]

    grid_search_model = GridSearchCV(model, gridParams, scoring='f1', cv=5, n_jobs=-1)
    grid_search_model.fit(X_train_scaled, y_train)
    model = grid_search_model.best_estimator_
    train_pred = model.predict(X_train_scaled)
    val_pred = model.predict(X_val_scaled)
    train_acc = accuracy_score(y_train, train_pred)
    val_acc = accuracy_score(y_val, val_pred)
    pre = precision_score(y_val, val_pred)
    rec = recall_score(y_val, val_pred)
    f1 = f1_score(y_val, val_pred)
    df_temp = pd.DataFrame([[model_name, train_acc, val_acc, pre, rec, f1, train_acc - val_acc, str(grid_search_model.best_params_)]],
                             columns=["모델명", "훈령정확도", "검증정확도", 
                                      "정밀도", "재현율", "f1-score", "훈련-검증",  "파라미터"])
    grid_df = pd.concat([grid_df, df_temp], ignore_index=True)
grid_df

Unnamed: 0,모델명,훈령정확도,검증정확도,정밀도,재현율,f1-score,훈련-검증,파라미터
0,RandomForestClassifier,0.998727,0.996536,0.996529,1.0,0.998262,0.002191,"{'max_depth': 10, 'min_samples_leaf': 1, 'min_..."
1,ExtraTreesClassifier,0.998727,0.996536,0.996529,1.0,0.998262,0.002191,"{'max_depth': 10, 'min_samples_leaf': 1, 'min_..."
2,GradientBoostingClassifier,0.999152,0.995547,0.997019,0.998507,0.997763,0.003605,"{'max_depth': 10, 'min_samples_leaf': 1, 'min_..."
3,HistGradientBoostingClassifier,0.999152,0.995547,0.997019,0.998507,0.997763,0.003605,"{'max_depth': 50, 'max_iter': 50, 'min_samples..."
4,XGBClassifier,0.999152,0.995547,0.996526,0.999005,0.997764,0.003605,"{'max_depth': 10, 'min_child_weight': 1, 'n_es..."


In [44]:
grid_df["파라미터"][0]

"{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 20}"

In [47]:
model = RandomForestClassifier(max_depth=10, min_samples_leaf=1, min_samples_split=5,n_estimators=20)
model.fit(X_train_scaled, y_train)

In [49]:
val_pred = model.predict(X_val_scaled)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_val, val_pred)

array([[   4,    7],
       [   0, 2010]], dtype=int64)

In [53]:
test_pred = model.predict(X_test_scaled)
confusion_matrix(y_test, test_pred)

array([[   0,   32],
       [   6, 1218]], dtype=int64)

In [54]:
acc = accuracy_score(y_test, test_pred)
pre = precision_score(y_test, test_pred)
rec = recall_score(y_test, test_pred)
f1 = f1_score(y_test, test_pred)
acc, pre, rec, f1

(0.9697452229299363, 0.9744, 0.9950980392156863, 0.98464025869038)

In [56]:
grid_df.to_csv("./simplemodel.csv")