In [1]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr, pearsonr
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

In [2]:
### 시각화를 위한 라이브러리
# - 파이썬 시각화에서 가장 기본적인 시각화 라이브러리
import matplotlib.pyplot as plt

### 한글처리
from matplotlib import font_manager, rc

### 운영체제 확인 라이브러리
import platform

### 시각화 시 마이너스(-, 음수) 기호 깨짐 방지
plt.rcParams["axes.unicode_minus"] = False

### OS별 한글처리
# - 윈도우 운영체게
if platform.system() == "Windows" :
    # path = "c:/Windows/Fonts/malgun.ttf"
    # font_name = font_manager.FontProperties(fname=path).get_name()
    # rc("font", family = font_name)
    
    ### 또는 아래처럼 한줄로도 가능 (아래 한글처리를 주로 사용합니다.)
    plt.rc("font", family = "Malgun Gothic")

# - Mac 운영체제
elif platform.system() == "Darwin" :
    rc("font", family = "AppleGothic")
    
else :
    print("넌 누구?")

In [3]:
labeled = pd.read_csv("./data/labeled_data.csv")
labeled_cn7 = labeled[(labeled["PART_NAME"] == "CN7 W/S SIDE MLD'G RH") |(labeled["PART_NAME"] == "CN7 W/S SIDE MLD'G LH")]
labeled_cn7 = labeled_cn7.drop(columns=["_id", "TimeStamp","PART_FACT_PLAN_DATE", "Reason", "PART_FACT_SERIAL", "PART_NAME",
               "EQUIP_NAME", "EQUIP_CD", "Switch_Over_Position", "Mold_Temperature_1","Mold_Temperature_2","Mold_Temperature_5",
                "Mold_Temperature_6", "Mold_Temperature_7","Mold_Temperature_8","Mold_Temperature_9","Mold_Temperature_10",
               "Mold_Temperature_11","Mold_Temperature_12","Barrel_Temperature_7"])
labeled_cn7

Unnamed: 0,PassOrFail,Injection_Time,Filling_Time,Plasticizing_Time,Cycle_Time,Clamp_Close_Time,Cushion_Position,Plasticizing_Position,Clamp_Open_Position,Max_Injection_Speed,...,Average_Back_Pressure,Barrel_Temperature_1,Barrel_Temperature_2,Barrel_Temperature_3,Barrel_Temperature_4,Barrel_Temperature_5,Barrel_Temperature_6,Hopper_Temperature,Mold_Temperature_3,Mold_Temperature_4
0,Y,9.59,4.47,16.920000,59.520000,7.13,653.409973,68.849998,647.98999,55.400002,...,59.299999,276.500000,274.700012,274.799988,269.200012,255.000000,229.699997,66.300003,24.799999,27.500000
1,Y,9.60,4.48,16.910000,59.580002,7.13,653.409973,68.839996,647.98999,55.299999,...,59.299999,276.200012,275.500000,275.299988,270.799988,254.699997,229.500000,67.199997,24.799999,27.600000
2,Y,9.60,4.48,16.910000,59.580002,7.13,653.409973,68.839996,647.98999,55.299999,...,59.299999,276.200012,275.500000,275.299988,270.799988,254.699997,229.500000,67.199997,24.799999,27.600000
3,Y,9.59,4.48,16.910000,59.560001,7.13,653.419983,68.839996,647.98999,55.299999,...,59.299999,276.500000,275.000000,275.399994,271.100006,254.899994,230.000000,66.900002,25.000000,27.600000
4,Y,9.59,4.48,16.910000,59.560001,7.13,653.419983,68.839996,647.98999,55.299999,...,59.299999,276.500000,275.000000,275.399994,271.100006,254.899994,230.000000,66.900002,25.000000,27.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7991,Y,9.60,4.48,16.620001,59.560001,7.11,653.429993,68.330002,647.98999,54.900002,...,59.900002,276.500000,274.899994,275.100006,269.500000,255.600006,229.600006,66.099998,21.000000,22.400000
7992,Y,9.60,4.48,16.650000,59.560001,7.11,653.429993,68.349998,647.98999,55.000000,...,59.900002,275.799988,275.299988,275.500000,270.700012,254.899994,230.000000,66.099998,21.000000,22.400000
7993,Y,9.60,4.48,16.650000,59.560001,7.11,653.429993,68.349998,647.98999,55.000000,...,59.900002,275.799988,275.299988,275.500000,270.700012,254.899994,230.000000,66.099998,21.000000,22.400000
7994,Y,9.60,4.48,16.629999,59.580002,7.11,653.429993,68.330002,647.98999,54.900002,...,59.900002,276.100006,275.799988,275.000000,271.299988,255.100006,230.199997,65.199997,21.000000,22.299999


In [4]:
labeled_cn7["PassOrFail"] = labeled_cn7["PassOrFail"].replace('Y',1).replace('N',0)
### Clamp_Open_Position에서 눈에띄게 낮은 값들 제거
cn7 = labeled_cn7[labeled_cn7["Clamp_Open_Position"]>=200] 
cn7 = cn7.drop(columns=["Clamp_Open_Position"])

  labeled_cn7["PassOrFail"] = labeled_cn7["PassOrFail"].replace('Y',1).replace('N',0)


In [5]:
def remove_outliers(df, column):
    # 1사분위수(Q1)와 3사분위수(Q3) 계산
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    
    # IQR 계산
    IQR = Q3 - Q1
    
    # 이상치 범위 설정
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # 이상치 제거
    filtered_df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    
    return filtered_df

In [6]:
cn7_Y = cn7[cn7["PassOrFail"]==1].copy()
print("CN7의 양품 개수:", len(cn7_Y))
cn7_N = cn7[cn7["PassOrFail"]==0].copy()
print("CN7의 불량 개수:", len(cn7_N))

tmp1 = cn7_Y.copy()
tmp1 = remove_outliers(tmp1, "Injection_Time")
tmp1 = remove_outliers(tmp1, "Plasticizing_Time")
tmp1 = remove_outliers(tmp1, "Cycle_Time")
tmp1 = remove_outliers(tmp1, "Max_Injection_Speed")
tmp1 = remove_outliers(tmp1, "Mold_Temperature_3")
cn7_pass_removed = tmp1.copy()
cn7_pass_removed.info()

CN7의 양품 개수: 6661
CN7의 불량 개수: 39
<class 'pandas.core.frame.DataFrame'>
Index: 6534 entries, 0 to 7995
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   PassOrFail                6534 non-null   int64  
 1   Injection_Time            6534 non-null   float64
 2   Filling_Time              6534 non-null   float64
 3   Plasticizing_Time         6534 non-null   float64
 4   Cycle_Time                6534 non-null   float64
 5   Clamp_Close_Time          6534 non-null   float64
 6   Cushion_Position          6534 non-null   float64
 7   Plasticizing_Position     6534 non-null   float64
 8   Max_Injection_Speed       6534 non-null   float64
 9   Max_Screw_RPM             6534 non-null   float64
 10  Average_Screw_RPM         6534 non-null   float64
 11  Max_Injection_Pressure    6534 non-null   float64
 12  Max_Switch_Over_Pressure  6534 non-null   float64
 13  Max_Back_Pressure         6534 non-n

In [7]:
cn7_removed = pd.concat([cn7_pass_removed, cn7_N], ignore_index=True)
cn7_removed["PassOrFail"].value_counts()

PassOrFail
1    6534
0      39
Name: count, dtype: int64

In [8]:
cn7_removed = cn7_removed.drop(columns=["Filling_Time", "Average_Back_Pressure", "Mold_Temperature_3", "Max_Injection_Pressure", "Plasticizing_Time", "Max_Injection_Speed"])

In [9]:
cn7_removed = cn7_removed.drop(columns=["Max_Screw_RPM", "Barrel_Temperature_1", "Barrel_Temperature_2", "Barrel_Temperature_3",
                                         "Barrel_Temperature_4", "Barrel_Temperature_5", "Barrel_Temperature_6"])

In [10]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

grid_df = pd.DataFrame()

# 독립변수 종속변수 분리 
y = cn7_removed["PassOrFail"]
X = cn7_removed.drop(columns=["PassOrFail"])

# 테스트 데이터 분리
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.3, random_state=42)

# X_train, y_train 합치기
Xy_train = X_train.copy()
Xy_train["PassOrFail"] = y_train

# Average_Screw_RPM 기준으로 up만 추출하기
Xy_train = Xy_train[Xy_train["Average_Screw_RPM"]>250]

# X_train, y_train 분리
X_train = Xy_train.drop(columns=["PassOrFail"])
y_train = Xy_train["PassOrFail"]

# 데이터 증강
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# 검증 데이터 분리
X_train, X_val, y_train, y_val  = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

# 데이터 정규화
ss=StandardScaler()
ss.fit(X_train)
X_train_scaled = ss.transform(X_train)
X_val_scaled = ss.transform(X_val)
X_test_scaled=ss.transform(X_test)

# 모델 생성
rf = RandomForestClassifier(random_state=42)
et = ExtraTreesClassifier(random_state=42)
gb = GradientBoostingClassifier(random_state=42)
hgb = HistGradientBoostingClassifier(random_state=42)
xgb = XGBClassifier(random_state=42)
model_list = [rf, et, gb, hgb, xgb]

# GridSearchCV 이용해서 모든 모델 하이퍼파라미터 튜닝
for model in model_list:
    model_name = model.__class__.__name__
    gridParams = dict()
    if model_name =="XGBClassifier" :
        gridParams["n_estimators"] = [20,50]
        gridParams["max_depth"] = [10,20]
        gridParams["min_child_weight"] = [1, 2,4]
    elif model_name == "HistGradientBoostingClassifier" :
        gridParams["max_iter"] = [20, 50]
        gridParams["max_depth"] = [10, 50]
        gridParams["min_samples_leaf"] = [1,2,4]
    else :
        gridParams["n_estimators"] = [20, 50]
        gridParams["max_depth"] = [10,20]
        gridParams["min_samples_split"] = [2,5,10]
        gridParams["min_samples_leaf"] = [1,2,4]

    grid_search_model = GridSearchCV(model, gridParams, scoring='f1', cv=5, n_jobs=-1)
    grid_search_model.fit(X_train_scaled, y_train)
    model = grid_search_model.best_estimator_
    train_pred = model.predict(X_train_scaled)
    val_pred = model.predict(X_val_scaled)
    
    train_acc = accuracy_score(y_train, train_pred)
    val_acc = accuracy_score(y_val, val_pred)
    
    pre = precision_score(y_val, val_pred)
    rec = recall_score(y_val, val_pred)
    f1 = f1_score(y_val, val_pred)
    
    pred = model.predict(X_test_scaled)
    cm = confusion_matrix(y_test, pred)
    TN = cm[0, 0]  # True Negative
    FP = cm[0, 1]  # False Positive
    FN = cm[1, 0]  # False Negative
    TP = cm[1, 1]  # True Positive
    
    df_temp = pd.DataFrame([[model_name,  train_acc, val_acc, pre, rec, f1, train_acc-val_acc, TN, FP, FN, TP, str(grid_search_model.best_params_)]],
                             columns=["모델명", "훈련_정확도", "검증_정확도", "정밀도", "재현율", "f1-score", "훈련-검증", "TN", "FP", "FN", "TP", "파라미터"])

    grid_df = pd.concat([grid_df, df_temp], ignore_index=True)

In [11]:
grid_df

Unnamed: 0,모델명,훈련_정확도,검증_정확도,정밀도,재현율,f1-score,훈련-검증,TN,FP,FN,TP,파라미터
0,RandomForestClassifier,0.999119,0.99692,0.994175,1.0,0.997079,0.002199,10,4,4,1954,"{'max_depth': 10, 'min_samples_leaf': 1, 'min_..."
1,ExtraTreesClassifier,0.998678,0.997947,1.0,0.996094,0.998043,0.000732,10,4,7,1951,"{'max_depth': 20, 'min_samples_leaf': 1, 'min_..."
2,GradientBoostingClassifier,0.999119,0.99384,0.994141,0.994141,0.994141,0.005279,12,2,6,1952,"{'max_depth': 20, 'min_samples_leaf': 1, 'min_..."
3,HistGradientBoostingClassifier,0.999559,0.994867,0.996086,0.994141,0.995112,0.004693,11,3,4,1954,"{'max_depth': 50, 'max_iter': 20, 'min_samples..."
4,XGBClassifier,0.997797,0.99692,0.998043,0.996094,0.997067,0.000877,11,3,5,1953,"{'max_depth': 10, 'min_child_weight': 4, 'n_es..."


In [34]:
from sklearn.ensemble import IsolationForest

grid_df = pd.DataFrame()

# 독립변수 종속변수 분리 
y = cn7_removed["PassOrFail"]
X = cn7_removed.drop(columns=["PassOrFail"])

# 테스트 데이터 분리
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.3, random_state=42)

# 검증 데이터 분리
X_train, X_val, y_train, y_val  = train_test_split(X_train, y_train, test_size=0.3, random_state=42)

# 데이터 정규화
ss=StandardScaler()
ss.fit(X_train)
X_train_scaled = ss.transform(X_train)
X_val_scaled = ss.transform(X_val)
X_test_scaled=ss.transform(X_test)

model = LocalOutlierFactor()

# GridSearchCV 이용해서 모든 모델 하이퍼파라미터 튜닝
model_name = model.__class__.__name__

model.fit(X_train_scaled, y_train)
train_pred = model.predict(X_train_scaled)
val_pred = model.predict(X_val_scaled)
val_pred = list(map(lambda x : 1 if x == 1 else 0 , val_pred))

train_acc = accuracy_score(y_train, train_pred)
val_acc = accuracy_score(y_val, val_pred)

pre = precision_score(y_val, val_pred)
rec = recall_score(y_val, val_pred)
f1 = f1_score(y_val, val_pred)

pred = model.predict(X_test_scaled)
cm = confusion_matrix(y_test, pred)
TN = cm[0, 0]  # True Negative
FP = cm[0, 1]  # False Positive
FN = cm[1, 0]  # False Negative
TP = cm[1, 1]  # True Positive

df_temp = pd.DataFrame([[model_name,  train_acc, val_acc, pre, rec, f1, train_acc-val_acc, TN, FP, FN, TP, str(grid_search_model.best_params_)]],
                         columns=["모델명", "훈련_정확도", "검증_정확도", "정밀도", "재현율", "f1-score", "훈련-검증", "TN", "FP", "FN", "TP", "파라미터"])

grid_df = pd.concat([grid_df, df_temp], ignore_index=True)

AttributeError: This 'LocalOutlierFactor' has no attribute 'predict'

In [33]:
grid_df

Unnamed: 0,모델명,훈련_정확도,검증_정확도,정밀도,재현율,f1-score,훈련-검증,TN,FP,FN,TP,파라미터
0,IsolationForest,0.899689,0.890659,0.997549,0.891892,0.941766,0.00903,0,0,11,0,"{'contamination': 0.1, 'max_samples': 'auto', ..."


In [28]:
val_pred = model.predict(X_val_scaled)

In [30]:

# np.unique 함수를 이용하여 유니크한 원소와 등장 횟수를 구한다.
unique_values, counts = np.unique(val_pred, return_counts=True)

# 결과를 딕셔너리 형태로 변환하여 출력
result = dict(zip(unique_values, counts))
print(result)

{-1: 168, 1: 1213}


In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.cluster import DBSCAN
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


# --- 이상탐지 모델 정의 ---
models = {
    "Isolation Forest": IsolationForest(random_state=42),
    "One-Class SVM": OneClassSVM(),
    "DBSCAN": DBSCAN(eps=0.5, min_samples=5),
    "Local Outlier Factor": LocalOutlierFactor(n_neighbors=20, novelty=True, contamination='auto')
}

# --- 결과를 저장할 데이터프레임 생성 ---
grid_df = pd.DataFrame(columns=["모델명", "훈련_정확도", "검증_정확도", "정밀도", "재현율",
                                "f1-score", "훈련-검증", "TN", "FP", "FN", "TP", "파라미터"])

# --- 각 모델에 대해 예측 및 평가 ---
for model_name, model in models.items():
    # DBSCAN은 predict()가 없으므로 fit_predict() 사용
    if model_name == "DBSCAN":
        y_pred = model.fit_predict(X_test_scaled)
        # DBSCAN은 클러스터 번호를 반환하며, -1이면 이상치로 판단하므로 0(불량), 그 외는 1(정상)으로 변환
        y_pred_converted = [0 if label == -1 else 1 for label in y_pred]
    else:
        # 비지도 이상탐지 모델은 일반적으로 정상 데이터만으로 학습합니다.
        model.fit(X_train_scaled)
        y_pred = model.predict(X_test_scaled)
        # Isolation Forest, One-Class SVM, Local Outlier Factor는 정상: 1, 이상: -1 반환
        y_pred_converted = [1 if label == 1 else 0 for label in y_pred]
    
    # 평가 지표 계산
    acc = accuracy_score(y_test, y_pred_converted)
    pre = precision_score(y_test, y_pred_converted)
    rec = recall_score(y_test, y_pred_converted)
    f1  = f1_score(y_test, y_pred_converted)
    cm = confusion_matrix(y_test, y_pred_converted)
    TN, FP, FN, TP = cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]
    
    # 이상탐지 모델은 비지도이므로 훈련/검증 정확도, 훈련-검증 항목은 해당 없음
    train_acc = np.nan
    val_acc = np.nan
    train_val_diff = np.nan
    # 파라미터는 그대로 비워둠
    params = ""
    
    # 결과를 데이터프레임에 추가
    temp_df = pd.DataFrame([[model_name, train_acc, val_acc, pre, rec, f1, train_val_diff, TN, FP, FN, TP, params]],
                           columns=["모델명", "훈련_정확도", "검증_정확도", "정밀도", "재현율",
                                    "f1-score", "훈련-검증", "TN", "FP", "FN", "TP", "파라미터"])
    grid_df = pd.concat([grid_df, temp_df], ignore_index=True)

# 결과 출력
grid_df


  grid_df = pd.concat([grid_df, temp_df], ignore_index=True)


Unnamed: 0,모델명,훈련_정확도,검증_정확도,정밀도,재현율,f1-score,훈련-검증,TN,FP,FN,TP,파라미터
0,Isolation Forest,,,1.0,0.803882,0.89128,,14,0,384,1574,
1,One-Class SVM,,,1.0,0.477528,0.646388,,14,0,1023,935,
2,DBSCAN,,,1.0,0.684372,0.812614,,14,0,618,1340,
3,Local Outlier Factor,,,0.997894,0.967824,0.982629,,10,4,63,1895,


611     1
2498    1
1199    1
5668    1
1042    1
       ..
1791    1
3921    1
4698    1
4657    1
1287    1
Name: PassOrFail, Length: 1972, dtype: int64