# 제품 이상여부 판별 프로젝트


## 1. 데이터 불러오기


### 필수 라이브러리


In [1]:
pip install optuna catboost xgboost

Collecting nodejs
  Downloading nodejs-0.1.1.tar.gz (2.3 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting optional-django==0.1.0
  Downloading optional-django-0.1.0.tar.gz (9.5 kB)
  Preparing metadata (setup.py) ... [?25ldone
Installing collected packages: optional-django, nodejs
[33m  DEPRECATION: optional-django is being installed using the legacy 'setup.py install' method, because it does not have a 'pyproject.toml' and the 'wheel' package is not installed. pip 23.1 will enforce this behaviour change. A possible replacement is to enable the '--use-pep517' option. Discussion can be found at https://github.com/pypa/pip/issues/8559[0m[33m
[0m  Running setup.py install for optional-django ... [?25ldone
[33m  DEPRECATION: nodejs is being installed using the legacy 'setup.py install' method, because it does not have a 'pyproject.toml' and the 'wheel' package is not installed. pip 23.1 will enforce this behaviour change. A possible replacement is to enable the '--use-p

In [10]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import catboost
import lightgbm
import xgboost
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from collections import Counter
from sklearn.model_selection import StratifiedKFold



### 데이터 읽어오기


In [11]:
def clean_datas(df):
    
    no_value_col = df.columns[df.nunique()==0] 
    df.drop(columns=no_value_col,inplace=True)

    df.fillna(0,inplace=True)

    # 중복값 가진 컬럼 제거
    dfT= df.T
    df_unique = dfT[~dfT.duplicated()]
    filterd_df = df_unique.T
    return filterd_df.convert_dtypes()

In [12]:
ROOT_DIR = "data"
RANDOM_STATE = 110

# Load data
train_data = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

In [13]:
## 테스트 , 훈련 데이터 통합 전처리

# 데이터에 라벨 추가 (train: 0, test: 1)
train_data['dataset_label'] = 0
test_data['dataset_label'] = 1

train_y = train_data['target']
test_set_id = test_data['Set ID']

train_x = train_data.drop(columns='target')
test_x = test_data.drop(columns='Set ID')

# 훈련 데이터와 테스트 데이터를 합침
combined_data = pd.concat([train_x, test_x], axis=0)

error_cols = ['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam','HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1','HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2']
combined_data.loc[:, error_cols] = combined_data.loc[:, error_cols].replace('OK', np.nan).astype('float')
combined_data = clean_datas(combined_data)

cat_features = list(combined_data.select_dtypes(include=['string','object']).columns)
# 결측값을 'Missing'으로 대체
combined_data[cat_features] = combined_data[cat_features].fillna('NA').astype('category')
combined_data_gx = pd.get_dummies(combined_data, columns=cat_features)


# 다시 훈련 데이터와 테스트 데이터로 분리
train_x = combined_data[combined_data['dataset_label'] == 0].drop(columns=['dataset_label'])
test_x = combined_data[combined_data['dataset_label'] == 1].drop(columns=['dataset_label'])

train_x_gx = combined_data_gx[combined_data_gx['dataset_label'] == 0].drop(columns=['dataset_label'])
test_x_gx = combined_data_gx[combined_data_gx['dataset_label'] == 1].drop(columns=['dataset_label'])

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train_y_encoded = le.fit_transform(train_y)

  df.fillna(0,inplace=True)


In [1]:

qual_col = train_x.select_dtypes(include='object')

for i in qual_col:
    lec = LabelEncoder()
    lec = lec.fit(train_x[i])
    train_x[i] = lec.transform(train_x[i])
    
    for label in np.unique(test_x[i]): 
        if label not in lec.classes_: 
            lec.classes_ = np.append(lec.classes_, label)
    test_x[i] = lec.transform(test_x[i]) 
print('Done.')

NameError: name 'train_x' is not defined

In [65]:
# 이상치 처리 
def clean_column(col, reference_col=None):
    if reference_col is None:
        reference_col = col
        
    z_scores = (reference_col - reference_col.mean()) / reference_col.std()
    col_cleaned = col.mask(abs(z_scores) > 3)

    most_frequent_value = reference_col.mode()[0]

    col_filled = col_cleaned.fillna(most_frequent_value)

    return col_filled



In [66]:

# 이상치 처리 train, test에 동일하게 적용
train_x = train_x.apply(clean_column)
test_x = test_x.apply(lambda col: clean_column(col, reference_col=train_x[col.name]))

## 3. 모델 학습


In [7]:

from sklearn.utils.class_weight import compute_class_weight

# 클래스 가중치 계산
class_weights = compute_class_weight('balanced', classes=np.unique(train_y), y=train_y)
class_weights_non_cat = dict(enumerate(class_weights))

### 모델 정의


In [35]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import f1_score, make_scorer
import numpy as np
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

# 단순화된 모델을 사용한 앙상블
cat_params = {
    'iterations': 500, 
    #'depth': 7,
    'learning_rate': 0.05, 
    'random_state': 42,
    'verbose': 100
}

lgbm_params = {
    'num_leaves': 31, 
    #'max_depth': -1, 
    'learning_rate': 0.05, 
    'n_estimators': 300, 
    'random_state': 42
}

# CatBoost 모델
cat_model = CatBoostClassifier(class_weights=class_weights_non_cat,cat_features=cat_features,**cat_params)

# LightGBM 모델
lgbm_model = LGBMClassifier(class_weight=class_weights_non_cat,**lgbm_params)

# Voting Classifier 생성 (간단한 앙상블)
voting_clf = VotingClassifier(
    estimators=[
        ('cat', cat_model),
        ('lgbm', lgbm_model),
        #('rf',lr_model)
    ],
    voting='soft'  # 'soft'는 각 모델의 확률을 평균, 'hard'는 다수결
)

# Stratified K-Fold 교차 검증
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_scorer = make_scorer(f1_score, average='macro')
cv_scores = cross_val_score(voting_clf, train_x, train_y_encoded, cv=skf, scoring=f1_scorer)

# 교차 검증 점수 출력
print(f"Stratified K-Fold Cross-Validation F1 Scores: {cv_scores}")
print(f"Mean CV F1 Score: {np.mean(cv_scores)}")

# 훈련 데이터에서 모델을 학습하고 F1 스코어 평가
voting_clf.fit(train_x, train_y_encoded)
train_pred = voting_clf.predict(train_x)
train_f1 = f1_score(train_y_encoded, train_pred, average='macro')

print(f"Training F1 Score: {train_f1}")

# 최종 테스트 데이터 예측
voting_pred = voting_clf.predict(test_x)
voting_pred = le.inverse_transform(voting_pred)

0:	learn: 0.6900524	total: 14.3ms	remaining: 7.13s
100:	learn: 0.6194459	total: 1.09s	remaining: 4.33s
200:	learn: 0.5930066	total: 2.18s	remaining: 3.24s
300:	learn: 0.5505096	total: 3.25s	remaining: 2.15s
400:	learn: 0.5136037	total: 4.35s	remaining: 1.07s
499:	learn: 0.4826905	total: 5.43s	remaining: 0us
[LightGBM] [Info] Number of positive: 30524, number of negative: 1880
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002578 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4254
[LightGBM] [Info] Number of data points in the train set: 32404, number of used features: 124
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499993 -> initscore=-0.000026
[LightGBM] [Info] Start training from score -0.000026
0:	learn: 0.6906761	total: 11.4ms	remaining: 5.67s
100:	learn: 0.6228436	total: 1.09s	remaining: 4.33s
200:	learn: 0.5971742	total: 2

### 모델 학습


In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import f1_score, make_scorer
import numpy as np
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

# 단순화된 모델을 사용한 앙상블
cat_params = {
    'iterations': 500, 
    'learning_rate': 0.05, 
    'random_state': 42,
    'verbose': 100
}

lgbm_params = {
    'num_leaves': 31, 
    'learning_rate': 0.05, 
    'n_estimators': 300, 
    'random_state': 42
}

# CatBoost 모델
cat_model = CatBoostClassifier(class_weights=class_weights_non_cat,cat_features=cat_features,**cat_params)

# LightGBM 모델
lgbm_model = LGBMClassifier(class_weight=class_weights_non_cat,**lgbm_params)

# 메타모델 (XGBoost)
meta_model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)

# 스태킹 앙상블 생성
stacking_clf = StackingClassifier(
    estimators=[
        ('cat', cat_model),
        ('lgbm', lgbm_model)
    ],
    final_estimator=meta_model,
    cv=5
)

# Stratified K-Fold 교차 검증
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_scorer = make_scorer(f1_score, average='macro')
cv_scores = cross_val_score(stacking_clf, train_x, train_y_encoded, cv=skf, scoring=f1_scorer)

# 교차 검증 점수 출력
print(f"Stratified K-Fold Cross-Validation F1 Scores: {cv_scores}")
print(f"Mean CV F1 Score: {np.mean(cv_scores)}")

# 훈련 데이터에서 모델을 학습하고 F1 스코어 평가
stacking_clf.fit(train_x, train_y_encoded)
train_pred = stacking_clf.predict(train_x)
train_f1 = f1_score(train_y_encoded, train_pred, average='macro')

print(f"Training F1 Score: {train_f1}")

# 최종 테스트 데이터 예측
stacking_pred = stacking_clf.predict(test_x)
stacking_pred = le.inverse_transform(stacking_pred)

0:	learn: 0.6900524	total: 13.9ms	remaining: 6.92s
100:	learn: 0.6194459	total: 1.11s	remaining: 4.4s
200:	learn: 0.5930066	total: 2.2s	remaining: 3.28s
300:	learn: 0.5505096	total: 3.3s	remaining: 2.18s
400:	learn: 0.5136037	total: 4.38s	remaining: 1.08s
499:	learn: 0.4826905	total: 5.47s	remaining: 0us
[LightGBM] [Info] Number of positive: 30524, number of negative: 1880
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002655 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4254
[LightGBM] [Info] Number of data points in the train set: 32404, number of used features: 124
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499993 -> initscore=-0.000026
[LightGBM] [Info] Start training from score -0.000026
0:	learn: 0.6902073	total: 11.2ms	remaining: 5.58s
100:	learn: 0.6256845	total: 973ms	remaining: 3.84s
200:	learn: 0.5955001	total: 1.95

## 4. 제출하기


### 제출 파일 작성


In [38]:
import pandas as pd

# 기존 결과와 새로운 결과 로드
original_df = pd.read_csv('submission_best.csv')
new_df = original_df.copy()
new_df['target'] = voting_pred
print(type(new_df))

original_labels = original_df['target']
new_labels = new_df['target']

# 정상에서 비정상으로 변한 데이터
normal_to_abnormal = original_df[(original_labels == 'Normal') & (new_labels == 'AbNormal')]

# 비정상에서 정상으로 변한 데이터
abnormal_to_normal = original_df[(original_labels == 'AbNormal') & (new_labels == 'Normal')]

# 동일한 상태를 유지한 데이터
same_classification = original_df[original_labels == new_labels]

# 변화한 데이터 개수 확인
from collections import Counter
print(Counter(list(new_labels)))
print(f"Normal to AbNormal: {len(normal_to_abnormal)}")
print(f"AbNormal to Normal: {len(abnormal_to_normal)}")
print(f"Same classification: {len(same_classification)}")


<class 'pandas.core.frame.DataFrame'>
Counter({'Normal': 13662, 'AbNormal': 3699})
Normal to AbNormal: 2902
AbNormal to Normal: 41
Same classification: 14418


In [41]:
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = new_labels

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**
