In [31]:
!pip install catboost 
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [32]:
import pandas as pd
import random
import os
import numpy as np
import matplotlib.pyplot as plt   
import seaborn as sns 

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras import regularizers
from keras.models import Sequential
from keras.layers import Dense,Dropout, Activation

from sklearn.metrics import f1_score
from catboost import CatBoostClassifier


import optuna
from optuna.samplers import TPESampler
from optuna import Trial

from google.colab import drive  # GoodleDrive mount 
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:
train_df = pd.read_csv('/content/drive/MyDrive/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/test.csv')

In [34]:
train_x = train_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class'])
train_y = train_df['Y_Class']

test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])
# test_x = test_df

# 1.preprocessing
1. Label Encoding: categorical values `LINE`, `PRODUCT_CODE`
2. Missing values: fillna(0) 
3. scaling: StandardScaler

In [35]:
train_df = pd.read_csv('/content/drive/MyDrive/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/test.csv')
submit = pd.read_csv('/content/drive/MyDrive/sample_submission.csv')

train_x = train_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class','Y_Quality'])
train_y = train_df['Y_Class']
test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])   

# 1) qualitative to quantitative
qual_col = ['LINE', 'PRODUCT_CODE']
for i in qual_col:
    le = LabelEncoder()    # TRY one-hot encoding 
    le = le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])
    for label in np.unique(test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test_x[i] = le.transform(test_x[i]) 

# 2) Missing Values 
train_x = train_x.fillna(0)
test_x = test_x.fillna(0)

from imblearn.over_sampling import SMOTE
train_x, train_y = SMOTE(random_state=37).fit_resample(train_x, train_y)

# 3) scaling: only `X_???` values (continuous)
Xs = train_x.select_dtypes(include=float).iloc[:,1:].columns.tolist()
scaler = StandardScaler().fit(train_x.loc[:, Xs])
train_x.loc[:, Xs] = scaler.transform(train_x.loc[:, Xs])
test_x.loc[:, Xs] = scaler.transform(test_x.loc[:, Xs])
print('Done.')  

Done.


# 3.catBoost classifier 

In [None]:
X_train, X_test, y_train, y_test =train_test_split(train_x, train_y,
                                                   test_size=0.3,
                                                   random_state=37)
     

def objective(trial):
    model = CatBoostClassifier(
        iterations=trial.suggest_int("iterations", 100, 1000),
        learning_rate=trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
        depth=trial.suggest_int("depth", 4, 10),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
        bootstrap_type=trial.suggest_categorical("bootstrap_type", ["Bayesian"]),
        random_strength=trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
        bagging_temperature=trial.suggest_float("bagging_temperature", 0.0, 10.0),
        od_type=trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
        od_wait=trial.suggest_int("od_wait", 10, 50),
        verbose=False
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return f1_score(y_test, y_pred, average='macro')
     
# hyper-parameter tuning with OPTUNA  
optuna.logging.set_verbosity(optuna.logging.WARNING)

sampler = TPESampler(seed=316)
study = optuna.create_study(study_name="catboost", direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=10)

In [None]:
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ",trial.params)

Number of finished trials:  10
Best trial:
  Value:  0.9327800596718733
  Params:  {'iterations': 848, 'learning_rate': 0.030969090455030726, 'depth': 8, 'l2_leaf_reg': 6.761650264581644e-08, 'bootstrap_type': 'Bayesian', 'random_strength': 5.097812627692342e-07, 'bagging_temperature': 7.206905770161392, 'od_type': 'IncToDec', 'od_wait': 49}


In [None]:
print("Best Score:", study.best_value)
print("Best trial:", study.best_trial.params)

Best Score: 0.9327800596718733
Best trial: {'iterations': 848, 'learning_rate': 0.030969090455030726, 'depth': 8, 'l2_leaf_reg': 6.761650264581644e-08, 'bootstrap_type': 'Bayesian', 'random_strength': 5.097812627692342e-07, 'bagging_temperature': 7.206905770161392, 'od_type': 'IncToDec', 'od_wait': 49}


In [None]:
model = CatBoostClassifier(**trial.params, verbose=False, random_state = 37)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
#   Params:  {'iterations': 848, 'learning_rate': 0.030969090455030726, 'depth': 8, 'l2_leaf_reg': 6.761650264581644e-08, 'bootstrap_type': 'Bayesian', 'random_strength': 5.097812627692342e-07, 'bagging_temperature': 7.206905770161392, 'od_type': 'IncToDec', 'od_wait': 49}

model = CatBoostClassifier( 
    iterations= 848, 
                                       learning_rate= 0.030969090455030726,
                                       depth= 8, 
                                       l2_leaf_reg= 6.761650264581644e-08, 
                                       bootstrap_type= 'Bayesian', 
                                       random_strength= 5.097812627692342e-07, 
                                       bagging_temperature= 7.206905770161392, 
                                       od_type= 'IncToDec', 
                                       od_wait=49)


In [None]:
model.fit(X_train, y_train)

0:	learn: 1.0883535	total: 458ms	remaining: 6m 27s
1:	learn: 1.0777085	total: 918ms	remaining: 6m 28s
2:	learn: 1.0657507	total: 1.38s	remaining: 6m 28s
3:	learn: 1.0573946	total: 1.84s	remaining: 6m 27s
4:	learn: 1.0459856	total: 2.29s	remaining: 6m 27s
5:	learn: 1.0364956	total: 2.75s	remaining: 6m 25s
6:	learn: 1.0262121	total: 3.21s	remaining: 6m 25s
7:	learn: 1.0140304	total: 3.67s	remaining: 6m 25s
8:	learn: 1.0059514	total: 4.13s	remaining: 6m 25s
9:	learn: 0.9939666	total: 4.59s	remaining: 6m 24s
10:	learn: 0.9883361	total: 5.06s	remaining: 6m 24s
11:	learn: 0.9789080	total: 5.54s	remaining: 6m 26s
12:	learn: 0.9701348	total: 6.03s	remaining: 6m 27s
13:	learn: 0.9639507	total: 6.58s	remaining: 6m 32s
14:	learn: 0.9543017	total: 7.09s	remaining: 6m 33s
15:	learn: 0.9422949	total: 7.59s	remaining: 6m 34s
16:	learn: 0.9337474	total: 8.06s	remaining: 6m 34s
17:	learn: 0.9209648	total: 8.56s	remaining: 6m 34s
18:	learn: 0.9134097	total: 9.05s	remaining: 6m 34s
19:	learn: 0.9061300	t

<catboost.core.CatBoostClassifier at 0x7f93cd3dae80>

In [None]:
y_pred = model.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.99      0.92      0.96       118
           1       0.87      0.93      0.90       123
           2       0.94      0.94      0.94       126

    accuracy                           0.93       367
   macro avg       0.94      0.93      0.93       367
weighted avg       0.93      0.93      0.93       367



In [None]:
import pickle
pickle.dump(model, open("catboost_model.pkl", "wb"))
     

optuna.visualization.plot_optimization_history(study)
     

optuna.visualization.plot_parallel_coordinate(study)
     

# 각 파라미터들의 상관관계
optuna.visualization.plot_contour(
    study,
    params=[
        "iterations",
        "learning_rate",
        "depth",
        "l2_leaf_reg",
        "bootstrap_type",
        "random_strength",
        "bagging_temperature",
        "od_type",
        "od_wait"
        ],
)

[33m[W 2023-02-25 16:04:26,811][0m Param bootstrap_type unique value length is less than 2.[0m
[33m[W 2023-02-25 16:04:26,813][0m Param bootstrap_type unique value length is less than 2.[0m
[33m[W 2023-02-25 16:04:26,815][0m Param bootstrap_type unique value length is less than 2.[0m
[33m[W 2023-02-25 16:04:26,816][0m Param bootstrap_type unique value length is less than 2.[0m
[33m[W 2023-02-25 16:04:26,817][0m Param bootstrap_type unique value length is less than 2.[0m
[33m[W 2023-02-25 16:04:26,819][0m Param bootstrap_type unique value length is less than 2.[0m
[33m[W 2023-02-25 16:04:26,820][0m Param bootstrap_type unique value length is less than 2.[0m
[33m[W 2023-02-25 16:04:26,821][0m Param bootstrap_type unique value length is less than 2.[0m
[33m[W 2023-02-25 16:04:26,822][0m Param bootstrap_type unique value length is less than 2.[0m
[33m[W 2023-02-25 16:04:26,824][0m Param bootstrap_type unique value length is less than 2.[0m
[33m[W 2023-02-25 1

In [None]:
# 하이퍼파라미터 중요도
optuna.visualization.plot_param_importances(study)

In [None]:
preds = model.predict(test_x)

preds

array([[1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
    

In [36]:
submit = pd.read_csv('/content/drive/MyDrive/sample_submission.csv')
submit['Y_Class'] = preds

submit.to_csv('smartFactory_submission.csv', index=False)


In [39]:
# BEST OUTPUT 
# a = pd.read_csv('C:/Users/user/Downloads/smartFactory_submission18.csv')
# sns.countplot(x='Y_Class', data=a)