## Import

In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(777) # Seed 고정

### train.csv [파일]
- PRODUCT_ID : 제품의 고유 ID
- Y_Class : 제품 품질 상태(Target) 
    - 0 : 적정 기준 미달 (부적합)
    - 1 : 적합
    - 2 : 적정 기준 초과 (부적합)
- Y_Quality : 제품 품질 관련 정량적 수치
- TIMESTAMP : 제품이 공정에 들어간 시각
- LINE : 제품이 들어간 공정 LINE 종류 ('T050304', 'T050307', 'T100304', 'T100306', 'T010306', 'T010305' 존재)
- PRODUCT_CODE : 제품의 CODE 번호 ('A_31', 'T_31', 'O_31' 존재)
- X_1 ~ X_2875 : 공정 과정에서 추출되어 비식별화된 변수


### test.csv [파일]
- PRODUCT_ID : 제품의 고유 ID
- TIMESTAMP : 제품이 공정에 들어간 시각
- LINE : 제품이 들어간 공정 LINE 종류 ('T050304', 'T050307', 'T100304', 'T100306', 'T010306', 'T010305' 존재)
- PRODUCT_CODE : 제품의 CODE 번호 ('A_31', 'T_31', 'O_31' 존재)
- X_1 ~ X_2875 : 공정 과정에서 추출되어 비식별화된 변수


### sample_submission.csv [파일] - 제출 양식
- PRODUCT_ID : 제품의 고유 ID
- Y_Class : 예측한 제품 품질 상태
    - 0 : 적정 기준 미달 (부적합)
    - 1 : 적합
    - 2 : 적정 기준 초과 (부적합)

실제 공정 과정에서의 데이터로, 보안상의 이유로 일부 변수가 비식별화 처리 되었습니다. (X변수)
'LINE', 'PRODUCT_CODE'는 Train / Test 모두 동일한 종류가 존재합니다.

## Data Load

In [4]:
train_df = pd.read_csv('/content/drive/MyDrive/datasets/aimers/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/datasets/aimers/test.csv')

In [5]:
train_df.index = pd.to_datetime(train_df["TIMESTAMP"])
test_df.index = pd.to_datetime(test_df["TIMESTAMP"])

train_df.head()

Unnamed: 0_level_0,PRODUCT_ID,Y_Class,Y_Quality,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
TIMESTAMP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-06-13 05:14:00,TRAIN_000,1,0.533433,2022-06-13 5:14,T050304,A_31,,,,,...,39.34,40.89,32.56,34.09,77.77,,,,,
2022-06-13 05:22:00,TRAIN_001,2,0.541819,2022-06-13 5:22,T050307,A_31,,,,,...,38.89,42.82,43.92,35.34,72.55,,,,,
2022-06-13 05:30:00,TRAIN_002,1,0.531267,2022-06-13 5:30,T050304,A_31,,,,,...,39.19,36.65,42.47,36.53,78.35,,,,,
2022-06-13 05:39:00,TRAIN_003,2,0.537325,2022-06-13 5:39,T050307,A_31,,,,,...,37.74,39.17,52.17,30.58,71.78,,,,,
2022-06-13 05:47:00,TRAIN_004,1,0.53159,2022-06-13 5:47,T050304,A_31,,,,,...,38.7,41.89,46.93,33.09,76.97,,,,,


In [6]:
class_0 = train_df[train_df["Y_Class"] == 0]
print("Class 0: ", class_0["Y_Quality"].min(), "~" , class_0["Y_Quality"].max())
class_1 = train_df[train_df["Y_Class"] == 1]
print("Class 1: ", class_1["Y_Quality"].min(), "~" ,  class_1["Y_Quality"].max())
class_2 = train_df[train_df["Y_Class"] == 2]
print("Class 2: ", class_2["Y_Quality"].min(), "~" ,  class_2["Y_Quality"].max())

Class 0:  0.500855556 ~ 0.525066667
Class 1:  0.525085714 ~ 0.534842857
Class 2:  0.534950794 ~ 0.57884127


In [7]:
print(train_df.iloc[:, 6].nunique())
print(train_df.iloc[:, 6].unique())

print(train_df.iloc[:, 2880].nunique())
print(train_df.iloc[:, 2880].unique())

10
[ nan   2.   1.   8. 103.   4.   3.   6.   7.  40.  21.]
0
[nan]


In [8]:
del_list = []
for i in range(6, 2880):
    if train_df.iloc[:, i].nunique() == 0:
        del_list.append(train_df.columns[i])
        
print(del_list)

['X_934', 'X_935', 'X_936', 'X_937', 'X_2628', 'X_2629', 'X_2630', 'X_2631', 'X_2632', 'X_2633', 'X_2634', 'X_2635', 'X_2636', 'X_2637', 'X_2638', 'X_2639', 'X_2640', 'X_2641', 'X_2642', 'X_2643', 'X_2644', 'X_2645', 'X_2646', 'X_2647', 'X_2648', 'X_2649', 'X_2650', 'X_2651', 'X_2652', 'X_2653', 'X_2654', 'X_2655', 'X_2656', 'X_2657', 'X_2658', 'X_2659', 'X_2660', 'X_2661', 'X_2662', 'X_2663', 'X_2664', 'X_2665', 'X_2666', 'X_2667', 'X_2668', 'X_2669', 'X_2670', 'X_2671', 'X_2672', 'X_2673', 'X_2674', 'X_2675', 'X_2676', 'X_2677', 'X_2678', 'X_2679', 'X_2680', 'X_2681', 'X_2682', 'X_2683', 'X_2684', 'X_2685', 'X_2686', 'X_2687', 'X_2688', 'X_2689', 'X_2690', 'X_2691', 'X_2692', 'X_2693', 'X_2694', 'X_2695', 'X_2696', 'X_2697', 'X_2698', 'X_2699', 'X_2838', 'X_2844', 'X_2872', 'X_2873', 'X_2874']


In [9]:
train_df.drop(columns=del_list, inplace=True)
test_df.drop(columns=del_list, inplace=True)

In [10]:
train_x = train_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])
train_y = train_df['Y_Quality']

test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

In [11]:
train_x.head()

Unnamed: 0_level_0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2875
TIMESTAMP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-06-13 05:14:00,T050304,A_31,,,,,,,,,...,383.0,368.296296,353.0,39.34,40.89,32.56,34.09,77.77,,
2022-06-13 05:22:00,T050307,A_31,,,,,,,,,...,383.0,367.735849,353.0,38.89,42.82,43.92,35.34,72.55,,
2022-06-13 05:30:00,T050304,A_31,,,,,,,,,...,383.0,367.320755,353.0,39.19,36.65,42.47,36.53,78.35,,
2022-06-13 05:39:00,T050307,A_31,,,,,,,,,...,384.0,369.188679,353.0,37.74,39.17,52.17,30.58,71.78,,
2022-06-13 05:47:00,T050304,A_31,,,,,,,,,...,383.0,367.351852,352.0,38.7,41.89,46.93,33.09,76.97,,


In [12]:
test_x.head()

Unnamed: 0_level_0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2875
TIMESTAMP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-09-09 02:01:00,T100306,T_31,2.0,94.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,
2022-09-09 02:09:00,T100304,T_31,2.0,93.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
2022-09-09 08:42:00,T100304,T_31,2.0,95.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
2022-09-09 10:56:00,T010305,A_31,,,,,,,,,...,467.0,444.192308,423.0,,,,,,,
2022-09-09 11:04:00,T010306,A_31,,,,,,,,,...,465.0,443.211539,423.0,,,,,,,


## Data Pre-processing

In [13]:
train_x = train_x.fillna(0)
test_x = test_x.fillna(0)

In [14]:
from sklearn.preprocessing import RobustScaler

train_robust = train_x.iloc[:, 2:]

robust_scaler = RobustScaler()
fitted = robust_scaler.fit(train_robust)

train_robust = robust_scaler.transform(train_robust)
train_robust = pd.DataFrame(train_robust, columns=train_x.iloc[:, 2:].columns, index=train_x.index)
train_x_robust = pd.concat([train_x.iloc[:, 0:2], train_robust], axis=1)

train_x_robust.head()

Unnamed: 0_level_0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2875
TIMESTAMP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-06-13 05:14:00,T050304,A_31,-1.0,-0.957895,0.0,-1.0,-1.0,0.0,-0.9,-1.0,...,0.997396,0.997188,1.0,39.34,40.89,32.56,34.09,77.77,0.0,0.0
2022-06-13 05:22:00,T050307,A_31,-1.0,-0.957895,0.0,-1.0,-1.0,0.0,-0.9,-1.0,...,0.997396,0.99567,1.0,38.89,42.82,43.92,35.34,72.55,0.0,0.0
2022-06-13 05:30:00,T050304,A_31,-1.0,-0.957895,0.0,-1.0,-1.0,0.0,-0.9,-1.0,...,0.997396,0.994547,1.0,39.19,36.65,42.47,36.53,78.35,0.0,0.0
2022-06-13 05:39:00,T050307,A_31,-1.0,-0.957895,0.0,-1.0,-1.0,0.0,-0.9,-1.0,...,1.0,0.999604,1.0,37.74,39.17,52.17,30.58,71.78,0.0,0.0
2022-06-13 05:47:00,T050304,A_31,-1.0,-0.957895,0.0,-1.0,-1.0,0.0,-0.9,-1.0,...,0.997396,0.994631,0.997167,38.7,41.89,46.93,33.09,76.97,0.0,0.0


In [15]:
test_df_robust = test_x.iloc[:, 2:]
test_df_robust = robust_scaler.transform(test_df_robust)
test_df_robust = pd.DataFrame(test_df_robust, columns=test_x.iloc[:, 2:].columns, index=test_x.index)

In [16]:
test_x_robust = pd.concat([test_x.iloc[:, 0:2], test_df_robust], axis=1)

test_x_robust.head()

Unnamed: 0_level_0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2875
TIMESTAMP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-09-09 02:01:00,T100306,T_31,0.0,0.031579,0.0,0.0,0.0,0.0,0.12,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-09-09 02:09:00,T100304,T_31,0.0,0.021053,0.0,0.0,0.1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-09-09 08:42:00,T100304,T_31,0.0,0.042105,0.0,0.0,0.1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-09-09 10:56:00,T010305,A_31,-1.0,-0.957895,0.0,-1.0,-1.0,0.0,-0.9,-1.0,...,1.216146,1.202682,1.1983,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-09-09 11:04:00,T010306,A_31,-1.0,-0.957895,0.0,-1.0,-1.0,0.0,-0.9,-1.0,...,1.210938,1.200026,1.1983,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# qualitative to quantitative
qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x[i])
    train_x_robust[i] = le.transform(train_x_robust[i])
    
    for label in np.unique(test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test_x_robust[i] = le.transform(test_x_robust[i]) 
print('Done.')

Done.


## Classification Model Fit

In [18]:
%%time

from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV

clf = XGBRegressor(tree_method="gpu_hist",
                   gpu_id=0,
                   reg_lambda=1,
                   random_state=777)

params = {
    "max_depth": range(5, 11),
    "n_estimators": range(10, 31, 5),
    "learning_rate": [0.0001, 0.001, 0.01, 0.02, 0.03, 0.04, 0.05],
    "max_leaves": range(5, 11, 2),
}

grid = GridSearchCV(estimator=clf, param_grid=params, cv=3)

grid.fit(train_x_robust, train_y)

best_params = grid.best_params_

print(best_params)

{'learning_rate': 0.05, 'max_depth': 7, 'max_leaves': 9, 'n_estimators': 30}
CPU times: user 18min 36s, sys: 2min 27s, total: 21min 4s
Wall time: 20min 50s


In [19]:
clf_best = XGBRegressor(max_depth=best_params["max_depth"], 
                        n_estimators=best_params["n_estimators"], 
                        learning_rate=best_params["learning_rate"],
                        max_leaves=best_params["max_leaves"],
                        reg_lambda=1,
                        random_state=777)

clf_best.fit(train_x_robust, train_y)



XGBRegressor(learning_rate=0.05, max_depth=7, max_leaves=9, n_estimators=30,
             random_state=777)

## Inference

In [20]:
train_preds = clf_best.predict(train_x_robust)

from sklearn.metrics import mean_squared_error

print("RMSE:", np.sqrt(mean_squared_error(train_y, train_preds)))

RMSE: 0.007875723970250109


In [21]:
preds_train = clf_best.predict(train_x_robust)
type(preds_train)

numpy.ndarray

In [22]:
preds_quality = clf_best.predict(test_x_robust)
print('Done.')

Done.


In [23]:
print(preds_quality)

[0.5239108  0.52874017 0.5278667  0.52092814 0.52553767 0.52547884
 0.5266578  0.51884717 0.51917505 0.5276464  0.52541834 0.5273532
 0.5276464  0.5222532  0.5170586  0.52084243 0.5239108  0.5212003
 0.5239108  0.5239108  0.5212003  0.5239108  0.52084243 0.52058387
 0.5212003  0.5212003  0.52355295 0.52084243 0.5212003  0.5239108
 0.5212003  0.5239108  0.52084243 0.52355295 0.52373856 0.5163833
 0.52272826 0.51598096 0.5360281  0.5210363  0.52369225 0.51911974
 0.5198393  0.5232759  0.5235401  0.52373856 0.5235401  0.51922005
 0.5234744  0.51938564 0.5239108  0.51922005 0.5195148  0.51973325
 0.52013874 0.52106977 0.5235401  0.5235401  0.52106977 0.52106977
 0.52106977 0.51835203 0.51975334 0.5219893  0.5174404  0.5166658
 0.5191132  0.5239108  0.52106977 0.52106977 0.52106977 0.5081894
 0.52553767 0.5278667  0.52355295 0.5278667  0.52355295 0.52106977
 0.5334789  0.52410334 0.52874017 0.5239108  0.5278667  0.5278667
 0.52874017 0.5239108  0.51491845 0.52253884 0.51325834 0.52158636
 0

In [24]:
# Class 0:  0.500855556 ~ 0.525066667
# Class 1:  0.525085714 ~ 0.534842857
# Class 2:  0.534950794 ~ 0.57884127

preds = []

for i in range(len(preds_quality)):
  if preds_quality[i] <= 0.525066667:
    preds.append(0)
  elif preds_quality[i] > 0.525066667 and preds_quality[i] <= 0.534842857:
    preds.append(1)
  else:
    preds.append(2)
preds = np.array(preds)
preds

array([0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

## Submit

In [25]:
submit = pd.read_csv('/content/drive/MyDrive/datasets/aimers/sample_submission.csv')

In [26]:
submit['Y_Class'] = preds

In [27]:
submit.to_csv('/content/MyDrive/datasets/aimers/baseline_submission.csv', index=False)

FileNotFoundError: ignored