In [None]:
import pandas as pd
import random
import os
import numpy as np

import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

import lightgbm 
from lightgbm import LGBMClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, roc_auc_score
from sklearn.metrics import confusion_matrix
from lightgbm import plot_importance
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [2]:
def seed_everything(seed):
    """Seed the random-number sources used by this notebook.

    Stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable and
    seeds both the stdlib ``random`` module and NumPy's global generator.

    Parameters
    ----------
    seed : int
        Value handed to every seeding call.
    """
    os.environ.update(PYTHONHASHSEED=str(seed))
    for reseed in (random.seed, np.random.seed):
        reseed(seed)


seed_everything(37)  # fix the seed for the whole run

In [3]:
# Load the training and test tables from the working directory.
train_df, test_df = map(pd.read_csv, ('./train.csv', './test.csv'))

In [4]:
# Preprocessing: separate features from identifiers and targets.
id_columns = ['PRODUCT_ID', 'TIMESTAMP']          # dropped from both tables
target_columns = ['Y_Class', 'Y_Quality']         # only present in the training table

train_y = train_df['Y_Class']                     # classification target
train_x = train_df.drop(columns=id_columns + target_columns)

test_x = test_df.drop(columns=id_columns)

In [5]:
# Replace every missing value with 0 in both feature tables.
train_x, test_x = (frame.fillna(0) for frame in (train_x, test_x))

In [6]:
# qualitative to quantitative: integer-encode the categorical columns.
qual_col = ['LINE', 'PRODUCT_CODE']

for col in qual_col:
    # Fit on the training values only, then encode the training column.
    le = LabelEncoder().fit(train_x[col])
    train_x[col] = le.transform(train_x[col])

    # Any label that occurs only in the test column is appended to the
    # encoder's known classes so that transform() does not reject it.
    for label in np.unique(test_x[col]):
        if label in le.classes_:
            continue
        le.classes_ = np.append(le.classes_, label)
    test_x[col] = le.transform(test_x[col])
print('Done.')

Done.


In [7]:
# Preprocessing - oversample the training data with SMOTE (the original
# author marks this step as essential).
# NOTE(review): the resampling is applied before the validation split and the
# cross-validated searches in later cells, so resampled rows can land in
# validation folds - confirm this is intended.
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=777)

resampled = smote.fit_resample(train_x, train_y)
X_train_over, y_train_over = resampled

# Shapes before / after resampling, then the class balance after resampling.
label_counts = pd.Series(y_train_over).value_counts()
print('SMOTE 적용 전 학습용 피처/레이블 데이터 세트: ', train_x.shape, train_y.shape)
print('SMOTE 적용 후 학습용 피처/레이블 데이터 세트: ', X_train_over.shape, y_train_over.shape)
print('SMOTE 적용 후 레이블 값 분포: \n', label_counts)

SMOTE 적용 전 학습용 피처/레이블 데이터 세트:  (598, 2877) (598,)
SMOTE 적용 후 학습용 피처/레이블 데이터 세트:  (1221, 2877) (1221,)
SMOTE 적용 후 레이블 값 분포: 
 1    407
2    407
0    407
Name: Y_Class, dtype: int64


In [8]:
# Plan: train a separate model per production line, so slice the resampled
# training features by encoded LINE value (0 through 5).
(
    X_train_over_0,
    X_train_over_1,
    X_train_over_2,
    X_train_over_3,
    X_train_over_4,
    X_train_over_5,
) = (X_train_over.loc[X_train_over.LINE == line_id] for line_id in range(6))

In [9]:
# Plan: train a separate model per production line, so slice the resampled
# labels with the same LINE masks used for the features (0 through 5).
(
    y_train_over_0,
    y_train_over_1,
    y_train_over_2,
    y_train_over_3,
    y_train_over_4,
    y_train_over_5,
) = (y_train_over.loc[X_train_over.LINE == line_id] for line_id in range(6))

In [10]:
# Plan: apply a separate model per production line, so slice the test
# features by encoded LINE value (0 through 5).
(
    test_x_0,
    test_x_1,
    test_x_2,
    test_x_3,
    test_x_4,
    test_x_5,
) = (test_x.loc[test_x.LINE == line_id] for line_id in range(6))

# Needed later to sort the rows back into ascending order.
#sliced_df_with_index0 = test_x_0.reset_index()
#sliced_df_with_index0

In [11]:
# Hold out 30% of the line-0 subset (shuffled, fixed random_state) for validation.
line0_split = train_test_split(
    X_train_over_0,
    y_train_over_0,
    test_size=0.3,
    shuffle=True,
    random_state=777,
)
x0_train, x0_valid, y0_train, y0_valid = line0_split

# Validation labels as a plain array.
y0_true = y0_valid.values

In [12]:
# Missing-value check
def check_missing_col(dataframe):
    """Report every column of ``dataframe`` that contains missing values.

    For each column with at least one missing entry, prints the column name
    and how many entries are missing. If no column has missing entries, prints
    a single message saying so.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    list
        One ``[column_name, dtype]`` pair per column with missing values, in
        column order; empty when nothing is missing.
    """
    missing_col = []
    # One vectorised pass over the whole frame instead of summing each column
    # element by element with the builtin sum().
    missing_counts = dataframe.isna().sum()
    for (col, missing_values), dtype in zip(missing_counts.items(), dataframe.dtypes):
        if missing_values >= 1:
            print(f'결측치가 있는 컬럼은: {col}입니다')
            print(f'해당 컬럼에 총 {missing_values}개의 결측치가 존재합니다.')
            missing_col.append([col, dtype])
    if not missing_col:
        print('결측치가 존재하지 않습니다')
    return missing_col

# Run the missing-value check on the SMOTE-resampled training features and
# keep the returned [column, dtype] list in missing_col_x_train.
missing_col_x_train = check_missing_col(X_train_over)

결측치가 존재하지 않습니다


In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.model_selection import GridSearchCV

In [14]:
# Create the list that will hold the candidate models.
clfs = []

In [15]:
# Instantiate each candidate model with its default settings ...
rf = RandomForestClassifier()
gbc = GradientBoostingClassifier()
etc = ExtraTreesClassifier()
xgb = XGBClassifier()
gbm = GradientBoostingClassifier()  # same estimator class as `gbc`
lgb = LGBMClassifier()
cat = CatBoostClassifier()

# ... and add them to the model list in the same order.
clfs.extend([rf, gbc, etc, xgb, gbm, lgb, cat])

In [16]:
# Create the list that will hold the models' parameter grids.
# NOTE(review): nothing in this cell appends to `params` (the appends below are
# commented out), so it stays empty and does not line up with the models
# collected in `clfs`.
params = []

# Add the parameter grids to compare to the `params` list.
#params_rf = {'n_estimators' : [90, 100, 110, 120],
            #'min_samples_split' : [2,3,4]}
#params.append(params_rf)

#params_etc = {'n_estimators' : [50,60,70,80,90,100,110,120,130,140,150]}
#params.append(params_etc)

# XGB
param_xgb = {
    "max_depth": [10, 30, 50],
    "min_child_weight": [1, 3, 6, 10],
    "n_estimators": [200, 300, 500, 1000],
}

# LGB
# "learning_rate" used to be listed twice with the same values; a repeated
# dict key silently overrides the earlier one, so it is declared once here.
param_lgb = {
    "learning_rate": [0.01, 0.1, 0.2, 0.3, 0.4, 0.5],
    "max_depth": [25, 50, 75],
    "num_leaves": [100, 300, 500, 900, 1200],
    "n_estimators": [100, 200, 300, 500, 800, 1000],
}

# GBM
param_gbm = {
    "max_depth": [4, 5, 6, 7, 8, 9, 10],
    "learning_rate": [0.01, 0.1, 0.2, 0.3, 0.4, 0.5],
    "n_estimators": [100, 200, 300, 500],
}

# CAT
param_cat = {
    "depth": [6, 4, 5, 7, 8, 9, 10],
    "iterations": [250, 100, 500, 1000],
    "learning_rate": [0.001, 0.01, 0.1, 0.2, 0.3],
    "l2_leaf_reg": [2, 5, 10, 20, 30],
    "border_count": [254],
}



In [None]:
# Options shared by all four searches: accuracy scoring, 10-fold CV,
# refit=True, n_jobs=-1 and verbose=2.
search_options = dict(scoring='accuracy', cv=10, refit=True, n_jobs=-1, verbose=2)

# One grid search per model, each with its own parameter grid.
gscv_xgb = GridSearchCV(estimator=xgb, param_grid=param_xgb, **search_options)
gscv_lgb = GridSearchCV(estimator=lgb, param_grid=param_lgb, **search_options)
gscv_gbm = GridSearchCV(estimator=gbm, param_grid=param_gbm, **search_options)
gscv_cat = GridSearchCV(estimator=cat, param_grid=param_cat, **search_options)

# Fit the searches one after another on the SMOTE-resampled training data.
for search in (gscv_xgb, gscv_lgb, gscv_gbm, gscv_cat):
    search.fit(X_train_over, y_train_over)


Fitting 10 folds for each of 48 candidates, totalling 480 fits


In [None]:
#GridSearchCV 를 이용해 모델들을 최적화시켜줍니다.
from tqdm.auto import tqdm
def gridSearchCV(models, params, X=None, y=None):
    """Grid-search every model against its own parameter grid.

    ``models[i]`` is tuned with ``params[i]`` through a 5-fold
    ``GridSearchCV`` (``n_jobs=10``, ``verbose=1``), and the best estimator of
    each search is collected.

    Parameters
    ----------
    models : sequence
        Estimators to tune.
    params : sequence
        Parameter grids, one per model and in the same order as ``models``.
    X, y : optional
        Training features and labels. When omitted, the notebook-level
        ``X_train_over`` / ``y_train_over`` are used, as before.

    Returns
    -------
    list
        ``best_estimator_`` of each search, in the order of ``models``.

    Raises
    ------
    ValueError
        If ``models`` and ``params`` differ in length. This is checked before
        any fitting starts, so a mismatch cannot surface only after some of the
        searches have already run.
    """
    if len(models) != len(params):
        raise ValueError(
            "gridSearchCV needs one parameter grid per model: "
            f"got {len(models)} models and {len(params)} grids"
        )

    # Fall back to the notebook globals only when no data was passed in.
    features = X_train_over if X is None else X
    labels = y_train_over if y is None else y

    best_models = []
    for model, grid in tqdm(zip(models, params), total=len(models)):
        model_grid = GridSearchCV(model, grid, n_jobs=10, verbose=1, cv=5)
        model_grid.fit(features, labels)
        best_models.append(model_grid.best_estimator_)
    return best_models

# NOTE(review): after a top-to-bottom run `params` is still an empty list while
# `clfs` holds seven models, so the models cannot be paired with grids and this
# call fails. Fill `params` with one grid per model (same order as `clfs`)
# before running it.
best_model_list = gridSearchCV(clfs,params)

In [None]:
# Show the models tuned by GridSearchCV.
best_model_list

In [None]:
# Print the best parameter combination and best score of each grid search.
search_results = [
    ('XGB', gscv_xgb),
    ('LGB', gscv_lgb),
    ('GBM', gscv_gbm),
    ('CAT', gscv_cat),
]
# `gscv_lreg` is not created by any visible cell; referencing it directly
# raised NameError and aborted the whole report. It is included only when
# something has actually defined it.
if 'gscv_lreg' in globals():
    search_results.append(('Lreg', globals()['gscv_lreg']))

print("="*30)
for label, search in search_results:
    print(f'{label} 파라미터: ', search.best_params_)
    print('{} 예측 정확도: {:.4f}'.format(label, search.best_score_))
    print("="*30)

In [None]:
# Load the ensemble class.
from sklearn.ensemble import VotingClassifier

# Estimators for the ensemble. The hyper-parameters are written out by hand
# here; the original note says they came from the GridSearchCV tuning.
tuned_rf = RandomForestClassifier(min_samples_split=3, n_estimators=90)
tuned_gbr = GradientBoostingClassifier(learning_rate=0.06, n_estimators=60)
tuned_et = ExtraTreesClassifier(n_estimators=140)
best_models = [('rf', tuned_rf), ('GBR', tuned_gbr), ('ET', tuned_et)]

# Train the hard-voting (majority-vote) ensemble.
# NOTE(review): this fits on train_x / train_y, not on the SMOTE-resampled
# X_train_over / y_train_over used by the searches above - confirm intended.
voting_clf = VotingClassifier(estimators=best_models, voting='hard')
voting_clf.fit(train_x, train_y)

In [None]:
# Predict a class for every row of the encoded test features with the voting ensemble.
predictions = voting_clf.predict(test_x)

In [None]:
# Load the sample submission file that will receive the predictions.
submit = pd.read_csv('./sample_submission0.csv')

In [None]:
# Store the predictions in the Y_Class column and write the submission CSV
# without the index column.
# NOTE(review): assumes the rows of the sample submission are in the same order as test_x - confirm.
submit['Y_Class'] = predictions
submit.to_csv("submission.csv",index = False)