In [108]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder

In [109]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(37) # Seed 고정

In [110]:
# 파일 읽어오기
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

In [111]:
# 전처리 과정
train_x = train_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])
train_y = train_df['Y_Class']

test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

In [112]:
# qualitative to quantitative
qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])
    
    for label in np.unique(test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test_x[i] = le.transform(test_x[i]) 
print('Done.')

Done.


In [113]:
# 각 라인 별로 각자의 모델을 적용시켜 학습을 시킬 예정
train_x_0 = train_x.loc[train_x.LINE == 0]
train_x_1 = train_x.loc[train_x.LINE == 1]
train_x_2 = train_x.loc[train_x.LINE == 2]
train_x_3 = train_x.loc[train_x.LINE == 3]
train_x_4 = train_x.loc[train_x.LINE == 4]
train_x_5 = train_x.loc[train_x.LINE == 5]

In [114]:
# 각 라인 별로 각자의 모델을 적용시켜 학습을 시킬 예정
train_y_0 = train_y.loc[train_x.LINE == 0]
train_y_1 = train_y.loc[train_x.LINE == 1]
train_y_2 = train_y.loc[train_x.LINE == 2]
train_y_3 = train_y.loc[train_x.LINE == 3]
train_y_4 = train_y.loc[train_x.LINE == 4]
train_y_5 = train_y.loc[train_x.LINE == 5]



In [115]:
# 각 라인 별로 각자의 모델을 적용시켜 학습을 시킬 예정
test_x_0 = test_x.loc[test_x.LINE == 0]
test_x_1 = test_x.loc[test_x.LINE == 1]
test_x_2 = test_x.loc[test_x.LINE == 2]
test_x_3 = test_x.loc[test_x.LINE == 3]
test_x_4 = test_x.loc[test_x.LINE == 4]
test_x_5 = test_x.loc[test_x.LINE == 5]

test_x_0

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
3,0,0,,,,,,,,,...,,,,,,,,,,
6,0,0,,,,,,,,,...,,,,,,,,,,
10,0,0,,,,,,,,,...,,,,,,,,,,
11,0,0,,,,,,,,,...,,,,,,,,,,
38,0,0,,,,,,,,,...,,,,,,,,,,
40,0,0,,,,,,,,,...,,,,,,,,,,
71,0,0,,,,,,,,,...,,,,,,,,,,
86,0,0,,,,,,,,,...,,,,,,,,,,
88,0,0,,,,,,,,,...,,,,,,,,,,
89,0,0,,,,,,,,,...,,,,,,,,,,


In [116]:
# If all columns are nan -> -99 function
def replace_nan_with_value(df):
    df.loc[:, df.isnull().all()] = -99
    return df

#train x replace

replace_nan_with_value(train_x_0)
replace_nan_with_value(train_x_1)
replace_nan_with_value(train_x_2)
replace_nan_with_value(train_x_3)
replace_nan_with_value(train_x_4)
replace_nan_with_value(train_x_5)

# test x replace

replace_nan_with_value(test_x_0)
replace_nan_with_value(test_x_1)
replace_nan_with_value(test_x_2)
replace_nan_with_value(test_x_3)
replace_nan_with_value(test_x_4)
replace_nan_with_value(test_x_5)


# replace empty space is 0
train_x_0.fillna(0)
train_x_1.fillna(0)
train_x_2.fillna(0)
train_x_3.fillna(0)
train_x_4.fillna(0)
train_x_5.fillna(0)

test_x_0.fillna(0)
test_x_1.fillna(0)
test_x_2.fillna(0)
test_x_3.fillna(0)
test_x_4.fillna(0)
test_x_5.fillna(0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, df.isnull().all()] = -99


Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,5,2,2.0,94.0,0.0,45.0,10.0,0.0,51.0,10.0,...,-99,-99,-99,-99,-99,-99,-99,-99,-99,-99
16,5,2,2.0,93.0,0.0,45.0,10.0,0.0,54.0,10.0,...,-99,-99,-99,-99,-99,-99,-99,-99,-99,-99
18,5,2,2.0,96.0,0.0,45.0,10.0,0.0,51.0,10.0,...,-99,-99,-99,-99,-99,-99,-99,-99,-99,-99
19,5,2,2.0,87.0,0.0,45.0,10.0,0.0,51.0,10.0,...,-99,-99,-99,-99,-99,-99,-99,-99,-99,-99
21,5,2,2.0,96.0,0.0,45.0,10.0,0.0,51.0,10.0,...,-99,-99,-99,-99,-99,-99,-99,-99,-99,-99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301,5,2,2.0,94.0,0.0,45.0,10.0,0.0,51.0,10.0,...,-99,-99,-99,-99,-99,-99,-99,-99,-99,-99
305,5,2,2.0,91.0,0.0,45.0,10.0,0.0,51.0,10.0,...,-99,-99,-99,-99,-99,-99,-99,-99,-99,-99
307,5,2,2.0,91.0,0.0,45.0,10.0,0.0,50.0,10.0,...,-99,-99,-99,-99,-99,-99,-99,-99,-99,-99
308,5,2,2.0,95.0,0.0,45.0,10.0,0.0,51.0,10.0,...,-99,-99,-99,-99,-99,-99,-99,-99,-99,-99


In [128]:
# construct model
import lightgbm as lgb

#random forest use
#rf = RandomForestClassifier(n_estimators=500)
# gradient boost use
#gb = GradientBoostingClassifier(n_estimators=500)
# ada boost use
#ab =AdaBoostClassifier(n_estimators=500)
# lgbm use
#def lgbm_use(x_train, y_train):
    #lgbm = lightgbm.LGBMClassifier(num_leaves=50000, objective='binary')
    #return lgbm
    
# classifers = {}    

#'''
#lgbm0 = lightgbm.LGBMClassifier(num_leaves=50000, objective='multiclass')
#lgbm1 = lightgbm.LGBMClassifier(num_leaves=50000, objective='multiclass')
#lgbm2 = lightgbm.LGBMClassifier(num_leaves=50000, objective='multiclass')
#lgbm3 = lightgbm.LGBMClassifier(num_leaves=50000, objective='multiclass')
#lgbm4 = lightgbm.LGBMClassifier(num_leaves=50000, objective='multiclass')
#lgbm5 = lightgbm.LGBMClassifier(num_leaves=50000, objective='multiclass')'''


#'''train_data = lgb.Dataset(X_train, label=y_train)
#test_data = lgb.Dataset(X_test, label=y_test)'''

In [129]:
#split df
from sklearn.model_selection import train_test_split

X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(train_x_0, train_y_0, test_size=0.2, random_state=156)
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(train_x_1, train_y_1, test_size=0.2, random_state=156)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(train_x_2, train_y_2, test_size=0.2, random_state=156)
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(train_x_3, train_y_3, test_size=0.2, random_state=156)
X_train_4, X_test_4, y_train_4, y_test_4 = train_test_split(train_x_4, train_y_4, test_size=0.2, random_state=156)
X_train_5, X_test_5, y_train_5, y_test_5 = train_test_split(train_x_5, train_y_5, test_size=0.2, random_state=156)


train_data0 = lgb.Dataset(X_train_0, label=y_train_0)
test_data0 = lgb.Dataset(X_test_0, label=y_test_0)

train_data1 = lgb.Dataset(X_train_1, label=y_train_1)
test_data1 = lgb.Dataset(X_test_1, label=y_test_1)

train_data2 = lgb.Dataset(X_train_2, label=y_train_2)
test_data2 = lgb.Dataset(X_test_2, label=y_test_2)

train_data3 = lgb.Dataset(X_train_3, label=y_train_3)
test_data3 = lgb.Dataset(X_test_3, label=y_test_3)

train_data4 = lgb.Dataset(X_train_4, label=y_train_4)
test_data4 = lgb.Dataset(X_test_4, label=y_test_4)

train_data5 = lgb.Dataset(X_train_5, label=y_train_5)
test_data5 = lgb.Dataset(X_test_5, label=y_test_5)

train_data0 = pd.DataFrame(train_data0)
test_data0 = pd.DataFrame(test_data0)

train_data1 = pd.DataFrame(train_data1)
test_data1 = pd.DataFrame(test_data1)

train_data2 = pd.DataFrame(train_data2)
test_data2 = pd.DataFrame(test_data2)

train_data3 = pd.DataFrame(train_data3)
test_data3 = pd.DataFrame(test_data3)

train_data4 = pd.DataFrame(train_data4)
test_data4 = pd.DataFrame(test_data4)

train_data5 = pd.DataFrame(train_data5)
test_data5 = pd.DataFrame(test_data5)

ValueError: DataFrame constructor not properly called!

In [125]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, roc_auc_score
from sklearn.metrics import confusion_matrix

#def get_clf_eval(y_test, y_pred=None, pred_proba=None):
    #confusion = confusion_matrix(y_test, y_pred)
    #accuracy = accuracy_score(y_test, y_pred)
    #precision = precision_score(y_test, y_pred, average='micro')
    #recall = recall_score(y_test, y_pred, average='micro')
    #F1 = f1_score(y_test, y_pred, average = 'micro')
    #AUC = roc_auc_score(y_test, y_pred, pred_proba)

    #print('오차행렬:\n', confusion)
    #print('\n정확도: {:.4f}'.format(accuracy))
    #print('정밀도: {:.4f}'.format(precision))
    #print('재현율: {:.4f}'.format(recall))
    #print('F1: {:.4f}'.format(F1))
    #print('AUC: {:.4f}'.format(AUC))


In [126]:
# 파라미터 할당 
params = {'objective': 'multiclass',
          'metric': 'multi_logloss',
          'num_class': 3,
          'num_leaves': 31,
          'learning_rate': 0.05}

#train_data = lgbm0.Dataset(X_train_0, label=y_train_0)
#test_data = lgbm0.Dataset(X_test_0, label=y_test_0)

In [133]:
from lightgbm import plot_importance
import matplotlib.pyplot as plt

#line  fit
eval0 = [(train_data0, test_data0)]
eval1 = [(train_data1, test_data1)]
eval2 = [(train_data2, test_data2)]
eval3 = [(train_data3, test_data3)]
eval4 = [(train_data4, test_data4)]
eval5 = [(train_data5, test_data5)]

# num_rounds 할당 
num_rounds = 100

# train(params, train_data, num_rounds, valid_sets=[train_data, test_data], early_stopping_rounds=10)
model0 = lgb.train(params, train_data0, num_rounds, valid_sets = [(train_data0, test_data0)] ,early_stopping_rounds = 100)
model1 = lgb.train(params, train_data1, num_rounds, valid_sets = [(train_data1, test_data1)] ,early_stopping_rounds = 100)
model2 = lgb.train(params, train_data2, num_rounds, valid_sets = [(train_data2, test_data2)] ,early_stopping_rounds = 100)
model3 = lgb.train(params, train_data3, num_rounds, valid_sets = [(train_data3, test_data3)] ,early_stopping_rounds = 100)
model4 = lgb.train(params, train_data4, num_rounds, valid_sets = [(train_data4, test_data4)] ,early_stopping_rounds = 100)
model5 = lgb.train(params, train_data5, num_rounds, valid_sets = [(train_data5, test_data5)] ,early_stopping_rounds = 100)

#model0 = lgbm0.train(params ,X_train_0, y_train_0, early_stopping_rounds=100, eval_metric='multi_logloss', eval_set=eval0, verbose=True)
#model1 = lgbm1.fit(X_train_1, y_train_1, early_stopping_rounds=100, eval_metric='multi_logloss', eval_set=eval1, verbose=True)
#model2 = lgbm2.fit(X_train_2, y_train_2, early_stopping_rounds=100, eval_metric='multi_logloss', eval_set=eval2, verbose=True)
#model3 = lgbm3.fit(X_train_3, y_train_3, early_stopping_rounds=100, eval_metric='multi_logloss', eval_set=eval3, verbose=True)
#model4 = lgbm4.fit(X_train_4, y_train_4, early_stopping_rounds=100, eval_metric='multi_logloss', eval_set=eval4, verbose=True)
#model5 = lgbm5.fit(X_train_4, y_train_4, early_stopping_rounds=100, eval_metric='multi_logloss', eval_set=eval4, verbose=True)




# model5.fit(X_train_5, y_train_5, early_stopping_rounds=100, eval_metric='multi_logloss', eval_set=eval5, verbose=True)


TypeError: Training only accepts Dataset object

In [None]:
pred0 = lgbm0.predict(X_test_0)
pred1 = lgbm1.predict(X_test_1)
pred2 = lgbm2.predict(X_test_2)
pred3 = lgbm3.predict(X_test_3)
pred4 = lgbm4.predict(X_test_4)
pred5 = lgbm5.predict(X_test_5)

# 질문.. 
#pred_proba0 = lgbm.predict_proba(X_test_0)[:6]
#pred_proba1 = lgbm.predict_proba(X_test_1)[:6]
#pred_proba2 = lgbm.predict_proba(X_test_2)[:6]
#pred_proba3 = lgbm.predict_proba(X_test_3)[:6]
#pred_proba4 = lgbm.predict_proba(X_test_4)[:6]
#pred_proba5 = lgbm.predict_proba(X_test_5)[:6]

# 모델 예측
prediction0 = model0.predict(X_test_0)
prediction1 = model1.predict(X_test_1)
prediction2 = model2.predict(X_test_2)
prediction3 = model3.predict(X_test_3)
prediction4 = model4.predict(X_test_4)
prediction5 = model5.predict(X_test_5)


pred_class0 = [max(enumerate(prob), key=lambda x:x[1])[0] for prob in prediction0]
pred_class1 = [max(enumerate(prob), key=lambda x:x[1])[0] for prob in prediction1]
pred_class2 = [max(enumerate(prob), key=lambda x:x[1])[0] for prob in prediction2]
pred_class3 = [max(enumerate(prob), key=lambda x:x[1])[0] for prob in prediction3]
pred_class4 = [max(enumerate(prob), key=lambda x:x[1])[0] for prob in prediction4]
pred_class5 = [max(enumerate(prob), key=lambda x:x[1])[0] for prob in prediction5]


# 모델 평가
print('0번라인')
print('F1 score:', f1_score(y_test_0, pred_class0, average='macro'))
print('1번라인')
print('F1 score:', f1_score(y_test_1, pred_class1, average='macro'))
print('2번라인')
print('F1 score:', f1_score(y_test_2, pred_class2, average='macro'))
print('3번라인')
print('F1 score:', f1_score(y_test_3, pred_class3, average='macro'))
print('4번라인')
print('F1 score:', f1_score(y_test_4, pred_class4, average='macro'))
print('5번라인')
print('F1 score:', f1_score(y_test_5, pred_class5, average='macro'))


# get_clf_eval(y_test_0, pred0, pred_proba0)



In [None]:
# 데이터프레임 합치기 


In [57]:
submit0 = pd.read_csv('./sample_submission0.csv')
submit1 = pd.read_csv('./sample_submission1.csv')
submit2 = pd.read_csv('./sample_submission2.csv')
submit3 = pd.read_csv('./sample_submission3.csv')
submit4 = pd.read_csv('./sample_submission4.csv')
submit5 = pd.read_csv('./sample_submission5.csv')


In [58]:
submit0['Y_Class'] = pred0
submit1['Y_Class'] = pred1
submit2['Y_Class'] = pred2
submit3['Y_Class'] = pred3
submit4['Y_Class'] = pred4
submit5['Y_Class'] = pred5

ValueError: Length of values (12) does not match length of index (310)

In [None]:
submit = pd.concat([pred0,pred1,pred2,pred3,pred4,pred5])

In [None]:
submit.to_csv('./submission_0218.csv', index=False)

In [42]:
y_test_0

299    2
436    1
220    1
190    1
59     2
455    1
553    0
57     2
40     1
52     2
41     1
312    0
Name: Y_Class, dtype: int64

In [130]:
train_data0

<lightgbm.basic.Dataset at 0x2abdcb7b550>