# 데이터 불러오기

In [None]:
import os
import pandas as pd 
import numpy as np
import multiprocessing # 여러 개의 일꾼 (cpu)들에게 작업을 분산시키는 역할
from multiprocessing import Pool 
from functools import partial # 함수가 받는 인자들 중 몇개를 고정 시켜서 새롭게 파생된 함수를 형성하는 역할
from data_loader import data_loader # 자체적으로 만든 data loader version 2.0 ([데이콘 15회 대회] 데이터 설명 및 데이터 불러오기 영상 참조)
from tqdm import tqdm
import xgboost as xgb

from sklearn.ensemble import RandomForestClassifier
import joblib # 모델을 저장하고 불러오는 역할

In [19]:
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

In [None]:
def data_loader_all(func, files, folder='', train_label=None, event_time=10, nrows=60):
    """Load every file in ``files`` in parallel and stack the results into one DataFrame.

    Parameters
    ----------
    func : callable
        Per-file loader. It is called as
        ``func(file, folder=..., train_label=..., event_time=..., nrows=...)`` and must
        return an object ``pd.concat`` accepts. It runs in worker processes, so it has
        to be picklable (e.g. a function imported from a module).
    files : iterable
        File names handed to ``func`` one at a time.
    folder, train_label, event_time, nrows
        Forwarded unchanged to every ``func`` call.

    Returns
    -------
    pandas.DataFrame
        The per-file results concatenated in the order of ``files``.

    Raises
    ------
    ValueError
        If ``files`` is empty.
    """
    files = list(files)
    if not files:
        raise ValueError("data_loader_all: 'files' is empty, nothing to load")

    func_fixed = partial(func, folder=folder, train_label=train_label,
                         event_time=event_time, nrows=nrows)

    # The former `if __name__ == '__main__'` guard lived inside this function: when it
    # was false, `df_list` was never assigned and the concat below failed with an
    # unrelated NameError. Scripts that need such a guard should put it at the call site.
    # The context manager shuts the pool down even if a worker raises.
    with Pool(processes=multiprocessing.cpu_count()) as pool:
        # imap yields results in input order, which lets tqdm show per-file progress.
        df_list = list(tqdm(pool.imap(func_fixed, files), total=len(files)))

    return pd.concat(df_list)

In [3]:
# Locations of the competition data, relative to the working directory.
train_folder, test_folder = 'train/', 'test/'
train_label_path = 'train_label.csv'

# File names found in each data folder.
train_list = os.listdir(train_folder)
test_list = os.listdir(test_folder)

# The label file is read twice: `label` keeps the default index,
# `train_label` uses the first CSV column as its index.
label = pd.read_csv(train_label_path)
train_label = pd.read_csv(train_label_path, index_col=0)

In [5]:
# Load additional data file 02.
# NOTE(review): absolute Google Drive mount path; it only resolves on a machine with
# the same mount, so consider a configurable data directory.
df = pd.read_csv('/Volumes/GoogleDrive/내 드라이브/Dacon/additional_data/additional_data_02.csv')

In [20]:
# Load additional data file 01.
# NOTE(review): absolute Google Drive mount path; it only resolves on a machine with
# the same mount, so consider a configurable data directory.
df1 = pd.read_csv('/Volumes/GoogleDrive/내 드라이브/Dacon/additional_data/additional_data_01.csv')

In [14]:
# Show up to 50 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 50)

In [21]:
# Stack the two additional-data frames row-wise. ignore_index is not set, so each
# frame keeps its own row labels and index values may repeat in the result.
df_con = pd.concat([df,df1])

In [62]:
# Check whether column V0050 contains any truthy value; the scalar result is wrapped
# in a one-element Series only for printing.
print(pd.Series(df['V0050'].any()))

0    True
dtype: bool


In [43]:
# List the distinct values present in column V0021 of the combined frame.
df_con['V0021'].unique()

array(['Bad'], dtype=object)

In [24]:
# Collect the distinct values of every string-typed column of the combined frame.
# The dtype check has to inspect df_con itself: the earlier version tested `train[i]`,
# a different frame that is only built further down the notebook and is not guaranteed
# to share df_con's columns.
list1 = [
    df_con[col].unique()
    for col in df_con.columns
    if is_string_dtype(df_con[col])
]

In [27]:
# Display the unique-value arrays collected for the string-typed columns.
list1

[array(['Equip Fail'], dtype=object),
 array(['Bad'], dtype=object),
 array(['Bad'], dtype=object),
 array(['Equip Fail'], dtype=object),
 array(['Bad'], dtype=object),
 array(['Bad'], dtype=object),
 array(['Equip Fail'], dtype=object),
 array(['Bad'], dtype=object),
 array(['Bad'], dtype=object),
 array(['Equip Fail'], dtype=object),
 array(['Bad'], dtype=object),
 array(['Bad'], dtype=object),
 array(['Bad'], dtype=object),
 array(['Bad'], dtype=object),
 array(['Bad'], dtype=object),
 array(['Bad'], dtype=object),
 array(['OFF'], dtype=object),
 array(['OFF'], dtype=object),
 array(['OFF'], dtype=object),
 array(['OFF'], dtype=object),
 array(['ON'], dtype=object),
 array(['ON'], dtype=object),
 array(['ON'], dtype=object),
 array(['ON'], dtype=object),
 array(['OFF'], dtype=object),
 array(['OFF'], dtype=object),
 array(['OFF'], dtype=object),
 array(['OFF'], dtype=object),
 array(['OFF'], dtype=object),
 array(['OFF'], dtype=object),
 array(['OFF'], dtype=object),
 array(['OFF'],

In [35]:
# Preview the combined frame with every "No Data" entry replaced by 0.
# The result is only displayed, not assigned, so df_con itself stays unchanged.
df_con.replace(to_replace ="No Data", 
                 value =0)

Unnamed: 0,time,V0000,V0001,V0002,V0003,V0004,V0005,V0006,V0007,V0008,V0009,V0010,V0011,V0012,V0013,V0014,V0015,V0016,V0017,V0018,V0019,V0020,V0021,V0022,V0023,...,V5096,V5097,V5098,V5099,V5100,V5101,V5102,V5103,V5104,V5105,V5106,V5107,V5108,V5109,V5110,V5111,V5112,V5113,V5114,V5115,V5116,V5117,V5118,V5119,V5120
0,0,34.676640,8.606974,7.275319,7.644240,8.811648,99.048312,37.217275,-0.159304,0.005494,-0.224119,0.56689,-0.257085,1.028314,271.546848,271.766592,274.227552,272.909160,25.627500,38.023006,Equip Fail,Bad,Bad,Equip Fail,Bad,...,OFF,OFF,ON,OFF,OFF,OFF,ON,ON,ON,ON,ON,ON,OFF,ON,ON,ON,ON,ON,ON,0,117.5079,123.2137,0.106897,0,-0.019608
1,1,34.676640,8.606974,7.275319,7.644240,8.811648,99.048312,37.217275,-0.159304,0.005494,-0.224119,0.56689,-0.257085,1.028314,271.546848,271.766592,274.227552,272.909160,25.627500,38.023006,Equip Fail,Bad,Bad,Equip Fail,Bad,...,OFF,OFF,ON,OFF,OFF,OFF,ON,ON,ON,ON,ON,ON,OFF,ON,ON,ON,ON,ON,ON,0,117.5079,123.2137,0.106897,0,-0.019608
2,2,34.676640,8.617082,7.275319,7.593703,8.771220,84.348648,119.680704,-0.159304,0.005494,-0.224119,0.56689,-0.257085,1.028314,271.546848,272.206080,274.754880,274.623048,25.711877,38.195186,Equip Fail,Bad,Bad,Equip Fail,Bad,...,OFF,OFF,ON,OFF,OFF,OFF,ON,ON,ON,ON,ON,ON,OFF,ON,ON,ON,ON,ON,ON,0,117.3061,123.2127,0.106897,0,-0.020000
3,3,34.676640,8.617082,7.275319,7.593703,8.771220,84.348648,119.680704,-0.159304,0.005494,-0.224119,0.56689,-0.257085,1.028314,271.546848,272.206080,274.754880,274.623048,25.711877,38.195186,Equip Fail,Bad,Bad,Equip Fail,Bad,...,OFF,OFF,ON,OFF,OFF,OFF,ON,ON,ON,ON,ON,ON,OFF,ON,ON,ON,ON,ON,ON,0,117.3061,123.2127,0.106897,0,-0.020000
4,4,34.676640,8.617082,7.275319,7.593703,8.771220,84.348648,119.680704,-0.159304,0.005494,-0.224119,0.56689,-0.257085,1.028314,271.546848,272.206080,274.754880,274.623048,25.711877,38.195186,Equip Fail,Bad,Bad,Equip Fail,Bad,...,OFF,OFF,ON,OFF,OFF,OFF,ON,ON,ON,ON,ON,ON,OFF,ON,ON,ON,ON,ON,ON,0,117.3061,123.2127,0.106897,0,-0.020000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,595,32.850338,8.273434,7.047905,7.826177,8.773747,119.482920,25.912346,-0.131843,0.005494,-0.191165,0.56689,-0.257085,1.028314,275.106456,274.930632,274.886712,278.622072,25.824377,38.314735,Equip Fail,Bad,Bad,Equip Fail,Bad,...,OFF,OFF,ON,OFF,OFF,OFF,ON,ON,ON,ON,ON,ON,OFF,ON,ON,ON,ON,ON,ON,0,122.8576,116.3505,0.137021,0,0.020939
596,596,32.850338,8.273434,7.047905,7.826177,8.773747,119.482920,25.912346,-0.131843,0.005494,-0.191165,0.56689,-0.257085,1.028314,275.106456,274.930632,274.886712,278.622072,25.824377,38.314735,Equip Fail,Bad,Bad,Equip Fail,Bad,...,OFF,OFF,ON,OFF,OFF,OFF,ON,ON,ON,ON,ON,ON,OFF,ON,ON,ON,ON,ON,ON,0,122.8576,116.3505,0.137021,0,0.020939
597,597,32.850338,8.273434,7.047905,7.826177,8.773747,119.482920,25.912346,-0.131843,0.005494,-0.191165,0.56689,-0.257085,1.028314,274.095720,275.941440,274.623048,276.908184,25.824377,38.314735,Equip Fail,Bad,Bad,Equip Fail,Bad,...,OFF,OFF,ON,OFF,OFF,OFF,ON,ON,ON,ON,ON,ON,OFF,ON,ON,ON,ON,ON,ON,0,122.1104,116.3505,0.137021,0,0.020939
598,598,32.850338,8.361871,7.052959,7.891877,8.756057,67.836175,43.446535,-0.131843,0.005494,-0.191165,0.56689,-0.257085,1.028314,274.095720,275.941440,274.623048,276.908184,25.869377,38.314735,Equip Fail,Bad,Bad,Equip Fail,Bad,...,OFF,OFF,ON,OFF,OFF,OFF,ON,ON,ON,ON,ON,ON,OFF,ON,ON,ON,ON,ON,ON,0,122.1104,116.3505,0.137021,0,0.020939


In [4]:
# Summarise the frame: row count, column count, dtype breakdown and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Columns: 5122 entries, time to V5120
dtypes: float64(2497), int64(1599), object(1026)
memory usage: 23.4+ MB


In [4]:
# Read the label file again (same file name as train_label_path).
label = pd.read_csv('train_label.csv')
# Build the training frame: every file in train_list is loaded in parallel with
# event_time=10 and nrows=60, then concatenated.
train = data_loader_all(data_loader, train_list, folder=train_folder, train_label=train_label, event_time=10, nrows=60)

100%|████████████████████████████████████████████████████████████████████████████████| 828/828 [01:01<00:00, 13.44it/s]


In [5]:
# Build the validation frame from the SAME training files, loaded with different
# settings (event_time=60, nrows=110) than the training frame.
validation = data_loader_all(data_loader, train_list, folder=train_folder, train_label=train_label, event_time=60, nrows=110)

100%|████████████████████████████████████████████████████████████████████████████████| 828/828 [01:20<00:00, 10.26it/s]


In [2]:
# Display limits for DataFrame output: up to 999 rows and 30 columns.
# This replaces the column limit of 50 set in an earlier cell.
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 30)

In [19]:
# 실험 데이터 생성
train1 = train.copy()
train1 = train1.reset_index()
#train1.to_csv('train1.csv')

In [30]:
# Preview the first five rows of the experimental frame.
train1.head()

Unnamed: 0,id,V0000,V0001,V0002,V0003,V0004,V0005,V0006,V0007,V0008,...,V5112,V5113,V5114,V5115,V5116,V5117,V5118,V5119,V5120,label
0,0,30.474394,8.691177,8.714483,8.687399,8.72123,207.697895,165.86573,-6.018876999999999e-19,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,1.42162e-05,85.4,0.0,110
1,0,30.470463,8.736521,8.682769,8.717135,8.682402,192.66508,191.006871,-3.9187579999999997e-19,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,-6.114455e-06,85.4,0.0,110
2,0,30.465427,8.753559,8.663426,8.700049,8.734147,187.065171,192.700238,-1.7991789999999997e-19,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,-1.813291e-05,85.4,0.0,110
3,0,30.458532,8.715056,8.714854,8.717174,8.699257,188.500036,180.150567,-6.636970999999999e-19,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,-5.745568e-07,85.4,0.0,110
4,0,30.475773,8.790241,8.735125,8.703167,8.72103,193.269046,195.98489,-6.379752e-20,0.0,...,1.0,1.0,1.0,60.0,0.0,0.0,8.437883e-06,85.4,0.0,110


In [29]:
# Move the current index into a column, then relabel a column called 'index' as 'id'
# (the rename has no effect if no such column exists).
# NOTE(review): this cell rebinds train1 from itself, so re-running it resets the
# index again — not idempotent; confirm it is run exactly once.
train1 = train1.reset_index()
train1.rename(columns = {'index':'id'},inplace=True)

In [3]:
# Reload the experimental frame from disk.
# NOTE(review): presumably written by the to_csv call that is commented out in an
# earlier cell — confirm the file exists before running.
train1 = pd.read_csv('train1.csv')

In [5]:
# Remove the 'Unnamed: 0' column in place.
# NOTE(review): presumably the old index written by to_csv and read back as data — verify.
train1.drop(columns=['Unnamed: 0'],inplace=True)

# 상태 변하는 시간 찾아보기 
A에서 B로 넘어가는 상태 변화는 보통 0~15초 사이에 발생한다. 따라서 0~15초 사이에서 상태가 변하는 기준을 찾아보는 것이 중요할 듯하다. (영상에 따르면 모든 경우가 같은 시간에 변하는 것은 아니라고 한다.) data_loader를 약간 바꿔줄 필요가 있다.

In [0]:
# Names of every column except the target 'label' (about 5,000 columns).
columns_list = list(train1.drop(columns=['label']).columns)

In [0]:
# Number of non-label columns.
len(columns_list)

5121

In [0]:
# Data without changes.
# NOTE(review): this cell and the next both inspect the name `df`, yet their recorded
# shapes differ, so `df` was rebound between the two runs.
df.shape

(600, 5122)

In [0]:
# Data with changes.
# NOTE(review): same name `df` as the previous cell but a different recorded shape;
# the code that rebinds `df` in between is not part of this notebook.
df.shape

(600, 2310)

# XGBoost

In [None]:
# Separate features from the 'label' target for both frames.
X_train, y_train = train.drop(columns=['label']), train['label']
X_test, y_test = validation.drop(columns=['label']), validation['label']

# Fit a gradient-boosted multi-class classifier with a fixed seed and 16 threads.
model = xgb.XGBClassifier(
    random_state=0,
    nthread=16,
    objective='multi:softmax',
)
model.fit(X_train, y_train)

In [None]:
# Load the test files with the same settings as the training frame
# (event_time=10, nrows=60); no labels are passed for the test set.
test = data_loader_all(data_loader, test_list, folder=test_folder, train_label=None, event_time=10, nrows=60)

In [None]:
# Per-row class probabilities for the test frame.
pred = model.predict_proba(test)

In [None]:
# Turn the per-row probabilities into one row per id:
# label the rows with the test frame's index, name that index 'id',
# then average all rows that share an id.
submission = pd.DataFrame(data=pred)
submission.index = test.index
submission.index.name = 'id'
submission = submission.sort_index()
submission = submission.groupby('id').mean()
submission.to_csv('submission.csv', index=True) # create the submission file

In [None]:
# Per-row class probabilities for the validation features.
# This rebinds `pred`, which the test-set cell also uses.
pred = model.predict_proba(X_test)

In [None]:
# Average the validation predictions per id.
# `pred` was computed from X_test (the validation features), so its rows are labelled
# with X_test.index; the earlier version borrowed train.index, which belongs to a
# different frame and is only correct if both frames happen to line up row for row.
submission = pd.DataFrame(data=pred)
submission.index = X_test.index
submission.index.name = 'id'
submission = submission.sort_index()
submission = submission.groupby('id').mean()
# NOTE(review): this writes validation predictions to the same 'submission.csv' that
# the test-set cells write — confirm the overwrite is intended.
submission.to_csv('submission.csv', index=True)

In [None]:
# Display the per-id averaged probabilities.
submission

In [None]:
# Multi-class log loss of the per-row validation probabilities against the true labels.
# NOTE(review): this import is repeated in several cells; it belongs with the imports at the top.
from sklearn.metrics import log_loss
score = log_loss(y_test, pred)

In [74]:
# Display the validation log loss.
score

1.3218847071159865

In [15]:
# Multi-class log loss of the current `pred` against the validation labels.
# NOTE(review): duplicate of an earlier cell; which model produced `pred` depends on execution order.
from sklearn.metrics import log_loss
score = log_loss(y_test, pred)

In [18]:
# Display the validation log loss.
score

0.43245245928564985

In [21]:
# Multi-class log loss of the current `pred` against the validation labels.
# NOTE(review): duplicate of an earlier cell; which model produced `pred` depends on execution order.
from sklearn.metrics import log_loss
score = log_loss(y_test, pred)

In [22]:
# Display the validation log loss.
score

0.4270724833887654

In [24]:
# Display the per-id averaged probabilities (one column per class).
submission

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,188,189,190,191,192,193,194,195,196,197
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,4.507019e-07,4.087269e-07,3.758185e-07,5.392422e-07,3.171123e-07,3.129179e-07,3.191492e-07,3.136805e-07,4.586709e-07,4.378315e-07,...,3.502794e-07,3.592490e-07,3.480287e-07,3.375774e-07,2.599553e-07,5.032601e-07,7.652320e-07,1.146020e-06,3.539243e-07,2.016041e-06
1,2.692105e-06,2.861458e-06,3.017929e-06,3.244982e-06,2.019443e-06,2.116202e-06,2.231221e-06,2.049049e-06,5.533273e-06,2.807979e-06,...,2.327211e-06,2.589135e-06,2.312258e-06,2.273074e-06,1.715711e-06,2.908921e-06,3.733378e-06,3.132263e-05,5.390675e-06,7.076001e-06
2,7.298408e-07,6.744295e-07,6.076032e-07,9.174294e-07,5.190665e-07,5.313883e-07,5.269019e-07,5.094844e-07,7.630091e-07,8.073993e-07,...,5.812057e-07,5.950117e-07,5.774711e-07,5.566259e-07,4.301535e-07,7.943490e-07,8.970331e-07,1.798920e-06,6.204431e-07,8.601844e-06
3,7.714874e-07,8.903140e-07,7.925829e-07,8.755113e-07,6.743670e-07,6.836639e-07,7.358963e-07,6.570506e-07,9.433689e-07,1.175744e-06,...,7.742998e-07,7.925508e-07,7.693249e-07,7.560594e-07,5.770975e-07,8.283041e-07,1.146402e-06,3.544222e-06,2.640491e-06,3.343330e-06
4,2.672857e-06,2.274533e-06,2.027922e-06,2.831356e-06,1.761138e-06,1.741411e-06,1.806767e-06,1.683395e-06,2.180410e-06,2.369462e-06,...,1.906468e-06,1.951472e-06,1.894217e-06,1.842582e-06,1.461056e-06,2.793690e-06,2.692748e-06,7.559773e-06,1.814465e-06,5.298099e-06
5,2.261869e-07,3.070070e-07,2.106828e-07,2.947638e-07,1.865377e-07,2.007515e-07,1.972797e-07,1.811004e-07,3.143352e-07,2.882294e-07,...,2.127986e-07,2.177263e-07,2.114312e-07,2.046383e-07,1.570796e-07,2.578010e-07,3.339925e-07,1.232254e-06,8.689900e-07,7.058857e-07
6,8.968984e-05,8.950728e-05,7.916387e-05,1.102650e-04,7.106535e-05,7.106065e-05,7.707703e-05,6.911505e-05,9.112755e-05,1.077284e-04,...,8.005096e-05,8.184245e-05,7.953659e-05,7.690871e-05,5.932820e-05,7.700540e-05,1.126510e-04,1.176293e-03,1.674994e-03,6.842408e-04
7,8.979509e-07,8.085588e-07,8.991451e-07,8.303820e-07,6.492611e-07,6.094909e-07,6.888521e-07,6.699821e-07,5.018669e-07,1.942795e-05,...,4.478671e-07,4.587384e-07,4.449892e-07,4.305039e-07,3.301849e-07,4.610373e-07,7.541486e-07,1.810118e-06,4.210030e-07,2.036839e-06
8,7.264042e-07,7.785479e-07,6.696986e-07,9.326331e-07,5.947670e-07,5.889329e-07,6.432813e-07,5.770304e-07,8.019881e-07,8.989645e-07,...,6.766932e-07,6.922193e-07,6.723449e-07,6.499387e-07,4.988844e-07,6.477093e-07,8.504556e-07,1.281089e-04,1.217626e-06,6.202409e-06
9,1.615867e-07,1.919077e-07,1.554838e-07,1.809480e-07,1.265445e-07,1.255452e-07,1.394263e-07,1.204367e-07,1.860754e-07,2.044672e-07,...,1.419362e-07,1.453421e-07,1.409547e-07,1.385445e-07,1.054481e-07,1.527435e-07,2.165562e-07,4.080360e-07,1.490885e-07,7.831189e-07


# Random Forest

In [15]:
# Separate features from the 'label' target for both frames.
X_train, y_train = train.drop(columns=['label']), train['label']
X_test, y_test = validation.drop(columns=['label']), validation['label']

# Random forest: 2000 trees, 600 candidate features per split, fixed seed,
# progress logging enabled, n_jobs=-1 for parallel fitting.
model = RandomForestClassifier(
    n_estimators=2000,
    max_features=600,
    random_state=0,
    verbose=1,
    n_jobs=-1,
)
model.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed: 21.3min
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed: 52.1min
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed: 95.0min
[Parallel(n_jobs=-1)]: Done 1218 tasks      | elapsed: 149.2min
[Parallel(n_jobs=-1)]: Done 1768 tasks      | elapsed: 216.8min
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed: 245.6min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features=600, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=2000,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=1,
                       warm_start=False)

In [16]:
# Per-row class probabilities of the random forest on the validation features.
pred = model.predict_proba(X_test)

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    2.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:   13.5s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:   32.5s
[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:   58.7s
[Parallel(n_jobs=16)]: Done 1218 tasks      | elapsed:  1.5min
[Parallel(n_jobs=16)]: Done 1768 tasks      | elapsed:  2.2min
[Parallel(n_jobs=16)]: Done 2000 out of 2000 | elapsed:  2.5min finished


In [17]:
# Multi-class log loss of the random-forest validation probabilities.
# NOTE(review): this import is repeated in several cells; it belongs with the imports at the top.
from sklearn.metrics import log_loss
score = log_loss(y_test, pred)

In [18]:
# Display the random-forest validation log loss.
score

0.6086889896400293

In [26]:
# Load the test files with the same settings as the training frame
# (event_time=10, nrows=60); no labels are passed for the test set.
test = data_loader_all(data_loader, test_list, folder=test_folder, train_label=None, event_time=10, nrows=60)

100%|████████████████████████████████████████████████████████████████████████████████| 720/720 [00:55<00:00, 13.03it/s]


In [19]:
# Predict class probabilities for every test row, then collapse them to one row per id:
# label the rows with the test frame's index, name that index 'id',
# and average all rows that share an id.
pred = model.predict_proba(test)
submission = pd.DataFrame(data=pred)
submission.index = test.index
submission.index.name = 'id'
submission = submission.sort_index()
submission = submission.groupby('id').mean()
submission.to_csv('submission.csv', index=True) # create the submission file

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    1.7s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:   11.9s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:   28.7s
[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:   51.7s
[Parallel(n_jobs=16)]: Done 1218 tasks      | elapsed:  1.4min
[Parallel(n_jobs=16)]: Done 1768 tasks      | elapsed:  2.0min
[Parallel(n_jobs=16)]: Done 2000 out of 2000 | elapsed:  2.2min finished


In [19]:
# Candidate hyperparameter values for a random-forest search.

# Tree counts: ten evenly spaced values from 200 to 2000.
n_estimators = list(map(int, np.linspace(start=200, stop=2000, num=10)))

# Strategies for how many features are considered at each split.
max_features = ['auto', 'sqrt']

# Depth limits: eleven evenly spaced values from 10 to 110, followed by None.
max_depth = [*map(int, np.linspace(10, 110, num=11)), None]

# Minimum samples needed to split an internal node / to remain in a leaf.
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]

# Whether each tree is trained on a bootstrap sample.
bootstrap = [True, False]

In [26]:
# Bundle the candidate lists into one search grid keyed by estimator parameter name.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [14]:
# Number of distinct target labels in the training set.
len(y_train.unique())

198

# LightGBM

In [9]:
from lightgbm import LGBMClassifier

In [9]:
import lightgbm as lgb

In [7]:
# Separate features from the 'label' target for both frames.
X_train, y_train = train.drop(columns=['label']), train['label']
X_test, y_test = validation.drop(columns=['label']), validation['label']

# train_ds = lgb.Dataset(X_train, label=y_train)
# test_ds = lgb.Dataset(X_test, label=y_test)


In [34]:
# LightGBM training parameters for the multi-class model.
params = dict(
    # Task definition: gradient-boosted trees, multi-class objective, log-loss metric.
    boosting_type='gbdt',
    objective='multiclass',
    metric='multi_logloss',
    # 198 equals the count of distinct training labels reported earlier in the notebook.
    num_class=198,
    # Learning rate and tree-shape limits.
    learning_rate=0.05,
    num_leaves=1000,
    max_depth=10,
    min_child_samples=100,
    min_child_weight=0,
    min_split_gain=0,
    # Histogram binning.
    max_bin=120,
    subsample_for_bin=200000,
    # Row / column subsampling (row subsampling re-drawn every iteration).
    subsample=0.7,
    subsample_freq=1,
    colsample_bytree=0.7,
    # Regularisation switched off.
    reg_alpha=0,
    reg_lambda=0,
    # Logging level and seed.
    verbose=0,
    seed=40,
    # GPU training on platform 0 / device 0.
    device='gpu',
    gpu_platform_id=0,
    gpu_device_id=0,
)

In [35]:
# Wrap the training and validation splits in LightGBM Dataset objects.
dtrain = lgb.Dataset(X_train, label=y_train)
dvalid = lgb.Dataset(X_test, label=y_test)

In [None]:
# Dict handed to lgb.train as evals_result, to receive the recorded metric history.
evals_results = {}
print("Training the model...")
# Train for at most 1000 boosting rounds, evaluating on both the training and the
# validation set; training stops once the validation score has not improved for
# 30 rounds. No custom evaluation function is used (feval=None).
# NOTE(review): evals_result / early_stopping_rounds / verbose_eval are keyword
# arguments of older LightGBM releases; newer releases expect callbacks instead —
# confirm against the installed version before re-running.
lgb_model = lgb.train(params, 
                 dtrain, 
                 valid_sets=[dtrain, dvalid], 
                 valid_names=['train','valid'], 
                 evals_result=evals_results, 
                 num_boost_round=1000,
                 early_stopping_rounds=30,
                 verbose_eval=True, 
                 feval=None)

Training the model...
[1]	train's multi_logloss: 4.75204	valid's multi_logloss: 4.755
Training until validation scores don't improve for 30 rounds
[2]	train's multi_logloss: 4.70976	valid's multi_logloss: 4.7162
[3]	train's multi_logloss: 4.66933	valid's multi_logloss: 4.67703
[4]	train's multi_logloss: 4.63043	valid's multi_logloss: 4.64189
[5]	train's multi_logloss: 4.59356	valid's multi_logloss: 4.60734
[6]	train's multi_logloss: 4.55802	valid's multi_logloss: 4.57411
[7]	train's multi_logloss: 4.52316	valid's multi_logloss: 4.5415
[8]	train's multi_logloss: 4.49062	valid's multi_logloss: 4.51094
[9]	train's multi_logloss: 4.45902	valid's multi_logloss: 4.48166
[10]	train's multi_logloss: 4.42886	valid's multi_logloss: 4.45312
[11]	train's multi_logloss: 4.39902	valid's multi_logloss: 4.42468
[12]	train's multi_logloss: 4.37082	valid's multi_logloss: 4.39907
[13]	train's multi_logloss: 4.34271	valid's multi_logloss: 4.37281
[14]	train's multi_logloss: 4.31575	valid's multi_logloss: 

In [None]:
#model = LGBMClassifier(n_jobs=-1,learning_rate = 0.01, n_estimators = 2000, objective = 'multiclass'
                      # ,num_leaves = 80, max_bin = 540)
#model.fit(X_train, y_train)

In [None]:
#pred = lgb_model.predict_proba(X_test)

In [31]:
# Predict with the trained LightGBM booster for every test row, then collapse the
# predictions to one row per id: label the rows with the test frame's index, name
# that index 'id', and average all rows that share an id.
pred = lgb_model.predict(test)
submission = pd.DataFrame(data=pred)
submission.index = test.index
submission.index.name = 'id'
submission = submission.sort_index()
submission = submission.groupby('id').mean()
submission.to_csv('submission.csv', index=True) # create the submission file