*참고자료*
- https://nurilee.com/2020/04/03/lightgbm-definition-parameter-tuning/
- https://towardsdatascience.com/understanding-lightgbm-parameters-and-how-to-tune-them-6764e20c6e5b

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb


# 시각화
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import font_manager, rc
sns.set(font="THEGaeideuk",rc={"axes.unicode_minus":False},style='white')

## 데이터 로드

In [2]:
err=pd.read_csv('./data/train_err_data.csv')
quality=pd.read_csv('./data/train_quality_data.csv')
problem=pd.read_csv('./data/train_problem_data.csv')
test_err=pd.read_csv('./data/test_err_data.csv')
test_quality=pd.read_csv('./data/test_quality_data.csv')

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# data 타입 맞추기
err['errtype']=err['errtype'].astype('str')
test_err['errtype']=test_err['errtype'].astype('int')
test_err['errtype']=test_err['errtype'].astype('str')

In [4]:
# train, test set 만들기
train=[i for i in range(10000,25000)]
test=[i for i in range(30000,44999)]
train=pd.DataFrame(train,columns=['user_id'])
test=pd.DataFrame(test,columns=['user_id'])

In [5]:
err.isna().sum()

user_id     0
time        0
model_nm    0
fwver       0
errtype     0
errcode     1
dtype: int64

In [6]:
test_err.isna().sum()

user_id     0
time        0
model_nm    0
fwver       0
errtype     0
errcode     4
dtype: int64

In [7]:
err.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16554663 entries, 0 to 16554662
Data columns (total 6 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   user_id   int64 
 1   time      int64 
 2   model_nm  object
 3   fwver     object
 4   errtype   object
 5   errcode   object
dtypes: int64(2), object(4)
memory usage: 757.8+ MB


In [8]:
test_err.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16532648 entries, 0 to 16532647
Data columns (total 6 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   user_id   int64 
 1   time      int64 
 2   model_nm  object
 3   fwver     object
 4   errtype   object
 5   errcode   object
dtypes: int64(2), object(4)
memory usage: 756.8+ MB


In [9]:
err

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode
0,10000,20201101025616,model_3,05.15.2138,15,1
1,10000,20201101030309,model_3,05.15.2138,12,1
2,10000,20201101030309,model_3,05.15.2138,11,1
3,10000,20201101050514,model_3,05.15.2138,16,1
4,10000,20201101050515,model_3,05.15.2138,4,0
...,...,...,...,...,...,...
16554658,24999,20201130163051,model_3,05.15.2138,15,1
16554659,24999,20201130172625,model_3,05.15.2138,16,1
16554660,24999,20201130172625,model_3,05.15.2138,4,0
16554661,24999,20201130172631,model_3,05.15.2138,4,0


In [10]:
test_err

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode
0,30000,20201101030227,model_1,04.16.3553,31,1
1,30000,20201101030227,model_1,04.16.3553,33,2
2,30000,20201101030228,model_1,04.16.3553,15,1
3,30000,20201101030256,model_1,04.16.3553,22,1
4,30000,20201101030300,model_1,04.16.3553,11,1
...,...,...,...,...,...,...
16532643,44998,20201130210050,model_1,04.16.3553,40,0
16532644,44998,20201130211831,model_1,04.16.3553,31,1
16532645,44998,20201130211832,model_1,04.16.3553,15,1
16532646,44998,20201130212259,model_1,04.16.3553,16,1


<br><br>

## Feature Engineering

### model컬럼

In [12]:
# train
model_col=pd.get_dummies(err[['user_id','model_nm']],columns=['model_nm']).groupby(['user_id']).sum()
model_col=(model_col>0).reset_index()
train=pd.merge(train,model_col)

# test
test_err.loc[~test_err['model_nm'].isin(err['model_nm'].unique()),'model_nm']=np.nan
model_col=pd.get_dummies(test_err[['user_id','model_nm']],columns=['model_nm']).groupby(['user_id']).sum()
model_col=(model_col>0).reset_index()
test=pd.merge(test,model_col,how='left',on='user_id')

<br>

### fwver

In [13]:
# train
fwver_col=pd.get_dummies(err[['user_id','fwver']],columns=['fwver']).groupby(['user_id']).sum()
fwver_col=(fwver_col>0).reset_index()
train=pd.merge(train,fwver_col)

# test
test_err.loc[~test_err['fwver'].isin(err['fwver'].unique()),'fwver']=np.nan
fwver_col=pd.get_dummies(test_err[['user_id','fwver']],columns=['fwver']).groupby(['user_id']).sum()
fwver_col=(fwver_col>0).reset_index()
test=pd.merge(test,fwver_col,how='left',on='user_id')

<br>

### errtype

In [14]:
# train
errtype_col=pd.get_dummies(err[['user_id','errtype']],columns=['errtype']).groupby(['user_id']).sum()
errtype_col=(errtype_col>0).reset_index()
train=pd.merge(train,errtype_col)

# test
test_err.loc[~test_err['errtype'].isin(err['errtype'].unique()),'errtype']=np.nan
errtype_col=pd.get_dummies(test_err[['user_id','errtype']],columns=['errtype']).groupby(['user_id']).sum()
errtype_col=(errtype_col>0).reset_index()
test=pd.merge(test,errtype_col,how='left',on='user_id')

<br>

### errcode

In [15]:
# train
err_val=err['errcode'].value_counts(normalize=True).cumsum()[:20].index
err.loc[~(err['errcode'].isin(err_val).values),'errcode']='etc'

In [16]:
# test
test_err.loc[~(test_err['errcode'].isin(err_val).values),'errcode']='etc'

In [17]:
# train
errcode_col=pd.get_dummies(err[['user_id','errcode']],columns=['errcode']).groupby(['user_id']).sum()
errcode_col=(errcode_col>0).reset_index()
train=pd.merge(train,errcode_col)

# test
test_err.loc[~test_err['errcode'].isin(err['errcode'].unique()),'errcode']=np.nan
errcode_col=pd.get_dummies(test_err[['user_id','errcode']],columns=['errcode']).groupby(['user_id']).sum()
errcode_col=(errcode_col>0).reset_index()
test=pd.merge(test,errcode_col,how='left',on='user_id')

<br>

### problem

In [18]:
prob_col=np.array(np.zeros(15000))
prob_col[problem['user_id'].unique()-10000]=1

In [19]:
prob_col

array([0., 1., 0., ..., 1., 1., 0.])

<br>

### 컬럼 정리

In [24]:
# 없는 행이 하나 존재
test.isna().sum()

user_id                                 0
model_nm_model_0                        1
model_nm_model_1                        1
model_nm_model_2                        1
model_nm_model_3                        1
                                       ..
errcode_active                          1
errcode_connection fail to establish    1
errcode_connection timeout              1
errcode_etc                             1
errcode_standby                         1
Length: 103, dtype: int64

In [26]:
# nan값은 False로 채워 넣는다
test=test.fillna(False)

In [29]:
# test set에 없는 컬럼은 False로 설정
for col in train.columns:
    if not col in test.columns:
        print(col)
        test[col]=False

fwver_04.16.2641
fwver_04.16.3345
fwver_04.22.1442
fwver_04.33.1095
fwver_05.15.2090
fwver_05.15.2122


In [30]:
# 순서 맞추기
test=test[train.columns]

<br><br>

## Modeling

In [31]:
X=train.loc[:,'model_nm_model_0':'errcode_standby']
y=prob_col
X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=0.75,random_state=2020)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(11250, 108)
(3750, 108)
(11250,)
(3750,)


In [32]:
X_train

Unnamed: 0,model_nm_model_0,model_nm_model_1,model_nm_model_2,model_nm_model_3,model_nm_model_4,model_nm_model_5,model_nm_model_6,model_nm_model_7,model_nm_model_8,fwver_03.11.1141,...,errcode_94,errcode_95,errcode_B-A8002,errcode_NFANDROID2,errcode_S-61001,errcode_active,errcode_connection fail to establish,errcode_connection timeout,errcode_etc,errcode_standby
11512,False,False,True,True,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,True,True
1565,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,True
3925,True,False,False,False,False,False,False,False,False,False,...,False,False,True,True,False,False,False,True,True,False
7524,False,False,True,False,False,False,False,False,False,False,...,False,False,True,True,False,True,False,True,True,True
811,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11971,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
14966,False,True,False,False,False,False,False,False,False,False,...,False,False,True,True,True,False,True,True,True,False
7491,False,False,True,False,False,False,False,False,False,False,...,False,False,True,True,False,True,False,True,True,True
12680,False,False,True,False,False,False,False,False,False,False,...,False,False,False,True,False,True,False,False,True,True


In [52]:
light_gbm=lgb.Dataset(X_train, label=y_train)

In [234]:
params={'learning_rate':0.01,'boosting_type':'dart','objective':'binary','metric':'binary_logloss','num_leaves': 40,
        'max_depth':30,'feature_fraction': 0.6,'subsample': 0.7}
clf = lgb.train(params, light_gbm, 4000)

[LightGBM] [Info] Number of positive: 5000, number of negative: 10000
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 15000, number of used features: 86
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333333 -> initscore=-0.693147
[LightGBM] [Info] Start training from score -0.693147


In [235]:
y_pred=clf.predict(X_train)
y_pred[y_pred>0.5]=1
y_pred[y_pred<=0.5]=0
accuracy_score(y_pred,y_train)

0.8206222222222223

In [236]:
y_pred=clf.predict(X_test)
y_pred[y_pred>0.5]=1
y_pred[y_pred<=0.5]=0

In [237]:
sum(y_pred==1)

762

In [238]:
accuracy_score(y_pred,y_test)

0.8216

<br>

## Submission

In [40]:
test

Unnamed: 0,user_id,model_nm_model_0,model_nm_model_1,model_nm_model_2,model_nm_model_3,model_nm_model_4,model_nm_model_5,model_nm_model_6,model_nm_model_7,model_nm_model_8,...,errcode_94,errcode_95,errcode_B-A8002,errcode_NFANDROID2,errcode_S-61001,errcode_active,errcode_connection fail to establish,errcode_connection timeout,errcode_etc,errcode_standby
0,30000,False,True,True,False,False,False,False,False,False,...,False,False,True,False,True,True,True,True,True,True
1,30001,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2,30002,True,False,False,False,False,False,False,False,False,...,False,False,True,True,False,True,False,True,True,True
3,30003,True,False,False,False,False,False,False,False,False,...,False,False,True,True,False,True,False,True,True,False
4,30004,False,True,True,False,False,False,False,False,False,...,False,False,False,True,True,True,False,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14994,44994,False,True,False,False,False,False,False,False,False,...,False,False,False,True,True,False,True,True,True,False
14995,44995,True,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,True,False
14996,44996,True,False,False,False,False,False,False,False,False,...,False,False,False,True,False,True,True,True,True,True
14997,44997,True,False,False,False,False,False,False,False,False,...,False,False,True,True,False,True,True,True,True,True


In [41]:
test.loc[:,'model_nm_model_0':'errcode_standby']

Unnamed: 0,model_nm_model_0,model_nm_model_1,model_nm_model_2,model_nm_model_3,model_nm_model_4,model_nm_model_5,model_nm_model_6,model_nm_model_7,model_nm_model_8,fwver_03.11.1141,...,errcode_94,errcode_95,errcode_B-A8002,errcode_NFANDROID2,errcode_S-61001,errcode_active,errcode_connection fail to establish,errcode_connection timeout,errcode_etc,errcode_standby
0,False,True,True,False,False,False,False,False,False,False,...,False,False,True,False,True,True,True,True,True,True
1,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2,True,False,False,False,False,False,False,False,False,False,...,False,False,True,True,False,True,False,True,True,True
3,True,False,False,False,False,False,False,False,False,False,...,False,False,True,True,False,True,False,True,True,False
4,False,True,True,False,False,False,False,False,False,False,...,False,False,False,True,True,True,False,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14994,False,True,False,False,False,False,False,False,False,False,...,False,False,False,True,True,False,True,True,True,False
14995,True,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,True,False
14996,True,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,True,True,True,True,True
14997,True,False,False,False,False,False,False,False,False,False,...,False,False,True,True,False,True,True,True,True,True


In [241]:
light_gbm=lgb.Dataset(X, label=y)

In [242]:
clf = lgb.train(params, light_gbm, 4000)

[LightGBM] [Info] Number of positive: 5000, number of negative: 10000
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 15000, number of used features: 86
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333333 -> initscore=-0.693147
[LightGBM] [Info] Start training from score -0.693147


In [243]:
y_pred=clf.predict(test.loc[:,'model_nm_model_0':'errcode_standby'])

In [244]:
submission=pd.read_csv('./data/sample_submission.csv')

In [245]:
submission['problem']=y_pred

In [246]:
submission.to_csv('./result/submission ver1.0.2.csv',index=False)