## 라이브러리

In [1]:
# GPU 메모리 할당 문제
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

In [47]:
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb

import numpy as np
import random
import gc
from tqdm import tqdm
from glob import glob

import xgboost as xgb
import catboost as cb

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU
from tensorflow.keras.layers import Dropout
from tensorflow.keras import optimizers
from tensorflow.keras.layers import Activation, Convolution2D, MaxPooling2D, BatchNormalization, Flatten, Dense, Dropout, Conv2D,MaxPool2D, ZeroPadding2D
from tensorflow.keras.layers import LeakyReLU

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

from sklearn.metrics import log_loss
from statsmodels.tsa.seasonal import seasonal_decompose
from scipy.stats import skew, kurtosis
from sklearn.manifold import TSNE
import missingno

## 데이터 전처리

In [64]:
# 데이터 불러오기

path = './data/'
train = pd.read_csv(path + 'train_features.csv')
train_label = pd.read_csv(path + 'train_labels.csv')
test = pd.read_csv(path + 'test_features.csv')
submission = pd.read_csv(path + 'sample_submission.csv')

In [65]:
# Pre-Processing Effect on the Accuracy of Event-Based Activity Segmentation and Classification through Inertial Sensors 
# https://www.researchgate.net/publication/281836367_Pre-Processing_Effect_on_the_Accuracy_of_Event-Based_Activity_Segmentation_and_Classification_through_Inertial_Sensors

train['acc_t']  = train.apply(lambda x : (x['acc_x']**2 + x['acc_y'] **2 +  x['acc_z'] ** 2 )**(1/3), axis=1)
test['acc_t']  = test.apply(lambda x : (x['acc_x']**2 + x['acc_y'] **2 +  x['acc_z'] ** 2 )**(1/3), axis=1)

train['gy_t']  = train.apply(lambda x : (x['gy_x']**2 + x['gy_y'] **2 +  x['gy_z'] ** 2 )**(1/3), axis=1)
test['gy_t']  = test.apply(lambda x : (x['gy_x']**2 + x['gy_y'] **2 +  x['gy_z'] ** 2 )**(1/3), axis=1)

In [5]:
# https://dacon.io/competitions/official/235689/codeshare/2375?page=1&dtype=recent&ptype=pub
# 간단한 LGBM으로 접근해보기

X_pivot_train = pd.pivot_table(data = train, # X_train의 데이터를 통해서
                               values = train.columns[2:],  # id와 time을 제외한 피쳐를 대상으로
                               index = 'id', # id를 기준으로 잡아
                               aggfunc = ['sum','mean',         # 합, 평균
                                          'median','min','max', # 중앙값 최소값, 최대값
                                          'std','var'           # 베셀 보정 표본 표준편차, 비편향 편차 의 값을 구합니다.
                                         ]
                              )

X_pivot_test = pd.pivot_table(data = test, #
                               values = test.columns[2:], 
                               index = 'id', # id를 기준으로 잡아
                               aggfunc = ['sum','mean',        
                                          'median','min','max',
                                          'std','var'          
                                         ]
                              )

In [6]:
X_columns = [agg + '_' + column for agg,column in X_pivot_train.columns]
X_pivot_train.columns = X_columns
X_pivot_test.columns = X_columns
X_pivot_train = X_pivot_train.reset_index()
X_pivot_test = X_pivot_test.reset_index()

train_data = pd.merge(X_pivot_train, train_label.loc[:,['id','label']], on='id') # label_desc는 사용하지 않을 예정입니다.
train_data.label = train_data.label.astype('category')

train_data.shape

Unnamed: 0,id,sum_acc_t,sum_acc_x,sum_acc_y,sum_acc_z,sum_gy_t,sum_gy_x,sum_gy_y,sum_gy_z,mean_acc_t,...,std_gy_z,var_acc_t,var_acc_x,var_acc_y,var_acc_z,var_gy_t,var_gy_x,var_gy_y,var_gy_z,label
0,0,619.171101,558.797337,-131.082711,-222.252919,5815.160027,-1119.161589,-2015.703683,709.264425,1.031952,...,25.275185,0.012741,0.036664,0.031375,0.018260,16.353172,176.470384,590.513292,638.834979,37
1,1,632.688598,-459.948117,-190.354639,-2.534051,12858.851545,6642.960123,1044.284884,835.976169,1.054481,...,75.545343,0.037258,0.245548,0.113175,0.249396,151.398806,6279.700472,9217.015511,5707.098884,26
2,2,593.195479,23.901616,-49.441742,375.607013,12864.850286,-5083.770868,358.725917,1831.974458,0.988659,...,13.920337,0.007896,0.506904,0.021646,0.061905,81.812660,646.325142,14150.683677,193.775778,3
3,3,585.741919,-532.621192,-52.600737,136.413976,7807.353545,10646.500409,2880.558352,-3521.938833,0.976237,...,23.647153,0.004687,0.017134,0.037639,0.042387,43.389223,1842.887012,1365.558625,559.187841,26
4,4,637.338408,-395.410844,-202.240064,121.654507,8067.185605,-2891.782899,5791.027696,2672.029417,1.062231,...,46.148326,0.031108,0.245193,0.325247,0.151824,221.765323,11719.982095,3662.008463,2129.668017,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3120,3120,588.797343,-180.272174,-401.525652,201.560530,5731.683639,-3229.789337,-2941.679051,-32.415360,0.981329,...,24.913819,0.006641,0.162550,0.035006,0.117825,67.859710,1324.702298,1457.382016,620.698354,26
3121,3121,604.650076,-584.578686,-140.023548,-44.262826,5027.134552,6836.985564,2272.105392,-1675.342583,1.007750,...,12.786464,0.011917,0.028887,0.013767,0.007417,38.327494,1253.062926,238.087738,163.493653,26
3122,3122,720.129478,-668.547732,-217.317785,144.910824,25470.832381,-2292.798232,6103.301689,-1033.698245,1.200216,...,131.916609,0.088040,0.467567,0.112211,0.044597,175.976939,22051.857079,57621.020319,17401.991847,15
3123,3123,615.533707,-66.799727,528.216970,-73.654160,12758.593344,-3734.255616,-6965.012570,-3558.151108,1.025890,...,71.243150,0.008436,0.187248,0.046994,0.069216,63.795352,2582.777254,6398.528531,5075.586383,26


In [7]:
# 푸리에 변환

def ft_trans(name,train,test):
    def train_test(check,num_col):

        
        if check =='train':
            df_checking=train.copy()
            train_datas = np.zeros((len(df_checking.id.unique()),304))
            
        elif check =='test':
            df_checking=test.copy()
            train_datas = np.zeros((len(df_checking.id.unique()),304))
                       

        for i,num in enumerate(tqdm(df_checking.id.unique())):

            tt = df_checking.loc[df_checking.id==num][name] -df_checking.loc[df_checking.id==num][name].mean()
            fmax = 50      # sampling frequency 1000 Hz
            dt = 1/fmax      # sampling period
            N  = 600      # length of signal

            t  = np.arange(0,N)*dt   # time = [0, dt, ..., (N-1)*dt]
            x = tt.values
            df = fmax/N   # df = 1/N = fmax/N
            f = np.arange(0,N)*df     #   frq = [0, df, ..., (N-1)*df]
            xf = np.fft.fft(x)*dt
            tq_index=f[0:int(N/2+1)]
            tq_abs= np.abs(xf[0:int(N/2+1)])

            results = pd.DataFrame(tq_abs,tq_index).reset_index().rename(columns={'index':'hz',0:'abs_value'})
            
            ar0 = np.array([num])
            ar1 =results.abs_value.values
            ar2 = np.array([skew(results.abs_value),kurtosis(results.abs_value, fisher=True)])
            return_value = np.concatenate([ar0,ar1 ,ar2])    
            train_datas[i] = return_value

        return train_datas

    
    col_ft = ['_'+str(x) for x in range(304)]
    
    num_col = len(col_ft)
    train_datas = train_test('train',num_col)
    test_datas = train_test('test',num_col)
    
    col_ft_F = ['id']+[name+"_"+x for x in col_ft[1:]]        
    train_df = pd.DataFrame(train_datas,columns= col_ft_F)
    test_df = pd.DataFrame(test_datas,columns= col_ft_F)
    
    train_df.id = train_df.id.astype('int')
    test_df.id = test_df.id.astype('int')
    
    
    return train_df ,test_df

# https://dacon.io/competitions/official/235689/codeshare/2374?page=1&dtype=recent&ptype=pub

In [8]:
train_fft,test_fft = ft_trans('acc_t',train,test)
train_fft.shape, test_fft.shape

100%|█████████████████████████████████████████████████████████████████████████████| 3125/3125 [00:12<00:00, 242.11it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 782/782 [00:01<00:00, 477.45it/s]


((3125, 304), (782, 304))

In [12]:
from sklearn.preprocessing import RobustScaler
transformer_pivot = RobustScaler().fit(train_data.drop(['id', 'label'], axis=1))
pivot_train = transformer_pivot.transform(train_data.drop(['id', 'label'], axis=1))
pivot_test = transformer_pivot.transform(X_pivot_test.drop('id', axis=1))

transformer_fft = RobustScaler().fit(train_fft)
train_fft = transformer_fft.transform(train_fft)
test_fft = transformer_fft.transform(test_fft)

# 2번째 모델 : XGBoost

In [13]:
# 모델 2번: xgboost

def build_xgboost(split_num, train, target, test, rnd):
    
    params = {
                'colsample_bytree': 0.7,
                'subsample': 0.8,
                'eta': 0.04,
                'max_depth': 12,
                'eval_metric':'mlogloss',
                'objective':'multi:softprob',
                'num_class':61,
                }
    
    # return train pred prob and test pred prob 
    train_pred, test_pred = np.zeros((train.shape[0], 61)), np.zeros((test.shape[0], 61))
    
    skf = StratifiedKFold(n_splits=split_num, shuffle=True, random_state=233*rnd)
    for train_idx, val_idx in skf.split(train, target):

        # split train, validation set
        X = train[train_idx]
        y = target[train_idx]
        valid_x = train[val_idx]
        valid_y = target[val_idx]

        d_train = xgb.DMatrix(X, y)
        d_valid = xgb.DMatrix(valid_x, valid_y)
        d_temp = xgb.DMatrix(valid_x)
        d_test = xgb.DMatrix(test)
        
        watchlist = [(d_train, 'train'), (d_valid, 'valid')]
        
        #run traning
        model = xgb.train(params, d_train, 2000, watchlist, 
                        early_stopping_rounds=50,
                        verbose_eval=100)

        # save feat
        train_pred[val_idx] = model.predict(d_temp)
        test_pred += model.predict(d_test)/split_num
        
        # release
        del model
        gc.collect()
        print('  ==============================================================  ')
        
    return train_pred, test_pred

xgb_train1, xgb_test1 = build_xgboost(5, np.array(pivot_train), np.array(train_data.label), np.array(pivot_test), 1)
xgb_train2, xgb_test2 = build_xgboost(5, np.array(pivot_train), np.array(train_data.label), np.array(pivot_test), 1)

[0]	train-mlogloss:3.56176	valid-mlogloss:3.58476
[100]	train-mlogloss:0.21011	valid-mlogloss:0.98174
[200]	train-mlogloss:0.05514	valid-mlogloss:0.87153
[300]	train-mlogloss:0.03116	valid-mlogloss:0.85338
[400]	train-mlogloss:0.02372	valid-mlogloss:0.84793
[500]	train-mlogloss:0.02040	valid-mlogloss:0.84682
[600]	train-mlogloss:0.01861	valid-mlogloss:0.84501
[662]	train-mlogloss:0.01786	valid-mlogloss:0.84505
------------------
[0]	train-mlogloss:3.55079	valid-mlogloss:3.59125
[100]	train-mlogloss:0.20451	valid-mlogloss:1.06810
[200]	train-mlogloss:0.05383	valid-mlogloss:0.96120
[300]	train-mlogloss:0.03075	valid-mlogloss:0.94174
[400]	train-mlogloss:0.02356	valid-mlogloss:0.93376
[500]	train-mlogloss:0.02031	valid-mlogloss:0.93082
[573]	train-mlogloss:0.01892	valid-mlogloss:0.93143
------------------
[0]	train-mlogloss:3.55553	valid-mlogloss:3.58545
[100]	train-mlogloss:0.20572	valid-mlogloss:0.94304
[200]	train-mlogloss:0.05450	valid-mlogloss:0.83961
[300]	train-mlogloss:0.03105	val

## 3번째 모델 : CATBoost

In [55]:
# 모델 3번: catboost

def build_catboost(split_num, train, target, test, rnd):
    # return train pred prob and test pred prob
    train_pred, test_pred = np.zeros((train.shape[0], 61)), np.zeros((test.shape[0], 61))

    skf = StratifiedKFold(n_splits=split_num, shuffle=True, random_state=233*rnd)
    for train_idx, val_idx in skf.split(train, target):

        # split train, validation set
        X = train[train_idx]
        y = target.iloc[train_idx]
        valid_x = train[val_idx]
        valid_y = target.iloc[val_idx]

        model = cb.CatBoostClassifier(iterations=10000,
                                      learning_rate=0.01,
                                      l2_leaf_reg=3.5,
                                      depth=6,
                                      loss_function= 'MultiClass',
                                      eval_metric='MultiClass',
                                      use_best_model=True,
                                      random_seed=42,
                                      verbose=500,
                                      task_type="GPU")

        model.fit(X, y,
                  eval_set=(valid_x, valid_y),
                  early_stopping_rounds=50)
        
        # save feat
        train_pred[val_idx] = model.predict(valid_x)
        test_pred += model.predict_proba(test)/split_num
        
        # release
        del model
        gc.collect()
        print('------------------')
        
    return train_pred, test_pred

catboost_train1, catboost_test1 = build_catboost(5, pivot_train, train_data.label, pivot_test, 1)
catboost_train2, catboost_test2 = build_catboost(5, pivot_train, train_data.label, pivot_test, 2)

0:	learn: 3.9378090	test: 3.9375637	best: 3.9375637 (0)	total: 28.3ms	remaining: 4m 42s
500:	learn: 1.0395626	test: 1.2650794	best: 1.2650794 (500)	total: 12.2s	remaining: 3m 51s
1000:	learn: 0.6696813	test: 1.0079386	best: 1.0079386 (1000)	total: 24.7s	remaining: 3m 42s
1500:	learn: 0.4865061	test: 0.8980774	best: 0.8980774 (1500)	total: 37.1s	remaining: 3m 29s
2000:	learn: 0.3758241	test: 0.8372569	best: 0.8372569 (2000)	total: 49.3s	remaining: 3m 16s
2500:	learn: 0.2988821	test: 0.7996224	best: 0.7996224 (2500)	total: 1m 1s	remaining: 3m 4s
3000:	learn: 0.2424718	test: 0.7724925	best: 0.7724925 (3000)	total: 1m 13s	remaining: 2m 51s
3500:	learn: 0.2004491	test: 0.7543519	best: 0.7543519 (3500)	total: 1m 25s	remaining: 2m 38s
4000:	learn: 0.1686975	test: 0.7408762	best: 0.7408548 (3999)	total: 1m 37s	remaining: 2m 26s
4500:	learn: 0.1446332	test: 0.7313281	best: 0.7313132 (4499)	total: 1m 49s	remaining: 2m 14s
5000:	learn: 0.1257219	test: 0.7232240	best: 0.7232240 (5000)	total: 2m 1s

In [56]:
# sample_submssion = pd.read_csv(path + 'sample_submission.csv')
# sample_submssion.iloc[:,1:] = catboost_test1
# sample_submssion.to_csv("catboost.csv", index = False)

## 3번째 모델 : LightGBM

## Ensemble

In [41]:
xgb1_onehot = np.argmax(xgb_train1, axis=1).reshape(-1,1)
xgb2_onehot = np.argmax(xgb_train2, axis=1).reshape(-1,1)
catboost1_onehot = np.argmax(catboost_train1, axis=1).reshape(-1,1)
catboost2_onehot = np.argmax(catboost_train2, axis=1).reshape(-1,1)
# lgb1_onehot = np.argmax(lgb_train1, axis=1).reshape(-1,1)
# lgb2_onehot = np.argmax(lgb_train2, axis=1).reshape(-1,1)

xgb1_onehot_test = np.argmax(xgb_test1, axis=1).reshape(-1,1)
xgb2_onehot_test = np.argmax(xgb_test2, axis=1).reshape(-1,1)
catboost1_onehot_test = np.argmax(catboost_test1, axis=1).reshape(-1,1)
catboost2_onehot_test = np.argmax(catboost_test2, axis=1).reshape(-1,1)
# lgb1_onehot_test = np.argmax(lgb_test1, axis=1).reshape(-1,1)
# lgb2_onehot_test = np.argmax(lgb_test2, axis=1).reshape(-1,1)

train_final = np.hstack([xgb1_onehot, xgb2_onehot,
                         catboost_train1])
#                          lgb1_onehot, lgb2_onehot])

test_final = np.hstack([xgb1_onehot_test, xgb2_onehot_test,
                        catboost_test1])
#                         lgb1_onehot_test, lgb2_onehot_test])

print(train_final.shape)
print(test_final.shape)

# https://m.blog.naver.com/PostView.nhn?blogId=wideeyed&logNo=221343373342&proxyReferer=https:%2F%2Fwww.google.com%2F

(3125, 63)
(782, 63)


In [43]:
# 최종 앙상블

def ensemble_xgb(split_num, train, target, test):

    test_pred = np.zeros((test.shape[0], 61))
    
    params = {
                'colsample_bytree': 0.7,
                'subsample': 0.8,
                'eta': 0.04,
                'max_depth': 12,
                'eval_metric':'mlogloss',
                'objective':'multi:softprob',
                'num_class':61,
                }
    
    skf = StratifiedKFold(n_splits=split_num, shuffle=True, random_state=2021)
    for train_idx, val_idx in skf.split(train, target):

        # split train, validation set
        X = train[train_idx]
        y = target[train_idx]
        valid_x = train[val_idx]
        valid_y = target[val_idx]
        
        d_train = xgb.DMatrix(X, y)
        d_valid = xgb.DMatrix(valid_x, valid_y)
        d_test = xgb.DMatrix(test)
        
        watchlist = [(d_train, 'train'), (d_valid, 'valid')]
        
        #run traning
        model = xgb.train(params, d_train, 2000, watchlist, 
                        early_stopping_rounds=50,
                        verbose_eval=100)

        # save feat
        test_pred += model.predict(d_test)/split_num
        
        # release
        del model
        gc.collect()
        print('------------------')
    
    sample_submssion = pd.read_csv(path + 'sample_submission.csv')
    sample_submssion.iloc[:,1:] = test_pred
    sample_submssion.to_csv("ensemble.csv", index = False)
        
ensemble_xgb(5, train_final, np.array(train_data.label), test_final)

[0]	train-mlogloss:3.53030	valid-mlogloss:3.53577
[100]	train-mlogloss:0.87806	valid-mlogloss:1.12807
[187]	train-mlogloss:0.79313	valid-mlogloss:1.12446
------------------
[0]	train-mlogloss:3.53543	valid-mlogloss:3.53955
[100]	train-mlogloss:0.89676	valid-mlogloss:1.08027
[200]	train-mlogloss:0.80460	valid-mlogloss:1.07546
[202]	train-mlogloss:0.80377	valid-mlogloss:1.07581
------------------
[0]	train-mlogloss:3.52972	valid-mlogloss:3.54448
[100]	train-mlogloss:0.87049	valid-mlogloss:1.17482
[165]	train-mlogloss:0.79719	valid-mlogloss:1.18287
------------------
[0]	train-mlogloss:3.53118	valid-mlogloss:3.53613
[100]	train-mlogloss:0.89716	valid-mlogloss:1.08300
[200]	train-mlogloss:0.80831	valid-mlogloss:1.06700
[218]	train-mlogloss:0.80202	valid-mlogloss:1.06742
------------------
[0]	train-mlogloss:3.53902	valid-mlogloss:3.54060
[100]	train-mlogloss:0.91778	valid-mlogloss:1.00202
[199]	train-mlogloss:0.82698	valid-mlogloss:0.99122
------------------
