In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import random
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import log_loss
import lightgbm as lgb
from datetime import datetime

/kaggle/input/15th-data60sec/var_list.csv
/kaggle/input/15th-data60sec/train_label.csv
/kaggle/input/15th-data60sec/train_data.pickle/train_data.pickle
/kaggle/input/15th-data60sec/test_data.pickle/test_data.pickle


In [2]:
train = pd.read_pickle('/kaggle/input/15th-data60sec/train_data.pickle/train_data.pickle')
test = pd.read_pickle('/kaggle/input/15th-data60sec/test_data.pickle/test_data.pickle')
train_label = train['label']

var_list = list(pd.read_csv('/kaggle/input/15th-data60sec/var_list.csv')['var'])

In [3]:
var_list = list(reversed(sorted(list(set(var_list) - set(train.columns[train.isna().any()].tolist() + test.columns[test.isna().any()].tolist())))))

train = train.loc[(train['time'] >= 10),var_list].reset_index(drop=True)
test = test.loc[(test['time'] >= 10),list(set(var_list) - set(['label']))].reset_index(drop=True)

In [4]:
print(train[var_list].shape)
print(test[list(set(var_list) - set(['label']))].shape)

(41400, 2545)
(36000, 2544)


In [5]:
%%time

train = train.groupby('id').rolling(window = 5).mean().drop(columns = ['id']).reset_index().drop(columns = ['level_1']).dropna().reset_index(drop=True)
train_label = train['label']
train_id = train['id']

test = test.groupby('id').rolling(window = 5).mean().drop(columns = ['id']).reset_index().drop(columns = ['level_1']).dropna().reset_index(drop=True)
test_id = test['id']

var_model = list(set(train.columns) & set(test.columns) - set(['id']))

CPU times: user 1min 30s, sys: 2.27 s, total: 1min 32s
Wall time: 1min 32s


In [6]:
def tr_vl_split(train_df, num, seed):    
    '''
    train / validation split 함수
    train 에 모든 label이 최소 한번은 등장 & train과 validation의 id는 겹치지 않도록 split함.
    
    train_df : train 데이터
    num : label 당 몇개의 id를 뽑을 것이냐.
    seed = random seed
    
    '''
    
    np.random.seed(seed)
    
    valid_id = []
    vc = train[['id','label']].drop_duplicates()['label'].value_counts()
    temp = list(vc[vc > num].index)
    for a in temp:
        id_list = list(train_df[train_df['label'] == a]['id'])
        valid_id += random.sample(id_list,num)
    
    train_id = list(set(train_df['id']) - set(valid_id))
    
    x_tr_ = train[train['id'].isin(train_id)]
    y_tr_ = train_label[train['id'].isin(train_id)]

    x_vl_ = train[~train['id'].isin(train_id)]
    y_vl_ = train_label[~train['id'].isin(train_id)]
    
    return x_tr_, y_tr_, x_vl_, y_vl_ 

In [7]:
x_tr, y_tr, x_vl, y_vl = tr_vl_split(train, 3, seed = 1995)

print('train shape :',x_tr.shape)
print('validation shape :',x_vl.shape)
print('test shape :', test.shape)

train shape : (29624, 2545)
validation shape : (8464, 2545)
test shape : (33120, 2544)


In [8]:
lgb_tr = lgb.Dataset(x_tr[var_model], label=y_tr)
lgb_vl = lgb.Dataset(x_vl[var_model], label=y_vl)

watchlist_1 = [lgb_tr, lgb_vl]
watchlist_2 = [lgb_vl, lgb_tr]

params = {
    "objective": "multiclass",
    "boosting": "gbdt",
    "num_leaves": 40,
    "learning_rate": 0.05,
    "feature_fraction": 0.85,
    "reg_lambda": 2,
    "metric": "multiclass",
    "num_class" : 198
}

lgb_model = lgb.train(params, train_set=lgb_tr, num_boost_round=1000, valid_sets=watchlist_1, verbose_eval=100, early_stopping_rounds=100)

prediction = pd.DataFrame(lgb_model.predict(test[var_model]))

Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.0246187	valid_1's multi_logloss: 0.781002
[200]	training's multi_logloss: 0.0028939	valid_1's multi_logloss: 0.729679
[300]	training's multi_logloss: 0.00128082	valid_1's multi_logloss: 0.728472
Early stopping, best iteration is:
[257]	training's multi_logloss: 0.00169826	valid_1's multi_logloss: 0.727662


In [9]:
time_now = datetime.now()
submission_name = str(time_now)[:16] + '_submission.csv'
sub = pd.concat([pd.DataFrame(test_id),prediction],axis=1).groupby('id').mean().reset_index()
sub.to_csv(submission_name,index=False)