In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import warnings
warnings.filterwarnings('ignore')
import random
import os
# Walk the Kaggle input mount and echo the full path of every file found there.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import log_loss
import lightgbm as lgb
from datetime import datetime, timedelta

In [None]:
# Candidate variable names: only the 'var' column of the CSV is used.
var_list = list(pd.read_csv('/kaggle/input/15th-data60sec/var_list.csv')['var'])

# Pickled train / test frames.
# NOTE(review): read_pickle can execute arbitrary code while loading, so these
# paths must only ever point at trusted files.
train = pd.read_pickle('/kaggle/input/15th-data60sec/train_data.pickle/train_data.pickle')
test = pd.read_pickle('/kaggle/input/15th-data60sec/test_data.pickle/test_data.pickle')

# Target column of the training frame as loaded.
train_label = train['label']

In [None]:
# Remove every variable that has at least one missing value in train or in test,
# then keep the survivors in descending name order.
var_list = sorted(
    set(var_list)
    - set(train.columns[train.isna().any()].tolist() + test.columns[test.isna().any()].tolist()),
    reverse=True,
)

# Keep only rows with time >= 10; test is selected without the 'label' entry of var_list.
train = train.loc[train['time'] >= 10, var_list].reset_index(drop=True)
test = test.loc[test['time'] >= 10, list(set(var_list) - {'label'})].reset_index(drop=True)

In [None]:
# Stack train on top of test (row-wise) so feature statistics are computed over both.
# NOTE: the name `all` shadows Python's builtin all() until it is deleted later on.
all = pd.concat([train, test])

# Number of distinct values per column of the stacked frame.
all_value_cnt = all.nunique()

In [None]:
# Bookkeeping columns that are never treated as model features.
etc_var = ['label', 'id', 'time']

# Distinct-value counts of the feature columns only. Dropping the bookkeeping
# columns first guarantees they can never be classified as categorical/binary
# as well (which would list them twice further down).
feature_value_cnt = all_value_cnt.drop(labels=etc_var, errors='ignore')

# 3..10 distinct values -> categorical, exactly 2 -> binary.
cat_var = list(feature_value_cnt[(feature_value_cnt < 11) & (feature_value_cnt > 2)].index)
bin_var = list(feature_value_cnt[feature_value_cnt == 2].index)

# Everything else is numeric. Iterate var_list (a list) instead of a set so the
# resulting order is identical on every kernel restart; set iteration order of
# strings varies with Python's hash randomization and would otherwise change
# the downstream column layout from run to run.
non_numeric = set(cat_var) | set(bin_var) | set(etc_var)
num_var = [v for v in var_list if v not in non_numeric]

In [None]:
# Report how many variables fell into each bucket.
for bucket, members in (
    ('binary', bin_var),
    ('categorical', cat_var),
    ('numeric', num_var),
    ('etc', etc_var),
):
    print(f'# of {bucket} feature :', len(members))

In [None]:
# For every numeric variable whose name contains 'V', strip the 'V' characters
# and parse what is left as an integer.
col_index = pd.Series([int(name.replace('V', '')) for name in num_var if 'V' in name])

# Histogram of those integer suffixes.
col_index.hist()

# Scratch checks (kept commented out): number of variables per block of 100.
# sum(col_index < 100)
# sum((col_index < 200) & (col_index >= 100))
# sum((col_index < 300) & (col_index >= 200))
# sum((col_index < 400) & (col_index >= 300))
# sum((col_index < 500) & (col_index >= 400))

In [None]:
# One row per numeric variable: its name ('var') and its mean over train + test ('mean').
mean_of_num_var = (
    all[num_var]
    .mean()
    .reset_index()
    .rename(columns={'index': 'var', 0: 'mean'})
)

# Cluster the variables into 50 groups by the magnitude of their mean
# (1-D k-means on a single column), then store each variable's group id.
kmeans = MiniBatchKMeans(n_clusters=50, batch_size=10000, random_state=1995)
kmeans = kmeans.fit(mean_of_num_var['mean'].values.reshape(-1, 1))
mean_of_num_var['group'] = kmeans.predict(mean_of_num_var['mean'].values.reshape(-1, 1))
mean_of_num_var.head()

In [None]:
# Average of the per-variable means inside each k-means group.
temp = mean_of_num_var.groupby('group')['mean'].mean()

# Variables belonging to a group whose average mean exceeds 10000.
target_var = mean_of_num_var.loc[
    mean_of_num_var['group'].isin(list(temp.index[temp > 10000])),
    'var',
].tolist()

In [None]:
# Integer-encode every binary / categorical variable (codes follow order of first
# appearance). The frame is built in a single constructor call: inserting the
# columns one at a time fragments the DataFrame and gets slower with each column.
cat_lbl = pd.DataFrame({a: pd.factorize(all[a])[0] for a in (bin_var + cat_var)})

# Map each redundant column to the earliest column carrying identical codes.
dup_cols = {}
for i, c1 in enumerate(tqdm(cat_lbl.columns)):
    # A column already marked as a duplicate cannot reveal new duplicates:
    # anything equal to it was matched against its original earlier on.
    if c1 in dup_cols:
        continue
    for c2 in cat_lbl.columns[i+1:]:
        if c2 not in dup_cols and np.all(cat_lbl[c1] == cat_lbl[c2]):
            dup_cols[c2] = c1

# Drop the duplicates by rebinding rather than mutating in place.
cat_lbl = cat_lbl.drop(columns=list(dup_cols))
cat_lbl.shape

In [None]:
# Re-derive the categorical / binary lists from the de-duplicated encoded frame.
num = cat_lbl.nunique()
cat_var = list(num[num > 2].index)
bin_var = list(num[num == 2].index)

# Lookup table: variable name -> type tag. Each tag is repeated len(<list>) times,
# so the two columns stay aligned even if etc_var is edited (the count for 'etc'
# used to be hard-coded as 3).
var_type_list = pd.DataFrame({
    'var': etc_var + cat_var + bin_var + num_var,
    'type': (['etc'] * len(etc_var)
             + ['cat'] * len(cat_var)
             + ['bin'] * len(bin_var)
             + ['num'] * len(num_var)),
})

# Categorical variables are the grouping keys for the aggregate features; show the first few.
grp_var = list(var_type_list.loc[var_type_list['type'] == 'cat', 'var'])
grp_var[:5]

In [None]:
# Keep only the typed variables, in var_type_list order.
all = all[list(var_type_list['var'])]

# For every categorical variable, attach the per-category mean of each target variable
# as new columns named '<cat>_grp_<target>_mean'.
for col in tqdm(grp_var):
    # Select the target columns BEFORE aggregating: averaging the whole frame and
    # selecting afterwards yields the same numbers but also averages every feature
    # added by earlier iterations, so the cost grows with each pass.
    temp = all.groupby(col)[target_var].mean().reset_index()
    temp.columns = [col] + [col+'_grp'+'_'+a+'_mean' for a in target_var]
    all = pd.merge(all, temp, how='left', on=col)
    del temp

In [None]:
# Rows with a label are training rows; rows whose label is missing came from test.
# 'time' is dropped from both, and the (all-missing) 'label' column from test.
train = all.loc[all['label'].notnull()].reset_index(drop=True).drop(columns=['time'])
test = all.loc[all['label'].isnull()].reset_index(drop=True).drop(columns=['time', 'label'])

# Release the stacked frame (this also un-shadows the builtin all()).
del all

In [None]:
%%time

def _rolling_mean_by_id(df, window=5):
    """Return the per-id rolling mean of every column of ``df``.

    Rows are grouped by 'id' and a rolling mean over ``window`` consecutive rows
    is taken inside each group. Rows containing any NaN are dropped, which
    removes the first ``window - 1`` rows of each id (their window is incomplete).
    The result has 'id' as a regular column and a fresh 0..n-1 index.
    """
    rolled = df.groupby('id').rolling(window=window).mean()
    # Some pandas versions return the grouping column among the values and others
    # do not; errors='ignore' makes the drop work in both cases instead of raising.
    rolled = rolled.drop(columns=['id'], errors='ignore')
    return (
        rolled
        .reset_index()                # index levels -> columns 'id' and 'level_1'
        .drop(columns=['level_1'])    # original row position is not needed
        .dropna()
        .reset_index(drop=True)
    )


train = _rolling_mean_by_id(train)
train_label = train['label']
train_id = train['id']

test = _rolling_mean_by_id(test)
test_id = test['id']

# Model features: columns present in both frames, excluding 'id'. Built by walking
# train.columns so the order is the same on every run (a set-based intersection
# has no stable order across kernel restarts).
test_columns = set(test.columns)
var_model = [c for c in train.columns if c in test_columns and c != 'id']

In [None]:
def tr_vl_split(train_df, num, seed):
    '''
    Train / validation split.

    The split is made by id, so train and validation never share an id. For every
    label that has more than `num` distinct ids, exactly `num` of those ids are
    moved to validation; the remaining ids (at least one per such label, and all
    ids of the other labels) stay in train, so every label still appears in train.

    train_df : training data; must contain the columns 'id' and 'label'
    num : number of ids to draw per label for validation
    seed : random seed; the same seed always yields the same split

    Returns x_tr_, y_tr_, x_vl_, y_vl_ where the x's are row subsets of train_df
    and the y's are the matching 'label' columns.
    '''

    # Kept for backward compatibility: this call has always seeded NumPy's global RNG.
    np.random.seed(seed)
    # The sampling itself uses the stdlib `random` API, which np.random.seed does
    # not affect. A dedicated seeded generator makes the split reproducible.
    rng = random.Random(seed)

    # One row per (id, label) pair, so ids are counted and sampled once each
    # instead of once per data row.
    id_label = train_df[['id', 'label']].drop_duplicates()
    vc = id_label['label'].value_counts()

    valid_id = []
    for a in vc[vc > num].index:
        # Sorted so the candidate order (and therefore the draw) is deterministic.
        id_list = sorted(id_label.loc[id_label['label'] == a, 'id'])
        valid_id += rng.sample(id_list, num)

    train_id = list(set(train_df['id']) - set(valid_id))

    # Split the frame that was passed in, not a same-named notebook global.
    in_train = train_df['id'].isin(train_id)

    x_tr_ = train_df[in_train]
    y_tr_ = train_df['label'][in_train]

    x_vl_ = train_df[~in_train]
    y_vl_ = train_df['label'][~in_train]

    return x_tr_, y_tr_, x_vl_, y_vl_

In [None]:
# Hold out 2 ids per label for validation.
x_tr, y_tr, x_vl, y_vl = tr_vl_split(train, num=2, seed=1995)

print(f'train shape : {x_tr.shape}')
print(f'validation shape : {x_vl.shape}')
print(f'test shape : {test.shape}')

In [None]:
%%time

# Multiclass gradient-boosted trees; 198 output classes, evaluated with the multiclass metric.
params = dict(
    objective="multiclass",
    boosting="gbdt",
    num_leaves=40,
    learning_rate=0.05,
    feature_fraction=0.85,
    reg_lambda=2,
    metric="multiclass",
    num_class=198,
)

# LightGBM datasets restricted to the shared model features.
lgb_tr = lgb.Dataset(x_tr[var_model], label=y_tr)
lgb_vl = lgb.Dataset(x_vl[var_model], label=y_vl)

# Evaluation orderings; only watchlist_1 is passed to the trainer below.
watchlist_1 = [lgb_tr, lgb_vl]
watchlist_2 = watchlist_1[::-1]

# Up to 1000 rounds, logging every 100 and stopping once 100 rounds pass without improvement.
lgb_model = lgb.train(
    params,
    train_set=lgb_tr,
    num_boost_round=1000,
    valid_sets=watchlist_1,
    verbose_eval=100,
    early_stopping_rounds=100,
)

# Class-probability matrix for the test rows, one column per class.
prediction = pd.DataFrame(lgb_model.predict(test[var_model]))

In [None]:
# Timestamp for the file name.
# NOTE(review): the +9h presumably converts a UTC kernel clock to KST -- confirm.
time_now = datetime.now() + timedelta(hours=9)
submission_name = f'{time_now:%Y-%m-%d %H:%M}_submission.csv'

# Attach the id to each predicted row, then average the class probabilities per id.
sub = (
    pd.concat([test_id.to_frame(), prediction], axis=1)
    .groupby('id')
    .mean()
    .reset_index()
)
sub.to_csv(submission_name, index=False)