## Dacon  15회 원자력발전소 상태 판단 모델링 경진대회
## 생물학적 수처리 
## 2020년 2월 16일

# 모델링 코드 작성방법

1) 입상자는 코드 제출 필수. 제출 코드는 예측 결과를 리더보드 점수로 복원할 수 있어야 함

2) 코드 제출 시 확장자는 R user의 경우 .R or .rmd, Python user의 경우 .py or .ipynb

3) 코드에 ‘/data’ 데이터 입/출력 경로 포함 제출 or R의 경우 setwd(" "), python의 경우 os.chdir을 활용하여 경로 통일

4) 전체 프로세스를 일목요연하게 정리하여 주석을 포함하여 하나의 파일로 제출

5) 모든 코드는 오류 없이 실행되어야 함(라이브러리 로딩 코드 포함되어야 함).

6) 코드와 주석의 인코딩은 모두 UTF-8을 사용하여야 함

## 1. 라이브러리 및 데이터
## Library & Data

In [1]:
import time
import pandas as pd
import zipfile ## zip file을 read하는 역할
import numpy as np
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
import pickle
import gc
import random
import joblib # 모델을 저장하고 불러오는 역할

## plot
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## 모델링에 사용한 라이브러리
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.metrics import log_loss
import lightgbm as lgb
import xgboost as xgb

import os



# Directory layout: raw data, training-side outputs, prediction-side outputs.
path_ = '../0_Data/'
soft_train, hard_train = '../1_Code_train/soft_code/', '../1_Code_train/hard_code/'
soft_pred, hard_pred = '../2_Code_pred/soft_code/', '../2_Code_pred/hard_code/'

## 2. 데이터 전처리 :  Data Cleansing & Pre-Processing

## 2-1 60초까지 데이터만 사용
- 60초 이후의 데이터는 삭제
- train데이터와 label을 붙여주기
- zip파일이 아닌 folder를 read

In [None]:
# Per-id labels; read by make_one_file when is_train is true.
train_label = pd.read_csv(path_ + 'train_label.csv')


def make_one_file(path_, is_train):
    """Concatenate every per-id CSV in a folder into one DataFrame.

    Parameters
    ----------
    path_ : str
        Folder that holds ``<id>.csv`` files; it is concatenated directly
        with the file name, so it must end with a path separator.
    is_train : bool
        When true, a ``label`` column is attached and each file is cut to
        index labels 0..59.

    Returns
    -------
    pandas.DataFrame
        All files stacked row-wise with a fresh 0..n-1 index and an ``id``
        column taken from the file name.
    """
    frames = []
    for filename in tqdm(os.listdir(path_)):
        # Anything that is not "<id>.csv" (checkpoint folders, OS metadata
        # files, ...) would make the int() conversion below fail.
        if not filename.lower().endswith('.csv'):
            continue

        file_id = int(filename.split('.')[0])
        df = pd.read_csv(path_ + filename, index_col=None, header=0)
        df['id'] = file_id

        if is_train:
            # Rows before index 10 get the temporary placeholder 999;
            # rows from index 10 on get the real label of this id.
            df['label'] = 999
            df.loc[10:, 'label'] = train_label.loc[train_label['id'] == file_id, 'label'].values[0]
            # Label-based slice: keeps index labels 0..59 inclusive.
            df = df.loc[:59]

        frames.append(df)

    data = pd.concat(frames, axis=0, ignore_index=True)
    return data


# Build both frames. The train line used to be commented out, which made the
# train.to_pickle() call below fail with NameError on a fresh run.
train = make_one_file('../0_Data/train/', True)
test = make_one_file('../0_Data/test/', False)

# Cache the merged frames so later sections can reload them.
train.to_pickle('train_df.pickle')
test.to_pickle('test_df.pickle')

# Drop the in-memory copies; they are reloaded from the pickles afterwards.
del train, test
gc.collect()



## 2-2 pickle file load

In [None]:
# Reload the merged train / test frames cached in the previous step.
train, test = (pd.read_pickle(name) for name in ('train_df.pickle', 'test_df.pickle'))

## 3. 탐색적 자료분석
## Exploratory Data Analysis


### train, test 변수별 분포 확인

In [None]:
def plot_dist_col(column):
    """Plot ``column`` against ``time`` for the global train and test frames.

    Both curves are drawn on one axis (train in green, test in purple) so
    their behaviour over time can be compared.

    Parameters
    ----------
    column : str
        Name of a column present in both ``train`` and ``test``.
    """
    fig, ax = plt.subplots(figsize=(10, 10))
    # Labels are attached to the lines themselves: a positional
    # plt.legend([...]) can assign a name to a confidence band instead of a
    # line, depending on the order matplotlib lists the artists.
    sns.lineplot(data=train, x='time', y=column, color='green', label='train', ax=ax)
    sns.lineplot(data=test, x='time', y=column, color='purple', label='test', ax=ax)
    ax.set_title(column, fontsize=16)
    plt.xlabel('time', fontsize=15)
    ax.legend()
    plt.show()

In [None]:
# Compare train vs. test over time for a few sample variables.
for column in ('V0000', 'V0001', 'V0012'):
    plot_dist_col(column)

### 일부 변수 correlation 확인

In [None]:
# Pairwise correlation of the first 22 columns of train.
correlations = train.iloc[:, 0:22].corr()

# Clustered heatmap of the correlation matrix.
heatmap_options = dict(cmap = plt.cm.RdYlBu_r, vmin = -0.25, annot = True, vmax = 0.6)
sns.clustermap(correlations, **heatmap_options)
plt.title('Correlation Heatmap');

## 4. 변수 선택  Feature Engineering 

## Soft 전처리

### 4-1-1 Text column

In [None]:
# Every column except the keys is forced to numeric; anything that cannot be
# parsed (text values) becomes NaN.
trim_col = list(set(test.columns) - set(['id', 'time']))
for col in tqdm(trim_col):
    # Convert the whole column in one call rather than value by value through
    # apply(); the per-element Python call is very slow on wide data.
    train[col] = pd.to_numeric(train[col], errors='coerce')
    test[col] = pd.to_numeric(test[col], errors='coerce')

### 4-1-2 High correlation column (값이 완전히 동일한 중복 column 제거)

In [None]:
# Columns that survive exact-duplicate removal, computed per frame by
# transposing so that duplicated columns become duplicated rows.
train_var = train.T.drop_duplicates().index.tolist()
test_var = test.T.drop_duplicates().index.tolist()

# Keep a column when it is non-duplicated in at least one of the frames.
var_list = list(set(train_var) | set(test_var))
train = train[var_list]

# test has no 'label'; afterwards 'label' is kept as the last entry.
feature_cols = [name for name in var_list if name != 'label']
test = test[feature_cols]
var_list = feature_cols + ['label']

### 4-1-3 Constant column


In [None]:
# Keep 'label' as the last entry so it can be skipped for test below.
var_list.append(var_list.pop(var_list.index('label')))

# Columns holding a single distinct value in train ...
constant_var = [name for name in tqdm(var_list) if train[name].nunique() == 1]
# ... or in test ('label' does not exist there, hence the [:-1]).
constant_var.extend(name for name in tqdm(var_list[:-1]) if test[name].nunique() == 1)

constant_var = list(np.unique(constant_var))

In [None]:
# Final column set: everything kept so far minus the constant columns.
kept_columns = set(var_list) - set(constant_var)
final_var = list(kept_columns)
train = train[final_var]

# test carries no 'label' column.
final_var = [name for name in final_var if name != 'label']
test = test[final_var]

In [None]:
# Rows tagged with the temporary label 999 take the label found 10 rows
# further down. This relies on train still having the default 0..n-1 row
# index, which is what the original .loc[x:x] lookups relied on as well.
#
# Done with one array operation instead of a per-row .loc loop. The old
# `start` flag is gone: it was reset to 0 immediately before being tested, so
# it never skipped anything, and re-running is harmless because no 999 is left
# once this has been applied.
label_values = train['label'].to_numpy().copy()
placeholder_rows = np.flatnonzero(label_values == 999)
# The right-hand side is read in full before anything is written, which
# matches the original top-to-bottom loop (row x+10 is always read before it
# could itself be rewritten).
label_values[placeholder_rows] = label_values[placeholder_rows + 10]
train['label'] = label_values
    
    

In [None]:
# Put the columns of both frames in alphabetical order.
train_col = sorted(train.columns)
train = train[train_col]

test_col = sorted(test.columns)
test = test[test_col]


# Soft variable list, saved for later use on test data.
Soft_var_list = pd.DataFrame({'var': test_col})
Soft_var_list.to_csv('../2_Code_pred/Soft_var_list.csv', index=False)


In [None]:
# Save the fully preprocessed frames as train_soft.pickle / test_soft.pickle.
for frame, file_name in ((train, 'train_soft.pickle'), (test, 'test_soft.pickle')):
    frame.to_pickle(soft_train + file_name)

# Release every reference to the frames before the next section.
del frame, train, test
gc.collect()

## Hard 전처리

In [None]:
# Start the "hard" preprocessing from the cached merged frames again.
train, test = (pd.read_pickle(name) for name in ('train_df.pickle', 'test_df.pickle'))

### 4-2-1 Text column

In [None]:
# Drop every column that has object dtype in train or in test.
train, test = (frame.select_dtypes(exclude = 'object') for frame in (train, test))
label = train['label']

# Keep only the columns that both frames still share, in sorted order.
var_list = sorted(set(train.columns).intersection(test.columns))
train, test = train[var_list], test[var_list]

### 4-2-2 High correlation column

In [None]:
# Columns whose mean equals that of an earlier column are treated as
# duplicates; a column is kept only if its mean is first-seen in both frames.
train_distinct_mean = set(list(train.mean().drop_duplicates().index))
test_distinct_mean = set(list(test.mean().drop_duplicates().index))
var_list = list(train_distinct_mean & test_distinct_mean)

### 4-2-3 Constant column

In [None]:
# Drop variables that hold one single value in every row of train or of test.
constant_var = [name for name in tqdm(var_list) if train[name].nunique() == 1]
constant_var += [name for name in tqdm(var_list) if test[name].nunique() == 1]

constant_var = list(np.unique(constant_var))

# 'label' first, then the remaining variables in descending name order.
var_list = ['label'] + sorted(set(var_list) - set(constant_var), reverse=True)

del train, test
gc.collect()

### 4-2-4 NA 있는 변수 제거, 10초 이상 데이터만 사용

In [None]:
train = pd.read_pickle('train_df.pickle')
test = pd.read_pickle('test_df.pickle')

train_label = train['label']

# Remove every variable that has a missing value in train or in test.
na_columns = set(train.columns[train.isna().any()].tolist()
                 + test.columns[test.isna().any()].tolist())
var_list = sorted(set(var_list) - na_columns, reverse=True)

# Only rows from time 10 onward are used for training.
train_rows = train['time'] >= 10
test_rows = test['time'] >= 10
test_columns = list(set(var_list) - set(['label']))
train = train.loc[train_rows, var_list].reset_index(drop=True)
test = test.loc[test_rows, test_columns].reset_index(drop=True)

### 4-2-5 categorical / binary 변수 label encoding하여 중복 제거

In [None]:
# Stack train and test. NOTE: `all` shadows the builtin until it is deleted
# further down; the following cells rely on this name.
all = pd.concat([train,test],axis=0).reset_index(drop=True)
all_value_cnt = all.nunique()

# Guess each variable's type from its number of distinct values:
# 3..10 -> categorical, exactly 2 -> binary, everything else -> numeric.
cat_var = sorted(all_value_cnt[all_value_cnt.between(3, 10)].index)
bin_var = sorted(all_value_cnt[all_value_cnt == 2].index)
etc_var = ['label','id','time']
num_var = sorted(set(var_list) - set(cat_var) - set(bin_var) - set(etc_var))

# Label-encode binary/categorical columns so that columns with the same
# pattern of first-seen values become directly comparable.
cat_lbl = pd.DataFrame({name: pd.factorize(all[name])[0] for name in bin_var + cat_var})

# Map each redundant column to the earlier column it duplicates.
dup_cols = {}
encoded_names = cat_lbl.columns
for i, c1 in enumerate(tqdm(encoded_names)):
    for c2 in encoded_names[i+1:]:
        if c2 in dup_cols:
            continue
        if np.all(cat_lbl[c1] == cat_lbl[c2]):
            dup_cols[c2] = c1

cat_lbl.drop(dup_cols.keys(), axis = 1, inplace = True)
cat_lbl.shape

In [None]:
# Re-derive the categorical / binary split from the de-duplicated encoding.
num = cat_lbl.nunique()
cat_var = sorted(num[num > 2].index)
bin_var = sorted(num[num == 2].index)

# One row per variable with its assumed type.
typed_groups = [('etc', etc_var), ('cat', cat_var), ('bin', bin_var), ('num', num_var)]
var_type_list = pd.DataFrame({
    'var' : [name for _, group in typed_groups for name in group],
    'type' : [kind for kind, group in typed_groups for _ in group]})

all = all[etc_var + num_var + cat_var + bin_var]

In [None]:
# Save the variable/type table next to the prediction code.
var_type_list.to_csv(path_or_buf='../2_Code_pred/var_type_list.csv', index=False)

### 4-2-6 Feature Engineering

In [None]:
# categorical 변수에 대해 frequency encoding
for col in tqdm(cat_var):
    temp = all.groupby(col)['id'].count().to_dict()
    all[col+'_freq'] = all[col].map(temp)
    del temp

In [None]:
# Top-25 variables by LightGBM feature importance, taken from an initial model
# fitted before any feature engineering.
import_var = [
    'V3616', 'V0081', 'V3324', 'V3615', 'V2855',
    'V2859', 'V1821', 'V1818', 'V3098', 'V3461',
    'V4505', 'V1820', 'V2861', 'V2860', 'V1819',
    'V3432', 'V2586', 'V2854', 'V4743', 'V4824',
    'V3103', 'V2853', 'V4495', 'V2076', 'V4525',
]

In [None]:
# Pairwise interaction features (product, ratio, sum, difference) between the
# five most important variables.
top_vars = import_var[:5]
for i, left in enumerate(top_vars):
    for right in top_vars[i+1:]:
        all[left+'*'+right] = all[left] * all[right]
        all[left+'/'+right] = all[left] / all[right]
        all[left+'+'+right] = all[left] + all[right]
        all[left+'-'+right] = all[left] - all[right]

In [None]:
# Round the numeric variables to two decimals, pair each rounded value with
# the time stamp, and frequency-encode that (time, value) pair.

num_var_round = all[num_var].round(2)
freq_target = sorted(num_var_round.columns[num_var_round.nunique() > 10])
pd.DataFrame({'var' : freq_target}).to_csv('../2_Code_pred/freq_target.csv',index=False)

num_var_round = num_var_round[freq_target]
idstr= all['time'].astype(str)

for a in tqdm(freq_target):
    # Integer code per distinct rounded value, then "<time>,<code>" as the key.
    value_codes = pd.Series(pd.factorize(num_var_round[a])[0], index=num_var_round.index)
    time_value_key = idstr.str.cat(value_codes.astype(str),sep=',')
    # Replace every key with the number of rows sharing it.
    temp = time_value_key.value_counts().to_dict()
    num_var_round[a] = time_value_key.map(temp)

round_freq_var = [a+'_round_time_freq' for a in num_var_round]
num_var_round.columns = round_freq_var

del train,test
all = pd.concat([all,num_var_round],axis=1)

In [None]:
# Rows without a label came from test; everything else is train.
is_test_row = all['label'].isnull()
train = all[~is_test_row].reset_index(drop=True).drop(columns = ['time'])
test = all[is_test_row].reset_index(drop=True).drop(columns = ['time','label'])

# Removes the combined frame (and un-shadows the builtin `all`).
del all

# Discard rows that still contain a missing value.
train, test = train.dropna(), test.dropna()

### 4-2-7 모든 변수 rolling mean 처리

In [None]:
%%time
# Rolling mean over every variable, computed separately inside each id.


def _rolling_mean_by_id(frame, window=5):
    """Return the per-id rolling mean of every column of ``frame``.

    Rows inside the first ``window - 1`` positions of each id (and any other
    row containing NaN) are dropped; the result has ``id`` as a regular column
    and a fresh 0..n-1 index.
    """
    rolled = frame.groupby('id').rolling(window = window).mean()
    # Older pandas returns the group key as a value column too, newer releases
    # leave it out; errors='ignore' makes the drop work on both.
    rolled = rolled.drop(columns = ['id'], errors = 'ignore')
    # reset_index() yields 'id' plus 'level_1' (the unnamed original row index).
    rolled = rolled.reset_index().drop(columns = ['level_1'])
    return rolled.dropna().reset_index(drop=True)


train = _rolling_mean_by_id(train)
train_label = train['label']
train_id = train['id']

test = _rolling_mean_by_id(test)
test_id = test['id']


In [None]:
# Save the fully preprocessed frames as train_hard.pickle / test_hard.pickle.

for frame, file_name in ((train, 'train_hard.pickle'), (test, 'test_hard.pickle')):
    frame.to_pickle(hard_train + file_name)

# Release every reference to the frames before modelling.
del frame, train, test
gc.collect()

## 5. 모델 학습, 검증, 저장 : Model Tuning & Evaluation

In [None]:
def lgbm_train(x_tr, y_tr, x_vl, y_vl, SEED):
    """Fit a 198-class LightGBM model with early stopping.

    Parameters
    ----------
    x_tr, y_tr : training features and labels.
    x_vl, y_vl : validation features and labels.
    SEED : value passed to LightGBM as ``seed``.

    Returns
    -------
    The booster returned by ``lgb.train``.
    """
    params = dict(
        objective="multiclass",
        boosting="gbdt",
        num_leaves=40,
        learning_rate=0.05,
        feature_fraction=0.85,
        reg_lambda=2,
        seed=SEED,
        metric="multiclass",
        num_class=198,
    )
    train_set = lgb.Dataset(x_tr, label=y_tr)
    valid_set = lgb.Dataset(x_vl, label=y_vl)

    # Both sets are evaluated during training. For a quick smoke test, lower
    # num_boost_round to 1.
    return lgb.train(params, train_set=train_set, num_boost_round=1000,
                     valid_sets=[train_set, valid_set], verbose_eval=100,
                     early_stopping_rounds=100)

In [None]:
def lgbm_predict(test, model, sub_name, test_id):
    """Predict class probabilities, average them per id and write a CSV.

    Parameters
    ----------
    test : feature frame to predict on.
    model : fitted model exposing ``predict`` (and ``best_iteration``).
    sub_name : path of the CSV file to write.
    test_id : row-aligned ids. When its first element equals 828 the
        "hard" layout is assumed: ``test`` holds features only and the ids
        come from this argument. Otherwise (callers in this file pass
        ``[0]``) ``test`` must contain ``time`` and ``id`` columns and only
        rows with ``time > 15`` are scored.
    """
    if test_id[0] == 828:
        probabilities = pd.DataFrame(model.predict(test))
        with_ids = pd.concat([pd.DataFrame(test_id), probabilities], axis=1)
        with_ids.groupby('id').mean().reset_index().to_csv(sub_name, index=False)
        return

    late_rows = test[test['time'] > 15]
    row_ids = late_rows['id']
    features = late_rows.drop(columns=['id'])
    pred = model.predict(features, num_iteration=model.best_iteration)

    submission = pd.DataFrame(data=pred)
    submission.index = row_ids
    submission.index.name = 'id'
    per_id = submission.sort_index().groupby('id').mean()

    # Write the submission file; the id index becomes the first column.
    per_id.to_csv(sub_name, index=True)
        

In [None]:
def xgb_train(x_tr, y_tr, x_vl, y_vl, SEED):
    """Fit a 198-class XGBoost model with early stopping.

    Parameters
    ----------
    x_tr, y_tr : training features and labels.
    x_vl, y_vl : validation features and labels.
    SEED : value passed to XGBoost as ``seed``.

    Returns
    -------
    The booster returned by ``xgb.train``.
    """
    params = dict([
        ("objective", "multi:softprob"),
        ("eta", 0.0123),
        ("max_depth", 3),
        ("eval_metric", 'mlogloss'),
        ("num_class", 198),
        ("seed", SEED),
        ('tree_method', 'gpu_hist'),  # comment this entry out when no GPU is available
        ('colsample_bytree', 0.85),
        ('lambda', 3),
        ('alpha', 4),
    ])

    train_matrix = xgb.DMatrix(x_tr, label=y_tr)
    valid_matrix = xgb.DMatrix(x_vl, label=y_vl)
    eval_sets = [(train_matrix, 'train'), (valid_matrix, 'valid')]

    # For a quick smoke test, use 1 boosting round and early_stopping_rounds=1.
    return xgb.train(params, train_matrix, 1500, eval_sets,
                     early_stopping_rounds=50, verbose_eval=50)

In [None]:
def xgb_predict(test, model, sub_name, test_id):
    """Predict class probabilities with XGBoost, average per id, write a CSV.

    Parameters
    ----------
    test : feature frame to predict on.
    model : fitted XGBoost booster.
    sub_name : path of the CSV file to write.
    test_id : row-aligned ids. When its first element equals 828 the
        "hard" layout is assumed: ``test`` holds features only and the ids
        come from this argument. Otherwise (callers in this file pass
        ``[0]``) ``test`` must contain ``time`` and ``id`` columns and only
        rows with ``time > 15`` are scored.
    """
    if test_id[0] == 828:
        probabilities = pd.DataFrame(model.predict(xgb.DMatrix(test)))
        with_ids = pd.concat([pd.DataFrame(test_id), probabilities], axis=1)
        with_ids.groupby('id').mean().reset_index().to_csv(sub_name, index=False)
        return

    late_rows = test[test['time'] > 15]
    row_ids = late_rows['id']
    features = late_rows.drop(columns=['id'])
    pred = model.predict(xgb.DMatrix(features))

    submission = pd.DataFrame(data=pred)
    submission.index = row_ids
    submission.index.name = 'id'
    per_id = submission.sort_index().groupby('id').mean()

    # Write the submission file; the id index becomes the first column.
    per_id.to_csv(sub_name, index=True)


In [None]:
def simple_stacking(dir_):
    """Average every prediction CSV in ``dir_`` whose name contains 'half'.

    All matching files are expected to share the same columns (``id`` plus
    one column per class) and the same row order. The frame size is taken
    from the files themselves rather than being fixed to 720 rows by 198
    classes.

    Parameters
    ----------
    dir_ : str
        Directory holding the prediction files.

    Returns
    -------
    pandas.DataFrame
        Element-wise mean of the matching files, with ``id`` as integers.

    Raises
    ------
    ValueError
        If no file name in ``dir_`` contains 'half'.
    """
    total = None
    num_subs = 0

    for subs in os.listdir(dir_):
        if 'half' not in subs:
            print(subs,'는 대상 파일이 아닙니다.')
            continue

        temp = pd.read_csv(os.path.join(dir_, subs))
        total = temp if total is None else total + temp
        num_subs += 1

    if total is None:
        raise ValueError("no prediction file containing 'half' found in " + str(dir_))

    final_subs = total / num_subs

    # The id column went through sum/divide like every other column; round
    # before casting so float noise cannot truncate it to the id below.
    final_subs['id'] = final_subs['id'].round().astype(int)

    return final_subs

In [None]:
def soft_half1_tr_vl_split(train_df, num, seed):
    """Split ``train_df`` into train / validation parts by id.

    For every label that has more than ``num`` distinct ids, ``num`` of those
    ids are drawn at random for validation, so each label still appears in
    the training part and no id is shared between the two parts.

    Parameters
    ----------
    train_df : DataFrame with at least ``id`` and ``label`` columns.
    num : number of ids drawn per label for validation.
    seed : random seed.

    Returns
    -------
    x_tr_, y_tr_, x_vl_, y_vl_
        ``x_tr_`` has ``label`` and ``id`` removed. ``x_vl_`` has only
        ``label`` removed: the caller reads the validation ids from it.
    """
    # The draw below uses the stdlib `random` module, so that is the generator
    # that has to be seeded; numpy is seeded too, as before.
    np.random.seed(seed)
    random.seed(seed)

    # One row per (id, label) pair, so an id cannot be drawn twice.
    id_label = train_df[['id', 'label']].drop_duplicates()
    vc = id_label['label'].value_counts()

    valid_id = []
    for a in vc[vc > num].index:
        id_list = id_label.loc[id_label['label'] == a, 'id'].tolist()
        valid_id += random.sample(id_list, num)

    in_valid = train_df['id'].isin(valid_id)

    x_tr_ = train_df[~in_valid]
    y_tr_ = x_tr_['label']
    x_tr_ = x_tr_.drop(columns=['label', 'id'])

    x_vl_ = train_df[in_valid]
    y_vl_ = x_vl_['label']
    x_vl_ = x_vl_.drop(columns=['label'])

    return x_tr_, y_tr_, x_vl_, y_vl_

In [None]:
def soft_half2_tr_vl_split(train_df, before_x_vl, num, seed):
    """Second train / validation split that avoids the first split's ids.

    For every label that has more than ``num`` distinct ids, up to ``num``
    ids that are not in ``before_x_vl`` are drawn for validation. When fewer
    than ``num`` such ids exist, all of them are taken.

    Parameters
    ----------
    train_df : DataFrame with at least ``id`` and ``label`` columns.
    before_x_vl : ids used for validation in the half1 split; they are never
        drawn again here.
    num : number of ids drawn per label for validation.
    seed : random seed.

    Returns
    -------
    x_tr_, y_tr_, x_vl_, y_vl_
        Both feature frames have ``label`` and ``id`` removed.
    """
    # The draw below uses the stdlib `random` module, so that is the generator
    # that has to be seeded; numpy is seeded too, as before.
    np.random.seed(seed)
    random.seed(seed)

    excluded = set(before_x_vl)

    # One row per (id, label) pair taken from train_df itself (not from a
    # module-level frame), so an id cannot be drawn twice.
    id_label = train_df[['id', 'label']].drop_duplicates()
    vc = id_label['label'].value_counts()

    valid_id = []
    for a in vc[vc > num].index:
        id_list = id_label.loc[id_label['label'] == a, 'id'].tolist()
        # Filtering up front replaces the draw-until-disjoint loop, which had
        # no upper bound on the number of retries.
        candidates = [i for i in id_list if i not in excluded]
        valid_id += random.sample(candidates, min(num, len(candidates)))

    in_valid = train_df['id'].isin(valid_id)

    x_tr_ = train_df[~in_valid]
    y_tr_ = x_tr_['label']
    x_tr_ = x_tr_.drop(columns=['label', 'id'])

    x_vl_ = train_df[in_valid]
    y_vl_ = x_vl_['label']
    x_vl_ = x_vl_.drop(columns=['label', 'id'])

    return x_tr_, y_tr_, x_vl_, y_vl_

In [None]:
def hard_half1_tr_vl_split(train_df, num, seed):
    """Split ``train_df`` into train / validation parts by id.

    For every label that has more than ``num`` distinct ids, ``num`` of those
    ids are drawn at random for validation, so each label still appears in
    the training part and no id is shared between the two parts.

    Parameters
    ----------
    train_df : DataFrame with at least ``id`` and ``label`` columns.
    num : number of ids drawn per label for validation.
    seed : random seed.

    Returns
    -------
    x_tr_, y_tr_, x_vl_, y_vl_
        The feature frames keep every column of ``train_df`` (including
        ``id`` and ``label``); the caller selects the model columns.
    """
    # The draw below uses the stdlib `random` module, so that is the generator
    # that has to be seeded; numpy is seeded too, as before.
    np.random.seed(seed)
    random.seed(seed)
    train_label = train_df['label']

    # Everything is computed from the train_df argument, not from a
    # module-level frame. One row per (id, label) pair, so an id cannot be
    # drawn twice.
    id_label = train_df[['id', 'label']].drop_duplicates()
    vc = id_label['label'].value_counts()

    valid_id = []
    for a in vc[vc > num].index:
        id_list = id_label.loc[id_label['label'] == a, 'id'].tolist()
        valid_id += random.sample(id_list, num)

    in_valid = train_df['id'].isin(valid_id)

    x_tr_ = train_df[~in_valid]
    y_tr_ = train_label[~in_valid]

    x_vl_ = train_df[in_valid]
    y_vl_ = train_label[in_valid]

    return x_tr_, y_tr_, x_vl_, y_vl_

In [None]:
def hard_half2_tr_vl_split(train_df, num, seed, before_x_vl=None):
    """Second train / validation split that avoids the first split's ids.

    For every label that has more than ``num`` distinct ids, up to ``num``
    ids that are not in ``before_x_vl`` are drawn for validation. When fewer
    than ``num`` such ids exist, all of them are taken.

    Parameters
    ----------
    train_df : DataFrame with at least ``id`` and ``label`` columns.
    num : number of ids drawn per label for validation.
    seed : random seed.
    before_x_vl : ids used for validation in the half1 split. When omitted,
        the module-level variable ``before_x_vl`` is used, which is how
        existing three-argument calls keep working.

    Returns
    -------
    x_tr_, y_tr_, x_vl_, y_vl_
        The feature frames keep every column of ``train_df`` (including
        ``id`` and ``label``); the caller selects the model columns.

    Raises
    ------
    NameError
        If ``before_x_vl`` is neither passed nor defined at module level.
    """
    if before_x_vl is None:
        try:
            before_x_vl = globals()['before_x_vl']
        except KeyError:
            raise NameError("before_x_vl is not defined; pass it explicitly") from None

    # The draw below uses the stdlib `random` module, so that is the generator
    # that has to be seeded; numpy is seeded too, as before.
    np.random.seed(seed)
    random.seed(seed)
    train_label = train_df['label']

    excluded = set(before_x_vl)

    # Everything is computed from the train_df argument, not from a
    # module-level frame. One row per (id, label) pair, so an id cannot be
    # drawn twice.
    id_label = train_df[['id', 'label']].drop_duplicates()
    vc = id_label['label'].value_counts()

    valid_id = []
    for a in vc[vc > num].index:
        id_list = id_label.loc[id_label['label'] == a, 'id'].tolist()
        # Filtering up front replaces the draw-until-disjoint loop, which had
        # no upper bound on the number of retries.
        candidates = [i for i in id_list if i not in excluded]
        valid_id += random.sample(candidates, min(num, len(candidates)))

    in_valid = train_df['id'].isin(valid_id)

    x_tr_ = train_df[~in_valid]
    y_tr_ = train_label[~in_valid]

    x_vl_ = train_df[in_valid]
    y_vl_ = train_label[in_valid]

    return x_tr_, y_tr_, x_vl_, y_vl_

## Soft 파일
### LGBM
- half1 seed : 2014 (시간 부족으로 seed 1995, 2020은 모델 학습을 하지 못함)
- half2 seed : 1995, 2014, 2020

### XGB
- half1 seed : 1995, 2014 
- half2 seed : 1995, 2014

In [None]:
# Load the preprocessed "soft" frames for modelling.
train, test = (pd.read_pickle(soft_train + name)
               for name in ('train_soft.pickle', 'test_soft.pickle'))

## seed 1995, half1
- xgb (lgbm은 시간 부족으로 만들지 못함)

In [None]:
# First (half1) train / validation split for seed 1995.
x_tr, y_tr, x_vl, y_vl = soft_half1_tr_vl_split(train, 3, seed = 1995)

print('train shape :',x_tr.shape)
print('validation shape :',x_vl.shape)

# Remember which ids were used for validation; the half2 split excludes them.
before_x_vl = list(x_vl['id'].unique())

# 'id' was only kept to read the ids above; it is not a model feature.
del x_vl['id']  
gc.collect()

In [None]:
## xgb train: fit on the half1 split and store the model under soft_pred
xgb_half1_1995 = xgb_train(x_tr, y_tr, x_vl, y_vl, 1995)
joblib.dump(xgb_half1_1995, soft_pred+'xgb/xgb_half1_1995.pkl')

## xgb predict: [0] selects the branch that reads 'time' / 'id' from test
xgb_predict(test, xgb_half1_1995, soft_train+"xgb/xgb_half1_1995.csv",[0]) 

## seed 1995, half2
- lgbm, xgb

In [None]:
# Second (half2) split for seed 1995; ids in before_x_vl are not drawn again.
x_tr, y_tr, x_vl, y_vl = soft_half2_tr_vl_split(train, before_x_vl, 3, seed = 1995)

print('train shape :',x_tr.shape)
print('validation shape :',x_vl.shape)


In [None]:
## xgb train: fit on the half2 split and store the model under soft_pred
xgb_half2_1995 = xgb_train(x_tr, y_tr, x_vl, y_vl, 1995)
joblib.dump(xgb_half2_1995, soft_pred+'xgb/xgb_half2_1995.pkl')

## xgb predict: [0] selects the branch that reads 'time' / 'id' from test
xgb_predict(test, xgb_half2_1995, soft_train+"xgb/xgb_half2_1995.csv",[0]) 

## lgbm train: same split, LightGBM model
lgb_half2_1995 = lgbm_train(x_tr, y_tr, x_vl, y_vl, 1995)
joblib.dump(lgb_half2_1995, soft_pred+'lgbm/lgb_half2_1995.pkl')

## lgbm predict
lgbm_predict(test,lgb_half2_1995, soft_train+"lgbm/lgb_half2_1995.csv",[0]) 

## seed 2014, half1
- lgbm, xgb

In [None]:
# First (half1) train / validation split for seed 2014.
x_tr, y_tr, x_vl, y_vl = soft_half1_tr_vl_split(train, 3, seed = 2014)

print('train shape :',x_tr.shape)
print('validation shape :',x_vl.shape)

# Remember which ids were used for validation; the half2 split excludes them.
before_x_vl = list(x_vl['id'].unique())

# 'id' was only kept to read the ids above; it is not a model feature.
del x_vl['id']  
gc.collect()

In [None]:
## lgbm train: fit on the half1 split and store the model under soft_pred
lgb_half1_2014 = lgbm_train(x_tr, y_tr, x_vl, y_vl, 2014)
joblib.dump(lgb_half1_2014, soft_pred+'lgbm/lgb_half1_2014.pkl')

## lgbm predict: [0] selects the branch that reads 'time' / 'id' from test
lgbm_predict(test,lgb_half1_2014, soft_train+"lgbm/lgb_half1_2014.csv", [0]) 

## xgb train: same split, XGBoost model
xgb_half1_2014 = xgb_train(x_tr, y_tr, x_vl, y_vl, 2014)
joblib.dump(xgb_half1_2014, soft_pred+'xgb/xgb_half1_2014.pkl')

## xgb predict
xgb_predict(test, xgb_half1_2014, soft_train+"xgb/xgb_half1_2014.csv", [0]) 

## seed 2014, half2
- lgbm, xgb

In [None]:
# Second (half2) split for seed 2014; ids in before_x_vl are not drawn again.
x_tr, y_tr, x_vl, y_vl = soft_half2_tr_vl_split(train, before_x_vl, 3, seed = 2014)

print('train shape :',x_tr.shape)
print('validation shape :',x_vl.shape)


In [None]:
## lgbm train: fit on the half2 split and store the model under soft_pred
lgb_half2_2014 = lgbm_train(x_tr, y_tr, x_vl, y_vl, 2014)
joblib.dump(lgb_half2_2014, soft_pred+'lgbm/lgb_half2_2014.pkl')

## lgbm predict: [0] selects the branch that reads 'time' / 'id' from test
lgbm_predict(test,lgb_half2_2014, soft_train+"lgbm/lgb_half2_2014.csv",[0]) 

## xgb train: same split, XGBoost model
xgb_half2_2014 = xgb_train(x_tr, y_tr, x_vl, y_vl, 2014)
joblib.dump(xgb_half2_2014, soft_pred+'xgb/xgb_half2_2014.pkl')

## xgb predict
xgb_predict(test, xgb_half2_2014, soft_train+"xgb/xgb_half2_2014.csv", [0]) 

## seed 2020, half2
- lgbm (시간 부족으로 half1은 만들지 못함)

In [None]:
# half1 split for seed 2020: no model is trained on it here, it is only
# needed to obtain the validation ids that the half2 split must avoid.
x_tr, y_tr, x_vl, y_vl = soft_half1_tr_vl_split(train, 3, seed = 2020)

print('train shape :',x_tr.shape)
print('validation shape :',x_vl.shape)

before_x_vl = list(x_vl['id'].unique())

del x_vl['id']  
gc.collect()

# half2 split for seed 2020; this is the split the model below is trained on.
x_tr, y_tr, x_vl, y_vl = soft_half2_tr_vl_split(train, before_x_vl, 3, seed = 2020)

print('train shape :',x_tr.shape)
print('validation shape :',x_vl.shape)


In [None]:
## lgbm train: fit on the half2 split and store the model under soft_pred
lgb_half2_2020 = lgbm_train(x_tr, y_tr, x_vl, y_vl, 2020)
joblib.dump(lgb_half2_2020, soft_pred+'lgbm/lgb_half2_2020.pkl')

## lgbm predict: [0] selects the branch that reads 'time' / 'id' from test
lgbm_predict(test,lgb_half2_2020, soft_train+"lgbm/lgb_half2_2020.csv",[0]) 

## Hard 파일
### LGBM MODELING
- half1 seed : 1995, 2014, 2018, 2019, 2020 
- half2 seed : 1995, 2014, 2018, 2019, 2020 

### XGB MODELING
- half1 seed : 1995
- half2 seed : 1995

In [None]:
# Load the preprocessed "hard" frames for modelling.
train = pd.read_pickle(hard_train + 'train_hard.pickle')
test = pd.read_pickle(hard_train + 'test_hard.pickle')
test_id = test['id']

# Model features: columns present in both frames, excluding 'id'.
# ('-' binds tighter than '&'; the parentheses only make that explicit.)
shared_columns = set(train.columns) & (set(test.columns) - set(['id']))
var_model = sorted(shared_columns)

### LGBM MODELING

In [None]:
# For each seed: train / save / predict a LightGBM model on the half1 split,
# then repeat on a half2 split.
for seeds in [1995, 2014, 2018, 2019, 2020]:
    print('" SEED :',seeds,'"\n')

    
    # half 1: first split for this seed
    print('half1 start...')
    x_tr, y_tr, x_vl, y_vl = hard_half1_tr_vl_split(train,2,seed = seeds)

    print('train shape :',x_tr.shape)
    print('validation shape :',x_vl.shape)
    print('test shape :', test.shape)

    # The split keeps every column, so the model columns are selected here.
    lgb_model = lgbm_train(x_tr[var_model], y_tr, x_vl[var_model], y_vl, seeds)

    print('\nsave model...')
    joblib.dump(lgb_model, hard_pred+'lgbm/lgb_half1_'+ str(seeds) + '.pkl')  

    print('\nmake prediction...')
    lgbm_predict(test[var_model],lgb_model, hard_train+"lgbm/lgb_half1_"+str(seeds) +".csv", test_id) 
    print('-'*40)

    # half 2: second split for this seed
    print('\nhalf2 start...')
    # Module-level variable; hard_half2_tr_vl_split is called without it below.
    before_x_vl = list(x_vl['id'].unique())

    del x_tr, y_tr, x_vl, y_vl
    gc.collect()

    x_tr, y_tr, x_vl, y_vl = hard_half2_tr_vl_split(train,2,seed = seeds)

    print('train shape :',x_tr.shape)
    print('validation shape :',x_vl.shape)
    print('test shape :', test.shape)

    lgb_model = lgbm_train(x_tr[var_model], y_tr, x_vl[var_model], y_vl, seeds)

    print('\nsave model...')
    joblib.dump(lgb_model, hard_pred+'lgbm/lgb_half2_'+ str(seeds) + '.pkl')  

    print('\nmake prediction...')
    lgbm_predict(test[var_model],lgb_model, hard_train+"lgbm/lgb_half2_"+str(seeds) +".csv", test_id) 
    print('-'*40)
    print('-'*40)

### XGB MODELING

In [None]:
# For each seed: train / save / predict an XGBoost model on the half1 split,
# then repeat on a half2 split.
for seeds in [1995]:
    print('" SEED :',seeds,'"\n')

    
    # half 1: first split for this seed
    print('half1 start...')
    x_tr, y_tr, x_vl, y_vl = hard_half1_tr_vl_split(train,2,seed = seeds)

    print('train shape :',x_tr.shape)
    print('validation shape :',x_vl.shape)
    print('test shape :', test.shape)

    # The split keeps every column, so the model columns are selected here.
    xgb_model = xgb_train(x_tr[var_model], y_tr, x_vl[var_model], y_vl, seeds)

    print('\nsave model...')
    joblib.dump(xgb_model, hard_pred+'xgb/xgb_half1_'+ str(seeds) + '.pkl')  

    print('\nmake prediction...')
    xgb_predict(test[var_model],xgb_model, hard_train+"xgb/xgb_half1_"+str(seeds) +".csv", test_id) 
    print('-'*40)

    # half 2: second split for this seed
    print('\nhalf2 start...')
    # Module-level variable; hard_half2_tr_vl_split is called without it below.
    before_x_vl = list(x_vl['id'].unique())

    del x_tr, y_tr, x_vl, y_vl
    gc.collect()

    x_tr, y_tr, x_vl, y_vl = hard_half2_tr_vl_split(train,2,seed = seeds)

    print('train shape :',x_tr.shape)
    print('validation shape :',x_vl.shape)
    print('test shape :', test.shape)

    xgb_model = xgb_train(x_tr[var_model], y_tr, x_vl[var_model], y_vl, seeds)

    print('\nsave model...')
    joblib.dump(xgb_model, hard_pred+'xgb/xgb_half2_'+ str(seeds) + '.pkl')  

    print('\nmake prediction...')
    xgb_predict(test[var_model],xgb_model, hard_train+"xgb/xgb_half2_"+str(seeds) +".csv", test_id) 
    print('-'*40)
    print('-'*40)

## MODEL ENSEMBLE

### LGBM ENSEMBLE
- 모두 동일한 weight로 simple stacking

### XGB ENSEMBLE
- 모두 동일한 weight로 simple stacking

### SOFT LGBM ENSEMBLE
- half1은 1개, half2는 3개의 seed가 존재
- 1) half2를 동일한 weight로 simple stacking => half2_new
- 2) half1과 half2_new를 동일한 weight로 simple stacking

In [None]:
## Load the three half2 predictions and average them.
half_2_1 = pd.read_csv(soft_train + 'lgbm/' + 'lgb_half2_1995.csv')
half_2_2 = pd.read_csv(soft_train + 'lgbm/' + 'lgb_half2_2014.csv')
half_2_3 = pd.read_csv(soft_train + 'lgbm/' + 'lgb_half2_2020.csv')

# Column 0 is the id; the remaining columns are class probabilities.
id_list = half_2_1.iloc[:, 0:1]
sub = (half_2_1.iloc[:,1:] + half_2_2.iloc[:,1:] + half_2_3.iloc[:,1:] )/3
soft_lgbm_half2 = pd.concat([id_list, sub], axis= 1)
soft_lgbm_half2 = soft_lgbm_half2.set_index('id')
soft_lgbm_half2.to_csv(soft_train + 'lgbm/' + 'soft_lgbm_half2.csv')

## Average half1 with the combined half2.
half_1 = pd.read_csv(soft_train + 'lgbm/' + 'lgb_half1_2014.csv')
half_2 = pd.read_csv(soft_train + 'lgbm/' + 'soft_lgbm_half2.csv')

id_list = half_1.iloc[:, 0:1]
sub = (half_1.iloc[:,1:] + half_2.iloc[:,1:])/2
soft_lgbm_submission = pd.concat([id_list, sub], axis= 1)
soft_lgbm_submission = soft_lgbm_submission.set_index('id')

# 'id' is the index at this point, so the index has to be written out.
# index=False dropped the id column from the file, and the final blend reads
# that column back.
soft_lgbm_submission.to_csv(soft_train+'lgbm/lgbm_final_subsmission.csv', index= True)

### SOFT XGB ENSEMBLE

In [None]:
# Equal-weight average of the soft XGBoost prediction files whose name
# contains 'half'; the result is written next to them.
soft_xgb_submission = simple_stacking(dir_ = soft_train + 'xgb/')
soft_xgb_submission.to_csv(soft_train+'xgb/xgb_final_subsmission.csv', index= False)

### HARD LGBM ENSEMBLE

In [None]:
# Equal-weight average of the hard LightGBM prediction files whose name
# contains 'half'; the result is written next to them.
hard_lgbm_submission = simple_stacking(dir_ = hard_train + 'lgbm/')
hard_lgbm_submission.to_csv(hard_train+'lgbm/lgbm_final_subsmission.csv', index= False)

### HARD XGB ENSEMBLE

In [None]:
# Equal-weight average of the hard XGBoost prediction files whose name
# contains 'half'; the result is written next to them.
hard_xgb_submission = simple_stacking(dir_ = hard_train + 'xgb/')
hard_xgb_submission.to_csv(hard_train+'xgb/xgb_final_subsmission.csv', index= False)

## 6. 결과 및 결언
## Conclusion & Discussion

In [None]:
# Read the four per-model ensemble files (soft / hard x XGBoost / LightGBM).
soft_xgb, hard_xgb = (pd.read_csv(root+'xgb/xgb_final_subsmission.csv')
                      for root in (soft_train, hard_train))
soft_lgbm, hard_lgbm = (pd.read_csv(root+'lgbm/lgbm_final_subsmission.csv')
                        for root in (soft_train, hard_train))

In [None]:
# All four frames must list the same ids in the same order, otherwise the
# row-wise blend below would mix predictions of different samples.
for other in (soft_lgbm, hard_xgb, soft_xgb):
    if 'id' in other.columns and not np.array_equal(other['id'].to_numpy(), hard_lgbm['id'].to_numpy()):
        raise ValueError('submission files do not share the same id order')

# Weighted blend: 35% for each LightGBM ensemble, 15% for each XGBoost ensemble.
final_submission = (0.35*hard_lgbm + 0.35*soft_lgbm + 0.15*hard_xgb + 0.15*soft_xgb)

# Take the ids from an input frame. The blended 'id' column is a float
# weighted sum that can end up marginally below the true id, and astype(int)
# would then truncate it to the wrong value.
final_submission['id'] = hard_lgbm['id'].astype(int)

final_submission.to_csv('../1_Code_train/submission.csv',index = False)