In [18]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import gc
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

In [19]:
df_train = pd.read_csv('m5-forecasting-accuracy/sales_train_validation.csv')
calendar = pd.read_csv('m5-forecasting-accuracy/calendar.csv')
price = pd.read_csv('m5-forecasting-accuracy/sell_prices.csv')
# df_test = pd.read_csv('m5-forecasting-accuracy/sample_submission.csv')

In [20]:
from typing import Union

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm_notebook as tqdm


class WRMSSEEvaluator(object):

    def __init__(self, train_df: pd.DataFrame, valid_df: pd.DataFrame, calendar: pd.DataFrame, prices: pd.DataFrame, tst):
        train_y = train_df.loc[:, train_df.columns.str.startswith('d_')]
        train_target_columns = train_y.columns.tolist()
        weight_columns = train_y.iloc[:, -28:].columns.tolist()
#         train_id = train_id
        train_df['all_id'] = 0  # for lv1 aggregation

        id_columns = train_df.loc[:, ~train_df.columns.str.startswith('d_')].columns.tolist()
        valid_target_columns = valid_df.loc[:, valid_df.columns.str.startswith('d_')].columns.tolist()

        if not all([c in valid_df.columns for c in id_columns]):
            valid_df = pd.concat([train_df[id_columns], valid_df], axis=1, sort=False)

        self.train_df = train_df
        self.valid_df = valid_df
        self.calendar = calendar
        self.prices = prices
        self.tst = tst
        self.weight_columns = weight_columns
        self.id_columns = id_columns
        self.valid_target_columns = valid_target_columns

        weight_df = self.get_weight_df()

        self.group_ids = (
            'all_id',
            'state_id',
            'store_id',
            'cat_id',
            'dept_id',
            ['state_id', 'cat_id'],
            ['state_id', 'dept_id'],
            ['store_id', 'cat_id'],
            ['store_id', 'dept_id'],
            'item_id',
            ['item_id', 'state_id'],
            ['item_id', 'store_id']
        )

        for i, group_id in enumerate(tqdm(self.group_ids)):
            train_y = train_df.groupby(group_id)[train_target_columns].sum()
            scale = []
            for _, row in train_y.iterrows():
                series = row.values[np.argmax(row.values != 0):]
                scale.append(((series[1:] - series[:-1]) ** 2).mean())
            setattr(self, f'lv{i + 1}_scale', np.array(scale))
            setattr(self, f'lv{i + 1}_train_df', train_y)
            setattr(self, f'lv{i + 1}_valid_df', valid_df.groupby(group_id)[valid_target_columns].sum())

            lv_weight = weight_df.groupby(group_id)[weight_columns].sum().sum(axis=1)
            setattr(self, f'lv{i + 1}_weight', lv_weight / lv_weight.sum())

    def get_weight_df(self) -> pd.DataFrame:
        day_to_week = self.calendar.set_index('d')['wm_yr_wk'].to_dict()
        weight_df = self.train_df[['item_id', 'store_id'] + self.weight_columns].set_index(['item_id', 'store_id'])
        weight_df = weight_df.stack().reset_index().rename(columns={'level_2': 'd', 0: 'value'})
        weight_df['wm_yr_wk'] = weight_df['d'].map(day_to_week)

        weight_df = weight_df.merge(self.prices, how='left', on=['item_id', 'store_id', 'wm_yr_wk'])
        weight_df['value'] = weight_df['value'] * weight_df['sell_price']
        weight_df = weight_df.set_index(['item_id', 'store_id', 'd']).unstack(level=2)['value']
        weight_df = weight_df.loc[zip(self.train_df.item_id, self.train_df.store_id), :].reset_index(drop=True)
        weight_df = pd.concat([self.train_df[self.id_columns], weight_df], axis=1, sort=False)
        return weight_df

    def rmsse(self, valid_preds: pd.DataFrame, lv: int) -> pd.Series:
        valid_y = getattr(self, f'lv{lv}_valid_df')
        score = ((valid_y - valid_preds) ** 2).mean(axis=1)
        scale = getattr(self, f'lv{lv}_scale')
        scale = np.where(scale != 0 , scale, 1)
        return (score / scale).map(np.sqrt)

    def score(self, valid_preds: Union[pd.DataFrame, np.ndarray]) -> float:
        assert self.valid_df[self.valid_target_columns].shape == valid_preds.shape

        if isinstance(valid_preds, np.ndarray):
            valid_preds = pd.DataFrame(valid_preds, columns=self.valid_target_columns)

        valid_preds = pd.concat([self.valid_df[self.id_columns], valid_preds], axis=1, sort=False)

        all_scores = []
        for i, group_id in enumerate(self.group_ids):
            lv_scores = self.rmsse(valid_preds.groupby(group_id)[self.valid_target_columns].sum(), i + 1)
            weight = getattr(self, f'lv{i + 1}_weight')
            lv_scores = pd.concat([weight, lv_scores], axis=1, sort=False).prod(axis=1)
            all_scores.append(lv_scores.sum())

        return np.mean(all_scores)
    
    
class WRMSSEForLightGBM(WRMSSEEvaluator):

    def feval(self, preds, dtrain):
#         print(preds.shape, self.tst.shape)
#         tst= self.df[self.df['day'].isin(valid_target_columns)]
#         tst['id'] = train_id.loc[tst.index]
        tmp = self.tst.copy()
        tmp['preds'] = preds
        tmp=  tmp.set_index(['id',"day"]).unstack()["preds"].reset_index()
        tmp =  tmp.fillna(0)

        val = pd.DataFrame()
        val['id'] = self.train_df['id']
        pred = pd.merge(val, tmp, how = 'left')
        pred = pred.fillna(0)
#         print(pred.columns)
#         print(self.valid_target_columns)
        pred = pred.loc[:,self.valid_target_columns ]
#         cv_scores.append(evaluator.score(pred))
#         preds = preds.reshape(self.valid_df[self.valid_target_columns].shape)
        score = self.score(pred)
        return 'WRMSSE', score, False

In [21]:
# tst = pd.read_csv('m5-forecasting-accuracy/sales_train_validation.csv')
# np.where(tst.iloc[0,6:].values>0)[0].max()
# tst.iloc[0,6:].values>0
# tst[tst['id']=='HOBBIES_1_210_CA_1_validation']
# for i, val in last_dict.items():
#     if val<1750 : print(i)

In [22]:
# # startpoints 찾아서 이전 데이터 지우기 + last sales를 찾기
startpoints = np.zeros(df_train.shape[0])
# lastpoints = np.zeros(df_train.shape[0])
for idx in tqdm(range(df_train.shape[0])):
    startpoints[idx]= np.where(df_train.iloc[idx,6:].values>0)[0].min().astype(int)
#     lastpoints[idx]= np.where(df_train.iloc[idx,6:].values>0)[0].max().astype(int)
start_dict = dict(zip(df_train['id'], startpoints))
# last_dict = dict(zip(df_train['id'], lastpoints))

HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))




In [23]:
drop_cols = []
cat_cols = []
drop_cols += ['date','d','id']
tr_last = 1913
# F_1~28 만들기  1914~1941 까지 
for i in range(tr_last+1, tr_last+1+28):   df_train['d_%s'%i] = 0

# # Unpivot
df_train = pd.melt(df_train, id_vars=df_train.columns[:6], value_vars=df_train.columns[6:],
       var_name = 'day', value_name = 'volume')



In [24]:
df_train = pd.merge(df_train, calendar, left_on = 'day', right_on ='d')
# snap 합치기
snap = np.zeros(df_train.shape[0])
snap[df_train[(df_train['state_id']=='CA')&(df_train['snap_CA']==1)].index] +=1
snap[df_train[(df_train['state_id']=='TX')&(df_train['snap_TX']==1)].index] +=1
snap[df_train[(df_train['state_id']=='WI')&(df_train['snap_WI']==1)].index] +=1
df_train['snap'] = snap
drop_cols += ['snap_CA','snap_TX','snap_WI']


cat_cols += [ 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
#               'wday', 'month', 'year', # 이게 크리티컬?
            'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'snap']


# ????

In [25]:
# Sell price
df_train.head()
df_train = pd.merge(df_train, price)

# # Start point 찾기 ::  0.1% 데이터를 날릴  수 있다. 

df_train['startpoint'] = df_train['id'].map(start_dict).astype(int)#.astype(str)
df_train['startpoints'] = df_train['day'].str.slice(start=2).astype(int) >=df_train['startpoint']
print(df_train['startpoints'].value_counts())
df_train = df_train[df_train['startpoints']]
print(df_train.shape)
df_train.drop(['startpoint','startpoints'],axis =1, inplace= True)

True     46816555
False       65122
Name: startpoints, dtype: int64
(46816555, 26)


In [26]:
#df_train['lastpoint'] = df_train['id'].map(last_dict).astype(int)#.astype(str)
#df_train['from_lastpoint'] = df_train['day'].str.slice(start=2).astype(int) - df_train['lastpoint']

#df_train.drop(['lastpoint'],axis =1, inplace= True)

In [27]:
df_train.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day,volume,date,wm_yr_wk,...,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,snap,sell_price
0,HOBBIES_1_008_CA_1_validation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,d_1,12,2011-01-29,11101,...,d_1,,,,,0,0,0,0.0,0.46
1,HOBBIES_1_008_CA_1_validation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,d_2,15,2011-01-30,11101,...,d_2,,,,,0,0,0,0.0,0.46
2,HOBBIES_1_008_CA_1_validation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,d_3,0,2011-01-31,11101,...,d_3,,,,,0,0,0,0.0,0.46
3,HOBBIES_1_008_CA_1_validation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,d_4,0,2011-02-01,11101,...,d_4,,,,,1,1,0,1.0,0.46
4,HOBBIES_1_008_CA_1_validation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,d_5,0,2011-02-02,11101,...,d_5,,,,,1,0,1,1.0,0.46


In [28]:
df_train

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day,volume,date,wm_yr_wk,...,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,snap,sell_price
0,HOBBIES_1_008_CA_1_validation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,d_1,12,2011-01-29,11101,...,d_1,,,,,0,0,0,0.0,0.46
1,HOBBIES_1_008_CA_1_validation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,d_2,15,2011-01-30,11101,...,d_2,,,,,0,0,0,0.0,0.46
2,HOBBIES_1_008_CA_1_validation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,d_3,0,2011-01-31,11101,...,d_3,,,,,0,0,0,0.0,0.46
3,HOBBIES_1_008_CA_1_validation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,d_4,0,2011-02-01,11101,...,d_4,,,,,1,1,0,1.0,0.46
4,HOBBIES_1_008_CA_1_validation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,d_5,0,2011-02-02,11101,...,d_5,,,,,1,0,1,1.0,0.46
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46881672,FOODS_3_825_WI_3_validation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,d_1941,0,2016-05-22,11617,...,d_1941,,,,,0,0,0,0.0,3.98
46881673,FOODS_3_826_WI_3_validation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,d_1940,0,2016-05-21,11617,...,d_1940,,,,,0,0,0,0.0,1.28
46881674,FOODS_3_826_WI_3_validation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,d_1941,0,2016-05-22,11617,...,d_1941,,,,,0,0,0,0.0,1.28
46881675,FOODS_3_827_WI_3_validation,FOODS_3_827,FOODS_3,FOODS,WI_3,WI,d_1940,0,2016-05-21,11617,...,d_1940,,,,,0,0,0,0.0,1.00


In [29]:
# df_train['rcount_7_7'] = df_train[['id','volume_7']].groupby("id")['volume_7'].transform(lambda x: x.rolling(7).count() if x>0).fllna(0)
# df_train[['id','volume']].groupby("id")['volume'].transform(lambda x: x.rolling(7).count() if x>0 else 0 )#.fllna(0)


In [30]:
%%time

# 왜 Shift를 해야할까?
# Shift 28을 하지 않으면 예측값이 뒤로 가면갈 수록  F1->F28 예측 할 수 있는 변수가 줄어든게 된다.
# 28일은 한달을 의미한다. 최근 한달간의 경향성을 보는 것으로 보면 되겠다.
# 28일을 56일로 늘리면 안되나? - 최근 한달간의 경향성이 반영이 안되는 것이낙?
# https://www.kaggle.com/kneroma/m5-first-public-notebook-under-0-50

#df_train['volume_1'] = df_train[['id','volume']].groupby("id")['volume'].shift(1)
#df_train['volume_2'] = df_train[['id','volume']].groupby("id")['volume'].shift(2)
#df_train['volume_3'] = df_train[['id','volume']].groupby("id")['volume'].shift(3)

df_train['volume_7'] = df_train[['id','volume']].groupby("id")['volume'].shift(7)
df_train['volume_28'] = df_train[['id','volume']].groupby("id")['volume'].shift(28)

print("mean")

df_train['rmean_7_7'] = df_train[['id','volume_7']].groupby("id")['volume_7'].transform(lambda x: x.rolling(7).mean())
df_train['rmean_7_28'] = df_train[['id','volume_7']].groupby("id")['volume_7'].transform(lambda x: x.rolling(28).mean())
df_train['rmean_7_50'] = df_train[['id','volume_7']].groupby("id")['volume_7'].transform(lambda x: x.rolling(50).mean())

df_train['rmean_28_7'] = df_train[['id','volume_28']].groupby("id")['volume_28'].transform(lambda x: x.rolling(7).mean())
df_train['rmean_28_28'] = df_train[['id','volume_28']].groupby("id")['volume_28'].transform(lambda x: x.rolling(28).mean())
df_train['rmean_28_50'] = df_train[['id','volume_28']].groupby("id")['volume_28'].transform(lambda x: x.rolling(50).mean())

print("std")
df_train['rstd_7_7'] = df_train[['id','volume_7']].groupby("id")['volume_7'].transform(lambda x: x.rolling(7).std())
df_train['rstd_7_28'] = df_train[['id','volume_7']].groupby("id")['volume_7'].transform(lambda x: x.rolling(28).std())
df_train['rstd_7_50'] = df_train[['id','volume_7']].groupby("id")['volume_7'].transform(lambda x: x.rolling(50).std())

df_train['rstd_28_7'] = df_train[['id','volume_28']].groupby("id")['volume_28'].transform(lambda x: x.rolling(7).std())
df_train['rstd_28_28'] = df_train[['id','volume_28']].groupby("id")['volume_28'].transform(lambda x: x.rolling(28).std())
df_train['rstd_28_50'] = df_train[['id','volume_28']].groupby("id")['volume_28'].transform(lambda x: x.rolling(50).std())

print("max")
df_train['rmax_7_7'] = df_train[['id','volume_7']].groupby("id")['volume_7'].transform(lambda x: x.rolling(7).max())
df_train['rmax_7_28'] = df_train[['id','volume_7']].groupby("id")['volume_7'].transform(lambda x: x.rolling(28).max())
df_train['rmax_7_50'] = df_train[['id','volume_7']].groupby("id")['volume_7'].transform(lambda x: x.rolling(50).max())

df_train['rmax_28_7'] = df_train[['id','volume_28']].groupby("id")['volume_28'].transform(lambda x: x.rolling(7).max())
df_train['rmax_28_28'] = df_train[['id','volume_28']].groupby("id")['volume_28'].transform(lambda x: x.rolling(28).max())
df_train['rmax_28_50'] = df_train[['id','volume_28']].groupby("id")['volume_28'].transform(lambda x: x.rolling(50).max())

# print("min")
# df_train['rmin_7_7'] = df_train[['id','volume_7']].groupby("id")['volume_7'].transform(lambda x: x.rolling(7).min())
# df_train['rmin_7_28'] = df_train[['id','volume_7']].groupby("id")['volume_7'].transform(lambda x: x.rolling(28).min())
# df_train['rmin_7_50'] = df_train[['id','volume_7']].groupby("id")['volume_7'].transform(lambda x: x.rolling(50).min())

# df_train['rmin_28_7'] = df_train[['id','volume_28']].groupby("id")['volume_28'].transform(lambda x: x.rolling(7).min())
# df_train['rmin_28_28'] = df_train[['id','volume_28']].groupby("id")['volume_28'].transform(lambda x: x.rolling(28).min())
# df_train['rmin_28_50'] = df_train[['id','volume_28']].groupby("id")['volume_28'].transform(lambda x: x.rolling(50).min())

# print("count")
# df_train['rcount_7_7'] = df_train[['id','volume_7']].groupby("id")['volume_7'].transform(lambda x: x.rolling(7).count() if x>0).fllna(0)
# df_train['rcount_7_28'] = df_train[['id','volume_7']].groupby("id")['volume_7'].transform(lambda x: x.rolling(28).count() if x>0).fllna(0)
# df_train['rcount_7_50'] = df_train[['id','volume_7']].groupby("id")['volume_7'].transform(lambda x: x.rolling(50).count() if x>0).fllna(0)

# df_train['rcount_28_7'] = df_train[['id','volume_28']].groupby("id")['volume_28'].transform(lambda x: x.rolling(7).count() if x>0 else 0).fllna(0)
# df_train['rcount_28_28'] = df_train[['id','volume_28']].groupby("id")['volume_28'].transform(lambda x: x.rolling(28).count() if x>0 else 0).fllna(0)
# df_train['rcount_28_50'] = df_train[['id','volume_28']].groupby("id")['volume_28'].transform(lambda x: x.rolling(50).count() if x>0 else 0).fllna(0)

# full_df['rcnt_7'] = full_df[['id','volume']].groupby("id")['volume'].transform(lambda x: x.rolling(7).count() if x>0).fllna(0)
# full_df['rcnt_28'] = full_df[['id','volume']].groupby("id")['volume'].transform(lambda x: x.rolling(28).count() if x>0).fllna(0)
# full_df['rcnt_50'] = full_df[['id','volume']].groupby("id")['volume'].transform(lambda x: x.rolling(28).count() if x>0).fllna(0)


mean
std
max


MemoryError: Unable to allocate 6.63 GiB for an array with shape (19, 46816555) and data type float64

In [31]:
df_train['date'] =  pd.to_datetime(df_train["date"])

MemoryError: Unable to allocate 4.53 GiB for an array with shape (13, 46816555) and data type object

In [None]:
df_train['week'] = getattr(df_train["date"].dt, "weekofyear").astype("int16")
# df_train['quarter'] = getattr(df_train["date"].dt,"quarter").astype("int16")
df_train['mday'] = getattr(df_train["date"].dt, "day").astype("int16")


In [None]:
cols =['event_name_1','event_type_1','event_name_2','event_type_2']
df_train[cols]= df_train[cols].fillna('NaN')
print(df_train.shape)


In [None]:
gc.collect()

In [None]:
df_train.shape

In [None]:
df_train = df_train[~df_train['rmean_28_50'].isna()]
# dropna 하면 뻥난다.

In [None]:
for col in df_train.columns:
    print(col, df_train[col].isnull().sum())
# df_train.isnull().sum()

In [None]:
# df_train.dropna()#inplace =True

In [None]:
df_train.shape

In [None]:
drop_cols += ["wm_yr_wk", "weekday"]  ## 이게 문제?
drop_cols

In [None]:
df_train.head()
tr_last = 1913
testday = ['d_%s'% x for x in range(tr_last+1, tr_last+1+28)]
train_id = df_train['id']
df_test_id = df_train[df_train['day'].isin(testday)]['id']

In [None]:
df_train= df_train.drop(drop_cols,axis =1 )
# df_test =df_test.drop(drop_cols,axis =1)

In [None]:
df_train.head()

In [None]:
# Encoding
for col in tqdm(cat_cols) :  # encoding -1이 문제?
    le = LabelEncoder()
    df_train[col] = le.fit_transform(df_train[col]).astype(np.int8)

    gc.collect()

In [None]:
# Test 분리하기
testday = ['d_%s'% x for x in range(tr_last+1, tr_last+1+28)]
df_test = df_train.copy()
df_train = df_train[~df_train['day'].isin(testday)]
# train_col = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'day', 'volume', 'wday', 'month', 'year',
#  'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'snap', 'sell_price', 'volume_7', 'volume_28',
#  'rmean_7_7', 'rmean_7_28', 'rmean_7_50', 'rmean_28_7', 'rmean_28_28', 'rmean_28_50', 'week', 'quarter', 'mday']
# df_train = df_train.loc[:,train_col]

In [None]:
df_test.head()

In [None]:
# %%time

# from sklearn.model_selection import train_test_split

# x_train , x_valid = train_test_split(df_train, test_size =0.15, random_state = 99)
# y_train, y_valid = x_train['volume'], x_valid['volume']

# x_train = x_train.drop(['day','volume'], axis =1)
# x_valid = x_valid.drop(['day','volume'], axis =1)

In [None]:
df_train.head()

In [None]:
# params = {
#         "objective" : "poisson",
#         "metric" :"rmse",
#         "force_row_wise" : True,
#         "learning_rate" : 0.075,
# #         "sub_feature" : 0.8,
#         "sub_row" : 0.75,
#         "bagging_freq" : 1,
#         "lambda_l2" : 0.1,
# #         "nthread" : 4
#         "metric": ["rmse"],
#     'verbosity': 1,
#     'num_iterations' : 3000,
#     'num_leaves': 128,
#     "min_data_in_leaf": 100,
#         'n_jobs' :10 
# }


params = {
        "objective" : "poisson",
#         "metric" :"rmse",
        "force_row_wise" : True,
        "learning_rate" : 0.075,
#         "sub_feature" : 0.8,
        "sub_row" : 0.75,
        "bagging_freq" : 1,
        "lambda_l2" : 0.1,
#         "nthread" : 4
        "metric":"None",# ["rmse"],#"None",#
    'verbosity': 1,
    'num_iterations' : 10000,
    'num_leaves': 128,
    "min_data_in_leaf": 100,
        'n_jobs' :10 
}

In [None]:
%%time
# New Cross validation
import pickle
decay_rate=0.01
rounds =20
cv_scores = []
folds = 3
cols = ["d_%s"%x for x in range(1,1914)]
days = list(df_train['day'].unique())
cols = [x for x in cols if x in days] # 삭제된 day 삭제
n= int(np.round(len(cols)/folds))
for idx in range(folds):

    print(idx)
    # Set Evaluator
    
    d_valid = cols[n*(idx):n*(idx+1)]
    d_train = [x for x in cols if x not in d_valid]
    tmp = pd.read_csv('m5-forecasting-accuracy/sales_train_validation.csv')
    tmp_train = pd.concat([tmp.iloc[:,:6],tmp.loc[:, d_train]], axis =1)
    tmp_valid = tmp.loc[:, d_valid]
#     evaluator = WRMSSEEvaluator(tmp_train, tmp_valid, calendar, price)
    tst= df_train[df_train['day'].isin(d_valid)]
    tst['id'] = train_id.loc[tst.index]
    
    
    evaluator = WRMSSEForLightGBM(tmp_train, tmp_valid, calendar, price,tst)

    del tmp 
    gc.collect()
    
    # Set train, valid
    x_train  = df_train[df_train['day'].isin(d_train)].drop(['day','volume'], axis =1)
    y_train = df_train[df_train['day'].isin(d_train)]['volume']
    x_valid  = df_train[df_train['day'].isin(d_valid)].drop(['day','volume'], axis =1)
    y_valid = df_train[df_train['day'].isin(d_valid)]['volume']
    print("set train, valid")
    # Modeling
    lgb_train = lgb.Dataset(x_train, y_train,categorical_feature=cat_cols)
    lgb_eval = lgb.Dataset(x_valid, y_valid,categorical_feature=cat_cols)
    gbm = lgb.train(params, lgb_train, valid_sets=( lgb_eval),
                   feval= evaluator.feval,# Custom Loss 사용
#                     callbacks = [lgb.early_stopping(10, first_metric_only=True)],# Metric First 체크 -earyly Stopping 에
                    early_stopping_rounds= 50,  # rmse 를 기준으로 하고 -- 분석한 후에 WRMSSE만 넣어서 할것
                callbacks=[lgb.reset_parameter(learning_rate = lambda iter: (1 / (1 + decay_rate * (iter//rounds))) * params['learning_rate']​)],
                    verbose_eval=100) 

    pickle.dump(gbm,open( "20200502_model_%s_r1_decay.pkl"%idx, "wb" ))
    
#     evaluator = WRMSSEEvaluator(tmp_train, tmp_valid, calendar, price)
    
#     preds= gbm.predict(x_valid)
#     tst= df_train[df_train['day'].isin(d_valid)]
#     tst['id'] = train_id.loc[tst.index]
#     tst['preds'] = preds
#     tst= tst.set_index(['id',"day"]).unstack()["preds"].reset_index()
#     tst = tst.fillna(0)

#     val = pd.DataFrame()
#     val['id'] = tmp_train['id']
#     pred = pd.merge(val,tst, how = 'left')
#     pred = pred.fillna(0)
#     pred = pred.loc[:,d_valid]
#     cv_scores.append(evaluator.score(pred))
#     print("idx","WRMSSE :", evaluator.score(pred))
    
    
    del gbm,tmp_train, tmp_valid, evaluator
    gc.collect()

In [None]:
alpha =0.019
e = 3
k =1/1000
x= lambda current_round: alpha * e**(k*current_round)
x(100)

In [None]:
10//5

In [None]:
decay_rate=0.01
rounds =20
x= lambda iter: (1 / (1 + decay_rate * (iter//rounds))) * params['learning_rate']
for i in range(100):
    print(i, x(i))

In [None]:
# %%time
# seed = 99
# folds = 3
# from sklearn.model_selection import StratifiedKFold
# from sklearn.metrics import mean_squared_log_error
# import pickle

# skf = StratifiedKFold(n_splits=folds, random_state=seed, shuffle=True)

# for idx, (train_index, test_index) in enumerate(skf.split(df_train.index,df_train['volume'])):
#     print(idx)
#     x_train = df_train.iloc[train_index].drop(['day','volume'], axis =1)
#     y_train = df_train.iloc[train_index]['volume']
#     x_valid = df_train.iloc[test_index].drop(['day','volume'], axis =1)
#     y_valid = df_train.iloc[test_index]['volume'] 
    
#     # Modeling
#     lgb_train = lgb.Dataset(x_train, y_train,categorical_feature=cat_cols)
#     lgb_eval = lgb.Dataset(x_valid, y_valid,categorical_feature=cat_cols)
#     gbm = lgb.train(params, lgb_train,
# #                     num_boost_round=1000, 
#                     valid_sets=(lgb_train, lgb_eval),
#                     early_stopping_rounds= 50,#100,
#                     verbose_eval=200) #100)

#     pickle.dump(gbm,open( "20200501_model_%s_r2.pkl"%idx, "wb" ))
        
#     del gbm
#     gc.collect()
    

In [None]:
%%time
df_test['id'] = train_id
max_lag  = 120
trn_lst = 1913
for tdelta in range(1, 29):
    f_day = trn_lst+tdelta
    print(f_day)
    days = [f"d_{i}" for i in range(trn_lst-max_lag+tdelta,f_day+1)]
    tst = df_test[df_test['day'].isin(days)]

    print("rolling")
   # tst['volume_1'] = tst[['id','volume']].groupby("id")['volume'].shift(1)
  #  tst['volume_2'] = tst[['id','volume']].groupby("id")['volume'].shift(2)
  #  tst['volume_3'] = tst[['id','volume']].groupby("id")['volume'].shift(3)
    
    tst['volume_7'] = tst[['id','volume']].groupby("id")['volume'].shift(7)
    tst['volume_28'] = tst[['id','volume']].groupby("id")['volume'].shift(28)
    
    print("mean")
    tst['rmean_7_7'] = tst[['id','volume_7']].groupby("id")['volume_7'].transform(lambda x: x.rolling(7).mean())
    tst['rmean_7_28'] = tst[['id','volume_7']].groupby("id")['volume_7'].transform(lambda x: x.rolling(28).mean())
    tst['rmean_7_50'] = tst[['id','volume_7']].groupby("id")['volume_7'].transform(lambda x: x.rolling(50).mean())

    tst['rmean_28_7'] = tst[['id','volume_28']].groupby("id")['volume_28'].transform(lambda x: x.rolling(7).mean())
    tst['rmean_28_28'] = tst[['id','volume_28']].groupby("id")['volume_28'].transform(lambda x: x.rolling(28).mean())
    tst['rmean_28_50'] = tst[['id','volume_28']].groupby("id")['volume_28'].transform(lambda x: x.rolling(50).mean())
    
    print("std")
    tst['rstd_7_7'] = tst[['id','volume_7']].groupby("id")['volume_7'].transform(lambda x: x.rolling(7).std())
    tst['rstd_7_28'] = tst[['id','volume_7']].groupby("id")['volume_7'].transform(lambda x: x.rolling(28).std())
    tst['rstd_7_50'] = tst[['id','volume_7']].groupby("id")['volume_7'].transform(lambda x: x.rolling(50).std())

    tst['rstd_28_7'] = tst[['id','volume_28']].groupby("id")['volume_28'].transform(lambda x: x.rolling(7).std())
    tst['rstd_28_28'] = tst[['id','volume_28']].groupby("id")['volume_28'].transform(lambda x: x.rolling(28).std())
    tst['rstd_28_50'] = tst[['id','volume_28']].groupby("id")['volume_28'].transform(lambda x: x.rolling(50).std())

    print("max")
    tst['rmax_7_7'] = tst[['id','volume_7']].groupby("id")['volume_7'].transform(lambda x: x.rolling(7).max())
    tst['rmax_7_28'] = tst[['id','volume_7']].groupby("id")['volume_7'].transform(lambda x: x.rolling(28).max())
    tst['rmax_7_50'] = tst[['id','volume_7']].groupby("id")['volume_7'].transform(lambda x: x.rolling(50).max())

    tst['rmax_28_7'] = tst[['id','volume_28']].groupby("id")['volume_28'].transform(lambda x: x.rolling(7).max())
    tst['rmax_28_28'] = tst[['id','volume_28']].groupby("id")['volume_28'].transform(lambda x: x.rolling(28).max())
    tst['rmax_28_50'] = tst[['id','volume_28']].groupby("id")['volume_28'].transform(lambda x: x.rolling(50).max())

#     print("min")
#     tst['rmin_7_7'] = tst[['id','volume_7']].groupby("id")['volume_7'].transform(lambda x: x.rolling(7).min())
#     tst['rmin_7_28'] = tst[['id','volume_7']].groupby("id")['volume_7'].transform(lambda x: x.rolling(28).min())
#     tst['rmin_7_50'] = tst[['id','volume_7']].groupby("id")['volume_7'].transform(lambda x: x.rolling(50).min())

#     tst['rmin_28_7'] = tst[['id','volume_28']].groupby("id")['volume_28'].transform(lambda x: x.rolling(7).min())
#     tst['rmin_28_28'] = tst[['id','volume_28']].groupby("id")['volume_28'].transform(lambda x: x.rolling(28).min())
#     tst['rmin_28_50'] = tst[['id','volume_28']].groupby("id")['volume_28'].transform(lambda x: x.rolling(50).min())

#     print("count")
#     tst['rcount_7_7'] = tst[['id','volume_7']].groupby("id")['volume_7'].transform(lambda x: x.rolling(7).count() if x>0).fllna(0)
#     tst['rcount_7_28'] = tst[['id','volume_7']].groupby("id")['volume_7'].transform(lambda x: x.rolling(28).count() if x>0).fllna(0)
#     tst['rcount_7_50'] = tst[['id','volume_7']].groupby("id")['volume_7'].transform(lambda x: x.rolling(50).count() if x>0).fllna(0)

#     tst['rcount_28_7'] = tst[['id','volume_28']].groupby("id")['volume_28'].transform(lambda x: x.rolling(7).count() if x>0).fllna(0)
#     tst['rcount_28_28'] = tst[['id','volume_28']].groupby("id")['volume_28'].transform(lambda x: x.rolling(28).count() if x>0).fllna(0)
#     tst['rcount_28_50'] = tst[['id','volume_28']].groupby("id")['volume_28'].transform(lambda x: x.rolling(50).count() if x>0).fllna(0)

    
    tst = tst[tst['day'] == "d_%s"%(f_day)]
    t_id,t_volume,t_day = tst['id'],tst['volume'],tst['day']
    tst = tst.drop(['id','volume','day'], axis =1)
    
    # Crossvalidation 
    for idx in range(folds):
        gbm = pickle.load(open( "20200502_model_%s_r1_decay.pkl"%idx, "rb" ))
        df_test.loc[df_test.day=="d_%s"%(f_day),'volume'] += 1.028*gbm.predict(tst) / folds
        del gbm
        gc.collect()
 
    del tst 
    gc.collect()
    #0을 예측 못하니까 0으로 치환하는 것 -- test
#     df_test.loc[df_test.volume<0.5,'volume'] = 0
    
    


In [None]:
cols = [f"d_{i}" for i in range(1914,1942)]

sub = df_test[df_test['day'].isin(cols)].loc[:,['id','volume']]
sub['F']= [f"F{rank}" for rank in sub.groupby("id")["id"].cumcount()+1]
sub = sub.set_index(["id", "F" ]).unstack()["volume"].reset_index()
sub.sort_values("id", inplace = True)
sub.reset_index(drop=True, inplace = True)                                                   
sub =sub[['id']+["F%s"% x for x in range(1,29)]]

sub = sub.fillna(0)

sub2 = sub.copy()
sub2["id"] = sub2["id"].str.replace("validation$", "evaluation")
sub = pd.concat([sub, sub2], axis=0, sort=False)
sub.to_csv("submission_20200502_2_decay.csv",index=False)

In [None]:
gc.collect()

In [None]:
# Feature 체크하기
import pickle
gbm = pickle.load(open( "20200502_model_2_r1_decay.pkl", "rb" ))
# %%time

from sklearn.model_selection import train_test_split

def rmse(y, y_pred):
    return np.sqrt(np.mean(np.square(y - y_pred)))


# x_train , x_valid = train_test_split(df_train, test_size =0.05, random_state = 99)
# y_train, y_valid = x_train['volume'], x_valid['volume']

# x_train = x_train.drop(['day','volume'], axis =1)
# x_valid = x_valid.drop(['day','volume'], axis =1)

cols = ["d_%s"% x for x in range(1883,1914)]
x_valid = df_train[df_train['day'].isin(cols)]
y_valid = x_valid['volume']
x_valid = x_valid.drop(['day','volume'], axis =1)

In [None]:
%%time
pred = gbm.predict(x_valid)
print("original", rmse(y_valid, pred))
origin = rmse(y_valid, pred)

In [None]:
%%time 
# 컬럼들을 랜덤하게 섞어서 예측해보는 것
# cols = list(x_valid.columns)

z = pd.read_csv('featuretest.csv')
cols = list(x for x in x_valid.columns if x not in list(z['columns']))
fin = dict()
for col in tqdm(cols):
    tmp = x_valid.copy()
    tmp[col] = np.random.permutation(tmp[col].values)
    t_rmse = rmse(y_valid, gbm.predict(tmp))
    gap = t_rmse-origin  # origin 보다 에러가 커졌으면 이 컬럼은 중요한 것 !
    print(col, t_rmse, gap)
    fin[col] = gap

print(fin)
new = pd.DataFrame()
new['columns'] =fin.keys()
new['values'] = fin.values()
z = pd.concat([z,new])
z.to_csv('featuretest.csv',index=False)
