In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import gc
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

In [2]:
df_train = pd.read_csv('m5-forecasting-accuracy/sales_train_validation.csv')
calendar = pd.read_csv('m5-forecasting-accuracy/calendar.csv')
price = pd.read_csv('m5-forecasting-accuracy/sell_prices.csv')
df_test = pd.read_csv('m5-forecasting-accuracy/sample_submission.csv')

In [3]:
# startpoints 찾아서 이전 데이터 지우기
startpoints = np.zeros(df_train.shape[0])
for idx in tqdm(range(df_train.shape[0])):
    startpoints[idx]= np.where(df_train.iloc[idx,6:].values>0)[0].min().astype(int)

100%|███████████████████████████████████████████████████████████████████████████| 30490/30490 [00:36<00:00, 826.18it/s]


In [4]:
start_dict = dict(zip(df_train['id'], startpoints))

In [5]:
df_train['key'] = df_train['id'].apply(lambda x : x[:-11])


In [6]:
# test 데이터 변환
df_test = pd.read_csv('m5-forecasting-accuracy/sample_submission.csv')

lst = list(calendar[1941:]['d'])
for i in range(1,29):
    df_test = df_test.rename(columns = {'F%s'%i:lst[i-1]})

# Unpivot
df_test = pd.melt(df_test, id_vars=df_test.columns[:1], value_vars=df_test.columns[1:],
       var_name = 'day', value_name = 'volume')

df_test['key'] = df_test['id'].apply(lambda x : x[:-11])
# df_train에 있는 기본 정보 붙이기
df_test = pd.merge(df_test,df_train.loc[:,['key','item_id','dept_id','cat_id','store_id','state_id']],
         on =['key'], how='left')

df_train.drop(['key'],axis =1, inplace =True)
df_test.drop(['key'], axis = 1, inplace =True)
    

In [7]:
test_id = df_test['id']


### 전략
1. Unpivot만 해서 LGBM 하기 
2. 날짜 인코딩만해서 LGBM 사용하기
3. Data preprocessing 어떻게 할 것인지?
4. 최근 며칠간의 평균 판매액
5. rolling 추가
6. validation set을 6월 데이터 를 예측하는 것으로 해볼까?
  - 2015년 6월 데이터 예측 ? 

In [8]:
# Unpivot
df_train = pd.melt(df_train, id_vars=df_train.columns[:6], value_vars=df_train.columns[6:],
       var_name = 'day', value_name = 'volume')

    

In [9]:
# calendar 추가하기
df_train = pd.merge(df_train, calendar, left_on = 'day', right_on ='d')
df_test = pd.merge(df_test, calendar, left_on = 'day', right_on ='d')

df_train = df_train.fillna("NaN")
df_test = df_test.fillna("NaN")

In [10]:
drop_cols = []
cat_cols = []
drop_cols += ['date','d','id']



In [11]:
# snap 합치기
snap = np.zeros(df_train.shape[0])
snap[df_train[(df_train['state_id']=='CA')&(df_train['snap_CA']==1)].index] +=1
snap[df_train[(df_train['state_id']=='TX')&(df_train['snap_TX']==1)].index] +=1
snap[df_train[(df_train['state_id']=='WI')&(df_train['snap_WI']==1)].index] +=1
df_train['snap'] = snap

# test
snap = np.zeros(df_test.shape[0])
snap[df_test[(df_test['state_id']=='CA')&(df_test['snap_CA']==1)].index] +=1
snap[df_test[(df_test['state_id']=='TX')&(df_test['snap_TX']==1)].index] +=1
snap[df_test[(df_test['state_id']=='WI')&(df_test['snap_WI']==1)].index] +=1
df_test['snap'] = snap

drop_cols += ['snap_CA','snap_TX','snap_WI']



In [12]:

cat_cols += [ 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
             'wm_yr_wk', 'weekday', 'wday', 'month', 'year',
            'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'snap']

In [13]:
# Sell price
df_train.head()
df_train = pd.merge(df_train, price)
df_test = pd.merge(df_test,price)
# price

In [14]:
# Start point 찾기 ::  40% 데이터를 날릴  수 있다.  -- price가 책정 된 날이 시작 점???

df_train['startpoint'] = df_train['id'].map(start_dict).astype(int).astype(str)
df_train['startpoints'] = df_train['d'] >='d_'+df_train['startpoint']
print(df_train['startpoints'].value_counts())
print(df_train.shape)
df_train = df_train[df_train['startpoints']]
print(df_train.shape)
df_train.drop(['startpoint','startpoints'],axis =1, inplace= True)

True     29494652
False    16533305
Name: startpoints, dtype: int64
(46027957, 26)
(29494652, 26)


In [15]:
%%time

df_train['train'] = True
df_test['train']  = False
full_df = pd.concat([df_train,df_test])

print("mean")
full_df['rmean_7'] = full_df[['id','volume']].groupby("id")['volume'].transform(lambda x: x.rolling(7 ,min_periods=1).mean())
full_df['rmean_28'] = full_df[['id','volume']].groupby("id")['volume'].transform(lambda x: x.rolling(28 ,min_periods=1).mean())
full_df['rmean_50'] = full_df[['id','volume']].groupby("id")['volume'].transform(lambda x: x.rolling(50 ,min_periods=1).mean())

print("std")
full_df['rstd_7'] = full_df[['id','volume']].groupby("id")['volume'].transform(lambda x: x.rolling(7 ,min_periods=1).std())
full_df['rstd_28'] = full_df[['id','volume']].groupby("id")['volume'].transform(lambda x: x.rolling(28 ,min_periods=1).std())
full_df['rstd_50'] = full_df[['id','volume']].groupby("id")['volume'].transform(lambda x: x.rolling(50 ,min_periods=1).std())

print("max")
full_df['rmax_7'] = full_df[['id','volume']].groupby("id")['volume'].transform(lambda x: x.rolling(7 ,min_periods=1).max())
full_df['rmax_28'] = full_df[['id','volume']].groupby("id")['volume'].transform(lambda x: x.rolling(28 ,min_periods=1).max())
full_df['rmax_50'] = full_df[['id','volume']].groupby("id")['volume'].transform(lambda x: x.rolling(50 ,min_periods=1).max())

print("min")
full_df['rmin_7'] = full_df[['id','volume']].groupby("id")['volume'].transform(lambda x: x.rolling(7 ,min_periods=1).min())
full_df['rmin_28'] = full_df[['id','volume']].groupby("id")['volume'].transform(lambda x: x.rolling(28 ,min_periods=1).min())
full_df['rmin_50'] = full_df[['id','volume']].groupby("id")['volume'].transform(lambda x: x.rolling(50 ,min_periods=1).min())

# print("count")
# full_df['rcnt_7'] = full_df[['id','volume']].groupby("id")['volume'].transform(lambda x: x.rolling(7).count() if x>0).fllna(0)
# full_df['rcnt_28'] = full_df[['id','volume']].groupby("id")['volume'].transform(lambda x: x.rolling(28).count() if x>0).fllna(0)
# full_df['rcnt_50'] = full_df[['id','volume']].groupby("id")['volume'].transform(lambda x: x.rolling(28).count() if x>0).fllna(0)

full_df = full_df.fillna(0)

df_train = full_df[full_df['train'] ==True].drop(['train'], axis =1)
df_test = full_df[full_df['train'] ==False].drop(['train'], axis =1)

del full_df
gc.collect()


mean
std
max
min
Wall time: 7min 49s


0

In [16]:
df_train= df_train.drop(drop_cols,axis =1 )
df_test =df_test.drop(drop_cols,axis =1)

In [17]:
# Encoding
for col in tqdm(cat_cols) :
    le = LabelEncoder()
    le.fit(pd.concat([df_train[col],df_test[col]]))
    df_train[col] = le.transform(df_train[col]).astype(np.int8)
    df_test[col] = le.transform(df_test[col]).astype(np.int8)
    gc.collect()

100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [01:08<00:00,  4.58s/it]


In [18]:
df_train.head()

Unnamed: 0,item_id,dept_id,cat_id,store_id,state_id,day,volume,wm_yr_wk,weekday,wday,...,rmean_50,rstd_7,rstd_28,rstd_50,rmax_7,rmax_28,rmax_50,rmin_7,rmin_28,rmin_50
0,-92,3,1,0,0,d_1,12,0,2,0,...,12.0,0.0,0.0,0.0,12,12,12,12,12,12
1,-92,3,1,0,0,d_2,15,0,3,1,...,13.5,2.12132,2.12132,2.12132,15,15,15,12,12,12
2,-92,3,1,0,0,d_3,0,0,1,2,...,9.0,7.937254,7.937254,7.937254,15,15,15,0,0,0
3,-92,3,1,0,0,d_4,0,0,5,3,...,6.75,7.889867,7.889867,7.889867,15,15,15,0,0,0
4,-92,3,1,0,0,d_5,0,0,6,4,...,5.4,7.46994,7.46994,7.46994,15,15,15,0,0,0


In [19]:
# Train_test_split  -- 가장 마지막 28일을 예측하는 것으로 짜보자
# lst = [0 for x in range(df_train.shape[0])]

x_train = df_train[~df_train['day'].isin(df_train['day'].unique()[-50:])]
x_valid = df_train[df_train['day'].isin(df_train['day'].unique()[-50:])]

y_train, y_valid = x_train['volume'], x_valid['volume']

x_train = x_train.drop(['day','volume'], axis =1)
x_valid = x_valid.drop(['day','volume'], axis =1)

In [20]:
x_valid.shape[0]/x_train.shape[0]

0.026929225688665227

In [21]:
from sklearn.metrics import mean_squared_log_error
seed = 99

In [41]:
# # lightbgm
# params = {
#     'boosting_type': 'gbdt',
#     'objective': 'regression',
#     'metric': {'rmse'},#{'l2'},
#     #'subsample': 0.2,
#     'learning_rate': 0.1,
#     'feature_fraction': 0.9,
#     'bagging_fraction': 0.9,
# #     'num_leaves': 128,
#     'alpha': 0.1,
#     'lambda': 0.1,
#     'n_jobs' :10 
# }

params = {
        "objective" : "poisson",
        "metric" :"rmse",
        "force_row_wise" : True,
        "learning_rate" : 0.075,
#         "sub_feature" : 0.8,
        "sub_row" : 0.75,
        "bagging_freq" : 1,
        "lambda_l2" : 0.1,
#         "nthread" : 4
        "metric": ["rmse"],
    'verbosity': 1,
    'num_iterations' : 1200,
    'num_leaves': 128,
    "min_data_in_leaf": 100,
}

In [None]:
%%time
# Modeling
lgb_train = lgb.Dataset(x_train, y_train,categorical_feature=cat_cols)
lgb_eval = lgb.Dataset(x_valid, y_valid,categorical_feature=cat_cols)
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=2000, #300,
                valid_sets=(lgb_train, lgb_eval),
#                     feval=rmsle,
#                 early_stopping_rounds= 50,#100,
                verbose_eval=50) #100)



[50]	training's rmse: 2.43944	valid_1's rmse: 1.92333


In [35]:
df_day = df_test['day']
df_test = df_test.drop(['day'],axis =1 )

In [36]:
%%time

predict = gbm.predict(df_test)
predict = np.around(predict)

ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in the following fields: id

In [37]:
submission = pd.read_csv('m5-forecasting-accuracy/sample_submission.csv')


In [38]:
df_test['id'] = test_id
df_test['predict']  = predict
df_test['day'] = df_day
df_test.head()

Unnamed: 0,item_id,dept_id,cat_id,store_id,state_id,volume,wm_yr_wk,weekday,wday,month,...,rstd_50,rmax_7,rmax_28,rmax_50,rmin_7,rmin_28,rmin_50,id,predict,day
0,-99,3,1,0,0,0,18,1,2,4,...,0.606092,1,2,2,0,0,0,HOBBIES_1_001_CA_1_validation,7.0,d_1942
1,-99,3,1,0,0,0,18,1,2,4,...,0.0,0,0,0,0,0,0,HOBBIES_1_002_CA_1_validation,8.0,d_1942
2,-99,3,1,0,0,0,18,5,3,4,...,0.606092,1,2,2,0,0,0,HOBBIES_1_003_CA_1_validation,8.0,d_1943
3,-99,3,1,0,0,0,18,5,3,4,...,0.0,0,0,0,0,0,0,HOBBIES_1_004_CA_1_validation,8.0,d_1943
4,-99,3,1,0,0,0,18,6,4,4,...,0.606092,0,2,2,0,0,0,HOBBIES_1_005_CA_1_validation,8.0,d_1944


In [39]:
test2 = pd.pivot_table(df_test,index=['id'],columns = ['day'], values = 'predict').values

test2[test2<0] = 0


  This is separate from the ipykernel package so we can avoid doing imports until


In [40]:
submission = pd.concat([submission.loc[:,'id'],pd.DataFrame(test2, columns = submission.columns[1:])],axis=1)
submission = round(submission.fillna(0))
submission.to_csv('submission_baseline3_rolling .csv',index=False)

In [30]:
submission


Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0.0,9.0,0.0,0.0,0.0,3.0,9.0,7.0,5.0,...,6.0,3.0,9.0,7.0,5.0,3.0,1.0,6.0,0.0,6.0
1,HOBBIES_1_002_CA_1_validation,0.0,9.0,0.0,0.0,0.0,3.0,10.0,7.0,4.0,...,6.0,3.0,10.0,6.0,4.0,2.0,1.0,6.0,2.0,0.0
2,HOBBIES_1_003_CA_1_validation,2.0,0.0,0.0,0.0,0.0,8.0,3.0,2.0,1.0,...,1.0,8.0,3.0,2.0,1.0,4.0,4.0,1.0,0.0,1.0
3,HOBBIES_1_004_CA_1_validation,2.0,0.0,0.0,0.0,0.0,8.0,3.0,2.0,1.0,...,0.0,8.0,3.0,2.0,1.0,4.0,4.0,1.0,10.0,0.0
4,HOBBIES_1_005_CA_1_validation,4.0,0.0,0.0,0.0,0.0,3.0,10.0,7.0,4.0,...,6.0,3.0,9.0,7.0,4.0,2.0,2.0,6.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60975,FOODS_3_823_WI_3_evaluation,0.0,0.0,2.0,0.0,0.0,8.0,3.0,21.0,2.0,...,1.0,8.0,3.0,13.0,2.0,4.0,4.0,1.0,0.0,2.0
60976,FOODS_3_824_WI_3_evaluation,0.0,1.0,0.0,0.0,0.0,9.0,7.0,4.0,1.0,...,2.0,9.0,7.0,4.0,1.0,0.0,6.0,2.0,6.0,0.0
60977,FOODS_3_825_WI_3_evaluation,0.0,1.0,0.0,0.0,0.0,10.0,7.0,4.0,1.0,...,2.0,10.0,7.0,4.0,1.0,0.0,6.0,2.0,0.0,1.0
60978,FOODS_3_826_WI_3_evaluation,0.0,5.0,0.0,0.0,0.0,8.0,3.0,39.0,3.0,...,4.0,8.0,3.0,23.0,2.0,4.0,5.0,2.0,0.0,2.0


In [31]:
df_train.shape[0]-snap.sum()

28884852.0

In [32]:
snap = np.zeros(df_train.shape[0])

In [33]:
snap[[0,1,2]]=1
snap

array([1., 1., 1., ..., 0., 0., 0.])