In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import gc
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

In [58]:
df_train = pd.read_csv('m5-forecasting-accuracy/sales_train_validation.csv')
calendar = pd.read_csv('m5-forecasting-accuracy/calendar.csv')
price = pd.read_csv('m5-forecasting-accuracy/sell_prices.csv')
df_test = pd.read_csv('m5-forecasting-accuracy/sample_submission.csv')

In [59]:
df_train['key'] = df_train['id'].apply(lambda x : x[:-11])


In [60]:
# test 데이터 변환
df_test = pd.read_csv('m5-forecasting-accuracy/sample_submission.csv')

lst = list(calendar[1941:]['d'])
for i in range(1,29):
    df_test = df_test.rename(columns = {'F%s'%i:lst[i-1]})

# Unpivot
df_test = pd.melt(df_test, id_vars=df_test.columns[:1], value_vars=df_test.columns[1:],
       var_name = 'day', value_name = 'volume')

df_test['key'] = df_test['id'].apply(lambda x : x[:-11])
# df_train에 있는 기본 정보 붙이기
df_test = pd.merge(df_test,df_train.loc[:,['key','item_id','dept_id','cat_id','store_id','state_id']],
         on =['key'], how='left')

df_train.drop(['key'],axis =1, inplace =True)
df_test.drop(['key'], axis = 1, inplace =True)
    

In [62]:
test_id = df_test['id']


### 전략
1. Unpivot만 해서 LGBM 하기 
2. 날짜 인코딩만해서 LGBM 사용하기
3. Data preprocessing 어떻게 할 것인지?
4. 최근 며칠간의 평균 판매액
5. rolling 추가
6. validation set을 6월 데이터 를 예측하는 것으로 해볼까?
  - 2015년 6월 데이터 예측 ? 

In [5]:
# Unpivot
df_train = pd.melt(df_train, id_vars=df_train.columns[:6], value_vars=df_train.columns[6:],
       var_name = 'day', value_name = 'volume')

    

In [6]:
# calendar 추가하기
df_train = pd.merge(df_train, calendar, left_on = 'day', right_on ='d')
df_test = pd.merge(df_test, calendar, left_on = 'day', right_on ='d')


In [7]:
drop_cols = []
cat_cols = []
drop_cols += ['date','d','id']



In [8]:
# snap 합치기
snap = np.zeros(df_train.shape[0])
snap[df_train[(df_train['state_id']=='CA')&(df_train['snap_CA']==1)].index] +=1
snap[df_train[(df_train['state_id']=='TX')&(df_train['snap_TX']==1)].index] +=1
snap[df_train[(df_train['state_id']=='WI')&(df_train['snap_WI']==1)].index] +=1
df_train['snap'] = snap

# test
snap = np.zeros(df_test.shape[0])
snap[df_test[(df_test['state_id']=='CA')&(df_test['snap_CA']==1)].index] +=1
snap[df_test[(df_test['state_id']=='TX')&(df_test['snap_TX']==1)].index] +=1
snap[df_test[(df_test['state_id']=='WI')&(df_test['snap_WI']==1)].index] +=1
df_test['snap'] = snap

drop_cols += ['snap_CA','snap_TX','snap_WI']



In [9]:

cat_cols += [ 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
             'wm_yr_wk', 'weekday', 'wday', 'month', 'year',
            'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'snap']

In [13]:
# Sell price
df_train.head()
df_train = pd.merge(df_train, price)
df_test = pd.merge(df_test,price)
# price

In [10]:
# Rolling
# test['count_last_uid1'] = test.groupby('uid1')['TransactionAmt'].transform(lambda x: x.rolling(5, 1).count())

In [19]:
df_train= df_train.drop(drop_cols,axis =1 )
df_test =df_test.drop(drop_cols,axis =1)

In [20]:
df_train = df_train.fillna("NaN")
df_test = df_test.fillna("NaN")

In [21]:
# Encoding
for col in tqdm(cat_cols) :
    le = LabelEncoder()
    le.fit(pd.concat([df_train[col],df_test[col]]))
    df_train[col] = le.transform(df_train[col]).astype(np.int8)
    df_test[col] = le.transform(df_test[col]).astype(np.int8)
    gc.collect()

100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [01:15<00:00,  5.02s/it]


In [22]:
df_train.head()

Unnamed: 0,item_id,dept_id,cat_id,store_id,state_id,day,volume,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap,sell_price
0,-92,3,1,0,0,d_1,12,0,2,0,0,0,18,1,3,1,0,0.46
1,-92,3,1,0,0,d_2,15,0,3,1,0,0,18,1,3,1,0,0.46
2,-92,3,1,0,0,d_3,0,0,1,2,0,0,18,1,3,1,0,0.46
3,-92,3,1,0,0,d_4,0,0,5,3,1,0,18,1,3,1,1,0.46
4,-92,3,1,0,0,d_5,0,0,6,4,1,0,18,1,3,1,1,0.46


In [23]:
# Train_test_split  -- 가장 마지막 28일을 예측하는 것으로 짜보자
# lst = [0 for x in range(df_train.shape[0])]

x_train = df_train[~df_train['day'].isin(df_train['day'].unique()[-50:])]
x_valid = df_train[df_train['day'].isin(df_train['day'].unique()[-50:])]

y_train, y_valid = x_train['volume'], x_valid['volume']

x_train = x_train.drop(['day','volume'], axis =1)
x_valid = x_valid.drop(['day','volume'], axis =1)

In [24]:
x_valid.shape[0]/x_train.shape[0]

0.03425576579365509

In [25]:
from sklearn.metrics import mean_squared_log_error
seed = 99

In [28]:
# lightbgm
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'rmse'},#{'l2'},
    #'subsample': 0.2,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.9,
#     'num_leaves': 128,
    'alpha': 0.1,
    'lambda': 0.1,
    'n_jobs' :10 
}

In [29]:
%%time
# Modeling
lgb_train = lgb.Dataset(x_train, y_train,categorical_feature=cat_cols)
lgb_eval = lgb.Dataset(x_valid, y_valid,categorical_feature=cat_cols)
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10000, #300,
                valid_sets=(lgb_train, lgb_eval),
#                     feval=rmsle,
                early_stopping_rounds= 50,#100,
                verbose_eval=30) #100)

Training until validation scores don't improve for 50 rounds
[30]	training's rmse: 3.76614	valid_1's rmse: 3.17967
[60]	training's rmse: 3.70013	valid_1's rmse: 3.14074
[90]	training's rmse: 3.66618	valid_1's rmse: 3.12248
[120]	training's rmse: 3.63279	valid_1's rmse: 3.10235
[150]	training's rmse: 3.6089	valid_1's rmse: 3.08713
[180]	training's rmse: 3.58705	valid_1's rmse: 3.06936
[210]	training's rmse: 3.56891	valid_1's rmse: 3.0574
[240]	training's rmse: 3.55782	valid_1's rmse: 3.04688
[270]	training's rmse: 3.54732	valid_1's rmse: 3.04146
[300]	training's rmse: 3.5361	valid_1's rmse: 3.03922
[330]	training's rmse: 3.52768	valid_1's rmse: 3.03594
[360]	training's rmse: 3.51871	valid_1's rmse: 3.02973
[390]	training's rmse: 3.51133	valid_1's rmse: 3.02675
[420]	training's rmse: 3.50614	valid_1's rmse: 3.02426
[450]	training's rmse: 3.49902	valid_1's rmse: 3.02185
[480]	training's rmse: 3.49481	valid_1's rmse: 3.01928
[510]	training's rmse: 3.4891	valid_1's rmse: 3.01248
[540]	train

In [None]:
df_test = df_test.drop(['day'],axis =1 )

In [50]:
%%time

predict = gbm.predict(df_test)
predict = np.around(predict)

Wall time: 17 s


In [64]:
submission = pd.read_csv('m5-forecasting-accuracy/sample_submission.csv')


In [67]:
df_test['id'] = test_id
df_test['predict']  = predict
df_test.head()


Unnamed: 0,id,day,volume,item_id,dept_id,cat_id,store_id,state_id,predict
0,HOBBIES_1_001_CA_1_validation,d_1942,0,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0.0
1,HOBBIES_1_002_CA_1_validation,d_1942,0,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0.0
2,HOBBIES_1_003_CA_1_validation,d_1942,0,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0.0
3,HOBBIES_1_004_CA_1_validation,d_1942,0,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0.0
4,HOBBIES_1_005_CA_1_validation,d_1942,0,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0.0


In [79]:
test2 = pd.pivot_table(df_test,index=['id'],columns = ['day'], values = 'predict').values

test2[test2<0] = 0


In [94]:
submission = pd.concat([submission.loc[:,'id'],pd.DataFrame(test2, columns = submission.columns[1:])],axis=1)
submission.to_csv('submission_baseline1.csv',index=False)

In [95]:
submission


Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,1.0,0.0,2.0,0.0,3.0,0.0,0.0
1,HOBBIES_1_002_CA_1_validation,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,HOBBIES_1_003_CA_1_validation,0.0,0.0,0.0,0.0,0.0,0.0,5.0,1.0,0.0,...,2.0,0.0,7.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,HOBBIES_1_004_CA_1_validation,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,...,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,5.0,2.0
4,HOBBIES_1_005_CA_1_validation,3.0,3.0,0.0,1.0,1.0,0.0,3.0,0.0,0.0,...,4.0,1.0,5.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60975,FOODS_3_823_WI_3_evaluation,1.0,1.0,0.0,0.0,0.0,1.0,0.0,5.0,0.0,...,0.0,3.0,1.0,5.0,0.0,3.0,0.0,0.0,0.0,0.0
60976,FOODS_3_824_WI_3_evaluation,1.0,1.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,5.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0
60977,FOODS_3_825_WI_3_evaluation,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
60978,FOODS_3_826_WI_3_evaluation,1.0,1.0,0.0,0.0,0.0,5.0,0.0,1.0,0.0,...,0.0,5.0,1.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0


In [54]:
df_train.shape[0]-snap.sum()

31435190.0

In [48]:
snap = np.zeros(df_train.shape[0])

In [43]:
snap[[0,1,2]]=1
snap

array([1., 1., 1., ..., 0., 0., 0.])