In [180]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import notebook
import datetime
import os

import xgboost as xgb
from sklearn.model_selection import train_test_split
import lightgbm
import warnings
from sklearn.metrics import mean_squared_error

warnings.filterwarnings('ignore')
#한글 깨짐방지
plt.rc('font',family = 'Malgun Gothic')
plt.rcParams['axes.unicode_minus'] = False

In [2]:
path = './data/'
train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')
submission  = pd.read_csv(path + 'submission.csv')
merge_df = pd.concat([train,test])

In [3]:
train.shape, test.shape

((3362796, 8), (473392, 8))

In [4]:
test['date'] = pd.to_datetime(test['date'])
gap_check = test.groupby('store_id')['date'].agg(['min','max'])
gap = (gap_check['max']-gap_check['min']).apply(lambda x : x.days)
print(f'test의 최소 길이: {gap.min()},test의 최대길이 : {gap.max()}')

test의 최소 길이: 93,test의 최대길이 : 607


### Data setting
* 거래 취소의 경우 거래금액, 환불금액 2회 체크되므로 거래취소 횟수의 2배만큼 제외해야함
* 날짜기준으로 묶어줄 필요가 있음
* 주말과 요일은 날짜기준으로 결합

-> 시간은 EDA에서 상세히 다시살펴볼것 (새벽시간대 매출이 높은 곳이라던지)

### 필요없는 row 제거
* 2nd 솔루션 : 거래기록이 160일 이하인 경우/ 2018-07-31 주변 5일 내 거래가 없으면 제외, test의 경우 2019-03-31 이전 7일간 거래가 없으면 폐업으로 추정
* 변경 : 거래기록 100일 이하인경우 삭제, 100일 이전 거래횟수가 30회 이하면 제외

In [146]:
# train 중 전체 거래일이 100일 이하인경우 학습에 사용불가
train['date'] = pd.to_datetime(train['date'])
gap_check = train.groupby('store_id')['date'].agg(['min','max'])
gap = (gap_check['max']-gap_check['min']).apply(lambda x : x.days)
gap.name = 'gap'
row_count = train.groupby('store_id')['date'].count()
gap = pd.concat([gap,row_count],axis=1)

del_id = gap.loc[(gap['gap']<100)|(gap['date']<29)].index
clean_train = train[~(train.store_id.isin(del_id))]

In [147]:
print(f'기본 트레인 길이 : {train.shape[0]}, 기본트레인 상점수 : {train.store_id.nunique()}')
print(f'clean 트레인 길이 : {clean_train.shape[0]}, clean 트레인 상점수 : {clean_train.store_id.nunique()}')
print(f'길이 차이 : {train.shape[0] - clean_train.shape[0]},상점수 차이 : {train.store_id.nunique() - clean_train.store_id.nunique()}')

기본 트레인 길이 : 3362796, 기본트레인 상점수 : 1775
clean 트레인 길이 : 3343266, clean 트레인 상점수 : 1483
길이 차이 : 19530,상점수 차이 : 292


In [148]:
#전체날짜의 휴일 요일
all_dates = merge_df.drop_duplicates('date').sort_values('date')
all_dates = all_dates[['date','days_of_week','holyday']]
all_dates.reset_index(drop=True, inplace=True)
all_dates['date'] = pd.to_datetime(all_dates['date'])

In [149]:
clean_train.loc[clean_train.amount <0,'refund'] = 1
sample_df = clean_train.groupby(['date','store_id']).agg({'amount':['sum','count'],'refund':'sum'}).reset_index()
sample_df.columns = ['date','store_id','amount_sum','amount_count','refund_sum']
sample_df['date'] = pd.to_datetime(sample_df['date'])
sample_df['temp_date'] = sample_df['date']

#거래횟수당 평균 거래금액 
sample_df['trans_amount'] = sample_df['amount_sum']/(sample_df['amount_count']-sample_df['refund_sum'])
sample_df['trans_amount'].fillna(0,inplace= True)

#날짜병합
sample_df = pd.merge(sample_df,all_dates)

sample_df.set_index("date",inplace=True)

In [150]:
#환불금액이 동일하지 않은 경우도 있음 
train.loc[(train.store_id == 176)&(train.date == datetime.datetime(2018,7,13))]

Unnamed: 0,store_id,date,time,card_id,amount,installments,days_of_week,holyday,refund
486173,176,2018-07-13,13:34:02,edba13c668,-2700,18.0,4,0,1.0
486174,176,2018-07-13,16:30:12,77fff27956,2750,,4,0,


In [151]:
def reform_data(df, isTrain = True):
    '''날짜별로 나열되어있는 데이터를 상정별로 병합
    
    데이터를 store_id를 기준으로 병합. 최종 거래일로 부터 100일인 날짜를 마지막으로 설정 후
    최초 거래일자부터의 데이터를 결합. 마지막 100일의 거래금액은 y로 변환
    
    all_dates는 train,test의 날짜를 결합해놓은 데이터 
    
    Parameters
    ---------
    df(DataFrame) : train, test 데이터 입력
    isTrain(bool) : train일때 True, test일때, False
    
    '''
    df.loc[df.amount <0,'refund'] = 1
    sample_df = df.groupby(['date','store_id']).agg({'amount':['sum','count'],'refund':'sum'}).reset_index()
    sample_df.columns = ['date','store_id','amount_sum','amount_count','refund_sum']
    sample_df['date'] = pd.to_datetime(sample_df['date'])
    sample_df['temp_date'] = sample_df['date']

    #거래횟수당 평균 거래금액 
    sample_df['trans_amount'] = sample_df['amount_sum']/(sample_df['amount_count']-sample_df['refund_sum'])
    sample_df['trans_amount'].fillna(0,inplace= True)

    #날짜병합
    sample_df = pd.merge(sample_df,all_dates)

    sample_df.set_index("date",inplace=True)

    store_id_list = sample_df.store_id.unique()
    fin_ls =[]
    add_row = pd.DataFrame({'holyday_amount_sum':0,'holyday_amount_mean':0},index = ['holy']) #holyday 없는경우 
    
    for store_id in notebook.tqdm(store_id_list):

        store = sample_df[sample_df.store_id == store_id]    
        store = store.asfreq('D',fill_value = 0) #일자별로 열추가 
        store['temp_date'] = store.index
        store['store_id'] = store_id

        # 7일, 15일, 30일간의 이동평균 계산 
        store['MA7'] = store['amount_sum'].rolling('7D').mean()
        store['MA15'] = store['amount_sum'].rolling('15D').mean()
        store['MA30'] = store['amount_sum'].rolling('30D').mean()

        # train set인 경우 마지막 100일은 y값으로 저장되어야함
        # store_x는 100일전까지 기간 데이터 
        if isTrain:
            store_y = store.last("100D")
            y = store_y.amount_sum.sum()
            store_x = store[store.temp_date < store_y.iloc[0].temp_date]
        #test는 전체
        else : 
            y = 0
            store_x = store[:]

        feat = dict()

        #amount >0 이상인 경우만 
        over_zero = store_x.loc[store_x.amount_sum>0]
        
        # base (거래가 있는 경우만 )
        feat['store_id'] = store_id
        feat['y'] = y
        feat['amount_mean'] = over_zero.amount_sum.mean()  # 거래액의 평균
        feat['amount_mid'] = over_zero.amount_sum.median() # 거래액의 중간값
        feat['count_sum'] = over_zero.amount_count.sum() # 전체 거래 횟수
        feat['count_mean'] = over_zero.amount_count.mean() # 평균 거래 횟수
        feat['refund_count'] = over_zero.refund_sum.sum() # 전체 최소 횟수
        feat['refund_rate'] = over_zero['refund_sum'].sum()/over_zero.shape[0] # 취소율
        feat['daily_trans_amount'] = over_zero.trans_amount.sum() / over_zero.shape[0] #일자별 평균 금액의 평균

        feat['mean_gap'] = (over_zero.index[1:] - over_zero.index[:-1]).days.values.mean() # 거래일자간 평균 기간
        
        #weekly (거래가 있는 경우만 고려)
        weekly = over_zero[['days_of_week','amount_sum']].groupby('days_of_week').agg(['sum','mean'])
        weekly.columns = ['weekly_amount_sum','weekly_amount_mean']

        weekly_sum = weekly['weekly_amount_sum']
        weekly_sum.index = [f"{index}_sum" for index in weekly['weekly_amount_sum'].index.values]
        feat.update(weekly_sum.to_dict())

        weekly_mean = weekly['weekly_amount_mean']
        weekly_mean.index = [f"{index}_mean" for index in weekly['weekly_amount_mean'].index.values]
        feat.update(weekly_mean.to_dict())

        #holyday (거래가 있는 경우만 고려)
        holyday = over_zero[['holyday','amount_sum']].groupby('holyday').agg(['sum','mean'])
        holyday.columns = ['holyday_amount_sum','holyday_amount_mean']

        try : 
            holyday.index = ['unholy','holy']
        except :
            holyday = pd.concat([holyday,add_row])
            
        holyday_sum = holyday['holyday_amount_sum']
        holyday_sum.index = [f"{index}_sum" for index in holyday['holyday_amount_sum'].index.values]
        feat.update(holyday_sum.to_dict())

        holyday_mean = holyday['holyday_amount_mean']
        holyday_mean.index = [f"{index}_mean" for index in holyday['holyday_amount_mean'].index.values]
        feat.update(holyday_mean.to_dict())
    
        #이동평균값 
        feat['7D_mean'] = store_x.last('7D').amount_sum.mean()
        feat['15D_mean'] = store_x.last('15D').amount_sum.mean()
        feat['30D_mean'] = store_x.last('30D').amount_sum.mean()

        feat['7D_median'] = store_x.last('7D').amount_sum.median()
        feat['15D_median'] = store_x.last('15D').amount_sum.median()
        feat['30D_median'] = store_x.last('30D').amount_sum.median()

        feat['7D_sum'] = store_x.last('7D').amount_sum.sum()
        feat['15D_sum'] = store_x.last('15D').amount_sum.sum()
        feat['30D_sum'] = store_x.last('30D').amount_sum.sum()

        feat['7D_MA7_mean'] = store_x.last('7D').MA7.mean()
        feat['15D_MA15_mean'] = store_x.last('15D').MA15.mean()
        feat['30D_MA30_mean'] = store_x.last('30D').MA30.mean()

        feat['7D_MA7_median'] = store_x.last('7D').MA7.median()
        feat['15D_MA15_median'] = store_x.last('15D').MA15.median()
        feat['30D_MA30_median'] = store_x.last('30D').MA30.median()

        feat['7D_MA7_sum'] = store_x.last('7D').MA7.sum()
        feat['15D_MA15_sum'] = store_x.last('15D').MA15.sum()
        feat['30D_MA30_sum'] = store_x.last('30D').MA30.sum()

        fin_ls.append(feat)
        
    fin_df = pd.DataFrame(fin_ls)
    return fin_df

In [152]:
reform_train = reform_data(clean_train)
reform_test = reform_data(test)

HBox(children=(FloatProgress(value=0.0, max=1483.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))




In [158]:
reform_train.describe()

Unnamed: 0,store_id,y,amount_mean,amount_mid,count_sum,count_mean,refund_count,refund_rate,daily_trans_amount,mean_gap,0_sum,1_sum,2_sum,3_sum,4_sum,5_sum,6_sum,0_mean,1_mean,2_mean,3_mean,4_mean,5_mean,6_mean,unholy_sum,holy_sum,unholy_mean,holy_mean,7D_mean,15D_mean,30D_mean,7D_median,15D_median,30D_median,7D_sum,15D_sum,30D_sum,7D_MA7_mean,15D_MA15_mean,30D_MA30_mean,7D_MA7_median,15D_MA15_median,30D_MA30_median,7D_MA7_sum,15D_MA15_sum,30D_MA30_sum
count,1483.0,1483.0,1483.0,1483.0,1483.0,1483.0,1483.0,1483.0,1483.0,1480.0,1474.0,1449.0,1455.0,1469.0,1468.0,1429.0,1262.0,1474.0,1449.0,1455.0,1469.0,1468.0,1429.0,1262.0,1247.0,1483.0,1247.0,1483.0,1483.0,1483.0,1483.0,1483.0,1483.0,1483.0,1483.0,1483.0,1483.0,1483.0,1483.0,1483.0,1483.0,1483.0,1483.0,1483.0,1483.0,1483.0
mean,898.420769,59260.68,1134.388388,866.092717,1817.596089,5.286492,16.438975,0.061421,493.116007,3.243291,44541.42,38390.39,39276.79,39291.29,42769.22,44794.72,39334.33122,1078.454111,1150.676063,1101.732732,1107.150384,1138.389294,1143.776285,1051.497901,292919.1,10108.190155,1039.025553,827.617304,586.597314,601.165141,620.619663,488.466622,466.31996,468.372556,4096.985165,8951.1706,18315.57586,617.068513,623.823236,625.788771,622.455367,623.630191,625.072541,4308.576719,9285.790351,18457.22121
std,521.091486,112025.3,2372.485488,1533.465496,3653.064037,8.663589,83.176581,0.590469,1206.542959,4.724456,104677.6,73099.21,74922.32,76983.08,79496.75,83683.72,75554.68607,2408.683242,3120.058002,2343.139966,2909.321604,2071.777511,1994.873379,1968.168017,509149.2,21413.450435,1615.622402,1371.973519,1931.467032,1376.536207,1233.73483,1257.045123,1145.153428,1165.428085,13517.614216,20636.964073,36997.735514,1260.117903,1189.810429,1145.365992,1216.688283,1188.255665,1153.859902,8814.540966,17828.330544,34320.635898
min,0.0,75.0,42.333333,25.0,1.0,1.0,0.0,0.0,14.107333,1.0,83.0,81.0,24.0,20.0,27.0,50.0,25.0,42.333333,30.0,24.0,20.0,27.0,37.5,23.333333,1484.0,0.0,55.196491,0.0,-57142.857143,-26666.666667,-13333.333333,0.0,0.0,0.0,-400000.0,-400000.0,-400000.0,-12244.897959,-2666.666667,-25.757778,-321.428571,0.0,-55.4,-85714.285714,-40000.0,-772.733333
25%,442.5,15233.5,349.513097,250.0,136.0,1.516368,1.0,0.008562,104.268448,1.095377,6878.0,5500.0,5900.0,5416.0,6070.25,6200.0,3886.25,304.596591,312.44186,308.571429,315.0,339.985806,353.287671,319.422619,58820.0,525.0,347.759521,159.068182,107.142857,126.333333,145.166667,0.0,0.0,0.0,750.0,1835.0,4249.0,118.346939,144.235556,148.733333,114.285714,137.866667,148.136667,823.642857,2115.5,4357.833333
50%,900.0,33616.0,632.363636,467.0,531.0,2.624113,4.0,0.02454,188.110302,1.357762,17409.5,18033.0,17265.0,17550.0,19660.0,18900.0,14696.0,566.703904,583.58427,601.111111,593.181818,637.390297,640.2,583.746503,146412.0,3370.0,615.974576,450.0,307.142857,342.0,345.633333,180.0,195.0,190.0,2150.0,5091.0,10120.0,328.77551,344.617778,349.866667,317.714286,340.0,346.083333,2298.714286,5144.333333,10119.032146
75%,1350.5,68042.0,1250.986583,963.75,1894.5,5.487522,14.0,0.049478,407.96866,2.92723,47465.75,44085.0,44278.5,45585.0,48079.75,49970.0,43179.25,1142.507305,1208.696629,1186.372549,1174.588235,1260.176136,1295.0,1130.324747,335714.5,10778.5,1172.62132,951.1,734.5,716.2,732.183333,554.0,545.0,536.5,5110.5,10673.0,21183.5,742.693878,742.86,715.405556,728.428571,739.866667,711.458333,5197.071429,10906.0,20667.959105
max,1799.0,2755612.0,64667.538462,24630.0,56848.0,122.018519,2411.0,22.324074,26449.737342,50.2,2522034.0,1406489.0,1352439.0,1411112.0,1408678.0,1369847.0,993133.0,64667.538462,83416.75,59385.777778,92721.7,47095.5,47500.0,45000.0,8968854.0,342140.0,23235.923077,15981.75,29583.142857,28849.2,29313.6,33774.0,28725.0,30886.5,207082.0,432738.0,879408.0,29783.020408,29655.035556,28790.528889,29583.142857,29701.333333,28889.316667,208481.142857,444825.533333,863715.866667


In [160]:
money_col = ['amount_mean', 'amount_mid', '0_sum', '1_sum', '2_sum', '3_sum', '4_sum', '5_sum', '6_sum', '0_mean',
             '1_mean', '2_mean', '3_mean', '4_mean', '5_mean', '6_mean',
             'unholy_sum', 'holy_sum', 'unholy_mean', 'holy_mean', '7D_mean',
             '15D_mean', '30D_mean', '7D_median', '15D_median', '30D_median',
             '7D_sum', '15D_sum', '30D_sum', '7D_MA7_mean', '15D_MA15_mean',
             '30D_MA30_mean', '7D_MA7_median', '15D_MA15_median', '30D_MA30_median',
             '7D_MA7_sum', '15D_MA15_sum', '30D_MA30_sum']

In [161]:
reform_train.shape,reform_test.shape

((1483, 46), (200, 46))

### modeling
* 2nd 솔루션은 xgb, train_test_split 0.1만

In [163]:
col = [i for i in reform_train.columns if i not in ['store_id','y']]
y = reform_train['y'].values

train_X, test_X, train_y, test_y = train_test_split(reform_train[col],y, test_size=0.3, random_state=4024)

#### lgbm

In [176]:
lgbm = lightgbm.LGBMRegressor(learning_rate=0.001, n_estimators=30000)

lgbm.fit(train_X,train_y,eval_set=[(train_X,train_y),(test_X,test_y)], eval_names=['train','test'],eval_metric='rmse',verbose=100, early_stopping_rounds= 500)

Training until validation scores don't improve for 500 rounds
[100]	train's rmse: 120305	train's l2: 1.44732e+10	test's rmse: 58516.8	test's l2: 3.42422e+09
[200]	train's rmse: 114647	train's l2: 1.3144e+10	test's rmse: 52518.5	test's l2: 2.75819e+09
[300]	train's rmse: 109772	train's l2: 1.20499e+10	test's rmse: 47210.8	test's l2: 2.22886e+09
[400]	train's rmse: 105568	train's l2: 1.11446e+10	test's rmse: 42502.8	test's l2: 1.80649e+09
[500]	train's rmse: 101942	train's l2: 1.03923e+10	test's rmse: 38614.3	test's l2: 1.49107e+09
[600]	train's rmse: 98829.1	train's l2: 9.7672e+09	test's rmse: 35259.2	test's l2: 1.24321e+09
[700]	train's rmse: 96149.9	train's l2: 9.24481e+09	test's rmse: 32522.9	test's l2: 1.05774e+09
[800]	train's rmse: 93847.8	train's l2: 8.80741e+09	test's rmse: 30388.3	test's l2: 9.23448e+08
[900]	train's rmse: 91874.2	train's l2: 8.44086e+09	test's rmse: 28760.3	test's l2: 8.27155e+08
[1000]	train's rmse: 90152	train's l2: 8.12739e+09	test's rmse: 27539.8	test's l2

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.001, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=30000, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [177]:
y_pred = lgbm.predict(test_X)
score = np.sqrt(mean_squared_error(test_y, y_pred))

In [178]:
y_pred = lgbm.predict(reform_test[col])

In [181]:
submission['total_sales'] = y_pred

today = datetime.datetime.today().date()
os.makedirs('./submission/',exist_ok=True)
submission.to_csv(f'./submission/submission_{today}_lgbm_{score}_rmse.csv',index=False)

#### xgb

In [182]:
def XGB_regressor(train_X, train_y, test_X, test_y, metric, feature_name = None, seed_val = 2018, num_rounds = 3000):
    params = {'objective' : 'reg:squarederror',
              'learning_rate' : 0.05,
              'max_depth' : 10,
              'eval_metric': metric,#??? rmse가 아니고??
              'min_child_weight': 1,
              'subsample': 0.7,
              'colsample_bytree' : 0.7,
              'seed': seed_val}
    plst = list(params.items())
    
    xgtrain = xgb.DMatrix(train_X, label = train_y)
    
    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label = test_y)
        watchlist = [ (xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds= 300,verbose_eval= 100)
    else :
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)
        
    return model

In [185]:
model = XGB_regressor(train_X, train_y, test_X, test_y, 'rmse')

[0]	train-rmse:135567.90625	test-rmse:81256.32812
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 300 rounds.
[100]	train-rmse:12467.85840	test-rmse:23141.36719
[200]	train-rmse:2663.48389	test-rmse:23221.37109
[300]	train-rmse:753.29089	test-rmse:23256.43945
[400]	train-rmse:238.16800	test-rmse:23262.75000
Stopping. Best iteration:
[125]	train-rmse:8033.55176	test-rmse:23099.49805



In [188]:
y_pred = model.predict(xgb.DMatrix(test_X))
score = np.sqrt(mean_squared_error(test_y, y_pred))

In [189]:
#몇가지 조정을 더 거친후 제출할 답변임
y_pred = model.predict(xgb.DMatrix(reform_test[col]), ntree_limit = model.best_ntree_limit)

In [190]:
submission['total_sales'] = y_pred

today = datetime.datetime.today().date()
os.makedirs('./submission/',exist_ok=True)
submission.to_csv(f'./submission/submission_{today}_xgb_{score}_rmse.csv',index=False)

#### lgbm (log scale)

In [194]:
reform_train[money_col] = reform_train[money_col].apply(lambda x : np.log(1+x))

In [195]:
train_X, test_X, train_y, test_y = train_test_split(reform_train[col],y, test_size=0.3, random_state=4024)

In [201]:
lgbm = lightgbm.LGBMRegressor(learning_rate=0.001, n_estimators=30000)

lgbm.fit(train_X,train_y,eval_set=[(train_X,train_y),(test_X,test_y)], eval_names=['train','test'],eval_metric='rmse',verbose=100, early_stopping_rounds= 500)

Training until validation scores don't improve for 500 rounds
[100]	train's rmse: 120300	train's l2: 1.44721e+10	test's rmse: 58516.7	test's l2: 3.4242e+09
[200]	train's rmse: 114640	train's l2: 1.31422e+10	test's rmse: 52540.7	test's l2: 2.76053e+09
[300]	train's rmse: 109764	train's l2: 1.20482e+10	test's rmse: 47248.8	test's l2: 2.23245e+09
[400]	train's rmse: 105563	train's l2: 1.11435e+10	test's rmse: 42514.4	test's l2: 1.80748e+09
[500]	train's rmse: 101933	train's l2: 1.03903e+10	test's rmse: 38552.9	test's l2: 1.48633e+09
[600]	train's rmse: 98811	train's l2: 9.76362e+09	test's rmse: 35221.7	test's l2: 1.24057e+09
[700]	train's rmse: 96126.5	train's l2: 9.24031e+09	test's rmse: 32485.2	test's l2: 1.05529e+09
[800]	train's rmse: 93818.5	train's l2: 8.80191e+09	test's rmse: 30282.2	test's l2: 9.17011e+08
[900]	train's rmse: 91831	train's l2: 8.43294e+09	test's rmse: 28592.5	test's l2: 8.17531e+08
[1000]	train's rmse: 90108	train's l2: 8.11945e+09	test's rmse: 27345.9	test's l2: 7

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.001, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=30000, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [202]:
y_pred = lgbm.predict(test_X)
score = np.sqrt(mean_squared_error(test_y, y_pred))

In [203]:
y_pred = lgbm.predict(reform_test[col])

In [204]:
submission['total_sales'] = y_pred

today = datetime.datetime.today().date()
os.makedirs('./submission/',exist_ok=True)
submission.to_csv(f'./submission/submission_{today}_lgbm_scale_{score}_rmse.csv',index=False)

### 작업본

#### 주별, 주말별 계산 어쩌냐

In [55]:
#weekly
weekly_sum = weekly['weekly_amount_sum']
weekly_sum.index = [f"{index}_sum" for index in weekly['weekly_amount_sum'].index.values]
feat.update(weekly_sum.to_dict())

weekly_count = weekly['weekly_amount_count']
weekly_count.index = [f"{index}_count" for index in weekly['weekly_amount_count'].index.values]
feat.update(weekly_count.to_dict())

#holyday
holyday.index = ['unholy','holy']
holyday_sum = holyday['holyday_amount_sum']
holyday_sum.index = [f"{index}_sum" for index in holyday['holyday_amount_sum'].index.values]
feat.update(holyday_sum.to_dict())

holyday_count = holyday['holyday_amount_count']
holyday_count.index = [f"{index}_count" for index in holyday['holyday_amount_count'].index.values]
feat.update(holyday_count.to_dict())

In [228]:
store_x[store_x.amount_sum>0].shape[0]

1

In [54]:
weekly = store[['days_of_week','amount_sum']].groupby('days_of_week').agg(['sum','count'])
weekly.columns = ['weekly_amount_sum','weekly_amount_count']
weekly_dict = weekly.to_dict()

holyday = store[['holyday','amount_sum']].groupby('holyday').agg(['sum','count'])
holyday.columns = ['holyday_amount_sum','holyday_amount_count']
holyday_dict = holyday.to_dict()

In [230]:
store_x

Unnamed: 0_level_0,store_id,amount_sum,amount_count,refund_sum,temp_date,trans_amount,days_of_week,holyday,MA7,MA15,MA30
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2018-04-21,1163,545,1,0.0,2018-04-21,545.0,5,0,545.0,545.0,545.0
2018-04-22,1163,0,0,0.0,2018-04-22,0.0,0,0,272.5,272.5,272.5


In [188]:
store

Unnamed: 0,store_id,amount_sum,amount_count,refund_sum,temp_date,days_of_week,holyday,trans_amount,MA7,MA15,MA30


In [189]:
store = sample_df[sample_df.store_id == 8]

In [164]:
store = sample_df[sample_df.store_id == store_id]

In [174]:
store.trans_amount.sum()

65714.6097208347

In [173]:
store[store.trans_amount>0].shape[0]

531

In [151]:
store = store.asfreq('D',fill_value = 0)

In [152]:
store['temp_date'] = store.index
store['store_id'] = store_id

In [153]:
store

Unnamed: 0_level_0,store_id,amount_sum,amount_count,refund_sum,temp_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-08-10,32,140,1,0.0,2016-08-10
2016-08-11,32,0,0,0.0,2016-08-11
2016-08-12,32,0,0,0.0,2016-08-12
2016-08-13,32,60,1,0.0,2016-08-13
2016-08-14,32,0,0,0.0,2016-08-14
...,...,...,...,...,...
2018-07-26,32,884,4,0.0,2018-07-26
2018-07-27,32,549,5,0.0,2018-07-27
2018-07-28,32,709,4,0.0,2018-07-28
2018-07-29,32,364,3,0.0,2018-07-29


In [83]:
store_y = store.last("100D")

In [85]:
store_y.amount_sum.sum()

510