In [1]:
# We will try to fill in missing transactions through linear regression and imputation as needed.

In [2]:
import numpy as np
import pandas as pd
import feather
from tqdm import tqdm
from joblib import Parallel, delayed
import gc

In [3]:
# Load the raw per-store daily transaction counts.
df_t = pd.read_csv(filepath_or_buffer='../data/transactions.csv')

In [4]:
# Peek at the first five rows of the raw transactions table.
df_t.head(n=5)

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922


In [5]:
# There are 2 dates with missing data in df_t:
# {'2016-01-01', '2016-01-03'}

In [6]:
# Show every row recorded for store 25.
df_t.loc[df_t.store_nbr == 25]

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
22,2013-01-02,25,1038
68,2013-01-03,25,887
114,2013-01-04,25,1054
160,2013-01-05,25,1355
206,2013-01-06,25,716
252,2013-01-07,25,703
298,2013-01-08,25,626
344,2013-01-09,25,755
390,2013-01-10,25,632


In [7]:
# For the test set we also have to predict 16 days (2017-08-16 to 2017-08-31, inclusive) for all stores.
# There are 2 dates with missing data:
# {'2016-01-01', '2016-01-03'}
# For 2016-01-01 only store 25 is needed. For 2016-01-03, all stores except 52 are needed.
# For the test period, all stores have to be predicted.

In [8]:
# Store 25's history strictly before 2016-01-01 (dates are compared as
# ISO-formatted strings, so lexicographic order matches calendar order).
df = df_t.loc[(df_t.date < '2016-01-01') & (df_t.store_nbr == 25)]

In [9]:
# First five rows of the store-25 training slice.
df.head(n=5)

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
22,2013-01-02,25,1038
68,2013-01-03,25,887
114,2013-01-04,25,1054
160,2013-01-05,25,1355


In [10]:
# Last five rows of the store-25 training slice; it should end on 2015-12-31.
df.tail(n=5)

Unnamed: 0,date,store_nbr,transactions
52151,2015-12-27,25,1384
52204,2015-12-28,25,1546
52257,2015-12-29,25,1887
52310,2015-12-30,25,2489
52363,2015-12-31,25,3178


In [10]:
# Persist the store-25 training slice for the 2016-01-01 gap (index omitted).
df.to_csv(path_or_buf='../cache/transactions/tr_25_20160101.csv', index=False)

In [6]:
# One-row placeholder frame for 2016-01-01; transactions is set to 0.
df1 = pd.DataFrame({'date': ['2016-01-01'], 'transactions': 0})

In [7]:
# Display the one-row placeholder frame to confirm its contents.
df1

Unnamed: 0,date,transactions
0,2016-01-01,0


In [4]:
# Persist the 2016-01-01 placeholder frame for store 25 (index omitted).
df1.to_csv(path_or_buf='../cache/transactions/te_25_20160101.csv', index=False)

In [14]:
# Drop the placeholder frame; the loops further down rebuild df1 on each iteration.
del df1

In [12]:
# For each store 1..54 except 52, write two CSVs for the 2016-01-03 gap:
#   tr_<store>_20160103.csv - the store's rows dated strictly before 2016-01-03
#   te_<store>_20160103.csv - a single placeholder row for 2016-01-03 (transactions = 0)
for i in tqdm(range(1, 55)):
    if i != 52:
        fn_tr = '../cache/transactions/tr_' + str(i) + '_20160103.csv'
        fn_te = '../cache/transactions/te_' + str(i) + '_20160103.csv'

        # Dates are ISO-formatted strings, so string comparison follows calendar order.
        df = df_t[(df_t.date < '2016-01-03') & (df_t.store_nbr == i)]
        df.to_csv(fn_tr, index=False)

        df1 = pd.DataFrame({'date': ['2016-01-03'], 'transactions': 0})
        df1.to_csv(fn_te, index=False)
        del df1

100%|██████████| 54/54 [00:00<00:00, 64.69it/s]


In [11]:
# Entries are encoded as 'YYYY-MM-DD_<store_nbr>'; the loop that consumes this
# list splits each entry on '_'. Presumably these are the (date, store) pairs
# whose transaction counts are missing - verify against the data.
l = [
    # 2013-06-19
    '2013-06-19_10', '2013-06-19_35', '2013-06-19_43', '2013-06-19_54',
    # 2014
    '2014-01-02_32', '2014-03-24_25',
    # 2016-01-02
    '2016-01-02_1', '2016-01-02_10', '2016-01-02_11', '2016-01-02_12',
    '2016-01-02_13', '2016-01-02_14', '2016-01-02_15', '2016-01-02_17',
    '2016-01-02_18', '2016-01-02_19', '2016-01-02_3', '2016-01-02_4',
    '2016-01-02_5', '2016-01-02_6', '2016-01-02_7', '2016-01-02_8',
    '2016-01-02_9',
    # 2016-01-04
    '2016-01-04_10', '2016-01-04_11', '2016-01-04_12', '2016-01-04_13',
    '2016-01-04_14', '2016-01-04_15', '2016-01-04_16', '2016-01-04_17',
    '2016-01-04_18', '2016-01-04_19', '2016-01-04_2', '2016-01-04_20',
    '2016-01-04_21', '2016-01-04_22', '2016-01-04_27', '2016-01-04_28',
    '2016-01-04_29', '2016-01-04_3', '2016-01-04_35', '2016-01-04_36',
    '2016-01-04_4', '2016-01-04_40', '2016-01-04_41', '2016-01-04_42',
    '2016-01-04_43', '2016-01-04_44', '2016-01-04_45', '2016-01-04_46',
    '2016-01-04_47', '2016-01-04_48', '2016-01-04_49', '2016-01-04_50',
    '2016-01-04_51', '2016-01-04_53', '2016-01-04_54', '2016-01-04_6',
    '2016-01-04_7', '2016-01-04_8', '2016-01-04_9',
    # 2016-09-27
    '2016-09-27_23', '2016-09-27_7',
]

In [13]:
# For every 'YYYY-MM-DD_<store>' entry in l, write two CSVs:
#   tr_<store>_<YYYYMMDD>.csv - the store's rows dated strictly before that date
#   te_<store>_<YYYYMMDD>.csv - a single placeholder row for that date (transactions = 0)
for el in tqdm(l):
    date, i = el.split('_')
    date1 = ''.join(date.split('-'))  # compact date used in the file names
    fn_tr = '../cache/transactions/tr_' + str(i) + '_' + date1 + '.csv'
    fn_te = '../cache/transactions/te_' + str(i) + '_' + date1 + '.csv'

    # i is still a string here, so cast it before comparing with store_nbr.
    df = df_t[(df_t.date < date) & (df_t.store_nbr == int(i))]
    df.to_csv(fn_tr, index=False)

    df1 = pd.DataFrame({'date': [date], 'transactions': 0})
    df1.to_csv(fn_te, index=False)
    del df1

100%|██████████| 64/64 [00:00<00:00, 64.30it/s]


In [19]:
# Force a garbage-collection pass; the cell displays the value gc.collect() returns.
gc.collect()

330

In [14]:
# The 16 consecutive dates of the test period: 2017-08-16 through 2017-08-31.
test_dates = [
    '2017-08-16', '2017-08-17', '2017-08-18', '2017-08-19',
    '2017-08-20', '2017-08-21', '2017-08-22', '2017-08-23',
    '2017-08-24', '2017-08-25', '2017-08-26', '2017-08-27',
    '2017-08-28', '2017-08-29', '2017-08-30', '2017-08-31',
]

In [15]:
# For each store 1..54, write two CSVs:
#   tr_all_<store>.csv - every row available for the store
#   te_all_<store>.csv - one placeholder row per date in test_dates (transactions = 0)
for i in tqdm(range(1, 55)):
    fn_tr = '../cache/transactions/tr_all_' + str(i) + '.csv'
    fn_te = '../cache/transactions/te_all_' + str(i) + '.csv'

    df = df_t[df_t.store_nbr == i]
    df.to_csv(fn_tr, index=False)

    # The scalar 0 is broadcast across all test dates.
    df1 = pd.DataFrame({'date': test_dates, 'transactions': 0})
    df1.to_csv(fn_te, index=False)
    del df1

100%|██████████| 54/54 [00:00<00:00, 127.03it/s]
