In [10]:
import pandas as pd
import numpy as np
import time

In [11]:
def reduce_mem_usage(df,verbose=True):
    cols = df.dtypes.index.tolist()
    types = df.dtypes.values.tolist()
    start_mem = df.memory_usage().sum() / 1024**2 
    for i,t in enumerate(types):
        if 'int' in str(t):
            if df[cols[i]].min() > np.iinfo(np.int8).min and df[cols[i]].max() < np.iinfo(np.int8).max:
                df[cols[i]] = df[cols[i]].astype(np.int8)
            elif df[cols[i]].min() > np.iinfo(np.int16).min and df[cols[i]].max() < np.iinfo(np.int16).max:
                df[cols[i]] = df[cols[i]].astype(np.int16)
            elif df[cols[i]].min() > np.iinfo(np.int32).min and df[cols[i]].max() < np.iinfo(np.int32).max:
                df[cols[i]] = df[cols[i]].astype(np.int32)
            else:
                df[cols[i]] = df[cols[i]].astype(np.int64)
        elif 'float' in str(t):
            if df[cols[i]].min() > np.finfo(np.float16).min and df[cols[i]].max() < np.finfo(np.float16).max:
                df[cols[i]] = df[cols[i]].astype(np.float16)
            elif df[cols[i]].min() > np.finfo(np.float32).min and df[cols[i]].max() < np.finfo(np.float32).max:
                df[cols[i]] = df[cols[i]].astype(np.float32)
            else:
                df[cols[i]] = df[cols[i]].astype(np.float64)
        elif t == np.object:
            if cols[i] == 'date':
                df[cols[i]] = pd.to_datetime(df[cols[i]], format='%Y-%m-%d')
            else:
                df[cols[i]] = df[cols[i]].astype('category')
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df  


In [12]:
calendar_file = "D:\\\m5-forecasting-accuracy\\calendar.csv"
sales_train_validation_file = "D:\\\m5-forecasting-accuracy\\sales_train_validation.csv"
sell_prices_file = "D:\\\m5-forecasting-accuracy\\sell_prices.csv"


calendar_ = pd.read_csv(calendar_file, delimiter=",")
sales_train_validation_ = pd.read_csv(sales_train_validation_file, delimiter=",")
sell_prices_ = pd.read_csv(sell_prices_file, delimiter=",")

In [13]:
for d in range(1914,1970):
    col = 'd_' + str(d)
    sales_train_validation_[col] = 0
    sales_train_validation_[col] = sales_train_validation_[col].astype(np.int16)

In [14]:
calendar_ = reduce_mem_usage(calendar_)

Mem. usage decreased to  0.14 Mb (33.2% reduction)


In [15]:
sell_prices_ = reduce_mem_usage(sell_prices_)

Mem. usage decreased to 45.77 Mb (78.1% reduction)


In [16]:
sales_train_validation_ = reduce_mem_usage(sales_train_validation_)

Mem. usage decreased to 97.05 Mb (78.4% reduction)


In [17]:
print(calendar_.shape)
calendar_.head()

(1969, 14)


Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1


In [18]:
print(sales_train_validation_.shape)
sales_train_validation_.head()

(30490, 1975)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1960,d_1961,d_1962,d_1963,d_1964,d_1965,d_1966,d_1967,d_1968,d_1969
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
print(sell_prices_.shape)
sell_prices_.head()

(6841121, 4)


Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,CA_1,HOBBIES_1_001,11325,9.578125
1,CA_1,HOBBIES_1_001,11326,9.578125
2,CA_1,HOBBIES_1_001,11327,8.257812
3,CA_1,HOBBIES_1_001,11328,8.257812
4,CA_1,HOBBIES_1_001,11329,8.257812


## Prepare Training Data

In [20]:
column_names = list(sales_train_validation_.columns)
train_ = pd.melt(sales_train_validation_, id_vars=column_names[0:6],value_vars=column_names[6:],var_name='d', value_name='sales')

In [21]:
del(sales_train_validation_)

In [22]:
print(train_.shape)
train_.head()

(60034810, 8)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0


In [23]:
train_ = reduce_mem_usage(train_)

Mem. usage decreased to 688.72 Mb (33.3% reduction)


#### Merge with calendar

In [24]:
train = pd.merge(train_, calendar_, on='d')

In [37]:
del(train_)
del(calendar_)

In [38]:
train_data = train.drop(['event_name_1','event_type_1','event_name_2','event_type_2','snap_CA','snap_TX','snap_WI'], axis=1, inplace=True)

In [40]:
train.columns

Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd',
       'sales', 'date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year'],
      dtype='object')

In [17]:
train.columns

Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd',
       'sales', 'date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year'],
      dtype='object')

In [41]:
reduce_mem_usage(train)

Mem. usage decreased to 2005.56 Mb (0.0% reduction)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,Saturday,1,1,2011
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,Saturday,1,1,2011
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,Saturday,1,1,2011
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,Saturday,1,1,2011
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,Saturday,1,1,2011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60034805,FOODS_3_823_WI_3_validation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,d_1969,0,2016-06-19,11621,Sunday,2,6,2016
60034806,FOODS_3_824_WI_3_validation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,d_1969,0,2016-06-19,11621,Sunday,2,6,2016
60034807,FOODS_3_825_WI_3_validation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,d_1969,0,2016-06-19,11621,Sunday,2,6,2016
60034808,FOODS_3_826_WI_3_validation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,d_1969,0,2016-06-19,11621,Sunday,2,6,2016


#### Merge with sell_prices

In [42]:
train_data = train.merge(sell_prices_, on=["store_id","item_id","wm_yr_wk"],how="left")

In [43]:
del(sell_prices_)

In [44]:
train_data.shape

(60034810, 15)

In [45]:
train.head()
train.shape

(60034810, 14)

### Generate simple features

In [6]:
def lag_(df,lag_val):
   
    # sales: lag    
    col = f"lag_{lag_val}_sales"
    df.loc[:,col] = df.groupby(['id'])['sales'].transform(lambda x: x.shift(lag_val))
    
    return df

In [7]:
def rolling_mean_(df,lag_val,days):
   
    # sales: rolling_mean   
    col = f"mov_avg_{days}_{lag_val}_sales"
    df.loc[:,col] = df.loc[:,f"lag_{lag_val}_sales"].transform(lambda x: x.rolling(days).mean())
    
    return df

In [8]:
def simple_features_monthly(df):
    
    # monthly mean, std minimum/max sales
    months = set(df['month'])
    
    for month in months:        
        
        df.loc[df['month'] == month,'montly_sales_mean'] = df.loc[df['month'] == month].groupby('id')['sales'].transform(lambda x: x.mean())
        df.loc[df['month'] == month,'montly_sales_std'] = df.loc[df['month'] == month].groupby('id')['sales'].transform(lambda x: x.std())
        df.loc[df['month'] == month,'monthly_sales_max'] = df.loc[df['month'] == month].groupby('id')['sales'].transform(lambda x: x.max())
        df.loc[df['month'] == month,'monthly_sales_min'] = df.loc[df['month'] == month].groupby('id')['sales'].transform(lambda x: x.min())
        
        print("done")
    
    return df

In [9]:
def simple_features_weekly(df):
    
    # weekly mean, std minimum/max sales, max-min 
    weekdays = set(df['weekday'])
    
    for day in weekdays:
        
        df.loc[df['weekday'] == day,'weekly_sales_mean'] = df.loc[df['weekday'] == day].groupby('id')['sales'].transform(lambda x: x.mean())
        df.loc[df['weekday'] == day,'weekly_sales_std'] = df.loc[df['weekday'] == day].groupby('id')['sales'].transform(lambda x: x.std())
        df.loc[df['weekday'] == day,'weekly_sales_max'] = df.loc[df['weekday'] == day].groupby('id')['sales'].transform(lambda x: x.max())
        df.loc[df['weekday'] == day,'weekly_sales_min'] = df.loc[df['weekday'] == day].groupby('id')['sales'].transform(lambda x: x.min())
                
        print("done")
        
    return df

In [None]:
start = time.time()

train_data = lag_(train_data,7)
train_data = lag_(train_data,28)

train_data = rolling_mean_(train_data,7,7)
train_data = rolling_mean_(train_data,7,28)
train_data = rolling_mean_(train_data,28,7)
train_data = rolling_mean_(train_data,28,28)

end = time.time()
print(end - start)

In [None]:
start = time.time()

train_data = simple_features_monthly(train_data)

end = time.time()
print(end - start)

In [None]:
start = time.time()

train_data = simple_features_weekly(train_data)

end = time.time()
print(end - start)