In [1]:
# Enable IPython's autoreload extension. Mode 2 re-imports all modules
# (except those excluded with %aimport) before each cell executes, so edits
# to imported modules (e.g. the `src` package used below) are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
from datetime import timedelta
from dotenv import find_dotenv
from os.path import dirname
from os.path import exists
from os.path import join
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from src.load import load_train_df
from src.load import load_test_df
from src.transform import get_week_by_dept_df
from src.transform import unpivot_week_by_dept_df
from src.transform import svd_reconstruct
from src.features import make_id_column

# Repository root: the directory holding the file that find_dotenv() locates
project_dir = dirname(find_dotenv())

# Apply the custom matplotlib style sheet stored at the repository root
style_sheet = join(project_dir, 'big-darkgrid.mplstyle')
plt.style.use(style_sheet)

In [3]:
# Wide sales table: one row per weekly Date, one column per Store_Dept pair
week_by_dept = get_week_by_dept_df()
# Preview the last five weeks
week_by_dept.tail(5)

Store_Dept,10_1,10_10,10_11,10_12,10_13,10_14,10_16,10_17,10_18,10_19,...,9_9,9_90,9_91,9_92,9_93,9_94,9_95,9_96,9_97,9_98
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-09-28,31477.5,47821.03,23483.55,8628.64,56960.17,30545.42,17398.02,22918.62,11069.68,2930.77,...,16044.45,3156.76,998.95,18333.69,34.9,84.8,33119.87,3763.21,-9.92,6.0
2012-10-05,36323.95,46511.89,26447.16,9423.35,64940.04,34453.61,16204.14,25257.58,16151.92,2968.35,...,17044.07,2954.65,889.74,20612.82,34.9,158.98,35189.41,4306.43,-9.92,6.0
2012-10-12,40626.47,48684.03,25425.08,9051.76,61065.21,33971.87,12939.16,24494.83,19251.31,3440.85,...,13830.42,3151.12,698.84,20285.74,34.9,3.88,32072.8,3675.99,-9.92,6.0
2012-10-19,44224.36,46947.21,25507.61,8245.89,58976.21,38299.34,15007.93,25287.66,22814.92,2955.1,...,12813.42,3126.61,923.6,19304.95,34.9,23.28,34031.01,3540.05,-9.92,6.0
2012-10-26,58409.83,43637.06,25448.11,9043.49,61010.32,31883.89,11397.55,24781.75,27426.84,3271.02,...,14034.4,3096.77,914.84,18310.28,34.9,233.02,32382.05,4752.25,-9.92,6.0


In [4]:
# Move every column down by 52 rows. The rows are weekly, so each date now
# carries the value recorded 52 weeks (about one year) earlier; the first
# 52 rows become NaN.
week_by_dept2 = week_by_dept.shift(periods=52)
# Preview the last five weeks of the lagged table
week_by_dept2.tail(5)

Store_Dept,10_1,10_10,10_11,10_12,10_13,10_14,10_16,10_17,10_18,10_19,...,9_9,9_90,9_91,9_92,9_93,9_94,9_95,9_96,9_97,9_98
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-09-28,31654.83,46741.08,25135.31,8310.16,59617.8,35202.53,21897.09,23126.69,6143.18,2633.95,...,12578.37,3458.26,869.74,15518.36,34.9,64.74,31523.74,3340.16,-9.92,1.5
2012-10-05,37294.04,53097.86,27620.13,8721.13,65464.57,38471.29,18724.89,24676.7,11242.89,3458.37,...,12801.8,3588.95,959.68,18602.41,34.9,54.78,33160.67,4454.78,-9.92,1.5
2012-10-12,39680.9,48159.92,25699.35,9013.73,61492.4,33037.26,15496.18,23395.02,15147.58,2960.4,...,11708.71,3406.68,817.41,17133.1,34.9,109.08,30064.47,4116.66,-9.92,1.5
2012-10-19,47792.75,46021.91,25911.42,9252.23,60363.04,33683.18,11785.44,24959.53,17692.57,3047.53,...,10956.27,3424.57,880.93,16482.22,34.9,214.14,30402.54,4181.56,-9.92,6.0
2012-10-26,68491.28,43216.33,25940.74,9890.17,58593.28,31908.74,8328.67,23611.19,26935.41,3798.81,...,9286.45,3354.83,515.51,17124.85,34.9,363.54,29126.08,3304.87,-9.92,18.0


In [5]:
# Row offsets (weeks) bracketing one year back; one lagged frame is built per offset
LAG_WEEKS = [51, 52, 53]

# Re-fetch the wide table (repeats the earlier load, which keeps this cell
# runnable without relying on the state left behind by previous cells)
week_by_dept = get_week_by_dept_df()

# shift(lag) moves every column down by `lag` rows, leaving NaN in the first
# `lag` rows. Columns keep their original names, so each frame's lag is
# identified only by its position in this list (same order as LAG_WEEKS).
shifted_dfs = [week_by_dept.shift(lag) for lag in LAG_WEEKS]

In [6]:
# Run the unshifted frame and every lagged frame through the same pipeline:
# unpivot, add the 'Id' column, then index by it. Order is preserved, so
# unpivoteds[0] is the unshifted data and unpivoteds[1:] line up with shifted_dfs.
unpivoteds = []
for wide_df in [week_by_dept] + shifted_dfs:
    with_id = make_id_column(unpivot_week_by_dept_df(wide_df))
    unpivoteds.append(with_id.set_index('Id', drop=True))

In [7]:
# Lag (in weeks) of each frame in unpivoteds[1:]. Must stay in the same order
# as the offsets used to build shifted_dfs.
lag_weeks = [51, 52, 53]

# Start from the unshifted, Id-indexed frame and attach the columns of each
# lagged frame, so every row also carries values from around a year earlier
# to train on.
df_train = unpivoteds[0].copy()
n_rows = len(df_train)
for lag, lagged in zip(lag_weeks, unpivoteds[1:]):
    # Suffix the lagged columns so they do not collide with the current ones;
    # join() aligns on the index ('Id') and keeps every row of df_train.
    # NOTE(review): assumes an Id refers to the same store/dept/week in every
    # frame, and that carrying over *all* lagged columns is wanted -- confirm
    # against make_id_column / unpivot_week_by_dept_df and prune if redundant.
    df_train = df_train.join(lagged.add_suffix('_lag{}'.format(lag)))

# A left join against a frame with unique Ids must not add rows
assert len(df_train) == n_rows, "duplicate Ids in a lagged frame multiplied rows"