In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm

In [None]:
# Constructors
def construct_P(series, L):
    """
    Builds the Page matrix of a series: consecutive, non-overlapping
    segments of length L are laid out as the columns of the result.

    Only int(len(series) / L) full segments are used; trailing entries that
    do not fill a complete column are dropped.

    Returns a float np.array of shape (L, number of full segments).
    """
    n_segments = int(len(series) / L)
    page = np.zeros((L, n_segments))
    # Walk over the start offset of every full segment.
    for j, start in enumerate(range(0, n_segments * L, L)):
        page[:, j] = series[start:start + L]
    return page

def construct_P_hat(P, r):
    """
    Returns the rank-r approximation of P obtained by keeping only its r
    leading singular values (truncated SVD).

    Parameters
    ----------
    P : 2-D np.array of shape (m, n). Both wide (m <= n) and tall (m > n)
        matrices are supported.
    r : int, number of leading singular values to keep. r = 0 yields an
        all-zero matrix; r >= min(m, n) reproduces P.

    Returns
    -------
    np.array of shape (m, n).

    Side effect: prints the shape of the result.
    """
    # Thin SVD gives U (m, k), S (k,), Vh (k, n) with k = min(m, n), so the
    # factors stay shape-compatible whatever the aspect ratio of P is.
    U, S, Vh = np.linalg.svd(P, full_matrices=False)
    # Scaling the kept columns of U by S avoids building a padded diagonal matrix.
    P_hat = (U[:, :r] * S[:r]) @ Vh[:r, :]
    print("P_hat.shape =", P_hat.shape)

    return P_hat

def construct_normalized_P_hat(series, L, r):
    """
    Builds the Page matrix of a series that may contain missing values and
    returns its rank-r approximation, rescaled by the inverse of the
    observed fraction of the series.

    Parameters
    ----------
    series : pd.Series, possibly containing missing (NaN) entries.
    L : int, number of rows of the Page matrix.
    r : int, number of singular values to keep.

    Returns
    -------
    np.array with the same shape as the Page matrix of `series`.

    Side effect: construct_P_hat prints the shape of the approximation.
    """
    P = construct_P(series, L)
    ior = inverse_observed_ratio(series)
    # An SVD cannot be computed on NaNs: missing entries are replaced by 0
    # and the multiplication by `ior` below compensates for the zero-filling.
    P_filled = np.where(np.isnan(P), 0.0, P)
    # Reuse the shared truncation helper instead of duplicating its body.
    return construct_P_hat(P_filled, r) * ior

def optimize_r(P, threshold=0.99):
    """
    Picks the smallest rank whose leading singular values capture strictly
    more than `threshold` of the total squared-singular-value energy of P.

    Parameters
    ----------
    P : 2-D np.array.
    threshold : float, fraction of the energy to exceed (default 0.99).

    Returns
    -------
    int, the selected rank, never larger than the number of singular values.

    Side effect: prints the cumulative energy fractions.
    """
    # Only the singular values are needed, so skip computing U and Vh.
    S = np.linalg.svd(P, compute_uv=False)
    energy = S**2
    arr = energy.cumsum() / energy.sum()
    print(arr)
    # side='right' counts the entries <= threshold; one more component is
    # needed to get strictly above it.
    r = np.searchsorted(arr, threshold, side='right')
    # Clamp so that a threshold >= 1 cannot ask for more components than exist.
    return min(int(r) + 1, len(S))

def inverse_observed_ratio(series):
    """
    Returns 1 / p, where p is the fraction of non-missing entries in `series`.

    Parameters
    ----------
    series : pd.Series (or any sequence accepted by pd.isna).

    Raises
    ------
    ValueError if `series` is empty or every entry is missing, because the
    observed fraction is then undefined or zero.
    """
    num_total = len(series)
    if num_total == 0:
        raise ValueError("series is empty; the observed ratio is undefined")
    num_missing = pd.isna(series).sum()
    if num_missing == num_total:
        raise ValueError("every entry of series is missing; the observed ratio is zero")
    return 1/(1 - num_missing/num_total)

def construct_P_hats_for_test(train_series, test_series, r, L=None):
    """
    Returns a list of P_hat matrices, one for every entry in test.

    The i-th matrix is built from a window made of the training series
    without its first i entries, followed by the first i test entries, so
    every window has the length of the training series.

    Parameters
    ----------
    train_series, test_series : pd.Series.
    r : int, number of singular values kept in each P_hat.
    L : int, number of rows of each Page matrix. It has no sensible default
        and must be supplied whenever test_series is non-empty.

    Raises
    ------
    TypeError if L is not given and test_series is non-empty.
    """
    n = len(test_series)
    if n == 0:
        return []
    if L is None:
        # construct_P cannot be called without the number of rows.
        raise TypeError("construct_P_hats_for_test() requires the Page matrix row count L")

    train_values = train_series.to_numpy()
    test_values = test_series.to_numpy()
    P_hat_list = []
    for i in range(n):
        window = np.concatenate([train_values[i:], test_values[:i]])
        P_hat_list.append(construct_P_hat(construct_P(window, L), r))
    return P_hat_list

def construct_stacked_P(df, L):
    """
    Builds one Page matrix (L rows) per column of a pd.DataFrame and
    concatenates them horizontally, in column order, into a single stacked
    Page matrix.
    """
    page_blocks = []
    for name in df.columns:
        page_blocks.append(construct_P(df.loc[:, name], L))
    return np.hstack(page_blocks)

def construct_multi_stacked_P_for_test(df_train, df_test, L):
    """
    Constructs one stacked Page matrix per row of the test set.

    The i-th matrix is built from the training rows without their first i
    rows, followed by the first i test rows, so every window has as many
    rows as df_train.

    Parameters
    ----------
    df_train, df_test : pd.DataFrame objects with the same columns.
    L : int, number of rows of each Page matrix.

    Returns a list of np.array objects, which are stacked Page matrices
    (an empty list when df_test has no rows).
    """
    num_windows = df_test.shape[0]
    # Build and return the list itself; list.append returns None, so its
    # result must not be used as the return value.
    return [
        construct_stacked_P(pd.concat([df_train.iloc[i:, :], df_test.iloc[:i, :]]), L)
        for i in range(num_windows)
    ]

def construct_multi_stacked_P_hat_for_test(df_train, df_test, L, r=None):
    """
    Constructs one stacked Page matrix per row of the test set and,
    when a rank is given, replaces each by its rank-r approximation.

    The i-th matrix is built from the training rows without their first i
    rows, followed by the first i test rows.

    Parameters
    ----------
    df_train, df_test : pd.DataFrame objects with the same columns.
    L : int, number of rows of each Page matrix.
    r : int or None. Number of singular values to keep. With the default
        None no truncation is applied and the stacked Page matrices are
        returned as built.

    Returns a list of np.array objects (an empty list when df_test has no
    rows).
    """
    stacked_P_hat_list = []
    for i in range(df_test.shape[0]):
        df_merged = pd.concat([df_train.iloc[i:, :], df_test.iloc[:i, :]])
        stacked_P = construct_stacked_P(df_merged, L)
        if r is not None:
            stacked_P = construct_P_hat(stacked_P, r)
        stacked_P_hat_list.append(stacked_P)
    # Return the accumulated list, not the (None) result of list.append.
    return stacked_P_hat_list

In [None]:
# Calculators
def optimize_beta(P_hat):
    """
    Optimizes the beta coefficient vector such that
    linear combinations of all rows except the last row
    give the last row with least squares error.

    Parameters
    ----------
    P_hat : 2-D np.array of shape (L, n).

    Returns
    -------
    np.array of shape (L - 1, 1), the coefficients as a column vector.

    Side effect: prints the shape of the coefficient vector.
    """
    X = P_hat[:-1, :].T
    Y = P_hat[-1, :].T
    # A rank-truncated P_hat makes X.T @ X singular, so the normal equations
    # cannot be inverted. lstsq returns the minimum-norm least-squares
    # solution, which equals the ordinary one whenever X has full column rank.
    solution = np.linalg.lstsq(X, Y, rcond=None)[0]
    beta_hat = solution[:, np.newaxis]  # column vector
    print("beta_hat.shape =", beta_hat.shape)
    return beta_hat

def forecast_extra_row_SMOLS(P_hat):
    """
    Fits the last row of P_hat on all of its other rows with statsmodels
    OLS (the rows are passed as regressors as-is; no constant column is
    added here) and returns the fitted values.

    Returns an array with a single row and one entry per column of P_hat.
    """
    regressors = P_hat[:-1, :].T
    target = P_hat[-1, :].T
    fitted = sm.OLS(target, regressors).fit()
    # Turn the flat coefficient vector into a column for the matrix product.
    coefficients = fitted.params[:, None]
    prediction = regressors @ coefficients
    return prediction.T  # return as a row

def MSE_short_term_test(df_train, df_test, L, r):  # TODO
    """
    Unfinished stub. Judging by its name it is meant to score short-term
    forecasts on the test set with a mean squared error -- TODO confirm the
    intended forecasting scheme and metric.

    As written it requests the stacked Page matrices for the test windows
    and then calls construct_stacked_P() without its required (df, L)
    arguments, which raises TypeError when that line is reached. `r` is not
    used yet and no value is returned.
    """
    stacked_P_list = construct_multi_stacked_P_for_test(df_train, df_test, L)
    stacked_P_hat_list = [construct_stacked_P()]

In [38]:
# Load datasets: every CSV is indexed by its "Date" column, which is then
# converted to a DatetimeIndex.
train, test, missing_train = (
    pd.read_csv(csv_path, index_col="Date")
    for csv_path in (
        "data_walmart_train.csv",
        "data_walmart_test.csv",
        "data_walmart_train_missing.csv",
    )
)
for frame in (train, test, missing_train):
    frame.index = pd.to_datetime(frame.index)

train.head()

Unnamed: 0_level_0,s1_d1,s1_d2,s1_d3,s1_d4,s1_d5,s1_d6,s1_d7,s1_d8,s1_d9,s1_d10,...,s10_d87,s10_d90,s10_d91,s10_d92,s10_d93,s10_d94,s10_d95,s10_d96,s10_d97,s10_d98
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-02-05,24924.5,50605.27,13740.12,39954.04,32229.38,5749.03,21084.08,40129.01,16930.99,30721.5,...,26394.89,16873.5,16363.1,54538.9,1337.33,22.15,77349.87,10576.0,6242.07,74.0
2010-02-12,46039.49,44682.74,10887.84,35351.21,29620.81,9135.0,18310.31,37334.83,16562.49,31494.77,...,22280.68,16145.65,14371.53,52893.1,1482.82,1531.13,71980.72,9385.66,6101.56,181.25
2010-02-19,41595.55,47928.89,11523.47,36826.95,26468.27,6060.26,19985.2,38717.6,15880.85,29634.13,...,22896.5,15874.73,13266.1,48087.25,1322.86,3627.75,71524.04,9871.61,5676.73,9.0
2010-02-26,19403.54,44292.87,11135.17,34660.16,24101.89,5244.56,17224.22,35318.2,15175.52,27921.96,...,21536.65,16752.37,13372.08,52140.66,1387.3,-12.74,75904.32,9830.56,5653.97,24.5
2010-03-05,21827.9,48397.98,12275.58,38086.19,23082.14,4221.25,19659.7,38776.09,24064.7,33299.27,...,22496.77,15286.39,13008.35,53637.96,1317.98,4.52,74608.89,8679.74,5769.35,11.0


In [39]:
# Training columns whose name contains "s10", keeping only those without
# any missing value.
df_s10_train = train.filter(like="s10").dropna(axis="columns")
df_s10_train

Unnamed: 0_level_0,s10_d1,s10_d2,s10_d3,s10_d4,s10_d5,s10_d6,s10_d7,s10_d8,s10_d9,s10_d10,...,s10_d83,s10_d85,s10_d87,s10_d90,s10_d91,s10_d92,s10_d93,s10_d95,s10_d96,s10_d97
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-02-05,40212.84,123952.48,30175.46,51885.86,75297.91,14833.87,52212.32,98763.10,58124.72,48027.87,...,231.78,3150.38,26394.89,16873.50,16363.10,54538.90,1337.33,77349.87,10576.00,6242.07
2010-02-12,67699.32,119209.48,28704.83,49359.18,74064.19,12089.46,50907.48,95944.77,61156.92,50595.72,...,266.68,2543.03,22280.68,16145.65,14371.53,52893.10,1482.82,71980.72,9385.66,6101.56
2010-02-19,49748.33,121430.80,26505.03,50350.28,59974.12,15596.01,52955.80,92709.52,55930.64,51199.72,...,330.24,2882.98,22896.50,15874.73,13266.10,48087.25,1322.86,71524.04,9871.61,5676.73
2010-02-26,33601.22,120292.15,28366.97,51206.84,56750.43,14217.19,49807.58,93758.01,53330.55,50028.83,...,252.28,3312.70,21536.65,16752.37,13372.08,52140.66,1387.30,75904.32,9830.56,5653.97
2010-03-05,36572.44,113163.91,26674.08,50641.43,62991.98,7949.83,45078.34,89784.31,59861.14,49892.15,...,290.18,2929.61,22496.77,15286.39,13008.35,53637.96,1317.98,74608.89,8679.74,5769.35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-04-20,26759.76,106444.89,19446.76,46297.96,41458.02,6291.02,44070.86,87003.65,71468.69,44237.82,...,210.92,2614.58,25147.10,14172.38,12264.22,52282.06,961.58,71180.41,11351.30,4850.19
2012-04-27,25524.15,100435.09,18807.59,44384.52,34073.11,11365.32,40376.25,81916.80,67867.14,50959.51,...,254.00,3012.15,24044.12,14345.64,12016.40,48989.59,1148.88,68626.12,11148.12,4764.09
2012-05-04,25874.89,109105.81,19497.26,47588.04,38292.51,7415.92,45237.93,91003.76,68333.48,46264.19,...,221.18,2861.43,26152.07,14119.22,11903.95,51804.37,1222.35,72317.75,12975.52,4765.11
2012-05-11,27611.94,100546.67,20861.41,46364.20,43364.64,10307.30,45674.48,82412.49,65426.88,45649.04,...,237.16,3755.53,28218.92,14261.80,12698.26,51704.65,1415.09,71904.93,11480.82,5201.14


In [40]:
# Test columns whose name contains "s10", keeping only those without any
# missing value.
df_s10_test = test.filter(like="s10").dropna(axis="columns")
df_s10_test.head()

Unnamed: 0_level_0,s10_d1,s10_d2,s10_d3,s10_d4,s10_d5,s10_d6,s10_d7,s10_d8,s10_d9,s10_d10,...,s10_d83,s10_d85,s10_d87,s10_d90,s10_d91,s10_d92,s10_d93,s10_d95,s10_d96,s10_d97
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-05-25,26421.66,108660.46,20049.88,48223.63,39254.77,10884.41,54769.01,88663.29,82797.81,57258.38,...,357.96,3037.79,24816.13,14886.28,12767.63,50254.84,1434.04,80000.77,13532.62,5209.82
2012-06-01,24734.89,103012.31,19301.08,48904.91,37920.13,8761.11,55763.78,86720.67,74275.8,49258.91,...,272.86,2840.55,28673.43,14154.5,12643.66,49268.0,1404.0,74332.99,12420.57,5064.67
2012-06-08,24653.33,109543.37,20367.9,48182.67,51545.14,7251.51,56415.8,87833.99,71592.66,52059.82,...,341.86,2556.47,32190.23,14031.68,13996.16,51041.74,1591.79,78148.12,12402.62,5393.54
2012-06-15,24374.99,108530.26,20305.49,46738.71,50225.6,9389.52,50609.22,87652.48,77280.48,52530.42,...,372.32,2968.54,28846.4,14098.15,12889.54,49327.75,1246.94,78226.25,12366.02,4942.24
2012-06-22,24418.14,102144.47,17258.33,45670.8,46431.46,8276.24,46346.57,80396.28,69266.67,52842.88,...,359.48,3354.55,30013.64,13886.1,12518.84,48216.47,1146.92,74639.34,12358.22,4828.78


In [41]:
# Create stacked Page matrix.
# L is the number of rows of every Page matrix, i.e. the length of each
# column segment cut from a series.
L = 60

print("Available columns for Store 10:", df_s10_train.shape[1])

# One Page matrix per column of df_s10_train, concatenated side by side.
stacked_P = construct_stacked_P(df_s10_train, L)
print("Stacked Page matrix shape:", stacked_P.shape)

# Keep only the leading singular value (rank-1 approximation).
stacked_P_hat = construct_P_hat(stacked_P, r=1)

# Last expression of the cell: shape of the row obtained by fitting the last
# row of stacked_P_hat on its other rows.
forecast_extra_row_SMOLS(stacked_P_hat).shape

Available columns for Store 10: 67
Stacked Page matrix shape: (60, 134)
P_hat.shape = (60, 134)


(1, 134)