In [268]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm

In [269]:
def _read_date_indexed_csv(csv_path):
    """Read ``csv_path`` using its "Date" column as index, converted to datetimes."""
    frame = pd.read_csv(csv_path, index_col="Date")
    frame.index = pd.to_datetime(frame.index)
    return frame


train = _read_date_indexed_csv("data_walmart_train.csv")
test = _read_date_indexed_csv("data_walmart_test.csv")
missing_train = _read_date_indexed_csv("data_walmart_train_missing.csv")

train.head()

Unnamed: 0_level_0,s1_d1,s1_d2,s1_d3,s1_d4,s1_d5,s1_d6,s1_d7,s1_d8,s1_d9,s1_d10,...,s10_d87,s10_d90,s10_d91,s10_d92,s10_d93,s10_d94,s10_d95,s10_d96,s10_d97,s10_d98
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-02-05,24924.5,50605.27,13740.12,39954.04,32229.38,5749.03,21084.08,40129.01,16930.99,30721.5,...,26394.89,16873.5,16363.1,54538.9,1337.33,22.15,77349.87,10576.0,6242.07,74.0
2010-02-12,46039.49,44682.74,10887.84,35351.21,29620.81,9135.0,18310.31,37334.83,16562.49,31494.77,...,22280.68,16145.65,14371.53,52893.1,1482.82,1531.13,71980.72,9385.66,6101.56,181.25
2010-02-19,41595.55,47928.89,11523.47,36826.95,26468.27,6060.26,19985.2,38717.6,15880.85,29634.13,...,22896.5,15874.73,13266.1,48087.25,1322.86,3627.75,71524.04,9871.61,5676.73,9.0
2010-02-26,19403.54,44292.87,11135.17,34660.16,24101.89,5244.56,17224.22,35318.2,15175.52,27921.96,...,21536.65,16752.37,13372.08,52140.66,1387.3,-12.74,75904.32,9830.56,5653.97,24.5
2010-03-05,21827.9,48397.98,12275.58,38086.19,23082.14,4221.25,19659.7,38776.09,24064.7,33299.27,...,22496.77,15286.39,13008.35,53637.96,1317.98,4.52,74608.89,8679.74,5769.35,11.0


In [270]:
def P(series, L):
    """Build the page matrix of ``series``.

    The series is cut into consecutive, non-overlapping segments of length
    ``L``; segment ``j`` becomes column ``j`` of the result. Trailing values
    that do not fill a whole segment are dropped.

    Parameters
    ----------
    series : sequence
        One-dimensional sequence of values (list, ndarray, pandas Series).
    L : int
        Number of rows, i.e. the length of each segment.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(L, len(series) // L)``.
    """
    n_cols = len(series) // L
    page = np.zeros((L, n_cols))
    # reshape(n_cols, L) puts one segment per row; writing through the
    # transposed view lays those segments out as the columns of `page`.
    page.T[...] = np.asarray(series)[: n_cols * L].reshape(n_cols, L)
    return page

def construct_P_hat(P, r):
    """Return the rank-``r`` truncated-SVD approximation of ``P``.

    Only the ``r`` largest singular values (and their singular vectors) are
    kept; the rest are discarded.

    Parameters
    ----------
    P : array_like
        Two-dimensional matrix of shape ``(m, n)``. Both wide (``m <= n``)
        and tall (``m > n``) matrices are supported.
    r : int
        Number of singular values to keep. Values larger than
        ``min(m, n)`` simply keep every singular value; ``r = 0`` yields a
        zero matrix.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``P``.

    Notes
    -----
    Prints the shape of the result as a side effect.
    """
    # Reduced SVD: U is (m, k), S is (k,), Vh is (k, n) with k = min(m, n).
    # The previous full SVD with a hand-padded sigma matrix only produced
    # conformable shapes when m <= n and raised for tall matrices.
    U, S, Vh = np.linalg.svd(P, full_matrices=False)

    # Broadcasting scales column i of U by singular value i, which is the
    # same as U_r @ diag(S_r) without materialising the diagonal matrix.
    P_hat = (U[:, :r] * S[:r]) @ Vh[:r, :]
    print("P_hat.shape =", P_hat.shape)

    return P_hat

def optimize_r(P, threshold=0.99):
    """Pick the smallest rank whose singular values capture enough energy.

    The "energy" of a rank is the cumulative sum of squared singular values
    divided by the total sum of squared singular values. The returned rank is
    the smallest number of leading singular values whose cumulative energy
    share is strictly greater than ``threshold``.

    Parameters
    ----------
    P : array_like
        Two-dimensional matrix.
    threshold : float, optional
        Energy share that must be exceeded. Defaults to ``0.99``.

    Returns
    -------
    int
        The selected rank, never larger than the number of singular values.

    Notes
    -----
    Prints the cumulative energy shares as a side effect.
    """
    # Only the singular values are needed, so skip computing U and Vh.
    S = np.linalg.svd(P, compute_uv=False)
    energy = S**2
    arr = energy.cumsum() / energy.sum()
    print(arr)
    # side='right' gives the index of the first share strictly above the
    # threshold; +1 converts that 0-based index into a count of components.
    r = np.searchsorted(arr, threshold, side='right')
    # If no share exceeds the threshold (e.g. threshold >= 1) the index is
    # len(arr); clamp so the rank never exceeds the available components.
    return min(r + 1, arr.size)

In [271]:
# Sanity check: the values 1..8 with L=4 should fill two columns of four rows.
arr = np.arange(1, 9)
P(arr, L=4)

array([[1., 5.],
       [2., 6.],
       [3., 7.],
       [4., 8.]])

In [272]:
# Full data
# L is the number of rows (segment length) used for the page matrices.
L = 10

# Fully observed series for the two store/department pairs under study;
# they serve as ground truth when scoring the imputation.
train_s10_d83 = train["s10_d83"]
train_s1_d34 = train["s1_d34"]

In [273]:
# Missing data
L = 10

# Series with missing observations (NaN) for the two store/department pairs.
raw_missing_s10_d83 = missing_train["s10_d83"]
raw_missing_s1_d34 = missing_train["s1_d34"]

# Count the missing observations in each series.
num_missing_train_s10_d83 = raw_missing_s10_d83.isna().sum()
num_missing_train_s1_d34 = raw_missing_s1_d34.isna().sum()

print("s10_d83 missing:", num_missing_train_s10_d83)
print("s1_d34 missing:", num_missing_train_s1_d34)

# Fraction of observed entries, later used to rescale the low-rank estimate.
# NOTE(review): the denominator is the length of the fully observed series;
# this presumably equals the length of the series with gaps - confirm.
total_s10_d83 = len(train_s10_d83)
total_s1_d34 = len(train_s1_d34)
observed_s10_d83 = 1 - (num_missing_train_s10_d83 / total_s10_d83)
observed_s1_d34 = 1 - (num_missing_train_s1_d34 / total_s1_d34)

print("observed:", observed_s10_d83)
print("observed:", observed_s1_d34)

# Missing values are replaced by 0 before the page matrices are built.
missing_train_s10_d83 = raw_missing_s10_d83.fillna(0)
missing_train_s1_d34 = raw_missing_s1_d34.fillna(0)

# Page matrix and energy-based rank for each zero-filled series.
P_s10_d83 = P(missing_train_s10_d83, L)
r_s10_d83 = optimize_r(P_s10_d83)

P_s1_d34 = P(missing_train_s1_d34, L)
r_s1_d34 = optimize_r(P_s1_d34)

# Ground-truth page matrix of the fully observed s1_d34 series.
full_P_s1_d34 = P(train_s1_d34, L=L)

print("r_s10_d83 =", r_s10_d83)
print("r_s1_d34 =", r_s1_d34)

s10_d83 missing: 19
s1_d34 missing: 48
observed: 0.8416666666666667
observed: 0.6
[0.80487713 0.8922433  0.93202116 0.95616785 0.97584063 0.98863859
 0.99564717 0.99803581 0.99958549 1.        ]
[0.6084575  0.73290468 0.81380524 0.88317841 0.92872292 0.96331227
 0.98815545 0.99384364 0.99782147 1.        ]
r_s10_d83 = 7
r_s1_d34 = 8


In [274]:
# Low-rank reconstruction of each zero-filled page matrix, rescaled by the
# inverse of the observed fraction.
# NOTE(review): the rank is fixed at 5 here although optimize_r selected
# r_s10_d83 and r_s1_d34 above - confirm this is intentional.
lowrank_s10_d83 = construct_P_hat(P_s10_d83, r=5)
P_s10_d83_hat = lowrank_s10_d83 * (1/observed_s10_d83)

lowrank_s1_d34 = construct_P_hat(P_s1_d34, r=5)
P_s1_d34_hat = lowrank_s1_d34 * (1/observed_s1_d34)

P_hat.shape = (10, 12)
P_hat.shape = (10, 12)


In [275]:
# s10_d83
# Mean squared imputation error over the entries that were missing,
# reported in units of 1e6.
L = 10

# Locate the missing entries from the NaN mask of the series with gaps rather
# than from `P_s10_d83 == 0`: after fillna(0) a genuinely observed value of
# exactly 0 would otherwise be mistaken for a missing one.
missing_mask_s10_d83 = P(missing_train.loc[:, "s10_d83"].isna(), L).astype(bool)
zero_indexes_s10_d83 = np.where(missing_mask_s10_d83)
zero_positions = list(zip(zero_indexes_s10_d83[0], zero_indexes_s10_d83[1]))

# Build the ground-truth page matrix once instead of once per missing entry.
full_P_s10_d83 = P(train_s10_d83, L)

n = len(zero_positions)
# Sum of squared errors over the missing entries only.
MSE = ((full_P_s10_d83[missing_mask_s10_d83] - P_s10_d83_hat[missing_mask_s10_d83]) ** 2).sum()

np.round(MSE / n / 1e6, 2)

0.12

In [276]:
# s1_d34
# Mean squared imputation error over the entries that were missing,
# reported in units of 1e6.

# Locate the missing entries from the NaN mask of the series with gaps rather
# than from `P_s1_d34 == 0`: after fillna(0) a genuinely observed value of
# exactly 0 would otherwise be mistaken for a missing one. The row count is
# taken from the ground-truth page matrix so both matrices share one layout.
missing_mask_s1_d34 = P(
    missing_train.loc[:, "s1_d34"].isna(), full_P_s1_d34.shape[0]
).astype(bool)
zero_indexes_s1_d34 = np.where(missing_mask_s1_d34)
zero_positions = list(zip(zero_indexes_s1_d34[0], zero_indexes_s1_d34[1]))

n = len(zero_positions)

# One squared error per missing entry; the count equals the number of
# entries evaluated, so no manual counter is needed.
squared_errors = (full_P_s1_d34[missing_mask_s1_d34] - P_s1_d34_hat[missing_mask_s1_d34]) ** 2
MSE = squared_errors.sum()
num_iter = n
print("Number of iterations =", num_iter)

np.round(MSE / n / 1e6, 2)

Number of iterations = 48


118.55