# Approaches

1. Try rolling 7-day mean guesses and 7-day shift (lag) guesses.
Compare the RMSE for the different values.

2. Try a multi LR (linear regression) model, after removing entries that have no sell price.

## Try Rolling / Lag again

In [97]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import root_mean_squared_error, mean_squared_error
from statsmodels.tsa.deterministic import DeterministicProcess
from sklearn.linear_model import LinearRegression

In [40]:
# Folder that holds the input CSV files (path is relative to the working directory).
DIRECTORY = './data/'

# Calendar table; its 'date' column is parsed into datetimes while loading.
calendar = pd.read_csv(f'{DIRECTORY}calendar.csv', parse_dates=['date'])

# Sales tables (validation and evaluation variants).
train_validation = pd.read_csv(f'{DIRECTORY}sales_train_validation.csv')
train_evaluation = pd.read_csv(f'{DIRECTORY}sales_train_evaluation.csv')

# Sell prices and the sample submission file.
prices = pd.read_csv(f'{DIRECTORY}sell_prices.csv')
sample_submission = pd.read_csv(f'{DIRECTORY}sample_submission.csv')

In [41]:
# Compare naive forecasts by RMSE: for each of the most recent days, score a
# simple guess for that day's sales against the actual d_<day> column.
LAST_DAY = 1913  # most recent day scored in this comparison


def report_naive_rmse(title, days, make_guess):
    """Print the RMSE of a naive forecast for every day in ``days``.

    title      -- heading printed before the per-day results
    days       -- iterable of day numbers (the N in ``d_N``) to score
    make_guess -- callable taking a day number and returning the forecast
                  for that day, one value per row of ``train_validation``
    """
    print(title)
    for day in days:
        rmse = root_mean_squared_error(train_validation[f'd_{day}'], make_guess(day))
        print(day, rmse)
    print('-'*10)


# Shift 1d: the previous day's sales are the guess (5 most recent days).
report_naive_rmse(
    "Shift 1d Results:",
    range(LAST_DAY, LAST_DAY - 5, -1),
    lambda day: train_validation[f'd_{day-1}'],
)

# Shift 7d: sales from 7 days earlier are the guess (8 most recent days).
report_naive_rmse(
    "Shift 7d Results:",
    range(LAST_DAY, LAST_DAY - 8, -1),
    lambda day: train_validation[f'd_{day-7}'],
)

# Rolling 7d: the mean of the 7 columns immediately before the day is the guess.
# The column list is built once here instead of once per scored day.
column_names = train_validation.columns.to_list()


def rolling_7d_guess(day):
    """Return the row-wise mean of the 7 columns that precede ``d_<day>``."""
    day_index = column_names.index(f'd_{day}')
    return train_validation.iloc[:, day_index-7:day_index].mean(axis=1)


report_naive_rmse(
    "Rolling 7d Results:",
    range(LAST_DAY, LAST_DAY - 8, -1),
    rolling_7d_guess,
)

Shift 1d Results:
1913 2.642048411701363
1912 2.6233742564378355
1911 2.296917094797697
1910 2.196018938911738
1909 2.265220376016131
----------
Shift 7d Results:
1913 2.8174304020129664
1912 2.6246429109443596
1911 2.6039554139743917
1910 2.4236589411813396
1909 2.316637388493749
1908 2.6344402631610357
1907 2.75182041181439
1906 2.9599380384072624
----------
Rolling 7d Results:
1913 2.0507093731208728
1912 2.1830781514472943
1911 1.8637290900890529
1910 1.8796450128668138
1909 1.9586755893178596
1908 1.887509740583179
1907 2.046456453524297
1906 2.3995854477838576
----------


In [125]:
# 28-day holdout check: every one of the final 28 columns is predicted with
# the row-wise mean of the 14 columns that come right before them.
holdout = train_validation.iloc[:, -28:]
history = train_validation.iloc[:, -42:-28]

guess = history.mean(axis=1)
# Broadcast the single per-row guess across all 28 holdout columns.
diffs = holdout.to_numpy() - guess.to_numpy()[:, np.newaxis]
# One pooled RMSE over every row and every holdout day.
rmse = np.sqrt(np.square(diffs).mean())
print("guessing 14d rolling mean for next 28D:", rmse)

guessing 14d rolling mean for next 28D: 2.221536667675976


In [88]:
# Build the submission frame: the row-wise mean of the last 14 columns of
# train_validation is repeated for each of the 28 forecast columns F1..F28.
validation_guess = train_validation.iloc[:, -14:].mean(axis=1)

# Ids from both sales tables, stacked under a fresh 0..n-1 index.
validation_ids = pd.concat(
    [train_validation['id'], train_evaluation['id']],
    ignore_index=True,
)

# Column assignment aligns on the index, so only rows whose index label also
# exists in validation_guess receive a value; the remaining rows stay NaN.
forecast_columns = {f'F{step}': validation_guess for step in range(1, 29)}
submission = pd.DataFrame({'id': validation_ids}).assign(**forecast_columns)

In [87]:
# Replace missing forecasts (NaN) with 0, then write the file without the row index.
submission_filled = submission.fillna(0)
submission_filled.to_csv('submission.csv', index=False)

## Try Multi LR Model

In [112]:
# Targets: the numeric columns of train_validation, transposed so that each
# former column becomes a row and each former row becomes one output column.
numeric_sales = train_validation.select_dtypes('number')
Y = numeric_sales.T

# Features: a constant column plus a first-order (linear) time trend over Y's index.
dp = DeterministicProcess(index=Y.index, constant=True, order=1)
X = dp.in_sample()

# The constant is already a feature column, so the estimator adds no intercept.
model = LinearRegression(fit_intercept=False)
model.fit(X, Y)

In [114]:
# In-sample RMSE of the trend model. scikit-learn expects (y_true, y_pred), so
# the actuals go first. With several output columns the default
# multioutput='uniform_average' averages the per-column RMSEs, which is why
# this figure differs from a single RMSE pooled over every value.
root_mean_squared_error(Y, model.predict(X))

1.4383200113975778

In [121]:
# Results: one RMSE pooled over every value (all rows and columns flattened
# into a single vector before the errors are averaged).
predicted_flat = model.predict(X).flatten()
actual_flat = Y.to_numpy().flatten()
np.sqrt(np.mean((predicted_flat - actual_flat) ** 2))

2.639087663448183

## Save Simple Row-Wise CSV

In [13]:
# Reshape the wide sales table into a long (id, day, sales) table.

# Swap each id for an integer code (codes follow order of first appearance).
id_codes, _ = pd.factorize(train_validation['id'])
train_validation['id'] = id_codes

# Drop the remaining '*_id' columns, turn each 'd_<n>' column label into the
# integer n, then stack those day columns into rows.
other_id_columns = [name for name in train_validation.columns if '_id' in name]
train_validation = train_validation.drop(columns=other_id_columns)

day_numbers = {
    name: int(name.split('_')[1])
    for name in train_validation.columns
    if 'd_' in name
}
train_validation = (
    train_validation
    .rename(columns=day_numbers)
    .melt(id_vars='id', var_name='day', value_name='sales')
)

In [15]:
# Save the current train_validation frame as CSV; the row index is not written.
row_wise_path = './data/row_wise_simplified.csv'
train_validation.to_csv(row_wise_path, index=False)

In [20]:
# Cast every column to int, then shrink it to the smallest integer dtype
# that can hold its values.
for column_name in train_validation.columns:
    as_int = train_validation[column_name].astype(int)
    train_validation[column_name] = pd.to_numeric(as_int, downcast='integer')

In [22]:
# 7-row rolling mean of 'sales' computed separately inside each 'id' group.
# Rows that do not yet have a full 7-row window come out as NaN.
sales_by_id = train_validation.groupby('id')['sales']
sales_by_id.rolling(window=7).mean()

id             
0      0           NaN
       30490       NaN
       60980       NaN
       91470       NaN
       121960      NaN
                  ... 
30489  58205409    0.0
       58235899    0.0
       58266389    0.0
       58296879    0.0
       58327369    0.0
Name: sales, Length: 58327370, dtype: float64