In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_log_error
import warnings
warnings.filterwarnings('ignore')

In [2]:
PATH = "results/df_featured_full.parquet"
df = pd.read_parquet(PATH)

if not np.issubdtype(df['date'].dtype, np.datetime64):
    df['date'] = pd.to_datetime(df['date'])

print(f"Full dataset shape: {df.shape}")

Full dataset shape: (125497040, 46)


In [3]:
pairs = df.groupby(['store_nbr', 'item_nbr']).agg({
    'family': 'first',
    'perishable': 'first',
}).reset_index()

PAIRS_PER_FAMILY = 50

sampled_pairs = (
    pairs
    .sample(frac=1, random_state=42)
    .groupby('family')
    .head(PAIRS_PER_FAMILY)
    .reset_index(drop=True)
)

df_sample = df.merge(
    sampled_pairs[['store_nbr', 'item_nbr']],
    on=['store_nbr', 'item_nbr'],
    how='inner'
).copy()

print(f"Sampled dataset: {df_sample.shape}")

del df
import gc
gc.collect()

Sampled dataset: (866805, 46)


33

In [4]:
leaking_features = [
    'item_daily_sales',
    'store_daily_sales',
    'transactions',
    'family_avg_sales',
    'store_family_avg_sales',
]

df_sample = df_sample.drop(columns=[c for c in leaking_features if c in df_sample.columns])
print(f"Removed leaking features")

Removed leaking features


In [5]:
oil_df = df_sample[['date', 'dcoilwtico']].drop_duplicates().sort_values('date').copy()

# Forward fill NaN oil prices (weekends/holidays don't have prices)
oil_df['dcoilwtico'] = oil_df['dcoilwtico'].ffill()

# Create lagged versions
oil_df['oil_lag_1'] = oil_df['dcoilwtico'].shift(1)
oil_df['oil_lag_7'] = oil_df['dcoilwtico'].shift(7)
oil_df['oil_rolling_7'] = oil_df['dcoilwtico'].shift(1).rolling(7, min_periods=1).mean()
oil_df['oil_change_7d'] = oil_df['oil_lag_1'] - oil_df['oil_lag_7']
oil_df['oil_pct_change_7d'] = oil_df['oil_change_7d'] / (oil_df['oil_lag_7'] + 0.01)

# Drop original, keep lagged
df_sample = df_sample.drop(columns=['dcoilwtico'])
df_sample = df_sample.merge(
    oil_df[['date', 'oil_lag_1', 'oil_lag_7', 'oil_rolling_7', 'oil_change_7d', 'oil_pct_change_7d']],
    on='date',
    how='left'
)

print("✓ Oil price lagged")

✓ Oil price lagged


In [6]:
df_sample = df_sample.sort_values(['store_nbr', 'item_nbr', 'date']).reset_index(drop=True)

# --- Lagged transactions ---
df_txn = pd.read_parquet(PATH, columns=['store_nbr', 'date', 'transactions'])
df_txn = df_txn.drop_duplicates(['store_nbr', 'date']).sort_values(['store_nbr', 'date'])

df_txn['transactions_lag_1'] = df_txn.groupby('store_nbr')['transactions'].shift(1)
df_txn['transactions_lag_7'] = df_txn.groupby('store_nbr')['transactions'].shift(7)
df_txn['transactions_rolling_7'] = (
    df_txn.groupby('store_nbr')['transactions']
    .apply(lambda x: x.shift(1).rolling(7, min_periods=1).mean())
    .reset_index(level=0, drop=True)
)

df_sample = df_sample.merge(
    df_txn[['store_nbr', 'date', 'transactions_lag_1', 'transactions_lag_7', 'transactions_rolling_7']],
    on=['store_nbr', 'date'],
    how='left'
)
print("✓ Lagged transactions")

# --- Lagged item daily sales ---
item_daily = (
    df_sample.groupby(['item_nbr', 'date'])['unit_sales']
    .sum()
    .reset_index(name='item_daily_raw')
)
item_daily = item_daily.sort_values(['item_nbr', 'date'])
item_daily['item_daily_sales_lag_1'] = item_daily.groupby('item_nbr')['item_daily_raw'].shift(1)
item_daily['item_daily_sales_lag_7'] = item_daily.groupby('item_nbr')['item_daily_raw'].shift(7)

df_sample = df_sample.merge(
    item_daily[['item_nbr', 'date', 'item_daily_sales_lag_1', 'item_daily_sales_lag_7']],
    on=['item_nbr', 'date'],
    how='left'
)
print("✓ Lagged item daily sales")

# --- Lagged store daily sales ---
store_daily = (
    df_sample.groupby(['store_nbr', 'date'])['unit_sales']
    .sum()
    .reset_index(name='store_daily_raw')
)
store_daily = store_daily.sort_values(['store_nbr', 'date'])
store_daily['store_daily_sales_lag_1'] = store_daily.groupby('store_nbr')['store_daily_raw'].shift(1)
store_daily['store_daily_sales_lag_7'] = store_daily.groupby('store_nbr')['store_daily_raw'].shift(7)

df_sample = df_sample.merge(
    store_daily[['store_nbr', 'date', 'store_daily_sales_lag_1', 'store_daily_sales_lag_7']],
    on=['store_nbr', 'date'],
    how='left'
)
print("✓ Lagged store daily sales")


✓ Lagged transactions
✓ Lagged item daily sales
✓ Lagged store daily sales


In [7]:
# Get holiday dates
holiday_dates = df_sample[df_sample['is_holiday'] == 1]['date'].unique()
holiday_dates = pd.Series(sorted(holiday_dates))

def days_to_nearest_holiday(date, holiday_list, direction='both', max_days=14):
    """Calculate days to nearest holiday"""
    if direction == 'before':
        past = holiday_list[holiday_list <= date]
        if len(past) == 0:
            return max_days
        return min((date - past.max()).days, max_days)
    elif direction == 'after':
        future = holiday_list[holiday_list >= date]
        if len(future) == 0:
            return max_days
        return min((future.min() - date).days, max_days)
    else:  # both
        past = holiday_list[holiday_list <= date]
        future = holiday_list[holiday_list >= date]
        days_past = (date - past.max()).days if len(past) > 0 else max_days
        days_future = (future.min() - date).days if len(future) > 0 else max_days
        return min(days_past, days_future, max_days)

# Calculate for each unique date (faster than per row)
unique_dates = df_sample['date'].unique()
date_holiday_features = pd.DataFrame({'date': unique_dates})

date_holiday_features['days_since_holiday'] = date_holiday_features['date'].apply(
    lambda x: days_to_nearest_holiday(x, holiday_dates, 'before')
)
date_holiday_features['days_until_holiday'] = date_holiday_features['date'].apply(
    lambda x: days_to_nearest_holiday(x, holiday_dates, 'after')
)
date_holiday_features['days_to_nearest_holiday'] = date_holiday_features['date'].apply(
    lambda x: days_to_nearest_holiday(x, holiday_dates, 'both')
)

# Binary flags
date_holiday_features['is_day_before_holiday'] = (date_holiday_features['days_until_holiday'] == 1).astype(int)
date_holiday_features['is_day_after_holiday'] = (date_holiday_features['days_since_holiday'] == 1).astype(int)
date_holiday_features['is_holiday_week'] = (date_holiday_features['days_to_nearest_holiday'] <= 3).astype(int)
date_holiday_features['is_holiday_weekend'] = (
    (date_holiday_features['days_until_holiday'] <= 2) | 
    (date_holiday_features['days_since_holiday'] <= 2)
).astype(int)

# Merge back
df_sample = df_sample.merge(date_holiday_features, on='date', how='left')
print("✓ Enhanced holiday features")

✓ Enhanced holiday features


In [8]:
group = df_sample.groupby(['store_nbr', 'item_nbr'])

df_sample['sales_lag_21'] = group['unit_sales'].shift(21)

# Average of same weekday over past 4 weeks
df_sample['same_weekday_avg_4w'] = (
    df_sample[['sales_lag_7', 'sales_lag_14', 'sales_lag_21', 'sales_lag_28']]
    .mean(axis=1, skipna=True)
)

# Median (more robust)
df_sample['same_weekday_median_4w'] = (
    df_sample[['sales_lag_7', 'sales_lag_14', 'sales_lag_21', 'sales_lag_28']]
    .median(axis=1, skipna=True)
)

print("✓ Same-weekday features")

✓ Same-weekday features


In [9]:
df_sample['sales_ratio_7_28'] = df_sample['rolling_mean_7'] / (df_sample['rolling_mean_28'] + 1)
df_sample['sales_cv_7'] = df_sample['rolling_std_7'] / (df_sample['rolling_mean_7'] + 1)
df_sample['trend_7_28'] = df_sample['rolling_mean_7'] - df_sample['rolling_mean_28']
df_sample['is_trending_up'] = (df_sample['rolling_mean_7'] > df_sample['rolling_mean_28']).astype(int)

print("✓ Trend features")

✓ Trend features


In [10]:
df_sample['promo_lag_1'] = group['onpromotion'].shift(1)
df_sample['promo_start'] = ((df_sample['onpromotion'] == 1) & (df_sample['promo_lag_1'] == 0)).astype(int)
df_sample['promo_end'] = ((df_sample['onpromotion'] == 0) & (df_sample['promo_lag_1'] == 1)).astype(int)

# Promo during holiday
df_sample['promo_holiday'] = (df_sample['onpromotion'] * df_sample['is_holiday']).astype(int)
df_sample['promo_near_holiday'] = (df_sample['onpromotion'] * df_sample['is_holiday_week']).astype(int)

print("✓ Promo features")

✓ Promo features


In [11]:
df_sample['is_zero_lag1'] = (group['unit_sales'].shift(1) == 0).astype(int)
df_sample['zero_count_7d'] = (
    group['unit_sales']
    .apply(lambda x: (x.shift(1) == 0).rolling(7, min_periods=1).sum())
    .reset_index(level=[0,1], drop=True)
)

print("✓ Zero sales features")

✓ Zero sales features


In [12]:
all_cols = df_sample.columns.tolist()
non_feature_cols = ['id', 'date', 'unit_sales', 'set']
exclude_patterns = ['_raw']

feature_cols = [
    c for c in all_cols 
    if c not in non_feature_cols 
    and not any(p in c for p in exclude_patterns)
]

print(f"\nTotal features: {len(feature_cols)}")


Total features: 71


In [13]:
max_date = df_sample['date'].max()
test_start = max_date - pd.Timedelta(days=15)
valid_end = test_start - pd.Timedelta(days=1)
valid_start = valid_end - pd.Timedelta(days=15)

df_sample['set'] = 'train'
df_sample.loc[(df_sample['date'] >= valid_start) & (df_sample['date'] <= valid_end), 'set'] = 'valid'
df_sample.loc[df_sample['date'] >= test_start, 'set'] = 'test'

print(f"\nSplit:")
print(f"\nSplit distribution:")
print(df_sample['set'].value_counts())
print(df_sample.groupby('set')['date'].agg(['min', 'max']))


Split:

Split distribution:
set
train    844246
valid     11312
test      11247
Name: count, dtype: int64
             min        max
set                        
test  2017-07-31 2017-08-15
train 2013-01-01 2017-07-14
valid 2017-07-15 2017-07-30


In [14]:
train_df = df_sample[df_sample['set'] == 'train'].copy()
valid_df = df_sample[df_sample['set'] == 'valid'].copy()
test_df  = df_sample[df_sample['set'] == 'test'].copy()

X_train = train_df[feature_cols].copy()
X_valid = valid_df[feature_cols].copy()
X_test  = test_df[feature_cols].copy()

y_train = train_df['unit_sales'].values
y_valid = valid_df['unit_sales'].values
y_test  = test_df['unit_sales'].values

# Handle categoricals
for X in [X_train, X_valid, X_test]:
    obj_cols = X.select_dtypes(include=['object']).columns
    for c in obj_cols:
        X[c] = X[c].astype('category')

print(f"\nX_train: {X_train.shape}")
print(f"X_valid: {X_valid.shape}")
print(f"X_test:  {X_test.shape}")

# Sample weights (higher for perishables)
train_weights = 1 + train_df['perishable'].values


X_train: (844246, 71)
X_valid: (11312, 71)
X_test:  (11247, 71)


In [15]:
def rmsle(y_true, y_pred):
    y_true = np.clip(y_true, 0, None)
    y_pred = np.clip(y_pred, 0, None)
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

def smape(y_true, y_pred):
    y_true = np.array(y_true, dtype=float)
    y_pred = np.array(y_pred, dtype=float)
    denom = np.abs(y_true) + np.abs(y_pred)
    mask = denom != 0
    out = np.zeros_like(denom)
    out[mask] = 2.0 * np.abs(y_pred[mask] - y_true[mask]) / denom[mask]
    return np.mean(out)

def nwrmsle(y_true, y_pred, perishable):
    y_true = np.clip(y_true, 0, None)
    y_pred = np.clip(y_pred, 0, None)
    w = 1 + (perishable == 1)
    msle = (w * (np.log1p(y_pred) - np.log1p(y_true))**2).sum() / w.sum()
    return np.sqrt(msle)

In [16]:
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.03,
    'num_leaves': 128,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'min_data_in_leaf': 50,
    'n_estimators': 1500,
    'random_state': 42,
    'n_jobs': -1,
}

model = lgb.LGBMRegressor(**params)

print("\nTraining LightGBM (with enhanced holiday features)...")

model.fit(
    X_train, y_train,
    sample_weight=train_weights,
    eval_set=[(X_valid, y_valid)],
    eval_metric='rmse',
    callbacks=[
        lgb.log_evaluation(period=100),
        lgb.early_stopping(stopping_rounds=100)
    ]
)

print(f"\nBest iteration: {model.best_iteration_}")


Training LightGBM (with enhanced holiday features)...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.223246 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8294
[LightGBM] [Info] Number of data points in the train set: 844246, number of used features: 71
[LightGBM] [Info] Start training from score 8.634099
Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 6.26753
[200]	valid_0's rmse: 6.0011
[300]	valid_0's rmse: 5.94996
[400]	valid_0's rmse: 5.95829
Early stopping, best iteration is:
[372]	valid_0's rmse: 5.94807

Best iteration: 372


In [17]:
y_train_pred = model.predict(X_train)
y_valid_pred = model.predict(X_valid)
y_test_pred  = model.predict(X_test)



In [18]:
### print("\n" + "="*60)
print("EVALUATION RESULTS (Enhanced Features + Fixed Oil Leakage)")
print("="*60)

print("\n--- TRAIN ---")
print(f"RMSLE:  {rmsle(y_train, y_train_pred):.6f}")
print(f"SMAPE:  {smape(y_train, y_train_pred):.6f}")

print("\n--- VALID ---")
print(f"RMSLE:  {rmsle(y_valid, y_valid_pred):.6f}")
print(f"SMAPE:  {smape(y_valid, y_valid_pred):.6f}")

print("\n--- TEST (last 16 days) ---")
print(f"RMSLE:  {rmsle(y_test, y_test_pred):.6f}")
print(f"SMAPE:  {smape(y_test, y_test_pred):.6f}")

print(f"\nNWRMSLE (train): {nwrmsle(y_train, y_train_pred, train_df['perishable'].values):.6f}")
print(f"NWRMSLE (valid): {nwrmsle(y_valid, y_valid_pred, valid_df['perishable'].values):.6f}")
print(f"NWRMSLE (test):  {nwrmsle(y_test, y_test_pred, test_df['perishable'].values):.6f}")

EVALUATION RESULTS (Enhanced Features + Fixed Oil Leakage)

--- TRAIN ---
RMSLE:  0.470457
SMAPE:  0.461021

--- VALID ---
RMSLE:  0.465498
SMAPE:  0.464676

--- TEST (last 16 days) ---
RMSLE:  0.487534
SMAPE:  0.483984

NWRMSLE (train): 0.473505
NWRMSLE (valid): 0.472529
NWRMSLE (test):  0.493050


In [19]:
fi = pd.DataFrame({
    'feature': feature_cols,
    'importance': model.feature_importances_,
}).sort_values('importance', ascending=False)

print("\n" + "="*60)
print("TOP 30 FEATURES BY IMPORTANCE")
print("="*60)
print(fi.head(30).to_string(index=False))


TOP 30 FEATURES BY IMPORTANCE
                feature  importance
            sales_lag_1        2167
           sales_lag_28        1621
                 family        1610
            day_of_week        1538
 item_daily_sales_lag_1        1492
           day_of_month        1492
       sales_ratio_7_28        1467
          rolling_max_7        1463
           sales_lag_21        1393
           sales_lag_14        1375
 item_daily_sales_lag_7        1362
     transactions_lag_7        1362
             sales_cv_7        1358
         rolling_mean_7        1335
    same_weekday_avg_4w        1297
            sales_lag_7        1292
 same_weekday_median_4w        1274
          rolling_min_7        1175
     transactions_lag_1        1121
store_daily_sales_lag_1        1094
             trend_7_28        1074
store_daily_sales_lag_7        1059
           week_of_year        1032
       days_since_promo        1020
 transactions_rolling_7         999
        rolling_mean_14         9

In [20]:
test_df = test_df.copy()
test_df['pred'] = y_test_pred

date_rmsle = (
    test_df.groupby('date')
    .apply(lambda x: rmsle(x['unit_sales'], x['pred']))
    .rename('rmsle')
    .reset_index()
)

# Add holiday flag for analysis
date_holiday_flag = df_sample[['date', 'is_holiday']].drop_duplicates()
date_rmsle = date_rmsle.merge(date_holiday_flag, on='date', how='left')

print("\n" + "="*60)
print("RMSLE BY DATE (TEST) - with holiday flag")
print("="*60)
print(date_rmsle.to_string(index=False))
print(f"\nMean RMSLE: {date_rmsle['rmsle'].mean():.6f}")
print(f"Std RMSLE:  {date_rmsle['rmsle'].std():.6f}")


RMSLE BY DATE (TEST) - with holiday flag
      date    rmsle  is_holiday
2017-07-31 0.453025           0
2017-08-01 0.462555           0
2017-08-02 0.476912           0
2017-08-03 0.505394           0
2017-08-04 0.506239           0
2017-08-05 0.472920           1
2017-08-06 0.490515           0
2017-08-07 0.488656           0
2017-08-08 0.479391           0
2017-08-09 0.465701           0
2017-08-10 0.524644           1
2017-08-11 0.516635           1
2017-08-12 0.499332           0
2017-08-13 0.515790           0
2017-08-14 0.470860           0
2017-08-15 0.467857           1

Mean RMSLE: 0.487277
Std RMSLE:  0.021892


In [21]:
perishable_rmsle = (
    test_df.groupby('perishable')
    .apply(lambda x: rmsle(x['unit_sales'], x['pred']))
)

print("\n" + "="*60)
print("RMSLE BY PERISHABLE (TEST)")
print("="*60)
print(perishable_rmsle)


RMSLE BY PERISHABLE (TEST)
perishable
0    0.473897
1    0.506069
dtype: float64


In [22]:
# ============================================================
# 21. ERROR BY HOLIDAY PROXIMITY (FIXED)
# ============================================================

if 'days_to_nearest_holiday' in test_df.columns:
    # Filter out NaN values
    test_df_valid = test_df[test_df['days_to_nearest_holiday'].notna()].copy()
    
    if len(test_df_valid) > 0:
        test_df_valid['holiday_proximity'] = pd.cut(
            test_df_valid['days_to_nearest_holiday'],
            bins=[-0.1, 0, 1, 3, 7, 100],
            labels=['Holiday', '1 day', '2-3 days', '4-7 days', '7+ days']
        )
        
        holiday_proximity_rmsle = (
            test_df_valid.groupby('holiday_proximity', observed=True)
            .apply(lambda x: rmsle(x['unit_sales'], x['pred']) if len(x) > 0 else np.nan)
        )
        
        print("\n" + "="*60)
        print("RMSLE BY HOLIDAY PROXIMITY (TEST)")
        print("="*60)
        print(holiday_proximity_rmsle)
    else:
        print("No valid holiday proximity data")
else:
    print("Skipping holiday proximity analysis - column not found")


RMSLE BY HOLIDAY PROXIMITY (TEST)
holiday_proximity
Holiday     0.495468
1 day       0.486907
2-3 days    0.493510
4-7 days    0.457812
dtype: float64


In [23]:
print("\n" + "="*60)
print("COMPARISON: BEFORE vs AFTER")
print("="*60)
print("""
PREVIOUS (no leakage, basic features):
  - Test RMSLE:   0.504743
  - Test SMAPE:   0.491256
  - Test NWRMSLE: 0.511413
  - Perishable: 0.527 vs Non-perishable: 0.488

CURRENT (enhanced holiday + fixed oil):
  - Test RMSLE:   [see above]
  - Test SMAPE:   [see above]
  - Test NWRMSLE: [see above]
""")


COMPARISON: BEFORE vs AFTER

PREVIOUS (no leakage, basic features):
  - Test RMSLE:   0.504743
  - Test SMAPE:   0.491256
  - Test NWRMSLE: 0.511413
  - Perishable: 0.527 vs Non-perishable: 0.488

CURRENT (enhanced holiday + fixed oil):
  - Test RMSLE:   [see above]
  - Test SMAPE:   [see above]
  - Test NWRMSLE: [see above]



In [24]:
# ============================================================
# FIRST 3 DAYS PREDICTION ANALYSIS - ITEM 164088 ONLY
# ============================================================

TARGET_ITEM = 164088

# Get the first 3 days of the test set
first_3_dates = sorted(test_df['date'].unique())[:3]
print(f"Analyzing first 3 prediction days: {[str(d.date()) for d in first_3_dates]}")
print(f"Filtering for item_nbr: {TARGET_ITEM}")

# Filter test_df for these dates AND specific item
first_3_days_df = test_df[
    (test_df['date'].isin(first_3_dates)) & 
    (test_df['item_nbr'] == TARGET_ITEM)
].copy()

print(f"Total records: {len(first_3_days_df)}")

# Calculate prediction error metrics
first_3_days_df['error'] = first_3_days_df['pred'] - first_3_days_df['unit_sales']
first_3_days_df['abs_error'] = np.abs(first_3_days_df['error'])
first_3_days_df['pct_error'] = (first_3_days_df['error'] / (first_3_days_df['unit_sales'] + 1)) * 100

# Select and rename relevant columns
result_df = first_3_days_df[[
    'date', 'store_nbr', 'item_nbr', 'family', 
    'unit_sales', 'pred', 'error', 'abs_error', 'pct_error'
]].copy()

result_df = result_df.rename(columns={
    'unit_sales': 'true_sales',
    'pred': 'predicted_sales'
})

# Round for readability
result_df['predicted_sales'] = result_df['predicted_sales'].round(2)
result_df['error'] = result_df['error'].round(2)
result_df['abs_error'] = result_df['abs_error'].round(2)
result_df['pct_error'] = result_df['pct_error'].round(2)

# Sort by date, store
result_df = result_df.sort_values(['date', 'store_nbr']).reset_index(drop=True)

print("\n" + "="*90)
print(f"ITEM {TARGET_ITEM} - FIRST 3 DAYS PREDICTIONS BY STORE")
print("="*90)
print(result_df.to_string(index=False))

# ============================================================
# SUMMARY STATISTICS
# ============================================================

print("\n" + "="*90)
print(f"ITEM {TARGET_ITEM} - SUMMARY BY DATE")
print("="*90)
date_summary = first_3_days_df.groupby('date').agg({
    'unit_sales': ['count', 'sum', 'mean'],
    'pred': ['sum', 'mean'],
    'abs_error': ['mean', 'sum']
}).round(2)
date_summary.columns = ['n_stores', 'total_true_sales', 'avg_true_sales', 
                        'total_predicted', 'avg_predicted', 'MAE', 'total_abs_error']
print(date_summary)

print("\n" + "="*90)
print(f"ITEM {TARGET_ITEM} - SUMMARY BY STORE (across first 3 days)")
print("="*90)
store_summary = first_3_days_df.groupby('store_nbr').agg({
    'unit_sales': ['count', 'sum', 'mean'],
    'pred': ['sum', 'mean'],
    'abs_error': 'mean'
}).round(2)
store_summary.columns = ['n_days', 'total_true_sales', 'avg_true_sales', 
                         'total_predicted', 'avg_predicted', 'MAE']
print(store_summary.to_string())

Analyzing first 3 prediction days: ['2017-07-31', '2017-08-01', '2017-08-02']
Filtering for item_nbr: 164088
Total records: 2

ITEM 164088 - FIRST 3 DAYS PREDICTIONS BY STORE
      date  store_nbr  item_nbr family  true_sales  predicted_sales  error  abs_error  pct_error
2017-07-31          1    164088  DAIRY         2.0             3.07   1.07       1.07      35.76
2017-08-02          1    164088  DAIRY         6.0             2.86  -3.14       3.14     -44.79

ITEM 164088 - SUMMARY BY DATE
            n_stores  total_true_sales  avg_true_sales  total_predicted  \
date                                                                      
2017-07-31         1               2.0             2.0             3.07   
2017-08-02         1               6.0             6.0             2.86   

            avg_predicted   MAE  total_abs_error  
date                                              
2017-07-31           3.07  1.07             1.07  
2017-08-02           2.86  3.14             3.14 