In [2]:
import numpy as np 
import pandas as pd

In [3]:
# Load the four input tables in a fixed order: the merged training set, its
# alternative variant, the merged test set, and the sample submission file.
train, train_alt, test, sample_subm = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/merged_train.csv',
        '../data/merged_train_alt.csv',
        '../data/merged_test.csv',
        '../data/sample_submission.csv',
    )
)

  train = pd.read_csv('../data/merged_train.csv')
  train_alt = pd.read_csv('../data/merged_train_alt.csv')


## Baseline Model I: Weighted Mean Sales

|  Variable  | Description 
|:--------|:--------|
$D$         |   day of the week (Monday-Sunday)
$W$         |   week of the year
$S$         |   Favorita store number (1-54)
$F$         |   product family, e.g. baby care products
$Y$         |   year  (2013, 2014, 2015, 2016)
$\textnormal{Sales}_{D,W,S,F}(Y)$    |   total sales of $F$ products on day $D$, week $W$, store $S$, in the year $Y$.

Our baseline model predicts $\textnormal{Sales}_{D,W,S,F}(2017)$ using a weighted average of the previous years' sales for the same day of the week, week of the year, store, and product family.

More recent years are weighted more heavily: the weights $\lambda_{2013},\dots,\lambda_{2016}$ are $0.1, 0.2, 0.3, 0.4$, which sum to $1$.

$$\textnormal{Sales}_{D,W,S,F}(2017) = \sum_{Y=2013}^{2016}\lambda_{Y}\cdot\textnormal{Sales}_{D,W,S,F}(Y) \hspace{0.5in} \textnormal{where }\lambda_{Y}=0.1+0.1(Y-2013)$$


In [None]:
# Weight applied to each historical year's sales (lambda_Y = 0.1 + 0.1 * (Y - 2013)).
# Written as literals so the arithmetic matches the documented weights exactly,
# and kept next to the years so the two cannot drift apart.
year_weights = {2013: 0.1, 2014: 0.2, 2015: 0.3, 2016: 0.4}
years = list(year_weights)

# A historical observation is matched to a test row when all of these agree.
merge_keys = ['week_number', 'day_of_week', 'store_nbr', 'family']

# Make the cell safe to re-run: if `test` already carries columns produced by
# a previous execution, merging again would yield `sales_<year>_x` /
# `sales_<year>_y` suffixes and the weighted sum below would raise a KeyError.
stale_cols = [f'sales_{year}' for year in years] + ['mean_sales']
test = test.drop(columns=stale_cols, errors='ignore')
n_test_rows = len(test)

for year in years:
    # Keep only this year's rows and the columns needed for the merge,
    # renaming 'sales' to f'sales_{year}' so each year gets its own column.
    df_year = (
        train.loc[train['year'] == year, merge_keys + ['sales']]
        .rename(columns={'sales': f'sales_{year}'})
    )

    # Left merge: every test row is kept; a key with no match in this year
    # gets NaN in f'sales_{year}'.
    test = test.merge(df_year, on=merge_keys, how='left')

# The submission is filled by index alignment, so the merges must not have
# duplicated any test row (which happens if a key matches several train rows).
assert len(test) == n_test_rows, (
    'merge duplicated test rows: merge keys are not unique within a year'
)

# Weighted average of the four yearly columns; NaN in any year propagates.
test['mean_sales'] = sum(
    weight * test[f'sales_{year}'] for year, weight in year_weights.items()
)

In [None]:
def baseline1_submission():
    """Copy the Baseline I predictions into the sample submission frame.

    Reads the module-level ``test`` frame (its ``mean_sales`` column) and
    writes it into the ``sales`` column of the module-level ``sample_subm``
    frame. The assignment is index-aligned. Returns nothing.
    """
    weighted_mean_preds = test['mean_sales']
    sample_subm['sales'] = weighted_mean_preds
    # To write the file on Kaggle:
    #sample_subm.to_csv('/kaggle/working/submission.csv',index=False)
    # Recorded leaderboard score for this baseline: 0.90242

## Baseline Model II: Rolling Average

In [4]:
# Reload the test set from disk so that columns merged into `test` by the
# previous baseline do not carry over.
test = pd.read_csv('../data/merged_test.csv')

# Stack the training rows on top of the test rows with a fresh 0..n-1 index,
# so the test rows occupy positions len(train) onward.
df_whole = pd.concat([train, test], ignore_index=True)


def _trailing_30_row_mean(group_sales):
    """Mean over the current row and up to 29 rows before it.

    With ``min_periods=1`` a value is produced as soon as one non-NaN
    observation is in the window; NaN entries are left out of the mean.
    """
    return group_sales.rolling(window=30, min_periods=1).mean()


# Rolling average of sales per (store, product family) series.
# NOTE(review): the window runs over row order within each group; this
# presumably relies on the rows being in chronological order -- confirm.
avg_sales = (
    df_whole
    .groupby(['store_nbr', 'family'])['sales']
    .transform(_trailing_30_row_mean)
)

# Attach the rolling averages as a column
df_whole['avg_sales'] = avg_sales

# Show the rows that came from the test set
df_whole.iloc[len(train):]

Unnamed: 0,date,year,month,week_number,day,day_of_week,store_nbr,type,cluster,city,...,hol_type_Additional,hol_type_Bridge,hol_type_Event,hol_type_Holiday,hol_type_Transfer,hol_type_Work Day,family,onpromotion,sales,avg_sales
2755104,2017-08-16,2017,8,33,16,2,1,D,13,Quito,...,,,,False,,,AUTOMOTIVE,0,,4.965517
2755105,2017-08-16,2017,8,33,16,2,1,D,13,Quito,...,,,,False,,,BABY CARE,0,,0.000000
2755106,2017-08-16,2017,8,33,16,2,1,D,13,Quito,...,,,,False,,,BEAUTY,2,,3.310345
2755107,2017-08-16,2017,8,33,16,2,1,D,13,Quito,...,,,,False,,,BEVERAGES,20,,2060.068966
2755108,2017-08-16,2017,8,33,16,2,1,D,13,Quito,...,,,,False,,,BOOKS,0,,0.034483
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2783611,2017-08-31,2017,8,35,31,3,9,B,6,Quito,...,,,,False,,,POULTRY,1,,437.880281
2783612,2017-08-31,2017,8,35,31,3,9,B,6,Quito,...,,,,False,,,PREPARED FOODS,0,,120.867428
2783613,2017-08-31,2017,8,35,31,3,9,B,6,Quito,...,,,,False,,,PRODUCE,1,,1606.105500
2783614,2017-08-31,2017,8,35,31,3,9,B,6,Quito,...,,,,False,,,SCHOOL AND OFFICE SUPPLIES,9,,150.071429


In [None]:
def baseline2_submission():
    """Copy the Baseline II predictions into the sample submission frame.

    Takes the ``avg_sales`` values of ``df_whole`` from position
    ``len(train)`` onward, renumbers them from 0, and stores them in the
    ``sales`` column of the module-level ``sample_subm`` frame. Returns nothing.
    """
    n_train_rows = len(train)
    test_part = df_whole['avg_sales'].iloc[n_train_rows:]
    # Renumber from 0 so the index-aligned assignment lines up with sample_subm.
    sample_subm['sales'] = test_part.reset_index(drop=True)
    # To write the file on Kaggle:
    #sample_subm.to_csv('/kaggle/working/submission.csv',index=False)
    # Recorded leaderboard score for this baseline: 0.46141