In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from statsmodels.tsa.deterministic import DeterministicProcess, CalendarFourier

from sklearn.model_selection import train_test_split, cross_val_score, TimeSeriesSplit, GridSearchCV, RandomizedSearchCV, cross_validate
from sklearn.metrics import mean_squared_error, make_scorer, mean_squared_log_error, mean_absolute_error, mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from collections import Counter
from sklearn.linear_model import LinearRegression
import xgboost as xg
from prophet import Prophet
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import ParameterGrid
import random

# Importing the dataset

In [2]:
final_df = pd.read_csv('df_RF.csv', parse_dates=['date']).set_index('date')
event_holiday = pd.read_csv('event_holiday.csv')

In [3]:
date = {}
date['date_start_train'] = '2017-04-30'
date['date_end_train'] = '2017-08-15'
date['date_end_test'] = '2017-08-31'
#test starts on '2017-08-16'

diff_test = (pd.Timestamp(date['date_end_test']) - pd.Timestamp(date['date_start_train'])).days
diff_train = (pd.Timestamp(date['date_end_train']) - pd.Timestamp(date['date_start_train'])).days

# Prophet

In [5]:
def split_func (data, X, y, end_date, test_size):
    
    # Splitting train and test
    idx_train, idx_test = train_test_split(data.index, test_size=test_size, shuffle=False)
    X_train, X_test = X.loc[idx_train, :], X.loc[idx_test, :]
    y_train, y_test = y.loc[idx_train], y.loc[idx_test]
    
    return X_train, y_train, X_test, y_test

In [9]:
def train_test (data, end_df, n):
    
    df = data.loc[:end_df,:].reset_index().set_index(['store_nbr', 'family', 'date']).sort_index()
    y = np.log1p(df.loc[:,'sales'].unstack(['store_nbr', 'family']))
    
    # Selecting features
    X = df.drop(columns = ['id','sales','trend','transactions', 'onpromotion', 'store_B', 'store_C', 'store_D', 'store_E'])
    df2 = X.groupby(by='date').first()
        
    # Train
    if end_df <= date['date_end_train']:
        y_tr = np.empty((3036,0))
        y_te = np.empty((528,0))
        pred_train_y = np.empty((3036,0))
        pred_test_y = np.empty((528,0))
    # Test
    else:
        y_tr = np.empty((3564,0))
        y_te = np.empty((528,0))
        pred_train_y = np.empty((3564,0))
        pred_test_y = np.empty((528,0))

    # A model for each shop
    for i in data.store_nbr.unique():
        y = df.loc[i,'sales'].unstack(['family'])
        X = df.loc[i, df2.columns]
        X = X.groupby(by='date').first()

        # Splitting train and test and log transformation
        X_train, y_train, X_test, y_test = split_func(y, X, np.log1p(y), end_df, n)
        
        y_train = y_train.stack(['family']).to_frame()
        
        if end_df > date['date_end_train']:
            y_test = y_test.fillna(0).stack(['family']).to_frame()
        y_test = y_test.stack(['family']).to_frame()

        train = y_train.join(X_train.reindex(y_train.index, level=0))
        train = train.reset_index()
        train = train.rename(columns={'date': 'ds', 0: 'y'})

        test = y_test.join(X_test.reindex(y_test.index, level=0))
        test = test.reset_index()
        test = test.rename(columns={'date': 'ds', 0: 'y'})
        if end_df > date['date_end_train']:
            test['y'] = np.nan

        y_train = y_train.reset_index()
        y_test = y_test.reset_index()
        y_train = y_train[['date',0]].rename(columns={'date': 'ds', 0: 'y'})
        y_test = y_test[['date',0]].rename(columns={'date': 'ds', 0: 'y'})
    

        # Prophet
        model = Prophet(holidays = event_holiday,
                        changepoint_prior_scale = 0.05,
                        holidays_prior_scale = 0.01,
                        seasonality_prior_scale = 0.01,
                        seasonality_mode = 'additive',
                        yearly_seasonality = False,
                          weekly_seasonality = True,
                          daily_seasonality = False)
        
        for j in range(0, len(df2.columns.values)):
            model.add_regressor(df2.columns.values[j])
            
        model.fit(train)
        p_pred_train_y = model.predict(train) 
        p_pred_test_y = model.predict(test)
        
        y_tr = np.append(y_tr, train[['y']], axis=1)
        y_te = np.append(y_te, test[['y']], axis=1)
        pred_train_y = np.append(pred_train_y, p_pred_train_y[['yhat']], axis=1)
        pred_test_y = np.append(pred_test_y, p_pred_test_y[['yhat']], axis=1)
        
        # Performances of each shop
        # Train
        if end_df <= date['date_end_train']:
            print(f'RMSLE_train {i}: ', np.round(np.sqrt(mean_squared_error(train[['y']].clip(0.0), p_pred_train_y[['ds', 'yhat']].set_index('ds').clip(0.0))), 4), f'RMSLE_test {i}: ', np.round(np.sqrt(mean_squared_error(test[['y']].clip(0.0), p_pred_test_y[['ds', 'yhat']].set_index('ds').clip(0.0))), 4))        

    
    # Total performances
    # Train
    if end_df <= date['date_end_train']:
        print(f'RMSLE_train tot: ', np.round(np.sqrt(mean_squared_error(y_tr.clip(0.0), pred_train_y.clip(0.0))), 4), f'RMSLE_test tot: ', np.round(np.sqrt(mean_squared_error(y_te.clip(0.0), pred_test_y.clip(0.0))), 4))

   
 
    return pred_test_y, y_te

In [11]:
# pred_test_y, y_te = train_test(final_df, date['date_end_train'], 16) 

In [232]:
pred_test_y, y_te = train_test(final_df, date['date_end_test'], 16) 

INFO:cmdstanpy:start chain 1
INFO:cmdstanpy:finish chain 1
INFO:cmdstanpy:start chain 1
INFO:cmdstanpy:finish chain 1
INFO:cmdstanpy:start chain 1
INFO:cmdstanpy:finish chain 1
INFO:cmdstanpy:start chain 1
INFO:cmdstanpy:finish chain 1
INFO:cmdstanpy:start chain 1
INFO:cmdstanpy:finish chain 1
INFO:cmdstanpy:start chain 1
INFO:cmdstanpy:finish chain 1
INFO:cmdstanpy:start chain 1
INFO:cmdstanpy:finish chain 1
INFO:cmdstanpy:start chain 1
INFO:cmdstanpy:finish chain 1
INFO:cmdstanpy:start chain 1
INFO:cmdstanpy:finish chain 1
INFO:cmdstanpy:start chain 1
INFO:cmdstanpy:finish chain 1
INFO:cmdstanpy:start chain 1
INFO:cmdstanpy:finish chain 1
INFO:cmdstanpy:start chain 1
INFO:cmdstanpy:finish chain 1
INFO:cmdstanpy:start chain 1
INFO:cmdstanpy:finish chain 1
INFO:cmdstanpy:start chain 1
INFO:cmdstanpy:finish chain 1
INFO:cmdstanpy:start chain 1
INFO:cmdstanpy:finish chain 1
INFO:cmdstanpy:start chain 1
INFO:cmdstanpy:finish chain 1
INFO:cmdstanpy:start chain 1
INFO:cmdstanpy:finish chain

In [233]:
tg = final_df2.reset_index().set_index(['date', 'store_nbr', 'family']).sort_index().loc['2017-08-16':, 'id']

In [234]:
pred = pd.DataFrame(pred_test_y).melt(
        var_name="store_nbr", 
        value_name="sales")
pred

Unnamed: 0,store_nbr,sales
0,0,3.902660
1,0,3.902660
2,0,3.902660
3,0,3.902660
4,0,3.902660
...,...,...
28507,53,4.113966
28508,53,4.113966
28509,53,4.113966
28510,53,4.113966


In [235]:
pred['id'] = tg.reset_index()['id']
pred['store_nbr'] += 1
pred = pred.set_index('store_nbr')
pred

Unnamed: 0_level_0,sales,id
store_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.902660,3000888
1,3.902660,3000889
1,3.902660,3000890
1,3.902660,3000891
1,3.902660,3000892
...,...,...
54,4.113966,3029263
54,4.113966,3029264
54,4.113966,3029265
54,4.113966,3029266


In [236]:
cols = pred.columns.tolist()
cols = cols[-1:] + cols[:-1]
sub= pred[cols]
sub

Unnamed: 0_level_0,id,sales
store_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3000888,3.902660
1,3000889,3.902660
1,3000890,3.902660
1,3000891,3.902660
1,3000892,3.902660
...,...,...
54,3029263,4.113966
54,3029264,4.113966
54,3029265,4.113966
54,3029266,4.113966


In [None]:
sub.to_csv('submission.csv', index=False)

# HYPERPARAMETER TUNING

In [267]:
def tuning(train, test, y_test, X_test):
    param_grid = {  
    'changepoint_prior_scale': [0.001, 0.05, 0.08, 0.5],
    'holidays_prior_scale': [0.01, 1, 5, 10, 12],
    'seasonality_prior_scale': [0.01, 1, 5, 10, 12]
    }
    grid = ParameterGrid(param_grid)
    
    model_parameters = pd.DataFrame(columns = ['RMSLE','Parameters'])
    for p in grid:
        random.seed(0)
        train_model = Prophet(changepoint_prior_scale = p['changepoint_prior_scale'],
                          holidays_prior_scale = p['holidays_prior_scale'],
                          seasonality_prior_scale = p['seasonality_prior_scale'],
                          yearly_seasonality = False,
                          weekly_seasonality = True,
                          daily_seasonality = False)
        
        print('He entrado', p)
        train_model.fit(train)
        p_pred_test_y = train_model.predict(test)
        RMSLE = np.round(np.sqrt(mean_squared_error(test[['y']].clip(0.0), p_pred_test_y[['ds', 'yhat']].set_index('ds').clip(0.0))), 4)
        model_parameters = model_parameters.append({'RMSLE':RMSLE,'Parameters':p},ignore_index=True)
        
    parameters = model_parameters.sort_values(by=['RMSLE'])
    parameters = parameters.reset_index(drop=True)
    return parameters['Parameters'][0]

In [287]:
def HT (data, end_df, n):
    
    df = data.loc[:end_df,:].reset_index().set_index(['store_nbr', 'family', 'date']).sort_index()
    y = np.log1p(df.loc[:,'sales'].unstack(['store_nbr', 'family']))
    
    # Selecting features
    X = df.drop(columns = ['id','sales','trend','transactions', 'onpromotion', 'store_B', 'store_C', 'store_D', 'store_E'])
    df2 = X.groupby(by='date').first()
        
    # Train
    if end_df <= date['date_end_train']:
        y_tr = np.empty((3036,0))
        y_te = np.empty((528,0))
        pred_train_y = np.empty((3036,0))
        pred_test_y = np.empty((528,0))
    # Test
    else:
        y_tr = np.empty((3564,0))
        y_te = np.empty((528,0))
        pred_train_y = np.empty((3564,0))
        pred_test_y = np.empty((528,0))
        
    par = []
    
    # A model for each shop
    for i in data.store_nbr.unique():
        y = df.loc[i,'sales'].unstack(['family'])
        X = df.loc[i, df2.columns]
        X = X.groupby(by='date').first()

        # Splitting train and test and log transformation
        X_train, y_train, X_test, y_test = split_func(y, X, np.log1p(y), end_df, n)
        
        y_train = y_train.stack(['family']).to_frame()
        if end_df > date['date_end_train']:
            y_test = y_test.fillna(0).stack(['family']).to_frame()
        y_test = y_test.stack(['family']).to_frame()

        train = y_train.join(X_train.reindex(y_train.index, level=0))
        train = train.reset_index()
        train = train.rename(columns={'date': 'ds', 0: 'y'})

        test = y_test.join(X_test.reindex(y_test.index, level=0))
        test = test.reset_index()
        test = test.rename(columns={'date': 'ds', 0: 'y'})
        if end_df > date['date_end_train']:
            test['y'] = np.nan

        y_train = y_train.reset_index()
        y_test = y_test.reset_index()
        y_train = y_train[['date',0]].rename(columns={'date': 'ds', 0: 'y'})
        y_test = y_test[['date',0]].rename(columns={'date': 'ds', 0: 'y'})

        params = tuning(train, test, y_test, X_test)
        par.append(params)
        model = Prophet(changepoint_prior_scale=params['changepoint_prior_scale'], 
                     seasonality_prior_scale=params['seasonality_prior_scale'], 
                     holidays_prior_scale=params['holidays_prior_scale'],
                     yearly_seasonality = False,
                          weekly_seasonality = True,
                          daily_seasonality = False)
        model.fit(train)
        print('He salido')
        p_pred_train_y = model.predict(train) 
        p_pred_test_y = model.predict(test)
        
        y_tr = np.append(y_tr, train[['y']], axis=1)
        y_te = np.append(y_te, test[['y']], axis=1)
        pred_train_y = np.append(pred_train_y, p_pred_train_y[['yhat']], axis=1)
        pred_test_y = np.append(pred_test_y, p_pred_test_y[['yhat']], axis=1)
        
        # Performances of each shop
        if end_df <= date['date_end_train']:
            print(f'RMSLE_train {i}: ', np.round(np.sqrt(mean_squared_error(train[['y']].clip(0.0), p_pred_train_y[['ds', 'yhat']].set_index('ds').clip(0.0))), 4), f'RMSLE_test {i}: ', np.round(np.sqrt(mean_squared_error(test[['y']].clip(0.0), p_pred_test_y[['ds', 'yhat']].set_index('ds').clip(0.0))), 4))        
  

    # Total performances
    # Train
    if end_df <= date['date_end_train']:
        print(f'RMSLE_train tot: ', np.round(np.sqrt(mean_squared_error(y_tr.clip(0.0), pred_train_y.clip(0.0))), 4), f'RMSLE_test tot: ', np.round(np.sqrt(mean_squared_error(y_te.clip(0.0), pred_test_y.clip(0.0))), 4))
   
    
    return pred_test_y, y_te, par

In [457]:
model_pred_test_y, final_df, par = HT(final_df, date['date_end_train'], 16) #añadir model_pred_tot_y

  # This is added back by InteractiveShellApp.init_path()


He entrado {'changepoint_prior_scale': 0.001, 'holidays_prior_scale': 0.01, 'seasonality_prior_scale': 0.01}
He entrado {'changepoint_prior_scale': 0.001, 'holidays_prior_scale': 0.01, 'seasonality_prior_scale': 1}
He entrado {'changepoint_prior_scale': 0.001, 'holidays_prior_scale': 0.01, 'seasonality_prior_scale': 5}
He entrado {'changepoint_prior_scale': 0.001, 'holidays_prior_scale': 0.01, 'seasonality_prior_scale': 10}
He entrado {'changepoint_prior_scale': 0.001, 'holidays_prior_scale': 0.01, 'seasonality_prior_scale': 12}
He entrado {'changepoint_prior_scale': 0.001, 'holidays_prior_scale': 1, 'seasonality_prior_scale': 0.01}
He entrado {'changepoint_prior_scale': 0.001, 'holidays_prior_scale': 1, 'seasonality_prior_scale': 1}
He entrado {'changepoint_prior_scale': 0.001, 'holidays_prior_scale': 1, 'seasonality_prior_scale': 5}
He entrado {'changepoint_prior_scale': 0.001, 'holidays_prior_scale': 1, 'seasonality_prior_scale': 10}
He entrado {'changepoint_prior_scale': 0.001, 'ho

In [289]:
par

[{'changepoint_prior_scale': 0.5,
  'holidays_prior_scale': 12,
  'seasonality_prior_scale': 12},
 {'changepoint_prior_scale': 0.05,
  'holidays_prior_scale': 10,
  'seasonality_prior_scale': 0.01},
 {'changepoint_prior_scale': 0.5,
  'holidays_prior_scale': 1,
  'seasonality_prior_scale': 0.01},
 {'changepoint_prior_scale': 0.05,
  'holidays_prior_scale': 0.01,
  'seasonality_prior_scale': 0.01},
 {'changepoint_prior_scale': 0.05,
  'holidays_prior_scale': 10,
  'seasonality_prior_scale': 5},
 {'changepoint_prior_scale': 0.08,
  'holidays_prior_scale': 0.01,
  'seasonality_prior_scale': 0.01},
 {'changepoint_prior_scale': 0.08,
  'holidays_prior_scale': 0.01,
  'seasonality_prior_scale': 0.01},
 {'changepoint_prior_scale': 0.5,
  'holidays_prior_scale': 1,
  'seasonality_prior_scale': 0.01},
 {'changepoint_prior_scale': 0.5,
  'holidays_prior_scale': 10,
  'seasonality_prior_scale': 0.01},
 {'changepoint_prior_scale': 0.08,
  'holidays_prior_scale': 12,
  'seasonality_prior_scale': 0.

In [291]:
from collections import Counter
date_counts = Counter(d['changepoint_prior_scale'] for d in par)
most_common = {'changepoint_prior_scale': date_counts.most_common(1)[0][0]}
most_common

{'changepoint_prior_scale': 0.5}

In [292]:
from collections import Counter
date_counts = Counter(d['holidays_prior_scale'] for d in par)
most_common = {'holidays_prior_scale': date_counts.most_common(1)[0][0]}
most_common

{'holidays_prior_scale': 0.01}

In [293]:
from collections import Counter
date_counts = Counter(d['seasonality_prior_scale'] for d in par)
most_common = {'seasonality_prior_scale': date_counts.most_common(1)[0][0]}
most_common

{'seasonality_prior_scale': 0.01}