In [1]:
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xg  # provides xg.XGBRegressor used by train_test

from statsmodels.tsa.deterministic import DeterministicProcess, CalendarFourier

from sklearn.model_selection import train_test_split, cross_val_score, TimeSeriesSplit, GridSearchCV, RandomizedSearchCV, cross_validate
from sklearn.metrics import mean_squared_error, make_scorer, mean_squared_log_error, mean_absolute_error, mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
#from category_encoders import TargetEncoder
from sklearn.feature_selection import SelectFromModel

In [2]:
# Boundaries of the modelling window, kept as ISO date strings.
# The test period starts on '2017-08-16', the day after training ends.
date = {
    'date_start_train': '2017-04-30',
    'date_end_train': '2017-08-15',
    'date_end_test': '2017-08-31',
}

# Whole-day spans from the first training date to the end of the test
# period and to the end of the training period, respectively.
diff_test, diff_train = (
    (pd.Timestamp(date[end_key]) - pd.Timestamp(date['date_start_train'])).days
    for end_key in ('date_end_test', 'date_end_train')
)

In [3]:
# Load the prepared feature table, parse its 'date' column as datetimes
# and use that column as the index.
final_df = (
    pd.read_csv('df_RF.csv', parse_dates=['date'])
    .set_index('date')
)

# Stacking

In [4]:
def split_func(data, X, y, end_date, test_size):
    """Split features and targets into leading (train) and trailing (test) parts.

    The labels of ``data.index`` are divided in their existing order
    (``shuffle=False``), so the test part is always the tail of the index.

    Parameters
    ----------
    data : object exposing ``.index``; its labels define the split.
    X : label-indexed features, sliced with ``.loc[labels, :]``.
    y : label-indexed targets, sliced with ``.loc[labels]``.
    end_date : not used by this function; kept so existing calls keep working.
    test_size : forwarded unchanged to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``.
    """
    train_labels, test_labels = train_test_split(
        data.index, test_size=test_size, shuffle=False
    )

    features_train = X.loc[train_labels, :]
    features_test = X.loc[test_labels, :]
    target_train = y.loc[train_labels]
    target_test = y.loc[test_labels]

    return features_train, target_train, features_test, target_test

def create_sample_weights(X, target_date, weight=0.9):
    """Return per-row sample weights that down-weight older observations.

    Rows whose ``'date'`` index level is strictly after ``target_date`` get
    weight ``1.0``; every other row gets ``exp(-weight)``.

    Parameters
    ----------
    X : pandas object whose index has a level named ``'date'``.
    target_date : value comparable with that level (e.g. an ISO date string).
    weight : decay exponent applied to rows on or before ``target_date``.

    Returns
    -------
    numpy.ndarray
        One float64 weight per row of ``X``, in row order.
    """
    is_recent = np.asarray(X.index.get_level_values('date') > target_date)
    # Vectorised selection; always float64, even when every row is recent.
    return np.where(is_recent, 1.0, np.exp(-weight))

In [5]:
def _fit_stacked_store(X_train, y_train, X_test, sample_weight):
    """Fit the stacked ensemble for one store and predict its test rows.

    Three base learners (XGBoost, random forest, linear regression) are fitted
    on ``(X_train, y_train)``. Their predictions are concatenated column-wise
    and fed to a random-forest meta-learner, which is fitted with
    ``sample_weight`` and then used to predict the test rows.

    Returns
    -------
    numpy.ndarray
        Meta-learner predictions for ``X_test``.
    """
    base_learners = [
        xg.XGBRegressor(n_estimators=500, learning_rate=0.01, max_depth=3, subsample=0.5,
                        colsample_bytree=0.6, colsample_bylevel=1, random_state=0),
        # max_features=1.0 (use every feature) is what the 'auto' alias meant for
        # regressors; 'auto' itself was removed in scikit-learn 1.3.
        RandomForestRegressor(n_estimators=1200, max_depth=50, max_features=1.0, bootstrap=True,
                              min_samples_leaf=2, min_samples_split=2, random_state=0),
        LinearRegression(),
    ]
    for learner in base_learners:
        learner.fit(X_train, y_train)

    # NOTE(review): the meta-learner is trained on in-sample base predictions,
    # not out-of-fold ones — confirm this is intended.
    train_stack = np.column_stack([learner.predict(X_train) for learner in base_learners])
    test_stack = np.column_stack([learner.predict(X_test) for learner in base_learners])

    meta_learner = RandomForestRegressor(n_estimators=1200, max_depth=50, max_features=1.0, bootstrap=True,
                                         min_samples_leaf=2, min_samples_split=2, random_state=0)
    meta_learner.fit(train_stack, y_train, sample_weight=sample_weight)
    return meta_learner.predict(test_stack)


def train_test(data, end_df, n, recent_from='2017-07-01'):
    """Train one stacked model per store and predict the trailing test window.

    For every store in ``data.store_nbr`` the sales are pivoted to one column
    per family, log1p-transformed, split chronologically with ``split_func``
    and modelled with ``_fit_stacked_store``.

    Parameters
    ----------
    data : DataFrame sliceable by ``.loc[:end_df]`` whose index becomes a
        ``'date'`` column on ``reset_index()``, with columns ``store_nbr``,
        ``family``, ``sales`` and the feature columns listed below.
    end_df : last index label to include (label-based slice upper bound).
    n : ``test_size`` forwarded to ``train_test_split`` via ``split_func``.
    recent_from : rows dated after this value keep full weight when fitting
        the meta-learner; older rows are down-weighted by
        ``create_sample_weights``. Defaults to the previously hard-coded date.

    Returns
    -------
    tuple
        ``(pred_test, y_te)``: predictions and held-out targets, both on the
        log1p scale, stacked over the ``store_nbr`` and ``family`` column levels.

    Raises
    ------
    ValueError
        If ``data`` contains no stores to model.
    """
    df = data.loc[:end_df, :].reset_index().set_index(['store_nbr', 'family', 'date']).sort_index()

    # Features kept by the earlier feature-selection step.
    features = ['sin(2,freq=A-DEC)','sin(2,freq=W-SUN)','lagoil_2_dcoilwtico','lagoil_6_dcoilwtico',
                'lagoil_7_dcoilwtico','isweekend','oil_2_month_avg','trend','lagoil_10_dcoilwtico',
                'sin(1,freq=A-DEC)','lagoil_1_dcoilwtico','sin(4,freq=W-SUN)','cos(1,freq=W-SUN)',
                'dcoilwtico','sin(4,freq=A-DEC)','oil_1_month_avg','lagoil_14_dcoilwtico',
                'sin(5,freq=A-DEC)','sin(1,freq=M)','cos(2,freq=M)','day','cos(1,freq=M)','sin(2,freq=M)',
                'sin(1,freq=W-SUN)','onpromotion_std_store', 'onpromotion_avg_store', 'onpromotion_biweek_avg',
                'onpromotion_lag_3'
                ]

    # Per-store results are collected in lists and concatenated once at the end,
    # so no row count has to be known (or hard-coded) in advance.
    target_blocks, prediction_blocks, column_keys = [], [], []
    test_index = None

    # A model for each shop
    for store in data.store_nbr.unique():
        y = df.loc[store, 'sales'].unstack(['family'])
        # The store slice has one row per (family, date); keep one feature row per date.
        X = df.loc[store, features].groupby(by='date').first()

        # Splitting train and test and log transformation
        X_train, y_train, X_test, y_test = split_func(y, X, np.log1p(y), end_df, n)

        # Exponentially weighted cost function
        weights = create_sample_weights(X_train, recent_from)

        store_pred = np.asarray(_fit_stacked_store(X_train, y_train, X_test, weights))
        if store_pred.ndim == 1:
            # Single-family stores yield a flat vector; keep it as one column.
            store_pred = store_pred[:, np.newaxis]

        target_blocks.append(y_test.to_numpy())
        prediction_blocks.append(store_pred)
        column_keys.extend((store, family) for family in y.columns)
        test_index = X_test.index

    if not prediction_blocks:
        raise ValueError("train_test: `data` contains no stores to model")

    # Columns follow the order in which the blocks were produced:
    # stores in order of appearance, families as pivoted for each store.
    index = pd.MultiIndex.from_tuples(column_keys, names=['store_nbr', 'family'])

    y_te = pd.DataFrame(np.hstack(target_blocks), columns=index, index=test_index)
    pred_test = pd.DataFrame(np.hstack(prediction_blocks), columns=index, index=test_index)

    y_te = y_te.stack(['store_nbr', 'family'])
    pred_test = pred_test.stack(['store_nbr', 'family'])

    return pred_test, y_te

In [1]:
# Validation run: use data up to the end of the training window and pass 16
# as the test size, so the trailing part of that window is held out.
pred_test, y_te = train_test(data=final_df, end_df=date['date_end_train'], n=16)

In [8]:
# Turn the index levels of both stacked results into ordinary columns.
y_pred, y = pred_test.reset_index(), y_te.reset_index()

In [1]:
# Build the comparison table straight from the model outputs so this cell is
# idempotent: it no longer mutates a `y` left behind by an earlier cell and
# can be re-run without failing on an already-consumed 'date' column.
y = (
    y_te.reset_index()
    .rename(columns={0: 'sales'})
    # Predictions are clipped at zero; rows are matched by position after reset_index().
    .assign(sales_pred=pred_test.reset_index()[0].clip(0.))
    .set_index('date')
)
y

In [11]:
# y.to_csv('Stacking.csv', index=True)

In [99]:
# Final run: extend the window to the end of the test period and again pass
# 16 as the test size, so the trailing part of the window is predicted.
pred_test, y_te = train_test(data=final_df, end_df=date['date_end_test'], n=16)


In [None]:
# Undo the log1p transform applied to the targets in train_test.
# np.expm1 is the exact inverse of np.log1p and avoids the precision loss of
# computing exp(x) - 1 for values close to zero.
y = pd.Series(np.expm1(pred_test.values), index=pred_test.index)
# NOTE(review): `final_df2` is not defined anywhere in the visible notebook —
# presumably the frame that still carries the submission `id` column. Load or
# define it explicitly before this cell so Restart & Run All works.
tg = final_df2.reset_index().set_index(['date', 'store_nbr', 'family']).sort_index().loc['2017-08-16':, 'id']

In [None]:
# Put the ids and the back-transformed predictions side by side; the unnamed
# prediction Series arrives as column 0 and is renamed to 'sales'.
sub = pd.concat([tg, y], axis=1).rename(columns={0: 'sales'})
# Snap near-zero predictions to exactly 0. `mask` is the vectorised form of
# the row-wise lambda and, like it, leaves NaN values untouched.
sub['sales'] = sub['sales'].mask(sub['sales'] < 0.01, 0)
sub

In [None]:
# sub.to_csv('submission.csv', index=False)