In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from statsmodels.tsa.deterministic import DeterministicProcess, CalendarFourier

from sklearn.model_selection import train_test_split, cross_val_score, TimeSeriesSplit, GridSearchCV, RandomizedSearchCV, cross_validate
from sklearn.metrics import mean_squared_error, make_scorer, mean_squared_log_error, mean_absolute_error, mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor
#from category_encoders import TargetEncoder
from sklearn.feature_selection import SelectFromModel

In [2]:
# Key dates of the experiment, stored as 'YYYY-MM-DD' strings.
# The test period starts on '2017-08-16', the day after 'date_end_train'.
date = {
    'date_start_train': '2017-04-30',
    'date_end_train': '2017-08-15',
    'date_end_test': '2017-08-31',
}

# Number of whole days between the start of training and the end of the
# test / train period respectively.
diff_test, diff_train = (
    (pd.Timestamp(date[end_key]) - pd.Timestamp(date['date_start_train'])).days
    for end_key in ('date_end_test', 'date_end_train')
)

In [3]:
# Load the prepared feature table and index it by its parsed 'date' column.
final_df = (
    pd.read_csv('df_RF.csv', parse_dates=['date'])
    .set_index('date')
)

# Random Forest (one model per store)

In [4]:
def split_func (data, X, y, end_date, test_size):
    """Split features and targets into train and hold-out parts without shuffling.

    Parameters
    ----------
    data : pandas object
        Only its ``index`` is used; the index labels are what gets split.
    X : pandas.DataFrame
        Feature table, selected by label with the split index.
    y : pandas.DataFrame or pandas.Series
        Targets, selected by label with the split index.
    end_date : any
        Unused; kept so existing callers keep working.
    test_size : int or float
        Forwarded to ``train_test_split`` as ``test_size``.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``.
    """
    # shuffle=False keeps the original ordering of the index labels.
    train_labels, test_labels = train_test_split(
        data.index, test_size=test_size, shuffle=False
    )

    X_train = X.loc[train_labels, :]
    X_test = X.loc[test_labels, :]
    y_train = y.loc[train_labels]
    y_test = y.loc[test_labels]

    return X_train, y_train, X_test, y_test

def create_sample_weights(X, target_date, weight=0.9):
    """Build per-row sample weights that down-weight older observations.

    Rows whose 'date' index value is strictly after ``target_date`` receive a
    weight of 1; every other row receives ``exp(-weight)``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose index has a level named 'date'.
    target_date : str or datetime-like
        Cut-off that is compared against the 'date' level.
    weight : float, default 0.9
        Decay exponent applied to rows on or before ``target_date``.

    Returns
    -------
    numpy.ndarray
        1-D float array with one weight per row of ``X``.
    """
    is_recent = np.asarray(X.index.get_level_values('date') > target_date, dtype=bool)
    # Vectorised selection; float operands keep the result float even when
    # every row is recent (the element-wise version returned ints then).
    return np.where(is_recent, 1.0, np.exp(-weight))

In [5]:
def train_test (data, end_df, n):
    """Fit one multi-output random forest per store and predict the hold-out rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame indexed by 'date' that contains 'store_nbr', 'family', 'sales'
        and the feature columns listed in ``feature_cols`` below.
    end_df : str
        Last date (inclusive) of the slice of ``data`` that is used. It is also
        compared with ``date['date_end_train']`` to decide whether scores are
        printed.
    n : int or float
        Size of the hold-out, forwarded to ``split_func`` as ``test_size``.

    Returns
    -------
    pred_test, y_te : pandas.Series
        Predicted and actual ``log1p(sales)`` for the hold-out rows, indexed by
        (date, store_nbr, family).

    Raises
    ------
    ValueError
        If ``data`` contains no store.
    """
    df = data.loc[:end_df,:].reset_index().set_index(['store_nbr', 'family', 'date']).sort_index()

    # Selecting features
    # We select the best features (done in feature selection)
    feature_cols = ['sin(2,freq=A-DEC)','sin(2,freq=W-SUN)','lagoil_2_dcoilwtico','lagoil_6_dcoilwtico',
                    'lagoil_7_dcoilwtico','isweekend','oil_2_month_avg','trend','lagoil_10_dcoilwtico',
                    'sin(1,freq=A-DEC)','lagoil_1_dcoilwtico','sin(4,freq=W-SUN)','cos(1,freq=W-SUN)',
                    'dcoilwtico','sin(4,freq=A-DEC)','oil_1_month_avg','lagoil_14_dcoilwtico',
                    'sin(5,freq=A-DEC)','sin(1,freq=M)','cos(2,freq=M)','day','cos(1,freq=M)','sin(2,freq=M)',
                    'sin(1,freq=W-SUN)','onpromotion_std_store', 'onpromotion_avg_store', 'onpromotion_biweek_avg',
                    'onpromotion_lag_3'
                    ]

    # Scores are printed only when the slice ends within the training period.
    report_scores = end_df <= date['date_end_train']

    def _rmse(actual, predicted):
        # Targets are already log1p-transformed, so this RMSE is the RMSLE.
        return np.round(np.sqrt(mean_squared_error(actual.clip(0.0), predicted.clip(0.0))), 4)

    stores = data.store_nbr.unique()
    if len(stores) == 0:
        raise ValueError('train_test needs at least one store in `data`')

    # Per-store blocks are collected and concatenated once after the loop, so
    # no row count has to be hard-coded and nothing is re-copied per store.
    y_tr_parts, y_te_parts = [], []
    pred_train_parts, pred_test_parts = [], []

    # A model for each shop
    for i in stores:
        y = df.loc[i,'sales'].unstack(['family'])
        # Features are repeated for every family of a store; keep one row per date.
        X = df.loc[i, feature_cols].groupby(by='date').first()

        # Splitting train and test and log transformation
        X_train, y_train, X_test, y_test = split_func(y, X, np.log1p(y), end_df, n)

        # Exponentially weighted cost function
        weights = create_sample_weights(X_train, '2017-07-01')

        # RandomForestRegressor
        # max_features=1.0 uses all features, which is what 'auto' meant for
        # regressors before that option was removed from scikit-learn.
        # Previously tried: bootstrap=False, max_depth=90, max_features='sqrt',
        # min_samples_leaf=4, min_samples_split=10, n_estimators=600
        model = RandomForestRegressor(n_estimators=1200, max_depth=50, max_features=1.0, bootstrap=True,
                                      min_samples_leaf=2, min_samples_split=2, random_state=0)
        model.fit(X_train, y_train, sample_weight=weights)
        # reshape keeps the predictions 2-D even if a store has a single family.
        rf_pred_train = model.predict(X_train).reshape(len(X_train), -1)
        rf_pred_test = model.predict(X_test).reshape(len(X_test), -1)

        y_tr_parts.append(np.asarray(y_train))
        y_te_parts.append(np.asarray(y_test))
        pred_train_parts.append(rf_pred_train)
        pred_test_parts.append(rf_pred_test)

        # Performances of each shop
        if report_scores:
            print(f'RMSLE_train {i}: ', _rmse(y_train, rf_pred_train), f'RMSLE_test {i}: ', _rmse(y_test, rf_pred_test))

    index = pd.MultiIndex.from_product([stores, data.family.sort_values().unique()], names=['store_nbr', 'family'])

    # The date index of the last store is reused for all stores.
    y_tr = pd.DataFrame(np.concatenate(y_tr_parts, axis=1), columns=index, index=X_train.index)
    y_te = pd.DataFrame(np.concatenate(y_te_parts, axis=1), columns=index, index=X_test.index)
    pred_train = pd.DataFrame(np.concatenate(pred_train_parts, axis=1), columns=y_tr.columns, index=y_tr.index)
    pred_test = pd.DataFrame(np.concatenate(pred_test_parts, axis=1), columns=y_te.columns, index=y_te.index)

    # Total performances
    if report_scores:
        print(f'RMSLE_train tot: ', _rmse(y_tr, pred_train), f'RMSLE_test tot: ', _rmse(y_te, pred_test))

    # Back to long format: one row per (date, store_nbr, family).
    y_te = y_te.stack(['store_nbr', 'family'])
    pred_test = pred_test.stack(['store_nbr', 'family'])

    return pred_test, y_te

In [1]:
# Validation run: use data up to the end of the training period and hold out
# the last 16 rows of the split index.
pred_test, y_te = train_test(data=final_df, end_df=date['date_end_train'], n=16)

In [8]:
# Flatten both series so their index levels become ordinary columns.
y_pred, y = pred_test.reset_index(), y_te.reset_index()

In [10]:
# Actual vs. predicted values side by side, indexed by date.
# The table is rebuilt from `y_te` on every run, so the cell can be re-executed
# safely (the in-place version failed on `set_index('date')` the second time).
y = (
    y_te.reset_index()
    .rename(columns={0: 'sales'})
    .assign(sales_pred=y_pred[0].clip(0.))  # negative predictions are clipped to 0
    .set_index('date')
)
y

Unnamed: 0_level_0,store_nbr,family,sales,sales_pred
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-07-31,1,AUTOMOTIVE,2.197225,1.506556
2017-07-31,1,BABY CARE,0.000000,0.000000
2017-07-31,1,BEAUTY,1.386294,1.367747
2017-07-31,1,BEVERAGES,7.789455,7.705845
2017-07-31,1,BOOKS,0.693147,0.166759
...,...,...,...,...
2017-08-15,54,POULTRY,4.104608,4.228193
2017-08-15,54,PREPARED FOODS,4.553877,4.511984
2017-08-15,54,PRODUCE,6.820421,6.591136
2017-08-15,54,SCHOOL AND OFFICE SUPPLIES,0.000000,0.118667


In [11]:
# y.to_csv('RF.csv', index=True)

In [99]:
# Final run: use data up to the end of the test period and predict its last
# 16 rows.
pred_test, y_te = train_test(data=final_df, end_df=date['date_end_test'], n=16)

  # This is added back by InteractiveShellApp.init_path()


In [None]:
# Undo the log1p transform of the targets; expm1 is its exact inverse and
# avoids the rounding error of exp(x) - 1 for predictions close to zero.
y = pd.Series(np.expm1(pred_test.values), index=pred_test.index)
# NOTE(review): `final_df2` is not defined in any visible cell, so this line
# fails on Restart & Run All. Presumably it is the frame that carries the
# submission 'id' column; confirm and load it explicitly.
tg = final_df2.reset_index().set_index(['date', 'store_nbr', 'family']).sort_index().loc['2017-08-16':, 'id']

In [None]:
# Put the ids and the predicted sales side by side, then set every prediction
# below 0.01 to 0.
sub = (
    pd.concat([tg, y], axis=1)
    .rename(columns={0: 'sales'})
    .assign(sales=lambda frame: frame['sales'].apply(lambda x: 0 if x<0.01 else x))
)
sub

In [None]:
# sub.to_csv('submission.csv', index=False)

# HYPERPARAMETER TUNING

In [None]:
def HT (data, end_df, n):
    """Run a randomized hyper-parameter search for one random forest per store.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame indexed by 'date' that contains 'store_nbr', 'family', 'sales'
        and the feature columns listed in ``feature_cols`` below.
    end_df : str
        Last date (inclusive) of the slice of ``data`` that is used. It is also
        compared with ``date['date_end_train']`` to decide whether scores are
        printed.
    n : int or float
        Size of the hold-out, forwarded to ``split_func`` as ``test_size``.

    Returns
    -------
    pred_test, y_te : pandas.Series
        Predicted and actual ``log1p(sales)`` for the hold-out rows, indexed by
        (date, store_nbr, family).
    params : list of dict
        Best parameter set found for each store, in store order.
    random_search : RandomizedSearchCV
        The fitted search object of the last store.

    Raises
    ------
    ValueError
        If ``data`` contains no store.
    """
    df = data.loc[:end_df,:].reset_index().set_index(['store_nbr', 'family', 'date']).sort_index()

    # Selecting features
    # NOTE(review): unlike train_test, this list has no 'onpromotion_lag_3';
    # confirm whether the tuning is meant to run on the same feature set.
    feature_cols = ['sin(2,freq=A-DEC)','sin(2,freq=W-SUN)','lagoil_2_dcoilwtico','lagoil_6_dcoilwtico',
                    'lagoil_7_dcoilwtico','isweekend','oil_2_month_avg','trend','lagoil_10_dcoilwtico',
                    'sin(1,freq=A-DEC)','lagoil_1_dcoilwtico','sin(4,freq=W-SUN)','cos(1,freq=W-SUN)',
                    'dcoilwtico','sin(4,freq=A-DEC)','oil_1_month_avg','lagoil_14_dcoilwtico',
                    'sin(5,freq=A-DEC)','sin(1,freq=M)','cos(2,freq=M)','day','cos(1,freq=M)','sin(2,freq=M)',
                    'sin(1,freq=W-SUN)','onpromotion_std_store', 'onpromotion_avg_store', 'onpromotion_biweek_avg'
                    ]

    # Scores are printed only when the slice ends within the training period.
    report_scores = end_df <= date['date_end_train']

    def _rmse(actual, predicted):
        # Targets are already log1p-transformed, so this RMSE is the RMSLE.
        return np.round(np.sqrt(mean_squared_error(actual.clip(0.0), predicted.clip(0.0))), 4)

    # Search space; it does not depend on the store, so it is built once.
    random_grid = {
        # Number of trees in random forest
        'n_estimators': [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)],
        # Number of features to consider at every split; 1.0 (all features)
        # replaces 'auto', which meant the same for regressors and has been
        # removed from scikit-learn.
        'max_features': [1.0, 'sqrt'],
        # Maximum number of levels in tree (None = unlimited)
        'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)] + [None],
        # Minimum number of samples required to split a node
        'min_samples_split': [2, 5, 10],
        # Minimum number of samples required at each leaf node
        'min_samples_leaf': [1, 2, 4],
        # Method of selecting samples for training each tree
        'bootstrap': [True, False],
    }

    stores = data.store_nbr.unique()
    if len(stores) == 0:
        raise ValueError('HT needs at least one store in `data`')

    # Per-store blocks are collected and concatenated once after the loop.
    # This removes the pre-sized buffers, whose height (diff_test - n) was one
    # row short of the actual train split in the test branch.
    y_tr_parts, y_te_parts = [], []
    pred_train_parts, pred_test_parts = [], []
    params = []

    # A model for each shop
    for i in stores:
        y = df.loc[i,'sales'].unstack(['family'])
        # Features are repeated for every family of a store; keep one row per date.
        X = df.loc[i, feature_cols].groupby(by='date').first()

        # Splitting train and test and log transformation
        X_train, y_train, X_test, y_test = split_func(y, X, np.log1p(y), end_df, n)

        # Exponentially weighted cost function
        weights = create_sample_weights(X_train, '2017-07-01')

        # Random Forest
        # NOTE(review): cv=3 is a plain, unshuffled K-fold; for time-ordered
        # data a TimeSeriesSplit may be more appropriate. Confirm before use.
        model = RandomForestRegressor(random_state=0)
        random_search = RandomizedSearchCV(model, param_distributions=random_grid, scoring='neg_mean_absolute_error', cv=3, random_state=42, n_jobs=-1)
        random_search.fit(X_train, y_train, sample_weight = weights)
        params.append(random_search.best_params_)
        # reshape keeps the predictions 2-D even if a store has a single family.
        rf_pred_train = random_search.predict(X_train).reshape(len(X_train), -1)
        rf_pred_test = random_search.predict(X_test).reshape(len(X_test), -1)

        y_tr_parts.append(np.asarray(y_train))
        y_te_parts.append(np.asarray(y_test))
        pred_train_parts.append(rf_pred_train)
        pred_test_parts.append(rf_pred_test)

        # Performances of each shop
        if report_scores:
            print(f'RMSLE_train {i}: ', _rmse(y_train, rf_pred_train), f'RMSLE_test {i}: ', _rmse(y_test, rf_pred_test))

    index = pd.MultiIndex.from_product([stores, data.family.sort_values().unique()], names=['store_nbr', 'family'])

    # The date index of the last store is reused for all stores.
    y_tr = pd.DataFrame(np.concatenate(y_tr_parts, axis=1), columns=index, index=X_train.index)
    y_te = pd.DataFrame(np.concatenate(y_te_parts, axis=1), columns=index, index=X_test.index)
    pred_train = pd.DataFrame(np.concatenate(pred_train_parts, axis=1), columns=y_tr.columns, index=y_tr.index)
    pred_test = pd.DataFrame(np.concatenate(pred_test_parts, axis=1), columns=y_te.columns, index=y_te.index)

    # Total performances
    if report_scores:
        print(f'RMSLE_train tot: ', _rmse(y_tr, pred_train), f'RMSLE_test tot: ', _rmse(y_te, pred_test))

    # Back to long format: one row per (date, store_nbr, family).
    y_te = y_te.stack(['store_nbr', 'family'])
    pred_test = pred_test.stack(['store_nbr', 'family'])

    return  pred_test, y_te, params, random_search

In [2]:
# pred_test, y_te, params, random_search = HT(final_df, date['date_end_train'], 16)

In [224]:
# date_counts = Counter(d['n_estimators'] for d in params)
# most_common = {'n_estimators': date_counts.most_common(1)[0][0]}
# print(date_counts)
# most_common

In [227]:
# date_counts = Counter(d['min_samples_split'] for d in params)
# most_common = {'min_samples_split': date_counts.most_common(1)[0][0]}
# print(date_counts)
# most_common

{'min_samples_split': 2}

In [228]:
# date_counts = Counter(d['min_samples_leaf'] for d in params)
# most_common = {'min_samples_leaf': date_counts.most_common(1)[0][0]}
# print(date_counts)
# most_common

{'min_samples_leaf': 4}

In [229]:
# date_counts = Counter(d['max_features'] for d in params)
# most_common = {'max_features': date_counts.most_common(1)[0][0]}
# print(date_counts)
# most_common

{'max_features': 'auto'}

In [230]:
# date_counts = Counter(d['max_depth'] for d in params)
# most_common = {'max_depth': date_counts.most_common(1)[0][0]}
# print(date_counts)
# most_common

{'max_depth': 100}

In [231]:
# date_counts = Counter(d['bootstrap'] for d in params)
# most_common = {'bootstrap': date_counts.most_common(1)[0][0]}
# print(date_counts)
# most_common

{'bootstrap': True}