### This script rounds up all the split result files and merges them together into a final submission file.

In [None]:
# imports
import numpy as np
import pandas as pd
import glob

In [2]:
# Base data folder (the test and sample-submission CSVs are read from here)
workDir = 'D:\\project\\data\\kg_corpgroc\\'
# Folder holding the completed per-split result files; it lives under workDir
eportDir = workDir + 'export\\completed\\'

In [3]:
# Load the test file and the sample submission file (in that order) from workDir
testPD, submissionPD = (
    pd.read_csv(workDir + csvName)
    for csvName in ('test.csv', 'sample_submission.csv')
)

In [4]:
# Collect every per-split result file from the export directory.
# glob returns matches in a filesystem-dependent order, so sort them to make
# the row order of the concatenated frame reproducible from run to run.
filenames = sorted(glob.glob(eportDir + '/*results*'))

# Fail early with a message that names the directory, instead of letting
# pd.concat raise its generic "No objects to concatenate" ValueError.
if not filenames:
    raise ValueError('No files matching *results* found in ' + eportDir)

# One DataFrame per result file
dfs = [pd.read_csv(filename) for filename in filenames]

# Concatenate all data into one DataFrame
big_frame = pd.concat(dfs, ignore_index=True)

In [6]:
# expecting 3,370,464 rows
#testPD.head()

In [7]:
#submissionPD.head()

In [8]:
# Index both frames on (store_nbr, date, item_nbr).
# Author's note: the index was not strictly needed (item_nbr had originally been
#  missed on the join), but it is kept because it has perf benefits on the join.
for frame in (testPD, big_frame):
    frame.set_index(['store_nbr','date','item_nbr'], inplace=True)

In [9]:
# Left join with testPD as the main table, matching the two frames on their indexes
joinDF = testPD.merge(big_frame, how='left', left_index=True, right_index=True)

In [11]:
# Function to set any negative value to 0, as negatives are not allowed for submission
def fn_set_negative_to_zero(col):
    """Return 0.0 when ``col`` is below zero, otherwise return ``col`` unchanged."""
    return 0.0 if col < 0 else col

In [12]:
# Function to write the submission file
def fn_write_submission_file(df, colNames, fileName):
    """Write a two-column ``id,unit_sales`` submission CSV.

    The forecast column can carry different names in ``df`` (e.g. a log
    version or a moving-average value), so it is renamed to the standard
    ``unit_sales`` header before writing.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``id`` column and the forecast column. It is not
        modified.
    colNames : sequence of str
        The second element names the forecast column in ``df``. The first
        element is not used; the id column is always read from ``'id'``.
    fileName : str
        Path of the CSV file to write.
    """
    forecastCol = colNames[1]
    print(forecastCol)

    # Select only the two output columns before renaming. This avoids copying
    # the whole frame, and an unrelated pre-existing 'unit_sales' column in df
    # can no longer end up as a duplicate column in the written file.
    tempDF = df[['id', forecastCol]].rename(columns={forecastCol: 'unit_sales'})

    tempDF.to_csv(fileName, header=True, index=False, quotechar='"')
    print('Wrote file: ' + fileName)

In [13]:
# Keep only the columns needed for the submission and expose the merge-suffixed
# 'id_x' column as plain 'id'. Selecting the columns first and then calling
# .rename() yields a fresh frame without copying every column of joinDF.
# (The index is left as it is; it is not written to the submission file.)
dfFinal = joinDF[['id_x','forecast','forecast_rnd']].rename(columns={'id_x': 'id'})

# Set any nan to 0.  Some of the test items do not have entries in each store, so they would error/skip during
#  generation. They are set to 0 here instead of trying to model/guess a new item being added that never existed
#  or has not been stocked in the past 365 days.
# Then set any negative to 0, as negatives are not allowed for submission. clip() works on the whole column
#  at once, replacing the much slower row-by-row apply.
dfFinal['forecast'] = dfFinal['forecast'].fillna(0).clip(lower=0)
dfFinal['forecast_rnd'] = dfFinal['forecast_rnd'].fillna(0).clip(lower=0)

In [14]:
#dfFinal.head()

In [15]:
import datetime as dt

# Date stamp (YYYYMMDD) used as the prefix of both output file names
todayDate = dt.date.today().strftime('%Y%m%d')
exportDir = 'D:\\project\\data\\kg_corpgroc\\'

# Write the float forecast submission first, then the rounded forecast submission
for columns, suffix in (
    (('id','forecast'), '_subm_frcst_flt.csv'),
    (('id','forecast_rnd'), '_subm_frcst_rnd.csv'),
):
    fn_write_submission_file(dfFinal, columns, exportDir + todayDate + suffix)

forecast
Wrote file: D:\project\data\kg_corpgroc\20180105_subm_frcst_flt.csv
forecast_rnd
Wrote file: D:\project\data\kg_corpgroc\20180105_subm_frcst_rnd.csv
