In [1]:
# generate cwt coeffs for the holiday and promo calendars
# for holidays, the mexican hat wavelet (mexh) of scale = 3 is used
# for promotions, the average of mexh at scales 3, 7, and 15 is used
# these two columns will be updated whenever a new promotion or holiday date is added to any store

# this code will also generate cwt coeffs to capture the seasonality
# per store by averaging the standardized yearly invoice/phone volume
# for short term seasonality, the mexh scales are 3, 7, and 15; the final cwt coeff is their average
# for long term seasonality, mexh of scale 20 is used
# in addition, the mean cwt over all stores within the same state is calculated for both short and long term seasonality
# to enhance the winter effect, the state-wise cwt and cwt_quarter coeffs are filtered by season of the year

# input file: invoice, phone actual

In [2]:
# import libraries
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import StandardScaler
import pywt
import warnings
warnings.filterwarnings("ignore")

In [3]:
# standardize invoice per store
def standardize_data_per_store(raw_df_yearly):
    """Z-score the ``actual`` column separately for every store.

    Parameters
    ----------
    raw_df_yearly : pandas.DataFrame
        Must contain the columns ``store_code`` and ``actual``.

    Returns
    -------
    pandas.DataFrame
        A copy of the input with a ``standard_actual`` column holding, for
        each store, the ``StandardScaler`` transform of that store's
        ``actual`` values. The input frame itself is not modified.
    """
    # Work on a copy: the caller hands in a ``.loc`` slice of a larger frame,
    # and writing into that slice is chained assignment (the resulting
    # SettingWithCopyWarning is hidden by the global warnings filter).
    df = raw_df_yearly.copy()
    scaler = StandardScaler()
    for store_code in df['store_code'].unique():
        store_rows = df['store_code'] == store_code
        values = np.array(df.loc[store_rows, 'actual']).reshape(-1, 1)
        # fit_transform returns an (n, 1) array; flatten it so that it is
        # assigned to the single target column as a 1-D vector.
        df.loc[store_rows, 'standard_actual'] = scaler.fit_transform(values).ravel()
    return df

# standardize per year
def standardize_data_per_year(raw_df):
    """Standardize ``actual`` per store within each calendar year.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Must contain a datetime ``date`` column plus the ``store_code`` and
        ``actual`` columns read by ``standardize_data_per_store``.

    Returns
    -------
    pandas.DataFrame
        A copy of the input with a ``year`` column (taken from ``date``) and
        a float ``standard_actual`` column. The input frame is not modified.
    """
    # Copy first: the caller passes a filtered slice of a bigger frame.
    df = raw_df.copy()
    df['year'] = df['date'].dt.year
    # Start from a float column; an integer 0 column would have to be upcast
    # when the (float) standardized values are written into it.
    df['standard_actual'] = 0.0
    for year in df['year'].unique():
        in_year = df['year'] == year
        scaled = standardize_data_per_store(df.loc[in_year])
        # Positional write-back: ``scaled`` has the same rows, in the same
        # order, as the ``in_year`` selection.
        df.loc[in_year, 'standard_actual'] = scaled['standard_actual'].values
    return df

# calculate cwt
def mexh_wavelet_standard_actual(scales,x,wavelet = 'mexh'):
    """Continuous wavelet transform of one store's ``avg_standard_actual`` series.

    Parameters
    ----------
    scales : int
        Exclusive upper bound of the scales; the transform is evaluated at
        the integer scales ``1 .. scales - 1``.
    x : pandas.DataFrame
        Rows of a single store with the columns ``avg_standard_actual``,
        ``store_code`` and ``date``.
    wavelet : str, default 'mexh'
        Wavelet name passed to ``pywt.cwt``.

    Returns
    -------
    pandas.DataFrame
        One row per input row and one coefficient column per scale, each
        column labelled with its scale value (so column ``3`` holds the
        scale-3 coefficients), plus ``store_code`` and a datetime ``date``.
    """
    scale_values = np.arange(1, scales)
    coeffs, _freqs = pywt.cwt(list(x['avg_standard_actual']), scale_values, wavelet=wavelet)
    # coeffs has one row per scale; transpose to one row per observation and
    # label the columns by scale. Without explicit labels the columns are
    # 0-based positions, so a lookup such as ``cwt[3]`` would silently return
    # scale 4 instead of scale 3.
    cwt = pd.DataFrame(np.transpose(coeffs), columns=scale_values)
    cwt['store_code'] = list(x['store_code'])
    cwt['date'] = pd.to_datetime(list(x['date']))
    return cwt

# calculate cwt for all stores
def wavelet_all_stores(raw_df):
    """Compute the short- and long-term seasonality features for every store.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Must contain ``store_code``, ``date``, ``season`` and
        ``avg_standard_actual``.

    Returns
    -------
    pandas.DataFrame
        Columns ``store_code``, ``date``, ``season``, ``cwt`` and
        ``cwt_mexh_quarter``, one row per (store, date). ``cwt`` is the mean
        of the coefficient columns labelled 3, 7 and 15 returned by
        ``mexh_wavelet_standard_actual``; ``cwt_mexh_quarter`` is the column
        labelled 20.

    Raises
    ------
    ValueError
        If ``raw_df`` contains no stores (nothing to concatenate).
    """
    df = raw_df
    # Transform each store on its own, then stack the results once.
    # (DataFrame.append was removed in pandas 2.0, and appending inside the
    # loop re-copied the accumulated frame on every iteration.)
    per_store = [
        mexh_wavelet_standard_actual(22, df.loc[df['store_code'] == store_code])
        for store_code in df['store_code'].unique()
    ]
    cwt = pd.concat(per_store)

    df = pd.merge(df, cwt, how='left', on=['store_code', 'date'])
    # sort by store and date, then keep a single row per (store, date)
    df = df.sort_values(by=['store_code', 'date'])
    df = df.drop_duplicates(['date', 'store_code'], keep='last')

    df = df.assign(
        cwt=(df[3] + df[7] + df[15]) / 3,
        cwt_mexh_quarter=df[20],
    )
    return df[['store_code','date','season','cwt','cwt_mexh_quarter']]

def mexh_wavelet_promo_holiday(x, colname, scales=18, wavelet='mexh'):
    """Wavelet feature for a daily promotion / holiday indicator series.

    Parameters
    ----------
    x : pandas.Series
        Indicator values in calendar order; its index is reused for the
        result.
    colname : str
        ``'promotion'`` selects the mean of the scale 3, 7 and 15
        coefficients; any other value selects the negated scale-3
        coefficients.
    scales : int, default 18
        Exclusive upper bound of the scales; the transform is evaluated at
        the integer scales ``1 .. scales - 1``.
    wavelet : str, default 'mexh'
        Wavelet name passed to ``pywt.cwt``.

    Returns
    -------
    pandas.Series
        Series named ``ave_cwt`` indexed like ``x``.
    """
    scale_values = np.arange(1, scales)
    coeffs, _freqs = pywt.cwt(list(x), scale_values, wavelet=wavelet)

    # One row per observation, one column per scale. The columns are labelled
    # with the scale value itself: with default positional labels ``cwt[3]``
    # would return scale 4 rather than the intended scale 3.
    cwt = pd.DataFrame(np.transpose(coeffs), index=x.index, columns=scale_values)

    if colname == 'promotion':
        ave_cwt = (cwt[3] + cwt[7] + cwt[15]) / 3
    else:
        ave_cwt = -cwt[3]
    return ave_cwt.rename('ave_cwt')


def get_promotion_cwt(promo_df, start_date = '2017-01-01', end_date = '2020-06-30'):
    """Build the daily promotion indicator and its wavelet feature.

    Parameters
    ----------
    promo_df : pandas.DataFrame
        Promotion calendar with a datetime ``date`` column.
    start_date, end_date : str
        Inclusive bounds of the daily calendar the result is expanded to.

    Returns
    -------
    pandas.DataFrame
        One row per calendar day, sorted by ``date``, carrying the columns of
        ``promo_df`` (missing values filled with 0), ``is_promo`` and
        ``promotion_cwt``.
    """
    calendar = pd.DataFrame(pd.date_range(start=start_date, end=end_date), columns=["date"])

    # Keep one row per promotion date. A date listed more than once would
    # otherwise be repeated in the right-merge below, which breaks the
    # one-sample-per-day spacing of the transformed series and multiplies
    # rows in any later merge on ``date``.
    promo_days = promo_df.drop_duplicates(subset=['date']).assign(is_promo=1)

    # expand to the full calendar; days without a promotion get is_promo = 0
    promo_df = pd.merge(promo_days, calendar, how='right', on='date').fillna(0)

    # cwt
    promo_df = promo_df.sort_values(by='date')
    promo_cwt = mexh_wavelet_promo_holiday(promo_df['is_promo'], 'promotion')
    # add to the calendar-expanded frame
    promo_df['promotion_cwt'] = list(promo_cwt)

    return promo_df
    
def get_holiday_cwt(holiday_df, start_date = '2017-01-01', end_date = '2020-06-30'):
    """Build the daily holiday indicator and its wavelet feature.

    Parameters
    ----------
    holiday_df : pandas.DataFrame
        Holiday calendar with a datetime ``date`` column.
    start_date, end_date : str
        Inclusive bounds of the daily calendar the result is expanded to.

    Returns
    -------
    pandas.DataFrame
        One row per calendar day, sorted by ``date``, carrying the columns of
        ``holiday_df`` (missing values filled with 0), ``is_holiday`` and
        ``holiday_cwt``.
    """
    calendar = pd.DataFrame(pd.date_range(start=start_date, end=end_date), columns=["date"])

    # Keep one row per holiday date. A date listed more than once would
    # otherwise be repeated in the right-merge below, which breaks the
    # one-sample-per-day spacing of the transformed series and multiplies
    # rows in any later merge on ``date``.
    holiday_days = holiday_df.drop_duplicates(subset=['date']).assign(is_holiday=1)

    # expand to the full calendar; days without a holiday get is_holiday = 0
    holiday_df = pd.merge(holiday_days, calendar, how='right', on='date').fillna(0)

    # cwt
    holiday_df = holiday_df.sort_values(by='date')
    holiday_cwt = mexh_wavelet_promo_holiday(holiday_df['is_holiday'], 'holiday')
    # add to the calendar-expanded frame
    holiday_df['holiday_cwt'] = list(holiday_cwt)

    return holiday_df

# get the season (using solstice -- only works for the Northern Hemisphere)
def season(date):
    """Return the season name for ``date``.

    The month and day are folded into a single integer ``month * 100 + day``
    and compared against fixed cut-offs: 21 Mar - 20 Jun is ``'spring'``,
    21 Jun - 22 Sep is ``'summer'``, 23 Sep - 22 Dec is ``'fall'`` and every
    other day is ``'winter'``.

    Parameters
    ----------
    date : object exposing integer ``month`` and ``day`` attributes

    Returns
    -------
    str
        One of ``'spring'``, ``'summer'``, ``'fall'``, ``'winter'``.
    """
    month_day = date.month * 100 + date.day
    if 321 <= month_day <= 620:
        return 'spring'
    if 621 <= month_day <= 922:
        return 'summer'
    if 923 <= month_day <= 1222:
        return 'fall'
    # everything before 21 Mar or after 22 Dec
    return 'winter'

# get season for full_calendar
def calendar_season(full_calendar):
    """Add a ``season`` column derived from each row's ``date``.

    The frame is modified in place and also returned.

    Parameters
    ----------
    full_calendar : pandas.DataFrame
        Must contain a ``date`` column whose values expose ``month`` and
        ``day``.

    Returns
    -------
    pandas.DataFrame
        The same ``full_calendar`` object, now with a ``season`` column.
    """
    # a plain list keeps the assignment positional, independent of the index
    full_calendar['season'] = [season(day) for day in full_calendar['date']]
    return full_calendar


def get_full_calendar(raw_df, start_date = '2017-01-01', end_date = '2020-06-30'):
    """Create the full (store, day) calendar for every store in ``raw_df``.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Must contain a ``store_code`` column; missing codes are ignored.
    start_date, end_date : str
        Inclusive bounds of the daily date range.

    Returns
    -------
    pandas.DataFrame
        One row per store and day with the columns ``store_code``, ``date``,
        ``year``, ``month``, ``day`` and ``season``.
    """
    store_code = pd.DataFrame(raw_df.store_code.unique(), columns=['store_code']).dropna()
    dates = pd.DataFrame(pd.date_range(start=start_date, end=end_date), columns=["date"])

    # Cross join through a constant helper key. The key is dropped with the
    # ``columns=`` keyword because the positional ``axis`` argument of
    # DataFrame.drop was removed in pandas 2.0.
    full_calendar = pd.merge(
        store_code.assign(key=1),
        dates.assign(key=1),
        how='left',
        on='key',
    ).drop(columns='key')

    # get year, month, day for the full calendar
    full_calendar['year'] = full_calendar['date'].dt.year
    full_calendar['month'] = full_calendar['date'].dt.month
    full_calendar['day'] = full_calendar['date'].dt.day

    # get season
    full_calendar = calendar_season(full_calendar)
    return full_calendar

def merge_all_df(invoice_cwt,phone_cwt,holiday_cwt,promo_cwt):
    """Combine the invoice, phone, promotion and holiday features into one frame.

    Parameters
    ----------
    invoice_cwt, phone_cwt : pandas.DataFrame
        Per-store feature frames keyed by ``date``, ``store_code``,
        ``store_state_code`` and ``weather_region``.
    holiday_cwt : pandas.DataFrame
        Must contain ``date`` and ``holiday_cwt``.
    promo_cwt : pandas.DataFrame
        Must contain ``date`` and ``promotion_cwt``.

    Returns
    -------
    pandas.DataFrame
        The key columns, the ``invoice_``/``phone_`` prefixed feature
        columns, the ``is_invoice_available`` / ``is_phone_available`` flags,
        ``promotion_cwt`` and ``holiday_cwt``.
    """
    # feature columns that exist in both sources and therefore need a prefix
    shared_features = [
        'cwt',
        'state_mean_cwt',
        'cwt_mexh_quarter',
        'winter_state_mean_cwt',
        'fall_state_mean_cwt',
        'state_mean_cwt_mexh_quarter',
        'winter_state_mean_cwt_mexh_quarter',
        'fall_state_mean_cwt_mexh_quarter',
    ]
    phone_cwt = phone_cwt.rename(columns={name: 'phone_' + name for name in shared_features})
    invoice_cwt = invoice_cwt.rename(columns={name: 'invoice_' + name for name in shared_features})

    # outer join so that stores present in only one source are kept
    join_keys = ['date','store_code','store_state_code','weather_region']
    actual_cwt = invoice_cwt.merge(phone_cwt, how='outer', on=join_keys)

    # flag which source each row has, before the gaps are filled below
    actual_cwt['is_invoice_available'] = np.where(actual_cwt['invoice_cwt'].notna(), 1, 0)
    actual_cwt['is_phone_available'] = np.where(actual_cwt['phone_cwt'].notna(), 1, 0)

    # rows missing from one source get 0 for that source's features
    actual_cwt = actual_cwt.fillna(0)

    # attach the calendar-level promotion and holiday features by date
    with_promo = actual_cwt.merge(promo_cwt[['date','promotion_cwt']], how='left', on=['date'])
    cwt = with_promo.merge(holiday_cwt[['date','holiday_cwt']], how='left', on=['date'])

    # columns to be returned, in output order
    cols = [
        'date', 'store_code', 'invoice_cwt', 'store_state_code',
        'invoice_state_mean_cwt', 'invoice_winter_state_mean_cwt',
        'invoice_fall_state_mean_cwt', 'weather_region',
        'invoice_cwt_mexh_quarter', 'invoice_state_mean_cwt_mexh_quarter',
        'invoice_winter_state_mean_cwt_mexh_quarter',
        'invoice_fall_state_mean_cwt_mexh_quarter',
        'phone_cwt', 'phone_state_mean_cwt', 'phone_winter_state_mean_cwt',
        'phone_fall_state_mean_cwt', 'phone_cwt_mexh_quarter',
        'phone_state_mean_cwt_mexh_quarter',
        'phone_winter_state_mean_cwt_mexh_quarter',
        'phone_fall_state_mean_cwt_mexh_quarter',
        'is_invoice_available', 'is_phone_available',
        'promotion_cwt', 'holiday_cwt',
    ]
    return cwt[cols]

In [4]:
# cwt per store using historic values
# first standardize within each year
# then take the mean in the previous years (two years ago at least)
# populate over all dates 

def get_cwt_per_store(raw_invoice_df,start_date = '2017-01-01', end_date = '2020-06-30'):
    """Per-store seasonality features built from historic actuals.

    Steps: expand the actuals to a full (store, day) calendar, keep only the
    years strictly before ``max(year) - 1``, standardize per store and year,
    average the standardized values per (store, month, day), project that
    profile back onto the full calendar and run the wavelet transform.

    Parameters
    ----------
    raw_invoice_df : pandas.DataFrame
        Daily actuals with the columns ``store_code``, ``effective_date``
        (datetime), ``actual``, ``store_id`` and ``metric_id``.
    start_date, end_date : str
        Inclusive bounds of the calendar.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``wavelet_all_stores``.

    Notes
    -----
    The per-day profile is the *mean* over the retained years, as the
    column name ``avg_standard_actual`` states. Earlier revisions summed
    instead, so previously generated features are larger by a factor equal
    to the number of retained years.
    """
    # get full calendar
    full_calendar = get_full_calendar(raw_invoice_df, start_date, end_date)

    # Attach the actuals to the calendar and fill the gaps.
    # NOTE(review): the forward/backward fill runs over the whole frame, so a
    # store whose first days are missing inherits the previous store's last
    # values. Kept as-is; consider filling per store.
    raw_df = (
        pd.merge(full_calendar, raw_invoice_df, how='left',
                 left_on=['date', 'store_code'],
                 right_on=['effective_date', 'store_code'])
        .drop(columns=['effective_date', 'store_id', 'metric_id'])
        .ffill()
        .bfill()
    )

    # only keep data from at least two years before the latest calendar year
    latest_year = raw_df['year'].max()
    raw_df = raw_df.loc[raw_df['year'] < latest_year - 1].copy()

    # standardize every year
    raw_df = standardize_data_per_year(raw_df)

    # Mean of the standardized data per calendar day. Only the value column
    # is aggregated: reducing every column would also try to aggregate the
    # datetime and string columns, which pandas 2 rejects.
    daily_mean = (
        raw_df.groupby(['store_code', 'month', 'day'], as_index=False)['standard_actual']
        .mean()
        .rename(columns={'standard_actual': 'avg_standard_actual'})
    )

    # back to the full calendar; days without a profile value (e.g. 29 Feb
    # when no retained year is a leap year) take the previous row's value
    raw_df = pd.merge(full_calendar, daily_mean, how='left',
                      on=['store_code', 'month', 'day']).ffill()

    # cwt
    return wavelet_all_stores(raw_df)

In [5]:
# find state mean values
# from cwt per store
# find the mean for each state
# find the fall and winter cwt for each state
def get_cwt_per_state(raw_df,store_attribute_df):
    """Attach each store's state and the state-level mean features.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Per-store features with ``store_code``, ``date``, ``cwt`` and
        ``cwt_mexh_quarter``.
    store_attribute_df : pandas.DataFrame
        Must contain ``store_code`` and ``store_state_code``.

    Returns
    -------
    pandas.DataFrame
        ``raw_df`` plus ``store_state_code``, ``state_mean_cwt`` and
        ``state_mean_cwt_mexh_quarter`` (the means over all stores of the
        same state on the same date).
    """
    # include store state code information
    # NOTE(review): this merge used to be followed by
    # ``.fillna(raw_df['store_code'][:1])``. pandas reads a Series passed to
    # DataFrame.fillna as a column -> value mapping, so that call never
    # filled anything and has been removed. Stores missing from
    # ``store_attribute_df`` keep a NaN state and NaN state means; decide
    # whether they need an explicit fallback state.
    raw_df = pd.merge(raw_df, store_attribute_df[['store_code', 'store_state_code']],
                      how='left', on=['store_code'])

    # State mean of the two feature columns only. Averaging every column
    # would include the string columns (``store_code``, ``season``), which
    # raises a TypeError on pandas 2.
    state_mean = (
        raw_df.groupby(by=['store_state_code', 'date'])[['cwt', 'cwt_mexh_quarter']]
        .mean()
        .reset_index()
        .rename(columns={'cwt': 'state_mean_cwt',
                         'cwt_mexh_quarter': 'state_mean_cwt_mexh_quarter'})
    )

    # merge to original data frame
    raw_df = pd.merge(raw_df, state_mean, how='left', on=['date', 'store_state_code'])

    return raw_df

def get_cwt_per_state_per_season(raw_df):
    """Add fall/winter-gated state means and the weather region.

    The frame is modified in place and also returned.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Must contain ``season``, ``store_state_code``, ``state_mean_cwt``
        and ``state_mean_cwt_mexh_quarter``.

    Returns
    -------
    pandas.DataFrame
        The same object with the 0/1 columns ``fall`` and ``winter``, the
        four ``<season>_state_mean_*`` columns (state mean on rows of that
        season, 0 elsewhere) and ``weather_region`` (the state code for CO,
        MI and MN, ``'other'`` for everything else).
    """
    gated_seasons = ('fall', 'winter')

    # one hot encoding of the two seasons of interest
    for season_name in gated_seasons:
        raw_df[season_name] = np.where(raw_df['season'] == season_name, 1, 0)

    # multiply each state mean by the season flag so it is zero out of season
    for measure in ('state_mean_cwt', 'state_mean_cwt_mexh_quarter'):
        for season_name in gated_seasons:
            raw_df[season_name + '_' + measure] = raw_df[measure] * raw_df[season_name]

    # CO, MI and MN keep their own region; every other state is pooled
    raw_df['weather_region'] = 'other'
    for state in ('CO', 'MI', 'MN'):
        raw_df.loc[raw_df['store_state_code'] == state, 'weather_region'] = state
    return raw_df
  

In [6]:
# calculate all invoice/phone related cwt columns
def get_cwt_from_actual(raw_df,store_attribute_df,start_date = '2017-01-01', end_date = '2020-06-30'):
    """Run the full feature pipeline for one source of actuals.

    Chains ``get_cwt_per_store`` (per-store features), ``get_cwt_per_state``
    (state means) and ``get_cwt_per_state_per_season`` (fall/winter gating
    and weather region).

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Daily actuals, passed to ``get_cwt_per_store``.
    store_attribute_df : pandas.DataFrame
        Store attributes, passed to ``get_cwt_per_state``.
    start_date, end_date : str
        Calendar bounds forwarded to ``get_cwt_per_store``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``get_cwt_per_state_per_season``.
    """
    per_store = get_cwt_per_store(raw_df, start_date=start_date, end_date=end_date)
    per_state = get_cwt_per_state(per_store, store_attribute_df)
    return get_cwt_per_state_per_season(per_state)

In [7]:
# import files
# load the five input files (read in the order listed)
raw_invoice_df, raw_phone_df, store_attribute_df, holiday_df, promotion_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        'invoice_daily_actuals.csv',
        'phone_daily_actuals.csv',
        'final_store_attributes.csv',
        'holiday_calendar.csv',
        'promotion_calendar.csv',
    )
)

# convert to datetime; the invoice and phone files use different date formats
raw_invoice_df['effective_date'] = pd.to_datetime(raw_invoice_df['effective_date'], format='%Y%m%d')
raw_phone_df['effective_date'] = pd.to_datetime(raw_phone_df['effective_date'], format="%Y-%m-%d")
holiday_df['date'] = pd.to_datetime(holiday_df['HolidayDate'])
promotion_df['date'] = pd.to_datetime(promotion_df['PromotionDate'])

# calculate cwt for invoice and phone
invoice_cwt_df = get_cwt_from_actual(raw_invoice_df, store_attribute_df)
phone_cwt_df = get_cwt_from_actual(raw_phone_df, store_attribute_df)

# calculate holiday and promotion cwt
holiday_cwt_df = get_holiday_cwt(holiday_df)
promotion_cwt_df = get_promotion_cwt(promotion_df)

# merge all data frames
cwt = merge_all_df(invoice_cwt_df, phone_cwt_df, holiday_cwt_df, promotion_cwt_df)

# save
cwt.to_csv('cwt.csv', header=True, index=False)