# Packing the process in functions

In [None]:
def data_quality(x):
    """Fix column types and fill missing values, returning a new DataFrame.

    Parameters
    ----------
    x : pandas.DataFrame
        Must contain the columns ``month``, ``wday``, ``event_name_1``,
        ``item_id`` and ``sell_price``.

    Returns
    -------
    pandas.DataFrame
        A new frame where ``month`` and ``wday`` are object-typed, missing
        ``event_name_1`` values are ``'No_event'`` and missing ``sell_price``
        values are replaced by the most frequent price of the same
        ``item_id``. Items with no observed price at all keep their NaNs.
    """

    # modify types (astype returns a new frame, so the input is not modified)
    temp = x.astype({'month': 'O', 'wday': 'O'})

    # nulls
    # Assign the result back instead of calling fillna(inplace=True) on the
    # attribute: the chained in-place form is not guaranteed to write through
    # to the DataFrame (it is a no-op under pandas Copy-on-Write).
    temp['event_name_1'] = temp['event_name_1'].fillna('No_event')

    # impute by mode
    def fill_with_mode(prices):
        modes = prices.mode()
        # mode() is empty when every price of the item is missing;
        # there is nothing to impute with, so leave the group untouched.
        if modes.empty:
            return prices
        return prices.fillna(modes.iloc[0])

    temp['sell_price'] = temp.groupby('item_id')['sell_price'].transform(fill_with_mode)

    return temp
    

In [71]:
def create_variables(df):
    """Build out-of-stock flags, lags and rolling statistics per product.

    A product is a ``(store_id, item_id)`` pair. Every feature is computed
    inside a product after ordering its rows by ``date``, and only uses rows
    that precede the current one.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``store_id``, ``item_id``, ``sales`` and
        ``sell_price`` and be sortable by ``date`` (index level or column).
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The sorted rows with:

        * ``product`` as first column (``store_id + '_' + item_id``);
        * ``sell_price_lag_1`` .. ``sell_price_lag_7``;
        * ``out_of_stock_{3,7,15}_lag_1``;
        * ``sales_lag_1`` .. ``sales_lag_15``;
        * ``sales_minlocal_k``, ``sales_meanlocal_k``, ``sales_maxlocal_k``
          for ``k`` in 2..15.

        Rows with any missing value are dropped, and ``sell_price``, the
        unlagged out-of-stock flags, ``store_id`` and ``item_id`` are removed.
    """
    group_keys = ['store_id', 'item_id']

    def out_of_stock(sales, n = 5):
        """Return 1 where the last ``n`` rows (current included) all have zero sales, else 0."""
        zero_sales = pd.Series(np.where(sales == 0, 1, 0))
        num_zeros = zero_sales.rolling(n).sum()
        # Incomplete windows yield NaN, which compares unequal to n -> 0.
        return np.where(num_zeros == n, 1, 0)

    def rolling_stat(series, window, stat):
        """Apply ``stat`` over the ``window`` rows before each row (current row excluded)."""
        return getattr(series.shift(1).rolling(window), stat)()

    # sort_values returns a new frame, so the columns added below never
    # touch the caller's DataFrame.
    df = df.sort_values(by = group_keys + ['date'])

    #### Intermittent demand

    for window in (3, 7, 15):
        df['out_of_stock_' + str(window)] = (
            df.groupby(group_keys)
              .sales
              .transform(lambda s, n = window: out_of_stock(s, n)))

    # Grouped view created after the flags exist so they can be lagged too.
    by_product = df.groupby(group_keys)

    # New columns are collected as plain arrays in df's row order. Group-wise
    # shift/transform return one value per input row in the input order, so
    # no index-based re-alignment (ambiguous on a repeated date index) is needed.
    features = {}

    #### LAGS

    lag_spec = [('sell_price', 7),        # sell price ---> 7 days lag
                ('out_of_stock_3', 1),    # out_of_stock ---> 1 day lag
                ('out_of_stock_7', 1),
                ('out_of_stock_15', 1),
                ('sales', 15)]            # sales ----> 15 days lag

    for variable, num_lags in lag_spec:
        for lag in range(1, num_lags + 1):
            features[variable + '_lag_' + str(lag)] = by_product[variable].shift(lag).to_numpy()

    #### ROLLING WINDOWS

    for stat in ('min', 'mean', 'max'):
        for window in range(2, 15 + 1):
            features['sales_' + stat + 'local_' + str(window)] = (
                by_product['sales']
                .transform(lambda s, w = window, f = stat: rolling_stat(s, w, f))
                .to_numpy())

    #### JOIN DATAFRAMES

    # Both pieces share the very same index object, so the column-wise concat
    # is a plain side-by-side join.
    df_joined = pd.concat([df, pd.DataFrame(features, index = df.index)], axis = 1)

    # delete duplicated columns (keeps the first occurrence of a name)
    df_joined = df_joined.loc[:, ~df_joined.columns.duplicated()]
    df_joined = df_joined.dropna()

    # delete original variables, we already have used them to build new variables
    df_joined = df_joined.drop(columns = ['sell_price', 'out_of_stock_3', 'out_of_stock_7', 'out_of_stock_15'])

    # Create a single variable for the product
    df_joined.insert(loc = 0, column = "product", value = df_joined.store_id + '_' + df_joined.item_id)
    df_joined = df_joined.drop(columns = ['store_id', 'item_id'])

    return df_joined
    
    

In [72]:
def transform_variables(x,y=None,mode = 'training'):

    '''
    Encode the categorical variables of ``x``.

    This works both for training and for execution.

    Training (``mode == 'training'``): fits the encoders with fit_transform
    and saves them as pickle files.
    Execution (any other ``mode``): loads the saved encoders and applies only
    transform.

    Parameters
    ----------
    x : DataFrame that exposes ``date`` once its index is reset and contains
        the columns ``event_name_1``, ``month``, ``wday`` and ``weekday``.
        The caller's object is not modified.
    y : target used to fit the target encoder. Required in training mode,
        ignored in execution. It is matched to ``x`` by position: only the
        positions that also exist in ``x`` are kept. The caller's object is
        not modified.
    mode : ``'training'`` or anything else for execution.

    Returns
    -------
    DataFrame indexed by ``date`` where ``event_name_1`` is replaced by its
    one-hot columns and ``month``, ``wday``, ``weekday`` by ``month_te``,
    ``wday_te``, ``weekday_te``.

    Raises
    ------
    ValueError if ``mode == 'training'`` and ``y`` is None.
    '''

    training = mode == 'training'

    # Fail before any encoder file is written.
    if training and y is None:
        raise ValueError("y is required when mode == 'training'")

    # Positional 0..n-1 index so the encoded blocks can be concatenated
    # column-wise below; reassigning keeps the caller's DataFrame untouched.
    x = x.reset_index()

    # ENCODERS
    path_ohe = '../../04_Models_/ohe_retail.pickle'
    path_te  = '../../04_Models_/te_retail.pickle'

    # NOTE(review): pickle.load can execute arbitrary code; only load encoder
    # files that were produced by this function in training mode.

    #ONE HOT ENCODING
    var_ohe = ['event_name_1']
    if training:
        # NOTE(review): assumes OneHotEncoder is scikit-learn's. The dense
        # output keyword was renamed from `sparse` to `sparse_output`, so try
        # the current name first and fall back for older releases.
        try:
            ohe = OneHotEncoder(sparse_output = False, handle_unknown='ignore')
        except TypeError:
            ohe = OneHotEncoder(sparse = False, handle_unknown='ignore')
        ohe_x = ohe.fit_transform(x[var_ohe])
        with open(path_ohe, mode='wb') as file:
            pickle.dump(ohe, file)
    else:
        # execution mode
        with open(path_ohe, mode='rb') as file:
            ohe = pickle.load(file)
        ohe_x = ohe.transform(x[var_ohe])
    ohe_x = pd.DataFrame(ohe_x, columns = ohe.get_feature_names_out())

    #TARGET ENCODING
    var_te = ['month','wday','weekday']
    if training:
        #Make sure Y is as long as X
        y = y.reset_index(drop = True)
        y = y.loc[y.index.isin(x.index)]

        # Training mode
        te = TargetEncoder(min_samples_leaf=100, return_df = False)
        te_x = te.fit_transform(x[var_te], y = y)
        with open(path_te, mode='wb') as file:
            pickle.dump(te, file)
    else:
        # execution mode
        with open(path_te, mode='rb') as file:
            te = pickle.load(file)
        te_x = te.transform(x[var_te])
    names_te = [variable + '_te' for variable in var_te]
    te_x = pd.DataFrame(te_x, columns = names_te)

    # CLEANUP
    #eliminate originals
    x = x.drop(columns=['event_name_1','month','wday','weekday'])
    # include the other dataframes
    x = pd.concat([x,ohe_x,te_x], axis=1).set_index('date')

    # output
    return x