weekly sales for top 5 categories in each department

In [5]:
def primary_key_with_weekly_sales(primary_key_table, weekly_sales_df):
    """
    Attach weekly sales to every row of the primary key table.

    The primary key table is the left side of a left join, so each of its
    rows is kept even when no sales row matches. Nulls left in the joined
    result are then replaced with 0.

    Args:
    - primary_key_table: DataFrame providing the 'week',
      'stg_item_category_desc_txt' and 'stg_outlet_cd' key columns
    - weekly_sales_df: DataFrame of weekly sales carrying the same three
      key columns

    Returns:
    - DataFrame: the left-joined result with nulls replaced by 0
    """
    join_keys = ["week", "stg_item_category_desc_txt", "stg_outlet_cd"]

    # Left join: rows from the primary key table with no sales match survive
    # with nulls in the columns contributed by weekly_sales_df.
    joined = primary_key_table.join(weekly_sales_df, on=join_keys, how="left")

    # The fill runs over the whole joined frame (no column subset is given),
    # so it is not limited to the columns that came from weekly_sales_df.
    return joined.na.fill(0)


target variable

In [7]:
def target_variable(final_df):
    """
    Add the following row's sales quantity as a "next_week_qty" column.

    Rows are grouped by outlet and item category and ordered by week; each
    row then receives the "weekly_sales_qty" of the row that follows it in
    that ordering.

    Args:
        final_df: DataFrame containing the joined data; it must provide the
            'stg_outlet_cd', 'stg_item_category_desc_txt', 'week' and
            'weekly_sales_qty' columns used below.

    Returns:
        pyspark.sql.DataFrame: the input with an additional column
        "next_week_qty".
    """
    # One ordered series per (outlet, item category) pair.
    per_series_by_week = Window.partitionBy(
        "stg_outlet_cd", "stg_item_category_desc_txt"
    ).orderBy("week")

    # lead() looks one row ahead within the window, so this is the next
    # *row's* quantity. It equals next week's quantity only if every week
    # is present for the series -- TODO confirm against the caller.
    following_qty = f.lead("weekly_sales_qty").over(per_series_by_week)

    return final_df.withColumn("next_week_qty", following_qty)
