In [36]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from itertools import repeat
from sklearn.linear_model import LinearRegression
from statistics import mean 

# Load data
gmv_df = pd.read_csv('Desktop/gmv_by_product_type.csv')

# Extract year and week number
gmv_df['year'] = gmv_df['FL_WEEK_NUM'].astype(str).str[0:4]
gmv_df['week_num'] = gmv_df['FL_WEEK_NUM'].astype(str).str[4:6]
gmv_df.rename(columns={'FL_WEEK_NUM': 'fl_week_num', 'PRODUCT_TYPE' : 'product' , 
                       'WEEKLY_GMV' : 'gmv' , 'NUM_TRANSACTIONS' : 'transactions' ,'year': 'year', 'week_num': 'week'}, inplace=True)

gmv_df['year'] = gmv_df['year'].astype(int)
gmv_df['week'] = gmv_df['week'].astype(int)

# Calculate cumulative transactions and GMV
gmv_df['cum_transactions'] = gmv_df.groupby(['product', 'year'])['transactions'].cumsum()
gmv_df['cum_gmv'] = gmv_df.groupby(['product', 'year'])['gmv'].cumsum()



product_list=["UpgradedAccessProduct","UnlimitedMonthlyProduct","ExpertTrackProduct",
               "UnlimitedAnnualProduct","MicrocredentialProduct","PaidForCourseProduct"]

final_df = pd.DataFrame()
for p in product_list:

# Analysis and prediction for MicrocredentialProduct
    product_name = p
    gmv_product = gmv_df[gmv_df['product'] == product_name]
    gmv2022 = gmv_product[gmv_product['year'] == 2022]
    gmv2023 = gmv_product[gmv_product['year'] == 2023]
    current_week = gmv2023['week'].max()
    x = 52 - current_week
    gmv2023=gmv2023[gmv2023['week'] < current_week]
    pred_week_num = np.arange(current_week, 53)
    pred_type = ['predicted'] * (x + 1)
    pred_year = [2023] * (x + 1)

    # Linear Regression Model
    model = LinearRegression().fit(gmv2023[['week']], gmv2023['cum_gmv'])
    slope = model.coef_[0]
    intercept = gmv2023[gmv2023['product'] == product_name]['cum_gmv'].max()
    pred_lm = np.arange(1, x + 2) * slope + intercept

    lm_gmv = pd.DataFrame({'pred_year': pred_year, 'pred_week_num': pred_week_num, 'pred_lm': pred_lm, 'pred_type': pred_type})
    lm_gmv['cum_lm_gmv'] = lm_gmv['pred_lm'].cumsum()

    ### Year-over-Year comparison

    # Year-over-Year comparison

    growth_from_2023 = gmv2022[gmv2022['week'] > current_week-1]['gmv'].cumsum()
    pred_gmv = growth_from_2023 + gmv2023[gmv2023['product'] == product_name]['cum_gmv'].max()

    ################## Old Way to create the scaled forecast #####################
    #ind_23=gmv2023[gmv2023['week'] == current_week-1].index[0]
    #ind_22=gmv2022[gmv2022['week'] == current_week-1].index[0]
    ###YOY_comparison = gmv2023.loc[ind_23]['cum_gmv'] / gmv2022.loc[ind_22]['cum_gmv']
    ###scaled_pred_gmv = growth_from_2023 * YOY_comparison + gmv2023[gmv2023['product'] == product_name]['cum_gmv'].max()

    ################## New Way to create the scaled forecast #####################

    # Find missing sequential integers
    max_value = gmv2023['week'].max()
    all_integers = set(range(1, max_value + 1))
    existing_integers = set(gmv2023['week'])
    missing_integers = sorted(all_integers - existing_integers)

    # Add new rows for missing integers
    if missing_integers:
        missing_rows = pd.DataFrame({'week': missing_integers})
        df = pd.concat([gmv2023, missing_rows]).sort_values('week').reset_index(drop=True)

    else:
        df=gmv2023


    # Extracting columns 'A' and 'B' into a new DataFrame
    new_df = df[['week', 'gmv']].copy()

    # Replacing NAs in column 'B' with 0
    new_df['gmv'].fillna(0, inplace=True)

    #GMV 2022

    short_2022=gmv2022[gmv2022['week'] < current_week]
    new_2022 = short_2022[['week', 'gmv']].copy()

    ##Finding the difference between the average weekly gmv between last year and this year. Numbers below 
    #1 mean that this year is doing
    ##Better than last year i.e. if we get .7 then we have currently ~70% less than last year at this time, 
    #and could expect our WOW model to do 70% worse, so we multiply the WOW growth from 2023 by this value 


    YOY_comparison=mean(new_df['gmv'].values)/mean(new_2022['gmv'].values)
    scaled_pred_gmv = growth_from_2023 * YOY_comparison + gmv2023[gmv2023['product'] == product_name]['cum_gmv'].max()

    d=pd.DataFrame({'year': pred_year, 'week': pred_week_num,'product': product_column, 'wow_growth_model': pred_gmv, 'scaled_wow_growth_model': scaled_pred_gmv,'linear_model': pred_lm})

    final_df=pd.concat([final_df,d])

In [37]:
final_df

Unnamed: 0,year,week,product,wow_growth_model,scaled_wow_growth_model,linear_model
196,2023,33,PaidForCourseProduct,452306.89,444782.823912,442383.640268
201,2023,34,PaidForCourseProduct,473505.99,459474.522793,456971.470535
204,2023,35,PaidForCourseProduct,496037.11,475089.356937,471559.300803
212,2023,36,PaidForCourseProduct,519410.19,491287.698060,486147.131070
221,2023,37,PaidForCourseProduct,541919.78,506887.611180,500734.961338
...,...,...,...,...,...,...
283,2023,48,PaidForCourseProduct,261879.44,250978.819763,254564.076891
290,2023,49,PaidForCourseProduct,268887.65,257176.348754,259996.501072
298,2023,50,PaidForCourseProduct,277468.44,264764.548137,265428.925253
303,2023,51,PaidForCourseProduct,281114.94,267989.235963,270861.349434
