In [1]:
import pandas as pd
import numpy as np
import datetime
from dateutil.relativedelta import relativedelta

In [2]:
from google.colab import auth
auth.authenticate_user()
print('Authenticated')

Authenticated


In [3]:
%%bigquery unique_orders_2324 --project fb-business-datasets
SELECT order_number, created_at, email, `interval`, Total_Net
FROM fb-business-datasets.ds_tables.unique_orders_2324_standard

Query is running:   0%|          |

Downloading:   0%|          |

#### Total Purchased Order Value of All Active Customers in 2023

In [4]:
### calculate the real total net made by 2023 customers

# customers = unique_orders_2324[unique_orders_2324['created_at'] < '2024-1-1'].email.unique()

# unique_orders_2324['order_date'] = unique_orders_2324['created_at'].dt.date
# unique_orders_2324['order_date'] = pd.to_datetime(unique_orders_2324['order_date'])
# real_total_month_1 = unique_orders_2324[(unique_orders_2324['order_date'].dt.year == 2024) &
#                                  (unique_orders_2324['order_date'].dt.month == 1) &
#                                  (unique_orders_2324['email'].isin(customers))].Total_Net.sum()
# real_total_month_2 = unique_orders_2324[(unique_orders_2324['order_date'].dt.year == 2024) &
#                                  (unique_orders_2324['order_date'].dt.month == 2) &
#                                  (unique_orders_2324['email'].isin(customers))].Total_Net.sum()
# real_total_month_3 = unique_orders_2324[(unique_orders_2324['order_date'].dt.year == 2024) &
#                                  (unique_orders_2324['order_date'].dt.month == 3) &
#                                  (unique_orders_2324['email'].isin(customers))].Total_Net.sum()
# real_total_month_4 = unique_orders_2324[(unique_orders_2324['order_date'].dt.year == 2024) &
#                                  (unique_orders_2324['order_date'].dt.month == 4) &
#                                  (unique_orders_2324['email'].isin(customers))].Total_Net.sum()
# real_total_month_5 = unique_orders_2324[(unique_orders_2324['order_date'].dt.year == 2024) &
#                                  (unique_orders_2324['order_date'].dt.month == 5) &
#                                  (unique_orders_2324['email'].isin(customers))].Total_Net.sum()
# real_total_month_6= unique_orders_2324[(unique_orders_2324['order_date'].dt.year == 2024) &
#                                  (unique_orders_2324['order_date'].dt.month == 6) &
#                                  (unique_orders_2324['email'].isin(customers))].Total_Net.sum()


# df = pd.DataFrame({
#     'calculated_at': datetime.datetime.utcnow(), # UTC time is used here
#     'Month': ['2024-1-31','2024-2-29','2024-3-31','2024-4-30','2024-5-31','2024-6-30'],
#     'Real_Total': [real_total_month_1, real_total_month_2, real_total_month_3, real_total_month_4, real_total_month_5, real_total_month_6]
# })
# df

In [5]:
def calculate_real_totals(unique_orders, calibration_end_date, num_months):
  """
  Calculates the real total purchased value made by all customers in the calibration period of the prediction model.

  Parameters:
    unique_orders (DataFrame): The most recent updated `unique_order` dataframe.
    calibration_end_date (Date): The date set as the parameter of `calibration_period` of the prediction model.
    num_of_months (Int): The number of months you want to calculate the real total purchased value for after the calibration end date.

  Returns:
    DataFrame: A DataFrame with the total purchase value for each month of all active customers in the calibration period.
  """

  unique_orders['order_date'] = pd.to_datetime(unique_orders['created_at'].dt.date)
  customers = unique_orders[unique_orders['created_at'] < calibration_end_date]['email'].unique()

  results = []
  for i in range(num_months):
    month_start = (datetime.datetime.strptime(calibration_end_date, '%Y-%m-%d').replace(day=1) + relativedelta(months=i)).date()
    month_filter = ((unique_orders['order_date'].dt.year == month_start.year) & (unique_orders['order_date'].dt.month == month_start.month) &
                   (unique_orders['email'].isin(customers)))
    real_total = unique_orders[month_filter]['Total_Net'].sum()
    results.append({
        'calculated_at': datetime.datetime.utcnow(),
        'calibration_period': pd.to_datetime(calibration_end_date),
        'Period': pd.to_datetime(month_start),
        'Real_Total': real_total
        })

  return pd.DataFrame(results)

In [6]:
df1 = calculate_real_totals(unique_orders_2324, '2024-1-1', 6)
df1

Unnamed: 0,calculated_at,calibration_period,Period,Real_Total
0,2024-07-02 15:45:32.537445,2024-01-01,2024-01-01,1058786.85
1,2024-07-02 15:45:33.232307,2024-01-01,2024-02-01,831022.1
2,2024-07-02 15:45:33.846006,2024-01-01,2024-03-01,766419.0
3,2024-07-02 15:45:34.438216,2024-01-01,2024-04-01,864888.05
4,2024-07-02 15:45:35.023725,2024-01-01,2024-05-01,776475.4
5,2024-07-02 15:45:35.583463,2024-01-01,2024-06-01,530929.2


In [7]:
df1.dtypes

calculated_at         datetime64[ns]
calibration_period    datetime64[ns]
Period                datetime64[ns]
Real_Total                   float64
dtype: object

In [8]:
project_id = 'fb-business-datasets'  # Google Cloud project ID
destination_table = 'fb-business-datasets.ds_tables.real_total_purchase_value'  # destination table name in BigQuery
df1.to_gbq(destination_table, project_id=project_id, if_exists='append')

100%|██████████| 1/1 [00:00<00:00, 5489.93it/s]


#### Total Purchased Order Value for Each Active Customers in 2023

In [9]:
def calculate_real_per_customer(unique_orders, calibration_end_date, num_months):
    """
    Calculates the real total purchased value, real purchase number, and real average order value made by each customers in calibration period
    of the prediction model.

    Parameters:
      unique_orders (DataFrame): The most recent updated `unique_order` dataframe.
      calibration_end_date (Date): The date set as the parameter of `calibration_period` of the prediction model.
      num_of_months (Int): The number of months you want to calculate the real total purchased value for after the calibration end date.

    Returns:
      DataFrame: A DataFrame with the total purchase value for each month of all active customers in the calibration period.
    """
    unique_orders['order_date'] = pd.to_datetime(unique_orders['created_at'].dt.date)
    customers = unique_orders[unique_orders['created_at'] < calibration_end_date]['email'].unique()
    customers_df = pd.DataFrame({'email': customers,
                                 'calculated_at': datetime.datetime.utcnow(),
                                 'calibration_period': pd.to_datetime(calibration_end_date),})
    avg_order_value = unique_orders.groupby('email')['Total_Net'].sum() / unique_orders.groupby('email')['order_number'].nunique()
    avg_order_value = avg_order_value.rename('avg_order_value')
    customers_df = pd.merge(customers_df, avg_order_value, on='email', how='left')

    monthly_data = []

    for i in range(num_months):
      month_start = (datetime.datetime.strptime(calibration_end_date, '%Y-%m-%d').replace(day=1) + relativedelta(months=i)).date()
      month_filter = ((unique_orders['order_date'].dt.year == month_start.year) & (unique_orders['order_date'].dt.month == month_start.month) &
                      (unique_orders['email'].isin(customers)))

      monthly_customer = unique_orders[month_filter].groupby('email').agg(Real_Total = ('Total_Net', 'sum'), Order_Count=('email', 'count')).reset_index()
      monthly_customer['Period'] = pd.to_datetime(month_start)
      monthly_data.append(monthly_customer)

    results_df = pd.concat(monthly_data, ignore_index=True)
    results_df = pd.merge(customers_df, results_df, on='email', how='left')
    return results_df

In [10]:
df2 = calculate_real_per_customer(unique_orders_2324, '2024-1-1', 6).sort_values('email')
df2

Unnamed: 0,email,calculated_at,calibration_period,avg_order_value,Real_Total,Order_Count,Period
16177,+1508596smbak55@gmail.com,2024-07-02 15:45:41.913359,2024-01-01,24.0,,,NaT
195743,0-orange-braces@icloud.com,2024-07-02 15:45:41.913359,2024-01-01,50.0,,,NaT
362353,0.aliases.iambic@icloud.com,2024-07-02 15:45:41.913359,2024-01-01,55.0,,,NaT
376327,00.finned-abodes@icloud.com,2024-07-02 15:45:41.913359,2024-01-01,78.0,,,NaT
300848,00.scythe-spinier@icloud.com,2024-07-02 15:45:41.913359,2024-01-01,34.5,,,NaT
...,...,...,...,...,...,...,...
371274,zzvia01@gmail.com,2024-07-02 15:45:41.913359,2024-01-01,74.0,,,NaT
133523,zzzzpops@yahoo.com,2024-07-02 15:45:41.913359,2024-01-01,19.5,13.0,1.0,2024-03-01
344778,zzzzzzzaj@aol.com,2024-07-02 15:45:41.913359,2024-01-01,117.0,,,NaT
75431,ògmamuel@yahoo.com,2024-07-02 15:45:41.913359,2024-01-01,26.0,,,NaT


In [11]:
df2.dtypes

email                         object
calculated_at         datetime64[ns]
calibration_period    datetime64[ns]
avg_order_value              float64
Real_Total                   float64
Order_Count                  float64
Period                datetime64[ns]
dtype: object

In [12]:
project_id = 'fb-business-datasets'  # Google Cloud project ID
destination_table = 'fb-business-datasets.ds_tables.real_purchase_per_customer'  # destination table name in BigQuery
df2.to_gbq(destination_table, project_id=project_id, if_exists='append')

100%|██████████| 1/1 [00:00<00:00, 1407.01it/s]
