# Multi-Date Expected Proceeds Prediction Workflow

This notebook runs the expected proceeds prediction workflow for multiple inference dates and combines the results.

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import time

# Import custom modules
from country_utils import add_signup_country_group
from data_splitter_utils import split_data_by_date, split_data_by_user_type
from trial_predictions import TrialPredictionModel
from direct_purchase_predictions import DirectPurchasePredictionModel
from lag_purchase_predictions import LagPurchasePredictionModel
from prediction_processor import PredictionProcessor

# Connect to Snowflake: obtain the Snowpark session via get_active_session().
# The session is used below to run the input and product queries.
from snowflake.snowpark.context import get_active_session
session = get_active_session()

## 1. Define Inference Dates

In [None]:
# Inference dates to process, as 'YYYY-MM-DD' strings:
# the first day of each month from January through May 2023.
# Replace with an explicit list of date strings to process other dates.
inference_dates = [f"2023-{month:02d}-01" for month in range(1, 6)]

# Alternatively, generate dates in a range
def generate_date_range(start_date, end_date, interval_days=30):
    """Generate a list of dates between start_date and end_date with specified interval.

    Args:
        start_date: First date of the range, as a 'YYYY-MM-DD' string.
        end_date: Inclusive upper bound of the range, as a 'YYYY-MM-DD' string.
        interval_days: Step between consecutive dates, in days. Must be positive.

    Returns:
        List of 'YYYY-MM-DD' strings starting at start_date and advancing by
        interval_days while the date does not exceed end_date. The list is
        empty when start_date is later than end_date.

    Raises:
        ValueError: If interval_days is not positive, or if either date string
            does not match the '%Y-%m-%d' format.
    """
    # A zero or negative step would never move past end_date, so the loop
    # below would run forever; reject it up front.
    if interval_days <= 0:
        raise ValueError(
            f"interval_days must be a positive number of days, got {interval_days}"
        )

    start = datetime.strptime(start_date, '%Y-%m-%d')
    end = datetime.strptime(end_date, '%Y-%m-%d')
    step = timedelta(days=interval_days)

    dates = []
    current = start
    while current <= end:
        dates.append(current.strftime('%Y-%m-%d'))
        current += step

    return dates

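# Uncomment to use a date range instead of the explicit list.
# Note: a fixed 30-day step does not follow calendar months, so the generated
# dates will not all fall on the first of the month.
# inference_dates = generate_date_range('2023-01-01', '2023-05-01', 30)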

# Summarize the dates that the rest of the notebook will process
date_count = len(inference_dates)
print(f"Will process {date_count} inference dates:")
for inference_date in inference_dates:
    print(f"  - {inference_date}")

## 2. Load Input Data

In [None]:
# Fetch data from Snowflake
# The query window has to span every inference date plus the training
# history that precedes the earliest one.
earliest_date = min(inference_dates)
latest_date = max(inference_dates)

# Length of the training history, in days
training_window_days = 180  # Use 1/2 year of training data

# Step back from the earliest inference date by the training window
training_window = timedelta(days=training_window_days)
earliest_training_dt = datetime.strptime(earliest_date, '%Y-%m-%d') - training_window
earliest_training_date = earliest_training_dt.strftime('%Y-%m-%d')

print(f"Fetching data from {earliest_training_date} to {latest_date}")

# Input rows restricted to the [earliest_training_date, latest_date] window
input_query = f"""
    SELECT 
        *
    FROM blinkist_dev.dbt_mjaama.exp_proceeds_input
    WHERE report_date >= '{earliest_training_date}'
    AND report_date <= '{latest_date}'
    """

input_snowpark_df = session.sql(input_query)
input_df = input_snowpark_df.to_pandas()

# Product dimension: SKU (exposed as product_name) and price of purchasable products
product_query = """
    select sku as product_name, price 
    from BLINKIST_PRODUCTION.reference_tables.product_dim
    where is_purchasable;
    """

product_snowpark_df = session.sql(product_query)
product_df = product_snowpark_df.to_pandas()

# Report the size of both frames before any columns are added
print(f"Input data shape: {input_df.shape}")
print(f"Product data shape: {product_df.shape}")

# Enrich the input with the signup_country_group column
input_df = add_signup_country_group(input_df)

# Show how many rows fall into each country group
country_group_counts = input_df['signup_country_group'].value_counts()
print("\nCountry group distribution:")
print(country_group_counts)

## 3. Define Prediction Function

In [None]:
def run_prediction_for_date(input_data, product_data, inference_date, training_window_days=180):
    """Run the full prediction workflow for a specific inference date.

    The input is split by date into one inference set and two training sets,
    each set is split by user type (trial, day 0 payers, other), one model is
    fitted per user type that has data in both training sets, and the
    per-user-type predictions are concatenated.

    Args:
        input_data: DataFrame with input data; it is split on its
            'report_date' column
        product_data: DataFrame with product dimension data, passed to the
            trial and lag purchase models
        inference_date: Date string in format 'YYYY-MM-DD'
        training_window_days: Number of days to use for training

    Returns:
        DataFrame with predictions for the inference date. An empty DataFrame
        is returned when any of the date splits is empty or when no model
        produced predictions.
    """
    print(f"\n{'='*50}")
    print(f"Processing inference date: {inference_date}")
    print(f"{'='*50}")

    # 1. Split data by date
    inference_df, training_d8_df, training_d100_df = split_data_by_date(
        input_data,
        inference_date=inference_date,
        training_window_days=training_window_days,
        date_column='report_date'
    )

    print(f"Inference data shape: {inference_df.shape}")
    print(f"Training d8 data shape: {training_d8_df.shape}")
    print(f"Training d100 data shape: {training_d100_df.shape}")

    # Check if we have enough data
    if inference_df.empty or training_d8_df.empty or training_d100_df.empty:
        print(f"Insufficient data for inference date {inference_date}. Skipping.")
        return pd.DataFrame()

    # 2. Split data by user type
    trial_inference, day0_payers_inference, other_inference = split_data_by_user_type(inference_df)
    trial_training_d8, day0_payers_training_d8, other_training_d8 = split_data_by_user_type(training_d8_df)
    trial_training_d100, day0_payers_training_d100, other_training_d100 = split_data_by_user_type(training_d100_df)

    print("Inference data user type distribution:")
    print(f"Trial users: {trial_inference.shape[0]}")
    print(f"Day 0 payers: {day0_payers_inference.shape[0]}")
    print(f"Other users: {other_inference.shape[0]}")

    # 3. Train models
    # A model stays None when either of its training sets is empty; the
    # prediction step below skips user types without a model.
    print("\nTraining models...")

    # Trial model
    trial_model = None
    if not trial_training_d8.empty and not trial_training_d100.empty:
        trial_model = TrialPredictionModel(product_dim_df=product_data)
        trial_model.fit(trial_training_d8, trial_training_d100)
        print("  - Trial model trained successfully")
    else:
        print("  - Insufficient data for trial model")

    # Direct purchase model
    direct_model = None
    if not day0_payers_training_d8.empty and not day0_payers_training_d100.empty:
        direct_model = DirectPurchasePredictionModel()
        direct_model.fit(day0_payers_training_d8, day0_payers_training_d100)
        print("  - Direct purchase model trained successfully")
    else:
        print("  - Insufficient data for direct purchase model")

    # Lag purchase model
    lag_model = None
    if not other_training_d8.empty and not other_training_d100.empty:
        lag_model = LagPurchasePredictionModel(product_dim_df=product_data)
        lag_model.fit(other_training_d8, other_training_d100)
        print("  - Lag purchase model trained successfully")
    else:
        print("  - Insufficient data for lag purchase model")

    # 4. Make predictions
    print("\nMaking predictions...")
    predictions = []

    # Trial users
    if trial_model is not None and not trial_inference.empty:
        trial_predictions = trial_model.predict(trial_inference)
        predictions.append(trial_predictions)
        print(f"  - Predicted for {trial_predictions.shape[0]} trial users")

    # Day 0 payers
    if direct_model is not None and not day0_payers_inference.empty:
        direct_predictions = direct_model.predict(day0_payers_inference)
        predictions.append(direct_predictions)
        print(f"  - Predicted for {direct_predictions.shape[0]} day 0 payers")

    # Other users
    if lag_model is not None and not other_inference.empty:
        lag_predictions = lag_model.predict(other_inference)
        predictions.append(lag_predictions)
        print(f"  - Predicted for {lag_predictions.shape[0]} other users")

    # pd.concat raises ValueError on an empty list, so return an empty frame
    # (the same "no result" signal used above) when nothing was predicted.
    if not predictions:
        print(f"No predictions generated for inference date {inference_date}. Skipping.")
        return pd.DataFrame()

    return pd.concat(predictions, ignore_index=False)

## 4. Run Predictions for All Dates

In [None]:
# Per-date prediction frames are collected here
all_results = []

# Wall-clock start of the whole run
start_time = time.time()

# Run the workflow once per inference date, timing each run
for inference_date in inference_dates:
    date_started = time.time()

    date_predictions = run_prediction_for_date(
        input_data=input_df,
        product_data=product_df,
        inference_date=inference_date,
        training_window_days=training_window_days,
    )

    # Dates that produced no predictions come back as an empty frame; drop them
    if not date_predictions.empty:
        all_results.append(date_predictions)

    elapsed_seconds = time.time() - date_started
    print(f"Processed {inference_date} in {elapsed_seconds:.2f} seconds")

# Stack the per-date frames into a single result (original indexes are kept)
if not all_results:
    combined_results = pd.DataFrame()
    print("\nNo results generated.")
else:
    combined_results = pd.concat(all_results, ignore_index=False)
    print(f"\nCombined results shape: {combined_results.shape}")

end_time = time.time()
print(f"Total processing time: {end_time - start_time:.2f} seconds")

## 5. Analyze Combined Results

In [None]:
if not combined_results.empty:
    # One processor instance serves all three aggregations
    processor = PredictionProcessor()

    # Breakdown tables are ordered by inference date, then by the
    # expected_proceeds_d100 column in descending order
    breakdown_sort = {
        'by': ['inference_date', 'expected_proceeds_d100'],
        'ascending': [True, False],
    }

    # Aggregate per inference date
    date_performance = processor.aggregate_predictions(
        combined_results,
        custom_grouping=['inference_date'],
    )
    date_performance_sorted = date_performance.sort_values('inference_date')

    print("Performance by Inference Date:")
    display(date_performance_sorted)

    # Aggregate per inference date and channel group
    channel_date_performance = processor.aggregate_predictions(
        combined_results,
        custom_grouping=['inference_date', 'channel_group'],
    )

    print("\nPerformance by Channel and Inference Date:")
    display(channel_date_performance.sort_values(**breakdown_sort))

    # Aggregate per inference date and signup country group
    country_date_performance = processor.aggregate_predictions(
        combined_results,
        custom_grouping=['inference_date', 'signup_country_group'],
    )

    print("\nPerformance by Country Group and Inference Date:")
    display(country_date_performance.sort_values(**breakdown_sort))

## 6. Save Combined Results

In [None]:
# Save combined results to Snowflake
if not combined_results.empty:
    # Obtain the Snowpark session with get_active_session, the helper imported
    # at the top of the notebook (get_snowflake_session is not defined anywhere).
    session = get_active_session()

    # Convert the pandas DataFrame to a Snowpark DataFrame
    snowpark_df = session.create_dataframe(combined_results)

    # Save to Snowflake table; "overwrite" mode replaces any existing contents
    table_name = "BLINKIST_DEV.DBT_MJAAMA.MULTI_DATE_EXPECTED_PROCEEDS_PREDICTIONS"
    snowpark_df.write.mode("overwrite").save_as_table(table_name)

    print(f"Combined predictions saved to {table_name}")

## 7. Export Results to CSV (Optional)

In [None]:
# Write the combined predictions to a timestamped CSV file
if not combined_results.empty:
    # The timestamp (YYYYMMDD_HHMMSS, local time) is embedded in the file name
    timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    filename = "multi_date_predictions_" + timestamp + ".csv"

    # The DataFrame index is not written to the file
    combined_results.to_csv(filename, index=False)
    print(f"Combined predictions exported to {filename}")