# Expected Proceeds Prediction Workflow

This notebook demonstrates the end-to-end workflow for predicting expected proceeds at day 8 and day 100 for three user types: trial users, day 0 payers, and all other users.

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

# Import custom modules
from country_utils import add_signup_country_group
from data_utils import split_data_by_date, split_data_by_user_type
from trial_predictions import TrialPredictionModel
from direct_purchase_predictions import DirectPurchasePredictionModel
from lag_purchase_predictions import LagPurchasePredictionModel
from prediction_processor import PredictionProcessor

# Connect to Snowflake
# The notebook does not open its own connection or hold any credentials; it asks
# Snowpark for the session that is already active in this runtime.
# NOTE(review): presumably this only succeeds where a session has been created for
# the kernel beforehand (e.g. a Snowflake-hosted notebook) — confirm before running
# the notebook locally.
from snowflake.snowpark.context import get_active_session
session = get_active_session()

ModuleNotFoundError: No module named 'pandas'

## 1. Load Data

In [None]:
# Pull the two source tables from Snowflake into pandas DataFrames.
# `session` is the Snowpark session obtained in the setup cell.

# Model input: every column of the expected-proceeds input table.
# NOTE(review): this reads from what looks like a developer schema
# (blinkist_dev.dbt_mjaama) — confirm that is the intended source.
input_query = """
    SELECT 
        *
    FROM blinkist_dev.dbt_mjaama.exp_proceeds_input
    """

input_df = session.sql(input_query).to_pandas()

# Product dimension: SKU (exposed as product_name) and price, restricted to
# purchasable products.
# The statement intentionally carries no trailing ';': it keeps both queries
# consistent, and a terminator inside the SQL text can break if Snowpark embeds
# the statement as a subquery (e.g. when further DataFrame operations are chained).
product_query = """
    select sku as product_name, price 
    from BLINKIST_PRODUCTION.reference_tables.product_dim
    where is_purchasable
    """

product_df = session.sql(product_query).to_pandas()

# Report (rows, columns) of both frames as a quick sanity check.
print(f"Input data shape: {input_df.shape}")
print(f"Product data shape: {product_df.shape}")

## 2. Add Country Groups

In [None]:
# Enrich the input rows with a signup_country_group column.
# input_df is rebound to the frame returned by the helper.
input_df = add_signup_country_group(input_df)

# Tally the rows per country group and print the tally under a heading.
country_group_counts = input_df['signup_country_group'].value_counts()
print("Country group distribution:", country_group_counts, sep="\n")

## 3. Split Data by Date

In [None]:
# Parameters handed to split_data_by_date.
inference_date = '2025-01-01'  # inference cut-off, as a YYYY-MM-DD string
training_window_days = 180     # roughly half a year of training history

# Carve the input into one inference frame and two training frames
# (one for the d8 predictions, one for the d100 predictions), using
# report_date as the date column.
_date_splits = split_data_by_date(
    input_df,
    date_column='report_date',
    inference_date=inference_date,
    training_window_days=training_window_days,
)
inference_df, training_d8_df, training_d100_df = _date_splits

# Report the shape of each resulting frame.
for _label, _frame in (
    ("Inference", inference_df),
    ("Training d8", training_d8_df),
    ("Training d100", training_d100_df),
):
    print(f"{_label} data shape: {_frame.shape}")

## 4. Split Data by User Type

In [None]:
# Break each date-based frame into its three user-type segments:
# trial users, day 0 payers, and everyone else.

# Inference frame
(trial_inference,
 day0_payers_inference,
 other_inference) = split_data_by_user_type(inference_df)

# Training frame used for the d8 predictions
(trial_training_d8,
 day0_payers_training_d8,
 other_training_d8) = split_data_by_user_type(training_d8_df)

# Training frame used for the d100 predictions
(trial_training_d100,
 day0_payers_training_d100,
 other_training_d100) = split_data_by_user_type(training_d100_df)

# Row count of every inference segment.
print("Inference data user type distribution:")
for _label, _segment in (
    ("Trial users", trial_inference),
    ("Day 0 payers", day0_payers_inference),
    ("Other users", other_inference),
):
    print(f"{_label}: {_segment.shape[0]}")

## 5. Train Models

In [None]:
# Fit one model per user-type segment. Each fit() call receives that segment's
# d8 training frame first and its d100 training frame second.
# NOTE(review): presumably every model learns separate day-8 and day-100 targets
# from the two frames — confirm in the respective model modules.

# 1. Train Trial Prediction Model
# Constructed with the product dimension table (product_name, price) and fitted
# on the trial-user segments.
print("Training Trial Prediction Model...")
trial_model = TrialPredictionModel(product_dim_df=product_df)
trial_model.fit(trial_training_d8, trial_training_d100)

# 2. Train Direct Purchase Prediction Model
# Unlike the other two models, this one is constructed without the product
# dimension table; it is fitted on the day 0 payer segments.
print("Training Direct Purchase Prediction Model...")
direct_model = DirectPurchasePredictionModel()
direct_model.fit(day0_payers_training_d8, day0_payers_training_d100)

# 3. Train Lag Purchase Prediction Model
# Constructed with the product dimension table and fitted on the remaining
# "other" user segments.
print("Training Lag Purchase Prediction Model...")
lag_model = LagPurchasePredictionModel(product_dim_df=product_df)
lag_model.fit(other_training_d8, other_training_d100)

## 6. Make Predictions

In [None]:
def _predict_segment(model, segment_df, segment_label):
    """Announce the segment being scored and return ``model.predict(segment_df)``."""
    print(f"Predicting for {segment_label}...")
    return model.predict(segment_df)


# Prediction results per segment, collected in the order
# trial users -> day 0 payers -> other users.
# A segment whose inference frame is empty is skipped entirely.
predictions = []

# 1. Trial users
if not trial_inference.empty:
    trial_predictions = _predict_segment(trial_model, trial_inference, "trial users")
    predictions.append(trial_predictions)

# 2. Day 0 payers
if not day0_payers_inference.empty:
    direct_predictions = _predict_segment(direct_model, day0_payers_inference, "day 0 payers")
    predictions.append(direct_predictions)

# 3. Other users
if not other_inference.empty:
    lag_predictions = _predict_segment(lag_model, other_inference, "other users")
    predictions.append(lag_predictions)

## 7. Process and Aggregate Predictions

In [None]:
# Initialize the prediction processor
processor = PredictionProcessor()

# Fail with an actionable message when no segment produced predictions (every
# inference split was empty). pd.concat would otherwise raise the far less
# helpful "No objects to concatenate" ValueError; the exception type is unchanged.
if not predictions:
    raise ValueError(
        "No predictions to process: every inference segment was empty. "
        "Check inference_date and the input data."
    )

# Combine the per-segment predictions into a single frame.
# ignore_index=False keeps each segment's original row labels.
# NOTE(review): labels may therefore repeat across segments — confirm that
# PredictionProcessor does not rely on a unique index.
predictions_df = pd.concat(predictions, ignore_index=False)

# Post-process the combined predictions
all_predictions = processor.process_predictions(predictions_df)
