# Intro

This notebook tests different behavioural patterns built into the generated synthetic data.

These might include:
- Overall Churn Rate
- Different churn patterns
- Reactivation rates for churning customers

These are an attempt to generate more realistic data.

In [21]:
# Imports
import pandas as pd
from faker import Faker
import numpy as np
import random

### Generate Customer data

In [6]:
# Build the Faker generator that produces every synthetic field below
fake = Faker(use_weighting=True, locale='en_US', include_private=False)

# Seed Faker before any value is drawn
Faker.seed(4242)

# Generate customer data: 10,000 records, one dict per customer.
# The fields are drawn in the order they are listed in the dict literal.
customers = [
    {
        'id': fake.uuid4(),
        'name': fake.first_name(),
        'last_name': fake.last_name(),
        'email': fake.email(),
        'registration_date': fake.date_between(start_date='-1y', end_date='today'),
        'birth_date': fake.date_of_birth(minimum_age=18, maximum_age=65),
        'street_address': fake.street_address(),
        'city': fake.city(),
        'state': fake.state(),
        'country': 'USA',
        'email_opt_in': fake.boolean(chance_of_getting_true=40),
    }
    for _ in range(10000)
]

# Collect the records into a DataFrame and write it out as CSV (no index column)
customers_df = pd.DataFrame(customers)
customers_df.to_csv('../data/customers_test.csv', index=False)

### Generate sales data with churn patterns

This is a simple implementation with 2 types of churn patterns:
1. Sudden churn: the customer randomly churns without a prior decrease in sales
2. Gradual churn: the customer's purchases linearly decrease until they churn

In [22]:
# Now, let's create sales data with churn patterns.
# NOTE: this cell uses `fake`, the Faker instance created in the
# customer-generation cell; run that cell first.

# Import the customers data
customers_df = pd.read_csv('../data/customers_test.csv')


# Define churn parameters
churn_rate = 0.20  # Overall churn rate (20%)
sudden_churn_rate = 0.3  # 30% of churns are sudden

# Assign churn behavior to customers
customers_df['churn_type'] = np.random.choice(
    ['no_churn', 'sudden_churn', 'gradual_churn'],
    p=[1 - churn_rate, sudden_churn_rate * churn_rate, (1 - sudden_churn_rate) * churn_rate],
    size=len(customers_df)
)

start_date = pd.to_datetime('2023-01-01')
end_date = pd.to_datetime('2023-12-31')

sales_data = []
# Only the id and the churn type are needed per customer, so iterate over
# those two columns directly instead of building a Series per row.
for customer_id, churn_type in zip(customers_df['id'], customers_df['churn_type']):
    churn_month = None
    if churn_type == 'sudden_churn':
        churn_month = random.randint(1, 12)  # Randomly assign churn month
    elif churn_type == 'gradual_churn':
        churn_month = random.randint(6, 12)  # Churn happens in the latter half of the year

    for month in range(1, 13):
        if churn_month is not None and month >= churn_month:
            break  # Stop generating sales after churn

        num_orders = fake.random_int(min=1, max=10)  # Adjust as needed
        if churn_type == 'gradual_churn':
            # Linear trend towards the churn month: the adjustment runs from
            # -3 (eleven months before churn) to +2 (the month before churn).
            # NOTE(review): a negative adjustment *adds* orders, so early
            # months can exceed the base maximum of 10 - confirm this is the
            # intended way to model the decline.
            num_orders = max(0, num_orders - (month - churn_month + 6) // 2)

        # Month window is computed once per month. The end bound is the last
        # day of the month: date_between treats end_date as inclusive, so using
        # the first day of the next month would leak orders into the following
        # month (and into 2024 for December).
        month_start = start_date + pd.DateOffset(months=month - 1)
        month_end = start_date + pd.DateOffset(months=month) - pd.Timedelta(days=1)

        for _ in range(num_orders):
            order_date = fake.date_between(start_date=month_start, end_date=month_end)
            total_amount = fake.random_int(min=10, max=1000)  # Adjust as needed
            sales_data.append({
                'order_id': fake.uuid4(),
                'customer_id': customer_id,
                'order_date': order_date,
                'total_amount': total_amount
            })

# Drop the churn_type column
customers_df.drop(columns='churn_type', inplace=True)

sales_df = pd.DataFrame(sales_data)

sales_df.head()

Unnamed: 0,order_id,customer_id,order_date,total_amount
0,65fdb9a7-b224-408f-aeea-31f97bb9bdcf,21bade02-6a6a-4768-b2ed-66ffdcc99396,2023-01-18,970
1,16800bc0-c6e6-47a8-9144-b8d47be5487d,21bade02-6a6a-4768-b2ed-66ffdcc99396,2023-01-22,948
2,ca0495f4-9f98-4295-87fc-58892f277317,21bade02-6a6a-4768-b2ed-66ffdcc99396,2023-01-31,231
3,2f7c9c66-93f3-4588-816e-0825652c8bcc,21bade02-6a6a-4768-b2ed-66ffdcc99396,2023-01-21,897
4,5d28134d-ef9f-40ad-bcd6-06a3bf183f39,21bade02-6a6a-4768-b2ed-66ffdcc99396,2023-01-03,275


### Non linear decline churn

This cell implements a non-linear decline function for gradual churn.

In [17]:
# NOTE: this cell uses `fake` (the Faker instance) and `customers_df`
# created in earlier cells; run those cells first.

# Define churn parameters
churn_rate = 0.20  # Overall churn rate (20%)
sudden_churn_ratio = 0.3  # 30% of churners will be sudden churners
reactivation_rate = 0.05  # 5% of churned customers will reactivate

# Assign churn behavior to customers
customers_df['churn_type'] = np.random.choice(
    ['no_churn', 'sudden_churn', 'gradual_churn'],
    p=[1 - churn_rate, sudden_churn_ratio * churn_rate, (1 - sudden_churn_ratio) * churn_rate],
    size=len(customers_df)
)

# Generate sales data over 12 months
start_date = pd.to_datetime('2023-01-01')
end_date = pd.to_datetime('2023-12-31')

sales_data = []
# Only the id and the churn type are needed per customer, so iterate over
# those two columns directly instead of building a Series per row.
for customer_id, churn_type in zip(customers_df['id'], customers_df['churn_type']):
    churn_month = None
    reactivation_month = None
    if churn_type == 'sudden_churn':
        churn_month = random.randint(1, 12)
    elif churn_type == 'gradual_churn':
        churn_month = random.randint(3, 12)  # Allow churn to start earlier

    # Only a `reactivation_rate` share of churners come back, and only when
    # there is at least one month left in the year after the churn month.
    if (churn_month is not None and churn_month < 12
            and random.random() < reactivation_rate):
        reactivation_month = random.randint(churn_month + 1, 12)

    for month in range(1, 13):
        has_churned = churn_month is not None and month >= churn_month
        has_reactivated = reactivation_month is not None and month >= reactivation_month
        if has_churned and not has_reactivated:
            continue  # Skip months during churn period (unless reactivated)

        num_orders = fake.random_int(min=1, max=5)
        if churn_type == 'gradual_churn':
            # Stepped multiplier that drops by 0.2 every three months relative
            # to the churn month, floored at 0.
            # NOTE(review): for months more than three before churn_month the
            # factor is above 1 (orders are scaled up) - confirm this is the
            # intended shape of the decline.
            decline_factor = max(0, 1 - 0.2 * ((month - churn_month + 3) // 3))
            num_orders = int(num_orders * decline_factor)

        # Month window is computed once per month. The end bound is the last
        # day of the month: date_between treats end_date as inclusive, so using
        # the first day of the next month would leak orders into the following
        # month (and into 2024 for December).
        month_start = start_date + pd.DateOffset(months=month - 1)
        month_end = start_date + pd.DateOffset(months=month) - pd.Timedelta(days=1)

        for _ in range(num_orders):
            order_date = fake.date_between(start_date=month_start, end_date=month_end)
            total_amount = fake.random_int(min=10, max=200)
            sales_data.append({
                'order_id': fake.uuid4(),
                'customer_id': customer_id,
                'order_date': order_date,
                'total_amount': total_amount
            })


# Drop the churn_type column
customers_df.drop(columns='churn_type', inplace=True)

sales_df_2 = pd.DataFrame(sales_data)

sales_df_2.head()

### Churn with multiple decline functions

This implementation mixes different patterns of decline churn:
- Exponential decline
- Logarithmic decline
- Step wise decline

In [23]:
# Define the functions

# Exponential decline
def exponential_decline(month, churn_month, decay_rate=0.2):
    """
    Return the exponential decay multiplier for a given month.
    Parameters:
    - month (int): The month being evaluated.
    - churn_month (int): The month in which the decline begins.
    - decay_rate (float): Coefficient of the exponent. Default is 0.2.
    Returns:
    - int or float: 1 for months before churn_month; otherwise
      exp(-decay_rate * (month - churn_month + 1)).
    """
    # Guard clause: nothing decays until the decline has started.
    if month < churn_month:
        return 1

    # The churn month itself already counts as one elapsed period.
    elapsed = month - churn_month + 1
    return np.exp(-decay_rate * elapsed)
    
# Logarithmic decline
def logarithmic_decline(month, churn_month, base=2):
    """
    Calculates the logarithmic decline factor for a given month, churn month, and base.

    From the churn month onwards the factor is 1 / log_base(base * t), where
    t = month - churn_month + 1. This equals 1 / (1 + log_base(t)), so the
    factor is exactly 1 in the churn month and shrinks more slowly for larger
    bases.

    Parameters:
    - month (int): The current month.
    - churn_month (int): The month when the decline starts.
    - base (int or float): The base of the logarithm; must be greater than 1.
      Default is 2.
    Returns:
    - int or float: 1 before churn_month, otherwise the decline factor in (0, 1].
    Raises:
    - ValueError: If base is not greater than 1 (the logarithm would be
      undefined or the factor would not be a decline).
    """
    if base <= 1:
        raise ValueError("base must be greater than 1")

    if month < churn_month:
        return 1  # No decline before churn

    time_since_churn = month - churn_month + 1
    # Change of base: log_base(x) = log2(x) / log2(base). Written as a single
    # division so the default base=2 gives exactly 1 / log2(2 * t).
    return np.log2(base) / np.log2(base * time_since_churn)
    
# Step wise decline
def stepwise_decline(month, churn_month, step_size=3, decline_factor=0.8):
    """
    Calculates the step-wise decline factor for a given month, churn month, and step size.

    The factor stays at 1 until `step_size` full months have passed since the
    churn month, then is multiplied by `decline_factor` once per completed step.

    Parameters:
    - month (int): The current month.
    - churn_month (int): The month when the decline starts.
    - step_size (int): The number of months between steps. Default is 3.
    - decline_factor (float): Multiplier applied at each completed step.
      Default is 0.8 (a 20% drop per step).
    Returns:
    - int or float: 1 before churn_month, otherwise
      decline_factor ** ((month - churn_month) // step_size).
    """
    if month < churn_month:
        return 1  # No decline before churn

    # Number of complete steps elapsed since the decline began.
    steps_since_churn = (month - churn_month) // step_size
    return decline_factor ** steps_since_churn