In [29]:
import pandas as pd
import numpy as np

# -----------------------------
# 1) Load data
# -----------------------------
# Location of the fake category data.
# NOTE(review): this is an absolute, machine-specific path -- update it (or make it
# relative to the notebook) before running on another machine.
DATA_PATH = r"D:\General Drive\Tracksuit-Takehome-TA\Tracksuit-Takehome-TA\fake_category_data.csv"
df = pd.read_csv(DATA_PATH)

# Quick sanity checks: preview rows, data types, and summary statistics
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so wrapping
# it in print() would emit a stray "None" line after the report.
df.info()
print(df.describe())

# -----------------------------
# 2) Baseline (Naive) cost
# -----------------------------
# Number of qualified respondents wanted for every category.
TARGET_QUALIFIED = 200

# target / incidence_rate is only meaningful for rates in (0, 1]. A zero rate would
# produce inf and a negative or NaN rate would silently corrupt every total computed
# from this column, so fail fast with the offending categories instead.
invalid_rate = ~((df["incidence_rate"] > 0) & (df["incidence_rate"] <= 1))
if invalid_rate.any():
    raise ValueError(
        "incidence_rate must be in (0, 1]; offending categories: "
        f"{df.loc[invalid_rate, 'category_name'].tolist()}"
    )

# If we ran each category alone, to get ~200 qualified respondents:
# expected required respondents = 200 / incidence_rate
df["required_respondents_if_alone"] = TARGET_QUALIFIED / df["incidence_rate"]

# View the computed requirement per category
print(df[["category_name", "incidence_rate", "required_respondents_if_alone"]].head())

# Naive total cost = sum of required respondents across categories
total_cost_naive = df["required_respondents_if_alone"].sum()
print("Naive total respondents:", total_cost_naive)

# -----------------------------
# 3) Greedy Packing (Bin Packing by time constraint)
# -----------------------------
# Sort categories so that the "hardest/most expensive" categories (low incidence) are allocated first
df_sorted = df.sort_values("required_respondents_if_alone", ascending=False).reset_index(drop=True)

MAX_TIME = 480  # 8 minutes in seconds

# A category that is longer than MAX_TIME on its own can never satisfy the time
# constraint. First-fit would otherwise quietly give it a group of its own whose
# length exceeds the limit, so reject such input explicitly.
too_long = df_sorted[df_sorted["category_length_seconds"] > MAX_TIME]
if not too_long.empty:
    raise ValueError(
        f"Categories longer than MAX_TIME={MAX_TIME}s cannot be packed: "
        f"{too_long['category_name'].tolist()}"
    )

survey_groups = []  # each group is a "survey version" (list of category rows)
group_times = []    # running total length in seconds of each group, parallel to survey_groups

# Greedy first-fit packing:
# Place each category into the first group where it fits under MAX_TIME.
# If no group fits, create a new group.
for _, row in df_sorted.iterrows():
    length = row["category_length_seconds"]

    for idx, used in enumerate(group_times):
        # Running totals are accumulated in insertion order, so they equal what
        # re-summing the group's members on every probe would give.
        if used + length <= MAX_TIME:
            survey_groups[idx].append(row)
            group_times[idx] = used + length
            break
    else:
        # The inner loop finished without a break: no existing group had room,
        # so start a new survey group.
        survey_groups.append([row])
        group_times.append(length)

# Print number of survey versions created (after the loop, not inside it)
print("Number of survey groups:", len(survey_groups))

# -----------------------------
# 4) Estimate required respondents per group (deterministic)
# -----------------------------
# A group's sample size is set by its HARDEST category: the member with the largest
# 200/incidence value must still reach ~200 qualified respondents on average.
max_required_list = [
    max(member["required_respondents_if_alone"] for member in members)
    for members in survey_groups
]

# Each respondent sees exactly one group, so the overall estimate is the sum
# of the per-group requirements.
total_required = sum(max_required_list)
print("Estimated total respondents (no buffer):", total_required)

# -----------------------------
# 5) Add buffer to reduce probabilistic shortfall risk
# -----------------------------
# N = 200/p only delivers ~200 qualified respondents on average; because
# qualification is random, individual runs can land below 200. Inflate every
# group's requirement by a safety factor (10% here) before totalling.
buffer = 1.10
buffered_requirements = [required * buffer for required in max_required_list]
total_with_buffer = sum(buffered_requirements)
print("Estimated total respondents (10% buffer):", total_with_buffer)

# Optional: show group-level numbers
for number, (members, required) in enumerate(zip(survey_groups, max_required_list), start=1):
    seconds = sum(member["category_length_seconds"] for member in members)
    print(f"Group {number:02d}: categories={len(members)}, total_time={seconds:.1f}s, max_required={required:.1f}")

   category_id                category_name  incidence_rate  \
0            1     Fertility or IVF service        0.095621   
1            2  Big and Tall Men's Clothing        0.131231   
2            4       Self Tan (Female Only)        0.191096   
3            5                 Baby Feeding        0.198451   
4            6                Baby products        0.188133   

   category_length_seconds  
0               164.504580  
1                69.826299  
2               115.928005  
3               166.231168  
4                60.252290  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   category_id              77 non-null     int64  
 1   category_name            77 non-null     object 
 2   incidence_rate           77 non-null     float64
 3   category_length_seconds  77 non-null     float64
dtypes: float64(2), i

In [31]:
import numpy as np
import pandas as pd

def simulate_group(group_df, N, target=200, n_sims=500, seed=42):
    """
    Monte Carlo check that a survey group reaches `target` qualified
    respondents in every one of its categories.

    Each simulation run draws, independently for every category, the number of
    qualified respondents from Binomial(N, incidence_rate) and records the
    smallest count across categories. A run succeeds when that smallest count
    is at least `target`, i.e. when every category met the target.

    Parameters
    ----------
    group_df : DataFrame
        Categories in this survey version; must have an `incidence_rate`
        column holding probabilities.
    N : int
        Number of respondents assigned to this survey version.
    target : int
        Required number of qualified respondents per category (default=200).
    n_sims : int
        Number of Monte Carlo simulation runs; must be at least 1.
    seed : int
        Seed for the random number generator, for reproducibility.

    Returns
    -------
    dict
        `N`, `k_categories`, `success_rate` (share of runs in which all
        categories met the target), and the mean and 5th percentile of the
        per-run minimum qualified count (`min_qualified_mean`,
        `min_qualified_p05`).

    Raises
    ------
    ValueError
        If `n_sims` is less than 1 or `group_df` contains no categories.
    """
    # Fail early with a clear message: zero runs would otherwise surface as a
    # ZeroDivisionError when computing the success rate.
    if n_sims < 1:
        raise ValueError("n_sims must be a positive integer")

    # Explicit float conversion: frames assembled from mixed-type rows can
    # carry this column with object dtype.
    ps = group_df["incidence_rate"].to_numpy(dtype=float)  # shape (k,)
    k = len(ps)  # number of categories in this survey version
    if k == 0:
        # An empty group would otherwise fail inside numpy's min() reduction.
        raise ValueError("group_df must contain at least one category")

    # Seeded generator so repeated calls with the same inputs give the same result
    rng = np.random.default_rng(seed)

    # Minimum qualified count (worst category) of each simulation run
    min_quals = np.empty(n_sims, dtype=np.int64)
    for sim in range(n_sims):
        # One Binomial(N, p) draw per category, then keep the worst category
        min_quals[sim] = rng.binomial(N, ps).min()

    # All categories reach the target exactly when the worst one does
    success_rate = float(np.mean(min_quals >= target))

    return {
        "N": N,  # respondents assigned to this group
        "k_categories": k,  # number of categories in this group
        "success_rate": success_rate,  # probability all categories meet target
        "min_qualified_mean": float(np.mean(min_quals)),  # average worst-case category
        "min_qualified_p05": float(np.quantile(min_quals, 0.05)),  # 5th percentile worst-case
    }


# ==========================================================
# Run validation for all survey groups
# ==========================================================

# Safety margin applied on top of each group's deterministic requirement (20%).
# NOTE(review): the estimate cell reports its buffered total with buffer = 1.10,
# while this validation sizes groups with 1.20 -- confirm which margin is intended.
VALIDATION_BUFFER = 1.20

results = []

for members in survey_groups:
    # Deterministic estimate: the hardest category in the group sets the requirement
    hardest_requirement = max(member["required_respondents_if_alone"] for member in members)

    # Inflate by the safety margin and round up to a whole number of respondents
    assigned = int(np.ceil(hardest_requirement * VALIDATION_BUFFER))

    # Monte Carlo validation on a DataFrame built from the group's rows
    results.append(simulate_group(pd.DataFrame(members), assigned))

# Show results for all groups
results

[{'N': 2510,
  'k_categories': 5,
  'success_rate': 0.996,
  'min_qualified_mean': 239.358,
  'min_qualified_p05': 214.0},
 {'N': 1210,
  'k_categories': 3,
  'success_rate': 1.0,
  'min_qualified_mean': 238.648,
  'min_qualified_p05': 218.0},
 {'N': 1026,
  'k_categories': 6,
  'success_rate': 1.0,
  'min_qualified_mean': 235.53,
  'min_qualified_p05': 218.0},
 {'N': 877,
  'k_categories': 4,
  'success_rate': 1.0,
  'min_qualified_mean': 232.162,
  'min_qualified_p05': 215.0},
 {'N': 788,
  'k_categories': 4,
  'success_rate': 0.996,
  'min_qualified_mean': 230.12,
  'min_qualified_p05': 216.0},
 {'N': 777,
  'k_categories': 5,
  'success_rate': 0.996,
  'min_qualified_mean': 228.57,
  'min_qualified_p05': 213.95},
 {'N': 748,
  'k_categories': 5,
  'success_rate': 0.998,
  'min_qualified_mean': 233.38,
  'min_qualified_p05': 217.95},
 {'N': 661,
  'k_categories': 4,
  'success_rate': 0.998,
  'min_qualified_mean': 237.892,
  'min_qualified_p05': 218.95},
 {'N': 605,
  'k_categories'