## Baseline (Naive) cost 

In [5]:
import pandas as pd
import numpy as np

# -----------------------------
# 1) Load data
# -----------------------------
# Location of the fake category data (update the path if needed)
DATA_PATH = r"D:\General Drive\Tracksuit-Takehome-TA\Tracksuit-Takehome-TA\fake_category_data.csv"

# Number of qualified respondents we want for every category
TARGET_QUALIFIED = 200

df = pd.read_csv(DATA_PATH)

# Quick sanity checks: preview rows, data types, and summary statistics
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would only add a stray "None" line.
df.info()
print(df.describe())

# The baseline divides by incidence_rate, so a zero, negative or missing rate
# would silently produce inf/NaN respondent counts. Fail loudly instead.
valid_rate = (df["incidence_rate"] > 0) & (df["incidence_rate"] <= 1)
if not valid_rate.all():
    raise ValueError(
        "incidence_rate must lie in (0, 1]; offending rows:\n"
        f"{df.loc[~valid_rate, ['category_name', 'incidence_rate']]}"
    )

# -----------------------------
# 2) Baseline (Naive) cost
# -----------------------------
# If we ran each category alone, to get ~TARGET_QUALIFIED qualified respondents:
# expected required respondents = TARGET_QUALIFIED / incidence_rate
df["required_respondents_if_alone"] = TARGET_QUALIFIED / df["incidence_rate"]

# View the computed requirement per category
print(df[["category_name", "incidence_rate", "required_respondents_if_alone"]].head())

# Naive total cost = sum of required respondents across categories
total_cost_naive = df["required_respondents_if_alone"].sum()
print("Naive total respondents:", total_cost_naive)



   category_id                category_name  incidence_rate  \
0            1     Fertility or IVF service        0.095621   
1            2  Big and Tall Men's Clothing        0.131231   
2            4       Self Tan (Female Only)        0.191096   
3            5                 Baby Feeding        0.198451   
4            6                Baby products        0.188133   

   category_length_seconds  
0               164.504580  
1                69.826299  
2               115.928005  
3               166.231168  
4                60.252290  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   category_id              77 non-null     int64  
 1   category_name            77 non-null     object 
 2   incidence_rate           77 non-null     float64
 3   category_length_seconds  77 non-null     float64
dtypes: float64(2), i

## Greedy packing optimization

In [None]:
# -----------------------------
# 3) Greedy Packing (Bin Packing by time constraint)
# -----------------------------
# Sort categories so that the "hardest/most expensive" categories (low incidence) are allocated first
df_sorted = df.sort_values("required_respondents_if_alone", ascending=False).reset_index(drop=True)

MAX_TIME = 480  # 8 minutes in seconds
survey_groups = []  # each group is a "survey version" (list of category rows)
group_times = []    # running total of seconds used by each group (parallel to survey_groups)

# Greedy first-fit packing:
# Place each category into the first group where it fits under MAX_TIME.
# If no group fits, create a new group.
for _, row in df_sorted.iterrows():
    length = row["category_length_seconds"]

    # A category longer than MAX_TIME can never fit in any group; it ends up
    # alone in a group that still breaks the time limit, so make that visible.
    if length > MAX_TIME:
        print(
            f"Warning: category {row['category_name']!r} takes {length:.1f}s on its own, "
            f"which exceeds MAX_TIME={MAX_TIME}s"
        )

    # Running totals avoid re-summing every group's lengths on each probe.
    for idx, used in enumerate(group_times):
        # If adding this category keeps total time <= MAX_TIME, put it in this group
        if used + length <= MAX_TIME:
            survey_groups[idx].append(row)
            group_times[idx] = used + length
            break
    else:
        # No existing group had room (loop finished without break): start a new one
        survey_groups.append([row])
        group_times.append(length)

# Print number of survey versions created (after the loop, not inside it)
print("Number of survey groups using greedy first-fit packing:", len(survey_groups))


## Estimate required respondents per group and build a summary table

In [6]:
# -----------------------------
# 4) Estimate required respondents per group (deterministic)
# -----------------------------
# Reasoning:
# - A respondent is assigned to exactly ONE survey group.
# - Inside that group they can qualify for several categories at once.
# - Every category should reach ~200 qualified respondents on average, so the
#   group needs as many respondents as its "hardest" category would need alone
#   (hardest = largest 200 / incidence_rate).
# - Hence: respondents for a group ≈ max(required_respondents_if_alone) over its categories.

# One entry per group: the largest stand-alone requirement among its categories
max_required_list = [
    max(member["required_respondents_if_alone"] for member in members)
    for members in survey_groups
]

# Groups are fielded separately, so the overall estimate is the sum over groups
total_required = sum(max_required_list)

print("Estimated total respondents (no buffer):", total_required)

# -----------------------------
# Build a summary table for each group
# -----------------------------
# Columns:
# - group_id: sequential id starting from 1
# - num_categories: number of categories packed into the group
# - total_time_seconds: interview time if a respondent answers every category in the group
# - max_required_no_buffer: deterministic respondent estimate for the group (no safety buffer)
groups_summary = [
    {
        "group_id": group_no,
        "num_categories": len(members),
        "total_time_seconds": sum(member["category_length_seconds"] for member in members),
        "max_required_no_buffer": max(
            member["required_respondents_if_alone"] for member in members
        ),
    }
    for group_no, members in enumerate(survey_groups, start=1)
]

# Materialise as a DataFrame and persist for reporting / inspection
groups_df = pd.DataFrame(groups_summary)
groups_df.to_csv("survey_groups_summary.csv", index=False)

print("Survey groups summary saved.")

Estimated total respondents (no buffer): 11069.24477046791
Survey groups summary saved.


## Monte Carlo simulation

In [7]:
import numpy as np
import pandas as pd

def simulate_group(group_df, N, target=200, n_sims=500, seed=42):
    """
    Monte Carlo simulation to validate whether a survey group
    achieves at least `target` qualified respondents per category.

    In every simulation run the number of qualified respondents for each
    category is drawn as Binomial(N, incidence_rate). Categories are drawn
    independently of one another, so any correlation between qualifying for
    different categories is not modelled.

    Parameters
    ----------
    group_df : DataFrame
        Contains categories in this survey version with their incidence_rate.
    N : int
        Number of respondents assigned to this survey version.
    target : int
        Required number of qualified respondents per category (default=200).
    n_sims : int
        Number of Monte Carlo simulation runs (must be positive).
    seed : int
        Random seed for reproducibility.

    Returns
    -------
    dict
        Keys: "N", "k_categories", "success_rate" (share of runs in which
        every category reached `target`), "min_qualified_mean" and
        "min_qualified_p05" (mean and 5th percentile of the worst category's
        qualified count across runs).

    Raises
    ------
    ValueError
        If `n_sims` is not positive or the group contains no categories.
    """

    # ----------------------------------------------------------
    # Validate inputs up front: without these checks an empty group fails
    # with an opaque numpy reduction error, n_sims == 0 fails with
    # ZeroDivisionError, and a negative n_sims silently returns NaN stats.
    # ----------------------------------------------------------
    if n_sims <= 0:
        raise ValueError(f"n_sims must be a positive integer, got {n_sims!r}")

    # ps = array of probabilities (one per category)
    ps = np.asarray(group_df["incidence_rate"], dtype=float)
    k = len(ps)  # number of categories in this group
    if k == 0:
        raise ValueError("group_df must contain at least one category")

    # ----------------------------------------------------------
    # Initialize random generator (ensures reproducible results)
    # ----------------------------------------------------------
    rng = np.random.default_rng(seed)

    # ----------------------------------------------------------
    # Draw all simulations in one call: row = simulation run, column = category.
    # ps (length k) broadcasts against the requested (n_sims, k) shape.
    # ----------------------------------------------------------
    draws = rng.binomial(N, ps, size=(n_sims, k))

    # Worst-performing category in each run. A run succeeds exactly when its
    # worst category reaches the target, i.e. when ALL categories do.
    min_quals = draws.min(axis=1)
    success_rate = float(np.mean(min_quals >= target))

    # ----------------------------------------------------------
    # Return validation statistics
    # ----------------------------------------------------------
    return {
        "N": N,  # respondents assigned to this group
        "k_categories": k,  # number of categories in group
        "success_rate": success_rate,  # probability all categories hit target
        "min_qualified_mean": float(np.mean(min_quals)),  # average worst-case category
        "min_qualified_p05": float(np.quantile(min_quals, 0.05)),  # 5th percentile (risk measure)
    }


# ==========================================================
# Run validation for all survey groups
# ==========================================================

# Multiplier applied to the deterministic estimate (1.20 = +20% respondents).
# Reduces the probability of falling below the target due to randomness.
SAFETY_BUFFER = 1.20

results = []

# group_id numbering starts at 1, matching the ids in survey_groups_summary.csv,
# so each validation row can be traced back to its group.
for group_id, group in enumerate(survey_groups, start=1):

    # Convert group (list of row Series collected from iterrows()) into a DataFrame
    group_df = pd.DataFrame(group)

    # ----------------------------------------------------------
    # Deterministic baseline estimate:
    # Hardest category determines required respondents
    # (i.e., the one needing the most respondents if run alone)
    # ----------------------------------------------------------
    max_required = max(cat["required_respondents_if_alone"] for cat in group)

    # ----------------------------------------------------------
    # Apply safety buffer and round up to a whole number of respondents
    # ----------------------------------------------------------
    N = int(np.ceil(max_required * SAFETY_BUFFER))

    # Run Monte Carlo validation
    res = simulate_group(group_df, N)

    results.append({"group_id": group_id, **res})

# ----------------------------------------------------------
# Convert results to DataFrame for reporting
# ----------------------------------------------------------
results_df = pd.DataFrame(results)

# Save results for documentation / report
results_df.to_csv("validation_results.csv", index=False)

print("Results saved to validation_results.csv")

Results saved to validation_results.csv
