In [8]:
import pandas as pd
import numpy as np
from scipy.stats import beta
from datetime import timedelta
from IPython.display import display, HTML

def _draw_claims(dt_policy, eligible_idx, n_claims, claim_type, sev_a, sev_b):
    """Sample ``n_claims`` distinct policies and attach Beta(sev_a, sev_b) severities.

    The policy draw happens before the severity draw so that the sequence of
    calls on the global NumPy random state is fixed for a given seed.
    """
    sampled = np.random.choice(eligible_idx, size=n_claims, replace=False)
    return pd.DataFrame({
        "pol_number": dt_policy.loc[sampled, "pol_number"].values,
        "claim_type": claim_type,
        "claim_count": 1,
        "claim_sev": beta.rvs(sev_a, sev_b, size=len(sampled)),
    })


def simulate_central_scenario(
    seed=1234,
    start_date="2016-01-01",
    end_date="2017-12-31",
    daily_policy_mean=700,
    breakage_freq=0.15,
    oxidation_freq=0.05,
    theft_freq=0.05,
):
    """Simulate a policy table and a claim table (at most one claim per policy).

    Policies are underwritten daily between ``start_date`` and ``end_date``
    (inclusive) with a Poisson(``daily_policy_mean``) count per day and a
    one-year term. Each policy gets a cover ("B", "BO" or "BOT"), a brand,
    a model and a price from fixed cyclic patterns.

    Claims are simulated per claim type:
      * "B": ``breakage_freq`` of all policies, severity Beta(2, 5)
      * "O": ``oxidation_freq`` of policies whose cover is not "B",
        severity Beta(5, 3)
      * "T": for cover "BOT" only, ``theft_freq * (1 + Model)`` of the
        policies of each model (capped at 100%), severity Beta(5, 0.5)

    When several claims are simulated for the same policy, only the one with
    the earliest occurrence date is kept (ties resolve to the claim generated
    first: B, then O, then T).

    Parameters
    ----------
    seed : int
        Seed for the global NumPy random state (also used by scipy's
        ``beta.rvs``).
    start_date, end_date : str or datetime-like
        First and last underwriting date.
    daily_policy_mean : float
        Poisson mean of the number of policies written per day.
    breakage_freq, oxidation_freq, theft_freq : float
        Claim frequencies, each within [0, 1].

    Returns
    -------
    (dt_policy, dt_claim) : tuple of pandas.DataFrame
        ``dt_policy`` has columns pol_number, date_UW, date_lapse, Cover,
        Brand, Model, Price. ``dt_claim`` has columns clm_number, pol_number,
        claim_type, claim_count, claim_sev, date_occur, date_report,
        date_pay, claim_cost. Both have a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If a frequency is outside [0, 1], the date range is empty, or a
        single day has more than 9999 policies or claims (the 4-digit
        sequence in pol_number / clm_number would no longer be unique).
    """
    for label, freq in (
        ("breakage_freq", breakage_freq),
        ("oxidation_freq", oxidation_freq),
        ("theft_freq", theft_freq),
    ):
        if not 0.0 <= freq <= 1.0:
            raise ValueError(f"{label} must be within [0, 1], got {freq!r}")

    np.random.seed(seed)

    # 1. One row per underwriting date
    dt_policydates = pd.DataFrame({
        "date_UW": pd.date_range(start_date, end_date, freq="D")
    })
    if dt_policydates.empty:
        raise ValueError("start_date/end_date describe an empty underwriting period")

    # 2. Daily policy counts, lapse date and the yyyymmdd key prefix
    dt_policydates["policycount"] = np.random.poisson(
        daily_policy_mean, size=len(dt_policydates)
    )
    if int(dt_policydates["policycount"].max()) > 9999:
        raise ValueError("more than 9999 policies on one day: pol_number would collide")
    dt_policydates["date_lapse"] = dt_policydates["date_UW"] + pd.DateOffset(years=1)
    uw = dt_policydates["date_UW"].dt
    dt_policydates["pol_prefix"] = uw.year * 10000 + uw.month * 100 + uw.day

    # 3. Daily split between covers; the remainder after B and BO is BOT
    dt_policydates["Cover_B"] = (dt_policydates["policycount"] * 0.25).round().astype(int)
    dt_policydates["Cover_BO"] = (dt_policydates["policycount"] * 0.45).round().astype(int)

    # 4. One row per policy: repeat each date row `policycount` times
    dt_policy = dt_policydates.loc[
        dt_policydates.index.repeat(dt_policydates["policycount"])
    ].reset_index(drop=True)
    dt_policy["pol_seq"] = dt_policy.groupby("pol_prefix").cumcount() + 1

    # int64 so that prefix * 10000 cannot overflow a 32-bit integer
    dt_policy["pol_number"] = (
        dt_policy["pol_prefix"].astype(np.int64) * 10000 + dt_policy["pol_seq"]
    ).astype(str)

    # 5. Cover by position within the day: first B, then BOT, then BO
    seq = dt_policy["pol_seq"]
    dt_policy["Cover"] = np.where(
        seq <= dt_policy["Cover_B"],
        "B",
        np.where(seq <= dt_policy["policycount"] - dt_policy["Cover_BO"], "BOT", "BO"),
    )

    # 6. Brand and base price cycle over the whole portfolio in blocks of 20
    brand_cycle = np.repeat([1, 2, 3, 4], [9, 6, 3, 2])
    base_price_cycle = np.repeat([600, 550, 300, 150], [9, 6, 3, 2])
    slot = np.arange(len(dt_policy)) % len(brand_cycle)
    dt_policy["Brand"] = brand_cycle[slot]
    dt_policy["Base_Price"] = base_price_cycle[slot]

    # 7. Model and its price multiplier cycle within each brand in blocks of 20
    model_cycle = np.repeat(np.array([3, 2, 1, 0], dtype=np.int64), [10, 7, 2, 1])
    mult_cycle = np.repeat([1.15**3, 1.15**2, 1.15, 1.0], [10, 7, 2, 1])
    brand_slot = dt_policy.groupby("Brand").cumcount().to_numpy() % len(model_cycle)
    dt_policy["Model"] = model_cycle[brand_slot]
    dt_policy["Model_mult"] = mult_cycle[brand_slot]

    dt_policy["Price"] = np.ceil(dt_policy["Base_Price"] * dt_policy["Model_mult"]).astype(int)

    # 8. Final policy columns
    dt_policy = dt_policy[["pol_number", "date_UW", "date_lapse", "Cover", "Brand", "Model", "Price"]]

    # --- Simulate claims (draw order B, O, T0..T3 is part of the seeded sequence) ---
    claim_frames = [
        _draw_claims(
            dt_policy, dt_policy.index,
            int(breakage_freq * len(dt_policy)), "B", 2, 5,
        )
    ]

    oxidation_idx = dt_policy[dt_policy["Cover"] != "B"].index
    claim_frames.append(
        _draw_claims(
            dt_policy, oxidation_idx,
            int(oxidation_freq * len(oxidation_idx)), "O", 5, 3,
        )
    )

    for m in range(4):
        theft_idx = dt_policy[(dt_policy["Cover"] == "BOT") & (dt_policy["Model"] == m)].index
        # Theft frequency grows with the model number; never sample more than exist
        count = min(int(theft_freq * (1 + m) * len(theft_idx)), len(theft_idx))
        claim_frames.append(_draw_claims(dt_policy, theft_idx, count, "T", 5, 0.5))

    dt_claim = pd.concat(claim_frames, ignore_index=True)

    # --- Join policy info ---
    dt_claim = pd.merge(
        dt_claim,
        dt_policy[["pol_number", "date_UW", "Price", "Brand"]],
        on="pol_number",
        how="left",
    )

    # --- Simulate dates ---
    dt_claim["date_lapse"] = dt_claim["date_UW"] + pd.DateOffset(years=1)
    dt_claim["expodays"] = (dt_claim["date_lapse"] - dt_claim["date_UW"]).dt.days
    # Occurrence is uniform over the exposure period (truncated to whole days)
    dt_claim["occ_delay_days"] = (
        dt_claim["expodays"] * np.random.uniform(0, 1, len(dt_claim))
    ).astype(int)
    dt_claim["delay_report"] = (365 * beta.rvs(0.4, 10, size=len(dt_claim))).astype(int)
    dt_claim["delay_pay"] = (10 + 40 * beta.rvs(7, 7, size=len(dt_claim))).astype(int)

    dt_claim["date_occur"] = dt_claim["date_UW"] + pd.to_timedelta(dt_claim["occ_delay_days"], unit="D")
    dt_claim["date_report"] = dt_claim["date_occur"] + pd.to_timedelta(dt_claim["delay_report"], unit="D")
    dt_claim["date_pay"] = dt_claim["date_report"] + pd.to_timedelta(dt_claim["delay_pay"], unit="D")
    dt_claim["claim_cost"] = (dt_claim["Price"] * dt_claim["claim_sev"]).round().astype(int)

    # --- Claim key: yyyymmdd of occurrence + running number within that day ---
    occ = dt_claim["date_occur"].dt
    dt_claim["clm_prefix"] = occ.year * 10000 + occ.month * 100 + occ.day
    dt_claim["clm_seq"] = dt_claim.groupby("clm_prefix").cumcount() + 1
    if len(dt_claim) and int(dt_claim["clm_seq"].max()) > 9999:
        raise ValueError("more than 9999 claims on one day: clm_number would collide")

    # int64 so that prefix * 10000 cannot overflow a 32-bit integer
    dt_claim["clm_number"] = (
        dt_claim["clm_prefix"].astype(np.int64) * 10000 + dt_claim["clm_seq"]
    ).astype(str)

    # --- Keep one claim per policy: the earliest occurrence ---
    # Keeping the first *generated* row would let breakage always win over
    # oxidation/theft regardless of dates. A stable sort keeps generation order
    # on equal dates; sort_index() then restores the original row order.
    dt_claim = (
        dt_claim.sort_values("date_occur", kind="stable")
        .drop_duplicates(subset="pol_number", keep="first")
        .sort_index()
    )

    # --- Final claim columns ---
    dt_claim = dt_claim[[
        "clm_number", "pol_number", "claim_type", "claim_count", "claim_sev",
        "date_occur", "date_report", "date_pay", "claim_cost"
    ]]

    return dt_policy.reset_index(drop=True), dt_claim.reset_index(drop=True)


In [9]:
# Run the central scenario with a fixed seed so the tables are reproducible.
dt_policy, dt_claim = simulate_central_scenario(seed=1234)

# Preview the first rows of each table as HTML, without the index column.
for preview in (dt_policy, dt_claim):
    display(HTML(preview.head().to_html(index=False)))


pol_number,date_UW,date_lapse,Cover,Brand,Model,Price
201601010001,2016-01-01,2017-01-01,B,1,3,913
201601010002,2016-01-01,2017-01-01,B,1,3,913
201601010003,2016-01-01,2017-01-01,B,1,3,913
201601010004,2016-01-01,2017-01-01,B,1,3,913
201601010005,2016-01-01,2017-01-01,B,1,3,913


clm_number,pol_number,claim_type,claim_count,claim_sev,date_occur,date_report,date_pay,claim_cost
201801180001,201707030353,B,1,0.63918,2018-01-18,2018-02-11,2018-03-06,292
201612260001,201610100547,B,1,0.260456,2016-12-26,2016-12-26,2017-01-26,143
201701200001,201604150398,B,1,0.070113,2017-01-20,2017-01-25,2017-02-16,28
201707150001,201608220393,B,1,0.150239,2017-07-15,2017-07-17,2017-08-12,126
201804100001,201706270312,B,1,0.371893,2018-04-10,2018-04-10,2018-05-09,235


In [10]:
# Export the simulated claims; the frame's index is written as the first, unnamed column.
dt_claim.to_csv(path_or_buf="aa.csv")