In [1]:
import numpy as np
import pandas as pd

def simulate_total_claim_cost(rng, claim_count, severity_mean, shape):
    """Draw the aggregate claim cost of each policy.

    Every individual claim is modelled as Gamma(shape, severity_mean / shape).
    The sum of k independent such claims is Gamma(k * shape, severity_mean / shape),
    so the aggregate is sampled directly instead of multiplying a single
    severity draw by the claim count (which would reuse one draw for all of a
    policy's claims and inflate the variance of multi-claim policies).

    Parameters
    ----------
    rng : numpy.random.Generator
        Random generator used for the gamma draws.
    claim_count : array_like of int
        Number of claims per policy.
    severity_mean : float or array_like of float
        Expected cost of one claim; broadcast against ``claim_count``.
    shape : float
        Gamma shape parameter of a single claim.

    Returns
    -------
    numpy.ndarray
        Float array shaped like ``claim_count``; exactly 0.0 where the
        policy has no claims.
    """
    claim_count = np.asarray(claim_count)
    severity_mean = np.broadcast_to(
        np.asarray(severity_mean, dtype=float), claim_count.shape
    )
    total = np.zeros(claim_count.shape, dtype=float)
    has_claims = claim_count > 0
    # Only sample where there is at least one claim; zero-claim rows stay 0.
    total[has_claims] = rng.gamma(
        shape=shape * claim_count[has_claims],
        scale=severity_mean[has_claims] / shape,
    )
    return total


# Fixed seed so the synthetic portfolio is identical on every Restart & Run All.
rng = np.random.default_rng(42)
n = 20000

# --- Policy attributes -------------------------------------------------------
age = rng.integers(18, 80, size=n)  # upper bound is exclusive: ages 18..79
region = rng.choice(["A", "B", "C", "D"], size=n, p=[0.35, 0.25, 0.25, 0.15])
vehicle_type = rng.choice(["small", "mid", "large"], size=n, p=[0.45, 0.40, 0.15])
mileage_k = rng.integers(1, 50, size=n)  # upper bound is exclusive: 1..49
exposure = rng.uniform(0.1, 1.0, size=n).round(2)

# --- Claim frequency: Poisson with a log-linear rate --------------------------
base = -2.2
age_effect = np.where(age < 25, 0.35, np.where(age < 35, 0.15, 0.0))
region_effect = pd.Series(region).map({"A":0.00, "B":0.10, "C":0.18, "D":0.28}).to_numpy()
veh_effect = pd.Series(vehicle_type).map({"small":0.00, "mid":0.08, "large":0.16}).to_numpy()
mile_effect = (mileage_k - mileage_k.mean()) * 0.01  # centred on the sample mean

# log(exposure) enters as an offset, so the rate scales linearly with exposure.
log_lambda = base + age_effect + region_effect + veh_effect + mile_effect + np.log(exposure)
lam = np.exp(log_lambda)
claim_count = rng.poisson(lam)

# --- Claim severity: Gamma with mean severity_mean ----------------------------
severity_mean = 1200 * (1 + 0.20*(vehicle_type=="large") + 0.10*(region=="D"))
shape = 2.0
scale = severity_mean / shape  # shape * scale == severity_mean

# Aggregate cost is the sum of claim_count independent severities per policy.
total_claim_cost = simulate_total_claim_cost(rng, claim_count, severity_mean, shape)

# Realised mean cost per claim (0 for policies without claims); kept under the
# original name for any later cell that reads it.
claim_cost_per_claim = np.divide(
    total_claim_cost, claim_count, out=np.zeros(n), where=claim_count > 0
)

df = pd.DataFrame({
    "policy_id": np.arange(1, n+1),
    "age": age,
    "region": region,
    "vehicle_type": vehicle_type,
    "mileage_k": mileage_k,
    "exposure": exposure,
    "claim_count": claim_count,
    "total_claim_cost": total_claim_cost.round(2),
})
df["has_claim"] = (df["claim_count"] > 0).astype(int)
# avg_severity is derived from the rounded total; policies without claims get 0.
df["avg_severity"] = np.where(df["claim_count"] > 0, df["total_claim_cost"] / df["claim_count"], 0).round(2)

df.head()


Unnamed: 0,policy_id,age,region,vehicle_type,mileage_k,exposure,claim_count,total_claim_cost,has_claim,avg_severity
0,1,23,C,small,37,0.85,0,0.0,0,0.0
1,2,65,C,small,44,0.76,0,0.0,0,0.0
2,3,58,A,large,29,0.88,0,0.0,0,0.0
3,4,45,A,small,21,0.95,0,0.0,0,0.0
4,5,44,A,small,1,0.61,0,0.0,0,0.0


In [2]:
import os

# Persist the simulated policy table as CSV, creating the target folder
# if it does not exist yet (re-running the cell overwrites the file).
raw_dir = "data/raw"
csv_path = "data/raw/policy_claims.csv"

os.makedirs(raw_dir, exist_ok=True)
df.to_csv(csv_path, index=False)  # index=False: the row index is not written

# Report how many rows were written.
print("rows:", df.shape[0])


rows: 20000
