In [5]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import random

# Step 2: Set Seeds for Reproducibility
# The generation loop draws from two independent generators: NumPy's global
# RNG (np.random.normal / randint / binomial) and the standard-library
# `random` module (random.choice). Seeding only NumPy leaves every
# random.choice() draw different on each run, so both are seeded here.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Step 3: Define Parameters
n = 5000  # number of policies to simulate

device_types = ['Phone', 'Laptop', 'Tablet', 'TV', 'Washing Machine', 'Refrigerator', 'Headphones', 'Charger']

# Brands that may be drawn for each device type; keys mirror `device_types`.
brands = {
    'Phone': ['Apple', 'Samsung', 'Google', 'OnePlus'],
    'Laptop': ['Apple', 'Dell', 'HP', 'Lenovo'],
    'Tablet': ['Apple', 'Samsung', 'Amazon'],
    'TV': ['Samsung', 'LG', 'Sony', 'Panasonic'],
    'Washing Machine': ['LG', 'Bosch', 'Whirlpool'],
    'Refrigerator': ['Samsung', 'Whirlpool', 'Bosch'],
    'Headphones': ['Sony', 'Bose', 'Beats'],
    'Charger': ['Anker', 'Belkin', 'Apple']
}

vendors = ['Amazon UK', 'Currys', 'Argos', 'John Lewis', 'AO.com']
sales_channels = ['Online', 'In-store', 'Marketplace']
regions = ['London', 'South East', 'North West', 'Scotland', 'Wales', 'West Midlands']

# Device types that get the higher mean retail price and the extra claim
# probability in the simulation below.
high_value_devices = {'Phone', 'Laptop'}

# Lower bound applied to the simulated plan price. Without it the Gaussian
# noise term can push the price of a cheap device to zero or below.
# NOTE(review): 1.00 is an assumed floor - confirm against the real minimum premium.
min_plan_price = 1.0

# Step 4: Generate Dataset
# Each iteration simulates one policy and appends it to `data` as a list whose
# field order matches the column order used when the DataFrame is built.
# The order of the random draws is kept stable so seeded runs stay comparable.
data = []
for i in range(1, n + 1):
    policy_id = f"P{i:06d}"  # zero-padded sequential id: P000001, P000002, ...
    device_type = random.choice(device_types)
    brand = random.choice(brands[device_type])
    is_high_value = device_type in high_value_devices

    # Retail price ~ Normal(mean, 100), floored at 20 so it is never tiny or negative.
    mean_price = 400 if is_high_value else 200
    retail_price = round(max(np.random.normal(loc=mean_price, scale=100), 20), 2)

    plan_duration = random.choice([12, 24, 36])
    vendor = random.choice(vendors)
    region = random.choice(regions)
    customer_age = np.random.randint(18, 70)  # upper bound is exclusive: ages 18..69
    sales_channel = random.choice(sales_channels)

    # Claim logic: base rate 0.1, plus 0.005 for every year the customer is
    # under 70, plus 0.05 for high-value devices; capped at 0.8.
    claim_prob = min(0.1 + 0.005 * (70 - customer_age) + 0.05 * is_high_value, 0.8)
    claim_made = np.random.binomial(1, p=claim_prob)

    if claim_made:
        # Claim cost ~ Normal(30% of retail, 10% of retail), floored at 20.
        loc = 0.3 * retail_price
        scale = max(0.1 * retail_price, 1)
        claim_cost = round(max(np.random.normal(loc=loc, scale=scale), 20), 2)
    else:
        claim_cost = 0.0

    # Plan price: 7% of retail plus Normal(5, 3) noise, never below the floor.
    raw_plan_price = 0.07 * retail_price + np.random.normal(5, 3)
    plan_price = round(max(raw_plan_price, min_plan_price), 2)

    data.append([
        policy_id, device_type, brand, retail_price, plan_duration, vendor,
        region, customer_age, sales_channel, claim_made, claim_cost, plan_price
    ])

# Step 5: Create DataFrame
# Column labels, listed in the same order as the fields of each record in `data`.
columns = [
    'Policy_ID', 'Device_Type', 'Brand', 'Retail_Price',
    'Plan_Duration', 'Vendor', 'Region', 'Customer_Age',
    'Sales_Channel', 'Claim_Made', 'Claim_Cost', 'Plan_Price',
]
df_device_insurance = pd.DataFrame(data=data, columns=columns)

# Step 6: Save Dataset to CSV
# index=False keeps the DataFrame's row index out of the written file.
df_device_insurance.to_csv(
    path_or_buf='../data/raw/device_insurance_portfolio_uk.csv',
    index=False,
)

# Step 7: Preview
# Show the first rows of the generated table.
df_device_insurance.head()

Unnamed: 0,Policy_ID,Device_Type,Brand,Retail_Price,Plan_Duration,Vendor,Region,Customer_Age,Sales_Channel,Claim_Made,Claim_Cost,Plan_Price
0,P000001,Phone,Google,449.67,36,John Lewis,South East,60,Online,0,0.0,36.06
1,P000002,Laptop,Lenovo,280.22,24,Argos,West Midlands,40,Online,0,0.0,31.04
2,P000003,Phone,Samsung,427.9,36,John Lewis,South East,41,In-store,0,0.0,37.98
3,P000004,Charger,Apple,207.98,12,John Lewis,West Midlands,47,Online,0,0.0,19.08
4,P000005,Charger,Anker,153.66,24,Currys,Scotland,29,Online,0,0.0,14.36
