## 02 Notebook: Feature Engineering

This notebook focuses on feature engineering and data preparation for modeling.

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Read the raw claims CSV into `df`; the trailing expression makes the
# notebook display the frame's (rows, columns) tuple.
df = pd.read_csv(filepath_or_buffer="../data/raw/fraud_insurance_claims.csv")
df.shape

(1000, 39)

## FEATURE ENGINEERING

In [3]:

# Convert to datetime


In [4]:
# Parse both date columns with pd.to_datetime so that date arithmetic and the
# `.dt` accessor can be used on them in the following cells.
df = df.assign(
    policy_bind_date=pd.to_datetime(df['policy_bind_date']),
    incident_date=pd.to_datetime(df['incident_date']),
)

In [5]:
# Policy age at the time of the incident: incident date minus bind date,
# reduced to the whole-day component of the resulting timedelta.
df['policy_age_days'] = df['incident_date'].sub(df['policy_bind_date']).dt.days

In [6]:
# Calendar features taken from the incident date: the month number and the
# day of the week (pandas numbers Monday as 0 through Sunday as 6).
df = df.assign(
    incident_month=df['incident_date'].dt.month,
    incident_dayofweek=df['incident_date'].dt.dayofweek,
)

In [7]:

# Claim Ratio Features


In [8]:
# Ratio features:
#   - total claim amount relative to the annual premium
#   - each claim component (injury / property / vehicle) as a share of the
#     total claim amount
# A zero denominator produces inf or NaN here rather than raising; those
# values are cleaned up in a later cell.
df = df.assign(
    claim_to_premium_ratio=df['total_claim_amount'].div(df['policy_annual_premium']),
    injury_claim_ratio=df['injury_claim'].div(df['total_claim_amount']),
    property_claim_ratio=df['property_claim'].div(df['total_claim_amount']),
    vehicle_claim_ratio=df['vehicle_claim'].div(df['total_claim_amount']),
)

In [10]:

# Handle divide-by-zero: replace inf / NaN ratio values with 0


In [11]:
# Names of the ratio features whose division may have produced inf or NaN.
ratio_cols = [
    'claim_to_premium_ratio',
    'injury_claim_ratio',
    'property_claim_ratio',
    'vehicle_claim_ratio',
]

# Neutralise division artefacts in one pass: +/-inf becomes 0, then any
# remaining NaN becomes 0.
# NOTE(review): a 0 here is indistinguishable from a genuine zero ratio --
# confirm that is the intended encoding for the model.
df[ratio_cols] = df[ratio_cols].replace([np.inf, -np.inf], 0).fillna(0)

In [12]:

# Consistency / Flag Features


In [13]:
# 1 when the claim reports no witnesses and the incident severity is
# 'Major Damage' or 'Total Loss'; otherwise 0.
df['no_witness_high_severity'] = (
    df['witnesses'].eq(0)
    & df['incident_severity'].isin(['Major Damage', 'Total Loss'])
).astype(int)

# 1 when the police-report field is exactly 'NO' and at least one bodily
# injury is recorded; otherwise 0.
df['no_police_but_injury'] = (
    df['police_report_available'].eq('NO')
    & df['bodily_injuries'].gt(0)
).astype(int)


In [14]:

# Drop Redundant Columns


In [15]:
# The raw date columns have already been turned into derived features
# (policy_age_days, incident_month, incident_dayofweek), so remove them.
df = df.drop(['policy_bind_date', 'incident_date'], axis='columns')

In [19]:
# Sanity-check the engineered frame. A notebook cell only renders its final
# expression, so the shape must be printed explicitly; a bare `df.shape` on
# a non-final line is evaluated and silently discarded.
print(df.shape)
df.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,insured_sex,...,fraud_reported,policy_age_days,incident_month,incident_dayofweek,claim_to_premium_ratio,injury_claim_ratio,property_claim_ratio,vehicle_claim_ratio,no_witness_high_severity,no_police_but_injury
0,328,48,521585,OH,250/500,1000,1406.91,0,466132,MALE,...,Y,100,1,6,50.898778,0.090909,0.181818,0.727273,0,0
1,228,42,342868,IN,250/500,2000,1197.22,5000000,468176,MALE,...,Y,3130,1,2,4.234811,0.153846,0.153846,0.692308,0,0
2,134,29,687698,OH,100/300,2000,1413.14,5000000,430632,FEMALE,...,N,5282,2,6,24.519864,0.222222,0.111111,0.666667,0,1
3,256,41,227811,IL,250/500,2000,1415.74,6000000,608117,FEMALE,...,Y,8996,1,5,44.782234,0.1,0.1,0.8,0,1
4,228,44,367455,IL,500/1000,1000,1583.91,6000000,610706,MALE,...,N,256,2,1,4.103769,0.2,0.1,0.7,0,0


In [21]:

## SAVE FEATURE-ENGINEERED DATASET


In [22]:
# Persist the feature-engineered frame for downstream use (written without
# the index column).
# Create the output directory first: `to_csv` raises OSError when the parent
# directory does not exist, e.g. on a fresh checkout.
processed_path = Path("../data/processed/insurance_fraud_fe.csv")
processed_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(processed_path, index=False)
