# Insurance Claim Severity - Exploratory Data Analysis

This notebook provides an in-depth analysis of our synthetic insurance claims dataset.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Set plot style.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases removed the old names, so plt.style.use('seaborn')
# raises OSError on current installs. Use whichever name this Matplotlib
# version actually ships so the notebook runs on both old and new versions.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette('husl')

# Load the dataset (path is relative to the notebook's working directory)
# and preview the first rows as the cell output.
df = pd.read_csv('../data/train.csv')
df.head()

## 1. Target Variable Analysis

In [None]:
# Add the log-transformed target up front; log1p(x) = log(1 + x) keeps
# zero-valued claims finite. Later cells read this column.
df['log_claim_severity'] = np.log1p(df['claim_severity'])

# One entry per view of the target: (column, plot title, x-axis label, skew label)
target_views = [
    ('claim_severity',
     'Distribution of Claim Severity',
     'Claim Amount',
     'Claim severity skewness'),
    ('log_claim_severity',
     'Distribution of Log-transformed Claim Severity',
     'Log(Claim Amount)',
     'Log-transformed claim severity skewness'),
]

# For each view: histogram with a KDE overlay, then the sample skewness so the
# raw and log-transformed distributions can be compared.
for column, title, x_label, skew_label in target_views:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.histplot(df[column], kde=True, bins=100, ax=ax)
    ax.set(title=title, xlabel=x_label, ylabel='Frequency')
    plt.show()

    print(f"{skew_label}: {df[column].skew():.2f}")

## 2. High-Risk Factors Analysis

In [None]:
# Bucket the continuous age columns so they can be compared with box plots.
# pd.cut builds right-closed intervals by default, e.g. (18, 25], which leaves
# drivers aged exactly 18 and brand-new vehicles (age 0) without a group (NaN).
# include_lowest=True closes the first interval on the left so those rows are
# kept. Values outside the outer edges (e.g. driver_age > 80) still become NaN.
df['age_group'] = pd.cut(df['driver_age'], bins=[18, 25, 40, 60, 80],
                        labels=['young', 'mid_age', 'senior', 'elderly'],
                        include_lowest=True)
df['vehicle_age_group'] = pd.cut(df['vehicle_age'], bins=[0, 5, 10, 15, 20],
                                labels=['new', 'mid_age', 'old', 'very_old'],
                                include_lowest=True)

# One box plot of the log-transformed target per candidate risk factor:
# (grouping column, plot title, x-axis label)
risk_factors = [
    ('age_group', 'Claim Severity by Age Group', 'Age Group'),
    ('vehicle_age_group', 'Claim Severity by Vehicle Age Group', 'Vehicle Age Group'),
    ('vehicle_type', 'Claim Severity by Vehicle Type', 'Vehicle Type'),
    ('vehicle_make', 'Claim Severity by Vehicle Make', 'Vehicle Make'),
]

for column, title, x_label in risk_factors:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(x=column, y='log_claim_severity', data=df, ax=ax)
    ax.set(title=title, xlabel=x_label, ylabel='Log(Claim Amount)')
    plt.show()

## 3. Correlation Analysis

In [None]:
# Convert categorical variables to dummy variables for correlation analysis
df_corr = pd.get_dummies(df)

# Compute the full correlation matrix once and reuse it for both the ranking
# and the heatmap.
corr_matrix = df_corr.corr()

# Correlation of every encoded feature with the log-transformed target,
# strongest positive first.
correlations = corr_matrix[['log_claim_severity']].sort_values(by='log_claim_severity', ascending=False)

# Plot correlation heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, cmap='coolwarm', center=0, annot=False)
plt.title('Correlation Heatmap')
plt.show()

# For the printed ranking, leave out the target itself (correlation 1.0 by
# definition) and the raw claim amount it was derived from; otherwise they
# take the top slots without saying anything about the predictors.
# `correlations` itself is left untouched.
feature_correlations = correlations.drop(
    index=['log_claim_severity', 'claim_severity'], errors='ignore'
)

# Display top correlations with claim severity
print("Top correlations with claim severity:")
print(feature_correlations.head(10))
# The original literal was split across two physical lines (a lost "\n"
# escape), which is a SyntaxError; the newline is restored as an escape.
print("\nBottom correlations with claim severity:")
print(feature_correlations.tail(10))

## 4. Interaction Effects

In [None]:
# Interaction plots: the log-transformed target split by one grouping column
# on the x-axis and a second one as the hue.
# Each spec is (x column, hue column, plot title, x-axis label, legend title).
interaction_views = [
    # Age Group vs Vehicle Type
    ('age_group', 'vehicle_type',
     'Claim Severity by Age Group and Vehicle Type',
     'Age Group', 'Vehicle Type'),
    # Vehicle Age vs Vehicle Make
    ('vehicle_age_group', 'vehicle_make',
     'Claim Severity by Vehicle Age and Make',
     'Vehicle Age Group', 'Vehicle Make'),
]

for x_col, hue_col, title, x_label, legend_title in interaction_views:
    fig, ax = plt.subplots(figsize=(15, 6))
    sns.boxplot(x=x_col, y='log_claim_severity', hue=hue_col, data=df, ax=ax)
    ax.set(title=title, xlabel=x_label, ylabel='Log(Claim Amount)')
    ax.legend(title=legend_title)
    plt.show()

## 5. Summary of Key Insights

### Target Variable Distribution
- The claim severity distribution is positively skewed (a long right tail of large claims)
- Log transformation (`log1p`) reduces the skew and gives a more symmetric distribution

### High-Risk Factors
1. **Age Groups**:
   - Young drivers (18-25) have higher claim severity
   - Claim severity generally decreases with age

2. **Vehicle Factors**:
   - Sports cars have the highest claim severity
   - High-end vehicles have markedly higher claim severity
   - Older vehicles tend to have higher claim severity

3. **Interaction Effects**:
   - Young drivers with sports cars have the highest risk
   - High-end vehicles across all age groups have elevated risk

### Recommendations for Feature Engineering
1. Create interaction features:
   - age_group × vehicle_type
   - vehicle_age_group × vehicle_make
   - age_group × vehicle_make

2. Consider polynomial features for:
   - driver_age
   - vehicle_age
   - past_claims

3. Use log-transformed claim severity as the target variable

These insights will help guide our feature engineering and model selection process.