In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
from datetime import datetime
from scipy import stats
from scipy.stats import chi2_contingency, normaltest, skew, kurtosis

# Configure display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# The bare 'seaborn' style name was deprecated in matplotlib 3.6 and later
# removed in favour of 'seaborn-v0_8'. Pick whichever name this matplotlib
# installation actually provides so the cell runs on both old and new versions,
# and fall back to matplotlib's default style if neither is present.
seaborn_style = next(
    (name for name in ('seaborn-v0_8', 'seaborn') if name in plt.style.available),
    'default',
)
plt.style.use(seaborn_style)
sns.set_palette("husl")

# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation notices from pandas/plotly. Consider narrowing it to
# specific categories once the notebook is stable.
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")


✅ Libraries imported successfully!


In [None]:
# Read the raw insurance extract into the notebook-wide frame `df`.
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_csv('../data/raw/insurance_data.csv')

# Structural summary: dimensions, number of columns per dtype, and the
# deep (object-contents-included) memory footprint converted to megabytes.
dtype_counts = df.dtypes.value_counts().to_dict()
memory_mb = df.memory_usage(deep=True).sum() / 1024**2
print(f"Dataset Shape: {df.shape}")
print(f"Data Types: {dtype_counts}")
print(f"Memory Usage: {memory_mb:.2f} MB")

# Preview the leading records with the rich notebook renderer
print("\nFirst 5 rows of the dataset:")
display(df.head())

# Per-column overview: non-null counts and dtypes
print("\nDataset Information:")
df.info()


📊 Dataset Shape: (10000, 19)
📅 Data Types: {dtype('O'): 9, dtype('int64'): 5, dtype('float64'): 5}
💾 Memory Usage: 6.23 MB

🔍 First 5 rows of the dataset:


Unnamed: 0,PolicyID,Province,PostalCode,ZipCode,Gender,Age,VehicleType,VehicleMake,VehicleYear,CoverType,TotalPremium,TotalClaims,CustomValueEstimate,TransactionMonth,TransactionDate,LossRatio,HasClaim,VehicleAge,ClaimFrequency
0,POL1000000,Quebec,U1A,46048,Female,46.0,Sedan,Ford,2016,Basic,733.74,0.0,3121.34,2022-11,2022-11-05,0.0,0,7,0
1,POL1000001,Alberta,H3E,23434,Female,38.3,Sedan,Mazda,2014,Premium,877.52,1926.78,2408.48,2023-05,2023-05-26,2.195711,1,9,1
2,POL1000002,Quebec,V8C,87397,Male,34.9,Coupe,Ford,2021,Premium,1186.29,0.0,11460.12,2022-01,2022-01-21,0.0,0,2,0
3,POL1000003,Ontario,N0A,22280,Male,31.1,Van,Honda,2019,Basic,779.73,0.0,8589.55,2023-03,2023-03-31,0.0,0,4,0
4,POL1000004,Quebec,G3Q,88907,Male,53.6,Sedan,Chevrolet,2011,Basic,621.76,2464.99,3081.24,2022-06,2022-06-11,3.964536,1,12,1



📋 Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   PolicyID             10000 non-null  object 
 1   Province             10000 non-null  object 
 2   PostalCode           10000 non-null  object 
 3   ZipCode              10000 non-null  int64  
 4   Gender               10000 non-null  object 
 5   Age                  10000 non-null  float64
 6   VehicleType          10000 non-null  object 
 7   VehicleMake          10000 non-null  object 
 8   VehicleYear          10000 non-null  int64  
 9   CoverType            10000 non-null  object 
 10  TotalPremium         10000 non-null  float64
 11  TotalClaims          10000 non-null  float64
 12  CustomValueEstimate  10000 non-null  float64
 13  TransactionMonth     10000 non-null  object 
 14  TransactionDate      10000 non-null  object 
 15  LossRatio    

In [None]:
# Data Quality Assessment
print("🔍 Missing Values Analysis:")

# Null count per column, and the same count expressed as a share of all rows.
# The count is computed once and reused for the percentage.
missing_data = df.isnull().sum()
missing_percentage = (missing_data / len(df)) * 100
missing_summary = pd.DataFrame({
    'Missing Count': missing_data,
    'Missing Percentage': missing_percentage
}).sort_values('Missing Count', ascending=False)

# Only print the table when at least one column has gaps; otherwise an
# "Empty DataFrame" repr would be printed right before the success message.
columns_with_missing = missing_summary[missing_summary['Missing Count'] > 0]
if columns_with_missing.empty:
    print("No missing values found in the dataset!")
else:
    print(columns_with_missing)

# Check for duplicate records (rows identical across every column)
duplicates = df.duplicated().sum()
print(f"\nDuplicate Records: {duplicates}")

# Data type verification: dtype and cardinality of every column, one line each
print("\nData Types Verification:")
for col in df.columns:
    dtype = df[col].dtype
    unique_count = df[col].nunique()
    print(f"{col:20} | {str(dtype):12} | Unique Values: {unique_count}")


🔍 Missing Values Analysis:
Empty DataFrame
Columns: [Missing Count, Missing Percentage]
Index: []
✅ No missing values found in the dataset!

🔄 Duplicate Records: 0

📊 Data Types Verification:
PolicyID             | object       | Unique Values: 10000
Province             | object       | Unique Values: 9
PostalCode           | object       | Unique Values: 5218
ZipCode              | int64        | Unique Values: 9474
Gender               | object       | Unique Values: 3
Age                  | float64      | Unique Values: 557
VehicleType          | object       | Unique Values: 7
VehicleMake          | object       | Unique Values: 12
VehicleYear          | int64        | Unique Values: 13
CoverType            | object       | Unique Values: 3
TotalPremium         | float64      | Unique Values: 9439
TotalClaims          | float64      | Unique Values: 1476
CustomValueEstimate  | float64      | Unique Values: 9868
TransactionMonth     | object       | Unique Values: 19
TransactionDat

In [None]:
# Descriptive Statistics for Numerical Variables
print("Descriptive Statistics for Numerical Variables:")

# Restrict the frame to its numeric columns once and reuse that view below
numerical_cols = df.select_dtypes(include=[np.number]).columns
numeric_frame = df[numerical_cols]

# count / mean / std / min / quartiles / max, one column per numeric variable
desc_stats = numeric_frame.describe()

# Shape and spread measures that describe() does not report.
# Built with one row per variable, rounded to four decimals.
shape_measures = {
    'skewness': numeric_frame.skew(),
    'kurtosis': numeric_frame.kurtosis(),
    'variance': numeric_frame.var(),
}
additional_stats = pd.DataFrame(shape_measures).round(4)

# Transpose the extra measures so they line up with describe()'s layout
# (statistics as rows, variables as columns) and stack them underneath.
summary_blocks = [desc_stats.round(2), additional_stats.T]
full_stats = pd.concat(summary_blocks, axis=0)
display(full_stats)


📈 Descriptive Statistics for Numerical Variables:


Unnamed: 0,ZipCode,Age,VehicleYear,TotalPremium,TotalClaims,CustomValueEstimate,LossRatio,HasClaim,VehicleAge,ClaimFrequency
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,55321.62,40.26,2015.99,975.67,323.53,6554.6,0.36,0.15,7.01,0.15
std,26201.49,11.48,3.73,302.63,1021.59,5261.59,1.17,0.36,3.73,0.36
min,10007.0,18.0,2010.0,426.09,0.0,2400.12,0.0,0.0,1.0,0.0
25%,31979.5,32.2,2013.0,759.08,0.0,3085.33,0.0,0.0,4.0,0.0
50%,55905.5,40.2,2016.0,917.83,0.0,3610.29,0.0,0.0,7.0,0.0
75%,78145.25,48.0,2019.0,1123.34,0.0,9096.95,0.0,0.0,10.0,0.0
max,99998.0,80.0,2022.0,2913.15,12754.01,25485.12,18.98,1.0,13.0,1.0
skewness,-0.0224,0.1738,0.0055,1.2763,4.6926,1.4342,5.0658,1.9271,-0.0055,1.9271
kurtosis,-1.2256,-0.2462,-1.2053,2.4637,29.9037,0.935,37.4892,1.7139,-1.2053,1.7139


In [None]:
def _segment_loss_ratio(frame, segment_col, segment_label=None):
    """Summarise claims, premium, policy count and loss ratio per segment.

    Parameters
    ----------
    frame : pandas.DataFrame
        Policy-level data containing `segment_col`, 'TotalClaims',
        'TotalPremium' and 'PolicyID'.
    segment_col : str
        Column to group by.
    segment_label : str, optional
        Name for the segment column in the result; defaults to `segment_col`.

    Returns
    -------
    pandas.DataFrame
        Columns [segment, 'Total_Claims', 'Total_Premium', 'Policy_Count',
        'Loss_Ratio'], sorted by 'Loss_Ratio' descending. The loss ratio is
        summed claims divided by summed premium within each segment.
    """
    summary = (
        frame.groupby(segment_col)
        # Named aggregation assigns output column names explicitly instead of
        # relying on a positional rename after the fact.
        .agg(
            Total_Claims=('TotalClaims', 'sum'),
            Total_Premium=('TotalPremium', 'sum'),
            Policy_Count=('PolicyID', 'count'),
        )
        .reset_index()
        .rename(columns={segment_col: segment_label or segment_col})
    )
    summary['Loss_Ratio'] = summary['Total_Claims'] / summary['Total_Premium']
    return summary.sort_values('Loss_Ratio', ascending=False)


# Calculate overall Loss Ratio (total claims over total premium for the whole book)
overall_loss_ratio = df['TotalClaims'].sum() / df['TotalPremium'].sum()
print(f"Overall Portfolio Loss Ratio: {overall_loss_ratio:.4f} ({overall_loss_ratio*100:.2f}%)")

# Calculate Loss Ratio by different segments
print("\nLoss Ratio Analysis by Segments:")

# By Province
province_loss_ratio = _segment_loss_ratio(df, 'Province')
print("\nLoss Ratio by Province:")
display(province_loss_ratio.round(4))

# By Vehicle Type
vehicle_loss_ratio = _segment_loss_ratio(df, 'VehicleType', 'Vehicle_Type')
print("\nLoss Ratio by Vehicle Type:")
display(vehicle_loss_ratio.round(4))

# By Gender
gender_loss_ratio = _segment_loss_ratio(df, 'Gender')
print("\nLoss Ratio by Gender:")
display(gender_loss_ratio.round(4))


📊 Overall Portfolio Loss Ratio: 0.3316 (33.16%)

🎯 Loss Ratio Analysis by Segments:

📍 Loss Ratio by Province:


Unnamed: 0,Province,Total_Claims,Total_Premium,Policy_Count,Loss_Ratio
3,New Brunswick,88453.37,169155.24,209,0.5229
8,Saskatchewan,110319.84,235356.42,308,0.4687
7,Quebec,797060.97,2005443.37,2258,0.3974
5,Nova Scotia,93717.68,243684.12,281,0.3846
2,Manitoba,110304.2,300062.95,374,0.3676
1,British Columbia,411907.83,1339814.61,1341,0.3074
6,Ontario,1291044.68,4230264.88,3901,0.3052
0,Alberta,315845.41,1167833.81,1233,0.2705
4,Newfoundland,16657.3,65119.71,95,0.2558



🚗 Loss Ratio by Vehicle Type:


Unnamed: 0,Vehicle_Type,Total_Claims,Total_Premium,Policy_Count,Loss_Ratio
3,SUV,888033.64,2444935.53,2469,0.3632
5,Truck,315665.42,892779.4,801,0.3536
6,Van,168994.0,483280.33,477,0.3497
4,Sedan,1044827.56,3152625.24,3496,0.3314
2,Hatchback,430914.72,1327489.81,1551,0.3246
1,Coupe,258528.76,946913.7,802,0.273
0,Convertible,128347.18,508711.1,404,0.2523



👥 Loss Ratio by Gender:


Unnamed: 0,Gender,Total_Claims,Total_Premium,Policy_Count,Loss_Ratio
2,Other,68002.77,189023.02,207,0.3598
1,Male,1642070.58,4904246.67,4772,0.3348
0,Female,1525237.93,4663465.42,5021,0.3271


In [None]:
# Distribution Analysis of Key Financial Variables
financial_vars = ['TotalPremium', 'TotalClaims', 'CustomValueEstimate']

print("Distribution Analysis of Financial Variables:")
for var in financial_vars:
    print(f"\n--- {var} ---")
    data = df[var]

    # Basic statistics
    print(f"Mean: ${data.mean():,.2f}")
    print(f"Median: ${data.median():,.2f}")
    print(f"Std Dev: ${data.std():,.2f}")
    # scipy defaults to the biased (population) estimators, whereas pandas'
    # .skew()/.kurtosis() used in the descriptive-statistics table are
    # bias-corrected. bias=False makes both sections report the same numbers.
    print(f"Skewness: {skew(data, bias=False):.4f}")
    print(f"Kurtosis: {kurtosis(data, bias=False):.4f}")

    # Test for normality (D'Agostino-Pearson omnibus test on skew and kurtosis)
    stat, p_value = normaltest(data)
    print(f"Normality Test (D'Agostino): p-value = {p_value:.6f}")
    if p_value < 0.05:
        print("Data is NOT normally distributed")
    else:
        print("Data appears normally distributed")

# Outlier Detection using IQR method: values beyond 1.5 * IQR from the quartiles
print("\nOutlier Detection (IQR Method):")
for var in financial_vars:
    data = df[var]
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers = data[(data < lower_bound) | (data > upper_bound)]
    outlier_percentage = (len(outliers) / len(data)) * 100

    print(f"\n{var}:")
    print(f"  Lower Bound: ${lower_bound:,.2f}")
    print(f"  Upper Bound: ${upper_bound:,.2f}")
    print(f"  Outliers: {len(outliers)} ({outlier_percentage:.2f}%)")
    if IQR == 0:
        # With identical quartiles both fences collapse onto a single value,
        # so every observation that differs from it is counted as an outlier.
        # Flag this so the count is not mistaken for genuine extremes.
        print("  Note: IQR is 0, so every value different from the quartile value is flagged")
    if len(outliers) > 0:
        print(f"  Max Outlier: ${outliers.max():,.2f}")
        print(f"  Min Outlier: ${outliers.min():,.2f}")


📈 Distribution Analysis of Financial Variables:

--- TotalPremium ---
Mean: $975.67
Median: $917.83
Std Dev: $302.63
Skewness: 1.2761
Kurtosis: 2.4619
Normality Test (D'Agostino): p-value = 0.000000
❌ Data is NOT normally distributed

--- TotalClaims ---
Mean: $323.53
Median: $0.00
Std Dev: $1,021.59
Skewness: 4.6919
Kurtosis: 29.8882
Normality Test (D'Agostino): p-value = 0.000000
❌ Data is NOT normally distributed

--- CustomValueEstimate ---
Mean: $6,554.60
Median: $3,610.29
Std Dev: $5,261.59
Skewness: 1.4340
Kurtosis: 0.9339
Normality Test (D'Agostino): p-value = 0.000000
❌ Data is NOT normally distributed

🎯 Outlier Detection (IQR Method):

TotalPremium:
  Lower Bound: $212.70
  Upper Bound: $1,669.73
  Outliers: 320 (3.20%)
  Max Outlier: $2,913.15
  Min Outlier: $1,669.74

TotalClaims:
  Lower Bound: $0.00
  Upper Bound: $0.00
  Outliers: 1531 (15.31%)
  Max Outlier: $12,754.01
  Min Outlier: $100.00

CustomValueEstimate:
  Lower Bound: $-5,932.11
  Upper Bound: $18,114.39
  Ou

In [None]:
# Temporal Analysis
print("Temporal Trends Analysis:")

# Parse the transaction date (a no-op if it is already datetime) and derive a
# monthly period key; both columns are written back onto the shared frame `df`.
df['TransactionDate'] = pd.to_datetime(df['TransactionDate'])
df['Year_Month'] = df['TransactionDate'].dt.to_period('M')

# One row per month. Named aggregation yields the flat "<column>_<function>"
# column names directly, with aggregates rounded to two decimals.
monthly_stats = (
    df.groupby('Year_Month')
    .agg(
        TotalPremium_sum=('TotalPremium', 'sum'),
        TotalPremium_mean=('TotalPremium', 'mean'),
        TotalPremium_count=('TotalPremium', 'count'),
        TotalClaims_sum=('TotalClaims', 'sum'),
        TotalClaims_mean=('TotalClaims', 'mean'),
        HasClaim_sum=('HasClaim', 'sum'),
        HasClaim_mean=('HasClaim', 'mean'),
        CustomValueEstimate_mean=('CustomValueEstimate', 'mean'),
    )
    .round(2)
    .reset_index()
)

# Derived metrics:
#   Claim_Frequency     - claims per policy in the month, as a percentage
#   Avg_Claim_Severity  - claim amount per claim
#   Loss_Ratio          - claim amount per unit of premium
# Missing ratios (e.g. 0/0 in a month without claims) are replaced by 0.
claims_in_month = monthly_stats['HasClaim_sum']
claim_amount_in_month = monthly_stats['TotalClaims_sum']
monthly_stats['Claim_Frequency'] = claims_in_month / monthly_stats['TotalPremium_count'] * 100
monthly_stats['Avg_Claim_Severity'] = (claim_amount_in_month / claims_in_month).fillna(0)
monthly_stats['Loss_Ratio'] = (claim_amount_in_month / monthly_stats['TotalPremium_sum']).fillna(0)

print("Monthly Trends Summary:")
display(monthly_stats.round(2))

# Report the spread of each derived metric across months
frequency_series = monthly_stats['Claim_Frequency']
severity_series = monthly_stats['Avg_Claim_Severity']
loss_ratio_series = monthly_stats['Loss_Ratio']

print("\nTrend Analysis:")
print(f"Claim Frequency Range: {frequency_series.min():.2f}% - {frequency_series.max():.2f}%")
print(f"Average Claim Severity Range: ${severity_series.min():,.2f} - ${severity_series.max():,.2f}")
print(f"Loss Ratio Range: {loss_ratio_series.min():.4f} - {loss_ratio_series.max():.4f}")

# Correlate each metric with a running month index (1, 2, 3, ...) as a simple
# linear-trend indicator; rows are in the order produced by the groupby.
monthly_stats['Month_Number'] = range(1, len(monthly_stats) + 1)
month_index = monthly_stats['Month_Number']
freq_correlation = monthly_stats['Claim_Frequency'].corr(month_index)
severity_correlation = monthly_stats['Avg_Claim_Severity'].corr(month_index)

print(f"\nCorrelation with Time:")
print(f"Claim Frequency vs Time: {freq_correlation:.4f}")
print(f"Claim Severity vs Time: {severity_correlation:.4f}")

# Call out a trend only when the correlation magnitude exceeds 0.3
if abs(freq_correlation) > 0.3:
    trend_direction = "decreasing" if freq_correlation <= 0 else "increasing"
    print(f"Notable {trend_direction} trend in claim frequency over time")
if abs(severity_correlation) > 0.3:
    trend_direction = "decreasing" if severity_correlation <= 0 else "increasing"
    print(f"Notable {trend_direction} trend in claim severity over time")


📅 Temporal Trends Analysis:
📈 Monthly Trends Summary:


Unnamed: 0,Year_Month,TotalPremium_sum,TotalPremium_mean,TotalPremium_count,TotalClaims_sum,TotalClaims_mean,HasClaim_sum,HasClaim_mean,CustomValueEstimate_mean,Claim_Frequency,Avg_Claim_Severity,Loss_Ratio
0,2022-01,543995.89,981.94,554,203999.07,368.23,90,0.16,6624.5,16.25,2266.66,0.38
1,2022-02,492779.69,985.56,500,166516.62,333.03,79,0.16,6535.95,15.8,2107.81,0.34
2,2022-03,615333.17,973.63,632,218729.92,346.09,105,0.17,6623.56,16.61,2083.14,0.36
3,2022-04,545878.61,992.51,550,154555.69,281.01,76,0.14,6316.53,13.82,2033.63,0.28
4,2022-05,536020.11,964.06,556,161560.47,290.58,82,0.15,6465.68,14.75,1970.25,0.3
5,2022-06,527603.65,966.31,546,188306.92,344.88,91,0.17,6628.72,16.67,2069.31,0.36
6,2022-07,577277.92,968.59,596,170016.46,285.26,85,0.14,6606.92,14.26,2000.19,0.29
7,2022-08,548995.45,968.25,567,178018.37,313.97,86,0.15,6922.67,15.17,2069.98,0.32
8,2022-09,508684.18,983.92,517,147761.2,285.81,83,0.16,6155.16,16.05,1780.26,0.29
9,2022-10,552536.25,969.36,570,173124.11,303.73,86,0.15,6397.95,15.09,2013.07,0.31



🔍 Trend Analysis:
Claim Frequency Range: 8.33% - 17.16%
Average Claim Severity Range: $1,768.56 - $2,542.99
Loss Ratio Range: 0.2266 - 0.4256

📊 Correlation with Time:
Claim Frequency vs Time: -0.4220
Claim Severity vs Time: 0.2578
🔺 Notable decreasing trend in claim frequency over time


In [None]:
# Vehicle Analysis
print("Vehicle Analysis - Claims by Make and Type:")

# Analysis by Vehicle Make
# Aggregates are kept at full precision. Rounding them here (as was done
# before) truncated HasClaim_mean to two decimals, which made every derived
# claim-frequency percentage a whole number. Rounding now happens only when
# the tables are displayed.
vehicle_make_analysis = df.groupby('VehicleMake').agg({
    'TotalClaims': ['sum', 'mean', 'count'],
    'TotalPremium': ['sum', 'mean'],
    'HasClaim': ['sum', 'mean'],
    'CustomValueEstimate': 'mean'
})

# Flatten the two-level column index into "<column>_<function>" names
vehicle_make_analysis.columns = ['_'.join(col).strip() for col in vehicle_make_analysis.columns]
vehicle_make_analysis = vehicle_make_analysis.reset_index()

# Calculate additional metrics
# Avg_Claim_Amount is the claim amount per claim (not per policy).
vehicle_make_analysis['Avg_Claim_Amount'] = vehicle_make_analysis['TotalClaims_sum'] / vehicle_make_analysis['HasClaim_sum']
vehicle_make_analysis['Claim_Frequency_Pct'] = vehicle_make_analysis['HasClaim_mean'] * 100
vehicle_make_analysis['Loss_Ratio'] = vehicle_make_analysis['TotalClaims_sum'] / vehicle_make_analysis['TotalPremium_sum']

# Handle division by zero: a make without claims gets an average claim of 0
vehicle_make_analysis['Avg_Claim_Amount'] = vehicle_make_analysis['Avg_Claim_Amount'].fillna(0)

# Sort by different metrics
print("Vehicle Makes Ranked by Average Claim Amount:")
top_claim_makes = vehicle_make_analysis.sort_values('Avg_Claim_Amount', ascending=False)
display(top_claim_makes[['VehicleMake', 'TotalClaims_count', 'Avg_Claim_Amount', 'Claim_Frequency_Pct', 'Loss_Ratio']].round(2))

print("\nTop 3 Highest Risk Vehicle Makes:")
for _, row in top_claim_makes.head(3).iterrows():
    print(f"{row['VehicleMake']}: Avg Claim ${row['Avg_Claim_Amount']:,.2f}, Frequency {row['Claim_Frequency_Pct']:.1f}%")

print("\nTop 3 Lowest Risk Vehicle Makes:")
# Drop makes without any claims *before* taking the bottom three, so the list
# still contains three makes when some makes have no claims at all.
makes_with_claims = top_claim_makes[top_claim_makes['Avg_Claim_Amount'] > 0]
for _, row in makes_with_claims.tail(3).iterrows():
    print(f"{row['VehicleMake']}: Avg Claim ${row['Avg_Claim_Amount']:,.2f}, Frequency {row['Claim_Frequency_Pct']:.1f}%")

# Vehicle Type Analysis (same metrics, grouped by body type; unrounded for the
# same reason as above)
print("\n\n🚙 Vehicle Types Analysis:")
vehicle_type_analysis = df.groupby('VehicleType').agg({
    'TotalClaims': ['sum', 'mean'],
    'HasClaim': ['sum', 'mean'],
    'TotalPremium': 'sum',
    'CustomValueEstimate': 'mean'
})

vehicle_type_analysis.columns = ['_'.join(col).strip() for col in vehicle_type_analysis.columns]
vehicle_type_analysis = vehicle_type_analysis.reset_index()
vehicle_type_analysis['Avg_Claim_Amount'] = vehicle_type_analysis['TotalClaims_sum'] / vehicle_type_analysis['HasClaim_sum']
vehicle_type_analysis['Claim_Frequency_Pct'] = vehicle_type_analysis['HasClaim_mean'] * 100
vehicle_type_analysis = vehicle_type_analysis.sort_values('Avg_Claim_Amount', ascending=False)

display(vehicle_type_analysis[['VehicleType', 'Avg_Claim_Amount', 'Claim_Frequency_Pct', 'CustomValueEstimate_mean']].round(2))


🚗 Vehicle Analysis - Claims by Make and Type:
📊 Vehicle Makes Ranked by Average Claim Amount:


Unnamed: 0,VehicleMake,TotalClaims_count,Avg_Claim_Amount,Claim_Frequency_Pct,Loss_Ratio
8,Mercedes,829,2414.79,15.0,0.38
5,Hyundai,830,2252.02,12.0,0.28
11,Volkswagen,850,2183.09,17.0,0.39
9,Nissan,829,2168.28,18.0,0.39
6,Kia,796,2146.76,16.0,0.35
3,Ford,842,2122.65,15.0,0.32
7,Mazda,843,2103.1,14.0,0.3
1,BMW,871,2042.92,16.0,0.34
0,Audi,858,2041.68,16.0,0.33
10,Toyota,839,2034.21,14.0,0.29



🏆 Top 3 Highest Risk Vehicle Makes:
Mercedes: Avg Claim $2,414.79, Frequency 15.0%
Hyundai: Avg Claim $2,252.02, Frequency 12.0%
Volkswagen: Avg Claim $2,183.09, Frequency 17.0%

✅ Top 3 Lowest Risk Vehicle Makes:
Toyota: Avg Claim $2,034.21, Frequency 14.0%
Chevrolet: Avg Claim $1,954.45, Frequency 16.0%
Honda: Avg Claim $1,906.90, Frequency 16.0%


🚙 Vehicle Types Analysis:


Unnamed: 0,VehicleType,Avg_Claim_Amount,Claim_Frequency_Pct,CustomValueEstimate_mean
5,Truck,2447.02,16.0,7769.09
3,SUV,2265.39,16.0,7527.52
6,Van,2194.73,16.0,6166.68
1,Coupe,2136.6,15.0,6239.96
0,Convertible,2070.12,15.0,6352.67
4,Sedan,1971.37,15.0,6010.79
2,Hatchback,1958.7,14.0,5938.99


In [None]:
# VISUALIZATION 1: Interactive Loss Ratio Heatmap by Province and Vehicle Type
print("Creating Visualization 1: Interactive Loss Ratio Heatmap")

# Sum claims and premium for every (province, vehicle type) pair ...
loss_ratio_pivot = (
    df.groupby(['Province', 'VehicleType'])[['TotalClaims', 'TotalPremium']]
    .sum()
    .reset_index()
)

# ... take their ratio, and reshape to a province x vehicle-type grid.
# Pairs that never occur in the data end up as NaN cells.
loss_ratio_pivot['Loss_Ratio'] = loss_ratio_pivot['TotalClaims'] / loss_ratio_pivot['TotalPremium']
heatmap_data = loss_ratio_pivot.pivot(index='Province', columns='VehicleType', values='Loss_Ratio')

# Build the Plotly heatmap trace: vehicle types on x, provinces on y,
# loss ratio as colour; hover text is suppressed on empty cells.
heatmap_trace = go.Heatmap(
    z=heatmap_data.values,
    x=heatmap_data.columns,
    y=heatmap_data.index,
    colorscale='RdYlBu_r',
    colorbar=dict(title="Loss Ratio"),
    hoverongaps=False,
    hovertemplate='Province: %{y}<br>Vehicle Type: %{x}<br>Loss Ratio: %{z:.4f}<extra></extra>'
)
fig1 = go.Figure(data=heatmap_trace)

# Centered title, axis captions and a fixed canvas size
title_settings = {
    'text': 'Loss Ratio Heatmap: Risk Patterns by Province & Vehicle Type',
    'x': 0.5,
    'xanchor': 'center',
    'font': {'size': 18}
}
fig1.update_layout(
    title=title_settings,
    xaxis_title="Vehicle Type",
    yaxis_title="Province",
    font=dict(size=12),
    height=600,
    width=900
)

fig1.show()
print("Heatmap created - Shows risk concentration patterns across geography and vehicle types")


🎨 Creating Visualization 1: Interactive Loss Ratio Heatmap


✅ Heatmap created - Shows risk concentration patterns across geography and vehicle types


In [None]:
# VISUALIZATION 2: Multi-Dimensional Bubble Chart - Risk vs Profitability
print("Creating Visualization 2: Risk vs Profitability Bubble Chart")

# One row per vehicle make: claim/premium totals, share of policies with a
# claim, policy count, and mean vehicle value estimate.
make_aggregations = {
    'TotalClaims': 'sum',
    'TotalPremium': 'sum',
    'HasClaim': 'mean',
    'PolicyID': 'count',
    'CustomValueEstimate': 'mean'
}
bubble_data = df.groupby('VehicleMake').agg(make_aggregations).reset_index()

# Plot coordinates: loss ratio (y), claim frequency in percent (x),
# and policy count (bubble size).
bubble_data = bubble_data.assign(
    Loss_Ratio=bubble_data['TotalClaims'] / bubble_data['TotalPremium'],
    Claim_Frequency=bubble_data['HasClaim'] * 100,
    Portfolio_Size=bubble_data['PolicyID'],
)

# Number formats for the hover box and human-readable axis/legend captions
hover_formats = {
    'Claim_Frequency': ':.2f',
    'Loss_Ratio': ':.4f',
    'Portfolio_Size': ':,',
    'CustomValueEstimate': ':,.0f'
}
axis_labels = {
    'Claim_Frequency': 'Claim Frequency (%)',
    'Loss_Ratio': 'Loss Ratio',
    'Portfolio_Size': 'Number of Policies',
    'CustomValueEstimate': 'Avg Vehicle Value ($)'
}

# Create bubble chart
fig2 = px.scatter(
    bubble_data,
    x='Claim_Frequency',
    y='Loss_Ratio',
    size='Portfolio_Size',
    color='CustomValueEstimate',
    hover_name='VehicleMake',
    hover_data=hover_formats,
    labels=axis_labels,
    title='Risk vs Profitability: Vehicle Make Analysis<br><sub>Size = Portfolio Size, Color = Average Vehicle Value</sub>',
    color_continuous_scale='Viridis'
)

# Quadrant guides at the unweighted mean (across makes) of each axis metric
avg_freq = bubble_data['Claim_Frequency'].mean()
avg_loss = bubble_data['Loss_Ratio'].mean()
guide_style = dict(line_dash="dash", line_color="red")

fig2.add_hline(y=avg_loss, annotation_text="Average Loss Ratio",
               annotation_position="right", **guide_style)
fig2.add_vline(x=avg_freq, annotation_text="Average Claim Frequency",
               annotation_position="top", **guide_style)

# Canvas size and centered title
fig2.update_layout(
    width=1000,
    height=700,
    title_font_size=16,
    title_x=0.5
)

fig2.show()
print("Bubble chart created - Reveals risk-profitability relationships and portfolio concentration")


🎨 Creating Visualization 2: Risk vs Profitability Bubble Chart


✅ Bubble chart created - Reveals risk-profitability relationships and portfolio concentration


In [None]:
# VISUALIZATION 3: Temporal Evolution Dashboard with Multiple Metrics
# Requires `monthly_stats` (built in the temporal-analysis cell) and `df`.
print("🎨 Creating Visualization 3: Temporal Evolution Dashboard")

# Create a 2x2 grid; the left-hand panels get a secondary y-axis so two series
# on different scales can share one panel.
fig3 = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Monthly Claim Frequency Trend', 'Monthly Loss Ratio Evolution',
                   'Premium vs Claims Volume', 'Risk Score by Vehicle Age'),
    specs=[[{"secondary_y": True}, {"secondary_y": False}],
           [{"secondary_y": True}, {"secondary_y": False}]],
    vertical_spacing=0.12,
    horizontal_spacing=0.1
)

# Convert Year_Month to string for plotting (adds a column to monthly_stats)
monthly_stats['Month_Str'] = monthly_stats['Year_Month'].astype(str)

# Plot 1: Claim Frequency Trend (line, primary axis) with policy volume
# (bars, secondary axis)
fig3.add_trace(
    go.Scatter(x=monthly_stats['Month_Str'], y=monthly_stats['Claim_Frequency'],
               mode='lines+markers', name='Claim Frequency (%)',
               line=dict(color='red', width=3)),
    row=1, col=1
)
fig3.add_trace(
    go.Bar(x=monthly_stats['Month_Str'], y=monthly_stats['TotalPremium_count'],
           name='Policy Count', opacity=0.6, marker_color='lightblue'),
    row=1, col=1, secondary_y=True
)

# Plot 2: Loss Ratio Evolution
# This is the only trace in its panel, so there is no "next" trace to fill
# towards; 'tozeroy' states the intended area-under-the-line fill explicitly.
fig3.add_trace(
    go.Scatter(x=monthly_stats['Month_Str'], y=monthly_stats['Loss_Ratio'],
               mode='lines+markers', name='Loss Ratio',
               line=dict(color='orange', width=3), fill='tozeroy'),
    row=1, col=2
)

# Plot 3: Premium (primary axis) vs Claims (secondary axis) Volume
fig3.add_trace(
    go.Scatter(x=monthly_stats['Month_Str'], y=monthly_stats['TotalPremium_sum'],
               mode='lines+markers', name='Premium Volume',
               line=dict(color='green', width=2)),
    row=2, col=1
)
fig3.add_trace(
    go.Scatter(x=monthly_stats['Month_Str'], y=monthly_stats['TotalClaims_sum'],
               mode='lines+markers', name='Claims Volume',
               line=dict(color='red', width=2)),
    row=2, col=1, secondary_y=True
)

# Plot 4: Risk Score by Vehicle Age
age_risk = df.groupby('VehicleAge').agg({
    'HasClaim': 'mean',
    'TotalClaims': 'mean'
}).reset_index()
# NOTE(review): the mean of TotalClaims is taken over all policies, so it
# already reflects how often claims occur; multiplying by the claim rate
# weights frequency twice. Confirm this is the intended score definition.
age_risk['Risk_Score'] = age_risk['HasClaim'] * age_risk['TotalClaims']

fig3.add_trace(
    go.Bar(x=age_risk['VehicleAge'], y=age_risk['Risk_Score'],
           name='Risk Score', marker_color='purple'),
    row=2, col=2
)

# Update layout
fig3.update_layout(
    title_text="Insurance Analytics Dashboard: Temporal & Risk Evolution",
    title_x=0.5,
    title_font_size=18,
    height=800,
    width=1200,
    showlegend=True
)

# Update y-axis labels
fig3.update_yaxes(title_text="Claim Frequency (%)", row=1, col=1)
fig3.update_yaxes(title_text="Policy Count", row=1, col=1, secondary_y=True)
fig3.update_yaxes(title_text="Loss Ratio", row=1, col=2)
fig3.update_yaxes(title_text="Premium Volume ($)", row=2, col=1)
fig3.update_yaxes(title_text="Claims Volume ($)", row=2, col=1, secondary_y=True)
fig3.update_yaxes(title_text="Risk Score", row=2, col=2)

# Update x-axis labels. The bottom-right panel is indexed by vehicle age, not
# by month; the earlier "Month" title for that panel was immediately
# overwritten and has been dropped.
fig3.update_xaxes(title_text="Month", row=2, col=1)
fig3.update_xaxes(title_text="Vehicle Age (Years)", row=2, col=2)

fig3.show()
print("Dashboard created - Comprehensive view of temporal trends and risk factors")


🎨 Creating Visualization 3: Temporal Evolution Dashboard


✅ Dashboard created - Comprehensive view of temporal trends and risk factors
