# In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# --- Data Loading ---
# Read the raw historical claims extract; adjust the path if the file lives elsewhere.
df = pd.read_csv('data/raw/historical_insurance_claims.csv')

# --- Data Summarization ---
# First rows, for a quick look at the columns and some sample values.
print("--- Data Head ---")
print(df.head())

# DataFrame.info() prints its report itself, so it is not wrapped in print().
print("\n--- Data Info ---")
df.info()

# Summary statistics: once with describe()'s defaults, once restricted to object columns.
for heading, describe_kwargs in (
    ("\n--- Data Description (Numerical) ---", {}),
    ("\n--- Data Description (Categorical) ---", {'include': 'object'}),
):
    print(heading)
    print(df.describe(**describe_kwargs))

print("\n--- Data Types Check ---")
print(df.dtypes)

# Parse TransactionDate into datetime values, then derive a monthly period
# column (TransactionMonth) from the parsed dates for month-level grouping.
transaction_dates = pd.to_datetime(df['TransactionDate'])
df['TransactionDate'] = transaction_dates
df['TransactionMonth'] = transaction_dates.dt.to_period('M')


# --- Data Quality Assessment ---
print("\n--- Missing Values ---")
# Count nulls per column, then report only the columns that have at least one.
null_counts = df.isnull().sum()
print(null_counts[null_counts > 0])
# If the counts are significant, decide how to handle them (e.g., imputation or removal).

# --- Univariate Analysis ---
# Histograms with a KDE overlay for TotalPremium and TotalClaims, side by side.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
for ax, column in zip(axes, ('TotalPremium', 'TotalClaims')):
    sns.histplot(df[column], kde=True, ax=ax)
    ax.set_title(f'Distribution of {column}')
fig.tight_layout()
plt.show()

# Row counts per category: Province and VehicleType are drawn horizontally and
# ordered by frequency; Gender is drawn vertically in seaborn's default order.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
province_ax, vehicle_ax, gender_ax = axes
for ax, column in ((province_ax, 'Province'), (vehicle_ax, 'VehicleType')):
    by_frequency = df[column].value_counts().index
    sns.countplot(y=column, data=df, order=by_frequency, ax=ax)
    ax.set_title(f'Count of Policies by {column}')
sns.countplot(x='Gender', data=df, ax=gender_ax)
gender_ax.set_title('Count of Policies by Gender')
fig.tight_layout()
plt.show()


# --- Bivariate/Multivariate Analysis ---
# Per-row loss ratio = TotalClaims / TotalPremium.
# A row with a zero premium has no defined ratio: plain division yields inf
# (or NaN for 0/0), and a single inf makes a group's mean inf/NaN. Zero
# premiums are mapped to NaN first so those rows are skipped by .mean().
df['LossRatio'] = df['TotalClaims'] / df['TotalPremium'].replace(0, np.nan)

# Overall loss ratio: total claims divided by total premium across all rows.
overall_loss_ratio = df['TotalClaims'].sum() / df['TotalPremium'].sum()
print(f"\nOverall Loss Ratio: {overall_loss_ratio:.2f}")

# Mean of the per-row loss ratio within each segment, highest first.
# NOTE: this is an average of row-level ratios, not total claims over total
# premium per segment, so it is not directly comparable to the overall figure.
for segment in ('Province', 'VehicleType', 'Gender'):
    print(f"\nLoss Ratio by {segment}:")
    print(df.groupby(segment)['LossRatio'].mean().sort_values(ascending=False))

# Temporal trends of claim frequency or severity
# Monthly totals of claims and premium, grouped on the TransactionMonth column.
by_month = df.groupby('TransactionMonth')
claims_over_time = by_month['TotalClaims'].sum()
premium_over_time = by_month['TotalPremium'].sum()

fig, ax = plt.subplots(figsize=(12, 6))
for monthly_series, series_label in (
    (claims_over_time, 'Total Claims'),
    (premium_over_time, 'Total Premium'),
):
    # The month index is converted to strings before being used as x values.
    ax.plot(monthly_series.index.astype(str), monthly_series.values, label=series_label)
ax.set_title('Total Claims and Total Premium Over Time')
ax.set_xlabel('Month')
ax.set_ylabel('Amount')
plt.xticks(rotation=45)
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

# Correlations between TotalPremium, TotalClaims as a function of ZipCode (or PostalCode)
# A row-level scatter may be too granular when there are many postal codes, so
# each PostalCode is reduced to one point: the mean premium and the mean claims.
postal_code_summary = (
    df.groupby('PostalCode')
    .agg(
        AvgTotalPremium=pd.NamedAgg(column='TotalPremium', aggfunc='mean'),
        AvgTotalClaims=pd.NamedAgg(column='TotalClaims', aggfunc='mean'),
    )
    .reset_index()
)

fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(x='AvgTotalPremium', y='AvgTotalClaims', data=postal_code_summary, ax=ax)
ax.set_title('Average Total Premium vs. Average Total Claims by PostalCode')
ax.set_xlabel('Average Total Premium')
ax.set_ylabel('Average Total Claims')
ax.grid(True)
plt.show()

# Pairwise correlations across every numeric column of df.
print("\nCorrelation Matrix (Numerical Features):")
numerical_cols = df.select_dtypes(include=np.number).columns
correlation_matrix = df[numerical_cols].corr()
print(correlation_matrix)

# Which vehicle makes/models are associated with highest/lowest claim amounts
# Total claims per Make, sorted in each direction to take the ten largest
# and the ten smallest totals.
claims_by_make = df.groupby('Make')['TotalClaims'].sum()
top_claim_makes = claims_by_make.sort_values(ascending=False).head(10)
bottom_claim_makes = claims_by_make.sort_values(ascending=True).head(10)
print("\nTop 10 Vehicle Makes by Total Claims:\n", top_claim_makes)
print("\nBottom 10 Vehicle Makes by Total Claims:\n", bottom_claim_makes)

# --- Outlier Detection ---
# Side-by-side box plots of TotalPremium and TotalClaims to expose extreme values.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
for ax, column in zip(axes, ('TotalPremium', 'TotalClaims')):
    sns.boxplot(y=df[column], ax=ax)
    ax.set_title(f'Box Plot of {column}')
fig.tight_layout()
plt.show()

# --- Creative and Beautiful Plots (Example) ---
# Plot 1: Loss Ratio by MainCrestaZone
# Mean of the LossRatio column per MainCrestaZone, highest first, drawn as horizontal bars.
lr_by_cresta = (
    df.groupby('MainCrestaZone')['LossRatio']
    .mean()
    .sort_values(ascending=False)
)
fig, ax = plt.subplots(figsize=(12, 7))
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue` -- confirm against the seaborn version this project pins.
sns.barplot(x=lr_by_cresta.values, y=lr_by_cresta.index, palette='viridis', ax=ax)
ax.set_title('Average Loss Ratio by Main Cresta Zone', fontsize=16)
ax.set_xlabel('Average Loss Ratio', fontsize=12)
ax.set_ylabel('Main Cresta Zone', fontsize=12)
ax.grid(axis='x', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

# Plot 2: Relationship between Vehicle Custom Value Estimate and Total Claims
# Restrict to rows with a positive TotalClaims so the plot shows actual claims only.
has_claim = df['TotalClaims'] > 0
df_claims_only = df[has_claim]

fig, ax = plt.subplots(figsize=(10, 8))
sns.regplot(
    x='CustomValueEstimate',
    y='TotalClaims',
    data=df_claims_only,
    scatter_kws={'alpha': 0.3},
    line_kws={'color': 'red'},
    ax=ax,
)
ax.set_title('Total Claims vs. Custom Value Estimate for Claims Occurred', fontsize=16)
ax.set_xlabel('Custom Value Estimate', fontsize=12)
ax.set_ylabel('Total Claims', fontsize=12)
# Log axes, since a log scale might suit skewed data better.
# NOTE(review): the log scaling is applied to the axes only, after regplot has
# drawn -- confirm the fit is meant to use the untransformed values.
ax.set_xscale('log')
ax.set_yscale('log')
ax.grid(True)
fig.tight_layout()
plt.show()
# Plot 3: Premium Distribution by CoverGroup
# One violin per CoverGroup showing the spread of TotalPremium, with quartile lines inside.
fig, ax = plt.subplots(figsize=(14, 8))
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue` -- confirm against the seaborn version this project pins.
sns.violinplot(
    x='CoverGroup',
    y='TotalPremium',
    data=df,
    inner='quartile',
    palette='coolwarm',
    ax=ax,
)
ax.set_title('Distribution of Total Premium by Cover Group', fontsize=16)
ax.set_xlabel('Cover Group', fontsize=12)
ax.set_ylabel('Total Premium', fontsize=12)
# Rotated, right-aligned tick labels for the cover-group names.
plt.xticks(rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()