# EDA Analysis

## Import Data 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path

# Set Seaborn theme for consistent styling
sns.set_theme(style="whitegrid", palette="husl")

# Create directories for outputs
Path("outputs/plots").mkdir(parents=True, exist_ok=True)

try:
    df = pd.read_csv('../data/raw/MachineLearningRating_v3.txt', delimiter='|')
except FileNotFoundError:
    print("Error: raw/MachineLearningRating_v3.txt not found")
    exit(1)

  df = pd.read_csv('../data/raw/MachineLearningRating_v3.txt', delimiter='|')


## Data Understanding and Descriptive Statistics

In [3]:
# 1. Data Understanding and Descriptive Statistics
print("Dataset Info:")
print(df.info())
print("\nDescriptive Statistics:")
print(df[['TotalPremium', 'TotalClaims', 'CustomValueEstimate']].describe())

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000098 entries, 0 to 1000097
Data columns (total 52 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   UnderwrittenCoverID       1000098 non-null  int64  
 1   PolicyID                  1000098 non-null  int64  
 2   TransactionMonth          1000098 non-null  object 
 3   IsVATRegistered           1000098 non-null  bool   
 4   Citizenship               1000098 non-null  object 
 5   LegalType                 1000098 non-null  object 
 6   Title                     1000098 non-null  object 
 7   Language                  1000098 non-null  object 
 8   Bank                      854137 non-null   object 
 9   AccountType               959866 non-null   object 
 10  MaritalStatus             991839 non-null   object 
 11  Gender                    990562 non-null   object 
 12  Country                   1000098 non-null  object 
 13  Province     

## Data quality assessment

In [4]:
# 2. Data Quality Assessment
print("\nMissing Values:")
print(df.isnull().sum())


Missing Values:
UnderwrittenCoverID               0
PolicyID                          0
TransactionMonth                  0
IsVATRegistered                   0
Citizenship                       0
LegalType                         0
Title                             0
Language                          0
Bank                         145961
AccountType                   40232
MaritalStatus                  8259
Gender                         9536
Country                           0
Province                          0
PostalCode                        0
MainCrestaZone                    0
SubCrestaZone                     0
ItemType                          0
mmcode                          552
VehicleType                     552
RegistrationYear                  0
make                            552
Model                           552
Cylinders                       552
cubiccapacity                   552
kilowatts                       552
bodytype                        552
NumberOfDoo

In [5]:
# Handle missing values (example: impute numerical with median, categorical with mode)
df['TotalPremium'] = df['TotalPremium'].fillna(df['TotalPremium'].median())
df['TotalClaims'] = df['TotalClaims'].fillna(df['TotalClaims'].median())
df['CustomValueEstimate'] = df['CustomValueEstimate'].fillna(df['CustomValueEstimate'].median())

In [6]:
# Categorical: Impute with mode
for col in ['Gender', 'Province', 'VehicleType']:
    if col in df.columns:
        df[col] = df[col].fillna(df[col].mode()[0])


# Save cleaned dataset
df.to_csv('../data/processed/insurance_data_cleaned.csv', index=False)
print("Cleaned dataset saved as 'data/insurance_data_cleaned.csv'")

Cleaned dataset saved as 'data/insurance_data_cleaned.csv'


## Univariate Analysis

In [9]:
# 3. Univariate Analysis
# Numerical: Histograms
plt.figure(figsize=(12, 4))
for i, col in enumerate(['TotalPremium', 'TotalClaims', 'CustomValueEstimate'], 1):
    if col in df.columns:
        plt.subplot(1, 3, i)
        sns.histplot(df[col].dropna(), kde=True)
        plt.title(f'Distribution of {col}')
    else:
        print(f"Warning: Column '{col}' not found in dataset.")
plt.tight_layout()
plt.savefig('outputs/plots/univariate_distributions.png')
plt.close()

# Categorical: Bar charts
if 'Province' in df.columns:
    plt.figure(figsize=(10, 6))
    sns.countplot(data=df, x='Province')
    plt.title('Distribution of Policies by Province')
    plt.xticks(rotation=45)
    plt.savefig('outputs/plots/province_distribution.png')
    plt.close()
else:
    print("Warning: 'Province' column not found in dataset.")

## Bivariate/ Multivariate Analysis

In [10]:
# 4. Bivariate/Multivariate Analysis
# Loss Ratio by Province
if 'TotalClaims' in df.columns and 'TotalPremium' in df.columns:
    df['LossRatio'] = df['TotalClaims'] / df['TotalPremium'].replace(0, np.nan)  # Avoid division by zero
    if 'Province' in df.columns:
        plt.figure(figsize=(10, 6))
        sns.boxplot(data=df, x='Province', y='LossRatio')
        plt.title('Loss Ratio by Province')
        plt.xticks(rotation=45)
        plt.savefig('outputs/plots/loss_ratio_province.png')
        plt.close()
    else:
        print("Warning: 'Province' column not found for loss ratio analysis.")
else:
    print("Warning: 'TotalClaims' or 'TotalPremium' column not found for loss ratio calculation.")

# Correlation matrix
numerical_cols = ['TotalPremium', 'TotalClaims', 'CustomValueEstimate']
available_cols = [col for col in numerical_cols if col in df.columns]
if available_cols:
    plt.figure(figsize=(8, 6))
    sns.heatmap(df[available_cols].corr(), annot=True, cmap='coolwarm')
    plt.title('Correlation Matrix')
    plt.savefig('outputs/plots/correlation_matrix.png')
    plt.close()
else:
    print("Warning: No numerical columns available for correlation matrix.")

## Outlier Detection

In [14]:
# 5. Outlier Detection
plt.figure(figsize=(12, 4))
for i, col in enumerate(['TotalClaims', 'CustomValueEstimate', 'TotalPremium'], 1):
    if col in df.columns:
        plt.subplot(1, 3, i)
        sns.boxplot(y=df[col])
        plt.title(f'Boxplot of {col}')
    else:
        print(f"Warning: Column '{col}' not found for outlier detection.")
plt.tight_layout()
plt.savefig('outputs/plots/outlier_boxplots.png')
plt.close()


## Temporal Trends

In [12]:
# 6. Temporal Trends
if 'TransactionMonth' in df.columns:
    try:
        df['TransactionMonth'] = pd.to_datetime(df['TransactionMonth'], errors='coerce')
        df['Month'] = df['TransactionMonth'].dt.to_period('M')
        claim_trends = df.groupby('Month')['TotalClaims'].mean()
        plt.figure(figsize=(10, 6))
        claim_trends.plot()
        plt.title('Average Claim Amount Over Time')
        plt.xlabel('Month')
        plt.ylabel('Average TotalClaims')
        plt.savefig('outputs/plots/claim_trends.png')
        plt.close()
    except Exception as e:
        print(f"Error in temporal analysis: {e}")
else:
    print("Warning: 'TransactionMonth' column not found for temporal analysis.")

print("EDA completed. Visualizations saved in 'outputs/plots' directory.")

EDA completed. Visualizations saved in 'outputs/plots' directory.
