# Exploratory Data Analysis (EDA)
This notebook covers all required EDA steps for the finance and insurance data project.

In [None]:
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Make the parent of the notebook's working directory importable so the
# project-local 'src' package can be found.
project_root = os.path.abspath(os.pardir)
if project_root not in sys.path:
    # Notebook cells are often re-run; avoid stacking duplicate entries.
    sys.path.append(project_root)

from src import eda, utils

# Load the raw dataset, then pass it through eda.calculate_loss_ratio.
# NOTE(review): presumably this adds a loss-ratio column used by the
# loss-ratio plots further down — confirm in src/eda.py.
data = utils.load_data('../data/raw/MachineLearningRating_v3.txt')
data = eda.calculate_loss_ratio(data)

In [None]:
import sys
import os
# Ensure the parent of the current working directory is importable (needed
# for the local 'src' package) without adding a duplicate sys.path entry
# when this cell is executed more than once.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

## Data Structure and Quality Assessment

In [None]:
# Print each data-quality report under its heading, in order.
quality_reports = (
    ('Data types:', eda.review_data_types),
    ('Missing values:', eda.check_missing_values),
)
for heading, build_report in quality_reports:
    print(heading)
    print(build_report(data))

## Descriptive Statistics

In [None]:
# Compute the descriptive statistics via the eda helper, then display them.
summary_stats = eda.descriptive_statistics(data)
print(summary_stats)

## Univariate Analysis: Distributions

In [None]:
# Histograms for the monetary columns.
numeric_columns = ['TotalPremium', 'TotalClaims', 'CustomValueEstimate']
eda.plot_histograms(data, numeric_columns)

# One bar chart per categorical column.
for categorical_column in ('Province', 'VehicleType', 'Gender'):
    eda.plot_bar_chart(data, categorical_column)

## Bivariate/Multivariate Analysis: Correlations and Trends

In [None]:
# Colour the scatter plot by 'ZipCode' only when that column is present;
# otherwise draw it without a hue.
scatter_options = {'hue': 'ZipCode'} if 'ZipCode' in data.columns else {}
eda.plot_scatter(data, 'TotalPremium', 'TotalClaims', **scatter_options)

eda.plot_correlation_matrix(data)

## Outlier Detection

In [None]:
# Box plots for the columns inspected for outliers.
for outlier_column in ('TotalClaims', 'CustomValueEstimate'):
    eda.plot_boxplot(data, outlier_column)

## Creative Plots: Key Insights

In [None]:
# Loss-ratio plot for each grouping column.
for group_column in ('Province', 'VehicleType', 'Gender'):
    eda.plot_loss_ratio_by_group(data, group_column)

# Task 3: Hypothesis Testing on Risk Drivers
This section statistically validates or rejects key hypotheses about risk drivers using claim frequency, claim severity, and margin.

## Metrics Calculation
We define:
- **Claim Frequency**: Proportion of policies with at least one claim.
- **Claim Severity**: Average claim amount, given a claim occurred.
- **Margin**: TotalPremium - TotalClaims.

## Hypothesis 1: No risk differences across provinces
H₀: There are no risk differences across provinces.

In [None]:
# A/B groups: the two provinces with the most rows (value_counts sorts by
# count, descending).
province_counts = data['Province'].value_counts()
top_two_provinces = province_counts.index[:2]
province_a, province_b = top_two_provinces

# Claim frequency: chi-squared test through the eda helper.
chi2, p_freq = eda.chi2_test_groups(
    data, 'Province', 'TotalClaims', province_a, province_b
)
print(f"Chi-squared test for claim frequency between {province_a} and {province_b}: p-value = {p_freq:.4f}")

# Claim severity: t-test restricted to rows with a positive claim amount.
rows_with_claims = data[data['TotalClaims'] > 0]
t_stat, p_sev = eda.t_test_groups(
    rows_with_claims, 'Province', 'TotalClaims', province_a, province_b
)
print(f"T-test for claim severity between {province_a} and {province_b}: p-value = {p_sev:.4f}")

## Hypothesis 2: No risk differences between postal codes
H₀: There are no risk differences between postal codes.

# A/B groups: the two postal codes with the most rows (value_counts sorts
# by count, descending).
postal_counts = data['PostalCode'].value_counts()
top_two_postal_codes = postal_counts.index[:2]
postal_a, postal_b = top_two_postal_codes

# Claim frequency: chi-squared test through the eda helper.
chi2, p_freq_postal = eda.chi2_test_groups(
    data, 'PostalCode', 'TotalClaims', postal_a, postal_b
)
print(f"Chi-squared test for claim frequency between {postal_a} and {postal_b}: p-value = {p_freq_postal:.4f}")

# Claim severity: t-test restricted to rows with a positive claim amount.
rows_with_claims = data[data['TotalClaims'] > 0]
t_stat, p_sev_postal = eda.t_test_groups(
    rows_with_claims, 'PostalCode', 'TotalClaims', postal_a, postal_b
)
print(f"T-test for claim severity between {postal_a} and {postal_b}: p-value = {p_sev_postal:.4f}")

In [None]:
# scipy.stats provides ttest_ind for the margin test below. It is imported
# here because no other visible cell brings 'stats' into scope, so the call
# would otherwise fail with a NameError.
from scipy import stats

# Choose two zip codes for A/B test (the two largest by count; the zip code
# is stored in the 'PostalCode' column).
zip_counts = data['PostalCode'].value_counts()
zip_a, zip_b = zip_counts.index[:2]

# Claim Frequency (Chi-squared test)
chi2, p_freq_zip = eda.chi2_test_groups(data, 'PostalCode', 'TotalClaims', zip_a, zip_b)
print(f"Chi-squared test for claim frequency between {zip_a} and {zip_b}: p-value = {p_freq_zip:.4f}")

# Claim Severity (t-test), restricted to rows with a positive claim amount
t_stat, p_sev_zip = eda.t_test_groups(data[data['TotalClaims'] > 0], 'PostalCode', 'TotalClaims', zip_a, zip_b)
print(f"T-test for claim severity between {zip_a} and {zip_b}: p-value = {p_sev_zip:.4f}")

# Margin (t-test): compare per-row margin (TotalPremium - TotalClaims)
# between the two zip codes; rows whose margin is NaN are ignored.
margin_a = data[data['PostalCode'] == zip_a]
margin_b = data[data['PostalCode'] == zip_b]
t_stat, p_margin = stats.ttest_ind(
    margin_a['TotalPremium'] - margin_a['TotalClaims'],
    margin_b['TotalPremium'] - margin_b['TotalClaims'],
    nan_policy='omit'
)
print(f"T-test for margin between {zip_a} and {zip_b}: p-value = {p_margin:.4f}")

## Hypothesis 3: No significant margin (profit) difference between zip codes
H₀: There are no significant margin (profit) differences between zip codes. The margin t-test for this hypothesis is computed at the end of the preceding code cell.

## Hypothesis 4: No significant risk difference between Women and Men
H₀: There are no significant risk differences between Women and Men.

In [None]:
# Claim frequency: chi-squared test comparing the Female and Male groups.
chi2, p_freq_gender = eda.chi2_test_groups(
    data, 'Gender', 'TotalClaims', 'Female', 'Male'
)
print(f"Chi-squared test for claim frequency between Female and Male: p-value = {p_freq_gender:.4f}")

# Claim severity: t-test restricted to rows with a positive claim amount.
rows_with_claims = data[data['TotalClaims'] > 0]
t_stat, p_sev_gender = eda.t_test_groups(
    rows_with_claims, 'Gender', 'TotalClaims', 'Female', 'Male'
)
print(f"T-test for claim severity between Female and Male: p-value = {p_sev_gender:.4f}")

## Interpretation & Business Recommendation
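For each hypothesis, if the p-value is below 0.05, reject the null hypothesis and provide a business interpretation; otherwise, we fail to reject the null hypothesis and no segment-specific adjustment is supported by the data.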

Example: "We reject the null hypothesis for provinces (p < 0.01). Specifically, Province A exhibits a higher loss ratio than Province B, suggesting a regional risk adjustment to our premiums may be warranted."