# ILEC 資料 EDA - 關鍵變數分佈分析

本 Notebook 分析 ILEC 死亡率資料的關鍵變數分佈。

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Global plotting configuration for the notebook.
#
# Apply the seaborn style FIRST: sns.set_style() writes its own values for
# 'font.family' and 'font.sans-serif' into rcParams, so calling it after the
# font setup would overwrite the CJK font list and the Chinese titles/labels
# used in the figures below would render as missing-glyph boxes.
sns.set_style('whitegrid')
sns.set_palette('husl')

# CJK-capable fonts; matplotlib falls through the list until it finds one
# that is installed.
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'Microsoft JhengHei', 'SimHei']
# Draw the minus sign as an ASCII hyphen, since the fonts above may not
# provide the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['figure.dpi'] = 100

In [None]:
# Load the cleaned dataset and keep only a reproducible 10% row sample in
# `df` to speed up the analysis.
print('Loading data...')
full_frame = pd.read_parquet('../data/ilec_cleaned.parquet')
print(f'Full data shape: {full_frame.shape}')

# Fixed seed so every run of the notebook analyses the same rows.
df = full_frame.sample(frac=0.1, random_state=42)
print(f'Sampled 10%: {df.shape}')

# Drop the reference to the full frame so its memory can be reclaimed.
del full_frame

In [None]:
# Data overview: column names, a blank separator line, then per-column dtypes.
print(
    '=== 資料欄位 ===',
    df.columns.tolist(),
    '',
    '=== 資料型態 ===',
    df.dtypes,
    sep='\n',
)

## 1. 數值變數分佈

In [None]:
# Numeric columns: one histogram panel per column, with mean and median marked.
numerical_cols = ['Attained_Age', 'Issue_Age', 'Duration']

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for ax, col in zip(axes, numerical_cols):
    series = df[col]

    # Summary statistics shown as vertical reference lines
    mean_val, median_val = series.mean(), series.median()

    # Histogram of the row-level values (each row of df counts once)
    ax.hist(series, bins=50, edgecolor='white', alpha=0.7, color='steelblue')

    reference_lines = (
        (mean_val, 'red', '--', f'Mean: {mean_val:.1f}'),
        (median_val, 'orange', '-', f'Median: {median_val:.1f}'),
    )
    for position, line_color, line_style, legend_text in reference_lines:
        ax.axvline(position, color=line_color, linestyle=line_style,
                   linewidth=2, label=legend_text)

    ax.set_title(f'{col} 分佈', fontsize=14)
    ax.set_xlabel(col)
    ax.set_ylabel('頻次')
    ax.legend()

plt.tight_layout()
plt.savefig('../data/plots/eda_numerical_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Summary statistics for the numeric columns, including tail percentiles
# (1st/5th and 95th/99th) alongside the quartiles.
quantile_levels = [0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99]
numerical_stats = df[numerical_cols].describe(percentiles=quantile_levels)

print('=== 數值變數統計摘要 ===')
display(numerical_stats.round(2))

## 2. 類別變數分佈

In [None]:
# Categorical columns: one bar chart per column (2x2 grid) showing the row
# count of each category, with the category's share of rows printed above
# its bar.
categorical_cols = ['Sex', 'Smoker_Status', 'Insurance_Plan', 'Preferred_Class']

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for ax, col in zip(axes, categorical_cols):
    # Counts per category (most frequent first); the percentages are derived
    # from the same Series so bars and labels are guaranteed to line up.
    value_counts = df[col].value_counts()
    percentages = value_counts / value_counts.sum() * 100

    # Bar chart
    positions = range(len(value_counts))
    bars = ax.bar(positions, value_counts.values,
                  color=sns.color_palette('husl', len(value_counts)))

    # Category names as tick labels
    ax.set_xticks(positions)
    ax.set_xticklabels(value_counts.index, rotation=45, ha='right')

    # Percentage label above each bar. The gap is given in points rather
    # than data units, so it looks the same whatever the magnitude of the
    # counts (a fixed data-unit offset only suits one particular data size).
    for bar, pct in zip(bars, percentages.values):
        ax.annotate(f'{pct:.1f}%',
                    xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()),
                    xytext=(0, 3), textcoords='offset points',
                    ha='center', va='bottom', fontsize=9)

    ax.set_title(f'{col} 分佈', fontsize=14)
    ax.set_ylabel('樣本數')

plt.tight_layout()
plt.savefig('../data/plots/eda_categorical_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

## 3. Face Amount Band 分佈

In [None]:
# Face Amount Band: row count per band, ordered by band label, with each
# band's share of rows printed above its bar.
fig, ax = plt.subplots(figsize=(12, 6))

# Percentages are derived from the already-sorted counts so both Series
# share one ordering and the column is only tallied once.
value_counts = df['Face_Amount_Band'].value_counts().sort_index()
percentages = value_counts / value_counts.sum() * 100

positions = range(len(value_counts))
bars = ax.bar(positions, value_counts.values, color='teal', edgecolor='white')
ax.set_xticks(positions)
# Break each label at ':' onto a new line to keep the tick labels narrow;
# str() guards against non-string index labels.
ax.set_xticklabels([str(label).replace(':', '\n') for label in value_counts.index],
                   rotation=0, ha='center', fontsize=8)

# Gap between bar and label is given in points, not data units, so the
# placement does not depend on how large the counts are.
for bar, pct in zip(bars, percentages.values):
    ax.annotate(f'{pct:.1f}%',
                xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()),
                xytext=(0, 3), textcoords='offset points',
                ha='center', va='bottom', fontsize=9)

ax.set_title('Face Amount Band 分佈', fontsize=14)
ax.set_ylabel('樣本數')
plt.tight_layout()
plt.savefig('../data/plots/eda_face_amount_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

## 4. 年度趨勢

In [None]:
# Per-year totals of exposure and deaths, plus deaths per 1,000 exposed.
yearly_stats = (
    df.groupby('Observation_Year')[['Policies_Exposed', 'Death_Count']]
    .sum()
    .reset_index()
)
yearly_stats['Death_Rate'] = yearly_stats['Death_Count'] / yearly_stats['Policies_Exposed'] * 1000

fig, axes = plt.subplots(1, 3, figsize=(15, 5))
exposure_ax, deaths_ax, rate_ax = axes
years = yearly_stats['Observation_Year']

# Panel 1: exposure, scaled to millions
exposure_ax.bar(years, yearly_stats['Policies_Exposed']/1e6, color='steelblue')
exposure_ax.set_title('年度曝險量', fontsize=14)
exposure_ax.set_ylabel('曝險量 (百萬)')
exposure_ax.set_xlabel('年度')

# Panel 2: death count, scaled to thousands
deaths_ax.bar(years, yearly_stats['Death_Count']/1e3, color='coral')
deaths_ax.set_title('年度死亡數', fontsize=14)
deaths_ax.set_ylabel('死亡數 (千)')
deaths_ax.set_xlabel('年度')

# Panel 3: death rate; the y-axis is anchored at zero
rate_ax.plot(years, yearly_stats['Death_Rate'], 'o-', color='darkred', linewidth=2, markersize=8)
rate_ax.set_title('年度死亡率 (per 1000)', fontsize=14)
rate_ax.set_ylabel('死亡率 (‰)')
rate_ax.set_xlabel('年度')
rate_ax.set_ylim(bottom=0)

plt.tight_layout()
plt.savefig('../data/plots/eda_yearly_trend.png', dpi=150, bbox_inches='tight')
plt.show()

print('=== 年度統計 ===')
display(yearly_stats)

## 5. 關鍵變數與死亡率的關係

In [None]:
# Attained age vs death rate: aggregate exposure and deaths per age, then
# plot deaths per 1,000 exposed on a logarithmic y-axis.
age_mortality = (
    df.groupby('Attained_Age')[['Policies_Exposed', 'Death_Count']]
    .sum()
    .reset_index()
)
age_mortality['Death_Rate'] = age_mortality['Death_Count'] / age_mortality['Policies_Exposed'] * 1000

ages = age_mortality['Attained_Age']
rates = age_mortality['Death_Rate']

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(ages, rates, color='darkred', linewidth=2)
# Shade the area under the curve
ax.fill_between(ages, 0, rates, alpha=0.3, color='coral')
ax.set_title('Attained Age vs 死亡率', fontsize=14)
ax.set_xlabel('Attained Age')
ax.set_ylabel('死亡率 (per 1000)')
ax.set_xlim(0, 100)
ax.set_yscale('log')
plt.tight_layout()
plt.savefig('../data/plots/eda_age_mortality.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Duration vs death rate (selection effect): aggregate exposure and deaths
# per duration and plot deaths per 1,000 exposed.
dur_mortality = (
    df.groupby('Duration')[['Policies_Exposed', 'Death_Count']]
    .sum()
    .reset_index()
)
dur_mortality['Death_Rate'] = dur_mortality['Death_Count'] / dur_mortality['Policies_Exposed'] * 1000

durations = dur_mortality['Duration']
rates = dur_mortality['Death_Rate']

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(durations, rates, 'o-', color='teal', linewidth=2, markersize=4)
# Vertical marker at duration 2, labelled as the end of the selection period
ax.axvline(2, color='red', linestyle='--', alpha=0.7, label='Selection Period (1-2 yrs)')
ax.set_title('Duration vs 死亡率 (選擇效果分析)', fontsize=14)
ax.set_xlabel('Duration (保單年度)')
ax.set_ylabel('死亡率 (per 1000)')
ax.set_xlim(0, 50)
ax.legend()
plt.tight_layout()
plt.savefig('../data/plots/eda_duration_mortality.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Smoker status vs death rate: aggregate exposure and deaths per status and
# plot deaths per 1,000 exposed as a bar chart with the value above each bar.
smoker_mortality = df.groupby('Smoker_Status').agg({
    'Policies_Exposed': 'sum',
    'Death_Count': 'sum'
}).reset_index()
smoker_mortality['Death_Rate'] = smoker_mortality['Death_Count'] / smoker_mortality['Policies_Exposed'] * 1000

fig, ax = plt.subplots(figsize=(8, 6))
# Colours are assigned by bar position, i.e. in the group order produced by
# the groupby above.
bars = ax.bar(smoker_mortality['Smoker_Status'], smoker_mortality['Death_Rate'],
              color=['green', 'orange', 'gray'])
ax.set_title('Smoker Status vs 死亡率', fontsize=14)
ax.set_xlabel('Smoker Status')
ax.set_ylabel('死亡率 (per 1000)')

# Value label above each bar. The gap is given in points rather than in
# rate units, so it stays proportionate whatever the magnitude of the rates.
for bar in bars:
    ax.annotate(f'{bar.get_height():.2f}',
                xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()),
                xytext=(0, 3), textcoords='offset points',
                ha='center', va='bottom', fontsize=12)

plt.tight_layout()
plt.savefig('../data/plots/eda_smoker_mortality.png', dpi=150, bbox_inches='tight')
plt.show()

print('=== 吸菸狀態死亡率 ===')
display(smoker_mortality)

## 6. 相關性矩陣

In [None]:
# Pairwise correlation between the numeric columns, drawn as an annotated
# heatmap with a diverging colour map centred on zero.
corr_cols = ['Attained_Age', 'Issue_Age', 'Duration', 'Policies_Exposed', 'Death_Count']
corr_matrix = df[corr_cols].corr()

heatmap_options = dict(
    annot=True,
    cmap='RdBu_r',
    center=0,
    fmt='.2f',
    square=True,
    linewidths=0.5,
)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, ax=ax, **heatmap_options)
ax.set_title('數值變數相關性矩陣', fontsize=14)
plt.tight_layout()
plt.savefig('../data/plots/eda_correlation_matrix.png', dpi=150, bbox_inches='tight')
plt.show()

## 7. 摘要

In [None]:
# Text summary of the sampled data: size, observation period, centre of the
# numeric columns, headline category shares and the overall death rate.
banner = '=' * 60

# Category shares are computed once each and reused below.
sex_share = df['Sex'].value_counts(normalize=True)
smoker_share = df['Smoker_Status'].value_counts(normalize=True)
plan_share = df['Insurance_Plan'].value_counts(normalize=True)

print(banner)
print('EDA 摘要')
print(banner)
print(f"\n總樣本數: {len(df):,} (10% 抽樣)")
print(f"資料期間: {df['Observation_Year'].min()} - {df['Observation_Year'].max()}")
print("\n數值變數:")
for col in ('Attained_Age', 'Issue_Age', 'Duration'):
    print(f"  - {col}: Mean={df[col].mean():.1f}, Median={df[col].median():.0f}")
print("\n類別變數分佈:")
# .get() reports 0.0% instead of raising KeyError when the category code
# does not occur in the data.
print(f"  - Sex: M={sex_share.get('M', 0.0)*100:.1f}%")
print(f"  - Smoker: NS={smoker_share.get('NS', 0.0)*100:.1f}%")
# value_counts() sorts by frequency, so position 0 is the most common plan.
print(f"  - Top Plan: {plan_share.index[0]} ({plan_share.iloc[0]*100:.1f}%)")
print(f"\n整體死亡率: {df['Death_Count'].sum() / df['Policies_Exposed'].sum() * 1000:.2f} per 1000")
print(banner)