# Generate EDA Summary Store

This notebook generates:
1. `numerical_summary.json` - Percentiles for numerical features (程式查表用)
2. `categorical_summary.json` - Distribution of categorical features (程式查表用)

**Note:** `insights.md` is written by hand and is not generated by this notebook（由人工填寫，不在此 notebook 生成）。

**Output:** `knowledge_base/eda/`

In [None]:
import pandas as pd
import numpy as np
import json
import os

# Input dataset and output folder; both are relative paths, so they resolve
# against the kernel's current working directory.
DATA_PATH = '../../data/ilec_cleaned.parquet'
OUTPUT_DIR = '../../knowledge_base/eda'

# Create the output folder up front so the later JSON writes cannot fail on a
# missing directory; exist_ok=True makes re-running this cell harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f'Output directory: {OUTPUT_DIR}')

## 1. Load Data

In [None]:
# Read the parquet file at DATA_PATH into the frame used by every later cell.
df = pd.read_parquet(DATA_PATH)

# Quick sanity print: (rows, columns) followed by the column names.
column_names = df.columns.tolist()
print(f'Data shape: {df.shape}')
print(f'Columns: {column_names}')

In [None]:
# Feature definitions (must match model contract)
NUMERICAL_FEATURES = ['Attained_Age', 'Issue_Age', 'Duration']
CATEGORICAL_FEATURES = [
    'Sex', 'Smoker_Status', 'Insurance_Plan', 'Face_Amount_Band',
    'Preferred_Class', 'SOA_Post_Lvl_Ind', 'SOA_Antp_Lvl_TP', 'SOA_Guar_Lvl_TP'
]

# Fail fast if the loaded data does not contain every declared feature.
# Without this check a missing categorical column would only surface after
# numerical_summary.json has already been overwritten, leaving the output
# folder with one fresh file and one stale file.
missing_features = [
    name for name in NUMERICAL_FEATURES + CATEGORICAL_FEATURES
    if name not in df.columns
]
if missing_features:
    raise KeyError(f'Features missing from {DATA_PATH}: {missing_features}')

print(f'Numerical features: {NUMERICAL_FEATURES}')
print(f'Categorical features: {CATEGORICAL_FEATURES}')

## 2. Numerical Summary (程式查表用)

In [None]:
# Percentile levels (in percent) recorded for every numerical feature.
PERCENTILES = [1, 5, 10, 25, 50, 75, 90, 95, 99]

# feature name -> basic statistics, percentile table and text interpretation
numerical_summary = {}

for feature in NUMERICAL_FEATURES:
    # Missing values are dropped before any statistic is computed.
    observed = df[feature].dropna()

    # Basic statistics first; min/max are stored as ints (fraction truncated).
    entry = dict(
        count=int(len(observed)),
        mean=round(float(observed.mean()), 2),
        std=round(float(observed.std()), 2),
        min=int(observed.min()),
        max=int(observed.max()),
    )

    # Percentile table keyed by the level as a string (e.g. '25'), one decimal.
    pct_table = {
        str(level): round(float(np.percentile(observed, level)), 1)
        for level in PERCENTILES
    }
    low_cut, high_cut, extreme_cut = (pct_table[k] for k in ('25', '75', '95'))

    entry['percentiles'] = pct_table
    # Human-readable bands derived from the 25th / 75th / 95th percentiles.
    entry['interpretation'] = {
        'low': f"Below {low_cut} is considered low",
        'typical': f"Between {low_cut} and {high_cut} is typical",
        'high': f"Above {high_cut} is considered high",
        'extreme': f"Above {extreme_cut} is in top 5%"
    }

    numerical_summary[feature] = entry
    print(f'{feature}: mean={entry["mean"]}, range=[{entry["min"]}, {entry["max"]}]')

# Persist the lookup table as JSON in the output folder.
numerical_json_path = f'{OUTPUT_DIR}/numerical_summary.json'
with open(numerical_json_path, 'w') as fh:
    json.dump(numerical_summary, fh, indent=2)
print('\n✓ numerical_summary.json saved')

## 3. Categorical Summary (程式查表用)

In [None]:
# feature name -> totals, most common level and full level distribution
categorical_summary = {}

for feature in CATEGORICAL_FEATURES:
    column = df[feature]
    # dropna=False keeps missing values as their own level in the counts.
    level_counts = column.value_counts(dropna=False)
    total = len(column)

    # Levels are keyed by their string form; missing values go under 'Missing'.
    distribution = {
        (str(level) if pd.notna(level) else 'Missing'): {
            'count': int(n_rows),
            'percentage': round(n_rows / total * 100, 2)
        }
        for level, n_rows in level_counts.items()
    }

    # Stable descending sort by count; the first entry is the most common level.
    ranked = sorted(distribution.items(), key=lambda kv: kv[1]['count'], reverse=True)
    top_level, top_stats = ranked[0]

    categorical_summary[feature] = {
        'total': int(total),
        # nunique() ignores missing values, so 'Missing' is not counted here.
        'n_unique': int(column.nunique()),
        'most_common': {
            'value': top_level,
            'percentage': top_stats['percentage']
        },
        'distribution': distribution
    }

    n_levels = categorical_summary[feature]["n_unique"]
    print(f'{feature}: {n_levels} unique, '
          f'most common = {top_level} ({top_stats["percentage"]:.1f}%)')

# Persist the lookup table as JSON in the output folder.
categorical_json_path = f'{OUTPUT_DIR}/categorical_summary.json'
with open(categorical_json_path, 'w') as fh:
    json.dump(categorical_summary, fh, indent=2)
print('\n✓ categorical_summary.json saved')

## 4. Verify Output

In [None]:
# List every .json / .md file currently in the output folder with its size.
print('Generated files:')
for filename in os.listdir(OUTPUT_DIR):
    if not filename.endswith(('.json', '.md')):
        continue
    size_bytes = os.path.getsize(os.path.join(OUTPUT_DIR, filename))
    print(f'  ✓ {filename} ({size_bytes} bytes)')

# insights.md is not produced by this notebook, hence the reminder.
print('\n⚠️ Remember to manually create insights.md')

## 5. Helper Functions for Report Generator

In [None]:
def get_percentile_description(feature: str, value: float, summary: dict) -> str:
    """Convert a value to a percentile description for the LLM.

    Args:
        feature: Key into ``summary``; also used as the label in the text.
        value: Observed value to place within the feature's distribution.
        summary: Mapping where ``summary[feature]['percentiles']`` maps
            percentile levels given as strings (e.g. ``'25'``) to threshold
            values, as produced for ``numerical_summary``.

    Returns:
        A one-sentence description of the band ``value`` falls into.

    Raises:
        KeyError: If ``feature`` is not in ``summary`` or has no
            ``'percentiles'`` entry.
    """
    pcts = summary[feature]['percentiles']

    # NaN compares False against every threshold, so without this guard it
    # would fall through the loop and be reported as an extreme high value.
    if value != value:
        return f"{feature}={value} is missing (no percentile available)"

    # Highest percentile level already known to lie below `value`.
    prev_level = 0
    for p in sorted(int(k) for k in pcts):
        if value <= pcts[str(p)]:
            if p <= 10:
                return f"{feature}={value} is in the bottom {p}% (low)"
            elif p <= 25:
                return f"{feature}={value} is below average ({p}th percentile)"
            elif p <= 75:
                return f"{feature}={value} is typical ({p}th percentile)"
            elif p <= 90:
                return f"{feature}={value} is above average ({p}th percentile)"
            else:
                # The value is above the previous level but not above level p,
                # so it sits in the top (100 - prev_level)%, not (100 - p)%.
                return f"{feature}={value} is in top {100 - prev_level}% (high)"
        prev_level = p

    # Above every recorded threshold (top 1% when the highest level is 99).
    return f"{feature}={value} is in the top {100 - prev_level}% (extreme)"


def get_category_description(feature: str, value: str, summary: dict) -> str:
    """Get a category description with its population percentage.

    Args:
        feature: Key into ``summary``; also used as the label in the text.
        value: Category level to look up. Non-string levels are converted
            with ``str`` and missing values (None/NaN) map to ``'Missing'``.
        summary: Mapping where ``summary[feature]['distribution']`` maps
            level keys to dicts holding a ``'percentage'`` entry, as produced
            for ``categorical_summary``.

    Returns:
        A description with the level's share of the population, or an
        "unknown category" note when the level is not in the distribution.

    Raises:
        KeyError: If ``feature`` is not in ``summary`` or has no
            ``'distribution'`` entry.
    """
    dist = summary[feature]['distribution']

    # The distribution is keyed by str(level), with missing values stored
    # under 'Missing'; normalize the lookup the same way so numeric codes and
    # NaN/None are found instead of being reported as unknown.
    key = str(value) if pd.notna(value) else 'Missing'

    if key in dist:
        pct = dist[key]['percentage']
        return f"{feature}={key} ({pct:.1f}% of population)"
    return f"{feature}={key} (unknown category)"


# Smoke test: print sample descriptions using the summaries built earlier.
print('=== Numerical Examples ===')
numeric_cases = [
    ('Attained_Age', 45),
    ('Attained_Age', 75),
    ('Duration', 3),
    ('Duration', 20),
]
for case_feature, case_value in numeric_cases:
    print(get_percentile_description(case_feature, case_value, numerical_summary))

print('\n=== Categorical Examples ===')
category_cases = [
    ('Smoker_Status', 'S'),
    ('Smoker_Status', 'NS'),
    ('Sex', 'M'),
]
for case_feature, case_value in category_cases:
    print(get_category_description(case_feature, case_value, categorical_summary))