# Explore Fredholm Integral Equation Data

This notebook explores the FIE-500k dataset for Fredholm integral equations of the second kind.

**Equation form:** $u(x) - \lambda \int_a^b K(x, t) u(t) dt = f(x)$

## Contents
1. Load and inspect the dataset
2. Analyze equation statistics
3. Visualize kernel type and lambda value distributions
4. Display sample equations with solutions

In [None]:
# Import required libraries
import json
import sys
from pathlib import Path
from collections import Counter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Add project root to path.
# The parent of the current working directory is treated as the project root.
project_root = Path.cwd().parent
project_root_entry = str(project_root)
# Remove any copy left by an earlier run of this cell so that re-running it
# does not accumulate duplicate sys.path entries, then give the root priority.
while project_root_entry in sys.path:
    sys.path.remove(project_root_entry)
sys.path.insert(0, project_root_entry)

# Set plotting style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

print(f"Project root: {project_root}")

## 1. Load Dataset

Load the Fredholm integral equation dataset from `data/raw/fie_500k.json`. If that file is not available, a small set of sample equations is created for demonstration and saved to the same path, so later runs load the sample.

In [None]:
# Define data paths
data_dir = project_root / "data"
raw_dir = data_dir / "raw"
processed_dir = data_dir / "processed"

# Create directories if they don't exist
raw_dir.mkdir(parents=True, exist_ok=True)
processed_dir.mkdir(parents=True, exist_ok=True)

# Check for existing dataset
dataset_path = raw_dir / "fie_500k.json"

if dataset_path.exists():
    # JSON is UTF-8; pass the encoding explicitly instead of relying on the
    # platform default, which may not be UTF-8.
    with open(dataset_path, encoding="utf-8") as f:
        data = json.load(f)
    print(f"Loaded {len(data)} equations from {dataset_path}")
else:
    # Create sample data for demonstration
    print("Dataset not found. Creating sample data for demonstration...")

    # Each record: kernel K(x, t), right-hand side f(x), lambda, integration
    # domain [a, b], a kernel-type label, and a closed-form solution when one
    # is recorded (None otherwise).
    sample_data = [
        {
            "id": "poly_1",
            "kernel": "x*t",
            "f": "x",
            "lambda_val": 1.0,
            "domain": [0, 1],
            "kernel_type": "polynomial",
            "solution": "3*x/2"
        },
        {
            "id": "poly_2",
            "kernel": "x**2 + t**2",
            "f": "x**2",
            "lambda_val": 0.5,
            "domain": [0, 1],
            "kernel_type": "polynomial",
            "solution": None
        },
        {
            "id": "exp_1",
            "kernel": "exp(x)*exp(t)",
            "f": "1",
            "lambda_val": 1.0,
            "domain": [0, 1],
            "kernel_type": "exponential",
            # NOTE(review): the bare `e` presumably denotes Euler's number;
            # confirm that whatever parses these strings treats it that way.
            "solution": "1 + 2*(e-1)/(3-e**2)*exp(x)"
        },
        {
            "id": "exp_2",
            "kernel": "exp(x+t)",
            "f": "exp(x)",
            "lambda_val": 0.25,
            "domain": [0, 1],
            "kernel_type": "exponential",
            "solution": None
        },
        {
            "id": "trig_1",
            "kernel": "sin(x)*cos(t)",
            "f": "sin(x)",
            "lambda_val": 1.0,
            "domain": [0, 3.14159],
            "kernel_type": "trigonometric",
            "solution": None
        },
        {
            "id": "trig_2",
            "kernel": "cos(x-t)",
            "f": "1",
            "lambda_val": 0.5,
            "domain": [-3.14159, 3.14159],
            "kernel_type": "trigonometric",
            "solution": None
        },
        {
            "id": "sep_1",
            "kernel": "x*sin(t)",
            "f": "x",
            "lambda_val": 1.0,
            "domain": [0, 1],
            "kernel_type": "separable",
            "solution": None
        },
    ]

    data = sample_data

    # Save sample data.
    # NOTE: this writes the sample to dataset_path itself, so on the next run
    # the exists() check above succeeds and the sample is loaded as the dataset.
    with open(dataset_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
    print(f"Created {len(data)} sample equations")

# Convert to DataFrame for analysis
df = pd.DataFrame(data)
print(f"\nDataset shape: {df.shape}")
df.head()

## 2. Dataset Statistics

Analyze the distribution of kernel types, lambda values, and solution availability.

In [None]:
# Summary statistics for the loaded equations
banner = "=" * 50
print(banner)
print("DATASET STATISTICS")
print(banner)

n_equations = len(df)
print(f"\nTotal equations: {n_equations}")
print(f"Columns: {list(df.columns)}")

# Count and percentage share of each kernel type
if 'kernel_type' in df.columns:
    print("\nKernel types:")
    kernel_counts = df['kernel_type'].value_counts()
    for ktype, count in kernel_counts.items():
        share = count / n_equations * 100
        print(f"  {ktype}: {count} ({share:.1f}%)")

# Min / max / mean / standard deviation of lambda
if 'lambda_val' in df.columns:
    print("\nLambda value statistics:")
    lambda_values = df['lambda_val']
    lambda_stats = (
        ("Min", lambda_values.min),
        ("Max", lambda_values.max),
        ("Mean", lambda_values.mean),
        ("Std", lambda_values.std),
    )
    for label, compute in lambda_stats:
        print(f"  {label}: {compute():.4f}")

# Number of equations whose solution field is non-null
if 'solution' in df.columns:
    has_solution = df['solution'].notna().sum()
    coverage = has_solution / n_equations * 100
    print(f"\nSolutions available: {has_solution}/{n_equations} ({coverage:.1f}%)")

## 3. Visualizations

Visualize the distribution of kernel types and lambda values.

In [None]:
# Two side-by-side panels: kernel-type counts (left), lambda histogram (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
kernel_ax, lambda_ax = axes

# Left panel: bar chart of equations per kernel type
if 'kernel_type' in df.columns:
    kernel_counts = df['kernel_type'].value_counts()
    palette = sns.color_palette("husl", len(kernel_counts))
    bars = kernel_ax.bar(kernel_counts.index, kernel_counts.values, color=palette)
    kernel_ax.set_title("Kernel Type Distribution", fontsize=14, fontweight='bold')
    kernel_ax.set_xlabel("Kernel Type")
    kernel_ax.set_ylabel("Count")
    kernel_ax.tick_params(axis='x', rotation=45)

    # Write each count just above its bar, centred horizontally
    for bar, count in zip(bars, kernel_counts.values):
        label_x = bar.get_x() + bar.get_width()/2
        label_y = bar.get_height() + 0.1
        kernel_ax.text(label_x, label_y, str(count),
                       ha='center', va='bottom', fontsize=10)

# Right panel: histogram of lambda with a dashed line at the mean
if 'lambda_val' in df.columns:
    lambda_values = df['lambda_val']
    lambda_mean = lambda_values.mean()
    lambda_ax.hist(lambda_values, bins=20, edgecolor='black', alpha=0.7, color='steelblue')
    lambda_ax.set_title("Lambda Value Distribution", fontsize=14, fontweight='bold')
    lambda_ax.set_xlabel("Lambda (λ)")
    lambda_ax.set_ylabel("Frequency")
    lambda_ax.axvline(lambda_mean, color='red', linestyle='--', label=f'Mean: {lambda_mean:.2f}')
    lambda_ax.legend()

plt.tight_layout()
plt.show()

## 4. Sample Equations

Display one sample Fredholm integral equation per kernel type (or the first five equations if the dataset has no kernel type column).

In [None]:
# Display sample equations
def display_equation(eq):
    """Print a single Fredholm equation record in readable form.

    Args:
        eq: Mapping describing one equation. The keys read are ``id``,
            ``kernel_type``, ``kernel``, ``f``, ``lambda_val``, ``domain``
            and ``solution``. A placeholder is printed for any key that is
            absent or holds a missing value (``None``, ``NaN`` or ``pd.NA``).
            ``domain`` may be a two-element list/tuple ``[a, b]`` or a dict
            with ``a``/``b`` keys; any other value is shown as ``[0, 1]``.

    Returns:
        None. The formatted equation is written to standard output.
    """
    def _field(key, default):
        # A record taken from a DataFrame row holds NaN/NA (rather than
        # lacking the key) where the source had no value. NaN is truthy and
        # would otherwise be printed as "nan", so map it to the default.
        value = eq.get(key, default)
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return default
        return value

    print("=" * 60)
    print(f"ID: {_field('id', 'N/A')}")
    print(f"Type: {_field('kernel_type', 'Unknown')}")
    print()

    kernel = _field('kernel', 'K(x,t)')
    f = _field('f', 'f(x)')
    lam = _field('lambda_val', 'λ')
    domain = _field('domain', [0, 1])

    if isinstance(domain, (list, tuple)):
        # Tuples are accepted alongside lists so a (a, b) pair is not
        # silently replaced by the default bounds.
        a, b = domain
    elif isinstance(domain, dict):
        a, b = domain.get('a', 0), domain.get('b', 1)
    else:
        a, b = 0, 1

    print(f"Equation: u(x) - {lam} * ∫[{a}, {b}] {kernel} u(t) dt = {f}")

    # _field has already mapped None/NaN/NA to None, so the truthiness test
    # below only has to rule out empty values.
    solution = _field('solution', None)
    if solution:
        print(f"Solution: u(x) = {solution}")
    else:
        print("Solution: Not available")
    print()

# Display sample equations from each kernel type
print("SAMPLE EQUATIONS FROM DATASET")
print("=" * 60)

if 'kernel_type' in df.columns:
    # Keep the first row of every kernel type, in order of first appearance.
    # Rows with a missing kernel type are dropped first: a NaN label never
    # compares equal to itself, so selecting "the rows of type NaN" yields an
    # empty frame and indexing its first row would raise IndexError.
    first_per_type = (
        df.dropna(subset=['kernel_type'])
          .drop_duplicates(subset='kernel_type')
    )
    for _, row in first_per_type.iterrows():
        display_equation(row.to_dict())
else:
    # No kernel type information: fall back to the first five raw records.
    for eq in data[:5]:
        display_equation(eq)