# Hospital Anomalies: Data Ingestion and QC

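This notebook demonstrates data ingestion and quality control for the hospital anomalies case study using the `publicdata_ca.acquisition` package. It loads the case-study configuration, ingests the configured CIHI datasets, runs the QC checks, and takes a first look at the data. Run it from the notebook's own directory: both the import path and the config path are resolved relative to the current working directory.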

In [None]:
import sys
from pathlib import Path

# Prepend the directory three levels above the current working directory to
# sys.path so the `publicdata_ca` and `case_studies` imports below can resolve.
# This must run before those imports.
# NOTE(review): presumably that directory is the repository root, which holds
# only if the kernel's working directory is this notebook's folder — confirm
# when launching Jupyter from elsewhere.
sys.path.insert(0, str(Path.cwd().parent.parent.parent))

import pandas as pd
from publicdata_ca.utils.config import load_config
from case_studies.hospital_anomalies.src.ingest import ingest_cihi_data
from case_studies.hospital_anomalies.src.qc import run_qc_checks

## 1. Load Configuration

In [None]:
# The default settings file is looked up at ../config/default.yaml, relative
# to the current working directory.
config_path = Path.cwd().parent.joinpath('config', 'default.yaml')
config = load_config(config_path)

# Later cells work with the plain-dict view of the configuration.
config_dict = config.to_dict()

# Echo the dataset list and date range read from the config.
print(f"Datasets to ingest: {config_dict['datasets']}")
print(f"Date range: {config_dict['date_range']}")

## 2. Ingest CIHI Data

In [None]:
# Ingest every dataset named in the configuration; the result is keyed by
# dataset id.
dataset_ids = config_dict['datasets']
datasets = ingest_cihi_data(dataset_ids)

# Preview each ingested frame: its shape, its column names, and the first rows.
for dataset_id in datasets:
    df = datasets[dataset_id]
    print(f"\n{dataset_id}:")
    print(f"  Shape: {df.shape}")
    print(f"  Columns: {list(df.columns)}")
    display(df.head())

## 3. Quality Control Checks

In [None]:
# Run the QC checks over all ingested datasets; results are keyed by dataset id.
qc_results = run_qc_checks(datasets, config_dict)

# Report the headline QC figures for each dataset.
for dataset_id in qc_results:
    results = qc_results[dataset_id]
    print(f"\n=== {dataset_id} ===")
    print(f"Row count: {results['row_count']}")
    print(f"Column count: {results['column_count']}")
    print(f"Missing data: {results['missing_data']}")

## 4. Explore Data

In [None]:
# Pick first dataset for exploration. If ingestion returned nothing, fail with
# an actionable message instead of a bare IndexError.
if not datasets:
    raise ValueError(
        "No datasets were ingested; check the 'datasets' entry in the config "
        "and re-run the ingestion cell."
    )
# next(iter(...)) takes the first value without copying every frame into a list.
df = next(iter(datasets.values()))

# Summary statistics
display(df.describe())

# Check date range. The schema of the ingested frames is not visible in this
# notebook, so guard the column lookup rather than raising KeyError when the
# first dataset has no 'date' column.
if 'date' in df.columns:
    print(f"\nDate range: {df['date'].min()} to {df['date'].max()}")
else:
    print("\nDate range: unavailable (no 'date' column in this dataset)")