# Hospital Anomalies: Data Ingestion and QC

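This notebook demonstrates data ingestion and quality control (QC) using the `publicdata_ca.acquisition` package. It walks through four steps: loading the case-study configuration, ingesting the configured CIHI datasets, running QC checks on each dataset, and exploring the first dataset.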

In [1]:
import sys
from pathlib import Path

# Make the directory three levels above the current working directory
# importable so the `case_studies.*` imports below resolve. This presumably
# requires launching the notebook from its own folder inside
# case_studies/hospital_anomalies/ -- TODO confirm the expected layout.
REPO_ROOT = Path.cwd().parent.parent.parent
# Guard the insert so re-running this cell does not stack duplicate entries
# onto sys.path.
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

import pandas as pd
from case_studies.hospital_anomalies.src.utils import load_config
from case_studies.hospital_anomalies.src.ingest import ingest_cihi_data
from case_studies.hospital_anomalies.src.qc import run_qc_checks

## 1. Load Configuration

In [2]:
# The default config is read from the `config` folder that sits alongside
# the current working directory (i.e. under the working directory's parent).
config_path = Path.cwd().parent.joinpath('config', 'default.yaml')
config = load_config(config_path)
# Convert to the dictionary form that the ingestion and QC cells consume.
config_dict = config.to_dict()

# Echo the settings that drive ingestion.
print(
    f"Datasets to ingest: {config_dict['datasets']}",
    f"Date range: {config_dict['date_range']}",
    sep="\n",
)

Datasets to ingest: ['cihi_hospital_admissions', 'cihi_bed_occupancy', 'cihi_icu_utilization']
Date range: {'start': '2019-01-01', 'end': '2023-12-31'}


## 2. Ingest CIHI Data

In [3]:
# Ingest every dataset ID listed in the configuration.
dataset_ids = config_dict['datasets']
datasets = ingest_cihi_data(dataset_ids)

# Preview each ingested dataset: its shape, column names, and first rows.
for dataset_id in datasets:
    df = datasets[dataset_id]
    print(f"\n{dataset_id}:")
    print(f"  Shape: {df.shape}")
    print(f"  Columns: {list(df.columns)}")
    display(df.head())


cihi_hospital_admissions:
  Shape: (1826, 4)
  Columns: ['date', 'region', 'hospital_id', 'admissions']


Unnamed: 0,date,region,hospital_id,admissions
0,2019-01-01,BC,H006,55
1,2019-01-02,Alberta,H009,46
2,2019-01-03,Ontario,H002,57
3,2019-01-04,BC,H012,49
4,2019-01-05,BC,H003,52



cihi_bed_occupancy:
  Shape: (1826, 4)
  Columns: ['date', 'region', 'hospital_id', 'occupancy_rate']


Unnamed: 0,date,region,hospital_id,occupancy_rate
0,2019-01-01,BC,H006,78.001747
1,2019-01-02,Alberta,H009,73.160273
2,2019-01-03,Ontario,H002,79.521612
3,2019-01-04,BC,H012,74.766057
4,2019-01-05,BC,H003,76.554105



cihi_icu_utilization:
  Shape: (1826, 6)
  Columns: ['date', 'region', 'hospital_id', 'icu_beds_used', 'icu_beds_total', 'icu_utilization_rate']


Unnamed: 0,date,region,hospital_id,icu_beds_used,icu_beds_total,icu_utilization_rate
0,2019-01-01,BC,H006,17,25,68.0
1,2019-01-02,Alberta,H009,13,25,52.0
2,2019-01-03,Ontario,H002,17,25,68.0
3,2019-01-04,BC,H012,14,25,56.0
4,2019-01-05,BC,H003,15,25,60.0


## 3. Quality Control Checks

In [4]:
# Run the QC checks over all ingested datasets using the loaded configuration.
qc_results = run_qc_checks(datasets, config_dict)

# Report the headline QC figures for each dataset in a single print call.
for dataset_id, results in qc_results.items():
    print(
        f"\n=== {dataset_id} ===",
        f"Row count: {results['row_count']}",
        f"Column count: {results['column_count']}",
        f"Missing data: {results['missing_data']}",
        sep="\n",
    )


=== cihi_hospital_admissions ===
Row count: 1826
Column count: 4
Missing data: {'date': 0.0, 'region': 0.0, 'hospital_id': 0.0, 'admissions': 0.0}

=== cihi_bed_occupancy ===
Row count: 1826
Column count: 4
Missing data: {'date': 0.0, 'region': 0.0, 'hospital_id': 0.0, 'occupancy_rate': 0.0}

=== cihi_icu_utilization ===
Row count: 1826
Column count: 6
Missing data: {'date': 0.0, 'region': 0.0, 'hospital_id': 0.0, 'icu_beds_used': 0.0, 'icu_beds_total': 0.0, 'icu_utilization_rate': 0.0}


## 4. Explore Data

In [5]:
# Pick first dataset for exploration ("first" follows the order in which
# `datasets` yields its values). next(iter(...)) takes that value without
# building a throwaway list of every dataset.
df = next(iter(datasets.values()), None)
if df is None:
    # Same exception type the list-indexing form raised on an empty mapping,
    # but with a message that says what actually went wrong.
    raise IndexError("No datasets were ingested; nothing to explore.")

# Summary statistics
display(df.describe())

# Check date range
print(f"\nDate range: {df['date'].min()} to {df['date'].max()}")

Unnamed: 0,date,admissions
count,1826,1826.0
mean,2021-07-01 12:00:00,51.043264
min,2019-01-01 00:00:00,17.0
25%,2020-04-01 06:00:00,37.0
50%,2021-07-01 12:00:00,51.0
75%,2022-09-30 18:00:00,64.0
max,2023-12-31 00:00:00,132.0
std,,16.476693



Date range: 2019-01-01 00:00:00 to 2023-12-31 00:00:00
