# Creating CCSR-Based Patient Dictionaries from OMOP Data

This notebook creates patient dictionaries using **CCSR (Clinical Classifications Software Refined)** categories from OMOP Common Data Model data.

## What is CCSR?
CCSR is a diagnosis and procedure categorization scheme developed by AHRQ (Agency for Healthcare Research and Quality) that:
- Groups ICD-10 codes into clinically meaningful categories
- Provides hierarchical organization of diagnoses
- Distinguishes between chronic and acute conditions
- Separates inpatient (IP) and outpatient (OP) encounters

## Output
This notebook generates:
- **Chronic condition dictionaries** by CCSR category
- **Inpatient vs Outpatient** separate mappings
- **First occurrence dates** for each patient-condition pair

## Requirements
- OMOP CDM database
- SNOMED to CCSR mapping file
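- Chronic conditions indicator file (optional; if it is missing, every CCSR category in the mapping file is kept and no chronic filtering is applied)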

## 1. Setup and Imports

In [None]:
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.parquet as pq
import pyarrow.csv as csv
import pandas as pd
import numpy as np
from datetime import datetime
import json
from pathlib import Path
import logging
from typing import Dict, List, Set, Optional, Any, Tuple
from collections import defaultdict
from dataclasses import dataclass

# Configure logging: INFO level, records formatted as "time - level - message"
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger used by the functions and cells below
logger = logging.getLogger(__name__)

logger.info("Libraries imported successfully")

## 2. Configuration

In [None]:
@dataclass
class Config:
    """Configuration for CCSR dictionary creation.

    Attributes:
        omop_data_dir: Directory containing the OMOP table files.
        ccsr_mappings_dir: Directory containing the CCSR mapping CSV files.
        output_dir: Directory the JSON dictionaries are written to. It is
            created (including parents) when the config is instantiated.
        required_tables: Names of the OMOP tables to load. ``None`` (the
            default) is replaced by the standard five-table list.
    """

    # OMOP data directory
    omop_data_dir: Path = Path('./omop_data')

    # CCSR mapping files directory
    ccsr_mappings_dir: Path = Path('./ccsr_mappings')

    # Output directory
    output_dir: Path = Path('./output/ccsr_dictionaries')

    # Required OMOP tables. None is a sentinel: a dataclass field cannot use
    # a mutable list as its default, so the list is filled in __post_init__.
    required_tables: Optional[List[str]] = None

    def __post_init__(self):
        # Accept plain strings for the directory fields so that
        # Config(output_dir="out") does not fail on the mkdir call below.
        self.omop_data_dir = Path(self.omop_data_dir)
        self.ccsr_mappings_dir = Path(self.ccsr_mappings_dir)
        self.output_dir = Path(self.output_dir)

        if self.required_tables is None:
            self.required_tables = [
                'person',
                'condition_occurrence',
                'visit_occurrence',
                'concept',
                'concept_relationship'
            ]

        # Side effect: the output directory is created on instantiation.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        logger.info(f"Output directory: {self.output_dir}")

# Instantiate with the defaults; this also creates the output directory.
config = Config()

# Note: the CCSR mapping files are read from config.ccsr_mappings_dir:
# 1. snomed_to_ccsr_mapping.csv - Maps SNOMED concept IDs to inpatient/outpatient CCSR categories (required)
# 2. ccsr_chronic_conditions.csv - Flags which CCSR categories are chronic
logger.info("Configuration loaded")

## 3. Utility Functions

In [None]:
def remove_null_type_columns(table: pa.Table) -> pa.Table:
    """Return ``table`` without the columns whose Arrow type is null.

    A warning with the number of dropped columns is logged when at least one
    column is removed.
    """
    kept = [
        field.name for field in table.schema
        if not pa.types.is_null(field.type)
    ]
    dropped = len(table.column_names) - len(kept)
    if dropped > 0:
        logger.warning(f"Removed {dropped} null-type columns")
    return table.select(kept)


def ensure_int64(table: pa.Table, column_name: str) -> pa.Table:
    """Return ``table`` with ``column_name`` cast to int64 when possible.

    The table is returned unchanged when the column is absent, is already
    int64, or cannot be cast (in which case a warning is logged).
    """
    if column_name not in table.column_names:
        return table

    current = table[column_name]
    if current.type == pa.int64():
        # Nothing to do: the column already has the target type.
        return table

    try:
        position = table.column_names.index(column_name)
        converted = pc.cast(current, pa.int64())
        table = table.set_column(position, column_name, converted)
    except Exception as e:
        # Best effort: keep the original column rather than abort the run.
        logger.warning(f"Could not cast {column_name} to int64: {e}")

    return table


def load_omop_table(table_name: str, data_dir: Path) -> Optional[pa.Table]:
    """Load an OMOP table from a Parquet or CSV file.

    Candidate files are tried in this order: ``<name>.parquet``,
    ``<NAME>.parquet``, ``<name>.csv``, ``<NAME>.csv``. The first one that
    exists is loaded and its null-typed columns are dropped.

    Args:
        table_name: OMOP table name, e.g. ``'person'``.
        data_dir: Directory containing the table files.

    Returns:
        The loaded table, or ``None`` when no candidate file exists or
        loading fails (the failure is logged, not raised).
    """
    try:
        data_dir = Path(data_dir)
        possible_paths = [
            data_dir / f"{table_name}.parquet",
            data_dir / f"{table_name.upper()}.parquet",
            data_dir / f"{table_name}.csv",
            # The upper-case variant is accepted for CSV as it is for Parquet.
            data_dir / f"{table_name.upper()}.csv",
        ]

        for path in possible_paths:
            if not path.exists():
                continue

            if path.suffix == '.parquet':
                table = pq.read_table(path)
            else:
                # Only .parquet and .csv candidates are generated above.
                table = pa.Table.from_pandas(pd.read_csv(path))

            logger.info(f"Loaded {table_name}: {table.num_rows:,} rows")
            return remove_null_type_columns(table)

        logger.warning(f"Table {table_name} not found")
        return None

    except Exception as e:
        # Deliberate best effort: callers check for None.
        logger.error(f"Error loading {table_name}: {e}")
        return None


def timestamp_to_date_string(ts_array: "pa.Array | pa.ChunkedArray") -> List[Optional[str]]:
    """Convert a PyArrow timestamp or date array to ``YYYY-MM-DD`` strings.

    Args:
        ts_array: Any object exposing ``to_pylist()``; the notebook passes
            table columns here.

    Returns:
        One entry per element: the formatted date for ``datetime`` and
        ``date`` values, ``None`` for nulls and any other value.
    """
    # Local import: the file only imports ``datetime`` from the module.
    from datetime import date

    # ``datetime`` is a subclass of ``date``, so this one check covers both
    # timestamp columns (datetime values) and date columns (date values).
    return [
        v.strftime("%Y-%m-%d") if isinstance(v, date) else None
        for v in ts_array.to_pylist()
    ]


def save_dictionary(data: Dict, filename: str, output_dir: Path):
    """Write ``data`` as indented JSON to ``output_dir / filename``.

    Top-level values that are sets are written as sorted lists, because
    JSON has no set type. The number of top-level entries is logged.
    """
    if isinstance(data, dict):
        serialisable = {}
        for key, value in data.items():
            serialisable[key] = sorted(value) if isinstance(value, set) else value
        data = serialisable

    target = output_dir / filename
    with open(target, 'w') as handle:
        json.dump(data, handle, indent=2)

    logger.info(f"Saved {filename} ({len(data)} categories)")

# Progress marker: the utility-function cell has finished executing.
logger.info("Utility functions loaded")

## 4. Data Loader Class

In [None]:
class OMOPDataLoader:
    """Loads OMOP CDM tables from a directory and caches them by name."""

    def __init__(self, data_dir: Path):
        """Remember the data directory; no table is read until requested."""
        self.data_dir = data_dir
        # table name -> loaded table, or None when loading failed
        self.tables = {}

    def load_table(self, table_name: str) -> Optional[pa.Table]:
        """Return the named table, loading it on first use.

        The result of the first load attempt is cached, including ``None``
        for a table that could not be loaded, so a failed load is not
        retried on later calls.
        """
        if table_name not in self.tables:
            self.tables[table_name] = load_omop_table(table_name, self.data_dir)
        return self.tables[table_name]

    def load_all_required_tables(self, table_list: List[str]):
        """Load (and cache) every table named in ``table_list``."""
        logger.info(f"Loading {len(table_list)} OMOP tables...")
        for table_name in table_list:
            self.load_table(table_name)
        logger.info("All tables loaded")

    def get_valid_person_ids(self) -> Set[int]:
        """Return the distinct non-null person ids from the person table.

        The id column is matched ignoring case and underscores, so
        ``PERSON_ID``, ``person_id`` and ``PersonId`` are all accepted. An
        empty set is returned (and an error logged) when the table or the
        column is missing.
        """
        person_table = self.load_table('person')
        if person_table is None:
            logger.error("Person table not found!")
            return set()

        # Case-insensitive lookup, in line with how the condition and visit
        # columns are resolved elsewhere in this notebook.
        person_id_col = next(
            (
                name for name in person_table.column_names
                if name.replace('_', '').upper() == 'PERSONID'
            ),
            None,
        )

        if person_id_col is None:
            logger.error("Could not find person_id column")
            return set()

        person_ids = set(person_table[person_id_col].to_pylist())
        # A null id is not a valid person and must not match null rows later.
        person_ids.discard(None)
        logger.info(f"Found {len(person_ids):,} valid persons")
        return person_ids

# Progress marker: the data-loader cell has finished executing.
logger.info("Data loader class defined")

## 5. Load CCSR Mapping Files

You need to provide two mapping files in the `ccsr_mappings` directory:

### 1. `snomed_to_ccsr_mapping.csv`
Maps SNOMED concept IDs to CCSR categories. Expected columns:
- `snomed_concept_id` (int): OMOP concept ID of the SNOMED condition; it is joined against `condition_occurrence.condition_concept_id`
- `ccsr_category_ip` (str): Inpatient CCSR category code
- `ccsr_category_op` (str): Outpatient CCSR category code

### 2. `ccsr_chronic_conditions.csv`
List of CCSR categories considered chronic. Expected columns:
- `ccsr_category` (str): CCSR category code
- `is_chronic` (bool): Whether condition is chronic

In [None]:
def load_ccsr_mappings(mappings_dir: Path) -> Tuple[pa.Table, Set[str]]:
    """
    Load CCSR mapping files.

    Args:
        mappings_dir: Directory containing ``snomed_to_ccsr_mapping.csv``
            and, optionally, ``ccsr_chronic_conditions.csv``.

    Returns:
        snomed_ccsr_map: PyArrow table with columns ``snomed_concept_id``,
            ``ip_cat`` and ``op_cat``
        chronic_categories: Set of chronic CCSR category codes. When the
            chronic conditions file is absent, this is every category that
            appears in the mapping.

    Raises:
        FileNotFoundError: If the SNOMED to CCSR mapping file is missing.
        ValueError: If either file lacks one of its expected columns.
    """
    logger.info("Loading CCSR mapping files...")

    # Load SNOMED to CCSR mapping
    snomed_ccsr_path = mappings_dir / 'snomed_to_ccsr_mapping.csv'
    if not snomed_ccsr_path.exists():
        raise FileNotFoundError(
            f"SNOMED to CCSR mapping not found: {snomed_ccsr_path}\n"
            "Please provide a CSV with columns: snomed_concept_id, ccsr_category_ip, ccsr_category_op"
        )

    # Read mapping file
    snomed_ccsr_df = pd.read_csv(snomed_ccsr_path)

    # Ensure required columns exist
    required_cols = ['snomed_concept_id', 'ccsr_category_ip', 'ccsr_category_op']
    missing_cols = set(required_cols) - set(snomed_ccsr_df.columns)
    if missing_cols:
        raise ValueError(f"Missing columns in SNOMED-CCSR mapping: {missing_cols}")

    # Convert to PyArrow. preserve_index=False guarantees exactly the three
    # selected columns, which the positional rename below relies on.
    snomed_ccsr_map = pa.Table.from_pandas(
        snomed_ccsr_df[required_cols], preserve_index=False
    )
    snomed_ccsr_map = snomed_ccsr_map.rename_columns([
        'snomed_concept_id',
        'ip_cat',
        'op_cat'
    ])

    # Ensure snomed_concept_id is int64
    snomed_ccsr_map = ensure_int64(snomed_ccsr_map, 'snomed_concept_id')

    logger.info(f"Loaded {snomed_ccsr_map.num_rows:,} SNOMED-CCSR mappings")

    # Load chronic conditions list
    chronic_path = mappings_dir / 'ccsr_chronic_conditions.csv'
    if not chronic_path.exists():
        logger.warning(
            f"Chronic conditions file not found: {chronic_path}\n"
            "All CCSR categories will be considered (no chronic filtering)"
        )
        # Extract all unique categories from mapping
        chronic_categories = set(
            snomed_ccsr_map['ip_cat'].to_pylist() +
            snomed_ccsr_map['op_cat'].to_pylist()
        )
        chronic_categories.discard(None)
    else:
        chronic_df = pd.read_csv(chronic_path)

        # Validate the chronic file the same way as the mapping file, so a
        # malformed file gives a clear message instead of a bare KeyError.
        missing_chronic_cols = {'ccsr_category', 'is_chronic'} - set(chronic_df.columns)
        if missing_chronic_cols:
            raise ValueError(
                f"Missing columns in chronic conditions file: {missing_chronic_cols}"
            )

        # Element-wise comparison on purpose: the column may hold booleans
        # or 0/1 integers, so ``is True`` would not work here.
        is_chronic = chronic_df['is_chronic'] == True  # noqa: E712
        # dropna keeps a blank category cell (NaN) out of the set.
        chronic_categories = set(
            chronic_df.loc[is_chronic, 'ccsr_category'].dropna().tolist()
        )

    logger.info(f"Identified {len(chronic_categories)} chronic CCSR categories")

    return snomed_ccsr_map, chronic_categories

# Load mappings (raises if the SNOMED-to-CCSR mapping file is missing or lacks a required column)
snomed_ccsr_map, chronic_categories = load_ccsr_mappings(config.ccsr_mappings_dir)

## 6. Load OMOP Data

In [None]:
# Initialize data loader
loader = OMOPDataLoader(config.omop_data_dir)

# Load all required tables (tables that cannot be found are logged and cached as None)
loader.load_all_required_tables(config.required_tables)

# Get valid person IDs; this is an empty set when the person table or its id column is missing
valid_person_ids = loader.get_valid_person_ids()

logger.info(f"Data loading complete. Working with {len(valid_person_ids):,} patients")

## 7. Filter CCSR Mapping to Chronic Conditions Only

In [None]:
def filter_to_chronic_categories(snomed_ccsr_map: pa.Table,
                                 chronic_set: Set[str]) -> pa.Table:
    """
    Filter SNOMED-CCSR mapping to include only chronic conditions.

    Args:
        snomed_ccsr_map: Mapping table with ``ip_cat`` and ``op_cat`` columns.
        chronic_set: CCSR category codes regarded as chronic.

    Returns:
        The rows whose inpatient or outpatient category is in
        ``chronic_set``. Missing or empty categories never match.
    """
    logger.info("Filtering to chronic conditions only...")

    def is_chronic(category) -> bool:
        # Null or empty categories are never chronic.
        return bool(category) and category in chronic_set

    ip_categories = snomed_ccsr_map['ip_cat'].to_pylist()
    op_categories = snomed_ccsr_map['op_cat'].to_pylist()

    # Keep rows where either IP or OP category is chronic
    chronic_mask = [
        is_chronic(ip_cat) or is_chronic(op_cat)
        for ip_cat, op_cat in zip(ip_categories, op_categories)
    ]

    # The explicit boolean type matters for an empty mapping: pa.array([])
    # would otherwise infer a null type, which is not a valid filter mask.
    filtered_map = snomed_ccsr_map.filter(pa.array(chronic_mask, type=pa.bool_()))

    logger.info(
        f"Filtered from {snomed_ccsr_map.num_rows:,} to "
        f"{filtered_map.num_rows:,} chronic condition mappings"
    )

    return filtered_map

# Filter to chronic conditions
snomed_ccsr_chronic = filter_to_chronic_categories(snomed_ccsr_map, chronic_categories)

# Remove rows with null SNOMED IDs: they cannot be matched to any condition
# in the join on snomed_concept_id that follows.
snomed_ccsr_chronic = snomed_ccsr_chronic.filter(
    pc.is_valid(snomed_ccsr_chronic['snomed_concept_id'])
)

## 8. Join Condition Occurrences with CCSR Categories

In [None]:
def _resolve_columns(table: pa.Table, wanted: List[str]) -> Dict[str, str]:
    """Map each upper-case name in ``wanted`` to the table column matching it case-insensitively.

    Names with no matching column are omitted. The result is ordered like
    ``wanted``; when several columns differ only by case, the first wins.
    """
    by_upper: Dict[str, str] = {}
    for actual_col in table.column_names:
        by_upper.setdefault(actual_col.upper(), actual_col)
    return {std_col: by_upper[std_col] for std_col in wanted if std_col in by_upper}


def join_conditions_with_ccsr(loader: OMOPDataLoader,
                             snomed_ccsr_map: pa.Table,
                             valid_person_ids: Set[int]) -> pa.Table:
    """
    Join condition_occurrence with CCSR mappings and visit information.

    Args:
        loader: Source of the ``condition_occurrence`` and
            ``visit_occurrence`` tables.
        snomed_ccsr_map: Mapping table keyed by ``snomed_concept_id`` with
            ``ip_cat`` and ``op_cat`` columns.
        valid_person_ids: Person ids to keep; all other rows are dropped.

    Returns:
        One row per matched condition with columns ``person_id``,
        ``visit_occurrence_id``, ``snomed_concept_id``,
        ``condition_start_date``, ``ip_cat``, ``op_cat``,
        ``visit_concept_id`` (null when the visit is not found) and
        ``condition_date_parsed`` (second-resolution timestamp).

    Raises:
        ValueError: If a table or one of its required columns is missing.
    """
    logger.info("Joining conditions with CCSR categories...")

    # Load condition_occurrence
    condition_table = loader.load_table('condition_occurrence')
    if condition_table is None:
        raise ValueError("condition_occurrence table not found")

    condition_table = remove_null_type_columns(condition_table)

    # Find required columns (case-insensitive)
    condition_cols = [
        'PERSON_ID',
        'VISIT_OCCURRENCE_ID',
        'CONDITION_CONCEPT_ID',
        'CONDITION_START_DATE',
    ]
    col_map = _resolve_columns(condition_table, condition_cols)

    missing = set(condition_cols) - set(col_map)
    if missing:
        raise ValueError(f"Missing required columns in condition_occurrence: {missing}")

    # Select and rename columns; the selection follows condition_cols so the
    # positional rename lines up with it.
    cond_proj = condition_table.select(
        [col_map[std_col] for std_col in condition_cols]
    ).rename_columns([
        'person_id',
        'visit_occurrence_id',
        'snomed_concept_id',
        'condition_start_date'
    ])

    # Ensure proper types
    cond_proj = ensure_int64(cond_proj, 'snomed_concept_id')

    # Join with CCSR mapping (inner: unmapped conditions are dropped)
    cond_with_ccsr = cond_proj.join(
        snomed_ccsr_map,
        keys='snomed_concept_id',
        join_type='inner'
    )

    logger.info(f"Matched {cond_with_ccsr.num_rows:,} conditions to CCSR categories")

    # Load visit_occurrence for IP/OP distinction
    visit_table = loader.load_table('visit_occurrence')
    if visit_table is None:
        raise ValueError("visit_occurrence table not found")

    visit_table = remove_null_type_columns(visit_table)

    # Find visit columns
    visit_cols = ['VISIT_OCCURRENCE_ID', 'VISIT_CONCEPT_ID']
    visit_col_map = _resolve_columns(visit_table, visit_cols)

    if len(visit_col_map) < 2:
        raise ValueError("Missing VISIT_OCCURRENCE_ID or VISIT_CONCEPT_ID in visit_occurrence")

    visit_proj = visit_table.select(
        [visit_col_map[std_col] for std_col in visit_cols]
    ).rename_columns([
        'visit_occurrence_id',
        'visit_concept_id'
    ])

    visit_proj = ensure_int64(visit_proj, 'visit_concept_id')

    # The join needs both key columns to share a type. A CSV column with
    # blank visit ids is read as float64 while the visit table's key is
    # int64, so normalise both sides, but only when they actually differ.
    if cond_with_ccsr['visit_occurrence_id'].type != visit_proj['visit_occurrence_id'].type:
        cond_with_ccsr = ensure_int64(cond_with_ccsr, 'visit_occurrence_id')
        visit_proj = ensure_int64(visit_proj, 'visit_occurrence_id')

    # Join with visit info (left outer: conditions without a visit are kept)
    cond_joined = cond_with_ccsr.join(
        visit_proj,
        keys='visit_occurrence_id',
        join_type='left outer'
    )

    cond_joined = ensure_int64(cond_joined, 'visit_concept_id')

    # Filter to valid persons
    valid_mask = pc.is_in(
        cond_joined['person_id'],
        pa.array(list(valid_person_ids), type=cond_joined['person_id'].type)
    )
    cond_joined = cond_joined.filter(valid_mask)

    # Parse dates. Only the first 10 characters (YYYY-MM-DD) are parsed so
    # that timestamp columns and date-time strings such as
    # "2020-01-05 00:00:00" parse as well as plain dates.
    date_str = pc.cast(cond_joined['condition_start_date'], pa.string())
    date_only = pc.utf8_slice_codeunits(date_str, start=0, stop=10)
    date_parsed = pc.strptime(date_only, format='%Y-%m-%d', unit='s')
    cond_joined = cond_joined.append_column('condition_date_parsed', date_parsed)

    logger.info(f"Final joined table: {cond_joined.num_rows:,} rows")

    return cond_joined

# Join conditions with CCSR (uses the chronic-only mapping, so the result holds chronic conditions only)
conditions_with_ccsr = join_conditions_with_ccsr(
    loader, 
    snomed_ccsr_chronic, 
    valid_person_ids
)

## 9. Separate Inpatient and Outpatient Encounters

OMOP standard visit concept IDs:
- **9201**: Inpatient visit
- **9202**: Outpatient visit

In [None]:
def separate_ip_op(cond_table: pa.Table,
                   ip_concept_ids: Tuple[int, ...] = (9201,),
                   op_concept_ids: Tuple[int, ...] = (9202,)) -> Tuple[pa.Table, pa.Table]:
    """
    Separate inpatient and outpatient encounters.

    Args:
        cond_table: Joined condition table with ``visit_concept_id``,
            ``ip_cat`` and ``op_cat`` columns.
        ip_concept_ids: Visit concept ids counted as inpatient. Defaults to
            the OMOP standard inpatient visit concept, 9201.
        op_concept_ids: Visit concept ids counted as outpatient. Defaults to
            the OMOP standard outpatient visit concept, 9202.

    Returns:
        ``(ip_table, op_table)``: inpatient rows with a non-null ``ip_cat``
        and outpatient rows with a non-null ``op_cat``. Rows whose visit
        concept is null or not listed appear in neither table.
    """
    logger.info("Separating inpatient and outpatient encounters...")

    visit_concepts = cond_table['visit_concept_id']

    def visit_mask(concept_ids):
        # The value set is built with the column's own type so that the
        # membership test needs no implicit cast.
        return pc.is_in(
            visit_concepts,
            value_set=pa.array(list(concept_ids), type=visit_concepts.type)
        )

    # Create masks for IP and OP
    mask_ip = visit_mask(ip_concept_ids)
    mask_op = visit_mask(op_concept_ids)

    # Filter to IP with valid IP categories
    ip_table = cond_table.filter(
        pc.and_(mask_ip, pc.is_valid(cond_table['ip_cat']))
    )

    # Filter to OP with valid OP categories
    op_table = cond_table.filter(
        pc.and_(mask_op, pc.is_valid(cond_table['op_cat']))
    )

    logger.info(f"Inpatient encounters: {ip_table.num_rows:,}")
    logger.info(f"Outpatient encounters: {op_table.num_rows:,}")

    return ip_table, op_table

# Separate encounters into inpatient and outpatient tables by visit_concept_id
ip_conditions, op_conditions = separate_ip_op(conditions_with_ccsr)

## 10. Create CCSR Dictionaries with Earliest Dates

In [None]:
def _earliest_dates_by_category(
    table: pa.Table,
    category_column: str,
) -> Tuple[Dict[str, List[int]], Dict[str, Dict[int, Optional[str]]]]:
    """Build the patient list and the earliest condition date per CCSR category for one table."""
    grouped = (
        table
        .group_by([category_column, 'person_id'])
        .aggregate([('condition_date_parsed', 'min')])
    )

    # Columns are read by name rather than renamed by position, because the
    # column order of the aggregate output has differed between PyArrow
    # releases. The aggregate column is named "<column>_<function>".
    categories = grouped[category_column].to_pylist()
    person_ids = grouped['person_id'].to_pylist()
    first_dates = timestamp_to_date_string(grouped['condition_date_parsed_min'])

    patients_by_cat = defaultdict(set)
    dates_by_cat = defaultdict(dict)

    # Plain Python values are compared here: an Arrow scalar is never
    # ``None`` even when it holds a null.
    for cat, pid, first_date in zip(categories, person_ids, first_dates):
        if cat is None or pid is None:
            continue
        patients_by_cat[cat].add(pid)
        dates_by_cat[cat][pid] = first_date

    patients = {cat: sorted(pids) for cat, pids in patients_by_cat.items()}
    return patients, dict(dates_by_cat)


def create_ccsr_dictionaries(ip_table: pa.Table,
                             op_table: pa.Table,
                             output_dir: Path) -> Dict:
    """
    Create patient dictionaries by CCSR category for IP and OP separately.
    Includes earliest occurrence dates.

    Args:
        ip_table: Inpatient condition rows with ``ip_cat``, ``person_id``
            and ``condition_date_parsed`` columns.
        op_table: Outpatient condition rows with ``op_cat``, ``person_id``
            and ``condition_date_parsed`` columns.
        output_dir: Directory the four JSON files are written to.

    Returns:
        Dict with four entries:
        ``patients_by_ccsr_chronic_ip`` / ``patients_by_ccsr_chronic_op``
        map a category to its sorted patient ids, and the matching
        ``..._first_date`` entries map a category to
        ``{patient_id: 'YYYY-MM-DD' or None}``. An empty input table gives
        empty dictionaries.
    """
    logger.info("Creating CCSR chronic condition dictionaries...")

    # Process Inpatient
    if ip_table.num_rows > 0:
        ccsr_ip, ccsr_ip_dates = _earliest_dates_by_category(ip_table, 'ip_cat')
        logger.info(f"Created {len(ccsr_ip)} inpatient CCSR categories")
    else:
        ccsr_ip = {}
        ccsr_ip_dates = {}
        logger.warning("No inpatient encounters found")

    # Process Outpatient
    if op_table.num_rows > 0:
        ccsr_op, ccsr_op_dates = _earliest_dates_by_category(op_table, 'op_cat')
        logger.info(f"Created {len(ccsr_op)} outpatient CCSR categories")
    else:
        ccsr_op = {}
        ccsr_op_dates = {}
        logger.warning("No outpatient encounters found")

    # Save dictionaries (one JSON file per entry, named after its key)
    results = {
        'patients_by_ccsr_chronic_ip': ccsr_ip,
        'patients_by_ccsr_chronic_ip_first_date': ccsr_ip_dates,
        'patients_by_ccsr_chronic_op': ccsr_op,
        'patients_by_ccsr_chronic_op_first_date': ccsr_op_dates
    }

    for name, data in results.items():
        save_dictionary(data, f"{name}.json", output_dir)

    logger.info("CCSR chronic condition dictionaries created successfully")
    return results

# Create dictionaries (also writes one JSON file per dictionary to config.output_dir)
ccsr_results = create_ccsr_dictionaries(
    ip_conditions,
    op_conditions,
    config.output_dir
)

## 11. Summary

In [None]:
# Report which JSON files now exist in the output directory
output_files = list(config.output_dir.glob('*.json'))

print(
    "\n" + "="*60,
    "CCSR DICTIONARY CREATION COMPLETE",
    "="*60,
    f"\nOutput directory: {config.output_dir}",
    f"\nGenerated {len(output_files)} files:\n",
    sep="\n",
)

# One line per file, in name order, with its size in KB
for file in sorted(output_files):
    size_kb = file.stat().st_size / 1024
    print(f"  • {file.name:<50} ({size_kb:>8.1f} KB)")

# Describe the layout of each generated dictionary
print(
    "\n" + "="*60,
    "\nDictionary Structure:",
    "  - patients_by_ccsr_chronic_ip.json",
    "    Format: {ccsr_category: [patient_ids]}",
    "\n  - patients_by_ccsr_chronic_ip_first_date.json",
    "    Format: {ccsr_category: {patient_id: first_date}}",
    "\n  - patients_by_ccsr_chronic_op.json",
    "    Format: {ccsr_category: [patient_ids]}",
    "\n  - patients_by_ccsr_chronic_op_first_date.json",
    "    Format: {ccsr_category: {patient_id: first_date}}",
    "="*60,
    sep="\n",
)

# Sample output: the first category of each dictionary, if there is one
if ccsr_results['patients_by_ccsr_chronic_ip']:
    sample_cat = next(iter(ccsr_results['patients_by_ccsr_chronic_ip']))
    sample_patients = ccsr_results['patients_by_ccsr_chronic_ip'][sample_cat]
    print(
        f"\nExample - Inpatient '{sample_cat}':",
        f"  Patient count: {len(sample_patients)}",
        f"  Sample IDs: {sample_patients[:5]}",
        sep="\n",
    )

if ccsr_results['patients_by_ccsr_chronic_op']:
    sample_cat = next(iter(ccsr_results['patients_by_ccsr_chronic_op']))
    sample_patients = ccsr_results['patients_by_ccsr_chronic_op'][sample_cat]
    print(
        f"\nExample - Outpatient '{sample_cat}':",
        f"  Patient count: {len(sample_patients)}",
        f"  Sample IDs: {sample_patients[:5]}",
        sep="\n",
    )