# Creating Patient Dictionaries from OMOP CDM Data

This notebook demonstrates how to create structured patient dictionaries from OMOP Common Data Model (CDM) data, including:
- Condition mappings (ICD-10, SNOMED, CCSR categories)
- Procedure mappings
- Drug/medication mappings
- Lab measurement mappings
- **Temporal tracking**: earliest occurrence dates for each clinical event

## Use Cases
- Multimorbidity research
- Chronic disease pattern analysis
- Healthcare utilization studies
- Cohort identification
- Longitudinal health tracking

## Requirements
- OMOP CDM data as Parquet or CSV files (other sources, such as a database connection, require adapting `load_omop_table()`)
- Dictionary definition files (provided separately)
- Python libraries: pyarrow, pandas, numpy

## 1. Setup and Imports

In [None]:
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
from datetime import datetime
import json
from pathlib import Path
import logging
from typing import Dict, List, Set, Optional, Any, Tuple
from collections import defaultdict
from dataclasses import dataclass

# Send INFO-and-above records through the root handler with a timestamped layout.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

# Module-scoped logger used by the rest of the notebook.
logger = logging.getLogger(__name__)
logger.info("Libraries imported successfully")

## 2. Configuration

Update these paths to match your data location:

In [None]:
@dataclass
class Config:
    """Configuration for dictionary creation pipeline.

    Path fields accept either ``Path`` objects or plain strings; both are
    normalised to ``Path`` on construction. Instantiating the class creates
    ``output_dir`` (including missing parents) and logs its location.
    """

    # Input: OMOP data directory (adjust to your data location)
    omop_data_dir: Path = Path('./omop_data')  # Change this to your OMOP data path

    # Output: where to save generated dictionaries
    output_dir: Path = Path('./output/dictionaries')

    # OMOP tables needed; None selects the default list built in __post_init__
    required_tables: Optional[List[str]] = None

    def __post_init__(self):
        # Coerce string paths so the Path-only calls below cannot fail.
        self.omop_data_dir = Path(self.omop_data_dir)
        self.output_dir = Path(self.output_dir)

        if self.required_tables is None:
            # A fresh list per instance, so instances never share state.
            self.required_tables = [
                'person',
                'condition_occurrence',
                'drug_exposure',
                'measurement',
                'procedure_occurrence',
                'visit_occurrence',
                'concept',
                'concept_ancestor',
                'concept_relationship'
            ]

        self.output_dir.mkdir(parents=True, exist_ok=True)
        logger.info(f"Output directory: {self.output_dir}")

# Instantiate the pipeline configuration; Config.__post_init__ also creates
# the output directory.
config = Config()

# Load dictionary definitions (these should be in the same directory)
# NOTE(review): rx_risk_dictionary is imported here but not referenced in the
# cells visible in this notebook — confirm whether it is still needed.
from dictionary_definitions import (
    all_conditions_dictionary,
    all_procedures_dictionary,
    all_drugs_dictionary,
    lab_conditions,
    rx_risk_dictionary
)

# Report the number of entries in each imported definition object.
logger.info(f"Loaded {len(all_conditions_dictionary)} condition categories")
logger.info(f"Loaded {len(all_procedures_dictionary)} procedure categories")
logger.info(f"Loaded {len(all_drugs_dictionary)} drug categories")
logger.info(f"Loaded {len(lab_conditions)} lab conditions")

## 3. Utility Functions

In [None]:
def remove_null_type_columns(table: pa.Table) -> pa.Table:
    """
    Return ``table`` restricted to the columns whose Arrow type is not null.

    Null/NA-typed columns can cause issues in downstream processing, so they
    are dropped; a warning reports how many were removed.
    """
    kept = []
    for name, field in zip(table.column_names, table.schema):
        if pa.types.is_null(field.type):
            continue
        kept.append(name)

    dropped = len(table.column_names) - len(kept)
    if dropped > 0:
        logger.warning(f"Removed {dropped} null-type columns")

    return table.select(kept)


def ensure_int64(table: pa.Table, column_name: str) -> pa.Table:
    """
    Ensure a column is int64 type for consistent comparisons.

    Args:
        table: Table to normalise.
        column_name: Name of the column to cast.

    Returns:
        ``table`` with ``column_name`` replaced by its int64 cast. The input
        table is returned unchanged when the column is absent, is already
        int64, or cannot be cast (a warning is logged in that last case).
    """
    if column_name not in table.column_names:
        return table

    col = table[column_name]
    if col.type == pa.int64():
        return table

    # One guarded cast for every source type, so a failed cast is handled the
    # same way whether the column started as an integer type or not.
    try:
        casted = pc.cast(col, pa.int64())
    except (pa.ArrowInvalid, pa.ArrowNotImplementedError, pa.ArrowTypeError) as e:
        logger.warning(f"Could not cast {column_name} to int64: {e}")
        return table

    idx = table.column_names.index(column_name)
    return table.set_column(idx, column_name, casted)


def load_omop_table(table_name: str, data_dir: Path) -> Optional[pa.Table]:
    """
    Load an OMOP table from a Parquet or CSV file.
    Modify this function if your data is in a different format (database, etc.)

    Files are probed in priority order: ``<name>.parquet``, ``<NAME>.parquet``,
    ``<name>.csv``, ``<NAME>.csv``. The first one that exists is loaded.

    Args:
        table_name: OMOP table name, e.g. ``'person'``.
        data_dir: Directory containing the table files (``Path`` or string).

    Returns:
        The loaded table with null-typed columns removed, or ``None`` when no
        candidate file exists or loading fails. This function never raises;
        failures are logged instead.
    """
    try:
        data_dir = Path(data_dir)
        # Try common file patterns
        candidates = [
            data_dir / f"{table_name}.parquet",
            data_dir / f"{table_name.upper()}.parquet",
            data_dir / f"{table_name}.csv",
            data_dir / f"{table_name.upper()}.csv",
        ]

        path = next((p for p in candidates if p.exists()), None)
        if path is None:
            logger.warning(f"Table {table_name} not found in {data_dir}")
            return None

        if path.suffix == '.parquet':
            table = pq.read_table(path)
        else:
            # Only .parquet and .csv candidates are generated above.
            table = pa.Table.from_pandas(pd.read_csv(path))

        logger.info(f"Loaded {table_name}: {table.num_rows:,} rows, {len(table.column_names)} columns")
        return remove_null_type_columns(table)

    except Exception as e:
        # Deliberately broad: callers treat a table that fails to load as a
        # missing table. logger.exception keeps the traceback for diagnosis.
        logger.exception(f"Error loading {table_name}: {e}")
        return None


def save_dictionary(data: Dict, filename: str, output_dir: Path):
    """
    Save dictionary to JSON file.

    Args:
        data: Mapping to serialise. Top-level values that are sets are
            written as sorted lists; all other values are passed to ``json``
            unchanged.
        filename: Name of the file to create inside ``output_dir``.
        output_dir: Target directory; created (with parents) if missing.
    """
    output_path = Path(output_dir) / filename
    # Create the directory so the function also works on its own.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Convert sets to lists for JSON serialization
    if isinstance(data, dict):
        data = {k: sorted(v) if isinstance(v, set) else v
                for k, v in data.items()}

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2)

    logger.info(f"Saved {filename} ({len(data)} entries)")


def timestamp_to_date_string(ts_array: pa.Array) -> List[Optional[str]]:
    """
    Convert PyArrow timestamp array to date strings (YYYY-MM-DD).

    Args:
        ts_array: Any object exposing ``to_pylist()``.

    Returns:
        One entry per input element: the formatted date for ``datetime`` and
        ``date`` values, ``None`` for everything else (including nulls).
    """
    # Local import: the file-level import only brings in `datetime`.
    from datetime import date

    # `datetime` subclasses `date`, so one check covers both kinds of value.
    return [
        v.strftime("%Y-%m-%d") if isinstance(v, date) else None
        for v in ts_array.to_pylist()
    ]

# Progress marker: logged once the utility definitions in this cell have run.
logger.info("Utility functions loaded")

## 4. Data Loader Class

In [None]:
class OMOPDataLoader:
    """
    Loads and manages OMOP CDM tables.

    Tables are loaded on first request through ``load_omop_table`` and cached
    by name in ``self.tables``. A failed load is cached as ``None``.
    """

    def __init__(self, data_dir: Path):
        self.data_dir = data_dir
        # table name -> loaded table, or None when loading failed
        self.tables = {}

    def load_table(self, table_name: str) -> Optional[pa.Table]:
        """Load a single OMOP table, reusing the cached result if present."""
        if table_name not in self.tables:
            self.tables[table_name] = load_omop_table(table_name, self.data_dir)
        return self.tables[table_name]

    def load_all_required_tables(self, table_list: List[str]):
        """Load all required tables and report any that could not be loaded."""
        logger.info(f"Loading {len(table_list)} OMOP tables...")
        missing = [name for name in table_list if self.load_table(name) is None]
        if missing:
            # Do not claim success when some tables are unavailable.
            logger.warning(
                f"{len(missing)} of {len(table_list)} tables could not be loaded: {missing}"
            )
        else:
            logger.info("All tables loaded")

    def get_valid_person_ids(self) -> Set[int]:
        """
        Get set of valid person IDs from person table.

        Returns:
            The distinct non-null values of the person-id column, or an empty
            set when the person table or the column cannot be found.
        """
        person_table = self.load_table('person')
        if person_table is None:
            logger.error("Person table not found!")
            return set()

        # Match the id column ignoring case and underscores, so PERSON_ID,
        # person_id, PersonId and similar spellings are all recognised.
        person_id_col = next(
            (col for col in person_table.column_names
             if col.lower().replace('_', '') == 'personid'),
            None,
        )

        if person_id_col is None:
            logger.error("Could not find person_id column")
            return set()

        person_ids = set(person_table[person_id_col].to_pylist())
        # A null id is not a valid person; left in the set it would turn into
        # a null entry in any Arrow value set built from it.
        person_ids.discard(None)
        logger.info(f"Found {len(person_ids):,} valid persons")
        return person_ids

# Progress marker: logged once the OMOPDataLoader class definition has run.
logger.info("Data loader class defined")

## 5. Load OMOP Data

**Note**: Modify the `config.omop_data_dir` path above to point to your OMOP data location.

In [None]:
# Initialize data loader
# `loader` stays at module level; the cells below pass it to the
# dictionary-building functions.
loader = OMOPDataLoader(config.omop_data_dir)

# Load all required tables
# Loaded tables are cached inside the loader, so later load_table() calls for
# the same name do not re-read the files.
loader.load_all_required_tables(config.required_tables)

# Get valid person IDs
# Kept at module level so later cells can restrict records to these persons.
valid_person_ids = loader.get_valid_person_ids()

logger.info(f"Data loading complete. Working with {len(valid_person_ids):,} patients")

## 6. Create Condition Dictionaries

Maps conditions (diagnoses) to patients, tracking earliest occurrence.

In [None]:
def create_condition_dictionaries(loader: OMOPDataLoader,
                                 condition_dict: Dict,
                                 output_dir: Path,
                                 valid_person_ids: Optional[Set[int]] = None) -> Dict:
    """
    Create patient dictionaries for conditions with earliest dates.

    Args:
        loader: OMOP data loader
        condition_dict: Dictionary mapping category names to concept IDs
        output_dir: Where to save output files
        valid_person_ids: Person IDs to keep. When omitted, the IDs are taken
            from ``loader.get_valid_person_ids()`` so the function does not
            depend on a module-level variable.

    Returns:
        Dictionary with patient mappings and dates:
        ``{'patients_by_condition': {category: [person_id, ...]},
        'patients_by_condition_first_date': {category: {person_id: 'YYYY-MM-DD'}}}``.
        An empty dict is returned when the table or a required column is
        missing. Categories without matching records are omitted.
    """
    logger.info("Creating condition dictionaries...")

    # Load condition_occurrence table
    cond_table = loader.load_table('condition_occurrence')
    if cond_table is None:
        logger.error("condition_occurrence table not found")
        return {}

    # Resolve the required columns case-insensitively; when several columns
    # differ only by case, the first one in table order wins.
    required = ('person_id', 'condition_concept_id', 'condition_start_date')
    actual_by_lower = {}
    for col in cond_table.column_names:
        actual_by_lower.setdefault(col.lower(), col)
    col_mapping = {name: actual_by_lower.get(name) for name in required}

    if any(v is None for v in col_mapping.values()):
        logger.error(f"Missing required columns. Found: {cond_table.column_names}")
        return {}

    # Select and rename columns
    cond_table = cond_table.select(list(col_mapping.values()))
    cond_table = cond_table.rename_columns(list(col_mapping.keys()))

    # Ensure proper types
    cond_table = ensure_int64(cond_table, 'person_id')
    cond_table = ensure_int64(cond_table, 'condition_concept_id')

    # Parse dates. Date/timestamp-typed columns are cast directly; their
    # string form can carry a time part that the '%Y-%m-%d' format rejects.
    raw_dates = cond_table['condition_start_date']
    if pa.types.is_timestamp(raw_dates.type) or pa.types.is_date(raw_dates.type):
        # safe=False: dropping sub-second precision is acceptable here.
        date_parsed = pc.cast(raw_dates, pa.timestamp('s'), safe=False)
    else:
        date_parsed = pc.strptime(
            pc.cast(raw_dates, pa.string()), format='%Y-%m-%d', unit='s'
        )
    cond_table = cond_table.append_column('date_parsed', date_parsed)

    # Filter to valid patients
    if valid_person_ids is None:
        valid_person_ids = loader.get_valid_person_ids()
    valid_mask = pc.is_in(
        cond_table['person_id'],
        value_set=pa.array(list(valid_person_ids), type=pa.int64())
    )
    cond_table = cond_table.filter(valid_mask)

    logger.info(f"Processing {cond_table.num_rows:,} condition records")

    # Create dictionaries for each category
    results = {
        'patients_by_condition': {},
        'patients_by_condition_first_date': {}
    }

    for category_name, concept_ids in condition_dict.items():
        # Filter to this category's concepts; list() lets the definitions use
        # any iterable of IDs (lists, tuples, sets).
        concept_mask = pc.is_in(
            cond_table['condition_concept_id'],
            value_set=pa.array(list(concept_ids), type=pa.int64())
        )
        category_table = cond_table.filter(concept_mask)

        if category_table.num_rows == 0:
            continue

        # Group by person and get earliest date
        grouped = category_table.group_by('person_id').aggregate([
            ('date_parsed', 'min')
        ])

        # Extract results
        person_ids = grouped['person_id'].to_pylist()
        dates = timestamp_to_date_string(grouped['date_parsed_min'])

        results['patients_by_condition'][category_name] = sorted(person_ids)
        results['patients_by_condition_first_date'][category_name] = dict(zip(person_ids, dates))

    # Save results
    save_dictionary(results['patients_by_condition'], 'conditions.json', output_dir)
    save_dictionary(results['patients_by_condition_first_date'], 'conditions_first_date.json', output_dir)

    logger.info(f"Created dictionaries for {len(results['patients_by_condition'])} condition categories")
    return results

# Run condition dictionary creation
# Uses the loader, condition definitions and output directory set up in the
# earlier cells; the returned dict is bound to `condition_results`.
condition_results = create_condition_dictionaries(
    loader, 
    all_conditions_dictionary, 
    config.output_dir
)

## 7. Create Procedure Dictionaries

Maps procedures to patients with earliest occurrence dates.

In [None]:
def create_procedure_dictionaries(loader: OMOPDataLoader,
                                 procedure_dict: Dict,
                                 output_dir: Path,
                                 valid_person_ids: Optional[Set[int]] = None) -> Dict:
    """
    Create patient dictionaries for procedures with earliest dates.

    Args:
        loader: OMOP data loader
        procedure_dict: Dictionary mapping category names to concept IDs
        output_dir: Where to save output files
        valid_person_ids: Person IDs to keep. When omitted, the IDs are taken
            from ``loader.get_valid_person_ids()`` so the function does not
            depend on a module-level variable.

    Returns:
        ``{'patients_by_procedure': {category: [person_id, ...]},
        'patients_by_procedure_first_date': {category: {person_id: 'YYYY-MM-DD'}}}``.
        An empty dict is returned when the table or a required column is
        missing. Categories without matching records are omitted.
    """
    logger.info("Creating procedure dictionaries...")

    proc_table = loader.load_table('procedure_occurrence')
    if proc_table is None:
        logger.error("procedure_occurrence table not found")
        return {}

    # Resolve the required columns case-insensitively; when several columns
    # differ only by case, the first one in table order wins.
    required = ('person_id', 'procedure_concept_id', 'procedure_date')
    actual_by_lower = {}
    for col in proc_table.column_names:
        actual_by_lower.setdefault(col.lower(), col)
    col_mapping = {name: actual_by_lower.get(name) for name in required}

    if any(v is None for v in col_mapping.values()):
        logger.error(f"Missing required columns. Found: {proc_table.column_names}")
        return {}

    proc_table = proc_table.select(list(col_mapping.values()))
    proc_table = proc_table.rename_columns(list(col_mapping.keys()))

    proc_table = ensure_int64(proc_table, 'person_id')
    proc_table = ensure_int64(proc_table, 'procedure_concept_id')

    # Parse dates. Date/timestamp-typed columns are cast directly; their
    # string form can carry a time part that the '%Y-%m-%d' format rejects.
    raw_dates = proc_table['procedure_date']
    if pa.types.is_timestamp(raw_dates.type) or pa.types.is_date(raw_dates.type):
        # safe=False: dropping sub-second precision is acceptable here.
        date_parsed = pc.cast(raw_dates, pa.timestamp('s'), safe=False)
    else:
        date_parsed = pc.strptime(
            pc.cast(raw_dates, pa.string()), format='%Y-%m-%d', unit='s'
        )
    proc_table = proc_table.append_column('date_parsed', date_parsed)

    # Filter valid patients
    if valid_person_ids is None:
        valid_person_ids = loader.get_valid_person_ids()
    valid_mask = pc.is_in(
        proc_table['person_id'],
        value_set=pa.array(list(valid_person_ids), type=pa.int64())
    )
    proc_table = proc_table.filter(valid_mask)

    logger.info(f"Processing {proc_table.num_rows:,} procedure records")

    results = {
        'patients_by_procedure': {},
        'patients_by_procedure_first_date': {}
    }

    for category_name, concept_ids in procedure_dict.items():
        # list() lets the definitions use any iterable of IDs.
        concept_mask = pc.is_in(
            proc_table['procedure_concept_id'],
            value_set=pa.array(list(concept_ids), type=pa.int64())
        )
        category_table = proc_table.filter(concept_mask)

        if category_table.num_rows == 0:
            continue

        # Earliest parsed date per person within this category
        grouped = category_table.group_by('person_id').aggregate([
            ('date_parsed', 'min')
        ])

        person_ids = grouped['person_id'].to_pylist()
        dates = timestamp_to_date_string(grouped['date_parsed_min'])

        results['patients_by_procedure'][category_name] = sorted(person_ids)
        results['patients_by_procedure_first_date'][category_name] = dict(zip(person_ids, dates))

    save_dictionary(results['patients_by_procedure'], 'procedures.json', output_dir)
    save_dictionary(results['patients_by_procedure_first_date'], 'procedures_first_date.json', output_dir)

    logger.info(f"Created dictionaries for {len(results['patients_by_procedure'])} procedure categories")
    return results

# Run procedure dictionary creation
# Uses the loader, procedure definitions and output directory set up in the
# earlier cells; the returned dict is bound to `procedure_results`.
procedure_results = create_procedure_dictionaries(
    loader,
    all_procedures_dictionary,
    config.output_dir
)

## 8. Create Drug/Medication Dictionaries

Maps medications to patients with earliest prescription/exposure dates.

In [None]:
def create_drug_dictionaries(loader: OMOPDataLoader,
                            drug_dict: Dict,
                            output_dir: Path,
                            valid_person_ids: Optional[Set[int]] = None) -> Dict:
    """
    Create patient dictionaries for drugs/medications with earliest dates.

    Args:
        loader: OMOP data loader
        drug_dict: Dictionary mapping category names to concept IDs
        output_dir: Where to save output files
        valid_person_ids: Person IDs to keep. When omitted, the IDs are taken
            from ``loader.get_valid_person_ids()`` so the function does not
            depend on a module-level variable.

    Returns:
        ``{'patients_by_drug': {category: [person_id, ...]},
        'patients_by_drug_first_date': {category: {person_id: 'YYYY-MM-DD'}}}``.
        An empty dict is returned when the table or a required column is
        missing. Categories without matching records are omitted.
    """
    logger.info("Creating drug dictionaries...")

    drug_table = loader.load_table('drug_exposure')
    if drug_table is None:
        logger.error("drug_exposure table not found")
        return {}

    # Resolve the required columns case-insensitively; when several columns
    # differ only by case, the first one in table order wins.
    required = ('person_id', 'drug_concept_id', 'drug_exposure_start_date')
    actual_by_lower = {}
    for col in drug_table.column_names:
        actual_by_lower.setdefault(col.lower(), col)
    col_mapping = {name: actual_by_lower.get(name) for name in required}

    if any(v is None for v in col_mapping.values()):
        logger.error(f"Missing required columns. Found: {drug_table.column_names}")
        return {}

    drug_table = drug_table.select(list(col_mapping.values()))
    drug_table = drug_table.rename_columns(list(col_mapping.keys()))

    drug_table = ensure_int64(drug_table, 'person_id')
    drug_table = ensure_int64(drug_table, 'drug_concept_id')

    # Parse dates. Date/timestamp-typed columns are cast directly; their
    # string form can carry a time part that the '%Y-%m-%d' format rejects.
    raw_dates = drug_table['drug_exposure_start_date']
    if pa.types.is_timestamp(raw_dates.type) or pa.types.is_date(raw_dates.type):
        # safe=False: dropping sub-second precision is acceptable here.
        date_parsed = pc.cast(raw_dates, pa.timestamp('s'), safe=False)
    else:
        date_parsed = pc.strptime(
            pc.cast(raw_dates, pa.string()), format='%Y-%m-%d', unit='s'
        )
    drug_table = drug_table.append_column('date_parsed', date_parsed)

    # Filter valid patients
    if valid_person_ids is None:
        valid_person_ids = loader.get_valid_person_ids()
    valid_mask = pc.is_in(
        drug_table['person_id'],
        value_set=pa.array(list(valid_person_ids), type=pa.int64())
    )
    drug_table = drug_table.filter(valid_mask)

    logger.info(f"Processing {drug_table.num_rows:,} drug exposure records")

    results = {
        'patients_by_drug': {},
        'patients_by_drug_first_date': {}
    }

    for category_name, concept_ids in drug_dict.items():
        # list() lets the definitions use any iterable of IDs.
        concept_mask = pc.is_in(
            drug_table['drug_concept_id'],
            value_set=pa.array(list(concept_ids), type=pa.int64())
        )
        category_table = drug_table.filter(concept_mask)

        if category_table.num_rows == 0:
            continue

        # Earliest parsed date per person within this category
        grouped = category_table.group_by('person_id').aggregate([
            ('date_parsed', 'min')
        ])

        person_ids = grouped['person_id'].to_pylist()
        dates = timestamp_to_date_string(grouped['date_parsed_min'])

        results['patients_by_drug'][category_name] = sorted(person_ids)
        results['patients_by_drug_first_date'][category_name] = dict(zip(person_ids, dates))

    save_dictionary(results['patients_by_drug'], 'drugs.json', output_dir)
    save_dictionary(results['patients_by_drug_first_date'], 'drugs_first_date.json', output_dir)

    logger.info(f"Created dictionaries for {len(results['patients_by_drug'])} drug categories")
    return results

# Run drug dictionary creation
# Uses the loader, drug definitions and output directory set up in the
# earlier cells; the returned dict is bound to `drug_results`.
drug_results = create_drug_dictionaries(
    loader,
    all_drugs_dictionary,
    config.output_dir
)

## 9. Create Lab Measurement Dictionaries

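Maps lab measurements to patients by measurement concept, with the earliest measurement date. This simplified version does not evaluate measurement values (e.g., to flag abnormal values).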

In [None]:
def create_lab_dictionaries(loader: OMOPDataLoader,
                           lab_list: List,
                           output_dir: Path,
                           valid_person_ids: Optional[Set[int]] = None) -> Dict:
    """
    Create patient dictionaries for lab measurements.
    Note: This is a simplified version. Customize based on your lab definitions.

    Args:
        loader: OMOP data loader
        lab_list: Iterable of lab definitions. Only dict entries with a
            ``'concept_id'`` key are processed; an optional ``'name'`` key
            labels the category (default ``lab_<concept_id>``). Other entries
            are skipped and counted in a warning.
        output_dir: Where to save output files
        valid_person_ids: Person IDs to keep. When omitted, the IDs are taken
            from ``loader.get_valid_person_ids()`` so the function does not
            depend on a module-level variable.

    Returns:
        ``{'patients_by_lab': {name: [person_id, ...]},
        'patients_by_lab_first_date': {name: {person_id: 'YYYY-MM-DD'}}}``.
        An empty dict is returned when the table or a required column is
        missing. Labs without matching records are omitted.
    """
    logger.info("Creating lab measurement dictionaries...")

    meas_table = loader.load_table('measurement')
    if meas_table is None:
        logger.error("measurement table not found")
        return {}

    # Resolve the required columns case-insensitively; when several columns
    # differ only by case, the first one in table order wins.
    # value_as_number is selected for customisation; this simplified version
    # does not read it.
    required = ('person_id', 'measurement_concept_id', 'measurement_date', 'value_as_number')
    actual_by_lower = {}
    for col in meas_table.column_names:
        actual_by_lower.setdefault(col.lower(), col)
    col_mapping = {name: actual_by_lower.get(name) for name in required}

    if any(v is None for v in col_mapping.values()):
        logger.error(f"Missing required columns. Found: {meas_table.column_names}")
        return {}

    meas_table = meas_table.select(list(col_mapping.values()))
    meas_table = meas_table.rename_columns(list(col_mapping.keys()))

    meas_table = ensure_int64(meas_table, 'person_id')
    meas_table = ensure_int64(meas_table, 'measurement_concept_id')

    # Parse dates. Date/timestamp-typed columns are cast directly; their
    # string form can carry a time part that the '%Y-%m-%d' format rejects.
    raw_dates = meas_table['measurement_date']
    if pa.types.is_timestamp(raw_dates.type) or pa.types.is_date(raw_dates.type):
        # safe=False: dropping sub-second precision is acceptable here.
        date_parsed = pc.cast(raw_dates, pa.timestamp('s'), safe=False)
    else:
        date_parsed = pc.strptime(
            pc.cast(raw_dates, pa.string()), format='%Y-%m-%d', unit='s'
        )
    meas_table = meas_table.append_column('date_parsed', date_parsed)

    # Filter valid patients
    if valid_person_ids is None:
        valid_person_ids = loader.get_valid_person_ids()
    valid_mask = pc.is_in(
        meas_table['person_id'],
        value_set=pa.array(list(valid_person_ids), type=pa.int64())
    )
    meas_table = meas_table.filter(valid_mask)

    logger.info(f"Processing {meas_table.num_rows:,} measurement records")

    results = {
        'patients_by_lab': {},
        'patients_by_lab_first_date': {}
    }

    # Process lab_list (structure depends on your lab_conditions definition)
    # This is a placeholder - customize based on your needs
    skipped = 0
    for lab_item in lab_list:
        if not (isinstance(lab_item, dict) and 'concept_id' in lab_item):
            skipped += 1
            continue

        concept_id = lab_item['concept_id']
        name = lab_item.get('name', f'lab_{concept_id}')

        concept_mask = pc.equal(
            meas_table['measurement_concept_id'],
            pa.scalar(concept_id, type=pa.int64())
        )
        category_table = meas_table.filter(concept_mask)

        if category_table.num_rows == 0:
            continue

        # Earliest parsed date per person for this lab concept
        grouped = category_table.group_by('person_id').aggregate([
            ('date_parsed', 'min')
        ])

        person_ids = grouped['person_id'].to_pylist()
        dates = timestamp_to_date_string(grouped['date_parsed_min'])

        results['patients_by_lab'][name] = sorted(person_ids)
        results['patients_by_lab_first_date'][name] = dict(zip(person_ids, dates))

    if skipped:
        # Warn about unusable definitions so an empty output can be explained.
        logger.warning(f"Skipped {skipped} lab definitions without a 'concept_id' entry")

    save_dictionary(results['patients_by_lab'], 'labs.json', output_dir)
    save_dictionary(results['patients_by_lab_first_date'], 'labs_first_date.json', output_dir)

    logger.info(f"Created dictionaries for {len(results['patients_by_lab'])} lab categories")
    return results

# Run lab dictionary creation
# Uses the loader, lab definitions and output directory set up in the
# earlier cells; the returned dict is bound to `lab_results`.
lab_results = create_lab_dictionaries(
    loader,
    lab_conditions,
    config.output_dir
)

## 10. Summary and Output Files

Review the generated dictionary files.

In [None]:
# List all generated files. Sorting once makes both the listing and the
# choice of sample file independent of the order glob() returns them in.
output_files = sorted(config.output_dir.glob('*.json'))

print("\n" + "="*60)
print("DICTIONARY CREATION COMPLETE")
print("="*60)
print(f"\nOutput directory: {config.output_dir}")
print(f"\nGenerated {len(output_files)} dictionary files:\n")

for file in output_files:
    size_kb = file.stat().st_size / 1024
    print(f"  • {file.name:<40} ({size_kb:>8.1f} KB)")

print("\n" + "="*60)
print("\nDictionary Structure:")
print("  - conditions.json: {category: [patient_ids]}")
print("  - conditions_first_date.json: {category: {patient_id: date}}")
print("  - procedures.json: {category: [patient_ids]}")
print("  - procedures_first_date.json: {category: {patient_id: date}}")
print("  - drugs.json: {category: [patient_ids]}")
print("  - drugs_first_date.json: {category: {patient_id: date}}")
print("  - labs.json: {category: [patient_ids]}")
print("  - labs_first_date.json: {category: {patient_id: date}}")
print("="*60)

# Sample output: first file that is not a *_first_date dictionary. The guard
# avoids an IndexError when only date dictionaries are present.
sample_file = next((f for f in output_files if 'first_date' not in f.name), None)
if sample_file is not None:
    with open(sample_file, 'r', encoding='utf-8') as f:
        sample_data = json.load(f)

    print(f"\nSample from {sample_file.name}:")
    print(f"Categories: {list(sample_data.keys())[:5]}")

    if sample_data:
        first_category = list(sample_data.keys())[0]
        print(f"\nExample - '{first_category}':")
        print(f"  Patient count: {len(sample_data[first_category])}")
        print(f"  Sample IDs: {sample_data[first_category][:5]}")

## Notes

### Usage
1. Place your OMOP data files in the specified directory
2. Ensure `dictionary_definitions.py` is in the same directory
3. Run all cells sequentially
4. Dictionary JSON files will be created in `./output/dictionaries/`

### Dictionary Structure
Each dictionary maps medical categories to patients:
- **Main dictionaries**: `{category: [patient_id_list]}`
- **Date dictionaries**: `{category: {patient_id: first_occurrence_date}}`

### Customization
- Modify `Config` class to change input/output paths
- Update `load_omop_table()` for other data sources (e.g., a database connection); Parquet and CSV files are supported out of the box
- Adjust column name mappings if your OMOP implementation uses different names
- Add additional filtering logic as needed (e.g., date ranges, specific cohorts)

### Performance
- Uses PyArrow for efficient processing of large datasets
- Processes millions of records in minutes
- Each table is read fully into memory (no streaming), so the largest table must fit in available RAM

### Output Files
Generated dictionaries can be used for:
- Cohort identification
- Feature engineering for ML models
- Multimorbidity analysis
- Healthcare utilization studies
- Longitudinal outcome tracking