In [None]:
#| default_exp data_reader

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import pandas as pd
import json
from typing import Iterator, Dict, Any, List

In [None]:
#| export
def read_csv_batches(filepath: str, chunksize: int = 10000) -> Iterator[pd.DataFrame]:
    """Read a large CSV file lazily, one batch of rows at a time.

    Args:
        filepath: Path of the CSV file, passed straight to `pd.read_csv`.
        chunksize: Maximum number of rows in each yielded DataFrame.

    Yields:
        Consecutive DataFrames holding up to `chunksize` rows each.
    """
    # The chunked reader keeps the file open between batches; the `with`
    # block closes it even when the consumer stops iterating early or an
    # exception propagates through the generator.
    with pd.read_csv(filepath, chunksize=chunksize) as reader:
        yield from reader

In [None]:
#| export
def strip_whitespace(value: Any) -> Any:
    """Return `value` without leading/trailing whitespace when it is a string.

    Any non-string value is handed back unchanged.
    """
    return value.strip() if isinstance(value, str) else value

In [None]:
#| export
def parse_prescription_row(row: pd.Series) -> List[Dict[str, Any]]:
    """
    Parse a single CSV row containing JSON prescription data.

    The first cell of `row` is decoded with `json.loads` when it is a string;
    any other value is used as-is and must provide a dict-style `.get`.

    Returns a list of flattened records (one per prescription item, ready for
    API calls). Each record repeats the prescription-level fields next to the
    item-level fields. The list is empty when `prescription_items` is
    missing, null or empty.
    """
    # Parse the JSON if it's a string
    cell = row.iloc[0]
    data = json.loads(cell) if isinstance(cell, str) else cell

    # Prescription-level fields, shared by every item record
    header = {
        'prescription_id': strip_whitespace(data.get('id', '')),
        'pharmacy_name': strip_whitespace(data.get('pharmacy_name', '')),
        'pharmacy_code': strip_whitespace(data.get('pharmacy_code', '')),
        'prescription_date': strip_whitespace(data.get('prescription_date', '')),
        'patient_age': data.get('patient_age'),
        'patient_gender': strip_whitespace(data.get('patient_gender', '')),
    }

    # `.get(key, [])` only falls back when the key is absent; a JSON
    # `"prescription_items": null` decodes to None, so guard with `or []`.
    items = data.get('prescription_items') or []

    # One flattened record per item; header keys come first, as before.
    return [
        {
            **header,
            'seq': item.get('seq'),
            'code': strip_whitespace(item.get('code', '')),
            'drug': strip_whitespace(item.get('drug', '')),
            'form': strip_whitespace(item.get('uom_text', '')),
            'route': strip_whitespace(item.get('route', '')),
            'original_direction': strip_whitespace(item.get('original_direction', '')),
            'additional_instructions': strip_whitespace(item.get('additional_instructions', '')),
            'target_direction_manual': strip_whitespace(item.get('dispensed_dosage', '')),
        }
        for item in items
    ]


In [None]:
#| export
def process_csv_batch(chunk: pd.DataFrame) -> List[Dict[str, Any]]:
    """Flatten every row of a CSV batch into prescription item records.

    Each row is handed to `parse_prescription_row`; the per-row lists are
    concatenated in row order into a single list.
    """
    return [
        record
        for _, row in chunk.iterrows()
        for record in parse_prescription_row(row)
    ]


In [None]:
#| export
if __name__ == "__main__":
    import sys

    # Optional positional arguments: <csv path> <rows per batch>
    cli_args = sys.argv[1:]
    filepath = cli_args[0] if cli_args else 'dataset/prescriptions.csv'
    chunksize = int(cli_args[1]) if len(cli_args) > 1 else 10000

    # Stream the file batch by batch and report how many item records
    # each batch produced.
    for chunk in read_csv_batches(filepath, chunksize):
        records = process_csv_batch(chunk)
        print(f"Processed {len(records)} records")
        # TODO: Send to API and save to database
