This notebook contains exploratory ICD mapping logic. Production version is in src/icd_enrich.py.

In [None]:
import pandas as pd
import numpy as np

# 1. LOAD THE DATA
# The dataset uses '?' for missing values. We tell pandas to treat them as NaN.
# na_values adds '?' on top of pandas' built-in NA markers, which stay active
# because keep_default_na is left at its default.
# NOTE(review): recent pandas versions include the literal string 'None' among
# the built-in NA markers, so 'None' entries (e.g. in 'A1Cresult') may load as
# NaN rather than as a category -- confirm this is intended.
print("Loading the US Diabetes Dataset...")
df = pd.read_csv('data/raw/diabetic_data.csv', na_values='?')

# Report (rows, columns) as loaded, before any derived columns are added.
print(f"Initial Shape: {df.shape}")

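# 2. THE CLINICAL MAPPING FUNCTION (The "Canadian Bridge")
# Drawing on ICD-10-CA coding knowledge, this function simulates the grouping
# logic used in Canadian hospital reporting (CIHI DAD abstracting). It assigns
# each ICD-9 code to a chapter-level group label; it is not a code-to-code crosswalk.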

# Numeric ICD-9 category ranges and the chapter label each one maps to.
# Bounds are half-open, [low, high), so that decimal sub-codes at the top of a
# range (e.g. 459.9, 785.5, 999.9) stay with their three-digit category
# instead of falling through to "Other".
# NOTE(review): "E08-E13" appears to be the ICD-10-CM diabetes range, whereas
# ICD-10-CA / WHO ICD-10 use E10-E14. The label text is kept unchanged here
# because downstream code may match on it -- confirm before renaming.
_ICD9_NUMERIC_GROUPS = (
    (140, 240, "C00-D48 (Neoplasms)"),
    (249, 251, "E08-E13 (Diabetes mellitus)"),
    (390, 460, "I00-I99 (Diseases of the circulatory system)"),
    (460, 520, "J00-J99 (Diseases of the respiratory system)"),
    (520, 580, "K00-K93 (Diseases of the digestive system)"),
    (710, 740, "M00-M99 (Diseases of the musculoskeletal system)"),
    # Symptom categories 785-787 are folded into the matching body system.
    (785, 786, "I00-I99 (Diseases of the circulatory system)"),
    (786, 787, "J00-J99 (Diseases of the respiratory system)"),
    (787, 788, "K00-K93 (Diseases of the digestive system)"),
    (800, 1000, "S00-T98 (Injury, poisoning and consequences of external causes)"),
)


def map_icd9_to_icd10ca_chapter(icd9_code):
    """Map one ICD-9 diagnosis code to a chapter-level group label.

    Args:
        icd9_code: A single ICD-9 code as a string ('250.01', 'V57', 'E909')
            or a number (250, 428.0). Missing values (None, NaN, pd.NA) are
            accepted.

    Returns:
        A label string:
        - "Unknown" for missing values and for text that is neither a
          V-/E-code nor parseable as a number.
        - The Z00-Z99 label for V-codes and the V01-Y98 label for E-codes
          (prefix match, case-insensitive, surrounding whitespace ignored).
        - The label of the numeric range containing the code, or
          "Other (Genitourinary, Mental, Skin, etc.)" when no range matches.
    """
    if pd.isna(icd9_code):
        return "Unknown"

    # str() lets numeric input (250) and text ('250.01') share one path;
    # strip/upper makes ' v57 ' and 'e909' match the prefix checks below.
    code_str = str(icd9_code).strip().upper()

    # ICD-9 'V' codes (factors influencing health status) -> ICD-10 'Z' codes.
    if code_str.startswith('V'):
        return "Z00-Z99 (Factors influencing health status)"

    # ICD-9 'E' codes (external causes of injury) -> ICD-10 V01-Y98.
    if code_str.startswith('E'):
        return "V01-Y98 (External causes of morbidity)"

    # Only the conversion can raise; keep the try body to that single line.
    try:
        code_num = float(code_str)
    except ValueError:
        return "Unknown"

    for low, high, label in _ICD9_NUMERIC_GROUPS:
        if low <= code_num < high:
            return label

    return "Other (Genitourinary, Mental, Skin, etc.)"

# 3. APPLY THE TRANSFORMATION
print("Applying Clinical Mapping (ICD-9 -> ICD-10-CA)...")

# We apply this to the primary diagnosis (diag_1), one value at a time, and
# store the resulting group label in a new 'Primary_Diagnosis_Group' column.
df['Primary_Diagnosis_Group'] = df['diag_1'].apply(map_icd9_to_icd10ca_chapter)

# 4. BIOCHEMIST'S INSIGHT: HbA1c Logic
# The 'A1Cresult' column has values: None, Norm, >7, >8
# We turn this into a binary "High Risk" flag: 1 for '>8', 0 for every other value.
def check_high_risk_a1c(result):
    """Return 1 if the A1C result equals '>8', otherwise 0.

    Any other value -- including '>7', 'Norm', 'None' and missing values --
    yields 0.
    """
    return 1 if result == '>8' else 0

# Add the 0/1 high-risk flag as a new column, computed per row from 'A1Cresult'.
df['High_Risk_A1C'] = df['A1Cresult'].apply(check_high_risk_a1c)

# 5. VERIFY THE RESULTS
# Print the row count for each diagnosis group as a quick sanity check.
print("-" * 50)
print("MAPPING REPORT:")
print(df['Primary_Diagnosis_Group'].value_counts())
print("-" * 50)

# 6. SAVE LOCALLY (Ready for Azure Upload)
# The path is relative, so the file lands in the current working directory;
# index=False leaves the DataFrame's row index out of the CSV.
df.to_csv("diabetes_clinical_enriched.csv", index=False)
print("File saved as 'diabetes_clinical_enriched.csv'. Ready for Azure.")