In [2]:
# Task 1–3: Preprocess MIMIC-III Notes and Attach Disease Labels

import pandas as pd
import re

# Step 1: Read the note table (only the columns used below) and keep the
# rows whose CATEGORY is exactly "Discharge summary".
noteevents = pd.read_csv(
    "NOTEEVENTS.csv.gz",
    compression="gzip",
    usecols=["SUBJECT_ID", "HADM_ID", "CHARTDATE", "CATEGORY", "TEXT"],
    low_memory=False,
)

# .copy() detaches the subset from `noteevents`, so the columns added in
# later steps do not trigger chained-assignment warnings.
is_discharge_summary = noteevents["CATEGORY"].eq("Discharge summary")
discharge_notes = noteevents.loc[is_discharge_summary].copy()
del is_discharge_summary  # temporary mask; keep the namespace unchanged

# Step 2: Extract relevant clinical sections
RELEVANT_HEADERS = {
    'HISTORY OF PRESENT ILLNESS', 'IMPRESSION', 'FINDINGS',
    'HOSPITAL COURSE', 'PHYSICAL EXAMINATION', 'CHIEF COMPLAINT'
}

# A section header is a whole (stripped) line made only of capital letters,
# spaces and hyphens, terminated by a colon -- e.g. "HOSPITAL COURSE:".
_SECTION_HEADER_RE = re.compile(r'^[A-Z][A-Z \-]*:$')


def extract_sections(text):
    """Return the text of the clinically relevant sections of a note.

    The note is scanned line by line. A line that is entirely an upper-case
    header ending in ":" opens a new section; every following line belongs
    to that section until the next header. Sections whose header contains
    one of RELEVANT_HEADERS as a substring (so "BRIEF HOSPITAL COURSE"
    counts) are kept; everything else is dropped.

    Limitations visible in the matching rule: headers in mixed case, and
    headers followed by text on the same line, are not recognised and are
    treated as ordinary content of the current section. Lines before the
    first header are discarded.

    Args:
        text: Full note text as a string.

    Returns:
        The stripped lines of all kept sections joined by single spaces,
        in order of each section's first appearance. Empty string when no
        relevant section is found.
    """
    sections = {}
    current_section = None

    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if _SECTION_HEADER_RE.match(line):
            current_section = line[:-1].strip()
            # setdefault (not assignment): when a header occurs more than
            # once in a note, later content is appended instead of wiping
            # what was already collected under that header.
            sections.setdefault(current_section, [])
        elif current_section:
            sections[current_section].append(line)

    kept = []
    for header, body in sections.items():
        if any(relevant in header for relevant in RELEVANT_HEADERS):
            kept.extend(body)
    return ' '.join(kept).strip()

# Reduce every note to its relevant sections. A missing TEXT value is
# replaced by "" first so extract_sections always receives a string.
discharge_notes["TEXT_CLEAN"] = discharge_notes["TEXT"].fillna("").map(extract_sections)

# Step 3: Symptom extraction from cleaned text
SYMPTOM_LIST = ['fever', 'cough', 'fatigue', 'headache', 'nausea', 'vomiting',
                'dizziness', 'chest pain', 'shortness of breath']


def extract_symptoms(text):
    """Return the entries of SYMPTOM_LIST that occur in *text*.

    Matching is a case-insensitive substring search, so inflected forms
    such as "coughing" match "cough". Negated mentions ("denies fever")
    also match, because no context is inspected.

    Args:
        text: Note text as a string.

    Returns:
        A list of matched symptoms in SYMPTOM_LIST order (empty when none
        occur).
    """
    # Collapse every run of whitespace (double spaces, tabs, newlines) to a
    # single space; otherwise multi-word symptoms like "chest pain" are
    # missed whenever the words are not separated by exactly one space.
    normalized = ' '.join(text.lower().split())
    return [symptom for symptom in SYMPTOM_LIST if symptom in normalized]

# Tag each note with the symptoms found in its cleaned text; every cell of
# the new column holds a (possibly empty) Python list.
discharge_notes["SYMPTOMS"] = discharge_notes["TEXT_CLEAN"].map(extract_symptoms)

# Step 4: Merge with diagnoses from DIAGNOSES_ICD.csv.gz
# ICD-9 codes are identifiers, not numbers. Reading them explicitly as
# strings guarantees that codes with leading zeros survive and that the
# column never depends on dtype inference (which would turn an all-numeric
# subset of the file into integers).
diagnoses = pd.read_csv(
    "DIAGNOSES_ICD.csv.gz",
    usecols=["HADM_ID", "ICD9_CODE"],
    dtype={"ICD9_CODE": str},
    compression="gzip",
    low_memory=False,
)

# Inner join on HADM_ID: the result has one row per (note, diagnosis) pair,
# and notes whose HADM_ID has no diagnosis row are dropped.
labeled = discharge_notes.merge(diagnoses, on="HADM_ID", how="inner")

# Step 5 (Optional): Keep only top 50 diagnoses
# Frequencies are counted over the merged rows; value_counts() ignores
# missing codes, so rows with a missing ICD9_CODE are filtered out here.
top_diagnoses = labeled["ICD9_CODE"].value_counts().nlargest(50).index
labeled_filtered = labeled[labeled["ICD9_CODE"].isin(top_diagnoses)].copy()

# Save processed data for modeling
# NOTE: the SYMPTOMS column holds Python lists, which to_csv writes as their
# string representation; a reader has to parse them back into lists.
labeled_filtered.to_csv("labeled_notes_with_symptoms.csv", index=False)
print("✅ Labeled data saved to 'labeled_notes_with_symptoms.csv'")

✅ Labeled data saved to 'labeled_notes_with_symptoms.csv'
