## 1. Import Libraries and Data

In [1]:
import os
import re
import csv
import sys
import json
import ast
import pickle
import argparse
import itertools
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime

In [2]:
diag_ccs_single_file_path = './data/tools/ccs_single_dx_tool_2015.csv'
diag_ccs_multi_file_path = './data/tools/ccs_multi_dx_tool_2015.csv'
icd10toicd9_cm_file_path = './data/tools/icd10cmtoicd9gem.csv'

pat_file_path = './data/mimic-iv/patients.csv.gz'
adm_file_path = './data/mimic-iv/admissions.csv.gz'
diag_file_path = './data/mimic-iv/diagnoses_icd.csv.gz'
discharge_note_file_path = './data/mimic-iv/discharge.csv.gz'

out_dir = './processed/mimic_iv/diagnosis_final_new/'

## 2. Demographics

In [3]:
df_pat = pd.read_csv(pat_file_path, 
                     infer_datetime_format = True, 
                     engine = "c")

In [4]:
df_pat

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10000032,F,52,2180,2014 - 2016,2180-09-09
1,10000048,F,23,2126,2008 - 2010,
2,10000068,F,19,2160,2008 - 2010,
3,10000084,M,72,2160,2017 - 2019,2161-02-13
4,10000102,F,27,2136,2008 - 2010,
...,...,...,...,...,...,...
299707,19999828,F,46,2147,2017 - 2019,
299708,19999829,F,28,2186,2008 - 2010,
299709,19999840,M,58,2164,2008 - 2010,2164-09-17
299710,19999914,F,49,2158,2017 - 2019,


In [5]:
df_adm = pd.read_csv(adm_file_path, 
                     parse_dates = ['admittime', 'dischtime', 'deathtime'], 
                     infer_datetime_format = True, 
                     engine = "c")

In [6]:
df_adm

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag
0,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,NaT,URGENT,P874LG,TRANSFER FROM HOSPITAL,HOME,Other,ENGLISH,WIDOWED,WHITE,2180-05-06 19:17:00,2180-05-06 23:30:00,0
1,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,NaT,EW EMER.,P09Q6Y,EMERGENCY ROOM,HOME,Medicaid,ENGLISH,WIDOWED,WHITE,2180-06-26 15:54:00,2180-06-26 21:31:00,0
2,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,NaT,EW EMER.,P60CC5,EMERGENCY ROOM,HOSPICE,Medicaid,ENGLISH,WIDOWED,WHITE,2180-08-05 20:58:00,2180-08-06 01:44:00,0
3,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,NaT,EW EMER.,P30KEH,EMERGENCY ROOM,HOME,Medicaid,ENGLISH,WIDOWED,WHITE,2180-07-23 05:54:00,2180-07-23 14:00:00,0
4,10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00,NaT,EU OBSERVATION,P51VDL,EMERGENCY ROOM,,Other,ENGLISH,SINGLE,WHITE,2160-03-03 21:55:00,2160-03-04 06:26:00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431226,19999828,25744818,2149-01-08 16:44:00,2149-01-18 17:00:00,NaT,EW EMER.,P75BG6,TRANSFER FROM HOSPITAL,HOME HEALTH CARE,Other,ENGLISH,SINGLE,WHITE,2149-01-08 09:11:00,2149-01-08 18:12:00,0
431227,19999828,29734428,2147-07-18 16:23:00,2147-08-04 18:10:00,NaT,EW EMER.,P16C7J,PHYSICIAN REFERRAL,HOME HEALTH CARE,Other,ENGLISH,SINGLE,WHITE,2147-07-17 17:18:00,2147-07-18 17:34:00,0
431228,19999840,21033226,2164-09-10 13:47:00,2164-09-17 13:42:00,2164-09-17 13:42:00,EW EMER.,P58A9J,EMERGENCY ROOM,DIED,Other,ENGLISH,WIDOWED,WHITE,2164-09-10 11:09:00,2164-09-10 14:46:00,1
431229,19999840,26071774,2164-07-25 00:27:00,2164-07-28 12:15:00,NaT,EW EMER.,P506DE,EMERGENCY ROOM,HOME,Other,ENGLISH,WIDOWED,WHITE,2164-07-24 21:16:00,2164-07-25 01:20:00,0


In [7]:
df_pat_adm = pd.merge(df_pat, df_adm, on = ['subject_id'], how = 'left')
df_pat_adm = df_pat_adm.sort_values(by=['subject_id', 'admittime']).reset_index(drop=True)

In [8]:
bins = [17, 40, 60, 95]
labels = ['18–40', '41–60', '61+']

df_pat_adm['age_group'] = pd.cut(df_pat_adm['anchor_age'], bins=bins, labels=labels)

In [9]:
df_pat_adm['admission_type'] = df_pat_adm['admission_type'].fillna('UNKNOWN')
df_pat_adm['admission_location'] = df_pat_adm['admission_location'].fillna('INFORMATION NOT AVAILABLE')
df_pat_adm['discharge_location'] = df_pat_adm['discharge_location'].fillna('UNKNOWN')
df_pat_adm['insurance'] = df_pat_adm['insurance'].fillna('Other')
df_pat_adm['language'] = df_pat_adm['language'].fillna('Other')
df_pat_adm['marital_status'] = df_pat_adm['marital_status'].fillna('UNKNOWN')

df_pat_adm['insurance'] = df_pat_adm['insurance'].replace({'No charge': 'Other'})

df_pat_adm['race'] = df_pat_adm['race'].map({

    'ASIAN': 'ASIAN',
    'ASIAN - ASIAN INDIAN': 'ASIAN',
    'ASIAN - CHINESE': 'ASIAN',
    'ASIAN - KOREAN': 'ASIAN',
    'ASIAN - SOUTH EAST ASIAN': 'ASIAN',

    'WHITE': 'WHITE',
    'WHITE - BRAZILIAN': 'WHITE',
    'WHITE - EASTERN EUROPEAN': 'WHITE',
    'WHITE - OTHER EUROPEAN': 'WHITE',
    'WHITE - RUSSIAN': 'WHITE',
    'PORTUGUESE': 'WHITE',

    'BLACK/AFRICAN': 'BLACK',
    'BLACK/AFRICAN AMERICAN': 'BLACK',
    'BLACK/CAPE VERDEAN': 'BLACK',
    'BLACK/CARIBBEAN ISLAND': 'BLACK',

    'HISPANIC OR LATINO': 'HISPANIC_LATINO',
    'HISPANIC/LATINO - CENTRAL AMERICAN': 'HISPANIC_LATINO',
    'HISPANIC/LATINO - COLOMBIAN': 'HISPANIC_LATINO',
    'HISPANIC/LATINO - CUBAN': 'HISPANIC_LATINO',
    'HISPANIC/LATINO - DOMINICAN': 'HISPANIC_LATINO',
    'HISPANIC/LATINO - GUATEMALAN': 'HISPANIC_LATINO',
    'HISPANIC/LATINO - HONDURAN': 'HISPANIC_LATINO',
    'HISPANIC/LATINO - MEXICAN': 'HISPANIC_LATINO',
    'HISPANIC/LATINO - PUERTO RICAN': 'HISPANIC_LATINO',
    'HISPANIC/LATINO - SALVADORAN': 'HISPANIC_LATINO',

    'MIDDLE EASTERN': 'OTHER',
    'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER': 'OTHER',
    'SOUTH AMERICAN': 'OTHER',
    'AMERICAN INDIAN/ALASKA NATIVE': 'OTHER',
    'MULTI RACE ETHNICITY': 'OTHER',
    'OTHER': 'OTHER',

    'PATIENT DECLINED TO ANSWER': 'UNKNOWN',
    'UNABLE TO OBTAIN': 'UNKNOWN',
    'UNKNOWN': 'UNKNOWN'
})

df_pat_adm['race'] = df_pat_adm['race'].fillna('UNKNOWN')

In [10]:
df_pat_adm = df_pat_adm[df_pat_adm['admission_type'].isin(['URGENT', 'EW EMER.', 'DIRECT EMER.'])]

In [11]:
df_pat_adm['next_admittime'] = df_pat_adm.groupby('subject_id').admittime.shift(-1)
df_pat_adm['next_admission_type'] = df_pat_adm.groupby('subject_id').admission_type.shift(-1)

In [12]:
df_pat_adm[['next_admittime', 'next_admission_type']] = df_pat_adm.groupby(['subject_id'])[['next_admittime', 'next_admission_type']].fillna(method = 'bfill')

In [13]:
df_pat_adm['days_next_admit'] = (df_pat_adm.next_admittime - df_pat_adm.dischtime).dt.total_seconds()/(24*60*60)

In [14]:
df_pat_adm['length_of_stay'] = (df_pat_adm['dischtime'] - df_pat_adm['admittime']).apply(lambda x: x.total_seconds() / (24*60*60))

In [15]:
df_pat_adm = df_pat_adm[(df_pat_adm['length_of_stay'] >= 0)]

In [16]:
df_pat_adm['post_disch_time'] = (df_pat_adm['deathtime'] - df_pat_adm['dischtime']).apply(lambda x: x.total_seconds() / (24*60*60))

In [None]:
condition = df_pat_adm['post_disch_time'].isna()
df_pat_adm = df_pat_adm[condition]

In [18]:
df_pat_adm = df_pat_adm[df_pat_adm['anchor_age'] >= 18]
df_pat_adm = df_pat_adm.reset_index(drop=True)

In [19]:
df_pat_adm['temp_visit_interval'] = (df_pat_adm['next_admittime'] - df_pat_adm['dischtime']).dt.total_seconds() / (24*60*60)

In [20]:
df_pat_adm['visit_interval'] = df_pat_adm.groupby('subject_id')['temp_visit_interval'].shift(1)

In [21]:
df_pat_adm['visit_interval'] = df_pat_adm['visit_interval'].fillna(0)

In [22]:
df_pat_adm.drop(columns=['temp_visit_interval'], inplace=True)

In [23]:
subject_ids_neg_los = df_pat_adm[df_pat_adm.length_of_stay < 0].subject_id.tolist()
subject_ids_neg_vi = df_pat_adm[df_pat_adm.visit_interval < 0].subject_id.tolist()

In [24]:
subject_ids_neg_los, subject_ids_neg_vi

([],
 [11339384,
  11890885,
  13723491,
  14080908,
  14233347,
  14319319,
  15957311,
  17006375,
  18507371,
  19270999])

In [25]:
df_pat_adm = df_pat_adm[~df_pat_adm['subject_id'].isin(subject_ids_neg_los)]
df_pat_adm = df_pat_adm[~df_pat_adm['subject_id'].isin(subject_ids_neg_vi)]

df_pat_adm = df_pat_adm.reset_index(drop=True)

In [26]:
df_pat_adm = df_pat_adm.dropna(subset=['hadm_id'])

In [27]:
df_pat_adm

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod,hadm_id,admittime,dischtime,deathtime,...,edregtime,edouttime,hospital_expire_flag,age_group,next_admittime,next_admission_type,days_next_admit,length_of_stay,post_disch_time,visit_interval
0,10000032,F,52,2180,2014 - 2016,2180-09-09,22595853.0,2180-05-06 22:23:00,2180-05-07 17:15:00,NaT,...,2180-05-06 19:17:00,2180-05-06 23:30:00,0.0,41–60,2180-06-26 18:27:00,EW EMER.,50.050000,0.786111,,0.000000
1,10000032,F,52,2180,2014 - 2016,2180-09-09,22841357.0,2180-06-26 18:27:00,2180-06-27 18:49:00,NaT,...,2180-06-26 15:54:00,2180-06-26 21:31:00,0.0,41–60,2180-07-23 12:35:00,EW EMER.,25.740278,1.015278,,50.050000
2,10000032,F,52,2180,2014 - 2016,2180-09-09,29079034.0,2180-07-23 12:35:00,2180-07-25 17:55:00,NaT,...,2180-07-23 05:54:00,2180-07-23 14:00:00,0.0,41–60,2180-08-05 23:44:00,EW EMER.,11.242361,2.222222,,25.740278
3,10000032,F,52,2180,2014 - 2016,2180-09-09,25742920.0,2180-08-05 23:44:00,2180-08-07 17:50:00,NaT,...,2180-08-05 20:58:00,2180-08-06 01:44:00,0.0,41–60,NaT,,,1.754167,,11.242361
4,10000084,M,72,2160,2017 - 2019,2161-02-13,23052089.0,2160-11-21 01:56:00,2160-11-25 14:52:00,NaT,...,2160-11-20 20:36:00,2160-11-21 03:20:00,0.0,61+,NaT,,,4.538889,,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206350,19999784,M,57,2119,2017 - 2019,,25180002.0,2119-12-30 09:54:00,2120-01-04 16:20:00,NaT,...,,,0.0,41–60,NaT,,,5.268056,,0.000000
206351,19999828,F,46,2147,2017 - 2019,,29734428.0,2147-07-18 16:23:00,2147-08-04 18:10:00,NaT,...,2147-07-17 17:18:00,2147-07-18 17:34:00,0.0,41–60,2149-01-08 16:44:00,EW EMER.,522.940278,17.074306,,0.000000
206352,19999828,F,46,2147,2017 - 2019,,25744818.0,2149-01-08 16:44:00,2149-01-18 17:00:00,NaT,...,2149-01-08 09:11:00,2149-01-08 18:12:00,0.0,41–60,NaT,,,10.011111,,522.940278
206353,19999840,M,58,2164,2008 - 2010,2164-09-17,26071774.0,2164-07-25 00:27:00,2164-07-28 12:15:00,NaT,...,2164-07-24 21:16:00,2164-07-25 01:20:00,0.0,41–60,2164-09-10 13:47:00,EW EMER.,44.063889,3.491667,,0.000000


## 3. Clinical Notes

In [28]:
discharge_note = pd.read_csv(discharge_note_file_path, 
                             compression='gzip', 
                             header=0, 
                             sep=',', 
                             low_memory = False, 
                             engine = "c")

In [29]:
discharge_note

Unnamed: 0,note_id,subject_id,hadm_id,note_type,note_seq,charttime,storetime,text
0,10000032-DS-21,10000032,22595853,DS,21,2180-05-07 00:00:00,2180-05-09 15:26:00,\nName: ___ Unit No: _...
1,10000032-DS-22,10000032,22841357,DS,22,2180-06-27 00:00:00,2180-07-01 10:15:00,\nName: ___ Unit No: _...
2,10000032-DS-23,10000032,29079034,DS,23,2180-07-25 00:00:00,2180-07-25 21:42:00,\nName: ___ Unit No: _...
3,10000032-DS-24,10000032,25742920,DS,24,2180-08-07 00:00:00,2180-08-10 05:43:00,\nName: ___ Unit No: _...
4,10000084-DS-17,10000084,23052089,DS,17,2160-11-25 00:00:00,2160-11-25 15:09:00,\nName: ___ Unit No: __...
...,...,...,...,...,...,...,...,...
331789,19999828-DS-6,19999828,29734428,DS,6,2147-08-04 00:00:00,2147-08-12 15:36:00,\nName: ___ Unit No: ___...
331790,19999828-DS-7,19999828,25744818,DS,7,2149-01-18 00:00:00,2149-01-19 07:03:00,\nName: ___ Unit No: ___...
331791,19999840-DS-20,19999840,26071774,DS,20,2164-07-28 00:00:00,2164-07-29 14:52:00,\nName: ___ Unit No: ___\...
331792,19999840-DS-21,19999840,21033226,DS,21,2164-09-17 00:00:00,2164-09-18 01:36:00,\nName: ___ Unit No: ___\...


In [30]:
discharge_note = discharge_note.dropna(subset=['hadm_id'])
discharge_note = discharge_note.sort_values(by=['subject_id', 'note_seq', 'charttime']).reset_index(drop=True)
discharge_note = discharge_note.groupby(['subject_id', 'hadm_id', 'note_type']).agg({'text': ' '.join}).reset_index()

In [31]:
discharge_note = discharge_note[[
    'subject_id',
    'hadm_id',
    'text'
]]

In [32]:
def preprocess_text(text):
    # Replace sequences of underscores
    text = re.sub(r'___+', '', text)
    # Remove non-alphanumeric characters except punctuation (keep .,!?;)
    text = re.sub(r'[^\w\s.,!?;]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove gender information: 'sex f' or 'sex m'
    text = re.sub(r'\bsex\s+[mf]\b', '', text, flags=re.IGNORECASE)
    # Remove extra whitespace after deletion
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove de-identification placeholders
    text = text.replace('name unit no admission date discharge date date of birth ', '')
    return text

In [33]:
discharge_note["text"] = discharge_note["text"].apply(preprocess_text)

processed_discharge_note = discharge_note

In [34]:
def remove_hadm_ids_lacking_note_info(df_pat_adm, df_notes):
    df_pat_adm_notes = pd.merge(df_pat_adm, df_notes, on = ['subject_id', 'hadm_id'], how = 'left')
    
    missing_ratio = df_pat_adm_notes.text.isnull().sum() / len(df_pat_adm_notes)
    print("there are {:.4f}% of rows missing note information.".format(missing_ratio))
    
    hadm_ids_to_remove = df_pat_adm_notes[df_pat_adm_notes['text'].isna()].hadm_id.tolist()    
    df_pat_adm_notes_clean = df_pat_adm_notes[~df_pat_adm_notes['hadm_id'].isin(hadm_ids_to_remove)]
    
    print("there are {} subject ids.".format(len(df_pat_adm_notes_clean.hadm_id.unique().tolist())))
    
    df_pat_adm_notes_clean = df_pat_adm_notes_clean.reset_index(drop=True)
    
    return df_pat_adm_notes_clean

In [35]:
df_pat_adm_dis_notes = remove_hadm_ids_lacking_note_info(df_pat_adm, processed_discharge_note)

there are 0.1080% of rows missing note information.
there are 184069 subject ids.


## 4. Diagnosis Codes

In [36]:
def icd_diag_ccs_processing(
    adm_file_path, 
    diag_file_path, 
    icd10toicd9_cm_file_path,
    new_admIdList_threshold=2
):
    
    adms_df = pd.read_csv(adm_file_path, compression='gzip', header=0, sep=',')
    diag_df = pd.read_csv(diag_file_path, compression='gzip', header=0, sep=',')

    icd10cmtoicd9gem_cm_df = pd.read_csv(icd10toicd9_cm_file_path, header=0, sep=',', quotechar='"')

    icd10cmtoicd9_cm_dx = {}
    for index, row in icd10cmtoicd9gem_cm_df.iterrows():
        icd10cmtoicd9_cm_dx[row.icd10cm] = row.icd9cm
    
    print('Building pid-admission mapping, admission-date mapping')
    pidAdmMap = {}
    admDateMap = {}
    disDateMap = {}

    for index, row in tqdm(adms_df.iterrows()):
        pid = int(row.subject_id)
        admId = int(row.hadm_id)
        admTime = datetime.strptime(row.admittime, '%Y-%m-%d %H:%M:%S')
        disTime = datetime.strptime(row.dischtime, '%Y-%m-%d %H:%M:%S')
        admDateMap[admId] = admTime
        disDateMap[admId] = disTime

        if pid in pidAdmMap:
            pidAdmMap[pid].append(admId)
        else:
            pidAdmMap[pid] = [admId]
    
    print('Building admission-dxList mapping')
    admDiagMap = {}                 # admission id->diag ICD

    for index, row in tqdm(diag_df.iterrows()):
        admId = int(row.hadm_id)
        dx = row.icd_code.strip()
        if len(dx) == 0:
            continue

        # convert ICD10CM to ICD9CM
        if row.icd_version == 10:
            if dx in icd10cmtoicd9_cm_dx:
                dx = icd10cmtoicd9_cm_dx[dx]
                if dx == 'NoDx':
                    continue
            else:
                continue

        # dxStr = 'D_' + dx
        dxStr = dx

        if admId in admDiagMap:
            admDiagMap[admId].append(dxStr)
        else:
            admDiagMap[admId] = [dxStr]
            
    print('Building pid-sortedVisits mapping')
    pidDiagMap = {}

    pid_list = list()
    hadm_id_list = list()

    for pid, admIdList in tqdm(pidAdmMap.items()):
        new_admIdList = list()
        for admId in admIdList:
            if admId in admDiagMap:
                new_admIdList.append(admId)
        if len(new_admIdList) < new_admIdList_threshold:
            continue

        pid_list.append(pid)
        hadm_id_list.append(new_admIdList)

        sortedDiagList = sorted([(admDateMap[admId], disDateMap[admId], admDiagMap[admId]) for admId in new_admIdList])
        pidDiagMap[pid] = sortedDiagList
        
    print('Building strSeqs, span label')
    seqs = list()
    seqs_intervals = list()

    for pid, visits in tqdm(pidDiagMap.items()):
        seq = list()
        intervals = list()
        los_ = list()
        first_time = visits[0][0]

        for i, visit in enumerate(visits):
            current_time = visit[0]
            discharge_time = visit[1]
            seq.append(visit[2])

            interval = (current_time - first_time).days
            los = (discharge_time - current_time).days

            intervals.append(interval)
            los_.append(los)

        seqs.append(seq)
        seqs_intervals.append(intervals)
        
    diag_code_df = {
        'subject_id': [pid for pid, times in zip(pid_list, seqs_intervals) for _ in times],
        'hadm_id': [hadm_id for hadm_ids in hadm_id_list for hadm_id in hadm_ids],
        'diag_seqs': [diag for diags in seqs for diag in diags]
    }

    diag_code_df = pd.DataFrame(diag_code_df)
    diag_code_df = diag_code_df.sort_values(by=['subject_id']).reset_index(drop=True)
    
    return diag_code_df

In [37]:
df_diag_codes = icd_diag_ccs_processing(
    adm_file_path, 
    diag_file_path, 
    icd10toicd9_cm_file_path,
    new_admIdList_threshold=2
)
# 6364488it [07:26, 14250.75it/s]

Building pid-admission mapping, admission-date mapping


431231it [00:34, 12337.96it/s]


Building admission-dxList mapping


4756326it [03:55, 20197.69it/s]


Building pid-sortedVisits mapping


100%|██████████████████████████████████████████████████████████████████████| 180733/180733 [00:00<00:00, 217933.17it/s]


Building strSeqs, span label


100%|████████████████████████████████████████████████████████████████████████| 79371/79371 [00:00<00:00, 359413.96it/s]


In [38]:
df_diag_codes

Unnamed: 0,subject_id,hadm_id,diag_seqs
0,10000032,22595853,"[5723, 78959, 5715, 07070, 496, 29680, 30981, ..."
1,10000032,22841357,"[07071, 78959, 2875, 2761, 496, 5715, V08, 3051]"
2,10000032,25742920,"[45829, 07044, 7994, 2761, 78959, 2767, 3051, ..."
3,10000032,29079034,"[07054, 78959, V462, 5715, 2767, 2761, 496, V0..."
4,10000084,23052089,"[33182, 29410, 36816, 78199, 2724, V1046]"
...,...,...,...
329533,19999784,29956342,"[V5811, 20280, 2768, 2731, 3051]"
329534,19999828,25744818,"[99832, 9986, 56981, 28981, 5990, E8788, E8490..."
329535,19999828,29734428,"[56981, 2863, 6822, 99639, E8788, E8499, 4019,..."
329536,19999840,21033226,"[43491, 43820, 34590, 43811, 4019, 2724, 3051]"


In [39]:
def get_ccs_multi(df_diag_codes, diag_ccs_multi_file_path):

    diag_ccs_multi_file = pd.read_csv(diag_ccs_multi_file_path, dtype=object)

    diag_ccs_multi_file.columns = diag_ccs_multi_file.columns.str.replace("'", "").str.strip()
    diag_ccs_multi_file = diag_ccs_multi_file[['ICD-9-CM CODE', 'CCS LVL 1', 'CCS LVL 1 LABEL']]

    diag_ccs_multi_file['ICD-9-CM CODE'] = diag_ccs_multi_file['ICD-9-CM CODE'].astype(str).str.replace("'", "").str.strip()
    diag_ccs_multi_file['CCS LVL 1'] = diag_ccs_multi_file['CCS LVL 1'].astype(str).str.replace("'", "").str.strip()
    diag_ccs_multi_file['CCS LVL 1 LABEL'] = diag_ccs_multi_file['CCS LVL 1 LABEL'].astype(str).str.replace("'", "").str.strip()

    icd_to_ccs_multi = dict(zip(diag_ccs_multi_file['ICD-9-CM CODE'], diag_ccs_multi_file['CCS LVL 1']))
    
    def map_to_ccs(diag_list):
        codes = [code for code in diag_list]
        return [icd_to_ccs_multi.get(code, None) for code in codes]
    
    df_diag_codes['ccs_multi_seqs'] = df_diag_codes['diag_seqs'].apply(map_to_ccs)
    df_diag_codes['ccs_multi_seqs'] = df_diag_codes['ccs_multi_seqs'].apply(lambda x: list(set(x)))

    css_multi_to_desc = dict(zip(diag_ccs_multi_file['CCS LVL 1'], diag_ccs_multi_file['CCS LVL 1 LABEL']))
    
    return df_diag_codes, css_multi_to_desc


def get_ccs_single(df_diag_codes, diag_ccs_single_file_path):

    diag_ccs_single_file = pd.read_csv(diag_ccs_single_file_path, dtype=object)

    diag_ccs_single_file.columns = diag_ccs_single_file.columns.str.replace("'", "").str.strip()
    diag_ccs_single_file = diag_ccs_single_file[['ICD-9-CM CODE', 'CCS CATEGORY', 'CCS CATEGORY DESCRIPTION']]

    diag_ccs_single_file['ICD-9-CM CODE'] = diag_ccs_single_file['ICD-9-CM CODE'].astype(str).str.replace("'", "").str.strip()
    diag_ccs_single_file['CCS CATEGORY'] = diag_ccs_single_file['CCS CATEGORY'].astype(str).str.replace("'", "").str.strip()
    diag_ccs_single_file['CCS CATEGORY DESCRIPTION'] = diag_ccs_single_file['CCS CATEGORY DESCRIPTION'].astype(str).str.replace("'", "").str.strip()

    icd_to_ccs = dict(zip(diag_ccs_single_file['ICD-9-CM CODE'], diag_ccs_single_file['CCS CATEGORY']))

    def map_to_ccs(diag_list):
        codes = [code for code in diag_list]
        return [icd_to_ccs.get(code, None) for code in codes]

    df_diag_codes['ccs_single_seqs'] = df_diag_codes['diag_seqs'].apply(map_to_ccs)
    df_diag_codes['ccs_single_seqs'] = df_diag_codes['ccs_single_seqs'].apply(lambda x: list(set(x)))

    css_single_to_desc = dict(zip(diag_ccs_single_file['CCS CATEGORY'], diag_ccs_single_file['CCS CATEGORY DESCRIPTION']))
    
    return df_diag_codes, css_single_to_desc


def map_ccs_single_and_multi(diag_ccs_single_file_path, diag_ccs_multi_file_path):

    diag_ccs_multi_file = pd.read_csv(diag_ccs_multi_file_path, dtype=object)

    diag_ccs_multi_file.columns = diag_ccs_multi_file.columns.str.replace("'", "").str.strip()
    diag_ccs_multi_file = diag_ccs_multi_file[['ICD-9-CM CODE', 'CCS LVL 1', 'CCS LVL 1 LABEL']]

    diag_ccs_multi_file['ICD-9-CM CODE'] = diag_ccs_multi_file['ICD-9-CM CODE'].astype(str).str.replace("'", "").str.strip()
    diag_ccs_multi_file['CCS LVL 1'] = diag_ccs_multi_file['CCS LVL 1'].astype(str).str.replace("'", "").str.strip()
    diag_ccs_multi_file['CCS LVL 1 LABEL'] = diag_ccs_multi_file['CCS LVL 1 LABEL'].astype(str).str.replace("'", "").str.strip()

    icd_to_ccs_multi = dict(zip(diag_ccs_multi_file['ICD-9-CM CODE'], diag_ccs_multi_file['CCS LVL 1']))
    
    multi_df = pd.DataFrame.from_dict(icd_to_ccs_multi, orient='index', columns=['Multi_Value'])
    multi_df = multi_df.reset_index().rename(columns={'index': 'Key'})

    diag_ccs_single_file = pd.read_csv(diag_ccs_single_file_path, dtype=object)

    diag_ccs_single_file.columns = diag_ccs_single_file.columns.str.replace("'", "").str.strip()
    diag_ccs_single_file = diag_ccs_single_file[['ICD-9-CM CODE', 'CCS CATEGORY', 'CCS CATEGORY DESCRIPTION']]

    diag_ccs_single_file['ICD-9-CM CODE'] = diag_ccs_single_file['ICD-9-CM CODE'].astype(str).str.replace("'", "").str.strip()
    diag_ccs_single_file['CCS CATEGORY'] = diag_ccs_single_file['CCS CATEGORY'].astype(str).str.replace("'", "").str.strip()
    diag_ccs_single_file['CCS CATEGORY DESCRIPTION'] = diag_ccs_single_file['CCS CATEGORY DESCRIPTION'].astype(str).str.replace("'", "").str.strip()

    icd_to_ccs_single = dict(zip(diag_ccs_single_file['ICD-9-CM CODE'], diag_ccs_single_file['CCS CATEGORY']))
    
    single_df = pd.DataFrame.from_dict(icd_to_ccs_single, orient='index', columns=['Single_Value'])
    single_df = single_df.reset_index().rename(columns={'index': 'Key'})
    single_df = single_df.iloc[1:].reset_index(drop=True)
    single_df = single_df.drop(['Key'], axis=1)
    
    multi_single_df = pd.concat([multi_df.reset_index(drop=True), single_df.reset_index(drop=True)], axis=1)

    css_multi_to_desc = dict(zip(diag_ccs_multi_file['CCS LVL 1'], diag_ccs_multi_file['CCS LVL 1 LABEL']))
    css_single_to_desc = dict(zip(diag_ccs_single_file['CCS CATEGORY'], diag_ccs_single_file['CCS CATEGORY DESCRIPTION']))
    
    multi_single_df['Multi_Diag'] = multi_single_df['Multi_Value'].astype(str).map(css_multi_to_desc)
    multi_single_df['Single_Diag'] = multi_single_df['Single_Value'].astype(str).map(css_single_to_desc)
    
    multi_to_single_dict = multi_single_df.groupby('Multi_Diag')['Single_Diag'].unique().apply(list).to_dict()
    
    return multi_single_df, multi_to_single_dict

In [40]:
df_diag_codes, css_multi_to_desc = get_ccs_multi(df_diag_codes, diag_ccs_multi_file_path)

In [41]:
df_diag_codes, css_single_to_desc = get_ccs_single(df_diag_codes, diag_ccs_single_file_path)

In [42]:
multi_single_df, multi_to_single_dict  = map_ccs_single_and_multi(diag_ccs_single_file_path, diag_ccs_multi_file_path)

In [43]:
long_to_short = {
    "Spinal cord injury": "Spin cor inj",
    "Fracture of skull or face": "Fx skull fac",
    "Fracture of arm": "Fx arm",
    "Fracture of leg": "Fx leg",
    "Other fracture": "Oth fracture",
    "Fetal distress": "Fetal distrs",
    "Amniotic fluid disorders": "Amnios dx",
    "Umbilical cord complications": "Umbil cord",
    "Obstetrics-related perinatal trauma": "OB-related perin trauma",
    "Forceps delivery": "Forceps del",
    "Other complications of birth": "Ot compl bir",
    "Other pregnancy and delivery including normal": "Other pregnancy and delivery including normal",
    "Skin infection": "Skin infectn",
    "Other inflammatory skin conditions": "Ot infl skin",
    "Skin ulcer": "Ulcer skin",
    "Other skin diseases": "Oth skin dx",
    "Infectious arthritis": "Infect arth",
    "Rheumatoid arthritis": "Rheum arth",
    "Osteoarthros": "Osteoarthros",
    "Other joint diseases": "Ot joint dx",
    "Birth trauma": "Birth trauma",
    "Other perinatal diseases": "Ot perint dx",
    "Joint injury": "Joint injury",
    "Fracture of hip": "Fx hip",
    "Anemia": "Anemia",
    "Acute post-hemorrhagic anemia": "Acut p-h anm",
    "Sickle cell": "Sickle cell",
    "Coagulation and hemorrhagic disorders": "Coag/hemr dx",
    "White blood cell disorders": "Wht blood dx",
    "Other hematologic diseases": "Ot hematl dx",
    "Immunity disorders": "Immunity dx",
    "Heart valve disease": "Hrt valve dx",
    "Carditis": "Carditis",
    "Hypertension": "HTN",
    "Hypertension complications": "Htn complicn",
    "Acute myocardial infarction": "Acute MI",
    "Coronary atherosclerosis": "Coron athero",
    "Pulmonary heart disease": "Pulm hart dx",
    "Other heart diseases": "Oth heart dx",
    "Conduction disorders": "Conduction",
    "Cardiac dysrhythmia": "Dysrhythmia",
    "Cardiac arrest": "Cardia arrst",
    "Congestive heart failure; non-hypertensive": "chf;nonhp",
    "Acute cerebrovascular disease": "Acute CVD",
    "Pre-cerebral occlusion": "Precere occl",
    "Other cerebrovascular diseases": "Other CVD",
    "Transient ischemic attack": "TIA",
    "Late effects of cerebrovascular disease": "Late eff CVD",
    "Pneumonia": "Pneumonia",
    "Influenza": "Influenza",
    "Tonsillitis": "Tonsillitis",
    "Bronchitis": "Bronchitis",
    "Other upper respiratory infections": "Ot up rsp in",
    "Chronic obstructive pulmonary disease": "COPD",
    "Asthma": "Asthma",
    "Aspiration pneumonia": "Asp pneumon",
    "Pleurisy": "Pleurisy",
    "Adult respiratory failure": "Adlt resp fl",
    "Lung diseases due to external agents": "Lung externl",
    "Other lower respiratory diseases": "Oth low resp",
    "Other upper respiratory diseases": "Ot uppr resp",
    "Other liver diseases": "Oth liver dx",
    "Pancreas disease": "Pancreas dx",
    "Gastrointestinal hemorrhage": "GI hemorrhag",
    "Gastroenteritis": "Gastroent",
    "Other gastrointestinal diseases": "Other GI dx",
    "Esophageal disease": "Esophgeal dx",
    "Gastric and duodenal ulcer": "Gasduo ulcer",
    "Gastritis": "Gastritis",
    "Other diseases of stomach": "Ot dx stomch",
    "Appendicitis": "Appendicitis",
    "Abdominal hernia": "Abdom hernia",
    "Ulcerative colitis": "Ulcerat col",
    "Intestinal obstruction": "Int obstruct",
    "Diverticulosis": "Diverticulos",
    "Anal and rectal conditions": "Anal/rectal",
    "Peritonitis": "Peritonitis",
    "Biliary disease": "Biliary dx",
    "Teeth diseases": "Teeth dx",
    "Mouth diseases": "Mouth dx",
    "Nephritis": "Nephritis",
    "Acute renal failure": "Ac renl fail",
    "Chronic kidney disease": "Chr kidney disease",
    "Urinary tract infection": "UTI",
    "Urinary stone": "Urin stone",
    "Other diseases of kidney": "Ot dx kidney",
    "Other diseases of bladder": "Ot dx bladdr",
    "Other genitourinary diseases": "Other GU dx",
    "Benign prostatic hyperplasia": "BPH",
    "Infections of male genital organs": "Inf male gen",
    "Other male genital disorders": "Oth male gen",
    "Breast disease": "Breast dx",
    "Pelvic inflammatory disease": "PID",
    "Endometriosis": "Endometrios",
    "Prolapse": "Prolapse",
    "Menstrual disorders": "Menstrual dx",
    "Ovarian cyst": "Ovarian cyst",
    "Menopausal disorders": "Menopausl dx",
    "Female infertility": "Fem infertil",
    "Other female genital disorders": "Ot femal gen",
    "Contraceptive management": "Contraceptiv",
    "Induced abortion": "Induc abortn",
    "Abortion complications": "Abort compl",
    "Ectopic pregnancy": "Ectopic preg",
    "Other pregnancy complications": "Ot preg comp",
    "Hemorrhage in pregnancy": "Hemorr preg",
    "Hypertension in pregnancy": "HTN in preg",
    "Early labor": "Early labor",
    "Long pregnancy": "Long pregncy",
    "Diabetes mellitus in pregnancy": "DM in preg",
    "Malposition of fetus": "Malposition",
    "Pelvic obstruction": "Pelvic obstr",
    "Previous C-section": "Prev c-sectn",
    "Systemic lupus erythematosus": "SLE",
    "Other connective tissue disorders": "Ot conn tiss",
    "Other bone diseases": "Ot bone dx",
    "Other acquired deformities": "Ot acq defor",
    "Osteoporosis": "Osteoporosis",
    "Pathological fracture": "Patholog fx",
    "Back problem": "Back problem",
    "Acquired foot deformity": "Acq foot def",
    "Other nervous system diseases": "Oth nerv dx",
    "Parkinson's disease": "Parkinson-s",
    "Multiple sclerosis": "MS",
    "Other hereditary central nervous system diseases": "Ot hered CNS",
    "Paralysis": "Paralysis",
    "Epilepsy and convulsive disorders": "Epilepsy/cnv",
    "Headache and migraine": "Headache/mig",
    "Coma and brain damage": "Coma/brn dmg",
    "Blindness": "Blindness",
    "Eye infection": "Eye infectn",
    "Other eye diseases": "Other eye dx",
    "Cataract": "Cataract",
    "Retinal disease": "Retinal dx",
    "Glaucoma": "Glaucoma",
    "Otitis media": "Otitis media",
    "Dizziness": "Dizziness",
    "Other ear diseases": "Other ear dx",
    "Thyroid disorders": "Thyroid dsor",
    "Diabetes Mellitus without complications": "DiabMel no c",
    "Diabetes Mellitus with complications": "DiabMel w/cm",
    "Other endocrine disorders": "Ot endo dsor",
    "Nutritional deficiencies": "Nutrit defic",
    "Hyperlipidemia": "Hyperlipidem",
    "Gout and other crystal arthropathies": "Gout/ot crys",
    "Fluid and electrolyte disorders": "Fluid/elc dx",
    "Cystic fibrosis": "Cystic fibro",
    "Other nutritional and metabolic disorders": "Ot nutrit dx",
    "Tuberculosis": "Tuberculosis",
    "Septicemia": "Septicemia",
    "Other bacterial infections": "Oth bact inf",
    "Mycoses": "Mycoses",
    "HIV infection": "HIV infectn",
    "Hepatitis": "Hepatitis",
    "Viral infection": "Viral infect",
    "Other infectious diseases": "Oth infectns",
    "Sexual Infections": "Sexual Infxs",
    "Immunization and screening": "Immuniz/scrn",
    "Other primary cancer": "Ot primry ca",
    "Secondary malignancy": "2ndary malig",
    "Malignant neoplasm": "Malig neopls",
    "Neoplasm unspecified": "Neoplsm unsp",
    "Maintenance chemotherapy/radiation": "Maint chem/r",
    "Benign uterine neoplasm": "Bnign ut neo",
    "Other benign neoplasm": "Ot bnign neo",
    "Abdominal pain": "Abdomnl pain",
    "Adjustment disorders": "Adjustment disorders",
    "Alcohol-related disorders": "Alcohol-related disorders",
    "Allergy": "Allergy",
    "Aneurysm": "Aneurysm",
    "Anxiety disorders": "Anxiety disorders",
    "Arterial embolism": "Art embolism",
    "Attention-deficit/conduct/disruptive behavior disorders": "Attention-deficit/conduct/disruptive beha",
    "Birth asphyxia": "Birth asphyx",
    "Bladder cancer": "Bladder cncr",
    "Bone/connective tissue cancer": "Bone/ct cncr",
    "Brain/nervous system cancer": "Brain/ns can",
    "Breast cancer": "Breast cancr",
    "Bronchus/lung cancer": "Brnch/lng ca",
    "Burns": "Burns",
    "Cardiac anomalies": "Cardiac anom",
    "Cervical cancer": "Cervix cancr",
    "Chest pain": "Chest pain",
    "Colon cancer": "Colon cancer",
    "Device complications": "Complic devi",
    "Procedure complications": "Complic proc",
    "Crush injury": "Crush injury",
    "Delirium/dementia/amnesia/cognitive disorders": "Delirium/dementia/amnestic/other cognitiv",
    "Developmental disorders": "Developmental disorders",
    "Disorders diagnosed in infancy/childhood": "Disorders usually diagnosed in infancy/ch",
    "E codes: Adverse effects of medical care": "E Codes: Adverse effects of medical care",
    "E codes: Adverse effects of medical drugs": "E Codes: Adverse effects of medical drugs",
    "E codes: Cut or pierce": "E Codes: Cut/pierce",
    "E codes: Drowning or submersion": "E Codes: Drowning/submersion",
    "E codes: Fall": "E Codes: Fall",
    "E codes: Fire or burn": "E Codes: Fire/burn",
    "E codes: Firearm": "E Codes: Firearm",
    "E codes: Machinery": "E Codes: Machinery",
    "E codes: Environmental causes": "E Codes: Natural/environment",
    "E codes: Other specified and classifiable": "E Codes: Other specified and classifiable",
    "E codes: Overexertion": "E Codes: Overexertion",
    "E codes: Pedal cyclist": "E Codes: Pedal cyclist- not MVT",
    "E codes: Pedestrian": "E Codes: Pedestrian- not MVT",
    "E codes: Place of occurrence": "E Codes: Place of occurrence",
    "E codes: Poisoning": "E Codes: Poisoning",
    "E codes: Struck by or against": "E Codes: Struck by- against",
    "E codes: Suffocation": "E Codes: Suffocation",
    "E codes: Unspecified": "E Codes: Unspecified",
    "Encephalitis": "Encephalitis",
    "Esophageal cancer": "Esoph cancer",
    "Examination and evaluation": "Exam/eval",
    "Fever of unknown origin": "FUO",
    "Fatigue": "Fatigue",
    "Female genital cancer": "Fem genit ca",
    "Gastrointestinal congenital anomaly": "GI cong anom",
    "Gastrointestinal and peritoneal cancer": "GI/perit can",
    "Gangrene": "Gangrene",
    "Head or neck cancer": "Hd/nck cancr",
    "Hemorrhoids": "Hemmorhoids",
    "Hodgkin's disease": "Hodgkin-s ds",
    "Impulse control disorders": "Impulse control disorders NEC",
    "Intestinal infection": "Intest infct",
    "Intracranial injury": "Intracrn inj",
    "Kidney and renal cancer": "Kidny/rnl ca",
    "Leukemias": "Leukemias",
    "Liveborn infant": "Liveborn",
    "Liver or inflammatory bowel disease cancer": "Liver/ibd ca",
    "Low birth weight": "Low birth wt",
    "Lymph node enlargement": "Lymph enlarg",
    "Male genital cancer": "Mal genit ca",
    "Meningitis": "Meningitis",
    "Miscellaneous mental health disorders": "Miscellaneous mental health disorders",
    "Mood disorders": "Mood disorders",
    "Multiple myeloma": "Mult myeloma",
    "Nausea and vomiting": "Nausea/vomit",
    "Nervous system congenital anomaly": "Nerv cong an",
    "Non-Hodgkin lymphoma": "Non-Hodg lym",
    "Non-epithelial cancer": "Non-epith ca",
    "Open wound of extremity": "Opn wnd extr",
    "Open wound of head": "Opn wnd head",
    "Other aftercare": "Ot aftercare",
    "Other circulatory diseases": "Ot circul dx",
    "Other congenital anomalies": "Ot cong anom",
    "Other respiratory cancer": "Ot respir ca",
    "Other CNS infections": "Oth CNS infx",
    "Other vein disorders": "Oth vein dx",
    "Other injury": "Other injury",
    "Other screening": "Other screen",
    "Ovary cancer": "Ovary cancer",
    "Pancreatic cancer": "Pancreas can",
    "Perinatal jaundice": "Perint jaund",
    "Peripheral atherosclerosis": "Perip athero",
    "Personality disorders": "Personality disorders",
    "Phlebitis": "Phlebitis",
    "Poisoning by non-medications": "Poisn nonmed",
    "Poisoning by other medications": "Poisn ot med",
    "Poisoning by psychiatric medications": "Poison psych",
    "Prostate cancer": "Prostate can",
    "Rectal or anal cancer": "Rctm/anus ca",
    "Rehabilitation": "Rehab",
    "Respiratory distress": "Resp distres",
    "Schizophrenia and other psychotic disorders": "Schizophrenia and other psychotic disorde",
    "Mental health screening or history": "Screening and history of mental health an",
    "Shock": "Shock",
    "Skin melanoma": "Skin melanom",
    "Social and administrative problems": "Social admin",
    "Spontaneous abortion": "Spont abortn",
    "Sprain": "Sprain",
    "Stomach cancer": "Stomch cancr",
    "Substance-related disorders": "Substance-related disorders",
    "Suicide and intentional self-inflicted injury": "Suicide and intentional self-inflicted in",
    "Superficial injury": "Superfic inj",
    "Syncope": "Syncope",
    "Testicular cancer": "Testis cancr",
    "Thyroid cancer": "Thyroid cncr",
    "Unclassified": "Unclassified",
    "Urinary organ cancer": "Uriny org ca",
    "Uterine cancer": "Uterus cancr",
    "Varicose vein": "Varicose vn",
    "E codes: Motor vehicle traffic": "e codes: motor vehicle traffic (mvt)",
    "E codes: Other specified NEC": "e codes: other specified- nec",
    "E codes: Transport not motor vehicle traffic": "e codes: transport- not mvt",
    "Genitourinary congenital anomaly": "gu cong anom"
}

diagnostic_hierarchy_old = {
  "Certain Conditions Originating in the Perinatal Period": {
    "Neonatal Trauma and Injury": [
      "Fracture of skull or face",
      "Fracture of arm",
      "Fracture of leg",
      "Other fracture",
      "Low birth weight",
      "Perinatal jaundice",
      "Birth asphyxia",
      "Liveborn infant"
    ]
  },
  "Congenital Anomalies": {
    "Musculoskeletal and Nervous System Anomalies": [
      "Birth trauma",
      "Joint injury",
      "Fracture of hip",
      "Spinal cord injury",
      "Nervous system congenital anomaly"
    ],
    "Other Congenital Conditions": [
      "Other perinatal diseases",
      "Other congenital anomalies",
      "Gastrointestinal congenital anomaly",
      "Genitourinary congenital anomaly"
    ]
  },
  "Diseases of the Blood and Immune System": {
    "Anemia and Hematologic Disorders": [
      "Anemia",
      "Acute post-hemorrhagic anemia",
      "Sickle cell"
    ],
    "Immunologic and Coagulation Disorders": [
      "Coagulation and hemorrhagic disorders",
      "White blood cell disorders",
      "Other hematologic diseases",
      "Immunity disorders"
    ]
  },
  "Diseases of the Circulatory System": {
    "Cardiac Conditions": [
      "Heart valve disease",
      "Carditis",
      "Hypertension",
      "Hypertension complications",
      "Acute myocardial infarction",
      "Coronary atherosclerosis",
      "Pulmonary heart disease",
      "Other heart diseases",
      "Conduction disorders",
      "Chest pain",
      "Cardiac dysrhythmia",
      "Cardiac arrest",
      "Congestive heart failure; non-hypertensive",
      "Cardiac anomalies"
    ],
    "Cerebrovascular Disorders": [
      "Acute cerebrovascular disease",
      "Pre-cerebral occlusion",
      "Other cerebrovascular diseases",
      "Transient ischemic attack",
      "Late effects of cerebrovascular disease"
    ],
    "Peripheral Vascular and Venous Diseases": [
      "Peripheral atherosclerosis",
      "Aneurysm",
      "Arterial embolism",
      "Other circulatory diseases",
      "Phlebitis",
      "Varicose vein",
      "Hemorrhoids",
      "Other vein disorders"
    ]
  },
  "Diseases of the Respiratory System": {
    "Respiratory Infections and Pulmonary Disorders": [
      "Pneumonia",
      "Influenza",
      "Tonsillitis",
      "Bronchitis",
      "Other upper respiratory infections",
      "Chronic obstructive pulmonary disease",
      "Asthma",
      "Aspiration pneumonia",
      "Pleurisy",
      "Respiratory distress",
      "Adult respiratory failure",
      "Lung diseases due to external agents",
      "Other lower respiratory diseases",
      "Other upper respiratory diseases"
    ]
  },
  "Diseases of the Digestive System": {
    "Upper Gastrointestinal and Hepatic Disorders": [
      "Other liver diseases",
      "Pancreas disease",
      "Gastrointestinal hemorrhage",
      "Gastroenteritis",
      "Other gastrointestinal diseases",
      "Esophageal disease",
      "Gastric and duodenal ulcer",
      "Gastritis",
      "Other diseases of stomach"
    ],
    "Lower Gastrointestinal and Abdominal Disorders": [
      "Appendicitis",
      "Abdominal hernia",
      "Ulcerative colitis",
      "Intestinal obstruction",
      "Diverticulosis",
      "Anal and rectal conditions",
      "Peritonitis",
      "Biliary disease"
    ],
    "Oral and Dental Conditions": [
      "Teeth diseases",
      "Mouth diseases"
    ]
  },
  "Diseases of the Genitourinary System": {
    "Renal and Lower Urinary Tract Disorders": [
      "Nephritis",
      "Acute renal failure",
      "Chronic kidney disease",
      "Urinary tract infection",
      "Urinary stone",
      "Other diseases of kidney",
      "Other diseases of bladder",
      "Other genitourinary diseases",
      "Benign prostatic hyperplasia"
    ],
    "Non-Pregnancy Reproductive Disorders": [
      "Infections of male genital organs",
      "Other male genital disorders",
      "Breast disease",
      "Pelvic inflammatory disease",
      "Endometriosis",
      "Prolapse",
      "Menstrual disorders",
      "Ovarian cyst",
      "Menopausal disorders",
      "Female infertility",
      "Other female genital disorders",
      "Contraceptive management"
    ]
  },
  "Pregnancy; Childbirth; and Postpartum Complications": {
    "Pregnancy Complications": [
      "Induced abortion",
      "Spontaneous abortion",
      "Abortion complications",
      "Ectopic pregnancy",
      "Long pregnancy",
      "Early labor",
      "Malposition of fetus",
      "Pelvic obstruction",
      "Diabetes mellitus in pregnancy",
      "Hypertension in pregnancy",
      "Hemorrhage in pregnancy",
      "Previous C-section",
      "Other pregnancy complications"
   ],
    "Labor and Delivery Complications": [
      "Fetal distress",
      "Amniotic fluid disorders",
      "Umbilical cord complications",
      "Obstetrics-related perinatal trauma",
      "Forceps delivery",
      "Other complications of birth",
      "Other pregnancy and delivery including normal"
    ],
    "Postpartum and Puerperal Complications": [
      "Osteoarthros",
      "Other joint diseases"
    ]
  },
  "Diseases of the Musculoskeletal and Connective Tissue": {
    "Autoimmune and Connective Tissue Disorders": [
      "Systemic lupus erythematosus",
      "Other connective tissue disorders",
      "Infectious arthritis",
      "Rheumatoid arthritis"
    ],
    "Skeletal and Acquired Musculoskeletal Disorders": [
      "Other bone diseases",
      "Osteoporosis",
      "Pathological fracture",
      "Back problem",
      "Acquired foot deformity"
    ]
  },
  "Diseases of the Nervous System and Sense Organs": {
    "Central Nervous System Disorders": [
      "Other nervous system diseases",
      "Parkinson's disease",
      "Multiple sclerosis",
      "Other hereditary central nervous system diseases",
      "Paralysis",
      "Epilepsy and convulsive disorders",
      "Headache and migraine",
      "Coma and brain damage",
      "Other CNS infections",
      "Meningitis",
      "Encephalitis"
    ],
    "Sensory and Vestibular Disorders": [
      "Eye infection",
      "Other eye diseases",
      "Cataract",
      "Retinal disease",
      "Glaucoma",
      "Otitis media",
      "Dizziness",
      "Other ear diseases"
    ]
  },
  "Endocrine; Nutritional; and Metabolic Diseases": {
    "Endocrine and Diabetic Disorders": [
      "Thyroid disorders",
      "Diabetes Mellitus without complications",
      "Diabetes Mellitus with complications",
      "Other endocrine disorders"
    ],
    "Nutritional and Metabolic Disorders": [
      "Nutritional deficiencies",
      "Hyperlipidemia",
      "Gout and other crystal arthropathies",
      "Fluid and electrolyte disorders",
      "Cystic fibrosis",
      "Other nutritional and metabolic disorders"
    ]
  },
  "Infectious and Parasitic Diseases": {
    "Bacterial and Septic Infections": [
      "Tuberculosis",
      "Septicemia",
      "Other bacterial infections",
      "Intestinal infection",
    ],
    "Viral; Mycotic; and Other Infections": [
      "Mycoses",
      "HIV infection",
      "Hepatitis",
      "Viral infection",
      "Other infectious diseases"
    ],
    "Sexually Transmitted and Preventive Conditions": [
      "Sexual Infections",
      "Immunization and screening"
    ]
  },
  "Diseases of the Skin and Subcutaneous Tissue": {
    "Inflammatory and Infectious Skin Disorders": [
      "Skin infection",
      "Other inflammatory skin conditions",
      "Skin ulcer",
      "Other skin diseases",
      "Other acquired deformities"
    ]
  },
  "Injury; Poisoning; and External Causes": {
    "Physical Trauma and Injuries": [
      "Sprain",
      "Intracranial injury",
      "Crush injury",
      "Open wound of head",
      "Open wound of extremity",
      "Superficial injury",
      "Burns",
      "Other injury"
    ],
    "Toxicological and Iatrogenic Complications": [
      "Poisoning by psychiatric medications",
      "Poisoning by other medications",
      "Poisoning by non-medications",
      "Procedure complications",
      "Device complications"
    ],
    "Post-Trauma Symptoms and Aftercare": [
      "Syncope",
      "Fever of unknown origin",
      "Lymph node enlargement",
      "Gangrene",
      "Shock",
      "Nausea and vomiting",
      "Abdominal pain",
      "Fatigue",
      "Allergy",
      "Rehabilitation",
      "Social and administrative problems",
      "Examination and evaluation",
      "Other aftercare",
      "Other screening",
      "Unclassified"
    ]
  },
  "Mental; Behavioral; and Neurodevelopmental Disorders": {
    "Pediatric and Developmental Disorders": [
      "Developmental disorders",
      "Disorders diagnosed in infancy/childhood",
      "Impulse control disorders",
      "Blindness"
    ],
    "Mood; Anxiety; and Cognitive Disorders": [
      "Adjustment disorders",
      "Anxiety disorders",
      "Attention-deficit/conduct/disruptive behavior disorders",
      "Delirium/dementia/amnesia/cognitive disorders",
      "Mood disorders",
      "Personality disorders"
    ],
    "Psychotic and Substance Use Disorders": [
      "Schizophrenia and other psychotic disorders",
      "Alcohol-related disorders",
      "Substance-related disorders",
      "Suicide and intentional self-inflicted injury",
      "Mental health screening or history",
      "Miscellaneous mental health disorders"
    ]
  },
  "Neoplasms": {
    "Gastrointestinal and Hepatic Cancers": [
      "Head or neck cancer",
      "Esophageal cancer",
      "Stomach cancer",
      "Colon cancer",
      "Rectal or anal cancer",
      "Pancreatic cancer",
      "Gastrointestinal and peritoneal cancer",
      "Liver or inflammatory bowel disease cancer"
    ],
    "Thoracic and Urogenital Cancers": [
      "Bronchus/lung cancer",
      "Other respiratory cancer",
      "Bladder cancer",
      "Kidney and renal cancer",
      "Urinary organ cancer"
    ],
    "Reproductive and Endocrine Cancers": [
      "Breast cancer",
      "Uterine cancer",
      "Ovary cancer",
      "Female genital cancer",
      "Prostate cancer",
      "Testicular cancer",
      "Male genital cancer",
      "Thyroid cancer",
      "Cervical cancer",
    ],
    "Hematologic and Central Nervous System Cancers": [
      "Hodgkin's disease",
      "Non-Hodgkin lymphoma",
      "Leukemias",
      "Multiple myeloma",
      "Brain/nervous system cancer"
    ],
    "Other and Unspecified Neoplasms": [
      "Other primary cancer",
      "Secondary malignancy",
      "Malignant neoplasm",
      "Neoplasm unspecified",
      "Maintenance chemotherapy/radiation",
      "Benign uterine neoplasm",
      "Other benign neoplasm",
      "Non-epithelial cancer",
      "Bone/connective tissue cancer",
      "Skin melanoma"
    ]
  },
  "External Causes of Morbidity (E-Codes)": {
    "Environmental; Mechanical; and Intentional Injuries": [
      "E codes: Cut or pierce",
      "E codes: Drowning or submersion",
      "E codes: Fall",
      "E codes: Fire or burn",
      "E codes: Firearm",
      "E codes: Machinery",
      "E codes: Motor vehicle traffic",
      "E codes: Pedal cyclist",
      "E codes: Pedestrian",
      "E codes: Transport not motor vehicle traffic",
      "E codes: Environmental causes",
      "E codes: Overexertion",
      "E codes: Poisoning",
      "E codes: Struck by or against",
      "E codes: Suffocation",
      "E codes: Adverse effects of medical care",
      "E codes: Adverse effects of medical drugs",
      "E codes: Other specified and classifiable",
      "E codes: Other specified NEC",
      "E codes: Unspecified",
      "E codes: Place of occurrence"
    ]
  }
}

In [44]:
diagnostic_hierarchy = {
  "Perinatal and Congenital Conditions": {
    "Neonatal Trauma and Injury": [
      "Fracture of skull or face",
      "Fracture of arm",
      "Fracture of leg",
      "Other fracture",
      "Birth trauma",
      "Joint injury",
      "Spinal cord injury"
    ],
    "Other Perinatal Conditions": [
      "Low birth weight",
      "Perinatal jaundice",
      "Birth asphyxia",
      "Liveborn infant",
      "Other perinatal diseases"    
    ],
    "Congenital Anomalies": [
      "Nervous system congenital anomaly",
      "Fracture of hip",
      "Other congenital anomalies",
      "Gastrointestinal congenital anomaly",
      "Genitourinary congenital anomaly",
      "Cardiac anomalies"
    ]
  },
  "Diseases of the Blood and Immune System": {
    "Anemia and Hematologic Disorders": [
      "Anemia",
      "Acute post-hemorrhagic anemia",
      "Sickle cell",
      "Coagulation and hemorrhagic disorders",
      "White blood cell disorders",
      "Other hematologic diseases",
    ],
    "Immunologic Disorders": [
      "Immunity disorders"
    ]
  },
  "Diseases of the Circulatory System": {
    "Heart Diseases": [
      "Heart valve disease",
      "Carditis",
      "Acute myocardial infarction",
      "Coronary atherosclerosis",
      "Pulmonary heart disease",
      "Other heart diseases",
      "Conduction disorders",
      "Chest pain",
      "Cardiac dysrhythmia",
      "Cardiac arrest",
      "Congestive heart failure; non-hypertensive"
    ],
    "Hypertensive Diseases": [
      "Hypertension",
      "Hypertension complications"    
    ],
    "Cerebrovascular Disorders": [
      "Acute cerebrovascular disease",
      "Pre-cerebral occlusion",
      "Other cerebrovascular diseases",
      "Transient ischemic attack",
      "Late effects of cerebrovascular disease"
    ],
    "Peripheral and Venous Diseases": [
      "Peripheral atherosclerosis",
      "Aneurysm",
      "Arterial embolism",
      "Other circulatory diseases",
      "Phlebitis",
      "Varicose vein",
      "Other vein disorders"
    ]
  },
  "Diseases of the Respiratory System": {
    "Respiratory Infections": [
      "Pneumonia",
      "Influenza",
      "Tonsillitis",
      "Bronchitis",
      "Other upper respiratory infections"
    ],
    "Chronic and Obstructive Pulmonary Diseases": [
      "Chronic obstructive pulmonary disease",
      "Asthma"   
    ],
    "Other Respiratory Conditions": [
      "Aspiration pneumonia",
      "Pleurisy",
      "Respiratory distress",
      "Adult respiratory failure",
      "Lung diseases due to external agents",
      "Other lower respiratory diseases",
      "Other upper respiratory diseases"
    ],
  },
  "Diseases of the Digestive System": {
    "Upper Gastrointestinal Disorders": [
      "Esophageal disease",
      "Gastric and duodenal ulcer",
      "Gastritis",
      "Other diseases of stomach",
      "Gastroenteritis",
      "Gastrointestinal hemorrhage"
    ],
    "Lower Gastrointestinal and Abdominal Disorders": [
      "Appendicitis",
      "Ulcerative colitis",
      "Intestinal obstruction",
      "Diverticulosis",
      "Anal and rectal conditions",
      "Abdominal hernia",
      "Peritonitis",
      "Hemorrhoids",
      "Other gastrointestinal diseases"
    ],
    "Hepatic and Pancreatic Disorders": [      
      "Other liver diseases",
      "Pancreas disease",
      "Biliary disease"
    ],
    "Oral and Dental Conditions": [
      "Teeth diseases",
      "Mouth diseases"
    ]
  },
  "Diseases of the Genitourinary System": {
    "Renal and Urinary Tract Disorders": [
      "Nephritis",
      "Acute renal failure",
      "Chronic kidney disease",
      "Urinary tract infection",
      "Urinary stone",
      "Other diseases of kidney",
      "Other diseases of bladder",
      "Other genitourinary diseases"      
    ],
    "Reproductive Disorders": [
      "Pelvic inflammatory disease",
      "Endometriosis",
      "Prolapse",
      "Menstrual disorders",
      "Ovarian cyst",
      "Menopausal disorders",
      "Female infertility",
      "Breast disease",
      "Contraceptive management",
      "Other female genital disorders",
      "Infections of male genital organs",
      "Other male genital disorders",
      "Benign prostatic hyperplasia"
    ]
  },
  "Pregnancy; Childbirth; and Postpartum Complications": {
    "Pregnancy Complications": [
      "Induced abortion",
      "Spontaneous abortion",
      "Abortion complications",
      "Ectopic pregnancy",
      "Long pregnancy",
      "Early labor",
      "Malposition of fetus",
      "Pelvic obstruction",
      "Diabetes mellitus in pregnancy",
      "Hypertension in pregnancy",
      "Hemorrhage in pregnancy",
      "Previous C-section",
      "Other pregnancy complications"
   ],
    "Labor and Delivery Complications": [
      "Fetal distress",
      "Amniotic fluid disorders",
      "Umbilical cord complications",
      "Obstetrics-related perinatal trauma",
      "Forceps delivery",
      "Other complications of birth",
      "Other pregnancy and delivery including normal"
    ],
    "Postpartum and Puerperal Complications": [
      "Osteoarthros",
      "Other joint diseases"
    ]
  },
  "Diseases of the Musculoskeletal and Connective Tissue": {
    "Autoimmune and Connective Tissue Disorders": [
      "Systemic lupus erythematosus",
      "Infectious arthritis",
      "Rheumatoid arthritis",
      "Other connective tissue disorders"
    ],
    "Skeletal and Acquired Musculoskeletal Disorders": [
      "Osteoporosis",
      "Other bone diseases",
      "Pathological fracture",
      "Back problem",
      "Acquired foot deformity"
    ]
  },
  "Diseases of the Nervous System and Sense Organs": {
    "Central Nervous System Disorders": [
      "Epilepsy and convulsive disorders",
      "Meningitis",
      "Encephalitis",
      "Other CNS infections",
      "Parkinson's disease",
      "Multiple sclerosis",
      "Other hereditary central nervous system diseases",
      "Other nervous system diseases",
      "Headache and migraine",
      "Coma and brain damage",
      "Paralysis"
    ],
    "Sensory and Vestibular Disorders": [
      "Eye infection",
      "Cataract",
      "Retinal disease",
      "Glaucoma",
      "Other eye diseases",
      "Otitis media",
      "Dizziness",
      "Other ear diseases"
    ]
  },
  "Endocrine; Nutritional; and Metabolic Diseases": {
    "Endocrine and Diabetic Disorders": [
      "Thyroid disorders",
      "Diabetes Mellitus without complications",
      "Diabetes Mellitus with complications",
      "Other endocrine disorders"
    ],
    "Nutritional and Metabolic Disorders": [
      "Nutritional deficiencies",
      "Hyperlipidemia",
      "Gout and other crystal arthropathies",
      "Fluid and electrolyte disorders",
      "Cystic fibrosis",
      "Other nutritional and metabolic disorders"
    ]
  },
  "Infectious and Parasitic Diseases": {
    "Bacterial and Septic Infections": [
      "Tuberculosis",
      "Septicemia",
      "Intestinal infection",
      "Other bacterial infections"
    ],
    "Viral; Mycotic; and Other Infections": [
      "HIV infection",
      "Hepatitis",
      "Viral infection",
      "Mycoses",
      "Other infectious diseases"
    ],
    "Sexually Transmitted and Preventive Conditions": [
      "Sexual Infections",
      "Immunization and screening"
    ]
  },
  "Diseases of the Skin and Subcutaneous Tissue": {
    "Inflammatory and Infectious Skin Disorders": [
      "Skin infection",
      "Skin ulcer",
      "Other skin diseases",
      "Other inflammatory skin conditions",
      "Other acquired deformities",
    ]
  },
  "Injury; Poisoning; and External Causes": {
    "Physical Trauma and Injuries": [
      "Sprain",
      "Intracranial injury",
      "Crush injury",
      "Open wound of head",
      "Open wound of extremity",
      "Superficial injury",
      "Burns",
      "Other injury"
    ],
    "Toxicological and Iatrogenic Complications": [
      "Poisoning by psychiatric medications",
      "Poisoning by other medications",
      "Poisoning by non-medications",
      "Procedure complications",
      "Device complications"
    ]
  },
  "Symptoms, Signs, and Other Conditions": {
    "Symptoms and Signs": [
      "Syncope",
      "Fever of unknown origin",
      "Lymph node enlargement",
      "Gangrene",
      "Shock",
      "Nausea and vomiting",
      "Abdominal pain",
      "Fatigue",
      "Allergy",
      "Unclassified"
    ],
    "Aftercare and Other Issues": [
      "Rehabilitation",
      "Social and administrative problems",
      "Examination and evaluation",
      "Other aftercare",
      "Other screening"
    ]
  },
  "Mental; Behavioral; and Neurodevelopmental Disorders": {
    "Neurodevelopmental and Pediatric Disorders": [
      "Developmental disorders",
      "Disorders diagnosed in infancy/childhood",
      "Attention-deficit/conduct/disruptive behavior disorders",
      "Impulse control disorders",
      "Blindness"
    ],
    "Mood; Anxiety; and Cognitive Disorders": [
      "Adjustment disorders",
      "Anxiety disorders",
      "Mood disorders",
      "Personality disorders"
    ],
    "Psychotic and Substance Use Disorders": [
      "Schizophrenia and other psychotic disorders",
      "Alcohol-related disorders",
      "Substance-related disorders"
    ],
    "Other Conditions and Events": [
      "Delirium/dementia/amnesia/cognitive disorders",
      "Miscellaneous mental health disorders",
      "Suicide and intentional self-inflicted injury",
      "Mental health screening or history"
    ]
  },
  "Neoplasms": {
    "Gastrointestinal Cancers": [
      "Esophageal cancer",
      "Stomach cancer",
      "Colon cancer",
      "Rectal or anal cancer",
      "Pancreatic cancer",
      "Gastrointestinal and peritoneal cancer"      
    ],
    "Head, Neck, and Thoracic Cancers": [
      "Head or neck cancer",
      "Bronchus/lung cancer",
      "Other respiratory cancer"
    ],
    "Urogenital and Reproductive Cancers": [
      "Breast cancer",
      "Uterine cancer",
      "Ovary cancer",
      "Cervical cancer",      
      "Female genital cancer",
      "Prostate cancer",
      "Testicular cancer",
      "Male genital cancer",
      "Bladder cancer",
      "Kidney and renal cancer",
      "Urinary organ cancer"
    ],
    "Hematologic and Endocrine Cancers": [
      "Hodgkin's disease",
      "Non-Hodgkin lymphoma",
      "Leukemias",
      "Multiple myeloma",
      "Thyroid cancer"      
    ],
    "Cancers of Other Systems": [
      "Brain/nervous system cancer",
      "Bone/connective tissue cancer",
      "Skin melanoma",
      "Non-epithelial cancer",
      "Other primary cancer",
      "Secondary malignancy",
      "Liver or inflammatory bowel disease cancer"
    ],
    "Benign Neoplasms": [
      "Benign uterine neoplasm",
      "Other benign neoplasm"
    ],
    "Unspecified Neoplasms and Other Conditions": [
      "Malignant neoplasm",
      "Neoplasm unspecified",
      "Maintenance chemotherapy/radiation"
    ]
  },
  "External Causes of Morbidity (E-Codes)": {
    "Environmental; Mechanical; and Intentional Injuries": [
      "E codes: Cut or pierce",
      "E codes: Drowning or submersion",
      "E codes: Fall",
      "E codes: Fire or burn",
      "E codes: Firearm",
      "E codes: Machinery",
      "E codes: Motor vehicle traffic",
      "E codes: Pedal cyclist",
      "E codes: Pedestrian",
      "E codes: Transport not motor vehicle traffic",
      "E codes: Environmental causes",
      "E codes: Overexertion",
      "E codes: Poisoning",
      "E codes: Struck by or against",
      "E codes: Suffocation",
      "E codes: Adverse effects of medical care",
      "E codes: Adverse effects of medical drugs",
      "E codes: Other specified and classifiable",
      "E codes: Other specified NEC",
      "E codes: Unspecified",
      "E codes: Place of occurrence"
    ]
  }
}

In [45]:
def generate_diagnosis_list(codes, hadm_id): 
    try:
        descs = []
        for code in codes:
            if code in css_single_to_desc:
                descs.append(css_single_to_desc[code])
            else:
                raise KeyError(f"Code {code} not found")
        return ", ".join(descs)
    
    except KeyError as e:
        print(f"Missing CCS code in hadm_id {hadm_id}: {e}")
        return f"Missing CCS code in hadm_id {hadm_id}: {e}"

In [46]:
df_diag_codes['diagnosis_list'] = df_diag_codes.apply(
    lambda row: generate_diagnosis_list(row['ccs_single_seqs'], row['hadm_id']),
    axis=1
)

In [47]:
def convert_leaf_names(hierarchy, conversion_dict):
    new_hierarchy = {}
    for top_cat, mid_dict in hierarchy.items():
        new_hierarchy[top_cat] = {}
        for mid_cat, leaves in mid_dict.items():
            converted_leaves = [
                conversion_dict.get(leaf, leaf) for leaf in leaves
            ]
            new_hierarchy[top_cat][mid_cat] = converted_leaves
    return new_hierarchy

In [48]:
abbrev_hierarchy = convert_leaf_names(diagnostic_hierarchy, long_to_short)

In [49]:
leaf_to_mid = {}
for top_cat, mids in abbrev_hierarchy.items():
    for mid_cat, leaves in mids.items():
        for leaf in leaves:
            leaf_to_mid[leaf] = mid_cat

def map_diag_to_mid(diag_list):
    return list(set(leaf_to_mid.get(d, None) for d in diag_list if d in leaf_to_mid))

In [50]:
df_diag_codes['diagnosis_mid_categories'] = df_diag_codes['diagnosis_list'].apply(
    lambda x: map_diag_to_mid([d.strip() for d in x.split(',')])
)

In [51]:
mid_to_top = {}
for top_cat, mid_dict in diagnostic_hierarchy.items():
    for mid_cat in mid_dict:
        mid_to_top[mid_cat] = top_cat

def get_top_level_categories(mid_list):
    mids = set(mid_list)  # ensure uniqueness
    tops = {mid_to_top[mid] for mid in mids if mid in mid_to_top}
    return list(tops)

In [52]:
df_diag_codes['diagnosis_top_categories'] = df_diag_codes['diagnosis_mid_categories'].apply(get_top_level_categories)

In [53]:
df_diag_codes['diagnosis_list'] = df_diag_codes['diagnosis_list'].apply(lambda x: [item.strip() for item in x.split(',')])

## 5. Data Check and Save

In [54]:
def find_empty_list(df):

    def is_empty_list(x):
        return isinstance(x, list) and len(x) == 0
    
    # Apply to each cell in the dataframe
    empty_list_mask = df.applymap(is_empty_list)
    
    # Find rows that contain at least one empty list
    rows_with_empty_lists = df[empty_list_mask.any(axis=1)]
    
    return rows_with_empty_lists

In [55]:
find_empty_list(df_diag_codes)

Unnamed: 0,subject_id,hadm_id,diag_seqs,ccs_multi_seqs,ccs_single_seqs,diagnosis_list,diagnosis_mid_categories,diagnosis_top_categories


In [56]:
df_pat_adm_dis_notes_diag_codes = pd.merge(df_pat_adm_dis_notes, df_diag_codes, on = ['subject_id', 'hadm_id'], how = 'left')

In [57]:
diag_seqs_null_rows = df_pat_adm_dis_notes_diag_codes[df_pat_adm_dis_notes_diag_codes['diag_seqs'].isnull()]

In [58]:
diag_seqs_null_rows_pids = list(set(diag_seqs_null_rows.subject_id.unique()))

In [59]:
len(diag_seqs_null_rows_pids)

38558

In [60]:
df_pat_adm_dis_notes_diag_codes = df_pat_adm_dis_notes_diag_codes[~df_pat_adm_dis_notes_diag_codes['subject_id'].isin(diag_seqs_null_rows_pids)]

In [61]:
df_pat_adm_dis_notes_diag_codes = df_pat_adm_dis_notes_diag_codes.sort_values(by=['subject_id', 'admittime']).reset_index(drop=True)

In [62]:
def check_consistency(row):
    lengths = [len(x) for x in row[1:]]
    return all(length == lengths[0] for length in lengths)

In [63]:
df_pat_adm_dis_notes_diag_codes_filtered = df_pat_adm_dis_notes_diag_codes.groupby('subject_id').agg(lambda x: list(x)).reset_index()

In [64]:
df_pat_adm_dis_notes_diag_codes_filtered['consistency'] = df_pat_adm_dis_notes_diag_codes_filtered.apply(check_consistency, axis=1)

In [65]:
set(df_pat_adm_dis_notes_diag_codes_filtered['consistency'].to_list())

{True}

In [66]:
len(df_pat_adm_dis_notes_diag_codes_filtered.subject_id.unique())

57172

In [67]:
df_pat_adm_dis_notes_diag_codes_filtered

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod,hadm_id,admittime,dischtime,deathtime,...,post_disch_time,visit_interval,text,diag_seqs,ccs_multi_seqs,ccs_single_seqs,diagnosis_list,diagnosis_mid_categories,diagnosis_top_categories,consistency
0,10000032,"[F, F, F, F]","[52, 52, 52, 52]","[2180, 2180, 2180, 2180]","[2014 - 2016, 2014 - 2016, 2014 - 2016, 2014 -...","[2180-09-09, 2180-09-09, 2180-09-09, 2180-09-09]","[22595853.0, 22841357.0, 29079034.0, 25742920.0]","[2180-05-06 22:23:00, 2180-06-26 18:27:00, 218...","[2180-05-07 17:15:00, 2180-06-27 18:49:00, 218...","[NaT, NaT, NaT, NaT]",...,"[nan, nan, nan, nan]","[0.0, 50.05, 25.740277777777777, 11.2423611111...",[service medicine allergies no known allergies...,"[[5723, 78959, 5715, 07070, 496, 29680, 30981,...","[[8, 1, 5, 9], [1, 9, 8, 3, 4, 5], [1, 9, 8, 3...","[[6, 657, 663, 151, 651, 127], [6, 663, 62, 15...","[[Hepatitis, Mood disorders, Screening and his...","[[Hepatic and Pancreatic Disorders, Other Cond...",[[Mental; Behavioral; and Neurodevelopmental D...,True
1,10000084,[M],[72],[2160],[2017 - 2019],[2161-02-13],[23052089.0],[2160-11-21 01:56:00],[2160-11-25 14:52:00],[NaT],...,[nan],[0.0],[service medicine allergies no known allergies...,"[[33182, 29410, 36816, 78199, 2724, V1046]]","[[6, 3, 13, 2, 5]]","[[89, 653, 29, 211, 53]]","[[Blindness, Delirium/dementia/amnestic/other ...","[[Autoimmune and Connective Tissue Disorders, ...",[[Mental; Behavioral; and Neurodevelopmental D...,True
2,10000826,"[F, F, F]","[32, 32, 32]","[2146, 2146, 2146]","[2008 - 2010, 2008 - 2010, 2008 - 2010]","[nan, nan, nan]","[20032235.0, 21086876.0, 28289260.0]","[2146-12-05 19:07:00, 2146-12-18 17:39:00, 214...","[2146-12-12 16:30:00, 2146-12-24 19:55:00, 214...","[NaT, NaT, NaT]",...,"[nan, nan, nan]","[0.0, 6.047916666666667, 6.2]",[name . unit no admission date discharge date ...,"[[5712, 486, 78959, 5723, 5990, 2639, 2761, 51...","[[9, 10, 8, 3, 4, 5], [1, 9, 10, 8, 3, 5], [9,...","[[122, 159, 151, 155, 59, 55, 651, 130, 52, 66...","[[Pneumonia, UTI, Oth liver dx, Other GI dx, A...","[[Hepatic and Pancreatic Disorders, Psychotic ...",[[Mental; Behavioral; and Neurodevelopmental D...,True
3,10000883,[M],[20],[2124],[2008 - 2010],[nan],[25221576.0],[2124-05-14 21:11:00],[2124-05-22 10:40:00],[NaT],...,[nan],[0.0],[service psychiatry allergies patient recorded...,"[[29620, 30000, 30989]]",[[5]],"[[650, 651, 657]]","[[Adjustment disorders, Anxiety disorders, Moo...",[[Mood; Anxiety; and Cognitive Disorders]],[[Mental; Behavioral; and Neurodevelopmental D...,True
4,10000935,"[F, F, F]","[52, 52, 52]","[2182, 2182, 2182]","[2008 - 2010, 2008 - 2010, 2008 - 2010]","[2187-11-12, 2187-11-12, 2187-11-12]","[29541074.0, 26381316.0, 25849114.0]","[2183-10-28 09:55:00, 2187-08-23 21:22:00, 218...","[2183-11-04 13:11:00, 2187-08-27 15:35:00, 218...","[NaT, NaT, NaT]",...,"[nan, nan, nan]","[0.0, 1388.3409722222223, 44.14861111111111]",[service surgery allergies sulfonamides codein...,"[[1970, 34830, 1977, 1539, 2762, 5780, 2869, 2...","[[9, 6, 10, 7, 3, 13, 4, 18, 2, 5], [9, 6, 8, ...","[[42, 106, 153, 657, 62, 163, 52, 211, 14, 63,...","[[2ndary malig, Dysrhythmia, GI hemorrhag, Moo...","[[Upper Gastrointestinal Disorders, Autoimmune...",[[Mental; Behavioral; and Neurodevelopmental D...,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57167,19999303,[F],[61],[2161],[2008 - 2010],[nan],[23567530.0],[2161-04-03 15:40:00],[2161-04-06 10:45:00],[NaT],...,[nan],[0.0],[service surgery allergies sulfonamides doxycy...,"[[5772, 20929, 57420, 4019, 33818, 78702, 7832...","[[9, 6, 7, 3, 17, 2, 5]]","[[149, 43, 657, 152, 98, 95, 250, 58, 22]]","[[Biliary dx, Malig neopls, Mood disorders, Pa...","[[Hepatic and Pancreatic Disorders, Cancers of...",[[Mental; Behavioral; and Neurodevelopmental D...,True
57168,19999625,"[M, M]","[81, 81]","[2138, 2138]","[2008 - 2010, 2008 - 2010]","[nan, nan]","[27638769.0, 25304202.0]","[2138-10-06 17:27:00, 2139-10-10 18:06:00]","[2138-10-09 16:56:00, 2139-10-16 03:30:00]","[NaT, NaT]",...,"[nan, nan]","[0.0, 366.0486111111111]",[service medicine allergies aspirin attending ...,"[[486, 5849, 2760, 5070, 33182, 29410, 78720, ...","[[9, 10, 7, 8, 3, 18, 12, 4, 5], [9, 10, 7, 8,...","[[99, 129, 122, 62, 157, 155, 164, 653, 55, 15...","[[Htn complicn, Asp pneumon, Pneumonia, Coag/h...","[[Reproductive Disorders, Other Respiratory Co...",[[Mental; Behavioral; and Neurodevelopmental D...,True
57169,19999784,[M],[57],[2119],[2017 - 2019],[nan],[25180002.0],[2119-12-30 09:54:00],[2120-01-04 16:20:00],[NaT],...,[nan],[0.0],[service medicine allergies no known allergies...,"[[V5811, 20280, 28803, 2731, 73679, V1582, 344...","[[6, 3, 13, 4, 18, 2, 17, 5]]","[[38, 45, 2617, 63, 663, 2621, 253, 208, 82, 58]]","[[Non-Hodg lym, Maint chem/r, E Codes: Adverse...",[[Environmental; Mechanical; and Intentional I...,[[Mental; Behavioral; and Neurodevelopmental D...,True
57170,19999828,"[F, F]","[46, 46]","[2147, 2147]","[2017 - 2019, 2017 - 2019]","[nan, nan]","[29734428.0, 25744818.0]","[2147-07-18 16:23:00, 2149-01-08 16:44:00]","[2147-08-04 18:10:00, 2149-01-18 17:00:00]","[NaT, NaT]",...,"[nan, nan]","[0.0, 522.9402777777777]",[service surgery allergies lamictal hydrochlor...,"[[56981, 2863, 6822, 99639, E8788, E8499, 4019...","[[1, 9, 7, 8, 18, 3, 4, 16, 12, 17, 5], [1, 9,...","[[106, 663, 62, 155, 2621, 98, 3, 197, 55, 237...","[[Dysrhythmia, Screening and history of mental...","[[Toxicological and Iatrogenic Complications, ...",[[Mental; Behavioral; and Neurodevelopmental D...,True


In [68]:
df_pat_adm_dis_notes_diag_codes_filtered['length'] = df_pat_adm_dis_notes_diag_codes_filtered['gender'].apply(lambda x: len(x))

In [69]:
len_one_df = df_pat_adm_dis_notes_diag_codes_filtered[df_pat_adm_dis_notes_diag_codes_filtered['length'] == 1]

In [70]:
len_one_df_pids = list(set(len_one_df.subject_id.unique()))
len(len_one_df_pids)

25022

In [71]:
df_pat_adm_dis_notes_diag_codes_filtered = df_pat_adm_dis_notes_diag_codes_filtered[~df_pat_adm_dis_notes_diag_codes_filtered['subject_id'].isin(len_one_df_pids)]
df_pat_adm_dis_notes_diag_codes_filtered = df_pat_adm_dis_notes_diag_codes_filtered.reset_index(drop=True)

In [72]:
def remove_single_visit_subject_ids(df, len_one_df_pids):
    df = df[~df['subject_id'].isin(len_one_df_pids)]
    df = df.reset_index(drop=True)
    return df

In [73]:
df_pat_adm_dis_notes_diag_codes_filtered = remove_single_visit_subject_ids(df_pat_adm_dis_notes_diag_codes, len_one_df_pids)

In [74]:
df_pat_adm_dis_notes_diag_codes_filtered['visit_interval'] = df_pat_adm_dis_notes_diag_codes_filtered['visit_interval'].fillna(0)

In [75]:
df_pat_adm_dis_notes_diag_codes_filtered.isnull().sum()

subject_id                       0
gender                           0
anchor_age                       0
anchor_year                      0
anchor_year_group                0
dod                          69725
hadm_id                          0
admittime                        0
dischtime                        0
deathtime                   120200
admission_type                   0
admit_provider_id                0
admission_location               0
discharge_location               0
insurance                        0
language                         0
marital_status                   0
race                             0
edregtime                    28384
edouttime                    28384
hospital_expire_flag             0
age_group                        0
next_admittime               29652
next_admission_type          29652
days_next_admit              29652
length_of_stay                   0
post_disch_time             120200
visit_interval                   0
text                

In [76]:
df_pat_adm_dis_notes_diag_codes_filtered['admittime'] = pd.to_datetime(df_pat_adm_dis_notes_diag_codes_filtered['admittime'])
df_pat_adm_dis_notes_diag_codes_filtered = df_pat_adm_dis_notes_diag_codes_filtered.sort_values(by=['subject_id', 'admittime']).reset_index(drop=True)

In [77]:
df_pat_adm_dis_notes_diag_codes_filtered.columns

Index(['subject_id', 'gender', 'anchor_age', 'anchor_year',
       'anchor_year_group', 'dod', 'hadm_id', 'admittime', 'dischtime',
       'deathtime', 'admission_type', 'admit_provider_id',
       'admission_location', 'discharge_location', 'insurance', 'language',
       'marital_status', 'race', 'edregtime', 'edouttime',
       'hospital_expire_flag', 'age_group', 'next_admittime',
       'next_admission_type', 'days_next_admit', 'length_of_stay',
       'post_disch_time', 'visit_interval', 'text', 'diag_seqs',
       'ccs_multi_seqs', 'ccs_single_seqs', 'diagnosis_list',
       'diagnosis_mid_categories', 'diagnosis_top_categories'],
      dtype='object')

In [78]:
df_pat_adm_dis_notes_diag_codes_filtered['record_id'] = df_pat_adm_dis_notes_diag_codes_filtered['subject_id'].astype(str) + '_' + df_pat_adm_dis_notes_diag_codes_filtered['hadm_id'].astype(str)

In [79]:
df_pat_adm_dis_notes_diag_codes_filtered.rename(columns={'text': 'discharge_summary'}, inplace=True)

In [80]:
df_pat_adm_dis_notes_diag_codes_filtered["next_diagnosis_bottom_categories"] = df_pat_adm_dis_notes_diag_codes_filtered.groupby("subject_id")['diagnosis_list'].shift(-1)

In [81]:
df_pat_adm_dis_notes_diag_codes_filtered["next_diagnosis_mid_categories"] = df_pat_adm_dis_notes_diag_codes_filtered.groupby("subject_id")['diagnosis_mid_categories'].shift(-1)

In [82]:
df_pat_adm_dis_notes_diag_codes_filtered["next_diagnosis_top_categories"] = df_pat_adm_dis_notes_diag_codes_filtered.groupby("subject_id")['diagnosis_top_categories'].shift(-1)

In [83]:
df_pat_adm_dis_notes_diag_codes_filtered = df_pat_adm_dis_notes_diag_codes_filtered.dropna(subset=["next_diagnosis_bottom_categories"]).reset_index(drop=True)

In [84]:
df_pat_adm_dis_notes_diag_codes_filtered = df_pat_adm_dis_notes_diag_codes_filtered.dropna(subset=["next_diagnosis_mid_categories"]).reset_index(drop=True)

In [85]:
df_pat_adm_dis_notes_diag_codes_filtered = df_pat_adm_dis_notes_diag_codes_filtered.dropna(subset=["next_diagnosis_top_categories"]).reset_index(drop=True)

In [86]:
len(df_pat_adm_dis_notes_diag_codes_filtered.subject_id.unique())

32150

In [87]:
find_empty_list(df_pat_adm_dis_notes_diag_codes_filtered)

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod,hadm_id,admittime,dischtime,deathtime,...,diag_seqs,ccs_multi_seqs,ccs_single_seqs,diagnosis_list,diagnosis_mid_categories,diagnosis_top_categories,record_id,next_diagnosis_bottom_categories,next_diagnosis_mid_categories,next_diagnosis_top_categories


In [88]:
os.makedirs(out_dir, exist_ok=True)

In [89]:
with open(os.path.join(out_dir, "diagnostic_hierarchy.txt"), "w") as f:
    json.dump(diagnostic_hierarchy, f, indent=2)

In [90]:
def reduce_df_by_seq(df, visit_number=5):
    df["visit_number_from_last"] = df.groupby("subject_id").cumcount(ascending=False) + 1
    
    df_simplified = df[df["visit_number_from_last"] <= visit_number]
    
    return df_simplified.reset_index(drop=True)

In [91]:
df_pat_adm_dis_notes_diag_codes_filtered_truncated = reduce_df_by_seq(df_pat_adm_dis_notes_diag_codes_filtered, 5)

In [92]:
df_pat_adm_dis_notes_diag_codes_filtered_truncated = df_pat_adm_dis_notes_diag_codes_filtered_truncated.groupby('subject_id').agg(lambda x: list(x)).reset_index()

In [93]:
columns_with_lists = ["age_group", "gender", "insurance", "race", "next_diagnosis_mid_categories", "next_diagnosis_top_categories"]

for col in columns_with_lists:
    df_pat_adm_dis_notes_diag_codes_filtered_truncated[col] = df_pat_adm_dis_notes_diag_codes_filtered_truncated[col].apply(lambda x: x[-1] if isinstance(x, list) and len(x) > 0 else None)

In [94]:
find_empty_list(df_pat_adm_dis_notes_diag_codes_filtered_truncated)

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod,hadm_id,admittime,dischtime,deathtime,...,ccs_multi_seqs,ccs_single_seqs,diagnosis_list,diagnosis_mid_categories,diagnosis_top_categories,record_id,next_diagnosis_bottom_categories,next_diagnosis_mid_categories,next_diagnosis_top_categories,visit_number_from_last


In [95]:
def stratified_sample_exact(df, stratify_cols, n_samples, random_state=42):
    assert n_samples <= len(df), "n_samples cannot exceed dataset size"

    rng = np.random.RandomState(random_state)

    # Create composite key
    df["_stratum"] = df[stratify_cols].astype(str).agg("-".join, axis=1)

    # Compute stratum sizes
    stratum_counts = df["_stratum"].value_counts(normalize=True)

    # Compute number of samples per stratum
    stratum_samples = (stratum_counts * n_samples).round().astype(int)

    # Adjust to ensure total exactly equals n_samples
    diff = n_samples - stratum_samples.sum()
    if diff != 0:
        # Adjust the largest strata up or down to correct rounding
        adjust_indices = stratum_samples.sort_values(ascending=(diff < 0)).index
        for i in adjust_indices[:abs(diff)]:
            stratum_samples[i] += 1 if diff > 0 else -1

    # Sample from each stratum
    sampled_df_list = []
    for stratum, n in stratum_samples.items():
        subset = df[df["_stratum"] == stratum]
        if n > len(subset):
            raise ValueError(f"Stratum '{stratum}' has only {len(subset)} samples, requested {n}")
        sampled_df_list.append(subset.sample(n=n, random_state=rng))

    df = pd.concat(sampled_df_list).drop(columns=["_stratum"]).reset_index(drop=True)
    return df

In [96]:
stratified_df_pat_adm_dis_notes_diag_codes_filtered_truncated = stratified_sample_exact(
    df_pat_adm_dis_notes_diag_codes_filtered_truncated,
    stratify_cols=["age_group", "gender", "race", "insurance"],
    n_samples=8000,
    random_state=42
)

In [97]:
stratified_df_pat_adm_dis_notes_diag_codes_filtered_truncated

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod,hadm_id,admittime,dischtime,deathtime,...,ccs_multi_seqs,ccs_single_seqs,diagnosis_list,diagnosis_mid_categories,diagnosis_top_categories,record_id,next_diagnosis_bottom_categories,next_diagnosis_mid_categories,next_diagnosis_top_categories,visit_number_from_last
0,14777603,F,"[84, 84, 84, 84]","[2177, 2177, 2177, 2177]","[2008 - 2010, 2008 - 2010, 2008 - 2010, 2008 -...","[nan, nan, nan, nan]","[20457736.0, 22808936.0, 21546481.0, 20305987.0]","[2177-06-03 02:53:00, 2179-06-10 16:44:00, 218...","[2177-06-07 20:30:00, 2179-06-14 19:16:00, 218...","[NaT, NaT, NaT, NaT]",...,"[[9, 6, 10, 7, 3, 13], [9, 6, 10, 7, 3, 13, 17...","[[138, 205, 157, 155, 98, 55, 101, 95], [138, ...","[[Esophgeal dx, Back problem, Ac renl fail, Ot...","[[Upper Gastrointestinal Disorders, Central Ne...",[[Endocrine; Nutritional; and Metabolic Diseas...,"[14777603_20457736.0, 14777603_22808936.0, 147...","[[Esophgeal dx, Ovarian cyst, Other GU dx, Oth...","[Upper Gastrointestinal Disorders, Central Ner...",[Mental; Behavioral; and Neurodevelopmental Di...,"[4, 3, 2, 1]"
1,15192733,F,[81],[2156],[2011 - 2013],[2161-09-18],[27599089.0],[2156-05-01 16:58:00],[2156-05-04 13:40:00],[NaT],...,"[[7, 8, 3, 18, 16, 13, 14, 5]]","[[106, 108, 206, 131, 213, 663, 53, 98, 96, 25...","[[Dysrhythmia, chf;nonhp, Osteoporosis, Adlt r...","[[Other Respiratory Conditions, Other Conditio...",[[Mental; Behavioral; and Neurodevelopmental D...,[15192733_27599089.0],"[[Dysrhythmia, chf;nonhp, Htn complicn, Chr ki...","[Nutritional and Metabolic Disorders, Aftercar...","[Diseases of the Circulatory System, Endocrine...",[1]
2,11433898,F,"[85, 85, 85]","[2193, 2193, 2193]","[2011 - 2013, 2011 - 2013, 2011 - 2013]","[2195-02-06, 2195-02-06, 2195-02-06]","[24567127.0, 21838620.0, 24510198.0]","[2193-05-21 10:38:00, 2193-06-08 13:04:00, 219...","[2193-05-28 16:44:00, 2193-06-10 17:31:00, 219...","[NaT, NaT, NaT]",...,"[[9, 6, 10, 7, 8, 3, 13, 17], [9, 6, 10, 7, 3,...","[[106, 206, 138, 48, 135, 157, 105, 55, 101, 5...","[[Dysrhythmia, Osteoporosis, Esophgeal dx, Thy...","[[Upper Gastrointestinal Disorders, Endocrine ...",[[Endocrine; Nutritional; and Metabolic Diseas...,"[11433898_24567127.0, 11433898_21838620.0, 114...","[[Dysrhythmia, chf;nonhp, Osteoporosis, Biliar...","[Other Respiratory Conditions, Respiratory Inf...",[Mental; Behavioral; and Neurodevelopmental Di...,"[3, 2, 1]"
3,17968572,F,[91],[2150],[2008 - 2010],[nan],[22678798.0],[2150-11-14 11:26:00],[2150-11-15 20:00:00],[NaT],...,"[[9, 7, 8, 18, 13, 16, 5]]","[[138, 131, 2603, 122, 657, 205, 663, 231, 262...","[[Esophgeal dx, Adlt resp fl, E Codes: Fall, P...","[[Upper Gastrointestinal Disorders, Autoimmune...",[[Mental; Behavioral; and Neurodevelopmental D...,[17968572_22678798.0],"[[Esophgeal dx, Delirium/dementia/amnestic/oth...","[Upper Gastrointestinal Disorders, Autoimmune ...",[Mental; Behavioral; and Neurodevelopmental Di...,[1]
4,15052507,F,"[75, 75]","[2147, 2147]","[2008 - 2010, 2008 - 2010]","[nan, nan]","[24692659.0, 26666141.0]","[2147-05-15 00:25:00, 2153-03-14 00:40:00]","[2147-05-17 15:14:00, 2153-03-17 15:20:00]","[NaT, NaT]",...,"[[7, 8, 3, 13, 18, 2, 5], [10, 7, 8, 18, 3, 13...","[[206, 19, 48, 663, 98, 130, 259, 53], [19, 48...","[[Osteoporosis, Brnch/lng ca, Thyroid dsor, Sc...","[[Head, Neck, and Thoracic Cancers, Endocrine ...",[[Mental; Behavioral; and Neurodevelopmental D...,"[15052507_24692659.0, 15052507_26666141.0]","[[Brnch/lng ca, Thyroid dsor, Screening and hi...","[Head, Neck, and Thoracic Cancers, Endocrine a...","[External Causes of Morbidity (E-Codes), Endoc...","[2, 1]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,14872473,M,"[33, 33, 33, 33]","[2180, 2180, 2180, 2180]","[2011 - 2013, 2011 - 2013, 2011 - 2013, 2011 -...","[nan, nan, nan, nan]","[25425893.0, 26155522.0, 20699065.0, 22805038.0]","[2180-06-18 20:51:00, 2181-02-13 22:16:00, 218...","[2180-07-02 13:30:00, 2181-02-18 11:26:00, 218...","[NaT, NaT, NaT, NaT]",...,"[[7, 5, 8, 18], [3, 4, 5, 18], [5], [5]]","[[106, 2617, 122, 259, 659], [654, 663, 661, 5...","[[Dysrhythmia, E Codes: Adverse effects of med...","[[Psychotic and Substance Use Disorders, Envir...",[[Mental; Behavioral; and Neurodevelopmental D...,"[14872473_25425893.0, 14872473_26155522.0, 148...","[[Developmental disorders, Screening and histo...","[Symptoms and Signs, Psychotic and Substance U...",[Mental; Behavioral; and Neurodevelopmental Di...,"[4, 3, 2, 1]"
7996,10506015,M,"[25, 25, 25, 25, 25]","[2172, 2172, 2172, 2172, 2172]","[2011 - 2013, 2011 - 2013, 2011 - 2013, 2011 -...","[nan, nan, nan, nan, nan]","[25008660.0, 25568011.0, 22963499.0, 26204855....","[2173-03-04 20:57:00, 2173-04-17 18:04:00, 217...","[2173-03-08 17:54:00, 2173-04-22 15:56:00, 217...","[NaT, NaT, NaT, NaT, NaT]",...,"[[6, 10, 7, 3, 18, 16], [10, 7, 6, 3], [1, 6, ...","[[48, 2621, 98, 237, 95, 160, 2616, 58], [99, ...","[[Thyroid dsor, E Codes: Place of occurrence, ...","[[Endocrine and Diabetic Disorders, Toxicologi...","[[External Causes of Morbidity (E-Codes), Endo...","[10506015_25008660.0, 10506015_25568011.0, 105...","[[Htn complicn, Chr kidney disease, Oth nerv d...","[Toxicological and Iatrogenic Complications, S...",[Endocrine; Nutritional; and Metabolic Disease...,"[5, 4, 3, 2, 1]"
7997,14001416,F,[61],[2167],[2011 - 2013],[2171-08-18],[29099098.0],[2167-04-08 21:59:00],[2167-04-15 14:27:00],[NaT],...,"[[1, 6, 10, 7, 8, 18, 4, 5]]","[[83, 131, 159, 6, 657, 62, 653, 3, 59, 95, 25...","[[Epilepsy/cnv, Adlt resp fl, UTI, Hepatitis, ...","[[Psychotic and Substance Use Disorders, Other...",[[Mental; Behavioral; and Neurodevelopmental D...,[14001416_29099098.0],"[[Epilepsy/cnv, Adlt resp fl, Screening and hi...","[Psychotic and Substance Use Disorders, Other ...",[Mental; Behavioral; and Neurodevelopmental Di...,[1]
7998,12728074,F,[62],[2187],[2011 - 2013],[nan],[26499065.0],[2187-10-16 05:14:00],[2187-10-24 11:30:00],[NaT],...,"[[1, 9, 10, 7, 3, 13, 12, 17]]","[[99, 138, 48, 200, 8, 158, 4, 257, 211, 58, 4...","[[Htn complicn, Esophgeal dx, Thyroid dsor, Ot...","[[Upper Gastrointestinal Disorders, Endocrine ...",[[Endocrine; Nutritional; and Metabolic Diseas...,[12728074_26499065.0],"[[Shock, Thyroid dsor, Ot up rsp in, Ot dx kid...","[Endocrine and Diabetic Disorders, Autoimmune ...",[Endocrine; Nutritional; and Metabolic Disease...,[1]


In [98]:
def save_diagnosis_df_to_csv(
    data, 
    out_dir, 
    name
):

    # Make directory #
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # Save Dataframe #
    save_path = os.path.join(out_dir, '{}.csv'.format(name))
    data.to_csv(save_path, index=False)

In [99]:
save_diagnosis_df_to_csv(df_pat_adm_dis_notes_diag_codes_filtered, out_dir, 'original_hierarchical_data')

In [100]:
save_diagnosis_df_to_csv(stratified_df_pat_adm_dis_notes_diag_codes_filtered_truncated, out_dir, 'hierarchical_data')

In [101]:
df_pat_adm_dis_notes_diag_codes_filtered_last_visits = df_pat_adm_dis_notes_diag_codes_filtered.groupby('subject_id', as_index=False).last()
df_pat_adm_dis_notes_diag_codes_filtered_last_visits = df_pat_adm_dis_notes_diag_codes_filtered_last_visits[['subject_id', 'age_group', 'gender', 'insurance', 'race']]

for col in ['age_group', 'gender', 'insurance', 'race']:
    print(f"\nDistribution for '{col}':")
    print(df_pat_adm_dis_notes_diag_codes_filtered_last_visits[col].value_counts(dropna=False, normalize=True).round(4) * 100)
    print(df_pat_adm_dis_notes_diag_codes_filtered_last_visits[col].value_counts(dropna=False))


Distribution for 'age_group':
61+      56.10
41–60    30.06
18–40    13.84
Name: age_group, dtype: float64
61+      18037
41–60     9663
18–40     4450
Name: age_group, dtype: int64

Distribution for 'gender':
F    50.41
M    49.59
Name: gender, dtype: float64
F    16208
M    15942
Name: gender, dtype: int64

Distribution for 'insurance':
Medicare    45.95
Other       45.66
Medicaid     8.39
Name: insurance, dtype: float64
Medicare    14773
Other       14680
Medicaid     2697
Name: insurance, dtype: int64

Distribution for 'race':
WHITE              71.75
BLACK              14.42
HISPANIC_LATINO     4.76
OTHER               3.30
ASIAN               3.02
UNKNOWN             2.76
Name: race, dtype: float64
WHITE              23068
BLACK               4636
HISPANIC_LATINO     1529
OTHER               1060
ASIAN                970
UNKNOWN              887
Name: race, dtype: int64


In [102]:
stratified_df_pat_adm_dis_notes_diag_codes_filtered_truncated = stratified_df_pat_adm_dis_notes_diag_codes_filtered_truncated.groupby('subject_id', as_index=False).last()
stratified_df_pat_adm_dis_notes_diag_codes_filtered_truncated = stratified_df_pat_adm_dis_notes_diag_codes_filtered_truncated[['subject_id', 'age_group', 'gender', 'insurance', 'race']]

for col in ['age_group', 'gender', 'insurance', 'race']:
    print(f"\nDistribution for '{col}':")
    print(stratified_df_pat_adm_dis_notes_diag_codes_filtered_truncated[col].value_counts(dropna=False, normalize=True).round(4) * 100)
    print(stratified_df_pat_adm_dis_notes_diag_codes_filtered_truncated[col].value_counts(dropna=False))


Distribution for 'age_group':
61+      56.10
41–60    30.05
18–40    13.85
Name: age_group, dtype: float64
61+      4488
41–60    2404
18–40    1108
Name: age_group, dtype: int64

Distribution for 'gender':
F    50.44
M    49.56
Name: gender, dtype: float64
F    4035
M    3965
Name: gender, dtype: int64

Distribution for 'insurance':
Medicare    45.94
Other       45.69
Medicaid     8.38
Name: insurance, dtype: float64
Medicare    3675
Other       3655
Medicaid     670
Name: insurance, dtype: int64

Distribution for 'race':
WHITE              71.78
BLACK              14.41
HISPANIC_LATINO     4.76
OTHER               3.29
ASIAN               3.00
UNKNOWN             2.76
Name: race, dtype: float64
WHITE              5742
BLACK              1153
HISPANIC_LATINO     381
OTHER               263
ASIAN               240
UNKNOWN             221
Name: race, dtype: int64
