In [1]:
import pandas as pd
import torch
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter, defaultdict

  from .autonotebook import tqdm as notebook_tqdm


**ICD Codes from Diagnosis and Procedures**

In [2]:
def reformat(code, is_diag): ## leveraged from caml-mimic (for adding periods (dots) to ICD codes)
    """
        Put a period in the right place because the MIMIC-3 data files exclude them.
        Generally, procedure codes have dots after the first two digits, 
        while diagnosis codes have dots after the first three digits.
    """
    code = ''.join(code.split('.'))
    if is_diag:
        if code.startswith('E'):
            if len(code) > 4:
                code = code[:4] + '.' + code[4:]
        else:
            if len(code) > 3:
                code = code[:3] + '.' + code[3:]
    else:
        code = code[:2] + '.' + code[2:]
    return code

In [3]:
## read data
diag_df = pd.read_csv('MIMIC_DATA/DIAGNOSES_ICD.csv')
proc_df = pd.read_csv('MIMIC_DATA/PROCEDURES_ICD.csv')

## reformat codes
diag_df['FORM_ICD9_CODE'] = diag_df['ICD9_CODE'].apply(lambda x: reformat(str(x), True))
proc_df['FORM_ICD9_CODE'] = proc_df['ICD9_CODE'].apply(lambda x: reformat(str(x), False))

## merge data
all_df_codes = pd.concat([diag_df, proc_df])
print(all_df_codes[:5])

##save all codes to csv
all_df_codes.to_csv('MIMIC_DATA/all_codes.csv', index=False, columns=['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'SEQ_NUM', 'FORM_ICD9_CODE'],
               header=['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'SEQ_NUM', 'ICD9_CODE'])


all_df_codes = all_df_codes[['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'SEQ_NUM', 'FORM_ICD9_CODE']]

all_df_codes.rename(columns={'FORM_ICD9_CODE': 'ICD9_CODE'}, inplace=True)

print(" # unique ICD codes: ", len(all_df_codes['ICD9_CODE'].unique()))

   ROW_ID  SUBJECT_ID  HADM_ID  SEQ_NUM ICD9_CODE FORM_ICD9_CODE
0    1297         109   172335      1.0     40301         403.01
1    1298         109   172335      2.0       486            486
2    1299         109   172335      3.0     58281         582.81
3    1300         109   172335      4.0      5855          585.5
4    1301         109   172335      5.0      4254          425.4
 # unique ICD codes:  8994


**Extract Discharge summaries (from Notes) and then Tokenize only non-numeric text**

In [4]:
def tokenize_lower_nonnum(text):
    tokenizer = RegexpTokenizer(r'\w+')
    updated_text = [token.lower() for token in tokenizer.tokenize(text) if not token.isnumeric()]
    updated_text = '"'+ ' '.join(updated_text) + '"'
    return updated_text

In [15]:
#tokenizer = RegexpTokenizer(r'\w+')

"""
##code for processing discharge summaries, if we already did, not needed to run again

notes_df = pd.read_csv('MIMIC_DATA/NOTEEVENTS.csv') ## read Noteevents data

notes_df = notes_df[notes_df['CATEGORY'] == 'Discharge summary'] ## filter for discharge summaries

notes_df['TEXT'] = notes_df['TEXT'].apply(lambda x: tokenize_lower_nonnum(x)) ## tokenize, lower case, remove numbers

notes_df = notes_df[['SUBJECT_ID', 'HADM_ID', 'CHARTTIME', 'TEXT']] ## keep only relevant columns

#write to csv
notes_df.to_csv('MIMIC_DATA/processed_discharge_summaries.csv', index=False)

"""


  notes_df = pd.read_csv('MIMIC_DATA/NOTEEVENTS.csv') ## read Noteevents data


In [13]:
##read processed discharge summaries
dis_df = pd.read_csv('MIMIC_DATA/processed_discharge_summaries.csv', dtype= {"SUBJECT_ID": int, 'HADM_ID': int}) ## read processed discharge summaries
dis_df = dis_df.sort_values(['SUBJECT_ID', 'HADM_ID']) ## sort by subject id and hadm id

## keep only relevant columns
dis_df = dis_df[['SUBJECT_ID', 'HADM_ID', 'TEXT']]

##key details
print(" # unique SUBJECT_IDs with discharge notes: ", len(dis_df['SUBJECT_ID'].unique()))
print(" # unique HADM_IDs with discharge notes: ", len(dis_df['HADM_ID'].unique()))



 # unique SUBJECT_IDs with discharge notes:  41127
 # unique HADM_IDs with discharge notes:  52726


In [17]:
dis_df[0:3]

Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT
48470,3,145834,"""admission date discharge date date of birth s..."
4782,4,185777,"""admission date discharge date date of birth s..."
24476,6,107064,"""admission date discharge date date of birth s..."


**Integrate ICD codes from Diagnosis and Procedures with Discharge summaries**

In [18]:
all_df_codes.sort_values(['SUBJECT_ID', 'HADM_ID'], inplace=True) ## sort by subject id and hadm id

##keep only the codes for which we have discharge summaries

all_df_codes = all_df_codes[all_df_codes['HADM_ID'].isin(dis_df['HADM_ID'].unique())]

all_df_codes.sort_values(['SUBJECT_ID', 'HADM_ID'], inplace=True)

print (" # unique HADM_IDs in all_codes: ", len(all_df_codes['HADM_ID'].unique()))


 # unique HADM_IDs in all_codes:  52726


In [20]:
##grouping codes by HADM_ID and subject_id
all_codes_df_2 = all_df_codes.groupby(['SUBJECT_ID', 'HADM_ID'])['ICD9_CODE'].apply(list).reset_index()
all_codes_df_2['ICD9_CODE'] = all_codes_df_2['ICD9_CODE'].apply(lambda x: ';'.join(x))
#print("updated allcodes:", all_codes_df_2[:5])

##merge discharge summaries and codes
notes_codes_df = pd.merge(dis_df, all_codes_df_2, on=['SUBJECT_ID', 'HADM_ID'], how='inner')
notes_codes_df[:3]

Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,ICD9_CODE
0,3,145834,"""admission date discharge date date of birth s...",038.9;785.59;584.9;427.5;410.71;428.0;682.6;42...
1,4,185777,"""admission date discharge date date of birth s...",042;136.3;799.4;276.3;790.7;571.5;041.11;V09.0...
2,6,107064,"""admission date discharge date date of birth s...",403.91;444.0;997.2;276.6;276.7;285.9;275.3;V15...


In [22]:
print(" # unique HADM_IDs in merged data: ", len(notes_codes_df['HADM_ID'].unique()))

 # unique HADM_IDs in merged data:  52726


In [24]:
len(notes_codes_df)

59652

**Create Train, Validation and Test sets**

In [25]:
train_perc, dev_perc, test_perc = 0.8, 0.1, 0.1

shuffled_indices = torch.randperm(len(notes_codes_df)) ##

train_df = notes_codes_df.iloc[shuffled_indices[:int(len(notes_codes_df)*train_perc)]]

dev_df = notes_codes_df.iloc[shuffled_indices[int(len(notes_codes_df)*train_perc):int(len(notes_codes_df)*(train_perc+dev_perc))]]

test_df = notes_codes_df.iloc[shuffled_indices[int(len(notes_codes_df)*(train_perc+dev_perc)):]]

**Buidling Vocabulary using Training set**

In [26]:
training_documents = train_df['TEXT'].tolist()

vectorizer = CountVectorizer(min_df=3)

vectorizer.fit(training_documents)

vocab = vectorizer.vocabulary_

print("Vocabulary size based on train data: ", len(vocab))

##save vocab to file
with open('MIMIC_DATA/disnotes_vocab.txt', 'w') as f:
    for word, index in vocab.items():
        f.write(f'{word}\t{index}\n') ## writes word and index to a text file (with tab delimiter, reference code just saved the sorted list of words)


Vocabulary size based on train data:  50088


**Sort data based on length of the text for batching**

In [None]:
train_df["length"] = train_df["TEXT"].apply(lambda x: len(x.split()))
train_df.sort_values(by="length", inplace=True)

dev_df["length"] = dev_df["TEXT"].apply(lambda x: len(x.split()))
dev_df.sort_values(by="length", inplace=True)

test_df["length"] = test_df["TEXT"].apply(lambda x: len(x.split()))
test_df.sort_values(by="length", inplace=True)

**Pre-train word embeddings (Excluded this step, as pretrained BERT doesn't need this)**

****Filter Top 50 ICD codes****

In [39]:
top_x = 50
code_count = {}
##count # times a code appeared in the full notes data
for i in range(len(notes_codes_df)):
    codes = notes_codes_df.iloc[i]['ICD9_CODE'].split(';')
    for code in codes:
        if code not in code_count:
            code_count[code] = 1
        else:
            code_count[code] += 1

##sort the codes based on their counts
sorted_code_count = sorted(code_count.items(), key=lambda x: x[1], reverse=True)

##get the top x codes
top_x_codes = [code[0] for code in sorted_code_count[:top_x]]

print("top codes:", top_x_codes)





top codes: ['401.9', '38.93', '428.0', '427.31', '414.01', '96.04', '96.6', '584.9', '250.00', '96.71', '272.4', '518.81', '99.04', '39.61', '599.0', '96.72', '530.81', '272.0', '88.56', '285.9', '486', '38.91', '244.9', '36.15', '99.15', '285.1', '496', '276.2', '507.0', '88.72', '995.92', 'V58.61', '038.9', '37.22', '33.24', '311', '39.95', '585.9', '403.90', '305.1', '412', '410.71', '287.5', '276.1', '424.0', '45.13', 'V45.81', '37.23', '511.9', '93.90']


In [43]:
##filter train, dev, test data based on top x codes
def filter_codes(x, top_x_codes):
    codes = x.split(';')
    filtered_codes = [code for code in codes if code in top_x_codes]
    return ';'.join(filtered_codes)

dffss = [train_df, dev_df, test_df]

for df in dffss:
    df['ICD9_CODE'] = df['ICD9_CODE'].apply(lambda x: filter_codes(x, top_x_codes))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ICD9_CODE'] = df['ICD9_CODE'].apply(lambda x: filter_codes(x, top_x_codes))


In [63]:
##remove empty code rows
train_df = train_df[train_df['ICD9_CODE'] != '']
dev_df = dev_df[dev_df['ICD9_CODE'] != '']
test_df = test_df[test_df['ICD9_CODE'] != '']


##save train, dev, test data to csv
train_df.to_csv('MIMIC_DATA/train_ds_notes.csv', index=False)
dev_df.to_csv('MIMIC_DATA/dev_ds_notes.csv', index=False)
test_df.to_csv('MIMIC_DATA/test_ds_notes.csv', index=False)


In [66]:
###length of train, dev, test data
print("length of train data: ", len(train_df))
print("length of dev data: ", len(dev_df))
print("length of test data: ", len(test_df))

length of train data:  44998
length of dev data:  5633
length of test data:  5638


In [67]:
##save top x codes to file
with open('MIMIC_DATA/top_x_codes.txt', 'w') as f:
    for code in top_x_codes:
        f.write(f'{code}\n')

In [68]:
test_df

Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,ICD9_CODE,length
28356,23057,127445,"""admission date discharge date date of birth s...",424.0;427.31;39.61;37.23;88.56,27
11792,9554,119712,"""admission date discharge date date of birth s...",401.9,28
26002,21126,124750,"""admission date discharge date date of birth s...",584.9;285.9;38.93,29
11831,9588,100041,"""admission date discharge date date of birth s...",414.01;412;496;244.9;401.9;36.15;39.61,31
14922,12004,180993,"""admission date discharge date date of birth s...",285.9;305.1;45.13,33
...,...,...,...,...,...
35873,29299,120896,"""admission date discharge date date of birth s...",038.9;599.0;518.81;584.9;285.1;V58.61;250.00;9...,5810
49870,70425,150284,"""admission date discharge date date of birth s...",507.0;518.81;272.4;305.1;530.81;96.72;96.6,6311
42605,48872,138211,"""admission date discharge date date of birth s...",507.0;584.9;276.2;96.72;39.95;33.24;33.24;45.13,6393
7375,5962,186417,"""admission date discharge date date of birth s...",038.9;511.9;285.1;995.92;99.15;96.6,6404


In [69]:
notes_codes_df[notes_codes_df['HADM_ID'] == 119712]

Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,ICD9_CODE
11792,9554,119712,"""admission date discharge date date of birth s...",707.8;401.9;V43.64;V43.65;431;714.0;13.9
11793,9554,119712,"""admission date discharge date date of birth s...",707.8;401.9;V43.64;V43.65;431;714.0;13.9
