# Preprocess Discharge Summaries

In [149]:
import pandas as pd
import numpy as np
import os
import psycopg2
import sqlalchemy
import string
import spacy
from spacy.symbols import ORTH
from collections import Counter
import re
from datetime import date, datetime, timedelta
import random
from sklearn.model_selection import GroupShuffleSplit, StratifiedShuffleSplit
from spellchecker import SpellChecker

Connect to the mimic database and set the search path to the 'mimiciii' schema

In [3]:
# Connection settings. The connection URL embeds the username and password, so
# it can be supplied through the MIMIC_DB_URL environment variable instead of
# living in the notebook. The original local URL is kept as the fallback so
# existing set-ups keep working unchanged.
dbschema='mimiciii'
db_url = os.environ.get('MIMIC_DB_URL',
                        'postgresql+psycopg2://aa5118:mimic@localhost:5432/mimic')
# '-csearch_path=<schema>' is passed as a connection option so the session's
# search path is set to `dbschema`.
cnx = sqlalchemy.create_engine(db_url,
                    connect_args={'options': '-csearch_path={}'.format(dbschema)})


Query the discharge summary notes joined on to patient data

In [4]:
# Query text: joins `patients` to `noteevents` on subject_id and keeps notes
# whose category is 'Discharge summary' and whose rounded age at the note
# date is greater than 14. Age is computed as
# (chartdate - dob) in days / 365.242, rounded to 0 decimal places.
# The trailing `--LIMIT 1000;` line is an SQL comment, so no row limit applies.
sql = """
  SELECT
      p.subject_id, p.dob, p.gender,
      n.hadm_id, n.category, n.chartdate, n.row_id,
      ROUND((cast(chartdate as date) - cast(dob as date)) / 365.242,0)
          AS age_at_noteevent,
      n.text
  FROM patients p 
  INNER JOIN noteevents n 
  ON p.subject_id = n.subject_id
  WHERE ROUND((cast(chartdate as date) - cast(dob as date)) / 365.242,0) > 14
  AND n.category = 'Discharge summary'
  ORDER BY subject_id
  --LIMIT 1000;
"""

# Run the query on the `cnx` engine and load the result into a DataFrame.
df = pd.read_sql_query(sqlalchemy.text(sql), cnx)
# Show the first rows as the cell output.
df.head()

Unnamed: 0,subject_id,dob,gender,hadm_id,category,chartdate,row_id,age_at_noteevent,text
0,3,2025-04-11,M,145834,Discharge summary,2101-10-31,44005,77.0,Admission Date: [**2101-10-20**] Discharg...
1,4,2143-05-12,F,185777,Discharge summary,2191-03-23,4788,48.0,Admission Date: [**2191-3-16**] Discharge...
2,6,2109-06-21,F,107064,Discharge summary,2175-06-15,20825,66.0,Admission Date: [**2175-5-30**] Dischar...
3,9,2108-01-26,M,150750,Discharge summary,2149-11-14,57115,42.0,"Name: [**Known lastname 10050**], [**Known fi..."
4,9,2108-01-26,M,150750,Discharge summary,2149-11-13,20070,42.0,Admission Date: [**2149-11-9**] Dischar...


Change data type of age to the smallest possible type of integer to save memory and get rid of decimal point

In [5]:
# Convert the age column to the smallest integer dtype that can hold its
# values, then write it back to the same column.
age_downcast = pd.to_numeric(df['age_at_noteevent'], downcast='integer')
df['age_at_noteevent'] = age_downcast
df.head()

Unnamed: 0,subject_id,dob,gender,hadm_id,category,chartdate,row_id,age_at_noteevent,text
0,3,2025-04-11,M,145834,Discharge summary,2101-10-31,44005,77,Admission Date: [**2101-10-20**] Discharg...
1,4,2143-05-12,F,185777,Discharge summary,2191-03-23,4788,48,Admission Date: [**2191-3-16**] Discharge...
2,6,2109-06-21,F,107064,Discharge summary,2175-06-15,20825,66,Admission Date: [**2175-5-30**] Dischar...
3,9,2108-01-26,M,150750,Discharge summary,2149-11-14,57115,42,"Name: [**Known lastname 10050**], [**Known fi..."
4,9,2108-01-26,M,150750,Discharge summary,2149-11-13,20070,42,Admission Date: [**2149-11-9**] Dischar...


In [6]:
# (number of rows, number of columns) of the query result.
df.shape

(55404, 9)

55404 'adult' (15 or over) discharge summaries - this is what we expect from our exploratory data analysis

In [146]:
# Build one space-separated string from the `text` column of the first 1000 rows.
sample_notes = df['text'].head(1000)
text = " ".join(sample_notes)

The following punctuation marks frequently appear in the middle of words, or between words without spacing, which means the tokenizer misses them. What we need to do is split the tokens on these punctuation marks after we have tokenized, and then retokenize. This will substantially decrease the number of unique words that we would otherwise have to replace with <UNK>.

- ampersand
- brackets
- colons
- forward slashes(make sure to leave dates alone though)
- full stops
- hyphens
- equals signs
- semicolons
- plus signs

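Also perform some additional clean-up in the same function: rewrite hyphenated dates with slashes so spaCy can recognise them, turn the `[**` / `**]` markers into plain square brackets, and replace newlines with a `<PAR>` marker.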

In [147]:
# spaCy pipeline; tokenise_text below uses its tokenizer (`nlp.tokenizer`).
nlp = spacy.load('en')

# --- Patterns applied to the raw text before tokenising ---
# digit '-' one-or-two digits '-' digit; tokenise_text rewrites the hyphens as slashes.
date_regex = re.compile(r'([0-9])-([0-9][0-9]?)-([0-9])') # change date format so spacy can recognise
# NOTE(review): inside a raw string `\\n` / `\\r` is an escaped backslash followed by
# the letter n / r, so these two patterns match three or more repeats of that
# two-character text, not real newline / carriage-return characters.
# TODO confirm this is what was intended.
newline_regex = re.compile(r'(\\n){3,}') # meant to cap consecutive newlines at 2 (see note above)
newline_regex2 = re.compile(r'(\\r){3,}') # same as above, for the text backslash + r

# --- Patterns applied to the tokenised text ---
# Each captures the character before a punctuation mark, the mark itself and the
# character after it, so the substitution can insert spaces around the mark.
bracket_regex = re.compile(r'(.)(\()(.)')
bracket_regex2 = re.compile(r'(.)(\))(.)')
# The slash patterns require a non-digit on one side of the '/', so a slash with
# digits on both sides is matched by neither.
slash_regex = re.compile(r'(.)(\/)([^0-9])')
slash_regex2 = re.compile(r'([^0-9])(\/)(.)')
equals_regex = re.compile(r'(.)(=)(.)')
colon_regex = re.compile(r'(.)(:)(.)')
# Only the opening '[' is matched here.
sq_bracket_regex = re.compile(r'(.)(\[)(.)')
dash_regex = re.compile(r'(.)(-)(.)')
plus_regex = re.compile(r'(.)(\+)(.)')
amp_regex = re.compile(r'(.)(&)(.)')

# These two patterns do not capture the '.' / ';' itself, only the text around it.
# The dot pattern skips dots that touch a digit or another dot on the left, and
# requires a non-space character followed by a non-digit, non-dot on the right.
dot_regex = re.compile(r'([^0-9.])\.(\S[^0-9.])')
semicol_regex = re.compile(r'(.);(.)')

# Number of tokenise_text calls so far; used for its progress print-out.
counter = 0

def tokenise_text(text):
    """Tokenise a note with spaCy and put spaces around embedded punctuation.

    Steps, in order:
      1. Rewrite ``d-dd-d`` style dates with slashes (``date_regex``).
      2. Cap runs of three or more newline characters at two.
      3. Turn the ``[**`` / ``**]`` markers into plain ``[`` / ``]``.
      4. Tokenise with ``nlp.tokenizer`` and re-join the tokens with spaces.
      5. Replace every newline with the ``<PAR>`` marker.
      6. Insert spaces around brackets, slashes, ``=``, ``:``, ``[``, ``-``,
         ``+`` and ``&``. The ``.`` and ``;`` substitutions replace the mark
         with a space, because those two patterns do not capture it.
      7. Collapse all whitespace runs to a single space.

    Args:
        text: The note text; it is converted with ``str()`` first.

    Returns:
        The tokenised text as a single space-separated string.

    Side effects:
        Increments the module-level ``counter`` and prints it on every
        100th call as a progress indicator.
    """
    global counter

    text = str(text)
    text = date_regex.sub(r'\1/\2/\3', text)

    # Cap runs of real newline characters at two. The module-level newline
    # regexes only match the escaped two-character text (backslash + n / r),
    # so on their own they never shorten genuine blank-line runs.
    text = re.sub(r'\n{3,}', '\n\n', text)
    # Kept for inputs that contain the escaped form.
    text = newline_regex.sub(r'\\n\\n', text)
    text = newline_regex2.sub(r'\\n\\n', text)

    text = text.replace("[**", "[").replace("**]", "]")

    tokens = nlp.tokenizer(text)
    # Every token is followed by one space (including the last one) so the
    # punctuation patterns below always have a character after the final token.
    # A single join avoids rebuilding the string on every iteration.
    tokenised_text = "".join(str(token) + " " for token in tokens)

    tokenised_text = tokenised_text.replace("\n", " <PAR> ")

    tokenised_text = bracket_regex.sub(r'\1 \2\3', tokenised_text)
    tokenised_text = bracket_regex2.sub(r'\1\2 \3', tokenised_text)
    tokenised_text = slash_regex.sub(r'\1 \2 \3', tokenised_text)
    tokenised_text = slash_regex2.sub(r'\1 \2 \3', tokenised_text)
    tokenised_text = equals_regex.sub(r'\1 \2 \3', tokenised_text)
    tokenised_text = colon_regex.sub(r'\1 \2 \3', tokenised_text)
    tokenised_text = sq_bracket_regex.sub(r'\1 \2 \3', tokenised_text)
    # The dash pattern consumes the character after the dash, so chained
    # dashes such as a-b-c need a second pass to be fully separated.
    tokenised_text = dash_regex.sub(r'\1 \2 \3', tokenised_text)
    tokenised_text = dash_regex.sub(r'\1 \2 \3', tokenised_text)
    tokenised_text = plus_regex.sub(r'\1 \2 \3', tokenised_text)
    tokenised_text = amp_regex.sub(r'\1 \2 \3', tokenised_text)
    tokenised_text = dot_regex.sub(r'\1 \2', tokenised_text)
    tokenised_text = semicol_regex.sub(r'\1 \2', tokenised_text)

    # Normalise all whitespace runs to single spaces and trim the ends.
    tokenised_text = ' '.join(tokenised_text.split())

    counter += 1
    if counter % 100 == 0:
        print(counter)

    return tokenised_text
# Tokenise the concatenated sample. This overwrites `text` with its tokenised
# form, so re-running this line tokenises already-tokenised text.
text = tokenise_text(text)

Below we isolate the tokens which appear 3 times or fewer. They are mostly misspellings.

In [161]:
# Register the marker strings as tokenizer special cases, each mapped to a
# single token whose ORTH is the marker itself.
for marker in (u'<PAR>', u'<UNK>'):
    nlp.tokenizer.add_special_case(marker, [{ORTH: marker}])

In [172]:
# Tokenise the lower-cased text and keep only word-like tokens: skip
# punctuation, all-digit tokens and anything whose first character is a digit.
doc = nlp.tokenizer(text.lower())
words = []
for token in doc:
    if token.is_punct or token.is_digit:
        continue
    if token.text[0].isdigit():
        continue
    words.append(token.text)

# Occurrence count per distinct word.
word_freq = dict(Counter(words))
# Words that occur 3 times or fewer.
infreq_words = [word for word, occurrences in word_freq.items() if occurrences <= 3]
print(len(infreq_words))
sorted(infreq_words)[:200]

10262


["'ll",
 "'re",
 '-"in',
 '--100/52',
 '--klonopin',
 '--lithium',
 '--melatonin',
 '--zydis',
 '--|',
 '-10',
 '-100',
 '-101',
 '-102',
 '-103',
 '-104',
 '-105',
 '-108',
 '-111',
 '-114',
 '-12',
 '-120',
 '-13',
 '-14',
 '-155',
 '-20',
 '-23',
 '-24',
 '-29',
 '-31',
 '-9',
 '-92',
 '-94',
 '-95',
 '-98',
 '-99',
 '->1',
 '->106/59',
 '->119',
 '->15',
 '->16.3',
 '->2',
 '->23.8',
 '->29.0',
 '->3',
 '->30.1',
 '->44',
 '->60',
 '->66',
 '->7.37/39/76',
 '->98',
 '->linezolid',
 '->therefore',
 '-apply',
 '-atovaquone',
 '-basic',
 '-complicating',
 '-doses',
 '-due',
 '-in',
 '-increaed',
 '-last',
 '-metastatic',
 '-per',
 '-please',
 '-pt',
 '-qt',
 '-r',
 '-she',
 '-sinus',
 '-started',
 '-symptoms',
 '-two',
 '-we',
 '-you',
 '.01',
 '.01%gel',
 '.018',
 '.02',
 '.02.914',
 '.025',
 '.035',
 '.04',
 '.05',
 '.06',
 '.100',
 '.11',
 '.112mcg',
 '.125',
 '.20',
 '.22',
 '.24',
 '.29',
 '.36',
 '.37',
 '.38',
 '.46',
 '.48',
 '.50',
 '.60',
 '.62',
 '.75qd',
 '.77',
 '.80/5',


We now try to correct the misspellings with the `pyspellchecker` library, which uses the Levenshtein distance algorithm to compare each word against a dictionary. We first add the words with more than 3 occurrences to that dictionary, because they include a lot of scientific/medical terms which might not already be there.

In [163]:
# Words that occur more than 3 times are written, space-separated, to a text
# file that is later loaded into the spell checker's word list.
freq_words = [word for word, occurrences in word_freq.items() if occurrences > 3]
add_to_dictionary = " ".join(freq_words)
# The context manager closes the file even if the write fails. The explicit
# UTF-8 encoding keeps the file independent of the platform's default encoding.
with open("data/mimic_dict.txt", "w", encoding="utf-8") as f:
    f.write(add_to_dictionary)

In [174]:
# Spell checker constructed with its default arguments.
spell = SpellChecker()
spell.distance = 1  # set the distance parameter to just 1 edit away
# Load the words from the file written earlier (tokens seen more than 3 times)
# into the checker's word-frequency list.
spell.word_frequency.load_text_file('data/mimic_dict.txt')

In [184]:
# Infrequent words that the spell checker does not know.
misspelled = spell.unknown(infreq_words)

# Map each unknown word to its suggested correction. The correction is looked
# up once per word. Words with no suggestion (None) or whose suggestion is the
# word itself are left out.
misspell_dict = {}
for word in misspelled:
    suggestion = spell.correction(word)
    if suggestion is not None and suggestion != word:
        misspell_dict[word] = suggestion

In [185]:
# Number of infrequent words for which a different spelling was suggested.
len(misspell_dict)

3328

We now have correct spellings for over 3000 words in our dictionary that occurred <= 3 times. We will now implement this in our new tokenizer