In [1]:
import sys
sys.path.insert(0, '../preprocessing')

In [2]:
from loader import read_replacements
from transformers import StopWordRemover, SemanticMapper, LowerCaseSentence, SpellCheckTransformer
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('words')

[nltk_data] Downloading package punkt to /Users/maxyuan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/maxyuan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /Users/maxyuan/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

# Load and process input

In [3]:
df = pd.read_csv('../data/inputs/ct_desc.csv')
impressions = df['Impression Cleaned']

In [4]:
example_input = impressions[:10]
example_input = [i.split('\n')[1:-1] for i in example_input]

In [5]:
example_input[:5]

[['1.  Unchanged descending thoracic aorta intramural hematoma with persistent intramural blood pool measuring approximately 1.4 cm in size, which is supplied by an intercostal artery.',
  '2.  Unchanged enlargement of the ascending aorta measuring up to 4.1 cm, with diffuse calcified and noncalcified atherosclerotic plaque throughout the thoracic aorta.',
  '3.  Unchanged 2.3 cm left thyroid nodule which warrants dedicated thyroid ultrasound if not previously performed.'],
 ['1. Slight decrease in the left lung and pleural metastatic disease.',
  '2.  Unchanged small right pleural effusion.'],
 ['1. No pulmonary embolism.',
  '2. Slight interval decrease in scattered groundglass opacities and interlobular septal thickening, compatible with pulmonary edema which is slightly improving.',
  '3. Medium right and small left pleural effusion. Small pericardial effusion. These findings are similar to prior and are suggestive of volume overload.'],
 ['No intrathoracic metastatic disease.'],
 

# Initialize Preprocessing Transformations

In [6]:
clever_r = read_replacements('../data/semantic_maps/clever_replacements')
misc_r = read_replacements('../data/semantic_maps/misc_replacements')
radlax_r = read_replacements('../data/semantic_maps/radlex_replacements')

non_alpha_t = SemanticMapper([('[^A-Za-z]', ' ')], regex=True)
lower_t = LowerCaseSentence()
spellcheck_t = SpellCheckTransformer()
clever_t = SemanticMapper(clever_r)
misc_t = SemanticMapper(misc_r)
radlax_t = SemanticMapper(radlax_r)

In [7]:
# experiment with each tranformation

short_input = example_input[:3]

print('Input: ')
print(short_input)
print('--------------------------------------')

for index, transformer in enumerate([non_alpha_t, lower_t, spellcheck_t, clever_t, misc_t, radlax_t]):
    print('Transformation {}:'.format(index))
    print(transformer.transform(short_input))
    print('--------------------------------------')
    

Input: 
[['1.  Unchanged descending thoracic aorta intramural hematoma with persistent intramural blood pool measuring approximately 1.4 cm in size, which is supplied by an intercostal artery.', '2.  Unchanged enlargement of the ascending aorta measuring up to 4.1 cm, with diffuse calcified and noncalcified atherosclerotic plaque throughout the thoracic aorta.', '3.  Unchanged 2.3 cm left thyroid nodule which warrants dedicated thyroid ultrasound if not previously performed.'], ['1. Slight decrease in the left lung and pleural metastatic disease.', '2.  Unchanged small right pleural effusion.'], ['1. No pulmonary embolism.', '2. Slight interval decrease in scattered groundglass opacities and interlobular septal thickening, compatible with pulmonary edema which is slightly improving.', '3. Medium right and small left pleural effusion. Small pericardial effusion. These findings are similar to prior and are suggestive of volume overload.']]
--------------------------------------
Transform

In [12]:
# create pipeline 
print('Example Input: \n')
for r in example_input:
    for s in r:
        print(s)
print('\n--------------------------------------\n')

proposed_pipeline = make_pipeline(non_alpha_t, lower_t, spellcheck_t, clever_t, misc_t, radlax_t, None)
labeled_output = proposed_pipeline.transform(example_input)

print('Pipeline Output: \n')
for r in labeled_output:
    for s in r:
        print(s)

Example Input: 

1. unchanged descending thoracic aorta intramural hematoma with persistent intramural blood pool measuring approximately 1.4 cm in size, which is supplied by an intercostal artery.
2. unchanged enlargement of the ascending aorta measuring up to 4.1 cm, with diffuse calcified and noncalcified atherosclerotic plaque throughout the thoracic aorta.
3. unchanged 2.3 cm left thyroid nodule which warrants dedicated thyroid ultrasound if not previously performed.
1. slight decrease in the left lung and pleural metastatic disease.
2. unchanged small right pleural effusion.
1. no pulmonary embolism.
2. slight interval decrease in scattered groundglass opacities and interlobular septal thickening, compatible with pulmonary edema which is slightly improving.
3. medium right and small left pleural effusion. small pericardial effusion. these findings are similar to prior and are suggestive of volume overload.
No intrathoracic metastatic disease.
Large left heterogeneously enhancing 

# NOTE
1. Need to modify NEGEX to [NEGEX], RISK to [HEDGE]... for style
2. Some of the transformation might or might not be necessary (clever, misc)
3. Spellcheck transformation is incredible important but takes incredibly long
    - Potentially need to write scripts to parallelize the opration 
    - Might need to run on the UCSF machines to speed up and run for larger datasets (Could be a onetime thing)
    - DEFINITELY NEED: huge dictionary of ALL words we might see for spellcheck to work correctly
