In [106]:
import os
import pandas as pd
import tqdm
import regex as re
import torch
from torch.optim import Adam
from torch.utils.data import DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import Dataset
import random
from datasets import load_metric
from string import punctuation
import nltk.data
from tokenizers import AddedToken
import numpy as np

In [107]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def print_title(title):
    """Print ``title`` framed above and below by a rule of 30 '=' characters."""
    rule = '=' * 30
    print(rule, title, rule, sep='\n')

class AIGDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes (source text, impression) pairs.

    The class only implements ``__len__`` and ``__getitem__``, which is the
    contract of torch's map-style ``Dataset``, so it derives from that class
    instead of the Hugging Face ``datasets.Dataset`` imported at file level
    (whose constructor was never called here).
    NOTE(review): recent ``datasets`` releases also define a batched
    ``__getitems__`` hook that ``DataLoader`` may call with a list of indices,
    which this ``__getitem__`` cannot handle - confirm against the installed
    versions.

    Args:
        dataset: Mapping-like object indexable by the column names
            ``'Source'`` and ``'Impression'``.
        tokenizer: Callable tokenizer accepting a list of strings plus
            ``max_length``, ``padding``, ``truncation`` and ``return_tensors``.
        source_len: Length every encoded source sequence is padded/truncated to.
        summ_len: Length every encoded impression is padded/truncated to.
    """

    def __init__(self, dataset, tokenizer, source_len, summ_len):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.text_len = source_len
        self.summ_len = summ_len
        self.text = self.dataset['Source']
        self.summary = self.dataset['Impression']

    def __len__(self):
        """Return the number of examples (length of the 'Source' column)."""
        return len(self.text)

    def _encode(self, text, max_length):
        """Tokenize one string, padded/truncated to exactly ``max_length``."""
        # padding='max_length' replaces the deprecated pad_to_max_length=True.
        return self.tokenizer(
            [text],
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

    def __getitem__(self, i):
        """Return the encoded ids and attention masks for example ``i``.

        Returns:
            dict with 1-D ``torch.long`` tensors under the keys
            ``'source_ids'``, ``'source_masks'``, ``'target_ids'`` and
            ``'target_masks'``.
        """
        # Both strings are prefixed with '<pad> ' before encoding; str() keeps
        # non-string cells from breaking the concatenation.
        source = self._encode('<pad> ' + str(self.text[i]), self.text_len)
        target = self._encode('<pad> ' + str(self.summary[i]), self.summ_len)

        # squeeze(0) removes only the batch axis added by encoding a
        # one-element list, so a length-1 sequence stays one-dimensional.
        return {
            'source_ids': source['input_ids'].squeeze(0).to(torch.long),
            'source_masks': source['attention_mask'].squeeze(0).to(torch.long),
            'target_ids': target['input_ids'].squeeze(0).to(torch.long),
            'target_masks': target['attention_mask'].squeeze(0).to(torch.long),
        }
    
def test(tokenizer,model,device,loader):
    model.eval()
    predictions = []
    with torch.no_grad():
        progress_bar = tqdm.tqdm(loader)
        for data in progress_bar:
            ids = data['source_ids'].to(device)
            mask = data['source_masks'].to(device)
            y_id = data['target_ids'].to(device)
            prediction = model.generate(input_ids=ids,attention_mask = mask,num_beams=2,max_length=200,repetition_penalty=1.5,early_stopping=False,length_penalty=1.0)

            # Decode y_id and prediction #
            source = [tokenizer.decode(s,skip_special_tokens=True,clean_up_tokenization_spaces=False) for s in ids]
            preds = [tokenizer.decode(p,skip_special_tokens=False,clean_up_tokenization_spaces=False) for p in prediction]
            target = [tokenizer.decode(t,skip_special_tokens=False,clean_up_tokenization_spaces=False) for t in y_id]

            predictions.extend(preds)
    return predictions

def make_demo(mode, reader_performance, department=None, zsfg=False, seed=None):
    """Run the fine-tuned T5 weights for ``mode`` over the matching test CSV.

    Args:
        mode: Name interpolated into both the test CSV path
            (``data/processed/[zsfg_]{mode}_test_dataset.csv``) and the weights
            path (``models/aig_t5_weights_{mode}``).
        reader_performance: When truthy, rows whose 'Impression' contains
            'biopsy' are dropped before inference, and the returned DataFrame
            is reduced to the first 50 rows with distinct predictions.
        department: Optional prefix; only rows whose 'Exam' starts with it
            are kept.
        zsfg: When truthy, read the ``zsfg_``-prefixed test CSV instead.
        seed: Optional ``random_state`` for the row shuffle. The default
            ``None`` keeps the shuffle unseeded; pass an int to make the
            de-duplication (and the 50-row cap) reproducible.

    Returns:
        tuple: ``(findings, impressions, predictions, processed_data)`` where
        the first two are the 'Findings' and 'Impression' columns of the
        evaluated rows, ``predictions`` is the list of post-processed model
        outputs, and ``processed_data`` is the DataFrame with a
        'Predicted Impression' column added. With ``reader_performance`` only
        the DataFrame is capped; the three lists still cover every row.
    """
    # Read only the file that is needed instead of always loading the default
    # CSV and then discarding it when zsfg is set.
    prefix = 'zsfg_' if zsfg else ''
    processed_data = pd.read_csv(f'data/processed/{prefix}{mode}_test_dataset.csv').sample(
        frac=1, random_state=seed
    )
    processed_data = processed_data.drop_duplicates(subset=['Impression'], ignore_index=True).dropna()
    if reader_performance:
        processed_data = processed_data[~processed_data['Impression'].str.contains('biopsy')]
    if department:
        processed_data = processed_data[processed_data['Exam'].str.startswith(department)]
    # Work on an explicit copy so adding the prediction column below does not
    # write into a filtered view of the loaded frame.
    processed_data = processed_data.copy()

    tokenizer = T5Tokenizer.from_pretrained('t5-base')
    tokenizer.add_special_tokens({"additional_special_tokens": [AddedToken("\n")]})
    dataset = Dataset.from_pandas(processed_data.reset_index(drop=True))

    test_dataset = AIGDataset(dataset, tokenizer, 400, 200)
    test_loader = DataLoader(dataset=test_dataset, batch_size=1, num_workers=0, shuffle=False)

    model = T5ForConditionalGeneration.from_pretrained('t5-base')
    # map_location lets weights saved on a GPU load on a CPU-only machine.
    model.load_state_dict(torch.load(f'models/aig_t5_weights_{mode}', map_location=device))
    model = model.to(device)

    predictions = [postprocess(prediction) for prediction in test(tokenizer, model, device, test_loader)]
    # The loader is not shuffled, so predictions line up with the rows by position.
    processed_data['Predicted Impression'] = predictions

    if reader_performance:
        processed_data = processed_data.drop_duplicates(subset=['Predicted Impression'], ignore_index=True).dropna()[:50]
    return dataset['Findings'], dataset['Impression'], predictions, processed_data

def postprocess(prediction):
    """Strip the '<pad>' and '</s>' markers and tidy newlines in a decoded prediction.

    Space-padded newlines (' \\n ') collapse to a bare newline, then leading and
    trailing whitespace is removed.
    """
    for old, new in (('<pad>', ''), ('</s>', ''), (' \n ', '\n')):
        prediction = prediction.replace(old, new)
    return prediction.strip()

def demo(source, actual, predictions):
    """Print one randomly chosen example: source, original and predicted impression.

    Args:
        source: Sequence of source texts.
        actual: Sequence of reference impressions, aligned with ``source``.
        predictions: Sequence of predicted impressions, aligned with ``source``.

    Raises:
        ValueError: If ``source`` is empty.
    """
    if len(source) == 0:
        raise ValueError('demo() needs at least one example to display')
    # randrange excludes the upper bound. random.randint(0, len(source)) could
    # return len(source) itself and index one past the end.
    i = random.randrange(len(source))
    print('SOURCE:')
    print(source[i])
    print('')
    print('ORIGINAL IMPRESSION:')
    print(actual[i])
    print('')
    print('PREDICTED IMPRESSION:')
    print(predictions[i])
    
def calculate_rouge(source, actual, predictions):
    """Score ``predictions`` against ``actual`` with the 'rouge' metric.

    Args:
        source: Unused; kept so existing callers keep working.
        actual: Reference texts.
        predictions: Predicted texts, aligned with ``actual``.

    Returns:
        tuple: ``(rouge1, rouge2, rougeL)`` mid F-measures scaled to 0-100.
    """
    # Load the metric once and reuse it: this function is called once per
    # bootstrap iteration, and reloading the metric each time is wasted work.
    rouge = getattr(calculate_rouge, '_rouge_metric', None)
    if rouge is None:
        rouge = load_metric('rouge')
        calculate_rouge._rouge_metric = rouge
    results = rouge.compute(predictions=predictions, references=actual)
    return tuple(results[key].mid.fmeasure * 100 for key in ('rouge1', 'rouge2', 'rougeL'))

In [None]:
# Retrospective Reader Performance Study on UCSF Finegrained CT Chest Dataset
# Create the output directory before the long inference run so the final
# to_csv cannot fail on a missing folder after all predictions are computed.
os.makedirs('results', exist_ok=True)
finegrained_source, finegrained_actual, finegrained_predictions, finegrained_csv = make_demo(
    mode='finegrained',
    reader_performance=True,
    zsfg=False
)
finegrained_csv.to_csv('results/finegrained_predictions.csv', index=False)

In [6]:
def rouge_scores(source, actual, predictions, n_bootstrap=200, sample_size=1000, seed=12345):
    """Print bootstrap means and 95% percentile intervals for ROUGE-1/2/L.

    Each iteration draws ``sample_size`` indices with replacement, scores the
    resampled examples with ``calculate_rouge`` and records the three scores.

    Args:
        source: Sequence of source texts.
        actual: Sequence of reference impressions, aligned with ``source``.
        predictions: Sequence of predicted impressions, aligned with ``source``.
        n_bootstrap: Number of bootstrap iterations (default 200).
        sample_size: Number of examples drawn per iteration (default 1000).
        seed: Seed for the resampling random state (default 12345).

    Returns:
        None. The three summaries are printed as ``(mean, (lower, upper))``.

    Raises:
        ValueError: If ``source`` is empty.
    """
    if len(source) == 0:
        raise ValueError('rouge_scores() needs at least one example to resample')

    rng = np.random.RandomState(seed=seed)
    idx = np.arange(len(source))

    # Convert once up front; these conversions do not change between iterations.
    source_arr = np.array(source)
    actual_arr = np.array(actual)
    predictions_arr = np.array(predictions)

    rouge_1_scores = []
    rouge_2_scores = []
    rouge_L_scores = []

    for _ in tqdm.tqdm(range(n_bootstrap)):
        pred_idx = rng.choice(idx, size=sample_size, replace=True)
        rouge_1_score, rouge_2_score, rouge_L_score = calculate_rouge(
            source_arr[pred_idx],
            actual_arr[pred_idx],
            predictions_arr[pred_idx]
        )
        rouge_1_scores.append(rouge_1_score)
        rouge_2_scores.append(rouge_2_score)
        rouge_L_scores.append(rouge_L_score)

    def rouge_ci(scores):
        """Return ``(mean, (2.5th percentile, 97.5th percentile))`` of ``scores``."""
        average_score = np.mean(scores)
        ci_lower = np.percentile(scores, 2.5)
        ci_upper = np.percentile(scores, 97.5)
        return average_score, (ci_lower, ci_upper)

    print('ROUGE-1: ', rouge_ci(rouge_1_scores))
    print('ROUGE-2: ', rouge_ci(rouge_2_scores))
    print('ROUGE-L ', rouge_ci(rouge_L_scores))

In [105]:
# 'specialized' model on the default (non-ZSFG) test CSV: generate impressions,
# then print bootstrap ROUGE summaries.
(
    ucsf_specialized_source,
    ucsf_specialized_actual,
    ucsf_specialized_predictions,
    _,
) = make_demo(mode='specialized', reader_performance=False, zsfg=False)

rouge_scores(
    ucsf_specialized_source,
    ucsf_specialized_actual,
    ucsf_specialized_predictions,
)

NameError: name 'make_demo' is not defined

In [11]:
# 'specialized' model on the ZSFG test CSV: generate impressions, then print
# bootstrap ROUGE summaries.
(
    zsfg_specialized_source,
    zsfg_specialized_actual,
    zsfg_specialized_predictions,
    _,
) = make_demo(mode='specialized', reader_performance=False, zsfg=True)

rouge_scores(
    zsfg_specialized_source,
    zsfg_specialized_actual,
    zsfg_specialized_predictions,
)

100%|███████████████████████████████████████| 2533/2533 [41:07<00:00,  1.03it/s]
100%|█████████████████████████████████████████| 200/200 [08:10<00:00,  2.45s/it]

ROUGE-1:  (53.15717422689009, (52.175724678092216, 54.30767622103647))
ROUGE-2:  (36.151224519851034, (34.938590412089475, 37.586254162984986))
ROUGE-L  (45.00913662294909, (43.891346356148986, 46.319175629224326))





In [13]:
# 'finegrained' model on the default (non-ZSFG) test CSV: generate impressions,
# then print bootstrap ROUGE summaries.
(
    ucsf_finegrained_source,
    ucsf_finegrained_actual,
    ucsf_finegrained_predictions,
    _,
) = make_demo(mode='finegrained', reader_performance=False, zsfg=False)

rouge_scores(
    ucsf_finegrained_source,
    ucsf_finegrained_actual,
    ucsf_finegrained_predictions,
)

100%|█████████████████████████████████████████| 676/676 [09:17<00:00,  1.21it/s]
100%|█████████████████████████████████████████| 200/200 [06:44<00:00,  2.02s/it]

ROUGE-1:  (54.66891653585523, (53.29150148565835, 55.710409186160526))
ROUGE-2:  (38.30608612323276, (36.8242358171107, 39.73466882965169))
ROUGE-L  (48.349445003980726, (47.08724238350272, 49.656067862239304))





In [10]:
# 'finegrained' model on the ZSFG test CSV: generate impressions, then print
# bootstrap ROUGE summaries.
(
    zsfg_finegrained_source,
    zsfg_finegrained_actual,
    zsfg_finegrained_predictions,
    _,
) = make_demo(mode='finegrained', reader_performance=False, zsfg=True)

rouge_scores(
    zsfg_finegrained_source,
    zsfg_finegrained_actual,
    zsfg_finegrained_predictions,
)

100%|███████████████████████████████████████| 4129/4129 [55:52<00:00,  1.23it/s]
100%|█████████████████████████████████████████| 200/200 [07:10<00:00,  2.15s/it]

ROUGE-1:  (47.48753918608839, (46.325304983097865, 48.94960249538783))
ROUGE-2:  (32.086507597541505, (30.842379664978736, 33.877684001662885))
ROUGE-L  (40.79812033807301, (39.59751799972317, 42.29522673871863))





In [32]:
# 'general' model restricted to exams starting with 'CT' (default test CSV):
# generate impressions, then print bootstrap ROUGE summaries.
(
    ucsf_general_ct_source,
    ucsf_general_ct_actual,
    ucsf_general_ct_predictions,
    _,
) = make_demo(mode='general', department='CT', reader_performance=False, zsfg=False)

rouge_scores(
    ucsf_general_ct_source,
    ucsf_general_ct_actual,
    ucsf_general_ct_predictions,
)

100%|█████████████████████████████████████| 9056/9056 [2:12:05<00:00,  1.14it/s]
100%|█████████████████████████████████████████| 200/200 [07:32<00:00,  2.26s/it]

ROUGE-1:  (53.68045537449058, (52.456244872353686, 54.78902986414531))
ROUGE-2:  (36.5657392180976, (35.15058140640832, 37.85500384762911))
ROUGE-L  (46.25484175355792, (44.89300526951312, 47.4177312008427))





In [33]:
# 'general' model restricted to exams starting with 'MR' (default test CSV):
# generate impressions, then print bootstrap ROUGE summaries.
(
    ucsf_general_mri_source,
    ucsf_general_mri_actual,
    ucsf_general_mri_predictions,
    _,
) = make_demo(mode='general', department='MR', reader_performance=False, zsfg=False)

rouge_scores(
    ucsf_general_mri_source,
    ucsf_general_mri_actual,
    ucsf_general_mri_predictions,
)

100%|█████████████████████████████████████| 6541/6541 [1:40:29<00:00,  1.08it/s]
100%|█████████████████████████████████████████| 200/200 [07:53<00:00,  2.37s/it]

ROUGE-1:  (52.883550319109666, (51.65695545613904, 54.02496770060186))
ROUGE-2:  (35.579871891910514, (34.22917920283988, 36.895173674077775))
ROUGE-L  (45.32954247702376, (44.025573517428846, 46.51066072290612))





In [34]:
# 'general' model restricted to exams starting with 'PETCT' (default test CSV):
# generate impressions, then print bootstrap ROUGE summaries.
(
    ucsf_general_pet_source,
    ucsf_general_pet_actual,
    ucsf_general_pet_predictions,
    _,
) = make_demo(mode='general', department='PETCT', reader_performance=False, zsfg=False)

rouge_scores(
    ucsf_general_pet_source,
    ucsf_general_pet_actual,
    ucsf_general_pet_predictions,
)

100%|█████████████████████████████████████████| 806/806 [13:10<00:00,  1.02it/s]
100%|█████████████████████████████████████████| 200/200 [10:24<00:00,  3.12s/it]

ROUGE-1:  (54.3614604157555, (53.37464253634127, 55.316897011276765))
ROUGE-2:  (37.57389995552365, (36.41930423297943, 38.62273765647115))
ROUGE-L  (47.36978345747333, (46.316367269375604, 48.38511395898861))





In [35]:
# 'general' model restricted to exams starting with 'US' (default test CSV):
# generate impressions, then print bootstrap ROUGE summaries.
(
    ucsf_general_us_source,
    ucsf_general_us_actual,
    ucsf_general_us_predictions,
    _,
) = make_demo(mode='general', department='US', reader_performance=False, zsfg=False)

rouge_scores(
    ucsf_general_us_source,
    ucsf_general_us_actual,
    ucsf_general_us_predictions,
)

100%|█████████████████████████████████████| 4812/4812 [1:02:58<00:00,  1.27it/s]
100%|█████████████████████████████████████████| 200/200 [07:18<00:00,  2.19s/it]

ROUGE-1:  (53.0849076225571, (51.88031136862934, 54.391613379452295))
ROUGE-2:  (35.51989169615126, (34.12807167840848, 37.01093802609764))
ROUGE-L  (46.58201805394969, (45.33002060827956, 48.04583507342392))





In [46]:
# Load the training split inside this cell so it runs on a fresh kernel
# instead of relying on a `train` variable left over from an earlier session.
train = pd.read_csv('data/processed/general_train_dataset.csv').dropna()
# Combine both conditions into a single mask. Chaining train[mask_a][mask_b]
# applies a mask built on the full frame to an already-filtered frame, which
# pandas has to re-align and warns about.
row = train[
    train['Impression'].str.contains('metastatic') & train['Exam'].str.startswith('CT')
].iloc[85]
print('Exam')
print(row['Exam'])
print('Clinical History')
print(row['Clinical History'])
print('Comparison')
print(row['Comparison'])
print('Findings')
print(row['Findings'])
print('Impression')
print(row['Impression'])

Exam
CT ABDOMEN/PELVIS WITH CONTRAST
Clinical History
50 y/o patient with metastatic melanoma, required to begin cancer treatment
Comparison
2/10/2021
Findings
Visualized lung bases:  For chest findings, please see the separately dictated report from the CT of the chest of the same date.
Liver:  No focal suspicious liver lesions. .
Gallbladder: Unremarkable
Spleen:  Unremarkable
Pancreas:  Unremarkable 
Adrenal Glands:  Small bilateral adrenal nodules measuring up to 13 mm on the right and a 10 mm on the left. As prior CT portion of the PET/CT was noncontrast and was for attenuation correction only, it is unclear whether these nodules were present on the prior study however the current imaging appearance is suspicious for metastatic disease.
Kidneys:  Unremarkable
GI Tract:  Scattered colonic diverticula without evidence of diverticulitis.
Vasculature:  Unremarkable
Lymphadenopathy: Absent
Peritoneum: No ascites
Bladder: Unremarkable
Reproductive organs: Unremarkable
Bones:  No suspici

  row = train[train['Impression'].str.contains('metastatic')][train['Exam'].str.startswith('CT')].iloc[85]


In [9]:
!ls data/raw
os.listdir('data/raw')
raw = pd.read_excel('data/raw/secure_UCSF_CT_2022-05-01_2022-07-31.xlsx')

secure_UCSF_CT_2022-05-01_2022-07-31.xlsx
secure_UCSF_CT_CHEST_2021-09-01-to-2022-09-02.xlsx
secure_UCSF_MR16k_2022-05-01_2022-07-31.xlsx
secure_UCSF_PET_2029-05-01-2022-07-31.xlsx
secure_UCSF_radreports__CT_MR_MRI_US_PET__01-01-2021__10-22-2022.csv
secure_UCSF_US15k_2022-05-01-2022-07-31.xlsx


In [32]:
# Load the combined radiology-report CSV into `raw`.
raw = pd.read_csv(
    filepath_or_buffer='data/raw/secure_UCSF_radreports__CT_MR_MRI_US_PET__01-01-2021__10-22-2022.csv'
)

In [104]:
# Show the full report text of the row at position 164.
print(raw['Report Text'].iloc[164])

EXAMINATION DESCRIPTION:
CT LUMBAR SPINE WITHOUT CONTRAST
INDICATION(S): 
please scan pedicles of L4-S2 assess fusion L5-s1  status post anterior lumbar interbody fusion and posterior fusion at L5-S1
SEDATION:
None.
TECHNIQUE:
Helical CT scan of the lower lumbar spine was performed without intravenous contrast administration, with metallic artifact reduction technique.
CTDI/DLP:
CTDI: Exposure Events: 2 , CTDIvol Min: 0 mGy, CTDIvol Max: 5.5 mGy, DLP: 65 mGy.cm 
COMPARISON:
No previous studies are available for comparison.
FINDINGS:
As requested, lumbar spine CT scan was performed from L4 to the level S2. There is anterior lumbar interbody fusion and posterior fusion noted at the level L5-S1 with a pair of pedicle screw from a posterior fusion and then a anterior single anchoring screw of the interbody to the L5 vertebral body and a pair of anterior screw to the S1 vertebral body. Alignment of L5-S1 is intact. There are no lucencies noted around the interbody fusion. Along the posterio