In [1]:
import numpy as np

import torch
import torch.nn as nn

from modeling_cxrbert import CXRBertModel
from transformers import AutoTokenizer

In [2]:
# --- Checkpoints -----------------------------------------------------------
# Identifier handed to `from_pretrained` for both the model and the tokenizer.
base_model_name = "microsoft/BiomedVLP-CXR-BERT-specialized"
# Directory holding the `pytorch_model.bin` that is loaded on top of the base model.
resume_model = "./results/test_run2"

# --- Sequence / embedding sizes --------------------------------------------
# Width used when the position-embedding table is rebuilt.
hidden_size = 768
# New number of position-embedding rows; also the tokenizer's padding/truncation length.
max_seq_length = 2048
# Number of leading token positions kept when features are written to disk.
save_seq_len = 192

In [3]:
# Start from the pretrained weights named by `base_model_name`.
model = CXRBertModel.from_pretrained(base_model_name)

# Extend the position-embedding table to `max_seq_length` rows.
bert_embeddings = model.bert.embeddings
old_embed = bert_embeddings.position_embeddings.weight.data
# Take both sizes from the pretrained table itself so the new table cannot
# drift out of sync with a separately maintained width constant.
tmp_dim, embed_dim = old_embed.shape
if tmp_dim > max_seq_length:
    raise ValueError(
        f"max_seq_length={max_seq_length} is smaller than the pretrained "
        f"position table ({tmp_dim} rows); refusing to truncate it."
    )

new_position_embeddings = nn.Embedding(max_seq_length, embed_dim)
# Copy the pretrained rows into the front of the new table. Rows from
# `tmp_dim` onward keep nn.Embedding's default initialisation.
# NOTE(review): presumably those rows are overwritten by the resume
# checkpoint loaded afterwards -- confirm.
with torch.no_grad():
    new_position_embeddings.weight[:tmp_dim].copy_(old_embed)
bert_embeddings.position_embeddings = new_position_embeddings

# Keep the position-id buffer and the config in step with the new table length.
bert_embeddings.register_buffer("position_ids", torch.arange(max_seq_length).expand((1, -1)))
model.config.max_position_embeddings = max_seq_length

Some weights of CXRBertModel were not initialized from the model checkpoint at microsoft/BiomedVLP-CXR-BERT-specialized and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Deserialize onto the CPU regardless of the device the checkpoint was saved
# from; `load_state_dict` copies the values into the model's own parameters,
# so nothing needs to live on the GPU at this point.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a
# trusted source (consider `weights_only=True` if the installed torch has it).
ckpt = torch.load(resume_model + "/pytorch_model.bin", map_location="cpu")

# strict=False tolerates missing/unexpected keys, so print the report to make
# any mismatch visible.
msg = model.load_state_dict(ckpt, strict=False)
print(msg)

<All keys matched successfully>


In [5]:
# Move the model to the GPU and switch it to evaluation mode in one chained
# call; the returned module is the cell's last expression, so its repr is shown.
model.cuda().eval()

CXRBertModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(2048, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.25, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.25, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [6]:
# Tokenizer for the same base checkpoint as the model.
# NOTE(review): trust_remote_code=True is passed through to from_pretrained --
# confirm the checkpoint repository is trusted.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_name,
    trust_remote_code=True,
)

In [7]:
def tokenize_function(example_text):
    """Tokenize report text into fixed-length PyTorch tensors.

    Calls the notebook-level ``tokenizer`` with padding to, and truncation at,
    ``max_seq_length`` tokens, and additionally requests the special-tokens mask.

    Args:
        example_text: Text handed unchanged to the tokenizer.

    Returns:
        The tokenizer's output, built with ``return_tensors='pt'``.
    """
    encode_options = dict(
        padding='max_length',
        truncation=True,
        max_length=max_seq_length,
        return_special_tokens_mask=True,
        return_tensors='pt',
    )
    return tokenizer(example_text, **encode_options)

### Consolidation

In [8]:
# 578558513: 40336413
# NOTE(review): the numbers above look like source-report identifiers -- confirm
# what they refer to and that they are safe to keep in the notebook.
example_text_0 = 'No consolidation is identified. No pulmonary nodules are noted. Bone windowed images demonstrate no lytic or blastic lesions.,No evidence of pulmonary embolus.'

# Tokenize, then move every returned tensor onto the GPU.
example_0 = tokenize_function(example_text_0)
for name, tensor in list(example_0.items()):
    example_0[name] = tensor.cuda()

# Show which fields will be passed to the model.
print(example_0.keys())

dict_keys(['input_ids', 'token_type_ids', 'special_tokens_mask', 'attention_mask'])


In [45]:
# Forward pass for the "no consolidation" example. Gradient tracking is
# disabled because the output is only used for feature extraction.
with torch.no_grad():
    # Every field of the tokenizer output is passed as a keyword argument.
    feature_0 = model(**example_0)

In [47]:
# Take the last entry of the model's hidden_states, keep only the first
# `save_seq_len` token positions, and convert to a NumPy array on the host.
feature_0_np = (
    feature_0.hidden_states[-1][:, :save_seq_len, :]
    .detach()
    .cpu()
    .numpy()
)
# A bare `feature_0_np.shape` in the middle of a cell is never displayed
# (only a cell's last expression is), so print it explicitly.
print(feature_0_np.shape)

np.save("./results/text_embed_example/no_consolidation.npy", feature_0_np)

In [50]:
# Positive counterpart: consolidation is described as present.
example_text_1 = 'There is extensive consolidation seen. No pulmonary nodules are noted. Bone windowed images demonstrate no lytic or blastic lesions.,No evidence of pulmonary embolus.'

# Tokenize and move each returned tensor to the GPU.
example_1 = tokenize_function(example_text_1)
for name, tensor in list(example_1.items()):
    example_1[name] = tensor.cuda()

# Inference only: no gradient tracking.
with torch.no_grad():
    feature_1 = model(**example_1)

# Last entry of hidden_states, cropped to the first `save_seq_len` positions.
feature_1_np = (
    feature_1.hidden_states[-1][:, :save_seq_len, :]
    .detach()
    .cpu()
    .numpy()
)
print(feature_1_np.shape)

np.save("./results/text_embed_example/extensive_consolidation.npy", feature_1_np)

(1, 192, 768)


In [10]:
# Second phrasing of the consolidation-positive example (saved as the "_v2" file).
example_text_1 = 'There is extensive consolidation seen. No evidence of cardiomegaly. No evidence of pleural effusion.'

# Tokenize and move each returned tensor to the GPU.
example_1 = tokenize_function(example_text_1)
for name in list(example_1.keys()):
    example_1[name] = example_1[name].cuda()

# Inference only: no gradient tracking.
with torch.no_grad():
    feature_1 = model(**example_1)

# Last entry of hidden_states, cropped to the first `save_seq_len` positions.
feature_1_np = (
    feature_1.hidden_states[-1][:, :save_seq_len, :]
    .detach()
    .cpu()
    .numpy()
)
print(feature_1_np.shape)

np.save("./results/text_embed_example/extensive_consolidation_v2.npy", feature_1_np)

(1, 192, 768)


### Effusion

In [51]:
# 512492223: 42507796
# NOTE(review): presumably source-report identifiers -- confirm.
example_text = 'There is no airspace opacity, effusion or pneumothorax. There is no evidence of suspicious pulmonary nodule or mass.'

# Tokenize and move each returned tensor to the GPU.
example = tokenize_function(example_text)
for name, tensor in list(example.items()):
    example[name] = tensor.cuda()

# Inference only: no gradient tracking.
with torch.no_grad():
    feature = model(**example)

# Last entry of hidden_states, cropped to the first `save_seq_len` positions.
feature_np = (
    feature.hidden_states[-1][:, :save_seq_len, :]
    .detach()
    .cpu()
    .numpy()
)
print(feature_np.shape)

np.save("./results/text_embed_example/no_effusion.npy", feature_np)

(1, 192, 768)


In [52]:
# Effusion-positive example.
example_text = 'There are large pleural effusions seen. There is no airspace opacity or pneumothorax. There is no evidence of suspicious pulmonary nodule or mass.'

# Tokenize and move each returned tensor to the GPU.
example = tokenize_function(example_text)
for name in list(example.keys()):
    example[name] = example[name].cuda()

# Inference only: no gradient tracking.
with torch.no_grad():
    feature = model(**example)

# Last entry of hidden_states, cropped to the first `save_seq_len` positions.
feature_np = (
    feature.hidden_states[-1][:, :save_seq_len, :]
    .detach()
    .cpu()
    .numpy()
)
print(feature_np.shape)

np.save("./results/text_embed_example/large_effusion.npy", feature_np)

(1, 192, 768)


In [9]:
# Second phrasing of the effusion-positive example (saved as the "_v2" file).
example_text = 'There are large pleural effusions seen. No evidence of cardiomegaly. No evidence of consolidation.'

# Tokenize and move each returned tensor to the GPU.
example = tokenize_function(example_text)
for name, tensor in list(example.items()):
    example[name] = tensor.cuda()

# Inference only: no gradient tracking.
with torch.no_grad():
    feature = model(**example)

# Last entry of hidden_states, cropped to the first `save_seq_len` positions.
feature_np = (
    feature.hidden_states[-1][:, :save_seq_len, :]
    .detach()
    .cpu()
    .numpy()
)
print(feature_np.shape)

np.save("./results/text_embed_example/large_effusion_v2.npy", feature_np)

(1, 192, 768)


### Bullae

In [8]:
# 511638018: 42258969
# NOTE(review): presumably source-report identifiers -- confirm.
example_text = 'No bullae, cystic lung disease, or CT findings of small airways disease.'

# Tokenize and move each returned tensor to the GPU.
example = tokenize_function(example_text)
for name in list(example.keys()):
    example[name] = example[name].cuda()

# Inference only: no gradient tracking.
with torch.no_grad():
    feature = model(**example)

# Last entry of hidden_states, cropped to the first `save_seq_len` positions.
feature_np = (
    feature.hidden_states[-1][:, :save_seq_len, :]
    .detach()
    .cpu()
    .numpy()
)
print(feature_np.shape)

np.save("./results/text_embed_example/no_bullae.npy", feature_np)

(1, 192, 768)


In [9]:
# 456867906: 49263717
# NOTE(review): presumably source-report identifiers -- confirm.
example_text = 'Emphysema is present with bullae. No cystic lung disease, or CT findings of small airways disease.'

# Tokenize and move each returned tensor to the GPU.
example = tokenize_function(example_text)
for name, tensor in list(example.items()):
    example[name] = tensor.cuda()

# Inference only: no gradient tracking.
with torch.no_grad():
    feature = model(**example)

# Last entry of hidden_states, cropped to the first `save_seq_len` positions.
feature_np = (
    feature.hidden_states[-1][:, :save_seq_len, :]
    .detach()
    .cpu()
    .numpy()
)
print(feature_np.shape)

np.save("./results/text_embed_example/with_bullae.npy", feature_np)

(1, 192, 768)


### Cardiomegaly

In [10]:
# 573143138: 40519238
# NOTE(review): presumably source-report identifiers -- confirm.
example_text = 'No cardiomegaly demonstrated.'

# Tokenize and move each returned tensor to the GPU.
example = tokenize_function(example_text)
for name in list(example.keys()):
    example[name] = example[name].cuda()

# Inference only: no gradient tracking.
with torch.no_grad():
    feature = model(**example)

# Last entry of hidden_states, cropped to the first `save_seq_len` positions.
feature_np = (
    feature.hidden_states[-1][:, :save_seq_len, :]
    .detach()
    .cpu()
    .numpy()
)
print(feature_np.shape)

np.save("./results/text_embed_example/no_cardiomegaly.npy", feature_np)

(1, 192, 768)


In [11]:
# 502156754: 42759222
# NOTE(review): presumably source-report identifiers -- confirm.
example_text = 'There is cardiomegaly.'

# Tokenize and move each returned tensor to the GPU.
example = tokenize_function(example_text)
for name, tensor in list(example.items()):
    example[name] = tensor.cuda()

# Inference only: no gradient tracking.
with torch.no_grad():
    feature = model(**example)

# Last entry of hidden_states, cropped to the first `save_seq_len` positions.
feature_np = (
    feature.hidden_states[-1][:, :save_seq_len, :]
    .detach()
    .cpu()
    .numpy()
)
print(feature_np.shape)

np.save("./results/text_embed_example/with_cardiomegaly.npy", feature_np)

(1, 192, 768)


In [9]:
# Cardiomegaly-negative sentence embedded among other findings ("_v2" file).
example_text = 'There is no significant mediastinal lymphadenopathy. There is no cardiomegaly demonstrated. The visualized upper abdominal organs are unremarkable. There is minimal perihepatic free fluid.'

# Tokenize and move each returned tensor to the GPU.
example = tokenize_function(example_text)
for name in list(example.keys()):
    example[name] = example[name].cuda()

# Inference only: no gradient tracking.
with torch.no_grad():
    feature = model(**example)

# Last entry of hidden_states, cropped to the first `save_seq_len` positions.
feature_np = (
    feature.hidden_states[-1][:, :save_seq_len, :]
    .detach()
    .cpu()
    .numpy()
)
print(feature_np.shape)

np.save("./results/text_embed_example/no_cardiomegaly_v2.npy", feature_np)

(1, 192, 768)


In [8]:
# 506275596
# NOTE(review): presumably a source-report identifier -- confirm.
example_text = 'There is no significant mediastinal lymphadenopathy. There is moderate cardiomegaly. The visualized upper abdominal organs are unremarkable. There is minimal perihepatic free fluid.'

# Tokenize and move each returned tensor to the GPU.
example = tokenize_function(example_text)
for name, tensor in list(example.items()):
    example[name] = tensor.cuda()

# Inference only: no gradient tracking.
with torch.no_grad():
    feature = model(**example)

# Last entry of hidden_states, cropped to the first `save_seq_len` positions.
feature_np = (
    feature.hidden_states[-1][:, :save_seq_len, :]
    .detach()
    .cpu()
    .numpy()
)
print(feature_np.shape)

np.save("./results/text_embed_example/with_cardiomegaly_v2.npy", feature_np)

(1, 192, 768)


In [8]:
# 502156754
# NOTE(review): presumably a source-report identifier -- confirm.
# Full-length report containing "There is cardiomegaly.". The text has one line
# break, written here as an explicit \n escape: a single-quoted literal cannot
# span two physical source lines.
example_text = 'Lungs: There is pulmonary fibrosis present with honeycombing demonstrated involving a significant portion of the bilateral lower lung zones. There are small areas of honeycombing in the anterior lateral right lung apex. There is no focal consolidation or pulmonary mass lesion demonstrated. There are a few areas of groundglass opacity in the right lung present, but the major finding is the pulmonary fibrosis that is most significant in the basilar regions. Pleural spaces:  There are small bilateral pleural effusions. There are no areas of significant pleural thickening demonstrated. Mediastinum and Lymph Nodes:  There is no pathologic adenopathy demonstrated in the axilla, mediastinum, or hilar regions. Heart and vascular structures: There is cardiomegaly. There is a cardiac pacemaker in place. There are small areas of atherosclerotic disease present, likely to include in the coronary arteries. There is no aneurysmal dilatation of the thoracic aorta. There is no significant pericardial effusion. Esophagus and visualized portion of the gastrointestinal tract: There is a mildly dilated partially fluid-filled distal esophagus. This could represent a hiatal hernia. Osseous structures and chest wall:  Unremarkable without acute or significant non-degenerative abnormalities. Visualized portion of the lower neck:  No major abnormalities are demonstrated in the portion of the neck included on this chest CT scan. Visualized portion of the upper abdomen:  There is ascites present with a small amount of fluid surrounding the liver and spleen. There is significant pulmonary fibrosis present, with large areas of honeycombing present involving both lower lung zones. There are milder areas of abnormality in the subpleural right greater than left upper lung zones. There are mild areas of groundglass opacity in the right upper lung zone, but there is no consolidation in the lungs. There are no suspicious pulmonary mass lesions. \nThere are very small bilateral pleural effusions. There is at least mild ascites seen in the portion of the upper abdomen included on this exam.'

# Tokenize and move each returned tensor to the GPU.
example = tokenize_function(example_text)
for name in list(example.keys()):
    example[name] = example[name].cuda()

# Inference only: no gradient tracking.
with torch.no_grad():
    feature = model(**example)

# Last entry of hidden_states, cropped to the first `save_seq_len` positions.
# NOTE(review): this report is long; if it tokenizes to more than
# `save_seq_len` tokens, the tail is not part of the saved array -- confirm
# that is intended.
feature_np = feature.hidden_states[-1][:, :save_seq_len, :].detach().cpu().numpy()
print(feature_np.shape)

np.save("./results/text_embed_example/with_cardiomegaly_v3.npy", feature_np)

(1, 192, 768)


In [9]:
# Full-length report containing "There is no cardiomegaly.". The text has one
# line break, written here as an explicit \n escape: a single-quoted literal
# cannot span two physical source lines.
example_text = 'Lungs: There is pulmonary fibrosis present with honeycombing demonstrated involving a significant portion of the bilateral lower lung zones. There are small areas of honeycombing in the anterior lateral right lung apex. There is no focal consolidation or pulmonary mass lesion demonstrated. There are a few areas of groundglass opacity in the right lung present, but the major finding is the pulmonary fibrosis that is most significant in the basilar regions. Pleural spaces:  There are small bilateral pleural effusions. There are no areas of significant pleural thickening demonstrated. Mediastinum and Lymph Nodes:  There is no pathologic adenopathy demonstrated in the axilla, mediastinum, or hilar regions. Heart and vascular structures: There is no cardiomegaly. There is a cardiac pacemaker in place. There are small areas of atherosclerotic disease present, likely to include in the coronary arteries. There is no aneurysmal dilatation of the thoracic aorta. There is no significant pericardial effusion. Esophagus and visualized portion of the gastrointestinal tract: There is a mildly dilated partially fluid-filled distal esophagus. This could represent a hiatal hernia. Osseous structures and chest wall:  Unremarkable without acute or significant non-degenerative abnormalities. Visualized portion of the lower neck:  No major abnormalities are demonstrated in the portion of the neck included on this chest CT scan. Visualized portion of the upper abdomen:  There is ascites present with a small amount of fluid surrounding the liver and spleen. There is significant pulmonary fibrosis present, with large areas of honeycombing present involving both lower lung zones. There are milder areas of abnormality in the subpleural right greater than left upper lung zones. There are mild areas of groundglass opacity in the right upper lung zone, but there is no consolidation in the lungs. There are no suspicious pulmonary mass lesions. \nThere are very small bilateral pleural effusions. There is at least mild ascites seen in the portion of the upper abdomen included on this exam.'

# Tokenize and move each returned tensor to the GPU.
example = tokenize_function(example_text)
for name in list(example.keys()):
    example[name] = example[name].cuda()

# Inference only: no gradient tracking.
with torch.no_grad():
    feature = model(**example)

# Last entry of hidden_states, cropped to the first `save_seq_len` positions.
# NOTE(review): this report is long; if it tokenizes to more than
# `save_seq_len` tokens, the tail is not part of the saved array -- confirm
# that is intended.
feature_np = feature.hidden_states[-1][:, :save_seq_len, :].detach().cpu().numpy()
print(feature_np.shape)

np.save("./results/text_embed_example/no_cardiomegaly_v3.npy", feature_np)

(1, 192, 768)


In [8]:
# Short cardiomegaly-positive example with two explicit negatives ("_v4" file).
example_text = 'There is cardiomegaly. No evidence of pleural effusion. No evidence of consolidation.'

# Tokenize and move each returned tensor to the GPU.
example = tokenize_function(example_text)
for name, tensor in list(example.items()):
    example[name] = tensor.cuda()

# Inference only: no gradient tracking.
with torch.no_grad():
    feature = model(**example)

# Last entry of hidden_states, cropped to the first `save_seq_len` positions.
feature_np = (
    feature.hidden_states[-1][:, :save_seq_len, :]
    .detach()
    .cpu()
    .numpy()
)
print(feature_np.shape)

np.save("./results/text_embed_example/with_cardiomegaly_v4.npy", feature_np)

(1, 192, 768)
