# Extraccion de informacion de reportes medicos

El objetivo de este proyecto es extraer desde un reporte medico que contiene texto libre, informacion del paciente, informacion de la medicacion del paciente, y una imagen de un radiografia de torax, lo siguiente:

- Texto del reporte de-identificado (sin informacion personal del paciente)
- Medicamentos con los que es tratado el paciente
- Prediccion de si en la radiografia es normal o presenta neumonia

Documento a analizar: [Reporte](https://drive.google.com/file/d/15zA5XDQfrVIYZk0gHGLxQqt_up7-NjuP/view?usp=sharing)

1) De-identificar texto (quitar informacion personal)

2) Obtener medicamentos del paciente

3) Predecir si la radiografia corresponde a un paciente normal o con neumonia

In [None]:
!pip install PyPDF
!pip install PyPDF2
!pip install pdf2image
!sudo apt install tesseract-ocr 
!pip install pytesseract 
!apt-get install poppler-utils 
! pip install pdfminer
!pip install Image
!pip install PyMuPDF Pillow
! pip install tensorflow_addons 

In [None]:
import os
import PyPDF2
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import fitz
import pandas as pd
import re
import os
import random
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_addons as tfa
import matplotlib.pyplot as plt
import tensorflow as tf
import torch

In [33]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [35]:
report_dir = '/content/drive/MyDrive/Transformers_Course'

In [None]:
path = os.path.join(report_dir, 'Report.pdf')
PDFfile = open(path, 'rb')

PDFfilereader = PyPDF2.PdfFileReader(PDFfile)

pages = PDFfilereader.getPage(0)

#extracting the text in PDF file
print(pages.extractText())

#close the PDF file
PDFfile.close()




In [None]:
pages = convert_from_path(path)

In [None]:
for page in pages:
    page.save(os.path.join(report_dir, 'Report.png'), 'PNG')

In [None]:
extract = pytesseract.image_to_string(Image.open(os.path.join(report_dir, 'Report.png')))

In [22]:
print(extract)

['Patient Medical Report', '', 'Name: Matias Molinas', 'Birth date: 10/03/1918', '(54) 342- 5678906', 'Significant Medical History', '', 'Mr. Matias Molinas is a 52 y/o accountant with hypercholesterolemia and polycythemia vera', 'who has been in relatively good health.', '', 'Diagnostic: Patient needs to take 50 mg of ibuprofen.', '', 'Chest X-Ray:', '', ' ', '\x0c']


In [None]:
file = path

#open the fitz file
pdf = fitz.open(file)

#select the page number
image_list = pdf.get_page_images(0)

#applying the loop
for image in image_list:
   xref = image[0]
   path_2= os.path.join(report_dir, str(xref))
   pix = fitz.Pixmap(pdf, xref)
   if pix.n < 5:
       pix.save(f'{path_2}.png')
   else:
       pix1 = fitz.open(fitz.csRGB, pix)
       pix1.save(f'{xref}.png')
       pix1 = None
   pix = None


In [None]:
extract=extract.split('\n')

In [None]:
l=list()
for i in extract:
  if i!='' and i!=' ':
    l.append(i)
l


['Patient Medical Report',
 'Name: Matias Molinas',
 'Birth date: 10/03/1918',
 '(54) 342- 5678906',
 'Significant Medical History',
 'Mr. Matias Molinas is a 52 y/o accountant with hypercholesterolemia and polycythemia vera',
 'who has been in relatively good health.',
 'Diagnostic: Patient needs to take 50 mg of ibuprofen.',
 'Chest X-Ray:',
 '\x0c']

In [None]:
sequence = ". ".join(extract)

In [None]:
sequence

# 1) De-identificar texto (quitar informacion personal)

In [None]:
! pip install transformers datasets

In [None]:
from transformers import TFAutoModelForTokenClassification,AutoTokenizer,BertForTokenClassification

In [26]:
model = BertForTokenClassification.from_pretrained("connorboyle/bert-ner-i2b2")
tokenizer = AutoTokenizer.from_pretrained("connorboyle/bert-ner-i2b2")

Downloading:   0%|          | 0.00/2.65k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411M [00:00<?, ?B/s]

In [27]:
tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
inputs = tokenizer.encode(sequence, return_tensors="pt")
outputs = model(inputs)[0]
predictions = torch.argmax(outputs, axis=2)

In [None]:
for token, prediction in zip(tokens, predictions[0].numpy()):
  print((token, model.config.id2label[prediction]))

# Guardo información personal del paciente

In [30]:
df = pd.DataFrame(columns=["Word","Classification"])
token_list = []
prediction_list = []
for token, prediction in zip(tokens, predictions[0].numpy()):
  token_list.append(token)
  prediction_list.append(model.config.id2label[prediction])

In [31]:
df["Word"] = token_list
df["Classification"] = prediction_list

In [32]:
df.loc[df["Classification"]=='O',"Classification"] = "Not sensitive data"

In [None]:
df.head(20)

# 2) Obtener medicamentos del paciente

In [34]:
model = BertForTokenClassification.from_pretrained("samrawal/bert-large-uncased_med-ner")
tokenizer = AutoTokenizer.from_pretrained("samrawal/bert-large-uncased_med-ner")

Downloading:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.24G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/301 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [35]:
tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
inputs = tokenizer.encode(sequence, return_tensors="pt")
outputs = model(inputs)[0]
predictions = torch.argmax(outputs, axis=2)

In [None]:
for token, prediction in zip(tokens, predictions[0].numpy()):
  print((token, model.config.id2label[prediction]))

In [37]:
df_2 = pd.DataFrame(columns=["Word","Classification"])
token_list = []
prediction_list = []
for token, prediction in zip(tokens, predictions[0].numpy()):
  token_list.append(token)
  prediction_list.append(model.config.id2label[prediction])

In [38]:
df_2["Word"] = token_list
df_2["Classification"] = prediction_list

# 3) Predecir si la radiografia corresponde a un paciente normal o con neumonia

In [1]:
#! pip install -q kaggle

In [2]:
#from google.colab import files

#files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"catalinacarenzo","key":"959afb699038c9860bc0c775d95b1c51"}'}

In [3]:
#!mkdir ~/.kaggle


In [4]:
#! cp kaggle.json ~/.kaggle/

In [5]:
#! chmod 600 ~/.kaggle/kaggle.json

In [None]:
#! kaggle datasets list

In [None]:
#! kaggle competitions download -c siim-covid19-detection -p /content/drive/MyDrive/Transformers_Course


In [3]:
num_classes = 2
input_shape = (180, 180, 3)

train_images = tf.data.TFRecordDataset(
    "gs://download.tensorflow.org/data/ChestXRay2017/train/images.tfrec"
)
train_paths = tf.data.TFRecordDataset(
    "gs://download.tensorflow.org/data/ChestXRay2017/train/paths.tfrec"
)

ds = tf.data.Dataset.zip((train_images, train_paths))

In [7]:
COUNT_NORMAL = len(
    [
        filename
        for filename in train_paths
        if "NORMAL" in filename.numpy().decode("utf-8")
    ]
)
print("Normal images count in training set: " + str(COUNT_NORMAL))

COUNT_PNEUMONIA = len(
    [
        filename
        for filename in train_paths
        if "PNEUMONIA" in filename.numpy().decode("utf-8")
    ]
)
print("Pneumonia images count in training set: " + str(COUNT_PNEUMONIA))

Normal images count in training set: 1349
Pneumonia images count in training set: 3883


In [10]:
AUTOTUNE = tf.data.AUTOTUNE
IMAGE_SIZE = [180, 180]
CLASS_NAMES = ["NORMAL", "PNEUMONIA"]

In [11]:
def get_label(file_path):
    # convert the path to a list of path components
    parts = tf.strings.split(file_path, "/")
    # The second to last is the class-directory
    return parts[-2] == "PNEUMONIA"


def decode_img(img):
    # convert the compressed string to a 3D uint8 tensor
    img = tf.image.decode_jpeg(img, channels=3)
    # resize the image to the desired size.
    return tf.image.resize(img, IMAGE_SIZE)


def process_path(image, path):
    label = get_label(path)
    # load the raw data from the file as a string
    img = decode_img(image)
    return img, label


ds = ds.map(process_path, num_parallel_calls=AUTOTUNE)

In [13]:
ds

<ParallelMapDataset shapes: ((180, 180, 3), ()), types: (tf.float32, tf.bool)>

In [14]:
def get_dataset_partitions_tf(ds, ds_size, train_split=0.8, val_split=0.1, test_split=0.1, shuffle=True, shuffle_size=10000):
    assert (train_split + test_split + val_split) == 1
    
    if shuffle:
        # Specify seed to always have the same split distribution between runs
        ds = ds.shuffle(shuffle_size, seed=12)
    
    train_size = int(train_split * ds_size)
    val_size = int(val_split * ds_size)
    
    train_ds = ds.take(train_size)    
    val_ds = ds.skip(train_size).take(val_size)
    test_ds = ds.skip(train_size).skip(val_size)
    
    return train_ds, val_ds, test_ds

In [15]:
train_ds, val_ds, test_ds = get_dataset_partitions_tf(ds,COUNT_NORMAL + COUNT_PNEUMONIA) 

In [16]:
for image, label in train_ds.take(1):
    print("Image shape: ", image.numpy().shape)
    print("Label: ", label.numpy())

Image shape:  (180, 180, 3)
Label:  True


In [17]:
BATCH_SIZE = 128

In [18]:
def prepare_for_training(ds, cache=True):
    # This is a small dataset, only load it once, and keep it in memory.
    # use `.cache(filename)` to cache preprocessing work for datasets that don't
    # fit in memory.
    if cache:
        if isinstance(cache, str):
            ds = ds.cache(cache)
        else:
            ds = ds.cache()

    ds = ds.batch(BATCH_SIZE)

    # `prefetch` lets the dataset fetch batches in the background while the model
    # is training.
    ds = ds.prefetch(buffer_size=AUTOTUNE)

    return ds

In [19]:
train_ds = prepare_for_training(train_ds)
val_ds = prepare_for_training(val_ds)
test_ds = prepare_for_training(test_ds)


image_batch, label_batch = next(iter(train_ds))

In [21]:
def show_batch(image_batch, label_batch):
    plt.figure(figsize=(10, 10))
    for n in range(25):
        ax = plt.subplot(5, 5, n + 1)
        plt.imshow(image_batch[n] / 255)
        if label_batch[n]:
            plt.title("PNEUMONIA")
        else:
            plt.title("NORMAL")
        plt.axis("off")

In [20]:
learning_rate = 0.001
weight_decay = 0.0001
batch_size = BATCH_SIZE
num_epochs = 100
image_size = 180  
patch_size = 18  # Size of the patches to be extract from the input images
num_patches = (image_size // patch_size) ** 2
projection_dim = 128
num_heads = 6
transformer_units = [
    projection_dim,
    projection_dim,
]  # Size of the transformer layers
transformer_layers = 2
mlp_head_units = [projection_dim * 4, projection_dim * 2]  # Size of the dense layers of the final classifier

In [22]:
data_augmentation = keras.Sequential(
    [
        layers.experimental.preprocessing.Rescaling(scale=1./255),
        layers.experimental.preprocessing.RandomCrop(image_size, image_size),
        layers.experimental.preprocessing.RandomFlip("horizontal")
    ],
    name="data_augmentation"
)

In [23]:
def mlp(x, hidden_units, dropout_rate):
    for units in hidden_units:
        x = layers.Dense(units, activation=tf.nn.gelu)(x)
        x = layers.Dropout(dropout_rate)(x)
    return x

In [24]:
class Patches(layers.Layer):
    def __init__(self, patch_size):
        super(Patches, self).__init__()
        self.patch_size = patch_size

    def call(self, images):
        batch_size = tf.shape(images)[0]
        patches = tf.image.extract_patches(
            images=images,
            sizes=[1, self.patch_size, self.patch_size, 1],
            strides=[1, self.patch_size, self.patch_size, 1],
            rates=[1, 1, 1, 1],
            padding="VALID",
        )
        patch_dims = patches.shape[-1]
        patches = tf.reshape(patches, [batch_size, -1, patch_dims])
        return patches

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(18, 18))
image = image_batch[0] # / 255
plt.imshow(image.numpy().astype("uint8"))
plt.axis("off")
plt.savefig("original.png", dpi=300, bbox_inches="tight")

resized_image = tf.image.resize(
    tf.convert_to_tensor([image]), size=(image_size, image_size)
)
patches = Patches(patch_size)(resized_image)
print(f"Image size: {image_size} X {image_size}")
print(f"Patch size: {patch_size} X {patch_size}")
print(f"Patches per image: {patches.shape[1]}")
print(f"Elements per patch: {patches.shape[-1]}")

n = int(np.sqrt(patches.shape[1]))
plt.figure(figsize=(18, 18))
for i, patch in enumerate(patches[0]):
    ax = plt.subplot(n, n, i + 1)
    patch_img = tf.reshape(patch, (patch_size, patch_size, 3))
    plt.imshow(patch_img.numpy().astype("uint8"))
    plt.axis("off")
plt.savefig("patched.png", dpi=300, bbox_inches="tight")

In [26]:
class PatchEncoder(layers.Layer):
    def __init__(self, num_patches, projection_dim):
        super(PatchEncoder, self).__init__()
        self.num_patches = num_patches
        self.projection = layers.Dense(units=projection_dim)
        self.position_embedding = layers.Embedding(
            input_dim=num_patches, output_dim=projection_dim
        )

    def call(self, patch):
        positions = tf.range(start=0, limit=self.num_patches, delta=1)
        encoded = self.projection(patch) + self.position_embedding(positions)
        return encoded

In [27]:
def create_vit_classifier():
    inputs = layers.Input(shape=input_shape)
    # Augment data.
    augmented = data_augmentation(inputs)
    # Create patches.
    patches = Patches(patch_size)(augmented)
    # Encode patches.
    encoded_patches = PatchEncoder(num_patches, projection_dim)(patches)

    # Create multiple layers of the Transformer block.
    for _ in range(transformer_layers):
        # Layer normalization 1.
        x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
        # Create a multi-head attention layer.
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=projection_dim, dropout=0.1
        )(x1, x1)
        # Skip connection 1.
        x2 = layers.Add()([attention_output, encoded_patches])
        # Layer normalization 2.
        x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
        # MLP.
        x3 = mlp(x3, hidden_units=transformer_units, dropout_rate=0.1)
        # Skip connection 2.
        encoded_patches = layers.Add()([x3, x2])

    # Create a [batch_size, projection_dim] tensor.
    representation = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
    representation = layers.Flatten()(representation)
    representation = layers.Dropout(0.5)(representation)
    # Add MLP.
    features = mlp(representation, hidden_units=mlp_head_units, dropout_rate=0.5)
    # Classify outputs.
    logits = layers.Dense(num_classes)(features)
    # Create the Keras model.
    model = keras.Model(inputs=inputs, outputs=logits)
    return model


In [28]:
create_vit_classifier().count_params()/1e6

7.681922

In [29]:
initial_bias = np.log([COUNT_PNEUMONIA / COUNT_NORMAL])
print("Initial bias: {:.5f}".format(initial_bias[0]))

TRAIN_IMG_COUNT = COUNT_NORMAL + COUNT_PNEUMONIA
weight_for_0 = (1 / COUNT_NORMAL) * (TRAIN_IMG_COUNT) / 2.0
weight_for_1 = (1 / COUNT_PNEUMONIA) * (TRAIN_IMG_COUNT) / 2.0

class_weight = {0: weight_for_0, 1: weight_for_1}

print("Weight for class 0: {:.2f}".format(weight_for_0))
print("Weight for class 1: {:.2f}".format(weight_for_1))

Initial bias: 1.05724
Weight for class 0: 1.94
Weight for class 1: 0.67


In [None]:
def run_experiment(model):
    optimizer = tfa.optimizers.AdamW(
        learning_rate=learning_rate, weight_decay=weight_decay
    )

    model.compile(
        optimizer=optimizer,
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[
            keras.metrics.SparseCategoricalAccuracy(name="accuracy"),
            keras.metrics.SparseTopKCategoricalAccuracy(5, name="top-5-accuracy"),
        ],
    )

    checkpoint_cb = tf.keras.callbacks.ModelCheckpoint("xray_model.h5", save_best_only=True)

    history = model.fit(
        train_ds,
        batch_size=batch_size,
        epochs=num_epochs,
        #validation_data=val_ds,
        class_weight=class_weight,
        callbacks=[checkpoint_cb],
    )

    return history


vit_classifier = create_vit_classifier()
history = run_experiment(vit_classifier)

In [31]:
vit_classifier.evaluate(test_ds, return_dict=True)



{'accuracy': 0.9599236845970154,
 'loss': 0.09697042405605316,
 'top-5-accuracy': 1.0}

Determino si la imagen del reporte tiene neumonia. Vemos que si.

In [58]:
from skimage.io import imread
test_image = imread(os.path.join(report_dir, '5.png'))
resized_image = tf.image.resize(
    tf.convert_to_tensor([test_image]), size=(image_size, image_size)
)
print(resized_image.shape)
prediction = vit_classifier.predict(resized_image)

if prediction[0][0] > 0:
    print (CLASS_NAMES[0])
else:
    print (CLASS_NAMES[1])


(1, 180, 180, 3)
PNEUMONIA
