In [2]:

# Replicating https://medium.com/analytics-vidhya/automatic-medical-report-generation-from-x-ray-images-through-ai-fd04de21e0e5
# Authors
# Xiaoning Zhu and Miguel Xochicale

import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import xml.etree.ElementTree as ET
import lxml

from bs4 import BeautifulSoup #Beautiful Soup is a Python library for pulling data out of HTML and XML files. 
import re


import nltk #Natural Language Toolkit https://www.nltk.org/
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')



[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mxochicale/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/mxochicale/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:


###########################################
## CHANGE THIS PATHS BASED ON YOUR SYSTEM
HOME_PATH = os.path.expanduser(f'~')
USERNAME = os.path.split(HOME_PATH)[1]
REPOSITORY_PATH='repositories/budai4medtech/amir/'
FULL_REPO_PATH = HOME_PATH+'/'+REPOSITORY_PATH
FULL_DATASET_PATH = FULL_REPO_PATH  + 'datasets/'

print(f'FULL_DATASET_PATH: {FULL_DATASET_PATH}' )


FULL_DATASET_PATH: /home/mxochicale/repositories/budai4medtech/amir/datasets/


In [None]:
indication_list=[]
finding_list=[]
impression_list=[]
image_list=[]
df=pd.DataFrame()


# dir = "/content/drive/MyDrive/Colab Notebooks/dissertation/ecgen-radiology"
dir = FULL_DATASET_PATH + 'ecgen-radiology'

for file in tqdm(os.listdir(dir)):
  if file.endswith(".xml"):
    file_name= dir + "/" +file
    tree=ET.parse(file_name)
    indication=tree.find(".//AbstractText[@Label='INDICATION']").text
    finding = tree.find(".//AbstractText[@Label='FINDINGS']").text
    impression = tree.find(".//AbstractText[@Label='IMPRESSION']").text
    indication_list.append(indication)
    finding_list.append(finding)
    impression_list.append(impression)
    image=set()
    for img in tree.findall("parentImage"):
      img_list = img.attrib['id']+".png"
      image.add(img_list)
    image_list.append(image)
    df=df.append(pd.Series([",".join(image)],index=["images"]),ignore_index=True)

df["indication"]=indication_list
df["finding"]=finding_list
df["impression"]=impression_list
df.to_csv("Extract_dataset.csv",index=False)

In [5]:
df

Unnamed: 0,images,indication,finding,impression
0,"CXR4_IM-2050-2001.png,CXR4_IM-2050-1001.png",XXXX-year-old XXXX with XXXX.,There are diffuse bilateral interstitial and a...,1. Bullous emphysema and interstitial fibrosis...
1,"CXR3265_IM-1551-1001.png,CXR3265_IM-1551-2001.png",XXXX-year-old female with XXXX,The heart size and mediastinal contours appear...,No acute cardiopulmonary findings.
2,"CXR3044_IM-1418-2001.png,CXR3044_IM-1418-1001.png",Difficulty breathing.,The XXXX examination consists of frontal and l...,No evidence of acute cardiopulmonary process.
3,"CXR2937_IM-1339-1001.png,CXR2937_IM-1339-2001.png",XXXX-year-old male complaining of pain,The heart size and mediastinal contours appear...,No acute cardiopulmonary findings.
4,"CXR1222_IM-0150-2001.png,CXR1222_IM-0150-1001.png",chest pain,The heart and lungs have XXXX XXXX in the inte...,No active disease.
...,...,...,...,...
3950,CXR1769_IM-0503-1001.png,Patient with dyspnea and XXXX,Consolidation and some atelectasis are present...,"Bibasilar airspace disease, left worse right. ..."
3951,CXR2705_IM-1171-1001.png,History of dyspnea,,Heart size is normal and the lungs are clear. ...
3952,"CXR2545_IM-1054-1002.png,CXR2545_IM-1054-1001.png",XXXX.,The trachea is midline. The cardiomediastinal ...,Normal chest x-XXXX.
3953,"CXR3404_IM-1647-2001.png,CXR3404_IM-1647-1001.png",chronic XXXX; left mastectomy,The heart is normal in size. The mediastinum i...,No acute disease.


In [6]:
#check any null in image 
Extract_df=pd.read_csv("Extract_dataset.csv")
null=pd.isnull(Extract_df["images"])
df=Extract_df[null]
print("The number of Nan Values in Images columns",len(df))
df

The number of Nan Values in Images columns 104


Unnamed: 0,images,indication,finding,impression
50,,The patient is a XXXX year-old female with ICD...,ICD XXXX tip remains in the right ventricle. S...,1. ICD XXXX tip overlying the right ventricle....
144,,Pain and difficulty breathing,Stable left-sided ICD and postsurgical changes...,1. No acute radiographic cardiopulmonary process.
179,,Leg weakness.,There is a right pleural effusion which appear...,Loculated appearing right pleural effusion and...
181,,XXXX year old female with left lower lobe infi...,"The lungs are clear bilaterally. Specifically,...",No acute cardiopulmonary abnormality..
228,,XXXX-year-old with chest pain and nausea.,"The heart, pulmonary XXXX and mediastinum are ...",No acute cardiopulmonary disease.
...,...,...,...,...
3719,,"XXXX's syndrome, post surgery for XXXX XXXX, c...",Status post posterior spinal fusion. Status po...,1. Stable position of the cardiac XXXX and lea...
3754,,Chest pain and shortness of breath,Mild cardiomegaly. The lungs are clear bilater...,No acute cardiopulmonary abnormalities.
3842,,"Dyspnea, shortness of breath, abdominal pain.",The cardiomediastinal silhouette and pulmonary...,No acute cardiopulmonary findings. .
3869,,"XXXX-year-old female, XXXX, preop knee surgery.",The cardiomediastinal silhouette is within nor...,No acute cardiopulmonary process.


In [7]:
#TO Removing 104 patient record contain without images.
# df = df[df['images'].notna()]
# df=df.dropna(subset=['images'])

## The following remove NAN rows from any column
Extract_df=pd.read_csv("Extract_dataset.csv")
Extract_df = Extract_df.dropna()
Extract_df

Unnamed: 0,images,indication,finding,impression
0,"CXR4_IM-2050-2001.png,CXR4_IM-2050-1001.png",XXXX-year-old XXXX with XXXX.,There are diffuse bilateral interstitial and a...,1. Bullous emphysema and interstitial fibrosis...
1,"CXR3265_IM-1551-1001.png,CXR3265_IM-1551-2001.png",XXXX-year-old female with XXXX,The heart size and mediastinal contours appear...,No acute cardiopulmonary findings.
2,"CXR3044_IM-1418-2001.png,CXR3044_IM-1418-1001.png",Difficulty breathing.,The XXXX examination consists of frontal and l...,No evidence of acute cardiopulmonary process.
3,"CXR2937_IM-1339-1001.png,CXR2937_IM-1339-2001.png",XXXX-year-old male complaining of pain,The heart size and mediastinal contours appear...,No acute cardiopulmonary findings.
4,"CXR1222_IM-0150-2001.png,CXR1222_IM-0150-1001.png",chest pain,The heart and lungs have XXXX XXXX in the inte...,No active disease.
...,...,...,...,...
3949,"CXR2546_IM-1055-2001.png,CXR2546_IM-1055-1001.png","Chest pain, pressure",The heart size and pulmonary vascularity appea...,Tortuous thoracic aorta. Clear lungs.
3950,CXR1769_IM-0503-1001.png,Patient with dyspnea and XXXX,Consolidation and some atelectasis are present...,"Bibasilar airspace disease, left worse right. ..."
3952,"CXR2545_IM-1054-1002.png,CXR2545_IM-1054-1001.png",XXXX.,The trachea is midline. The cardiomediastinal ...,Normal chest x-XXXX.
3953,"CXR3404_IM-1647-2001.png,CXR3404_IM-1647-1001.png",chronic XXXX; left mastectomy,The heart is normal in size. The mediastinum i...,No acute disease.


In [6]:
def decontraction(doc):
    docs = re.sub(r"won't", "will not", doc)
    docs = re.sub(r"can\'t", "can not", docs)
    docs = re.sub(r"n\'t", " not", docs)
    docs = re.sub(r"\'re", " are", docs)
    docs = re.sub(r"\'s", " is", docs)
    docs = re.sub(r"\'d", " would", docs)
    docs = re.sub(r"\'ll", " will", docs)
    docs = re.sub(r"\'t", " not", docs)
    docs = re.sub(r"\'ve", " have", docs)
    docs = re.sub(r"\'m", " am",docs)
    docs = re.sub(r"n\'t", " not", docs)
    docs = re.sub(r"\'re", " are", docs)
    docs = re.sub(r"\'s", " is", docs)
    docs = re.sub(r"\'d", " would", docs)
    docs = re.sub(r"\'ll", " will", docs)
    docs = re.sub(r"\'t", " not", docs)
    docs = re.sub(r"\'ve", " have",docs)
    docs = re.sub(r"\'m", " am", docs)
    docs = re.sub(r"\*+", "abuse", docs)
    return docs

def remove_stopwords(data):
  all_stopwords = stopwords.words('english')
  all_stopwords.remove('not')# removing the "not" from the stopwords
  all_stopwords.remove('no')
  text_tokens = word_tokenize(data) # tokenizing the sentence into word
  tokens_without_sw = [word for word in text_tokens if not word in all_stopwords]
  return " ".join(tokens_without_sw)

def text_preprocess(data):
    clean_text=[]
    for i in (data.values):
        text = BeautifulSoup(i, 'lxml').get_text()
        text=text.lower()# making the text in lower case
        text=re.sub(r"[|\'||\\||/|?|$|.|.|,||;|><!&)(:`\n\t@#=+%^*~-]","",text)# removing the special character in text
        text=re.sub(r"xxxx","",text)# Removing the irrelavent XXXX words
        text=re.sub(r'xxx',"",text)
        text=re.sub(r'xx',"",text)
        text=re.sub(r'[0-9]',"",text)# Removing the number in text
        text = re.sub(r"yearold", "", text)
        text= re.sub('\s+',' ',text)
        text=re.sub("'",'',text)# Removing apoustahe
        text=re.sub(r'\.\.+', '.',text)# Removing multiple full stop
        text=re.sub(r'  ',"",text)
        text=decontraction(text)
        text=remove_stopwords(text)
        clean_text.append(text)        
    return clean_text

In [7]:
Extract_df["indication"]=text_preprocess(Extract_df["indication"])
Extract_df["finding"]=text_preprocess(Extract_df["finding"])
Extract_df["impression"]=text_preprocess(Extract_df["impression"])   

  text=re.sub(r"[|\'||\\||/|?|$|.|.|,||;|><!&)(:`\n\t@#=+%^*~-]","",text)# removing the special character in text
  text=re.sub(r"[|\'||\\||/|?|$|.|.|,||;|><!&)(:`\n\t@#=+%^*~-]","",text)# removing the special character in text
  text=re.sub(r"[|\'||\\||/|?|$|.|.|,||;|><!&)(:`\n\t@#=+%^*~-]","",text)# removing the special character in text
  text = BeautifulSoup(i, 'lxml').get_text()


In [8]:
Extract_df

Unnamed: 0,images,indication,finding,impression
0,"CXR4_IM-2050-2001.png,CXR4_IM-2050-1001.png",,diffuse bilateral interstitial alveolar opacit...,bullous emphysema interstitial fibrosis probab...
1,"CXR3265_IM-1551-2001.png,CXR3265_IM-1551-1001.png",female,heart size mediastinal contours appear within ...,no acute cardiopulmonary findings
2,"CXR3044_IM-1418-2001.png,CXR3044_IM-1418-1001.png",difficulty breathing,examination consists frontal lateral radiograp...,no evidence acute cardiopulmonary process
3,"CXR2937_IM-1339-1001.png,CXR2937_IM-1339-2001.png",male complaining pain,heart size mediastinal contours appear within ...,no acute cardiopulmonary findings
4,"CXR1222_IM-0150-1001.png,CXR1222_IM-0150-2001.png",chest pain,heart lungs interval lungs clear expanded hear...,no active disease
...,...,...,...,...
3949,"CXR2546_IM-1055-1001.png,CXR2546_IM-1055-2001.png",chest pain pressure,heart size pulmonary vascularity appear within...,tortuous thoracic aorta clear lungs
3950,CXR1769_IM-0503-1001.png,patient dyspnea,consolidation atelectasis present left lower l...,bibasilar airspace disease left worse right bi...
3952,"CXR2545_IM-1054-1002.png,CXR2545_IM-1054-1001.png",,trachea midline cardiomediastinal silhouette n...,normal chest x
3953,"CXR3404_IM-1647-1001.png,CXR3404_IM-1647-2001.png",chronic left mastectomy,heart normal size mediastinum stable postsurgi...,no acute disease


In [9]:
indication_reports=pd.Series(Extract_df["indication"].tolist())
len_indication=len(indication_reports)
unique_indication_report=len(np.unique(indication_reports))
print("no. of unique indication report in the dataset:{}".format(unique_indication_report))
repeated_indication_report=np.sum(indication_reports.value_counts()>1)
print("No. of indication report repeated more than one times:{}".format(repeated_indication_report))


no. of unique indication report in the dataset:1684
No. of indication report repeated more than one times:177


In [10]:
finding_reports=pd.Series(Extract_df["finding"].tolist())
len_finding=len(finding_reports)
unique_finding_report=len(np.unique(finding_reports))
print("no. of unique finding report in the dataset:{}".format(unique_finding_report))
repeated_finding_report=np.sum(finding_reports.value_counts()>1)
print("No. of finding report repeated more than one times:{}".format(repeated_finding_report))

no. of unique finding report in the dataset:2483
No. of finding report repeated more than one times:142


In [11]:
impression_reports=pd.Series(Extract_df["impression"].tolist())
len_finding=len(impression_reports)
unique_impression_report=len(np.unique(impression_reports))
print("no. of unique finding report in the dataset:{}".format(unique_impression_report))
repeated_impression_report=np.sum(impression_reports.value_counts()>1)
print("No. of finding report repeated more than one times:{}".format(repeated_impression_report))


no. of unique finding report in the dataset:1254
No. of finding report repeated more than one times:104
