In [57]:
# import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [58]:
chexpert = pd.read_csv('train.csv')
nih = pd.read_csv('NIH.csv')


In [59]:
# filter by images that are not normal
chexpertlist = list(chexpert.columns[5:]) # Diseases
print(chexpertlist)
print("Number of diseases:", str(len(chexpertlist)))

['No Finding', 'Enlarged Cardiomediastinum', 'Cardiomegaly', 'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis', 'Pneumothorax', 'Pleural Effusion', 'Pleural Other', 'Fracture', 'Support Devices']
Number of diseases: 14


Due to the fact that support devices are not a disease, it may be reasonable to remove them as a disease label. In addition, No Finding isn't a disease, so it can be removed.

In [60]:
chexpertlist.remove('Support Devices')
chexpertlist.remove('No Finding')

# print hte length of the list
print(chexpertlist)
print("Number of diseases:", str(len(chexpertlist)))

['Enlarged Cardiomediastinum', 'Cardiomegaly', 'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis', 'Pneumothorax', 'Pleural Effusion', 'Pleural Other', 'Fracture']
Number of diseases: 12


In [61]:
nihlist = list(nih.drop(nih[nih['Finding Labels'].str.contains('\|')].index, inplace=False)['Finding Labels'].value_counts().index)
print(nihlist)
print("Number of diseases:", str(len(nihlist)))

['No Finding', 'Infiltration', 'Atelectasis', 'Effusion', 'Nodule', 'Pneumothorax', 'Mass', 'Consolidation', 'Pleural_Thickening', 'Cardiomegaly', 'Emphysema', 'Fibrosis', 'Edema', 'Pneumonia', 'Hernia']
Number of diseases: 15


No Finding is not a disease, so it can be removed from NIH dataset as well.

In [62]:
nihlist.remove('No Finding')

print(nihlist)
print("Number of diseases:", str(len(nihlist)))

['Infiltration', 'Atelectasis', 'Effusion', 'Nodule', 'Pneumothorax', 'Mass', 'Consolidation', 'Pleural_Thickening', 'Cardiomegaly', 'Emphysema', 'Fibrosis', 'Edema', 'Pneumonia', 'Hernia']
Number of diseases: 14


In [63]:
commondiseases = list(set(chexpertlist).intersection(nihlist))
noncommondiseases1 = list(set(chexpertlist).difference(nihlist))
noncommondiseases2 = list(set(nihlist).difference(chexpertlist))
print(commondiseases) # These diseases match up perfectly
print(noncommondiseases1)
print(noncommondiseases2)

['Edema', 'Pneumothorax', 'Consolidation', 'Cardiomegaly', 'Atelectasis', 'Pneumonia']
['Lung Opacity', 'Enlarged Cardiomediastinum', 'Pleural Effusion', 'Pleural Other', 'Lung Lesion', 'Fracture']
['Emphysema', 'Mass', 'Fibrosis', 'Hernia', 'Pleural_Thickening', 'Infiltration', 'Nodule', 'Effusion']


https://utswmed.org/conditions-treatments/pulmonary-nodules-and-lung-lesions/ 

https://www.sciencedirect.com/topics/medicine-and-dentistry/lung-lesion

- The websites above appear to consider pulmonary nodules (like those in the NIH database) to be same as a lung lesion (like those in the CheXpert database)

In [64]:
commondiseases.append('Nodule/Lung Lesion')
noncommondiseases1.remove('Lung Lesion')
noncommondiseases2.remove('Nodule')

https://radiopaedia.org/articles/enlargement-of-the-cardiac-silhouette?lang=us 

This website explains that enlargement of the cardiac silhouette, enlarged cardiomediastinum (like in the CheXpert database) is primarily caused by cardiomegaly.

In [65]:
commondiseases.append('Cardiomegaly/Enlarged Cardiomediastinum')
noncommondiseases1.remove('Enlarged Cardiomediastinum')

It is reasonable to assume that the pleural effusion label in the ChexPert database is the same as the effusion label in the NIH database.

In [66]:
commondiseases.append('Effusion/Pleural Effusion')
noncommondiseases1.remove('Pleural Effusion')
noncommondiseases2.remove('Effusion')

Since we know that Effusion in the NIH dataset and Pleural Effusion are the same. Thus, we know that Pleural Other probably refers to Pleural Thickening. -- Thevindu

In [67]:
commondiseases.append('Pleural Other/Pleural Thickening')
noncommondiseases1.remove('Pleural Other')
noncommondiseases2.remove('Pleural_Thickening')

Check in

In [68]:
# print the length of common diseases
print(commondiseases)
print("Number of common diseases:", str(len(commondiseases)))

['Edema', 'Pneumothorax', 'Consolidation', 'Cardiomegaly', 'Atelectasis', 'Pneumonia', 'Nodule/Lung Lesion', 'Cardiomegaly/Enlarged Cardiomediastinum', 'Effusion/Pleural Effusion', 'Pleural Other/Pleural Thickening']
Number of common diseases: 10


In [69]:
print(noncommondiseases1)
print(noncommondiseases2)

['Lung Opacity', 'Fracture']
['Emphysema', 'Mass', 'Fibrosis', 'Hernia', 'Infiltration']
