In [38]:
import os
import numpy as np
import pandas as pd
from glob import glob
from itertools import chain
from sklearn.metrics import roc_curve, auc, roc_auc_score, accuracy_score, average_precision_score
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator

import tensorflow as tf

In [39]:
#os.environ['CUDA_VISIBLE_DEVICES'] = '0'

DATA_DIR = 'data/'
image_size = 256
batch_size = 32


In [40]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


In [41]:
# read data to dataframe
df = pd.read_csv(f'{DATA_DIR}Data_Entry_2017.csv')
df.shape

(112120, 12)

In [42]:
df.head()

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,


In [43]:
# clean up the column names
df = df.rename(columns = {'Image Index' : 'image_index', 'Finding Labels': 'finding_labels',
                             'Follow-up #':'follow_up', 'Patient ID': 'patient_id', 'Patient Age':'patient_age',
                             'Patient Gender':'patient_gender', 'View Position':'view_position', 
                              'OriginalImage[Width':'orig_img_width', 'Height]':'orig_img_height',
                             'OriginalImagePixelSpacing[x':'pixel_spacing_x', 'y]':'pixel_spacing_y',
                             'Unnamed: 11':'unnamed_11'})

In [44]:
df.head()

Unnamed: 0,image_index,finding_labels,follow_up,patient_id,patient_age,patient_gender,view_position,orig_img_width,orig_img_height,pixel_spacing_x,pixel_spacing_y,unnamed_11
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,


Since the image paths are listed without folder names, need to add full paths

In [45]:
data_image_paths = {os.path.basename(x): x for x in glob(os.path.join(DATA_DIR, 'images*', 'images', '*png'))}

In [46]:
assert(len(data_image_paths) == df.shape[0])
data_image_paths[df['image_index'][0]]

'data\\images_001\\images\\00000001_000.png'

In [47]:
df['image_path'] = df['image_index'].map(data_image_paths.get)
del df['image_index']

Now, perform one-hot encoding for the labels that we want to classify

In [48]:
df['finding_labels'] = df['finding_labels'].map(lambda x: x.replace('No Finding', ''))

In [49]:
df = pd.concat([df, df['finding_labels'].str.get_dummies(sep='|')], axis=1)

Split the data

In [50]:
# add column to stratify on
new = df["finding_labels"].str.split('|', n=1, expand=True)
df['disease'] = new[0]
df[['finding_labels', 'disease']].head(10)

Unnamed: 0,finding_labels,disease
0,Cardiomegaly,Cardiomegaly
1,Cardiomegaly|Emphysema,Cardiomegaly
2,Cardiomegaly|Effusion,Cardiomegaly
3,,
4,Hernia,Hernia
5,Hernia,Hernia
6,Hernia,Hernia
7,Hernia|Infiltration,Hernia
8,Hernia,Hernia
9,Hernia,Hernia


In [51]:
train_df, valid_df = train_test_split(df, test_size=0.2, random_state = 2020, stratify=df['disease'])

In [52]:
diseases = df.disease.unique().tolist()
diseases.remove('')
diseases

['Cardiomegaly',
 'Hernia',
 'Mass',
 'Infiltration',
 'Effusion',
 'Nodule',
 'Emphysema',
 'Atelectasis',
 'Pleural_Thickening',
 'Pneumothorax',
 'Fibrosis',
 'Consolidation',
 'Edema',
 'Pneumonia']

In [53]:
train_df.head()

Unnamed: 0,finding_labels,follow_up,patient_id,patient_age,patient_gender,view_position,orig_img_width,orig_img_height,pixel_spacing_x,pixel_spacing_y,...,Emphysema,Fibrosis,Hernia,Infiltration,Mass,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax,disease
108340,Effusion,3,29358,60,F,PA,2021,2021,0.194311,0.194311,...,0,0,0,0,0,0,0,0,0,Effusion
14801,,1,3866,51,F,PA,2048,2500,0.171,0.171,...,0,0,0,0,0,0,0,0,0,
96793,Mass,2,25518,41,M,PA,2992,2991,0.143,0.143,...,0,0,0,0,1,0,0,0,0,Mass
36151,,0,9547,64,M,PA,2500,2048,0.168,0.168,...,0,0,0,0,0,0,0,0,0,
25974,Mass,6,6827,45,F,PA,2500,2048,0.171,0.171,...,0,0,0,0,1,0,0,0,0,Mass


In [54]:
train_df['labels'] = train_df.apply(lambda x: x['finding_labels'].split('|'), axis=1)
valid_df['labels'] = valid_df.apply(lambda x: x['finding_labels'].split('|'), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
core_idg = ImageDataGenerator(rescale=1 / 255,
                                  samplewise_center=True,
                                  samplewise_std_normalization=True,
                                  horizontal_flip=True,
                                  vertical_flip=False,
                                  height_shift_range=0.05,
                                  width_shift_range=0.1,
                                  rotation_range=5,
                                  shear_range=0.1,
                                  fill_mode='reflect',
                                  zoom_range=0.15)

train_gen = core_idg.flow_from_dataframe(dataframe=train_df,
                                             directory=None,
                                             x_col='image_path',
                                             y_col='labels',
                                             class_mode='categorical',
                                             batch_size=batch_size,
                                             classes=diseases,
                                             target_size=(image_size, image_size))

valid_gen = core_idg.flow_from_dataframe(dataframe=valid_df,
                                             directory=None,
                                             x_col='image_path',
                                             y_col='labels',
                                             class_mode='categorical',
                                             batch_size=batch_size,
                                             classes=diseases,
                                             target_size=(image_size, image_size))

test_X, test_Y = next(core_idg.flow_from_dataframe(dataframe=valid_df,
                                                       directory=None,
                                                       x_col='image_path',
                                                       y_col='labels',
                                                       class_mode='categorical',
                                                       batch_size=1024,
                                                       classes=diseases,
                                                       target_size=(image_size, image_size)))

In [None]:
from tensorflow.keras.applications.densenet import DenseNet121
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.applications.xception import Xception
from tensorflow.keras.applications.nasnet import NASNetMobile
from tensorflow.keras.applications.inception_resnet_v2 import InceptionResNetV2

base_model = InceptionResNetV2(include_top=False, weights='imagenet', input_shape=(256, 256, 3))
x = base_model.output
x = tf.keras.layers.GlobalAveragePooling2D()(x)
output = tf.keras.layers.Dense(len(diseases), activation="sigmoid")(x)
model = tf.keras.Model(base_model.input, output)
model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
def get_callbacks(model_name):
    callbacks = []
    tensor_board = tf.keras.callbacks.TensorBoard(log_dir='logs', histogram_freq=0, profile_batch = 100000000)
    callbacks.append(tensor_board)
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        filepath=f'model.{model_name}.h5',
        verbose=1,
        save_best_only=True)
    # erly = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
    callbacks.append(checkpoint)
    # callbacks.append(erly)
    return callbacks

In [None]:
callbacks = get_callbacks('inceptionresnetv2')
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
with tf.device('/GPU:0'):
    model.fit(train_gen,
                  steps_per_epoch=100,
                  validation_data=(test_X, test_Y),
                  epochs=50,
                  callbacks=callbacks)

Found 41407 validated image filenames belonging to 14 classes.
Found 10352 validated image filenames belonging to 14 classes.
Found 10352 validated image filenames belonging to 14 classes.


In [56]:
from tensorflow.keras.applications.densenet import DenseNet121
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.applications.xception import Xception
from tensorflow.keras.applications.nasnet import NASNetMobile
from tensorflow.keras.applications.inception_resnet_v2 import InceptionResNetV2

base_model = InceptionResNetV2(include_top=False, weights='imagenet', input_shape=(256, 256, 3))
x = base_model.output
x = tf.keras.layers.GlobalAveragePooling2D()(x)
output = tf.keras.layers.Dense(len(diseases), activation="sigmoid")(x)
model = tf.keras.Model(base_model.input, output)
model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='binary_crossentropy', metrics=['accuracy'])

In [65]:
def get_callbacks(model_name):
    callbacks = []
    tensor_board = tf.keras.callbacks.TensorBoard(log_dir='logs', histogram_freq=0, profile_batch = 100000000)
    callbacks.append(tensor_board)
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        filepath=f'model.{model_name}.h5',
        verbose=1,
        save_best_only=True)
    # erly = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
    callbacks.append(checkpoint)
    # callbacks.append(erly)
    return callbacks

In [None]:
callbacks = get_callbacks('inceptionresnetv2')
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
with tf.device('/GPU:0'):
    model.fit(train_gen,
                  steps_per_epoch=100,
                  validation_data=(test_X, test_Y),
                  epochs=50,
                  callbacks=callbacks)

  ...
    to  
  ['...']
Train for 100 steps, validate on 1024 samples
Epoch 1/50
Epoch 00001: val_loss improved from inf to 1.48433, saving model to model.inceptionresnetv2.h5
Epoch 2/50
Epoch 00002: val_loss did not improve from 1.48433
Epoch 3/50
  8/100 [=>............................] - ETA: 1:54 - loss: 0.2849 - accuracy: 0.8934