This workbook proposes a solution to the Kaggle pneumonia detection challenge posted by RSNA. It draws on Zahaviguy's 'What are lung opacities' kernel (https://www.kaggle.com/zahaviguy/what-are-lung-opacities).

In [1]:
import glob, pandas as pd
import matplotlib.pyplot as plt
import pydicom, numpy as np
from sklearn.datasets import load_files 
from keras.preprocessing.image import ImageDataGenerator
from keras.layers import GlobalAveragePooling2D, Dense, Dropout, Flatten
from keras.optimizers import Adam, SGD, RMSprop
from keras.models import Model, Input, Sequential
from keras.callbacks import ModelCheckpoint  
# Set paths

# Image directories, one per data split (directory names suggest PNG exports
# of the original scans -- TODO confirm).
# NOTE(review): only S1_TRAIN_IMGS is referenced in the visible cells.
S1_TRAIN_IMGS = "./img/train_png/"
S1_VALID_IMGS = "./img/valid_png/"
S1_TEST_IMGS = "./img/test_png/"
# CSV with the training label rows; it is read into `df` further down.
S1_LABELS = "./img/labels/stage_1_train_labels.csv"
# CSV with the detailed class per patient; read with its first column as index.
S1_CLASS_INFO = "./stage_1_detailed_class_info.csv"

# Helper to parse CSV file

def parse_csv(df):
    """Group the label rows of ``df`` into one entry per patient.

    Parameters
    ----------
    df : pandas.DataFrame
        Label table providing the columns 'patientId', 'Target', 'x', 'y',
        'width' and 'height'.

    Returns
    -------
    dict
        Maps each patientId to a dict with three keys:
        'dicom' (S1_TRAIN_IMGS joined with '<patientId>.dcm.png'),
        'label' (the 'Target' value of the first row seen for that patient),
        'boxes' (a list of [y, x, height, width] lists, filled only while the
        stored label equals 1).
    """
    by_patient = {}

    for _, record in df.iterrows():
        patient_id = record['patientId']
        entry = by_patient.get(patient_id)

        # The first row seen for a patient fixes its image path and label.
        if entry is None:
            entry = {
                'dicom': S1_TRAIN_IMGS + '{0}.dcm.png'.format(patient_id),
                'label': record['Target'],
                'boxes': []
            }
            by_patient[patient_id] = entry

        # Every row of a positive patient contributes one bounding box.
        if entry['label'] == 1:
            entry['boxes'].append(
                [record['y'], record['x'], record['height'], record['width']]
            )

    return by_patient


            

Using TensorFlow backend.


In [2]:
# Print the devices TensorFlow reports on this machine, to check whether a
# GPU is listed next to the CPU.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 17709702840872271409
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 10670292992
locality {
  bus_id: 1
  links {
  }
}
incarnation: 2985982445329014845
physical_device_desc: "device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1"
]


In [3]:
# Training label rows, one DataFrame row per CSV row.
df = pd.read_csv(S1_LABELS)

# Detailed class information, indexed by the first column of the CSV.
patient_class = pd.read_csv(S1_CLASS_INFO, index_col=0)

# Collapse the label rows into one dict entry per patientId.
parsed = parse_csv(df)
# Show which columns the label file provides.
print(df.columns.tolist())

['patientId', 'x', 'y', 'width', 'height', 'Target']


In [4]:
# Explore some data using our parser

# Take the patient of the first label row and show both its parsed entry
# (image path, label, boxes) and its row in the class-info table.
patient_0 = df['patientId'][0]
print(parsed[patient_0])
print(patient_class.loc[patient_0])

{'dicom': './img/train_png/0004cfab-14fd-4e49-80ba-63a80b6bddd6.dcm.png', 'label': 0, 'boxes': []}
class    No Lung Opacity / Not Normal
Name: 0004cfab-14fd-4e49-80ba-63a80b6bddd6, dtype: object


In [5]:
def draw(data):
    """
    Draw single patient with bounding boxes

    Parameters
    ----------
    data : dict
        Patient entry with a 'dicom' key (path of the file to read) and a
        'boxes' key (iterable of [y, x, height, width] boxes, the order that
        overlay_box unpacks).

    The image is shown with matplotlib with the axes hidden; nothing is
    returned.
    """
    # dcmread is pydicom's reader entry point; read_file is only the legacy
    # alias for it.
    # NOTE(review): parse_csv in this notebook builds paths ending in
    # '.dcm.png', while dcmread expects a DICOM file -- confirm what callers
    # pass in data['dicom'].
    di = pydicom.dcmread(data['dicom'])
    img = di.pixel_array

    # Repeat the pixel array along a new last axis so coloured boxes can be
    # painted on it (assumes pixel_array is a 2-D greyscale image -- TODO
    # confirm).
    img = np.stack([img] * 3, axis=2)

    # add boxes with random colour if present
    for box in data['boxes']:
        rgb = np.floor(np.random.rand(3) * 256).astype('int')
        img = overlay_box(img=img, box=box, rgb=rgb, stroke=6)

    # NOTE(review): matplotlib documents cmap as ignored for RGB(A) input, so
    # the colormap presumably has no effect on this 3-channel image.
    plt.imshow(img, cmap=plt.cm.gist_gray)
    plt.axis('off')
    
def overlay_box(img, box, rgb, stroke=1):
    """Paint the outline of ``box`` onto ``img`` in place and return ``img``.

    Parameters
    ----------
    img : array indexed as [row, column, ...]
        Image to paint on; it is modified in place.
    box : sequence of four numbers
        [y, x, height, width]; each value is truncated with ``int``.
    rgb : value assignable to an image slice
        Colour written into every outline pixel.
    stroke : int
        Line thickness in pixels.

    The top and left edges start at (y, x); the bottom and right edges start
    at y + height and x + width and extend ``stroke`` pixels beyond them.
    """
    top, left, box_h, box_w = (int(value) for value in box)
    bottom = top + box_h
    right = left + box_w

    # (row slice, column slice) for the top, bottom, left and right edges,
    # painted in that order.
    edges = (
        (slice(top, top + stroke), slice(left, right)),
        (slice(bottom, bottom + stroke), slice(left, right)),
        (slice(top, bottom), slice(left, left + stroke)),
        (slice(top, bottom), slice(right, right + stroke)),
    )
    for rows, cols in edges:
        img[rows, cols] = rgb

    return img

In [6]:
# Look up the detailed class of the patient in the fourth label row.
patient_test = df['patientId'][3]
print(patient_class.loc[patient_test])


class    Normal
Name: 003d8fa0-6bf1-40ed-b54c-ac657f8495c5, dtype: object


In [7]:
from keras.applications.resnet50 import ResNet50, preprocess_input, decode_predictions

In [8]:
from keras.preprocessing import image
from tqdm import tqdm

def path_to_tensor(img_path):
    """Load one image file as a single-sample batch for a Keras CNN.

    The file at ``img_path`` is read with ``image.load_img`` resized to
    224x224, converted to an array, and given a new leading axis of length
    one, so the results for several images can be stacked along that axis.
    """
    loaded = image.load_img(img_path, target_size=(224, 224))
    pixels = image.img_to_array(loaded)
    # Prepend the sample axis.
    return pixels[np.newaxis, ...]

def paths_to_tensor(df):
    """Load the image of every row in ``df`` and stack them into one array.

    For each value of ``df['patientId']`` the image path stored in the
    module-level ``parsed`` dict is loaded with ``path_to_tensor``; a tqdm
    progress bar tracks the loop. The single-sample arrays are then stacked
    along their first axis.
    """
    batches = []
    for patient_id in tqdm(df['patientId']):
        batches.append(path_to_tensor(parsed[patient_id]['dicom']))
    return np.vstack(batches)

def target_to_tensor(df):
    """Collect the label of every row in ``df`` into a stacked array.

    For each value of ``df['patientId']`` the 'label' stored in the
    module-level ``parsed`` dict is looked up; the labels are stacked with
    ``np.vstack``, giving one row per row of ``df``.
    """
    labels = []
    for patient_id in df['patientId']:
        labels.append(parsed[patient_id]['label'])
    return np.vstack(labels)

In [9]:
from PIL import ImageFile
# Let PIL load image files that are truncated instead of raising on them.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# The first 24848 label rows form the training set, the rest the validation set.
# NOTE(review): the split is by CSV row, not by patient. parse_csv allows
# several rows per patient, so such a patient's image is loaded once per row
# and could land on both sides of the split -- confirm this is intended.
train_tensors = paths_to_tensor(df[:24848])
valid_tensors = paths_to_tensor(df[24848:])

100%|██████████| 24848/24848 [02:29<00:00, 166.35it/s]
100%|██████████| 4141/4141 [00:24<00:00, 167.11it/s]


In [10]:
# Labels for the same row ranges that were used to build the image tensors.
train_targets = target_to_tensor(df[:24848])
valid_targets = target_to_tensor(df[24848:])

In [11]:
# ImageNet-pretrained ResNet50 without its classification top, with the input
# shape taken from the first loaded training image.
Resnet50_model = ResNet50(weights='imagenet', include_top=False, input_shape=train_tensors[0].shape)
# Freeze every ResNet50 layer so that training leaves its weights unchanged.
for layer in Resnet50_model.layers:
    layer.trainable = False



In [17]:
# Augmenting generator for training images: random rotations, shifts, shear,
# zoom and horizontal flips, filling new pixels with the nearest value.
# NOTE(review): the fit call in the visible cells trains on the tensors
# directly, so neither generator below appears to be used -- confirm.
train_datagen = ImageDataGenerator(rotation_range=45,
                                   width_shift_range=0.2,
                                   height_shift_range=0.2,
                                   shear_range=0.2,
                                   zoom_range=0.25,
                                   horizontal_flip=True,
                                   fill_mode='nearest')

# Generator without any augmentation options.
test_datagen = ImageDataGenerator()

In [26]:
# Classification head on top of the ResNet50 output: flatten, then two
# dense/dropout stages (128 and 64 units), then a 2-way softmax.
x = Resnet50_model.output
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.5)(x)
predictions = Dense(2, activation='softmax')(x)

# Training hyper-parameters; batch_size is passed to fit in a later cell.
batch_size = 32
learning_rate = 1e-4
optimizer = Adam(lr=learning_rate)

# Full model from the ResNet50 input to the new softmax output.
r50_transfer = Model(inputs=Resnet50_model.input, outputs=predictions)
r50_transfer.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [27]:
from keras.utils import to_categorical
# One-hot encode the integer labels for the softmax output and the
# categorical cross-entropy loss.
y_train = to_categorical(train_targets)
y_valid = to_categorical(valid_targets)

In [28]:
# Save the model to disk only when it improves on the best result so far
# (the training log in this notebook reports improvements of val_loss).
# NOTE(review): the file name still says v0.4 -- confirm it matches the model
# version being trained.
checkpointer = ModelCheckpoint(filepath='weights_best_Resnet50-structure-v0.4.hdf5',
                              verbose=1, save_best_only=True)


In [None]:
# Train for 10 epochs on the in-memory tensors, validating on the held-out
# rows and checkpointing through `checkpointer`.
# NOTE(review): preprocess_input is imported from keras.applications.resnet50
# but is not applied to the tensors in the visible cells, and train_datagen is
# not used here -- confirm both are intended.
r50_transfer.fit(train_tensors, y_train, epochs=10,
                batch_size=batch_size, callbacks=[checkpointer],
                validation_data=(valid_tensors, y_valid), shuffle=True)

Train on 24848 samples, validate on 4141 samples
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.37466, saving model to weights_best_Resnet50-structure-v0.4.hdf5
Epoch 2/10

Epoch 00002: val_loss did not improve from 0.37466
Epoch 3/10

## Model version 0.2 - Observations & Next steps
>x = Resnet50_model.output
>x = Flatten()(x)
>x = Dense(128, activation='relu')(x)
>x = Dropout(0.5)(x)
>x = Dense(128, activation='relu')(x)
>x = Dropout(0.5)(x)
>predictions = Dense(2, activation='softmax')(x)

>r50_transfer = Model(inputs=Resnet50_model.input, outputs=predictions)
>r50_transfer.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

>batch_size = 32

**Observations**:
A _validation accuracy_ of 0.8190 was achieved by epoch 3 with a _loss_ of 2.89894. Validation accuracy did not improve again until epoch 8, which achieved a _validation accuracy_ of 0.8239 with a _loss_ of 2.83392.

**Next steps**:
Increase training sample size and reduce validation sample size, using more of the data for training. Previous training sample size was 22000. I have increased this to: 24848.

Old training sample size: 22000
Old validation sample size: 6989

New training sample size: 24848
New validation sample size: 4141


## Model version 0.3 - Observations & Next steps
>x = Resnet50_model.output
>x = Flatten()(x)
>x = Dense(128, activation='relu')(x)
>x = Dropout(0.5)(x)
>x = Dense(128, activation='relu')(x)
>x = Dropout(0.5)(x)
>predictions = Dense(2, activation='softmax')(x)

>r50_transfer = Model(inputs=Resnet50_model.input, outputs=predictions)
>r50_transfer.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

>batch_size = 20

**Changes made**:
I have made the changes to the sample sizes as per findings in version 0.2. I have also decreased the batch size from 32 to 20, based on the paper "On Large-Batch Training for Deep Learning" which found that larger batches degraded the quality of the model.

**Observations**:
These changes resulted in the model converging very quickly, to an accuracy of 0.8013 with a validation loss of 3.20338. The model then oscillated and results did not improve for the remaining 9 epochs.

**Next steps**:
Return batch size to 32 to determine the cause of the effect (changes to training sample sizes or batch size). 


## Model version 0.4 - Observations & Next steps
>x = Resnet50_model.output
>x = Flatten()(x)
>x = Dense(128, activation='relu')(x)
>x = Dropout(0.5)(x)
>x = Dense(128, activation='relu')(x)
>x = Dropout(0.5)(x)
>predictions = Dense(2, activation='softmax')(x)

>r50_transfer = Model(inputs=Resnet50_model.input, outputs=predictions)
>r50_transfer.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

>batch_size = 32

**Changes made**:
Returned batch size to 32 after poor results from 0.3. It seemed unreasonable to assume that providing the model with more training data would have the effects noted in 0.3.

**Observations**:
Similar observations made to 0.2.

**Next steps**:
Reduce the learning rate of the Adam optimizer from its default of 0.001 to 1e-4. Perhaps this will help the model identify more nuanced features associated with lung opacities.

## Model version 0.5 - Observations & Next steps
>x = Resnet50_model.output
>x = Flatten()(x)
>x = Dense(128, activation='relu')(x)
>x = Dropout(0.5)(x)
>x = Dense(128, activation='relu')(x)
>x = Dropout(0.5)(x)
>predictions = Dense(2, activation='softmax')(x)

>r50_transfer = Model(inputs=Resnet50_model.input, outputs=predictions)
>r50_transfer.compile(loss='categorical_crossentropy', optimizer=Adam(lr=1e-4), metrics=['accuracy'])

>batch_size = 32

**Changes made**:
Reduce learning rate of adam optimizer from default of 0.001 to 1e-4. 

**Observations**:
Significant reduction in loss, from a best of 2.898 in v0.2 down to 0.50808.
Accuracy also increased to 0.83 (previous high of 0.82 in v0.2). However, there is also evidence of overfitting from around the 4th epoch onwards.

**Next steps**:
Reduce network size.

## Model version 0.6 - Observations & Next steps
>x = Resnet50_model.output
>x = Flatten()(x)
>x = Dense(128, activation='relu')(x)
>x = Dropout(0.5)(x)
>x = Dense(64, activation='relu')(x)
>x = Dropout(0.5)(x)
>predictions = Dense(2, activation='softmax')(x)

>r50_transfer = Model(inputs=Resnet50_model.input, outputs=predictions)
>r50_transfer.compile(loss='categorical_crossentropy', optimizer=Adam(lr=1e-4), metrics=['accuracy'])

>batch_size = 32

**Changes made**:
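Reduced the network size, as per the next steps of the previous version: the second Dense layer now has 64 units instead of 128. The learning rate of the Adam optimizer remains at 1e-4.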

**Observations**:
Significant reduction in loss from a best of 2.898 in v0.2 down to 0.50808. 
Accuracy also increased to 0.83 (previous high of 0.82 in v0.2). However, there is also evidence of overfitting around the 4th epoch onwards.

**Next steps**:
Reduce network size.