In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import glob
import os

from awesome_progress_bar import ProgressBar

from tensorflow.keras.layers import *
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.regularizers import l2

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

from utils import LoggerCallback, DataGenerator

`LoggerCallback` and `DataGenerator` are placed in a separate module (`utils`) to improve code readability.

As mentioned, the metadata vector is: the position of the scan, `Sex`, `Age`, `SmokingStatus`, the FVC measurement from the week closest to 0, and that week itself. The output is the FVC measurements at weeks 6, 8, 10, 12 and 18.

In [2]:
# Lookup table indexed by its first CSV column; `read` addresses its rows with
# the patient directory name and pulls Sex, Age, SmokingStatus, FVC_0, Week
# and the FVC_6/8/10/12/18 target columns from it.
df = pd.read_csv('data/train_base.csv', index_col=0)

def read(path):
    """Load one preprocessed scan slice with its metadata and targets.

    Parameters
    ----------
    path : str
        Path of the form ``<patient directory>/<number>.dat``. The file must
        hold a raw float32 array of shape (316, 316, 1).

    Returns
    -------
    fp : np.memmap
        Read-only memory map of the slice, shape (316, 316, 1).
    meta : list
        ``[position, Sex, Age, SmokingStatus, FVC_0, Week]``. ``position`` is
        the file number divided by the number of ``.dat`` files in the same
        directory; the other values are read from the global ``df`` row of
        the patient.
    y : list
        The patient's ``FVC_6``, ``FVC_8``, ``FVC_10``, ``FVC_12`` and
        ``FVC_18`` values from ``df``.
    """
    fp = np.memmap(path, dtype='float32', mode='r', shape=(316, 316, 1))

    directory, filename = os.path.split(path)
    n = int(os.path.splitext(filename)[0])
    # os.path.join keeps the pattern relative when `directory` is empty, and
    # glob.escape protects directory names containing glob metacharacters.
    n /= len(glob.glob(os.path.join(glob.escape(directory), '*.dat')))

    # The patient ID is the last directory component. Accept both separators
    # so paths produced on Windows and on POSIX systems resolve the same way.
    patient = directory.replace('\\', '/').rstrip('/').split('/')[-1]

    meta = [
        n,
        df.loc[patient, 'Sex'],
        df.loc[patient, 'Age'],
        df.loc[patient, 'SmokingStatus'],
        df.loc[patient, 'FVC_0'],
        df.loc[patient, 'Week'],
    ]

    y = [df.loc[patient, f'FVC_{week}'] for week in [6, 8, 10, 12, 18]]

    return fp, meta, y

In [3]:
# .dcm files one directory level below data/train (without recursive=True,
# '**' behaves like '*'). os.path.join builds the pattern with the platform's
# separator, so it also matches on systems where '\\' is not a path separator.
images = glob.glob(os.path.join('data', 'train', '**', '*.dcm'))
# Height and width of the image input declared in create_embedder.
target_size = (316, 316)

This is the embedder. It may look like it simply doesn't have enough parameters, but the R² score on the test dataset is actually pretty high.

In [4]:
def MySeparableConv2D(n_units):
    """Return a callable applying one separable-conv + max-pool stage.

    The returned function takes a tensor, runs it through a fresh
    ``SeparableConv2D`` with ``n_units`` filters, kernel size 3, ReLU
    activation and an L2 activity regularizer of 0.6, and then through a
    default ``MaxPool2D``. New layers are created on every call.
    """
    def apply_stage(tensor):
        conv = SeparableConv2D(
            n_units, 3,
            activation='relu',
            activity_regularizer=l2(0.6),
        )
        convolved = conv(tensor)
        pool = MaxPool2D()
        return pool(convolved)

    return apply_stage

def create_embedder():
    """Build the two-input embedding model.

    Inputs are an image of shape ``(*target_size, 1)`` and a 6-element
    metadata vector. The image passes through four ``MySeparableConv2D``
    stages (16, 16, 32, 32 units) and global average pooling; the pooled
    features are concatenated with the metadata and projected by a
    ``Dense(24)`` layer, whose output is the embedding.
    """
    image_in = Input([*target_size, 1])
    meta_in = Input([6])

    features = image_in
    for n_units in (16, 16, 32, 32):
        features = MySeparableConv2D(n_units)(features)
    pooled = GlobalAvgPool2D()(features)

    merged = Concatenate()([pooled, meta_in])
    embedding = Dense(24)(merged)

    return Model([image_in, meta_in], embedding)

# Single shared embedder instance: it is wrapped by `model` for training,
# called by `embed_patient` for inference, and saved at the end of the notebook.
embedder = create_embedder()

In [5]:
# Regression head on top of the embedding: 5 outputs, matching the 5 target
# values (FVC at weeks 6, 8, 10, 12, 18) returned by `read`.
layer = Dense(5)(embedder.output)
model = Model(embedder.input, layer)
# Pass compile() arguments by keyword: the position of `metrics` differs
# between Keras versions (Keras 3 puts `loss_weights` third), so a positional
# ['mape'] could be bound to the wrong parameter.
model.compile(optimizer='adam', loss='mse', metrics=['mape'])

In [6]:
test_size = 0.1

# glob returns matches in arbitrary order, so sort them: the positional split
# below is then reproducible and the slices of one patient directory stay
# contiguous. os.path.join keeps the pattern portable across platforms.
dat_images = sorted(glob.glob(os.path.join('data', 'train', '**', '*.dat')))

# First (1 - test_size) of the sorted slices train the embedder, the rest validate it.
# NOTE: the split is per slice, so the patient at the boundary can end up on both sides.
train_size = int(np.round(len(dat_images) * (1 - test_size)))
dat_train = dat_images[:train_size]
dat_test = dat_images[train_size:]

print(f'Train dataset size: {len(dat_train)}')
print(f'Test dataset size: {len(dat_test)}')

Train dataset size: 30265
Test dataset size: 3363


Here comes the training process

In [7]:
%%time

# Each generator receives a list of .dat paths and the `read` loader defined above.
# NOTE(review): batching and shuffling are decided inside utils.DataGenerator — not visible here.
train = DataGenerator(dat_train, read)
test = DataGenerator(dat_test, read)
# Restrict TensorFlow's Python logger to ERROR-level messages.
tf.get_logger().setLevel("ERROR") 

# Fit the embedder + 5-output head for 5 epochs, validating on the held-out slices.
# verbose=0 turns off Keras' built-in progress output; LoggerCallback is the only
# callback attached (presumably it reports progress using len(train) — confirm in utils).
history = model.fit(
    train, 
    validation_data=test,
    epochs=5,
    callbacks=[LoggerCallback(len(train))],
    verbose=0,
)

Epoch 0:
Epoch 1:
Epoch 2:
Epoch 3:
Epoch 4:
Wall time: 11min 51s


In [8]:
def embed_patient(directory):
    """Summarise all slices of one patient as a single feature vector.

    Parameters
    ----------
    directory : str
        Patient directory containing ``<number>.dat`` slice files.

    Returns
    -------
    np.ndarray
        The element-wise minimum over all slice embeddings followed by the
        element-wise maximum, concatenated into one 1-D array.

    Raises
    ------
    ValueError
        If the directory contains no ``.dat`` files.
    """
    # Build the pattern with os.path.join so it works with either path separator.
    dats = glob.glob(os.path.join(glob.escape(directory), '*.dat'))
    if not dats:
        raise ValueError(f'no .dat slices found in {directory!r}')
    # Order slices numerically by file name (so 2.dat comes before 10.dat).
    dats.sort(key=lambda d: int(os.path.splitext(os.path.basename(d))[0]))

    imgs = []
    meta = []
    for dat in dats:
        img, m, _ = read(dat)
        imgs.append(img)
        meta.append(m)

    imgs = np.array(imgs)
    meta = np.array(meta)

    # One embedding row per slice.
    embedding = embedder.predict([imgs, meta])

    return np.hstack([
        np.min(embedding, axis=0),
        np.max(embedding, axis=0),
    ])

In [9]:
embeddings = []

# One sub-directory per patient under data/train. Paths are built with
# os.path.join so the cell does not depend on '\\' being the path separator.
patients = os.listdir(os.path.join('data', 'train'))
bar = ProgressBar(len(patients))
for patient in patients:
    bar.iter()
    embeddings.append(embed_patient(os.path.join('data', 'train', patient)))

# One row per patient, indexed by directory name; columns are the integer
# positions of the min/max embedding vector returned by embed_patient.
embeddings = pd.DataFrame(embeddings, index=patients)



In [10]:
# Load train.csv indexed by its first column and discard the columns that are
# not used as regressor features here.
new_df = (
    pd.read_csv('data/train.csv', index_col=0)
    .drop(columns=['Percent', 'Sex', 'Age', 'SmokingStatus'])
)

In [11]:
# Attach each patient's embedding to every one of their rows (index-on-index join).
# The embedding columns are labelled with integers; rename them to strings so the
# merged frame has uniformly string column names — recent scikit-learn versions
# refuse to fit on a DataFrame whose column names mix str and int.
new_df = pd.merge(new_df, embeddings.rename(columns=str), left_index=True, right_index=True)

In [12]:
# Target is the FVC column; every other column is a feature.
y = new_df['FVC']
X = new_df.drop(columns='FVC')

# shuffle=False keeps row order: the trailing `test_size` fraction of rows is the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    shuffle=False,
)

In [13]:
# Fit a random forest with a fixed seed (fit() returns the estimator itself),
# then display its score on the held-out rows as the cell output.
reg = RandomForestRegressor(random_state=0).fit(X_train, y_train)
reg.score(X_test, y_test)

0.9065728402301392

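And that's it. The score is pretty high. The embedder hasn't seen the scans of the test dataset, and the final regressor hasn't seen them either (assuming the image-level split used for the embedder and the row-level split used for the regressor hold out the same patients). So, I guess that's a really good result.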

In [23]:
# Persist only the embedder (the 5-output regression head in `model` is not saved).
# The .h5 extension makes Keras write the legacy HDF5 format.
embedder.save('data/embedder.h5')