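Just another version of the FVC pipeline: a small CNN embeds each patient image, and a random forest regresses FVC from the pooled per-patient embeddings.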

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import glob
import os

from awesome_progress_bar import ProgressBar

from tensorflow.keras.layers import *
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.regularizers import l2

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

from utils import LoggerCallback, DataGenerator

In [2]:
# Per-patient table, indexed by the first CSV column. read() looks rows up by
# patient ID and uses the FVC_0, Week, FVC_12, FVC_50, SmokingStatus, Sex and
# Age columns.
df = pd.read_csv('data/train_base_3.csv', index_col=0)

def read(path):
    """Load one image file together with its metadata and regression targets.

    Parameters
    ----------
    path : str
        Path of the form ``<patient>/<number>.dat``: a raw float32 dump that
        is mapped as shape ``(316, 316, 1)``. The name of the parent directory
        is used as the patient key into the module-level ``df``.

    Returns
    -------
    fp : numpy.memmap
        Read-only memory map over the file, shape ``(316, 316, 1)``.
    meta : list
        ``[n, FVC_0, Week]`` where ``n`` is the file's number divided by the
        count of ``.dat`` files in the same directory.
    y : list
        ``[FVC_12, FVC_50, SmokingStatus, Sex, Age]`` for the patient.

    Raises
    ------
    KeyError
        If the patient (directory name) is not present in ``df``.
    """
    fp = np.memmap(path, dtype='float32', mode='r', shape=(316, 316, 1))

    directory, filename = os.path.split(path)

    # Relative position of this file among the patient's files.
    n = int(os.path.splitext(filename)[0])
    # glob.escape keeps directories containing glob metacharacters working.
    n /= len(glob.glob(os.path.join(glob.escape(directory), '*.dat')))

    # Use os.path instead of splitting on '\\' so the patient ID is extracted
    # correctly with either path separator, not only on Windows.
    patient = os.path.basename(os.path.normpath(directory))

    meta = [
        n,
        df.loc[patient, 'FVC_0'],
        df.loc[patient, 'Week'],
    ]

    targets = ['FVC_12', 'FVC_50', 'SmokingStatus', 'Sex', 'Age']
    y = [df.loc[patient, column] for column in targets]

    return fp, meta, y

In [3]:
# Height and width of the model's image input; create_embedder() appends the
# channel axis.
target_size = 316, 316

# DICOM files below data\train (Windows-style separators). glob is called
# without recursive=True, so '**' matches like a single '*'.
images = glob.glob('data\\train\\**/*.dcm')

In [5]:
def MySeparableConv2D(n_units):
    """Return a callable applying one separable-conv + max-pool stage.

    The returned function takes a Keras tensor, passes it through a
    ``SeparableConv2D`` with ``n_units`` filters, kernel size 3, ReLU
    activation and an L2 activity regularizer of 0.6, and then through a
    default ``MaxPool2D``. It returns the pooled tensor.
    """
    def apply_block(tensor):
        conv = SeparableConv2D(
            n_units,
            3,
            activation='relu',
            activity_regularizer=l2(0.6),
        )
        features = conv(tensor)
        return MaxPool2D()(features)

    return apply_block

def create_embedder():
    """Build the two-input embedding model.

    Inputs are an image of shape ``(*target_size, 1)`` and a 3-element
    metadata vector. The image goes through three ``MySeparableConv2D``
    stages (16, 32 and 32 filters) and global average pooling; the pooled
    features are concatenated with the metadata and projected by a linear
    ``Dense(8)`` layer, which is the model output.
    """
    image_in = Input([*target_size, 1])
    meta_in = Input([3])

    features = image_in
    for n_units in (16, 32, 32):
        features = MySeparableConv2D(n_units)(features)
    features = GlobalAvgPool2D()(features)

    merged = Concatenate()([features, meta_in])
    embedding = Dense(8)(merged)

    return Model([image_in, meta_in], embedding)

# Shared embedding network: extended with a regression head for training and
# later called directly via embedder.predict() in embed_patient().
embedder = create_embedder()

In [6]:
# Regression head stacked on the embedder: a ReLU Dense(8) followed by a
# linear Dense(5), one output per target returned by read().
layer = Dense(8, activation='relu')(embedder.output)
layer = Dense(5)(layer)
model = Model(inputs=embedder.input, outputs=layer)

# Pass compile() arguments by keyword: the third positional parameter is
# `metrics` in tf.keras 2.x but `loss_weights` in Keras 3, so the positional
# form silently changes meaning between versions.
model.compile(optimizer='adam', loss='mse', metrics=['mape'])

In [7]:
# Patient-level hold-out split: whole patients go to either train or test,
# so no patient contributes images to both sets.
test_size = 0.1

np.random.seed(42)

# os.listdir() returns entries in arbitrary order; sort first so the seeded
# shuffle yields the same split on every machine and run.
patients = sorted(os.listdir('data/train'))
np.random.shuffle(patients)

train_size = int(np.round(len(patients) * (1 - test_size)))
patients_train = patients[:train_size]
patients_test = patients[train_size:]


def _collect_dat_files(patient_ids):
    """Return the paths of every .dat file of the given patients, in order."""
    paths = []
    for patient_id in patient_ids:
        # os.path.join instead of hard-coded '\\' keeps this portable.
        paths.extend(glob.glob(os.path.join('data', 'train', patient_id, '*.dat')))
    return paths


dat_train = _collect_dat_files(patients_train)
dat_test = _collect_dat_files(patients_test)

print(f'Train dataset size: {len(dat_train)}')
print(f'Test dataset size: {len(dat_test)}')

Train dataset size: 30153
Test dataset size: 3475


In [8]:
%%time

# Wrap the file lists in generators that load each sample through read().
# NOTE(review): batching/shuffling is decided inside utils.DataGenerator,
# which is not visible here.
train = DataGenerator(dat_train, read)
test = DataGenerator(dat_test, read)
# Only let TensorFlow log messages at ERROR level and above.
tf.get_logger().setLevel("ERROR") 

# Keras' own progress output is disabled (verbose=0); progress is reported by
# LoggerCallback instead, which is given len(train) (presumably the number of
# batches per epoch; confirm in utils).
history = model.fit(
    train, 
    validation_data=test,
    epochs=15,
    callbacks=[LoggerCallback(len(train))],
    verbose=0,
)

Epoch 0:
Epoch 1:
Epoch 2:
Epoch 3:
Epoch 4:
Epoch 5:
Epoch 6:
Epoch 7:
Epoch 8:

KeyboardInterrupt: 

In [9]:
def embed_patient(directory):
    """Embed every image of one patient and pool the results.

    Parameters
    ----------
    directory : str
        Directory containing the patient's ``<number>.dat`` files.

    Returns
    -------
    numpy.ndarray
        1-D vector formed by concatenating the element-wise minimum, mean and
        maximum (over all files) of ``embedder.predict``'s output, i.e. three
        times the embedding width.

    Raises
    ------
    ValueError
        If ``directory`` contains no ``.dat`` files.
    """
    # os.path helpers instead of hard-coded '\\' so this works with either
    # path separator; glob.escape protects directories with glob characters.
    dats = glob.glob(os.path.join(glob.escape(directory), '*.dat'))
    if not dats:
        # Fail early with a clear message instead of an opaque error from the
        # model or from reducing an empty array.
        raise ValueError(f'No .dat files found in {directory!r}')

    # Process files in numeric order (1, 2, ..., 10), not lexicographic order.
    dats.sort(key=lambda d: int(os.path.splitext(os.path.basename(d))[0]))

    imgs = []
    meta = []
    for dat in dats:
        img, m, _ = read(dat)
        imgs.append(img)
        meta.append(m)

    imgs = np.array(imgs)
    meta = np.array(meta)

    embedding = embedder.predict([imgs, meta])

    return np.hstack([
        np.min(embedding, axis=0),
        np.mean(embedding, axis=0),
        np.max(embedding, axis=0),
    ])

In [10]:
# One pooled embedding vector per patient directory.
# Paths are built with os.path.join rather than hard-coded '\\' so this cell
# matches the 'data/train' listing used for the split and runs on any OS.
# Note: this rebinds `patients` to the unshuffled directory listing.
patients = os.listdir(os.path.join('data', 'train'))

patient_embeddings = []
bar = ProgressBar(len(patients))
for patient in patients:
    bar.iter()
    patient_embeddings.append(embed_patient(os.path.join('data', 'train', patient)))

# Rows are indexed by patient ID so they can be joined onto the clinical table.
embeddings = pd.DataFrame(patient_embeddings, index=patients)



In [11]:
# Clinical table indexed by its first column, minus the columns not used as
# features here, joined index-to-index with the per-patient embeddings.
new_df = (
    pd.read_csv('data/train.csv', index_col=0)
    .drop(columns=['Percent', 'Sex', 'Age', 'SmokingStatus'])
    .merge(embeddings, left_index=True, right_index=True)
)

In [12]:
# Reuse the patient-level split made before training the embedder, so the
# regressor is evaluated on patients the embedder was not fitted on.
df_train = new_df.loc[patients_train, :]
df_test = new_df.loc[patients_test, :]

In [13]:
from sklearn.preprocessing import StandardScaler

# Features are every column except the FVC target.
X_train = df_train.drop(columns='FVC')
X_test = df_test.drop(columns='FVC')
y_train = df_train.FVC
y_test = df_test.FVC

# The merged frame mixes string column names from the CSV with the integer
# column names of the embeddings. scikit-learn >= 1.2 raises TypeError on
# such mixed-type feature names, so pass plain NumPy arrays instead.
# The scaler is fitted on the training rows only and reused for the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train.to_numpy())
X_test = scaler.transform(X_test.to_numpy())

In [15]:
# Fit the forest on the scaled training features (fit() returns the estimator).
reg = RandomForestRegressor(random_state=0).fit(X_train, y_train)

# Score on the held-out patients; for a regressor this is R^2.
reg.score(X_test, y_test)

0.8886053828052042