In [1]:
import numpy as np
import pandas as pd
import pydicom
import pickle
import glob
import os

from awesome_progress_bar import ProgressBar

from skimage.transform import resize

from tensorflow.keras.models import load_model

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor 

In [2]:
my_dir = 'data'
data_dir = 'data'

images = glob.glob(f'{data_dir}/train/**/*.dcm')
target_size = (316, 316)

In [3]:
df = pd.read_csv(f'{my_dir}/train_base.csv', index_col=0)
embedder = load_model(f'{my_dir}/embedder.h5', compile=False)

def embed_patient(patient):
    imgs = []
    for img in glob.glob(f'{data_dir}/train/{patient}/*.dcm'):
        try:
            img = pydicom.dcmread(img).pixel_array
            img = resize(img, target_size)
            img = img[:, :, np.newaxis]
            imgs.append(img)
        except:
            pass
    imgs = np.array(imgs)
    
    meta = []
    for i in range(imgs.shape[0]):
        meta.append([
            i / imgs.shape[0],
            df.loc[patient, 'Sex'], 
            df.loc[patient, 'Age'], 
            df.loc[patient, 'SmokingStatus'],
            df.loc[patient, 'FVC_0'],
            df.loc[patient, 'Week'],
        ])
    meta = np.array(meta)
    
    pred = embedder.predict([imgs, meta])
    
    return np.hstack([
        np.min(pred, axis=0),
        np.max(pred, axis=0),
    ])

In [4]:
embeddings = []

patients = df.index.unique()
bar = ProgressBar(len(patients))
for patient in patients:
    bar.iter()
    embeddings.append(embed_patient(patient))
    
embeddings = pd.DataFrame(embeddings, index=patients)



In [18]:
new_df = pd.read_csv(f'{data_dir}/train.csv', index_col=0)
new_df = new_df.drop(['Percent', 'Age', 'Sex', 'SmokingStatus'], axis=1)
new_df = pd.merge(new_df, embeddings, left_index=True, right_index=True)
new_df.head(3)

Unnamed: 0_level_0,Weeks,FVC,0,1,2,3,4,5,6,7,...,38,39,40,41,42,43,44,45,46,47
Patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ID00007637202177411956430,-4,2315,0.807122,0.229183,-0.013054,0.136393,-0.291393,0.364893,0.590697,0.167666,...,0.006512,-0.149674,-0.314572,-0.161189,-0.219876,0.183064,0.774404,0.428256,0.11215,0.02126
ID00007637202177411956430,5,2214,0.807122,0.229183,-0.013054,0.136393,-0.291393,0.364893,0.590697,0.167666,...,0.006512,-0.149674,-0.314572,-0.161189,-0.219876,0.183064,0.774404,0.428256,0.11215,0.02126
ID00007637202177411956430,7,2061,0.807122,0.229183,-0.013054,0.136393,-0.291393,0.364893,0.590697,0.167666,...,0.006512,-0.149674,-0.314572,-0.161189,-0.219876,0.183064,0.774404,0.428256,0.11215,0.02126


In [19]:
X = new_df.loc[:, new_df.columns != 'FVC']
y = new_df.FVC

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)

In [20]:
reg = RandomForestRegressor(random_state=0)
reg.fit(X_train, y_train)
reg.score(X_test, y_test)

0.9050402226629061

In [21]:
with open('data/scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

columns = ['FVC', 'SmokingStatus', 'Sex', 'Age']

In [22]:
df_test = pd.read_csv('data/test.csv', index_col=0)
df_test.Weeks /= 133
df_test.Sex = LabelEncoder().fit(['Male', 'Female']).transform(df_test.Sex)
df_test.SmokingStatus = LabelEncoder() \
    .fit(['Never smoked', 'Ex-smoker', 'Currently smokes']) \
    .transform(df_test.SmokingStatus)
df_test[columns] = scaler.transform(df_test[columns])
df_test = df_test.drop('Percent', axis=1)
df_test.head(3)

Unnamed: 0_level_0,Weeks,FVC,Age,Sex,SmokingStatus
Patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ID00419637202311204720264,0.045113,0.320722,0.814493,0.517799,-0.43259
ID00421637202311550012437,0.112782,-0.021202,0.108169,0.517799,-0.43259
ID00422637202311677017371,0.045113,-1.005602,0.814493,0.517799,-0.43259


In [23]:
def embed_patient(patient):
    imgs = []
    for img in glob.glob(f'{data_dir}/test/{patient}/*.dcm'):
        try:
            img = pydicom.dcmread(img).pixel_array
            img = resize(img, target_size)
            img = img[:, :, np.newaxis]
            imgs.append(img)
        except:
            pass
    imgs = np.array(imgs)
    
    meta = []
    for i in range(imgs.shape[0]):
        meta.append([
            i / imgs.shape[0],
            df_test.loc[patient, 'Sex'], 
            df_test.loc[patient, 'Age'], 
            df_test.loc[patient, 'SmokingStatus'],
            df_test.loc[patient, 'FVC'],
            df_test.loc[patient, 'Weeks'],
        ])
    meta = np.array(meta)
    
    pred = embedder.predict([imgs, meta])
    
    return np.hstack([
        np.min(pred, axis=0),
        np.max(pred, axis=0),
    ])

In [24]:
embeddings_test = []

patients = os.listdir(f'{data_dir}/test')
bar = ProgressBar(len(patients))
for patient in patients:
    bar.iter()
    embeddings_test.append(embed_patient(patient))
    
embeddings_test = pd.DataFrame(embeddings_test, index=patients)
print(embeddings_test.shape)

(5, 48)


In [25]:
df_test = pd.read_csv('data/test.csv', index_col=0)
df_test = df_test.drop(['Percent', 'Age', 'Sex', 'SmokingStatus'], axis=1)
df_test = pd.merge(df_test, embeddings_test, left_index=True, right_index=True)
df_test.head(3)

Unnamed: 0_level_0,Weeks,FVC,0,1,2,3,4,5,6,7,...,38,39,40,41,42,43,44,45,46,47
Patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ID00419637202311204720264,6,3020,0.148022,-0.008863,0.234857,0.042761,-0.064553,0.346095,-0.036799,-0.17045,...,0.049484,-0.136113,-0.05908,0.070188,-0.146056,-0.146316,0.307385,0.29055,0.429764,0.106374
ID00421637202311550012437,15,2739,-0.016422,-0.104722,0.209996,0.078798,0.214119,0.271985,-0.152906,-0.060206,...,0.225317,-0.182565,-0.084939,0.292064,0.137222,-0.074507,0.106652,-0.066028,0.276556,0.04337
ID00422637202311677017371,6,1930,0.658853,0.164152,-0.091438,0.175979,0.016927,0.240025,0.516186,0.352649,...,0.254713,-0.168112,-0.356365,0.074708,0.109937,0.326134,0.577251,0.020133,-0.132922,-0.058751


In [26]:
data_test = {
    'Patient_Week': [],
    'FVC': [],
}

for week in np.arange(-12, 134):
    for patient in df_test.index.unique():
        data_test['Patient_Week'].append(f'{patient}_{week}')
        X = df_test.loc[patient, df_test.columns != 'FVC'].copy()
        X.Weeks = week / 133
        pred = reg.predict([X.values])[0]
        data_test['FVC'].append(pred)

In [27]:
data_test = pd.DataFrame(data_test)
data_test['Confidence'] = 100
data_test.head(3)

Unnamed: 0,Patient_Week,FVC,Confidence
0,ID00419637202311204720264_-12,3209.95,100
1,ID00421637202311550012437_-12,3145.09,100
2,ID00422637202311677017371_-12,2023.27,100


In [28]:
data_test.to_csv('data/submission.csv', index=False)