In [1]:
import pandas as pd
import numpy as np
import pydicom
import pickle
import glob
import os

from sklearn.preprocessing import LabelEncoder, PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

from skimage.transform import resize
from awesome_progress_bar import ProgressBar

In [2]:
df = pd.read_csv('data/train.csv', index_col='Patient')
df = df.drop('Percent', axis=1)
df.Sex = LabelEncoder().fit_transform(df.Sex)
df.SmokingStatus = LabelEncoder().fit(['Never smoked', 'Ex-smoker', 'Currently smokes']).transform(df.SmokingStatus)
df.head(3)

Unnamed: 0_level_0,Weeks,FVC,Age,Sex,SmokingStatus
Patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ID00007637202177411956430,-4,2315,79,1,1
ID00007637202177411956430,5,2214,79,1,1
ID00007637202177411956430,7,2061,79,1,1


So I need to create an output for the model to train the embedding model. And I've chosen FVCs values at weeks 6, 8, 10, 12 and 18, because this weeks are the most frequent. But not every patient had measured FVC at this weeks. So, for each user I've created `LinearRegression` on the FVC from a third-order polynomial of the week.

In [3]:
def create_row(i: str):
    tmp = df.loc[i]
    
    X = tmp.Weeks.values.reshape(-1, 1)
    y = tmp.FVC
    pipe = Pipeline([
        ('prep', PolynomialFeatures(3)),
        ('reg', LinearRegression())
    ]).fit(X, y)
    
    def get(n):
        return tmp[tmp.Weeks == n].FVC[0] if 0 in tmp.Weeks else pipe.predict([[n]])[0]
    
    weeks = df.loc[pat].Weeks
    w = weeks.abs().min()
    w = w if w in weeks.values else -w
    
    return w, [
        get(w),  get(6), get(8),
        get(10), get(12),  get(18),
    ]

These are the outputs. Also, I thought that besied the scans I can use meta data to train the model. And I've chosen to use position of the scan, `SmokingStatus`, `Sex`, `Age`, FVCs measurement of the closest week to 0, and the closest week to 0.

In [4]:
data = []
for pat in df.index.unique():
    tmp = df.loc[pat]
    row = [pat]
    w, fvcs = create_row(pat)
    row.extend(fvcs)
    row.extend([
        tmp.SmokingStatus[0],
        tmp.Sex[0],
        tmp.Age[0],
        w,
    ])
    data.append(row)

In [5]:
df = pd.DataFrame.from_dict(data)
df.columns = ['Patient', 'FVC_0', 'FVC_6', 'FVC_8', 'FVC_10', 'FVC_12', 
              'FVC_18', 'SmokingStatus', 'Sex', 'Age', 'Week']

Yes, there's no need to scale output values. But I don't want to embedding model to have big outputs. So, I hove that should help.

In [6]:
columns_0 = ['FVC_0', 'SmokingStatus', 'Sex', 'Age']
columns_1 = ['FVC_6', 'FVC_8', 'FVC_10', 'FVC_12', 'FVC_18']
scaler = StandardScaler().fit(df[columns_0])
df[columns_1] = (df[columns_1] - df.FVC_0.mean()) / df.FVC_0.std()
df[columns_0] = scaler.transform(df[columns_0])
df['Week'] /= 133
df.head(3)

Unnamed: 0,Patient,FVC_0,FVC_6,FVC_8,FVC_10,FVC_12,FVC_18,SmokingStatus,Sex,Age,Week
0,ID00007637202177411956430,-0.533965,-0.737345,-0.765451,-0.789803,-0.810632,-0.854286,-0.43259,0.517799,1.662082,-0.030075
1,ID00009637202177434476278,1.1505,1.138684,1.147208,1.151671,1.152282,1.133085,-0.43259,0.517799,0.249433,0.06015
2,ID00010637202177584971671,0.762048,0.532568,0.465364,0.402155,0.342767,0.185815,-0.43259,0.517799,-1.021951,0.0


In [7]:
df.to_csv('data/train_base.csv', index=False)

with open('data/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

In [13]:
images = glob.glob('data\\train\\**/*.dcm')
target_size = (316, 316)

Reading, resizing, adding new axis - this action takes a long time. And I deciced to create `.dat` file for each scan. That didn't seem to be a good idea for the first time, but actually, reading ready-made `.dat` tensors are much faster, when reading `.dcm` and resizing.

In [12]:
bar = ProgressBar(len(images), prefix='Data preparation')
for path in images:
    bar.iter()
    try:
        img = pydicom.dcmread(path).pixel_array
        img = resize(img, target_size)
        img = img[:, :, np.newaxis]
        fp = np.memmap(path.replace('.dcm', '.dat'), dtype='float32', mode='w+', shape=(*target_size, 1))
        fp[:] = img[:]
        del fp
    except:
        pass



Also, sometimes there are gaps in scans. For example, 5.dcm can follow after 2.dcm. And I'd like to get rid of it.

In [None]:
tmp = os.listdir('data/train')
bar = ProgressBar(len(tmp), 'Renaming progress')
for d in tmp:
    bar.iter()
    dats = glob.glob(f'data/train/{d}/*.dat')
    dats.sort(key=lambda d: int(d[:-4].split('\\')[-1]))
    i = 0
    for dat in dats:
        i += 1
        os.rename(dat, f'data/train/{d}/{i}.dat')