In [1]:
import numpy as np
import csv, gzip, os, sys, gc
import math
import torch
from torch import nn
import torch.optim as optim
from torch.nn import functional as F

import logging
import datetime
import optparse
import pandas as pd
import os
from sklearn.metrics import log_loss
import ast
from torch.utils.data import Dataset
from sklearn.metrics import log_loss
from torch.utils.data import DataLoader
from scipy.ndimage import uniform_filter
from torch.optim.lr_scheduler import StepLR

from apex.parallel import DistributedDataParallel as DDP
from apex.fp16_utils import *
from apex import amp, optimizers
from apex.multi_tensor_apply import multi_tensor_applier

# Option definitions for this run. All values (including numeric ones) are
# declared with string defaults; they are converted to int/float/bool where
# they are consumed further down.
parser = optparse.OptionParser()
parser.add_option('-s', '--seed', action="store", dest="seed", help="model seed", default="1234")
parser.add_option('-o', '--fold', action="store", dest="fold", help="Fold for split", default="0")
parser.add_option('-p', '--nbags', action="store", dest="nbags", help="Number of bags for averaging", default="4")
parser.add_option('-e', '--epochs', action="store", dest="epochs", help="epochs", default="10")
parser.add_option('-b', '--batchsize', action="store", dest="batchsize", help="batch size", default="4")
parser.add_option('-r', '--rootpath', action="store", dest="rootpath", help="root directory", default="")
# Help text previously read "root directory" (copy-paste from --rootpath).
parser.add_option('-i', '--imgpath', action="store", dest="imgpath", help="Image directory", default="data/mount/512X512X6/")
parser.add_option('-w', '--workpath', action="store", dest="workpath", help="Working path", default="data/resnext101v12fold1/")
parser.add_option('-f', '--weightsname', action="store", dest="weightsname", help="Weights file name", default="pytorch_model.bin")
parser.add_option('-l', '--lr', action="store", dest="lr", help="learning rate", default="0.00005")
# Help text previously read "root directory"; the value is handed to get_logger.
parser.add_option('-g', '--logmsg', action="store", dest="logmsg", help="Logger name", default="Recursion-pytorch")
parser.add_option('-c', '--size', action="store", dest="size", help="model size", default="512")
# Help text previously read "root directory"; the value ends up in the 'ep{}'
# part of the saved loader file names.
parser.add_option('-a', '--globalepoch', action="store", dest="globalepoch", help="Global epoch used in saved file names", default="3")
parser.add_option('-n', '--loadcsv', action="store", dest="loadcsv", help="Convert csv embeddings to numpy", default="F")
parser.add_option('-j', '--lstm_units', action="store", dest="lstm_units", help="Lstm units", default="128")
parser.add_option('-d', '--dropout', action="store", dest="dropout", help="LSTM input spatial dropout", default="0.3")
parser.add_option('-z', '--decay', action="store", dest="decay", help="Weight Decay", default="0.0")
parser.add_option('-m', '--lrgamma', action="store", dest="lrgamma", help="Scheduler Learning Rate Gamma", default="1.0")
parser.add_option('-k', '--ttahflip', action="store", dest="ttahflip", help="Bag with horizontal flip on and off", default="F")
# Help text previously duplicated the horizontal-flip description.
parser.add_option('-q', '--ttatranspose', action="store", dest="ttatranspose", help="Bag with transpose on and off", default="F")
parser.add_option('-x', '--datapath', action="store", dest="datapath", help="Data path", default="data")

<Option at 0x7f0fc0c4bf50: -x/--datapath>

In [2]:
# Parse a fixed argument list instead of the real command line (this runs in
# a notebook); only --datapath is changed from the parser defaults.
options, args = parser.parse_args(['--datapath', 'data/resnext101v12fold1'])

# Extend the import search path before pulling in the project helpers:
# the configured root is appended, the local 'scripts' folder goes first.
package_dir = options.rootpath
sys.path.append(package_dir)
sys.path.insert(0, 'scripts')
from logs import get_logger
from utils import dumpobj, loadobj, GradualWarmupScheduler

# Create the logger from the --logmsg option and report the CUDA environment.
logger = get_logger(options.logmsg, 'INFO') # noqa
setup_time = datetime.datetime.now().time()
logger.info('Cuda set up : time {}'.format(setup_time))

# NOTE(review): the device is hard-wired to CUDA and device 0 is queried
# before availability is logged, so this cell needs a visible GPU.
device = torch.device('cuda')
gpu_name = torch.cuda.get_device_name(0)
logger.info('Device : {}'.format(gpu_name))
cuda_ok = torch.cuda.is_available()
logger.info('Cuda available : {}'.format(cuda_ok))
n_gpu = torch.cuda.device_count()
logger.info('Cuda n_gpus : {}'.format(n_gpu))

2020-02-07 01:02:30,469 - Recursion-pytorch - INFO - Cuda set up : time 01:02:30.469087
2020-02-07 01:02:30,634 - Recursion-pytorch - INFO - Device : TITAN RTX
2020-02-07 01:02:30,634 - Recursion-pytorch - INFO - Cuda available : True
2020-02-07 01:02:30,635 - Recursion-pytorch - INFO - Cuda n_gpus : 7


In [3]:
logger.info('Load params : time {}'.format(datetime.datetime.now().time()))
# Echo every parsed option as "<name padded to 20 characters><value>".
for opt_name, opt_value in vars(options).items():
    logger.info('{}{}'.format(opt_name.ljust(20), opt_value))

# --- numeric settings (options arrive as strings) ---
SEED = int(options.seed)
SIZE = int(options.size)
EPOCHS = int(options.epochs)
n_epochs = EPOCHS
GLOBALEPOCH = int(options.globalepoch)
fold = int(options.fold)
nbags = int(options.nbags)
batch_size = int(options.batchsize)
LSTM_UNITS = int(options.lstm_units)
lr = float(options.lr)
lrgamma = float(options.lrgamma)
DECAY = float(options.decay)
DROPOUT = float(options.dropout)

# --- paths, all resolved relative to the root option ---
ROOT = options.rootpath
path_data = os.path.join(ROOT, options.datapath)
# path_img = os.path.join(ROOT, options.imgpath)
# WORK_DIR and path_emb both point at the work path.
WORK_DIR = os.path.join(ROOT, options.workpath)
path_emb = os.path.join(ROOT, options.workpath)
WEIGHTS_NAME = options.weightsname

# --- 'T'/'F' style flags ---
LOADCSV = (options.loadcsv == 'T')
# The two TTA settings become a one-letter tag ('T' / 'P') or an empty string.
TTAHFLIP = 'T' if options.ttahflip == 'T' else ''
TTATRANSPOSE = 'P' if options.ttatranspose == 'T' else ''

# --- label layout: six columns, 'any' last ---
n_classes = 6
label_cols = ['epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural', 'any']

def makeSub(ypred, imgs):
    """Turn a prediction matrix into a long two-column frame.

    Args:
        ypred: 2-D array with one row per image; it is flattened row by row,
            so its columns are paired with ``label_cols`` in order.
        imgs: sequence of image identifiers, one per row of ``ypred``.

    Returns:
        DataFrame with columns ``ID`` (``"<image>_<label>"``) and ``Label``
        (the flattened predictions).
    """
    n_images = ypred.shape[0]
    # Each image id is repeated once per label; the label list is tiled once
    # per image, so the two sequences line up element by element.
    repeated_imgs = np.array(imgs).repeat(len(label_cols))
    tiled_labels = pd.Series(label_cols * n_images)
    row_ids = []
    for img_id, label in zip(repeated_imgs, tiled_labels):
        row_ids.append('{}_{}'.format(img_id, label))
    return pd.DataFrame({'ID': row_ids, 'Label': ypred.flatten()})

class SpatialDropout(nn.Dropout2d):
    """Dropout2d applied to (N, T, K) sequence tensors.

    The feature axis K is moved into the channel position expected by
    ``nn.Dropout2d``, so the channel-wise dropout acts per feature, and the
    tensor is returned in its original (N, T, K) layout.
    """

    def forward(self, x):
        # (N, T, K) -> (N, T, 1, K) -> (N, K, 1, T): features become channels.
        channels_first = x.unsqueeze(2).permute(0, 3, 2, 1)
        masked = super().forward(channels_first)
        # (N, K, 1, T) -> (N, T, 1, K) -> (N, T, K): restore the input layout.
        return masked.permute(0, 3, 2, 1).squeeze(2)

def criterion(data, targets, criterion = torch.nn.BCEWithLogitsLoss()):
    """Loss that counts the last column a second time.

    Args:
        data: 2-D tensor of model outputs (logits for the default loss).
        targets: tensor of the same shape as ``data``.
        criterion: base loss callable taking ``(data, targets)``; defaults to
            ``BCEWithLogitsLoss``. Note that this parameter shadows the
            function's own name inside the body.

    Returns:
        ``(6 * loss over all columns + 1 * loss over the last column) / 7``.
        The last column is the 'any' label when columns follow ``label_cols``.
    """
    base_loss = criterion
    overall = base_loss(data, targets)
    # [:, -1:] keeps the column axis so the base loss still sees 2-D input.
    last_column = base_loss(data[:, -1:], targets[:, -1:])
    weighted_sum = 6 * overall + 1 * last_column
    return weighted_sum / 7

class IntracranialDataset(Dataset):
    """Dataset yielding one item per unique ``SliceID`` group.

    Args:
        df: frame with at least the columns ``SliceID``, ``seq`` and
            ``embidx`` (plus the ``label_cols`` columns when labels are used).
        mat: array indexed by ``embidx`` along its first axis; each row is
            the embedding of one image.
        labels: only its truthiness is used. When truthy, items include a
            ``labels`` tensor read from the module-level ``label_cols``
            columns (not from this argument).
    """

    def __init__(self, df, mat, labels=label_cols):
        self.mat = mat
        self.labels = labels
        # Group keys in order of first appearance; one dataset item per key.
        self.patients = df.SliceID.unique()
        self.data = df.set_index('SliceID')

    def __len__(self):
        return len(self.patients)

    def __getitem__(self, idx):
        """Return the embeddings (with neighbour deltas) for group ``idx``.

        Returns a dict with ``emb`` (numpy array, rows ordered by ``seq``,
        three times the feature width of ``mat``), ``embidx`` (tensor of row
        indices into ``mat``) and, when labels are enabled, ``labels``.
        """
        patidx = self.patients[idx]
        # Look the key up as a one-element list: .loc[key] collapses to a
        # Series when the group holds a single row, and sorting that Series
        # by 'seq' fails. .loc[[key]] always returns a DataFrame.
        patdf = self.data.loc[[patidx]].sort_values('seq')
        patemb = self.mat[patdf['embidx'].values]

        # Differences to the previous and to the next row of the sequence.
        # The first row has no predecessor and the last no successor, so
        # those entries stay zero.
        patdeltalag = np.zeros(patemb.shape)
        patdeltalead = np.zeros(patemb.shape)
        patdeltalag[1:] = patemb[1:] - patemb[:-1]
        patdeltalead[:-1] = patemb[:-1] - patemb[1:]

        # Features become [embedding | lag delta | lead delta] on the last axis.
        patemb = np.concatenate((patemb, patdeltalag, patdeltalead), -1)

        ids = torch.tensor(patdf['embidx'].values)

        if self.labels:
            labels = torch.tensor(patdf[label_cols].values)
            return {'emb': patemb, 'embidx': ids, 'labels': labels}
        else:
            return {'emb': patemb, 'embidx': ids}

def predict(loader):
    """Run the global ``model`` over ``loader`` and collect per-image outputs.

    Args:
        loader: iterable of batches carrying ``emb``, ``mask`` and ``embidx``
            tensors; ``loader.dataset.data`` must hold ``embidx`` and
            ``Image`` columns.
            NOTE(review): the dataset items in this file carry no ``mask``
            key, so it is presumably added by a collate function that also
            pads ``embidx`` with negative values - confirm.

    Returns:
        Tuple ``(probs, images)``: ``probs`` is an array of sigmoid outputs
        with ``n_classes`` columns for the positions whose mask equals 1;
        ``images`` is the list of image names for the positions whose
        ``embidx`` is greater than -1.

    The model's train/eval mode is not touched here; the caller decides it.
    """
    valls = []
    imgls = []
    # embidx -> image name lookup table.
    imgdf = loader.dataset.data.reset_index().set_index('embidx')[['Image']].copy()
    # Inference only: skip autograd bookkeeping so no graph is kept per batch.
    with torch.no_grad():
        for batch in loader:
            inputs = batch["emb"].to(device, dtype=torch.float)
            mask = batch['mask'].to(device, dtype=torch.int)
            logits = model(inputs)
            # Flatten batch and sequence axes, then keep positions with mask == 1.
            maskidx = mask.view(-1) == 1
            logits = logits.view(-1, n_classes)[maskidx]
            valls.append(torch.sigmoid(logits).detach().cpu().numpy())
            # Map the kept embedding indices back to image names.
            embidx = batch["embidx"].detach().cpu().numpy().astype(np.int32)
            flat_idx = embidx.flatten()
            embidx = flat_idx[flat_idx > -1]
            imgls += imgdf.loc[embidx].Image.tolist()
    return np.concatenate(valls, 0), imgls

2020-02-07 01:02:30,668 - Recursion-pytorch - INFO - Load params : time 01:02:30.668766
2020-02-07 01:02:30,669 - Recursion-pytorch - INFO - seed                1234
2020-02-07 01:02:30,670 - Recursion-pytorch - INFO - fold                0
2020-02-07 01:02:30,670 - Recursion-pytorch - INFO - nbags               4
2020-02-07 01:02:30,671 - Recursion-pytorch - INFO - epochs              10
2020-02-07 01:02:30,671 - Recursion-pytorch - INFO - batchsize           4
2020-02-07 01:02:30,672 - Recursion-pytorch - INFO - rootpath            
2020-02-07 01:02:30,672 - Recursion-pytorch - INFO - imgpath             data/mount/512X512X6/
2020-02-07 01:02:30,673 - Recursion-pytorch - INFO - workpath            data/resnext101v12fold1/
2020-02-07 01:02:30,673 - Recursion-pytorch - INFO - weightsname         pytorch_model.bin
2020-02-07 01:02:30,673 - Recursion-pytorch - INFO - lr                  0.00005
2020-02-07 01:02:30,674 - Recursion-pytorch - INFO - logmsg              Recursion-pytorch
202

In [4]:
# Log a timestamp (reuses the 'Cuda set up' message text).
stamp = datetime.datetime.now().time()
logger.info('Cuda set up : time {}'.format(stamp))

2020-02-07 01:02:30,688 - Recursion-pytorch - INFO - Cuda set up : time 01:02:30.688295


In [5]:
# Per-image metadata tables for train and test, read from the data path.
trn_meta_file = os.path.join(path_data, 'train_metadata.csv')
tst_meta_file = os.path.join(path_data, 'test_metadata.csv')
trnmdf = pd.read_csv(trn_meta_file)
tstmdf = pd.read_csv(tst_meta_file)

In [6]:
poscols = ['ImagePos{}'.format(i) for i in range(1, 4)]


def _order_slices(meta):
    """Add the group key and position columns to ``meta`` and return it sorted.

    ``meta`` gains a ``SliceID`` column (patient, series and study ids joined
    with ``__``) and the three ``poscols`` columns parsed from the
    ``ImagePositionPatient`` string. These columns are added to the passed
    frame in place; the returned frame is a new one, sorted by ``SliceID``
    and then by the position columns, reduced to the id and position columns,
    with a fresh index.
    """
    uid_cols = ['PatientID', 'SeriesInstanceUID', 'StudyInstanceUID']
    meta['SliceID'] = meta[uid_cols].apply(lambda x: '{}__{}__{}'.format(*x.tolist()), axis=1)

    # ImagePositionPatient holds the text of a Python list literal.
    coords = meta['ImagePositionPatient'].apply(lambda x: list(map(float, ast.literal_eval(x))))
    # Build the frame on meta's own index: column assignment aligns on the
    # index, so a default 0..n-1 index would only match a freshly read frame.
    meta[poscols] = pd.DataFrame(coords.tolist(), index=meta.index, columns=poscols)

    keep = ['PatientID', 'SliceID', 'SOPInstanceUID'] + poscols
    return meta.sort_values(['SliceID'] + poscols)[keep].reset_index(drop=True)


# The same preparation is applied to both metadata tables.
trnmdf = _order_slices(trnmdf)
tstmdf = _order_slices(tstmdf)

In [7]:
# 1-based running position of each row inside its SliceID group, following
# the current row order of the frame.
trnmdf['seq'] = 1 + trnmdf.groupby(['SliceID']).cumcount()
tstmdf['seq'] = 1 + tstmdf.groupby(['SliceID']).cumcount()

# Keep only the id columns and the sequence position.
keepcols = ['PatientID', 'SliceID', 'SOPInstanceUID', 'seq']
trnmdf, tstmdf = trnmdf[keepcols], tstmdf[keepcols]

# Same four columns, with SOPInstanceUID exposed under the name 'Image'.
renamed_cols = ['PatientID', 'SliceID', 'Image', 'seq']
trnmdf.columns = renamed_cols
tstmdf.columns = renamed_cols

In [8]:
# Manual overrides of the values derived from the parsed options; these three
# settings are what the saved loader file names are built from.
SIZE, fold, GLOBALEPOCH = 480, 1, 0

In [10]:
# Load Data Frames
trndf = loadobj(os.path.join(path_emb, 'loader_trn_size{}_fold{}_ep{}'.format(SIZE, fold, GLOBALEPOCH))).dataset.data
valdf = loadobj(os.path.join(path_emb, 'loader_val_size{}_fold{}_ep{}'.format(SIZE, fold, GLOBALEPOCH))).dataset.data
tstdf = loadobj(os.path.join('data/stg2tst', 'loader_tst2_size{}_fold{}_ep{}'.format(SIZE, fold, GLOBALEPOCH))).dataset.data

In [11]:
# Record each row's position in the loaded frame before merging, since the
# merge may reorder or drop rows.
# NOTE(review): presumably this position is the row of the matching embedding
# in the embedding matrix - confirm against where the matrix is loaded.
trndf['embidx'] = range(trndf.shape[0])
valdf['embidx'] = range(valdf.shape[0])
tstdf['embidx'] = range(tstdf.shape[0])

# Attach SliceID and seq by image name. Train and validation both come from
# the train metadata; PatientID is dropped from it before merging.
# drop(columns=...) replaces the positional axis form drop('PatientID', 1),
# which pandas 2.0 no longer accepts.
trn_slices = trnmdf.drop(columns='PatientID')
trndf = trndf.merge(trn_slices, on='Image')
valdf = valdf.merge(trn_slices, on='Image')
# The test frame is merged with the full test metadata, PatientID included.
tstdf = tstdf.merge(tstmdf, on='Image')

In [12]:
# (rows, columns) of the training frame after the metadata merge.
trndf.shape

(539827, 12)

In [13]:
# (rows, columns) of the validation frame after the metadata merge.
valdf.shape

(134430, 12)

In [14]:
# (rows, columns) of the test metadata table, for comparison with tstdf.
tstmdf.shape

(121232, 4)

In [15]:
# (rows, columns) of the test frame after the metadata merge.
tstdf.shape

(121232, 6)

In [16]:
# NOTE(review): repeats the tstdf.shape check from the cell before it.
tstdf.shape

(121232, 6)

In [17]:
# Write the (rows, columns) of the three merged frames to the log.
shape_reports = (
    ('Trn df shape {} {}', trndf),
    ('Val df shape {} {}', valdf),
    ('Tst df shape {} {}', tstdf),
)
for template, frame in shape_reports:
    n_rows, n_cols = frame.shape
    logger.info(template.format(n_rows, n_cols))

2020-02-07 01:04:02,212 - Recursion-pytorch - INFO - Trn df shape 539827 12
2020-02-07 01:04:02,213 - Recursion-pytorch - INFO - Val df shape 134430 12
2020-02-07 01:04:02,213 - Recursion-pytorch - INFO - Tst df shape 121232 6
