In [29]:
import os
import sys
import torch
import torch.nn as nn
import accimage
from PIL import Image
from imageio import imread
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, models, transforms, set_image_backend, get_image_backend
import data_utils
import numpy as np
import pandas as pd
import pickle

%reload_ext autoreload
%autoreload 2

In [30]:
# https://github.com/pytorch/accimage
# Select accimage as torchvision's image backend, then echo the active backend
# as the cell output to confirm the switch took effect.
set_image_backend('accimage')
get_image_backend()

'accimage'

## ImageFolder

In [None]:
# Position (within the directory listing) of the slide to inspect.
# NOTE(review): os.listdir returns entries in arbitrary order, so this index is
# not guaranteed to select the same slide on every run or machine.
i = 3
TCGA_COAD_IMG_DIR = '/n/mounted-data-drive/COAD/'

dirs = os.listdir(TCGA_COAD_IMG_DIR)
# Drop the last four characters of every entry (presumably a '.svs' suffix -- TODO confirm).
imgs = [d[:-4] for d in dirs]
# '<root><entry>/<entry without suffix>_files' for the selected slide.
current_img = TCGA_COAD_IMG_DIR + dirs[i] + '/' + imgs[i] + '_files'

In [None]:
# https://github.com/pytorch/examples/issues/236
current_img

In [None]:
# https://github.com/pytorch/examples/blob/42e5b996718797e45c46a25c55b031e6768f8440/imagenet/main.py#L89-L101
train_dir = current_img
# Per-channel mean/std; presumably the values from the ImageNet example linked above.
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
# Augmentation pipeline: random crop resized to 256, random horizontal flip,
# conversion to a tensor, then channel-wise normalization.
transform = transforms.Compose([
    transforms.RandomResizedCrop(256),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize])

# ImageFolder derives class labels from the sub-directory names of train_dir.
train_dataset = datasets.ImageFolder(train_dir, transform=transform)
# One image per batch, reshuffled on every pass over the loader.
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, pin_memory=True)

In [None]:
train_dataset.class_to_idx['20.0']

In [None]:
for i,img in enumerate(train_loader):
    break

In [None]:
img[0].shape

## New Dataset Class

In [None]:
# https://github.com/pytorch/vision/blob/master/torchvision/transforms/functional.py
# https://pillow.readthedocs.io/en/5.1.x/handbook/concepts.html#concept-modes
# Hand-written mapping from slide name to a 0/1 label, used to smoke-test the
# custom dataset class. NOTE(review): the alternating 0/1 values look like
# placeholders rather than real annotations -- confirm before reusing.
sample_annotations = {'TCGA-T9-A92H-01Z-00-DX3.1DE7D5ED-60F7-4645-8243-AB0C027B3ED7': 0, 
                      'TCGA-WS-AB45-01Z-00-DX1.1FD99E7A-830F-40DC-98CD-53C62C678AC6': 1,
                      'TCGA-NH-A8F8-01Z-00-DX1.0C13D583-0BCE-44F7-A4E6-5994FE97B99C': 0,
                      'TCGA-QG-A5YV-01Z-00-DX1.9B7FD3EA-D1AB-44B3-B728-820939EF56EA': 1,
                      'TCGA-QG-A5YW-01Z-00-DX1.3242285F-FA82-4A92-9D0E-951013A3C91A': 0,
                      'TCGA-QG-A5YX-01Z-00-DX1.28125B5A-B696-44AE-8A86-72E2CF7B9A6A': 1,
                      'TCGA-QG-A5Z1-01Z-00-DX2.2CE72B6A-557F-43BD-BA4C-B252E14E46EF': 0,
                      'TCGA-QG-A5Z2-01Z-00-DX2.F2352352-8F00-4BB3-8A62-8D1C1E374F95': 1,
                      'TCGA-QL-A97D-01Z-00-DX1.6B48E95D-BE3C-4448-A1AF-6988C00B7AF1': 0,
                      'TCGA-SS-A7HO-01Z-00-DX1.D20B9109-F984-40DE-A4F1-2DFC61002862': 1}
root_dir = '/n/mounted-data-drive/COAD/'
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
# No augmentation here: tiles are only converted to tensors and normalized.
transform = transforms.Compose([transforms.ToTensor(), normalize])

In [None]:
train_set = data_utils.TCGADataset(sample_annotations, root_dir, transform=transform)

In [None]:
sample = train_set.__getitem__(1)

In [None]:
sample['slide'].shape

In [None]:
sample['label']

In [None]:
train_loader = DataLoader(train_set, batch_size=1, shuffle=True, pin_memory=True)

In [None]:
for s in train_loader:
    print(s['slide'].shape, s['label'])

## COAD

In [31]:
root_dir = '/n/mounted-data-drive/COAD/'
# Previous normalization statistics, kept for reference but disabled.
#normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
# mean 0 / std 1 turns Normalize into an identity op: (x - 0) / 1 == x.
normalize = transforms.Normalize(mean=[0.0, 0.0, 0.0], std=[1.0, 1.0, 1.0])
transform = transforms.Compose([transforms.ToTensor(), normalize])

In [32]:
# Spreadsheet holding per-sample mutation counts for several repeat motifs.
# NOTE(review): absolute, user-specific path -- consider a configurable data directory.
msi_path = '/home/sxchao/MSI_prediction/tcga_project/msi_raw_data.xlsx'
msi_raw = pd.read_excel(msi_path)

In [33]:
#msi_raw.columns

In [34]:
# Short snake_case aliases for the verbose spreadsheet headers.
msi_column_aliases = {
    'Tumor type': 'tumor_type',
    'Donor id': 'donor_id',
    'Tumor sample id': 'tumor_id',
    'Normal sample id': 'normal_id',
    'Number of mutations A motif': 'muts_A',
    'Number of covered A loci': 'covg_A',
    'Number of mutations C motif': 'muts_C',
    'Number of covered C loci': 'covg_C',
    'Number of mutations AC motif': 'muts_AC',
    'Number of covered AC loci': 'covg_AC',
    'Number of mutations AG motif': 'muts_AG',
    'Number of covered AG loci': 'covg_AG',
}
# Renames the columns of the existing frame in place.
msi_raw.rename(columns=msi_column_aliases, inplace=True)

In [35]:
# Total mutation count across the four motif classes (NaN propagates, as with plain '+').
msi_raw['muts_tot'] = (
    msi_raw['muts_A']
    + msi_raw['muts_C']
    + msi_raw['muts_AC']
    + msi_raw['muts_AG']
)
# Binary MSI label: 1 when the total reaches 20 mutations, otherwise 0.
msi_raw['msi'] = (msi_raw['muts_tot'] >= 20).astype(int)
#msi_raw.tail(3)

In [36]:
#msi_raw.groupby('tumor_type')['msi'].mean()

In [37]:
# Donor ids of every row whose tumor type is COAD, as a numpy array.
is_coad = msi_raw['tumor_type'] == 'COAD'
coad_msi = msi_raw.loc[is_coad, 'donor_id'].values
#coad_msi.shape

In [38]:
# Use the last COAD donor id as a template for the id length.
# NOTE(review): assumes every donor id has the same length -- confirm.
sample_name = coad_msi[-1]
# Number of leading characters of a slide directory name that form the donor id.
name_len = len(sample_name)
#sample_name

In [39]:
# Every entry under root_dir, plus the donor-id-length prefix of each entry.
coad_full_name = os.listdir(root_dir)
coad_img = np.array([entry[:name_len] for entry in coad_full_name])
#len(coad_img), coad_img[5], coad_full_name[5]

In [40]:
# Donor ids that have both an image directory and an MSI record
# (np.intersect1d returns the sorted, unique common values).
coad_both = np.intersect1d(coad_img, coad_msi)

In [41]:
# Donors excluded from the cohort.
# 5.0 empty for 'TCGA-A6-2675-01Z-00-DX1.d37847d6-c17f-44b9-b90a-84cd1946c8ab'
excluded_donors = {'TCGA-A6-2675'}

sample_names = []
for sample in coad_both:
    if sample in excluded_donors:
        continue
    # Positions of every directory entry whose prefix equals this donor id.
    matches = np.flatnonzero(coad_img == sample)
    if matches.size == 0:
        continue
    # A donor can own several slide directories. Indexing the Python list with
    # a multi-element array raises TypeError, so pick the first match explicitly;
    # this keeps exactly one slide per donor.
    first_match = int(matches[0])
    # Strip the trailing four characters of the directory name to get the slide name.
    sample_names.append(coad_full_name[first_match][:-4])

In [42]:
len(sample_names)

246

In [43]:
# Index the MSI table by donor id so labels can be looked up with
# msi_raw.loc[donor_id, 'msi'].  The guard makes the cell safe to re-run:
# once 'donor_id' has become the index it is no longer a column, and calling
# set_index again would raise KeyError.
if msi_raw.index.name != 'donor_id':
    msi_raw.set_index('donor_id', inplace=True)
#msi_raw.head(3)

In [44]:
# Reproducible 80/20 train/validation split over slide indices.
# A private RandomState keeps the split identical across kernel restarts
# without reseeding numpy's global generator, which other cells draw from.
SPLIT_SEED = 54321
TRAIN_FRACTION = 0.8

split_rng = np.random.RandomState(SPLIT_SEED)
reorder = split_rng.permutation(len(sample_names))
n_train = int(np.floor(len(sample_names) * TRAIN_FRACTION))
train = reorder[:n_train]   # indices into sample_names used for training
val = reorder[n_train:]     # remaining indices, used for validation
len(train), len(val)

(196, 50)

In [45]:
sample_names = np.array(sample_names)

In [46]:
# Map each training slide name to the MSI label of its donor.
sample_annotations = dict()
for sample_name in sample_names[train]:
    # The donor id is the leading name_len characters of the slide name.
    donor_id = sample_name[:name_len]
    sample_annotations[sample_name] = msi_raw.loc[donor_id, 'msi']

In [47]:
all_coad = list(sample_annotations.values())
sum(all_coad) / len(all_coad)

0.39285714285714285

In [49]:
# Training dataset built from the slide-name -> label mapping.
train_set = data_utils.TCGADataset(sample_annotations, root_dir, transform=transform)
# One slide per batch, reshuffled on every pass over the loader.
train_loader = DataLoader(train_set, batch_size=1, shuffle=True, pin_memory=True)

In [50]:
# Map each validation slide name to the MSI label of its donor.
sample_annotations_val = dict()
for sample_name in sample_names[val]:
    # The donor id is the leading name_len characters of the slide name.
    donor_id = sample_name[:name_len]
    sample_annotations_val[sample_name] = msi_raw.loc[donor_id, 'msi']

In [51]:
all_coad_val = list(sample_annotations_val.values())
sum(all_coad_val) / len(all_coad_val)

0.42

In [52]:
# Validation dataset built from the held-out slide-name -> label mapping.
valid_set = data_utils.TCGADataset(sample_annotations_val, root_dir, transform=transform)
# One slide per batch, in a fixed order (no shuffling).
valid_loader = DataLoader(valid_set, batch_size=1, shuffle=False, pin_memory=True)

## Pre-trained Model

In [None]:
# inception_v3 expects tensors with a size of N x 3 x 299 x 299
#net = models.inception_v3(pretrained=True)
# ResNet-152 with pretrained weights.
net = models.resnet152(pretrained=True)
# Freeze every parameter so no gradients are computed for the backbone.
for param in net.parameters():
    param.requires_grad = False
# Move the model to the GPU; the returned module is echoed as the cell output.
# NOTE(review): `net` is rebound to a ConvNet further down, so this model is
# only used by cells executed before that point.
net.cuda()

In [94]:
def update_tile_shape(H_in, W_in, kernel_size, dilation=1., padding=0., stride=1.):
    """Return the (height, width) produced by a square conv/pool window.

    Each axis is computed as
    floor((size + 2*padding - dilation*(kernel_size-1) - 1) / stride + 1).

    Args:
        H_in: input height.
        W_in: input width.
        kernel_size: side length of the square window.
        dilation: spacing between window elements.
        padding: padding added to both sides of each axis.
        stride: step of the window.

    Returns:
        Tuple ``(H_out, W_out)`` of Python ints.
    """
    def _axis_out(size_in):
        # Same operand order as the closed-form expression in the docstring.
        raw = (size_in + 2. * padding - dilation * (kernel_size - 1) - 1) / stride + 1
        return int(np.floor(raw))

    return _axis_out(H_in), _axis_out(W_in)

In [95]:
class ConvNet(nn.Module):
    """Small CNN that embeds 3-channel image tiles.

    The network is ``n_conv_layers`` blocks of Conv2d -> ReLU -> MaxPool2d(5, stride=5)
    followed by ``n_fc_layers`` blocks of Linear -> ReLU -> Dropout.  ``forward``
    returns the output of the last fully connected block.  ``classification_layer``
    (a 2-way Linear) is stored on the module but is NOT applied by ``forward``,
    so callers can pool the embeddings before classifying.

    Args:
        n_conv_layers: number of convolutional blocks.
        n_fc_layers: number of fully connected blocks.
        kernel_size: sequence with one conv kernel size per conv block.
        n_conv_filters: sequence with one output-channel count per conv block.
        hidden_size: sequence with one output size per fully connected block.
        dropout: dropout probability applied after every fully connected block.
        dilation: dilation of every convolution (truncated to ``int``).
        padding: zero padding of every convolution (truncated to ``int``).
        H_in: height of the input tiles; used to size the first Linear layer.
        W_in: width of the input tiles; used to size the first Linear layer.

    Raises:
        ValueError: if the feature map shrinks to a non-positive size.
    """
    def __init__(self, n_conv_layers, n_fc_layers, kernel_size, n_conv_filters, hidden_size, dropout=0.5,
                dilation = 1., padding = 0, H_in = 256, W_in = 256):
        super(ConvNet, self).__init__()
        self.n_conv_layers = n_conv_layers
        self.n_fc_layers = n_fc_layers
        self.kernel_size = kernel_size
        self.n_conv_filters = n_conv_filters
        self.hidden_size = hidden_size
        self.conv_layers = []
        self.fc_layers = []
        self.mp_ker = 5 # max pool kernel size
        self.mp_str = 5 # max pool stride
        # Stateless modules shared by every block.
        self.m = nn.MaxPool2d(self.mp_ker, stride=self.mp_str)
        self.n = nn.Dropout(dropout)
        self.relu = nn.ReLU()
        self.H_in, self.W_in = H_in, W_in

        # nn.Conv2d needs integer geometry, while the signature defaults to a
        # float dilation; normalise once so the layers and the shape
        # bookkeeping below always agree.
        conv_dilation = int(dilation)
        conv_padding = int(padding)

        in_channels = 3
        for layer in range(self.n_conv_layers):
            self.conv_layers.append(nn.Conv2d(in_channels, self.n_conv_filters[layer], self.kernel_size[layer],
                                              padding=conv_padding, dilation=conv_dilation))
            self.conv_layers.append(self.relu)
            self.conv_layers.append(self.m)
            # convolution
            self.H_in, self.W_in = update_tile_shape(self.H_in, self.W_in, self.kernel_size[layer],
                                                     dilation=conv_dilation, padding=conv_padding)
            # max pooling
            self.H_in, self.W_in = update_tile_shape(self.H_in, self.W_in, self.mp_ker, stride=self.mp_str)
            if self.H_in <= 0 or self.W_in <= 0:
                raise ValueError(
                    'feature map collapsed to {0}x{1} after conv block {2}; '
                    'use larger tiles or fewer/smaller layers'.format(self.H_in, self.W_in, layer))
            in_channels = self.n_conv_filters[layer]
        # Flattened feature size that feeds the first fully connected layer.
        in_channels = in_channels * self.H_in * self.W_in
        for layer in range(self.n_fc_layers):
            self.fc_layers.append(nn.Linear(in_channels, self.hidden_size[layer]))
            self.fc_layers.append(self.relu)
            self.fc_layers.append(self.n)
            in_channels = self.hidden_size[layer]
        # Wrapping the plain lists in nn.Sequential registers their parameters.
        self.conv = nn.Sequential(*self.conv_layers)
        self.fc = nn.Sequential(*self.fc_layers)
        self.classification_layer = nn.Linear(in_channels, 2)

    def forward(self, x):
        """Embed a batch of tiles; returns the last fully connected activations."""
        embed = self.conv(x)
        # Flatten everything except the leading (tile) dimension.
        embed = embed.view(x.shape[0],-1)
        y = self.fc(embed)
        return y

In [96]:
# Hyper-parameters of the tile-embedding network.
n_conv_layers = 2
n_fc_layers = 2
kernel_size = [4,3]          # one conv kernel size per conv block
n_conv_filters = [36,48]     # one output-channel count per conv block
hidden_size = [512,512]      # one output size per fully connected block
dropout = 0.5
net = ConvNet(n_conv_layers, n_fc_layers, kernel_size, n_conv_filters, hidden_size, dropout=dropout)
# Move the model to the GPU; the returned module is echoed as the cell output.
net.cuda()

ConvNet(
  (m): MaxPool2d(kernel_size=5, stride=5, padding=0, dilation=1, ceil_mode=False)
  (n): Dropout(p=0.5)
  (relu): ReLU()
  (conv): Sequential(
    (0): Conv2d(3, 36, kernel_size=(4, 4), stride=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=5, stride=5, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(36, 48, kernel_size=(3, 3), stride=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=5, stride=5, padding=0, dilation=1, ceil_mode=False)
  )
  (fc): Sequential(
    (0): Linear(in_features=3888, out_features=512, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5)
    (3): Linear(in_features=512, out_features=512, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.5)
  )
  (classification_layer): Linear(in_features=512, out_features=2, bias=True)
)

In [97]:
def pool_fn(x):
    """Collapse the first dimension of ``x`` by averaging over it.

    Max pooling (``torch.max(x, 0)``) is the alternative that was tried here.
    """
    return x.mean(dim=0)

In [None]:
for slide,label in train_loader:
    break

In [None]:
slide.shape

In [None]:
slide = slide[:,np.random.permutation(slide.shape[1])[:200],:,:,:]
slide = slide.squeeze(0).cuda()

In [None]:
embed = net(slide)
embed.shape

In [None]:
pool = pool_fn(embed).unsqueeze(0)
pool.shape

In [None]:
output = net.classification_layer(pool)
output.shape

In [98]:
def embedding_training_loop(e, train_loader, net, criterion, optimizer, pool_fn):
    """Train ``net`` for one pass over ``train_loader``, one optimizer step per slide.

    For every ``(slide, label)`` pair the tiles are embedded with ``net``, the
    embeddings are pooled with ``pool_fn``, and the pooled vector is classified
    with ``net.classification_layer``.  The average loss is printed at the end.

    Args:
        e: epoch number, used only in the printed message.
        train_loader: iterable yielding ``(slide, label)`` pairs whose leading
            dimension of size 1 is squeezed away from ``slide``.
        net: module exposing ``classification_layer``.
        criterion: loss called as ``criterion(output, label)``.
        optimizer: optimizer driving the update.
        pool_fn: callable reducing the per-tile embeddings to a single vector.

    Returns:
        None.
    """
    net.train()
    # Follow the model's device rather than hard-coding CUDA, so inputs always
    # land next to the weights (GPU in the notebook, CPU elsewhere).
    first_param = next(net.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    total_loss = 0.0
    n_slides = 0

    for slide, label in train_loader:
        #slide = slide[:,np.random.permutation(slide.shape[1])[:100],:,:,:].squeeze(0)
        slide, label = slide.squeeze(0).to(device), label.to(device)
        # Clear gradients before the backward pass so nothing left over from an
        # earlier, interrupted run leaks into this update.
        optimizer.zero_grad()
        tile_embeddings = net(slide)
        pool = pool_fn(tile_embeddings).unsqueeze(0)
        output = net.classification_layer(pool)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        n_slides += 1

    # An empty loader yields NaN instead of failing on an undefined loop index.
    avg_loss = total_loss / n_slides if n_slides else float('nan')
    print('Epoch: {0}, Avg Train NLL: {1:0.4f}'.format(e, avg_loss))
    

def embedding_validation_loop(e, valid_loader, net, criterion, pool_fn, dataset='Val'):
    """Evaluate ``net`` on ``valid_loader`` and print average loss and accuracy.

    Runs the same embed -> pool -> classify pipeline as the training loop, but
    in eval mode and with gradient tracking disabled.

    Args:
        e: epoch number, used only in the printed message.
        valid_loader: iterable yielding ``(slide, label)`` pairs whose leading
            dimension of size 1 is squeezed away from ``slide``.
        net: module exposing ``classification_layer``.
        criterion: loss called as ``criterion(output, label)``.
        pool_fn: callable reducing the per-tile embeddings to a single vector.
        dataset: name of the split, used only in the printed message.

    Returns:
        The summed (not averaged) loss over all slides, as a Python float.
    """
    net.eval()
    # Follow the model's device rather than hard-coding CUDA.
    first_param = next(net.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    total_loss = 0.0
    labels = []
    preds = []

    # No gradients are needed for evaluation; skipping the autograd graph
    # saves memory for every slide processed.
    with torch.no_grad():
        for slide, label in valid_loader:
            #slide = slide[:,np.random.permutation(slide.shape[1])[:100],:,:,:].squeeze(0)
            slide, label = slide.squeeze(0).to(device), label.to(device)
            pool = pool_fn(net(slide)).unsqueeze(0)
            output = net.classification_layer(pool)
            loss = criterion(output, label)

            total_loss += loss.item()
            labels.extend(label.float().cpu().numpy())
            # Index of the largest logit is the predicted class.
            preds.append(torch.argmax(output).float().cpu().numpy())

    n_slides = len(preds)
    if n_slides:
        avg_loss = total_loss / n_slides
        acc = np.mean(np.array(labels) == np.array(preds))
    else:
        # An empty loader yields NaN metrics instead of failing on an undefined loop index.
        avg_loss = acc = float('nan')
    print('Epoch: {0}, Avg {3} NLL: {1:0.4f}, {3} Acc: {2:0.4f}'.format(e, avg_loss, acc, dataset))

    return total_loss

In [99]:
learning_rate = 1e-4
# Cross-entropy over the two output logits of net.classification_layer.
criterion = nn.CrossEntropyLoss()
# Adam over all parameters registered on `net`.
optimizer = torch.optim.Adam(net.parameters(), lr = learning_rate)

In [None]:
# Run 50 epochs: one training pass followed by one validation pass per epoch.
for e in range(50):
    embedding_training_loop(e, train_loader, net, criterion, optimizer, pool_fn)
    #train_loss = embedding_validation_loop(e, train_loader, net, criterion, pool_fn, dataset='Train')
    # val_loss is the summed validation loss returned by the loop; it is
    # overwritten every epoch and not used further in this cell.
    val_loss = embedding_validation_loop(e, valid_loader, net, criterion, pool_fn, dataset='Val')

In [None]:
for idx,(slide,label) in enumerate(train_loader):
    break

In [None]:
idx, slide.shape, label

In [85]:
def count_parameters(model):
    """Return the total number of elements across the trainable parameters of ``model``.

    Parameters with ``requires_grad`` set to False are not counted.
    """
    n_trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            n_trainable += param.numel()
    return n_trainable

In [86]:
count_parameters(net)

2272214

In [None]:
for idx,(slide,label) in enumerate(valid_loader):
    print(idx, slide.shape)

## New Dataset

In [16]:
# Upper bound on the number of tiles kept per slide.
max_tiles = 100

all_slides = []
all_slide_labels = []

for idx,sample_name in enumerate(sample_names):
    # The loop variable must drive the slide selection: overriding it with
    # sample_names[-1] would cache the last slide (and its label) for every entry.
    slide_tiles = []
    img_dir = root_dir + sample_name + '.svs/' + sample_name + '_files/5.0'
    imgs = os.listdir(img_dir)

    for im in imgs:
        path = img_dir + '/' + im
        image = data_utils.accimage_loader(path)

        if transform is not None:
            image = transform(image)

        # Keep only full 256x256 tiles; other sizes are dropped.
        if image.shape[1] == 256 and image.shape[2] == 256:
            slide_tiles.append(image)

    slide = torch.stack(slide_tiles)
    # The label is looked up by donor id, i.e. the leading name_len characters.
    label = msi_raw.loc[sample_name[0:name_len], 'msi']
    # Random subset of at most max_tiles tiles.
    slide = slide[np.random.permutation(slide.shape[0])[:max_tiles],:,:,:]

    all_slides.append(slide)
    all_slide_labels.append(label)
    
    if idx % 10 == 0:
        print(idx, end=' ')

0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 200 210 220 230 240 

In [26]:
TCGA_COAD_TRAIN = '/n/tcga_coad_train.pkl'
TCGA_COAD_VALID = '/n/tcga_coad_valid.pkl'

In [24]:
all_slides[0].shape, all_slide_labels[0]

(torch.Size([100, 3, 256, 256]), 1)

In [27]:
# Persist the first 196 slides and labels as the training set.
# NOTE(review): this is a positional split in sample_names order, not the
# random `train` index split computed earlier -- confirm which is intended.
# NOTE(review): the recorded MemoryError is presumably caused by serialising
# every slide tensor in a single dump (about 75 MiB per 100-tile slide if the
# tiles are float32 -- TODO confirm); consider writing one file per slide.
with open(TCGA_COAD_TRAIN, 'wb') as f: 
    pickle.dump([all_slides[:196], all_slide_labels[:196]], f)

MemoryError: 

In [25]:
# Persist the slides and labels from position 196 onwards as the validation set.
# NOTE(review): positional split in sample_names order; this dump also ended
# in the MemoryError recorded below.
with open(TCGA_COAD_VALID, 'wb') as f: 
    pickle.dump([all_slides[196:], all_slide_labels[196:]], f)

MemoryError: 

In [None]:
class TCGADataset_from_pkl(Dataset):
    """Dataset served from a pickle file that holds a ``[slides, labels]`` pair.

    Both sequences are loaded into memory when the dataset is constructed.
    Item ``idx`` is the tuple ``(slides[idx], labels[idx])`` and the dataset
    length is the number of labels.
    """

    def __init__(self, pickle_file):
        # NOTE: unpickling can execute arbitrary code; only open trusted files.
        with open(pickle_file, 'rb') as handle:
            slides, slide_labels = pickle.load(handle)
        self.data = slides
        self.labels = slide_labels

    def __len__(self):
        n_items = len(self.labels)
        return n_items

    def __getitem__(self, idx):
        item = self.data[idx]
        target = self.labels[idx]
        return item, target

In [None]:
# Rebuild the loaders from the pickled slide caches.
# NOTE(review): `train` was previously an array of slide indices; it is
# rebound to a dataset here, so cells that index sample_names[train] will
# no longer work after this cell runs.
train = TCGADataset_from_pkl(TCGA_COAD_TRAIN)
train_loader = torch.utils.data.DataLoader(train, batch_size=1, shuffle=True, pin_memory=True)
valid = TCGADataset_from_pkl(TCGA_COAD_VALID)
valid_loader = torch.utils.data.DataLoader(valid, batch_size=1, shuffle=False, pin_memory=True)

In [28]:
temp = os.listdir(img_dir)
len(temp)

1010