In [44]:
import os
import sys
import torch
import accimage
from PIL import Image
from imageio import imread
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, models, transforms, set_image_backend, get_image_backend
import data_utils
import numpy as np
import pandas as pd

%reload_ext autoreload
%autoreload 2

In [2]:
# https://github.com/pytorch/accimage
set_image_backend('accimage')
get_image_backend()

'accimage'

## ImageFolder

In [4]:
i = 3
TCGA_COAD_IMG_DIR = '/n/mounted-data-drive/COAD/'

dirs = os.listdir(TCGA_COAD_IMG_DIR)
imgs = [d[:-4] for d in dirs]
current_img = TCGA_COAD_IMG_DIR + dirs[i] + '/' + imgs[i] + '_files'

In [5]:
# https://github.com/pytorch/examples/issues/236
current_img

'/n/mounted-data-drive/COAD/TCGA-DM-A0XF-01Z-00-DX1.6FD3D3CF-A1E2-4F4E-BF02-F81B1A1061CC.svs/TCGA-DM-A0XF-01Z-00-DX1.6FD3D3CF-A1E2-4F4E-BF02-F81B1A1061CC_files'

In [6]:
# https://github.com/pytorch/examples/blob/42e5b996718797e45c46a25c55b031e6768f8440/imagenet/main.py#L89-L101
train_dir = current_img
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
transform = transforms.Compose([
    transforms.RandomResizedCrop(256),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize])

train_dataset = datasets.ImageFolder(train_dir, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, pin_memory=True)

In [7]:
train_dataset.class_to_idx['20.0']

15

In [8]:
for i,img in enumerate(train_loader):
    break

In [9]:
img[0].shape

torch.Size([1, 3, 256, 256])

## New Dataset Class

In [10]:
# https://github.com/pytorch/vision/blob/master/torchvision/transforms/functional.py
# https://pillow.readthedocs.io/en/5.1.x/handbook/concepts.html#concept-modes
sample_annotations = {'TCGA-T9-A92H-01Z-00-DX3.1DE7D5ED-60F7-4645-8243-AB0C027B3ED7': 0, 
                      'TCGA-WS-AB45-01Z-00-DX1.1FD99E7A-830F-40DC-98CD-53C62C678AC6': 1,
                      'TCGA-NH-A8F8-01Z-00-DX1.0C13D583-0BCE-44F7-A4E6-5994FE97B99C': 0,
                      'TCGA-QG-A5YV-01Z-00-DX1.9B7FD3EA-D1AB-44B3-B728-820939EF56EA': 1,
                      'TCGA-QG-A5YW-01Z-00-DX1.3242285F-FA82-4A92-9D0E-951013A3C91A': 0,
                      'TCGA-QG-A5YX-01Z-00-DX1.28125B5A-B696-44AE-8A86-72E2CF7B9A6A': 1,
                      'TCGA-QG-A5Z1-01Z-00-DX2.2CE72B6A-557F-43BD-BA4C-B252E14E46EF': 0,
                      'TCGA-QG-A5Z2-01Z-00-DX2.F2352352-8F00-4BB3-8A62-8D1C1E374F95': 1,
                      'TCGA-QL-A97D-01Z-00-DX1.6B48E95D-BE3C-4448-A1AF-6988C00B7AF1': 0,
                      'TCGA-SS-A7HO-01Z-00-DX1.D20B9109-F984-40DE-A4F1-2DFC61002862': 1}
root_dir = '/n/mounted-data-drive/COAD/'
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
transform = transforms.Compose([transforms.ToTensor(), normalize])

In [11]:
train_set = data_utils.TCGADataset(sample_annotations, root_dir, transform=transform)

In [13]:
sample = train_set.__getitem__(1)

In [14]:
sample['slide'].shape

torch.Size([1010, 3, 256, 256])

In [15]:
sample['label']

1

In [16]:
train_loader = DataLoader(train_set, batch_size=1, shuffle=True, pin_memory=True)

In [17]:
for s in train_loader:
    print(s['slide'].shape, s['label'])

torch.Size([1, 912, 3, 256, 256]) tensor([0])
torch.Size([1, 1097, 3, 256, 256]) tensor([1])
torch.Size([1, 846, 3, 256, 256]) tensor([0])
torch.Size([1, 966, 3, 256, 256]) tensor([0])
torch.Size([1, 1155, 3, 256, 256]) tensor([1])
torch.Size([1, 867, 3, 256, 256]) tensor([1])
torch.Size([1, 955, 3, 256, 256]) tensor([0])
torch.Size([1, 703, 3, 256, 256]) tensor([0])
torch.Size([1, 1010, 3, 256, 256]) tensor([1])
torch.Size([1, 1373, 3, 256, 256]) tensor([1])


## COAD

In [19]:
msi_path = '/home/sxchao/MSI_prediction/tcga_project/msi_raw_data.xlsx'
msi_raw = pd.read_excel(msi_path)

In [24]:
msi_raw.columns

Index(['Tumor type', 'Donor id', 'Tumor sample id', 'Normal sample id',
       'Number of mutations A motif', 'Number of covered A loci',
       'Number of mutations C motif', 'Number of covered C loci',
       'Number of mutations AC motif', 'Number of covered AC loci',
       'Number of mutations AG motif', 'Number of covered AG loci'],
      dtype='object')

In [25]:
msi_raw.rename(columns={'Tumor type':'tumor_type', 
                        'Donor id':"donor_id", 
                        'Tumor sample id':'tumor_id',
                        'Normal sample id':'normal_id',
                        'Number of mutations A motif':'muts_A', 
                        'Number of covered A loci':'covg_A',
                        'Number of mutations C motif':'muts_C', 
                        'Number of covered C loci':'covg_C',
                        'Number of mutations AC motif':'muts_AC', 
                        'Number of covered AC loci':'covg_AC',
                        'Number of mutations AG motif':'muts_AG', 
                        'Number of covered AG loci':'covg_AG'}, 
               inplace=True)

In [154]:
msi_raw['muts_tot'] = msi_raw['muts_A'] + msi_raw['muts_C'] + msi_raw['muts_AC'] + msi_raw['muts_AG']
msi_raw['msi'] = msi_raw['muts_tot'] >= 20
msi_raw.msi = msi_raw.msi.astype(int)
msi_raw.tail(3)

Unnamed: 0,tumor_type,donor_id,tumor_id,normal_id,muts_A,covg_A,muts_C,covg_C,muts_AC,covg_AC,muts_AG,covg_AG,muts_tot,msi
6744,UCS,TCGA-NG-A4VW,TCGA-NG-A4VW-01A-11D-A28R-08,TCGA-NG-A4VW-10A-01D-A28U-08,4,78297,3,63736,0,2563,0,4741,7,0
6745,UCS,TCGA-QM-A5NM,TCGA-QM-A5NM-01A-11D-A28R-08,TCGA-QM-A5NM-10A-01D-A28U-08,3,79916,1,62766,0,2523,0,4699,4,0
6746,UCS,TCGA-QN-A5NN,TCGA-QN-A5NN-01A-11D-A28R-08,TCGA-QN-A5NN-10A-01D-A28U-08,12,75610,7,62040,0,2468,1,4606,20,1


In [155]:
msi_raw.groupby('tumor_type')['msi'].mean()

tumor_type
ACC     0.076923
BLCA    0.024752
BRCA    0.261927
CESC    0.189723
COAD    0.399240
HNSC    0.026052
KICH    0.106061
KIRC    0.015152
KIRP    0.072727
LGG     0.037698
LIHC    0.099010
LUAD    0.047059
LUSC    0.038793
PRAD    0.016495
READ    0.419643
SKCM    0.097493
STAD    0.308176
THCA    0.014957
UCEC    0.560928
UCS     0.070175
Name: msi, dtype: float64

In [157]:
coad_msi = msi_raw.loc[msi_raw['tumor_type']=='COAD','donor_id'].values
coad_msi.shape

(263,)

In [158]:
sample_name = coad_msi[-1]
name_len = len(sample_name)
sample_name

'TCGA-AY-4071'

In [159]:
coad_full_name = os.listdir(root_dir)
coad_img = np.array([v[0:name_len] for v in coad_full_name])
len(coad_img), coad_img[5], coad_full_name[5]

(434,
 'TCGA-AA-3949',
 'TCGA-AA-3949-01Z-00-DX1.23748e80-0d7e-4238-8b29-f74cddae8596.svs')

In [160]:
coad_both = np.intersect1d(coad_img, coad_msi)

In [161]:
sample_names = []
for sample in coad_both:
    key = np.argwhere(coad_img == sample).squeeze()
    if key.size != 0:
        sample_names.append(coad_full_name[key][:-4])

In [162]:
len(sample_names)

247

In [164]:
msi_raw.set_index('donor_id', inplace=True)
msi_raw.head(3)

Unnamed: 0_level_0,tumor_type,tumor_id,normal_id,muts_A,covg_A,muts_C,covg_C,muts_AC,covg_AC,muts_AG,covg_AG,muts_tot,msi
donor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
TCGA-OR-A5J1,ACC,TCGA-OR-A5J1-01A-11D-A29I-10,TCGA-OR-A5J1-10A-01D-A29L-10,4,86016,1,66150,0,2940,0,5178,5,0
TCGA-OR-A5J2,ACC,TCGA-OR-A5J2-01A-11D-A29I-10,TCGA-OR-A5J2-10A-01D-A29L-10,9,87000,5,65966,1,2925,0,5214,15,0
TCGA-OR-A5J3,ACC,TCGA-OR-A5J3-01A-11D-A29I-10,TCGA-OR-A5J3-10A-01D-A29L-10,7,85172,8,64360,0,2904,0,5166,15,0


In [165]:
sample_annotation = {}
for sample_name in sample_names:
    sample_annotation[sample_name] = msi_raw.loc[sample_name[0:name_len], 'msi']

In [172]:
all = list(sample_annotation.values())
sum(all) / len(all)

0.3967611336032389

In [107]:
img_dir = root_dir + sample_name + '.svs/' + sample_name + '_files/5.0'
imgs = os.listdir(img_dir)
for im in imgs:
    path = img_dir + '/' + im
    image = data_utils.default_loader(path)
    print(transform(image).shape)

torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([

torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 82, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 82, 256])
torch.Size([3, 256, 256])
torch.Size([3, 82, 256])
torch.Size([3, 256, 256])
torch.Size([3, 

torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 82, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3

torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 82, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3, 82, 256])
torch.Size([3, 256, 256])
torch.Size([3, 256, 256])
torch.Size([3,

## Pre-trained Model

In [None]:
# inception_v3 expects tensors with a size of N x 3 x 299 x 299
net = models.inception_v3(pretrained=True)
net