***Data Curation***

Histopathology Slide Data Source: https://portal.gdc.cancer.gov/

In [None]:
import os

import numpy as np
import pandas as pd
from PIL import Image

import config
from config import Config

In [None]:
####Extract patient_ids for images and labels###

# Label table shared by the curation cells; named once so the path is easy to change.
LABELS_PATH = '/home/aamomeni/research/brca_signatures/Data/TCGA_BRCA_out1.txt'

# NOTE: this rebinds the name `config` from the imported module to a Config instance;
# later cells read `config.data_path` from this instance.
config = Config()

# A patient ID is the first 12 characters of a slide file name / label row index.
image_pids = [slide_name[:12] for slide_name in os.listdir('%s/slides' % config.data_path)]
labels_pids = [
    sample_id[:12]
    for sample_id in pd.read_table(LABELS_PATH, header=0, index_col=0).index
]

# Patients that have both a slide on disk and a row in the label table.
shared_pids = np.intersect1d(image_pids, labels_pids)
print('%s patients will be considered' % len(shared_pids))

In [None]:
###Remove Unlabeled images###

slides_dir = os.path.join(config.data_path, 'slides')

# Computed once instead of on every loop iteration.
# Patient IDs seen among the slides that have no row in the label table.
unlabeled_pids = set(image_pids) - set(labels_pids)
# Patient IDs present in both the slides folder and the label table.
labeled_pids = set(image_pids) & set(labels_pids)

for slide_name in os.listdir(slides_dir):
    if slide_name[:12] in unlabeled_pids:
        # The original used `config.data_pathm` here, which raised AttributeError.
        os.remove(os.path.join(slides_dir, slide_name))

###Rename remaining files##
# Each kept slide is renamed to the first 23 characters of its name plus '.svs'.
# NOTE(review): two files sharing the same 23-character prefix would collide — confirm
# this cannot happen in the dataset.
for slide_name in os.listdir(slides_dir):
    if slide_name[:12] in labeled_pids:
        short_name = '%s.svs' % slide_name[:23]
        if short_name != slide_name:  # already in its final form: nothing to do
            os.rename(os.path.join(slides_dir, slide_name),
                      os.path.join(slides_dir, short_name))

In [None]:
###Check we still have 971 patients###

# Patient ID = first 12 characters of every file name left in the slides folder.
pids = [slide_name[:12] for slide_name in os.listdir('%s/slides' % config.data_path)]
# Number of distinct patients still represented on disk.
len(np.unique(pids))

In [None]:
# Reload the label table and re-index it by 12-character patient ID.
# NOTE(review): this relies on `labels_pids` having been built from this same file,
# in the same row order, by an earlier cell.
data = pd.read_table('/home/aamomeni/research/brca_signatures/Data/TCGA_BRCA_out1.txt',header=0, index_col=0)
data.index  = labels_pids  
# Keep only the patient IDs that have both a label row and a slide on disk
# (`pids` on the right-hand side is the list built from the slides folder).
pids = list(set(labels_pids).intersection(pids))
print(len(pids))

In [None]:
# Patient IDs that appear in the label table index but not in `pids`.
np.setdiff1d(np.unique(data.index),pids)

***Preprocessing***

Patch generation itself is done in `preprocess.py`. The two functions below only plot patches for visual inspection.

In [None]:

import matplotlib.pyplot as plt
def plot_patches(img, config):
    """Plot every patch of a slide next to the patches kept by the tissue filter.

    The slide is cut into a grid of non-overlapping ``patch_size`` x ``patch_size``
    tiles read at level 0. Two large grids are rendered: one with every tile, one
    where tiles whose tissue ratio is below ``config.threshold`` are replaced by a
    black tile. Both grids are written to temporary JPEGs, combined side by side
    into ``output/preprocess/patches.jpg``, and the temporary files are removed.

    Args:
        img: slide object exposing ``dimensions`` and ``read_region``
            (presumably an OpenSlide handle — TODO confirm).
        config: object providing ``patch_size`` and ``threshold``.
    """
    patch_size = config.patch_size
    threshold = config.threshold
    black_tile = np.zeros((patch_size, patch_size, 3))
    width, height = img.dimensions
    n_rows = int(height / patch_size)
    n_cols = int(width / patch_size)

    # savefig does not create missing directories; this also creates "output/".
    os.makedirs("output/preprocess", exist_ok=True)

    all_fig = plt.figure(figsize=(100, 100*height/width))
    kept_fig = plt.figure(figsize=(100, 100*height/width))
    for i in range(n_rows):
        for j in range(n_cols):
            idx = i*n_cols + j
            patch = img.read_region(location=(j*patch_size, i*patch_size), level=0,
                                    size=(patch_size, patch_size))
            ratio, _ = tissue_ratio(patch)

            # Left grid: always the raw patch.
            ax_all = all_fig.add_subplot(n_rows, n_cols, idx+1)
            ax_all.imshow(patch)
            ax_all.axis('off')

            # Right grid: the patch if it passes the tissue threshold, else a black tile.
            ax_kept = kept_fig.add_subplot(n_rows, n_cols, idx+1)
            ax_kept.imshow(patch if ratio >= threshold else black_tile)
            ax_kept.axis('off')

    all_fig.savefig("output/temp1.jpg")
    kept_fig.savefig("output/temp2.jpg")
    # The two grids are only intermediates; close them so they do not stay in memory.
    plt.close(all_fig)
    plt.close(kept_fig)

    plt.figure(figsize=(25,20))
    plt.subplot(121)
    with Image.open("output/temp1.jpg") as img1:
        plt.imshow(img1)
    plt.axis('off')
    plt.title("All patches")
    plt.subplot(122)
    with Image.open("output/temp2.jpg") as img2:
        plt.imshow(img2)
    plt.axis('off')
    plt.title("Selected patches")
    plt.savefig("output/preprocess/patches.jpg")
    os.remove("output/temp1.jpg")
    os.remove("output/temp2.jpg")

    
def plot_tissue(img, config, i=0, j=0):
    """Show one patch beside its thresholded tissue mask and save the figure.

    The original body read ``i`` and ``j`` without defining them, so it only worked
    if globals of those names happened to exist. They are now optional parameters.

    Args:
        img: slide object exposing ``read_region``
            (presumably an OpenSlide handle — TODO confirm).
        config: object providing ``patch_size``.
        i: row index of the patch in the patch grid (y offset is ``i*patch_size``).
        j: column index of the patch in the patch grid (x offset is ``j*patch_size``).
    """
    patch_size = config.patch_size
    patch = img.read_region(location=(j*patch_size, i*patch_size), level=0,
                            size=(patch_size, patch_size)).convert('RGB')
    ratio, thresholded = tissue_ratio(patch)

    # savefig does not create missing directories.
    os.makedirs("output/preprocess", exist_ok=True)

    plt.figure(figsize=(20,20))
    plt.subplot(121)
    plt.imshow(patch)
    plt.axis('off')
    plt.title('Original image')
    plt.subplot(122)
    plt.imshow(thresholded, cmap='Greys')
    plt.axis('off')
    plt.title('Thresholded image')
    plt.savefig("output/preprocess/tissue.jpg")
    plt.show()

    
        # NOTE(review): stray fragment. It uses `self` and `samples`, neither of which is
        # defined in any visible cell, and it is indented deeper than the function above it.
        # Presumably pasted from a method of a project class — confirm, then move or delete.
        # For every selected feature, fills labels[feature][sample] with data.loc[sample, feature].
        for feature in self.config.selected_features:
            labels[feature] = {}
            for sample in samples:
                labels[feature][sample] = data.loc[sample, feature]

In [None]:
from datasets import Dataset
from config import Config
import pandas as pd
import os
import numpy as np
# Instantiate the project configuration and the dataset helper (both from local modules).
c = Config()
d = Dataset()

# Take the 'test' entry of the partition mapping returned by Dataset.get_partition.
test = d.get_partition(c)['test']

In [None]:
# NOTE(review): the first result is named `train` although it is built from the `test` partition.
train, labels = d.convert_to_arrays(c,test)
# Display the first element of the returned labels.
labels[0]

In [None]:
# List the contents of the patch directory.
# NOTE(review): hard-coded absolute path; presumably this should come from Config — confirm.
os.listdir('/labs/gevaertlab/data/momena/breast_data/patches_448/')

In [None]:
# Length of the first element of `labels`.
len(labels[0])

In [None]:
# NOTE(review): `get_patches` is not defined or imported in any visible cell;
# presumably it lives in preprocess.py — confirm and import it explicitly.
get_patches('TCGA-BH-A0E2-01A-01-BSA.svs', c)

In [3]:
import models

In [None]:
    def generate(self, list_IDs):
        """Yield ``(X, y)`` batches indefinitely, drawing a fresh random order on every pass.

        Each pass covers ``int(len(list_IDs) / batch_size)`` full batches; IDs left
        over after the last full batch are not used in that pass.
        """
        while True:
            order = self.__get_exploration_order(list_IDs)
            n_batches = int(len(order) / self.config.batch_size)
            for batch_no in range(n_batches):
                start = batch_no * self.config.batch_size
                stop = (batch_no + 1) * self.config.batch_size
                batch_ids = [list_IDs[pos] for pos in order[start:stop]]
                # NOTE(review): `labels` is neither a parameter nor a local here, so it is
                # looked up in the enclosing/global scope — confirm where it is meant to come from.
                batch_X, batch_y = self.__data_generation(batch_ids, labels)

                yield batch_X, batch_y

    def __get_exploration_order(self, list_IDs):
        'Generates order of exploration'
        print(list_IDs)
        indexes = np.arange(len(list_IDs))
        np.random.shuffle(indexes)
        return indexes

    def __data_generation(self, list_IDs_temp, labels):
        """Load one batch of images and collect the matching labels.

        Args:
            list_IDs_temp: image file paths for this batch. The sample ID is taken
                from the first 16 characters of each path's parent directory name.
            labels: mapping ``label name -> {sample ID -> value}``.

        Returns:
            ``(X, y)`` where ``X`` is an array stacking the resized images (first
            three channels only) and ``y`` is a list holding one array per label
            name, in ``labels.keys()`` order.
        """
        side = self.config.input_shape
        X = []
        for ID in list_IDs_temp:
            # Context manager closes the file handle; resize returns a new in-memory image.
            with Image.open(ID) as img:
                resized = img.resize((side, side))
            # Keep the first three channels, dropping any extra channel.
            X.append(np.array(resized)[:, :, :3])
        X = np.asarray(X)

        # Sample IDs depend only on the paths, so derive them once for all labels.
        samples = [ID.split('/')[-2][0:16] for ID in list_IDs_temp]

        # y is deliberately left as a list of per-label arrays rather than one stacked array.
        y = [
            np.asarray([labels[label][sample] for sample in samples])
            for label in labels.keys()
        ]

        return X, y

In [12]:
from datasets import Dataset
from config import Config
from PIL import Image
import numpy as np 

# Project configuration and dataset helper.
c = Config()
d = Dataset()

# IDs belonging to the 'train' entry of the partition mapping.
list_IDs = d.get_partition(c)['train']

# Random visiting order over the positions of list_IDs.
indexes = np.arange(len(list_IDs))
np.random.shuffle(indexes)
print(indexes)

# Walk over every full batch; after the loop only the last batch's IDs remain bound.
imax = int(len(indexes)/c.batch_size)
for i in range(imax):
    batch_positions = indexes[i*c.batch_size:(i+1)*c.batch_size]
    list_IDs_temp = [list_IDs[k] for k in batch_positions]
list_IDs_temp

  if np.intersect1d(l,i)!= []:


[512 146 237 780  82 163 768 549 645 786  69 701 856 474 463 339 211 260
  80 315 575 651 612 671 360  51 635 473 737 425 764 405  32 670 637 403
  91 525 236 279 288 523 144 534 394 582 373  88 553 706 828  97  81 742
 426  71 420 143  34 115 359 819 802 395 475 216 678 566 606 133 138 551
   3 825 530 636 460 594 413 760 800   7 779 209  75 269 454 722 689 842
 342 450 579 603 835 229 695 491 634 684 386  70 623 618 418 788 725 535
 483 766 150 273 654 830   4 106 568 251 833 380 626 151  99 282 641 412
 851 803  76 313 774 214  17   0 452 250  66 806 625 648 270 367 155 245
 778  73 731 402 258 465 812 783 700 316 852 212  33 352 524 140 340 619
 419 592 558 754 661 738 485  61 741  89 704  63 735 815 826 505 112 570
 162 382 758  86 653 713  18  36 520 284 444 113 254 204 256 198 338 375
 577 855 408 846 228 169  59 243 590 172 663  15 277 598  65 813 789 715
 662 728 147  22 667 750  98 629 286 459 753 655 397 697  23 516 807 586
 776 808 347  10 581 208 470  37 102 756 792 301 67

['TCGA-AN-A0FT-01A-01-TSA',
 'TCGA-AR-A256-01A-01-TSA',
 'TCGA-E9-A1RI-01A-01-TSA',
 'TCGA-AC-A3OD-01B-05-BS5',
 'TCGA-C8-A1HO-01A-01-TSA',
 'TCGA-BH-A0E6-01A-01-TSA',
 'TCGA-A2-A25B-01A-01-TSA',
 'TCGA-A2-A0YT-01A-01-BSA',
 'TCGA-D8-A140-01A-01-TSA',
 'TCGA-D8-A1X6-01A-01-TS1',
 'TCGA-A2-A04T-01A-02-BSB',
 'TCGA-A7-A13E-01A-01-TSA',
 'TCGA-AQ-A04H-01B-01-TS1',
 'TCGA-E2-A15A-01A-01-TS1',
 'TCGA-BH-A0E0-01A-01-TSA',
 'TCGA-B6-A0RO-01A-02-TSB',
 'TCGA-A8-A090-01A-01-BS1',
 'TCGA-A8-A08P-01A-01-BS1',
 'TCGA-BH-A0DD-01A-03-TSC',
 'TCGA-A2-A0YF-01A-02-TSB',
 'TCGA-BH-A0B9-01A-01-TSA',
 'TCGA-AO-A03M-01B-01-TSA',
 'TCGA-B6-A0IO-01A-01-TSA',
 'TCGA-E9-A1NA-01A-01-TSA',
 'TCGA-A8-A09E-01A-01-BS1',
 'TCGA-A2-A0CQ-01A-02-TSB',
 'TCGA-AO-A03T-01A-02-MS2',
 'TCGA-E9-A1ND-01A-01-TSA',
 'TCGA-EW-A1J5-01A-01-TSA',
 'TCGA-D8-A1XK-01A-02-TS2',
 'TCGA-AN-A0FJ-01A-01-BSA',
 'TCGA-A7-A3IY-01A-02-TSB']

In [13]:
# Convert the last batch of IDs to arrays.
# NOTE(review): the bare call displays the full returned arrays; consider binding the
# result and showing only shapes.
d.convert_to_arrays(c, list_IDs_temp )

100%|██████████| 32/32 [00:12<00:00,  1.75it/s]


(array([[[[152, 114, 165],
          [105,  66, 121],
          [ 98,  56, 119],
          ...,
          [229, 218, 224],
          [231, 220, 226],
          [225, 214, 220]],
 
         [[182, 152, 190],
          [125,  92, 137],
          [107,  70, 122],
          ...,
          [228, 217, 223],
          [229, 218, 224],
          [226, 215, 221]],
 
         [[215, 196, 216],
          [169, 144, 174],
          [136, 105, 147],
          ...,
          [227, 216, 222],
          [227, 216, 222],
          [228, 217, 223]],
 
         ...,
 
         [[ 71,  34,  86],
          [ 68,  31,  82],
          [ 64,  27,  78],
          ...,
          [178,  57,  92],
          [183,  52,  83],
          [189,  52,  78]],
 
         [[ 72,  29,  83],
          [ 68,  28,  81],
          [ 64,  24,  76],
          ...,
          [182,  58,  94],
          [187,  54,  83],
          [187,  49,  75]],
 
         [[ 72,  27,  84],
          [ 69,  24,  79],
          [ 64,  21,  74],
   

In [None]:
# NOTE(review): scratch cell — a bare literal whose only effect is to display 4; consider deleting.
4