## Initial CNN 
Alex Lu (alu2)
November 16, 2018


In [110]:
import os 
import torch
import pandas as pd 
from skimage import io, transform
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils 
from torch.utils.data.sampler import SubsetRandomSampler

import torch.optim as optim
import torch.nn as nn


import warnings
warnings.filterwarnings("ignore")

plt.ion()

We need to extend torch dataset in order to use torch dataloader. Implementation based off of https://pytorch.org/tutorials/beginner/data_loading_tutorial.html. 

In [74]:
class HumanProteinDataset(Dataset):
    '''
    '''
    def __init__(self, labels_csv, root_dir, transform=None):
        self.label_names = {
            0:  "Nucleoplasm",  
            1:  "Nuclear membrane",   
            2:  "Nucleoli",   
            3:  "Nucleoli fibrillar center",   
            4:  "Nuclear speckles",
            5:  "Nuclear bodies",   
            6:  "Endoplasmic reticulum",   
            7:  "Golgi apparatus",   
            8:  "Peroxisomes",   
            9:  "Endosomes",   
            10:  "Lysosomes",   
            11:  "Intermediate filaments",   
            12:  "Actin filaments",   
            13:  "Focal adhesion sites",   
            14:  "Microtubules",   
            15:  "Microtubule ends",   
            16:  "Cytokinetic bridge",   
            17:  "Mitotic spindle",   
            18:  "Microtubule organizing center",   
            19:  "Centrosome",   
            20:  "Lipid droplets",   
            21:  "Plasma membrane",   
            22:  "Cell junctions",   
            23:  "Mitochondria",   
            24:  "Aggresome",   
            25:  "Cytosol",   
            26:  "Cytoplasmic bodies",   
            27:  "Rods & rings"
        }

        self.labels_df = pd.read_csv(labels_csv)
        for _, row in self.labels_df.iterrows():
            labels = np.array(row.Target.split(" ")).astype(np.int)
            row.Target = np.array([1 if i in labels else 0 for i in range(28)])
            
        self.root_dir = root_dir
        self.transform = transform
        
        self.raw_h = 512
        self.raw_w = 512
        
    def __len__(self):
        return len(self.labels_df)
    
    def __getitem__(self, idx):
        '''
        
        '''
#        image_base = os.path.join(self.root_dir, self.labels_df.iloc[idx, 0])
        image_stack = self._load_image(self.labels_df.iloc[idx, 0])
        
        sample = {'stack': image_stack, 'labels': self.labels_df['Target'].iloc[idx]}
        if self.transform: 
            sample = self.transform(sample)
            
        return sample
    
    def _load_image(self, image_id, factor = 1):
        image_stack = np.zeros((4,self.raw_w,self.raw_h))
        image_stack[0,:,:] = io.imread(self.root_dir + image_id + "_green" + ".png")
        image_stack[1,:,:] = io.imread(self.root_dir + image_id + "_red" + ".png")
        image_stack[2,:,:] = io.imread(self.root_dir + image_id + "_blue" + ".png")
        image_stack[3,:,:] = io.imread(self.root_dir + image_id + "_yellow" + ".png")

        if factor != 1:
            image_scaled = np.zeros(shape=(4, int(self.raw_w*factor), int(self.raw_h*factor)))
            image_scaled[0,:,:] = rescale(images[0,:,:], factor)
            image_scaled[1,:,:] = rescale(images[1,:,:], factor)
            image_scaled[2,:,:] = rescale(images[2,:,:], factor)
            image_scaled[3,:,:] = rescale(images[3,:,:], factor)
            return image_scaled

        return image_stack 
    

In [82]:
class Rescale(object):
    '''
    '''
    
    def __init__(self, scaled_dims):
        self.scaled_dims = scaled_dims
    
    def __call__(self, sample):
        stack_raw = sample['stack']
        
        stack_scaled = np.zeros(shape = (4, self.scaled_dims[0], self.scaled_dims[1]))
        stack_scaled[0,:,:] = transform.resize(stack_raw[0, :, :], self.scaled_dims)
        stack_scaled[1,:,:] = transform.resize(stack_raw[1, :, :], self.scaled_dims)
        stack_scaled[2,:,:] = transform.resize(stack_raw[2, :, :], self.scaled_dims)
        stack_scaled[3,:,:] = transform.resize(stack_raw[3, :, :], self.scaled_dims)
        
        return {'stack': stack_scaled, 'labels':sample['labels']}

class ToTensor(object):
    def __call__(self, sample):
        temp = sample['stack']/255.0
        totensor = transforms.ToTensor()
        sample['stack'] = totensor(temp.transpose((1, 2, 0)))
        return sample

In [84]:
data = HumanProteinDataset(labels_csv = './data/train.csv',
                          root_dir = './data/train/',
                          transform = transforms.Compose([
                                                          Rescale((256, 256)),
                                                          ToTensor()
                         ]))

for i in range(len(data)):
    sample = data[i]
    #print sample
    print sum(sample['labels'])
    if i == 2:
        break
    

2
4
1


In [98]:
def get_data_loaders(labels_csv = './data/train.csv', root_dir = './data/train/',):
    '''
    
    '''
    data = HumanProteinDataset(labels_csv, root_dir, transform=transforms.Compose([
                                                          Rescale((256, 256)),
                                                          ToTensor()
                         ]))
    indices = np.arange(len(data))
    indices_train = np.random.choice(indices, size=int(.75*len(data)), replace=False)
    indices_test = list(set(indices) - set(indices_train))
    
    sampler_train = SubsetRandomSampler(indices_train)
    sampler_test = SubsetRandomSampler(indices_test)
    
    dataloader_train = DataLoader(data, batch_size=5, sampler=sampler_train, num_workers=4)
    dataloader_test = DataLoader(data, batch_size=5, sampler=sampler_test, num_workers=4)
    
    return (dataloader_train, dataloader_test)
    


In [101]:
for i_batch, sample_batched in enumerate(get_data_loaders()[0]):
    print i_batch, sample_batched['stack'].size(), sample_batched['labels'].size()
    
    
    

0 torch.Size([5, 4, 256, 256]) torch.Size([5, 28])
1 torch.Size([5, 4, 256, 256]) torch.Size([5, 28])
2 torch.Size([5, 4, 256, 256]) torch.Size([5, 28])
3 torch.Size([5, 4, 256, 256]) torch.Size([5, 28])
4 torch.Size([5, 4, 256, 256]) torch.Size([5, 28])
5 torch.Size([5, 4, 256, 256]) torch.Size([5, 28])
6 torch.Size([5, 4, 256, 256]) torch.Size([5, 28])
7 torch.Size([5, 4, 256, 256]) torch.Size([5, 28])
8 torch.Size([5, 4, 256, 256]) torch.Size([5, 28])
9 torch.Size([5, 4, 256, 256]) torch.Size([5, 28])
10 torch.Size([5, 4, 256, 256]) torch.Size([5, 28])
11 torch.Size([5, 4, 256, 256]) torch.Size([5, 28])


Process Process-11:
    img = t(img)
Traceback (most recent call last):
  File "/Users/alex/miniconda2/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
    self.run()
  File "/Users/alex/miniconda2/lib/python2.7/multiprocessing/process.py", line 114, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/alex/miniconda2/lib/python2.7/site-packages/torch/utils/data/dataloader.py", line 106, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
Process Process-9:
Traceback (most recent call last):
  File "<ipython-input-74-54e956ce1603>", line 59, in __getitem__
  File "/Users/alex/miniconda2/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
    sample = self.transform(sample)
    self.run()
  File "/Users/alex/miniconda2/lib/python2.7/site-packages/torchvision-0.2.1-py2.7.egg/torchvision/transforms/transforms.py", line 49, in __call__
  File "/Users/alex/miniconda2/lib/python2.7/multiprocessing/process.py", line 114

Traceback (most recent call last):
  File "/Users/alex/miniconda2/lib/python2.7/site-packages/IPython/core/ultratb.py", line 1132, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
  File "/Users/alex/miniconda2/lib/python2.7/site-packages/IPython/core/ultratb.py", line 313, in wrapped
    return f(*args, **kwargs)
  File "/Users/alex/miniconda2/lib/python2.7/site-packages/IPython/core/ultratb.py", line 358, in _fixed_getinnerframes
    records = fix_frame_records_filenames(inspect.getinnerframes(etb, context))
  File "/Users/alex/miniconda2/lib/python2.7/inspect.py", line 1051, in getinnerframes
    framelist.append((tb.tb_frame,) + getframeinfo(tb, context))
  File "/Users/alex/miniconda2/lib/python2.7/inspect.py", line 1015, in getframeinfo
    lines, lnum = findsource(frame)
  File "/Users/alex/miniconda2/lib/python2.7/site-packages/IPython/core/ultratb.py", line 182, in findsource
    lines = linecache.getlines(file, globals_dict)
  File "

    self.run()


IndexError: string index out of range

  File "/Users/alex/miniconda2/lib/python2.7/multiprocessing/process.py", line 114, in run
  File "/Users/alex/miniconda2/lib/python2.7/site-packages/numpy/core/numeric.py", line 424, in asarray
    self._target(*self._args, **self._kwargs)
  File "/Users/alex/miniconda2/lib/python2.7/site-packages/torch/utils/data/dataloader.py", line 106, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "<ipython-input-74-54e956ce1603>", line 55, in __getitem__
    image_stack = self._load_image(self.labels_df.iloc[idx, 0])
  File "<ipython-input-74-54e956ce1603>", line 66, in _load_image
    image_stack[1,:,:] = io.imread(self.root_dir + image_id + "_red" + ".png")
  File "/Users/alex/miniconda2/lib/python2.7/site-packages/skimage/io/_io.py", line 61, in imread
    img = call_plugin('imread', fname, plugin=plugin, **plugin_args)
  File "/Users/alex/miniconda2/lib/python2.7/site-packages/skimage/io/manage_plugins.py", line 211, in call_plugin
    return func(*args,

In [121]:
class CNN(nn.Module):
    def __init__(self):
        self.W = self.H = 256
        super(CNN, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(4, 8, 5, 1, 2), # input_channels, output_channels, kernele_size, stride, padding
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )
        self.conv2 = nn.Sequential(      #input: 4xWxH
            nn.Conv2d(8,16,5,1,2),        # input_channels, output_channels, kernel_size, stride, padding   
            nn.ReLU(),                      
            nn.MaxPool2d(kernel_size=2), #output: 16xW/4xH/4
        )
        self.drop_out = nn.Dropout()
        self.out1 = nn.Linear( int(16 * self.W/4 * self.H/4), 900)
        self.out2 = nn.Linear(900, 28)
        
    
    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = x.view(x.size(0), -1)
        output = self.drop_out(x)
        output = self.out1(output)
        output = self.out2(output)
        return output
    
def init_weights(m):
    if type(m) == nn.Linear:
        torch.nn.init.xavier_uniform(m.weight)
        m.bias.data.fill_(0.01)

In [125]:
def Train(model, epochs=10, criterion=nn.BCEWithLogitsLoss(reduction='sum'), optimizer=None):
    '''
    
    '''
    dataloader_train = get_data_loaders()[0]
    
    if optimizer is None:
        optimizer = optim.Adam(model.parameters(), lr=.04, betas=(.9, .99))
    
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print "training with device:" + str(device)
    
    model.to(device)
    
    for epoch in range(epochs):
        for i_batch, sample_batch in enumerate(dataloader_train):
            stacks, labels = sample_batch['stack'], sample_batch['labels']
            
            stacks, labels = stacks.to(device, dtype=torch.float), labels.to(device, dtype=torch.float)
            
            optimizer.zero_grad()
            
            outputs = model(stacks)
            loss = criterion(outputs, labels)
            
            loss.backward()
            optimizer.step()
            
            if i_batch % 10 == 9:
                print '\n[%d, %5d] loss: %.3f'%(epoch+1, i_batch+1, loss.item())
                running_loss = 0.0
    
    print 'Finished training!'

    
def run_model(model, batch):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)
    stacks = batch.to(device, dtype=torch.float)
    output = model(stacks).cpu()
    return output
    

In [126]:
torch.cuda.empty_cache()
cnn = CNN()
cnn.apply(init_weights)

Train(cnn, epochs=5,  criterion=nn.BCEWithLogitsLoss(reduction='sum'), optimizer = optim.Adam(cnn.parameters(), lr=0.001, betas=(0.9, 0.99)))



training with device:cpu

[1,    10] loss: 20.938

[1,    20] loss: 37.021

[1,    30] loss: 30.009

[1,    40] loss: 25.317

[1,    50] loss: 24.674

[1,    60] loss: 24.706

[1,    70] loss: 30.546

[1,    80] loss: 25.941

[1,    90] loss: 26.016

[1,   100] loss: 23.802

[1,   110] loss: 20.225

[1,   120] loss: 27.812

[1,   130] loss: 24.283

[1,   140] loss: 31.988

[1,   150] loss: 20.849

[1,   160] loss: 23.717

[1,   170] loss: 27.216

[1,   180] loss: 23.780

[1,   190] loss: 30.063

[1,   200] loss: 29.082

[1,   210] loss: 27.042

[1,   220] loss: 24.108

[1,   230] loss: 25.897

[1,   240] loss: 29.025

[1,   250] loss: 29.922

[1,   260] loss: 30.639

[1,   270] loss: 30.384

[1,   280] loss: 24.121

[1,   290] loss: 28.079

[1,   300] loss: 18.939

[1,   310] loss: 22.572

[1,   320] loss: 24.826

[1,   330] loss: 22.755

[1,   340] loss: 27.670

[1,   350] loss: 24.157

[1,   360] loss: 22.456

[1,   370] loss: 26.361

[1,   380] loss: 32.346

[1,   390] loss: 24.605


Process Process-36:
Process Process-34:
Process Process-35:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/alex/miniconda2/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
  File "/Users/alex/miniconda2/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
  File "/Users/alex/miniconda2/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
    self.run()
    self.run()
    self.run()
  File "/Users/alex/miniconda2/lib/python2.7/multiprocessing/process.py", line 114, in run
  File "/Users/alex/miniconda2/lib/python2.7/multiprocessing/process.py", line 114, in run
  File "/Users/alex/miniconda2/lib/python2.7/multiprocessing/process.py", line 114, in run
    self._target(*self._args, **self._kwargs)
    self._target(*self._args, **self._kwargs)
  File "/Users/alex/miniconda2/lib/python2.7/site-packages/torch/utils/data/dataloader.py", line 96, in _worker_loop
    self._targ

KeyboardInterrupt: 

In [63]:
def fill_targets(row, label_names, full):
    '''

    '''

    row.Target = np.array(row.Target.split(" ")).astype(np.int)
    if full:
        for num in row.Target:
            name = label_names[int(num)]
            row.loc[name] = 1
    return row

def read_labels(labels_csv):
    '''

    '''
    print 'reading the label csv'

    label_names = {
        0:  "Nucleoplasm",  
        1:  "Nuclear membrane",   
        2:  "Nucleoli",   
        3:  "Nucleoli fibrillar center",   
        4:  "Nuclear speckles",
        5:  "Nuclear bodies",   
        6:  "Endoplasmic reticulum",   
        7:  "Golgi apparatus",   
        8:  "Peroxisomes",   
        9:  "Endosomes",   
        10:  "Lysosomes",   
        11:  "Intermediate filaments",   
        12:  "Actin filaments",   
        13:  "Focal adhesion sites",   
        14:  "Microtubules",   
        15:  "Microtubule ends",   
        16:  "Cytokinetic bridge",   
        17:  "Mitotic spindle",   
        18:  "Microtubule organizing center",   
        19:  "Centrosome",   
        20:  "Lipid droplets",   
        21:  "Plasma membrane",   
        22:  "Cell junctions",   
        23:  "Mitochondria",   
        24:  "Aggresome",   
        25:  "Cytosol",   
        26:  "Cytoplasmic bodies",   
        27:  "Rods & rings"
    }

    df = pd.read_csv(labels_csv)

    full = False
    if full:
        for key in label_names.keys():
            df[label_names[key]] = 0

        df = df.apply(fill_targets, axis=1, args=((label_names, full)))

    else:
        for _, row in df.iterrows():
            row.Target = np.array(row.Target.split(" ")).astype(np.int)
    return (df, df['Id'])


In [69]:
df, ids = read_labels("./data/train.csv")
df.head()

reading the label csv


Unnamed: 0,Id,Target
0,00070df0-bbc3-11e8-b2bc-ac1f6b6435d0,"[16, 0]"
1,000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0,"[7, 1, 2, 0]"
2,000a9596-bbc4-11e8-b2bc-ac1f6b6435d0,[5]
3,000c99ba-bba4-11e8-b2b9-ac1f6b6435d0,[1]
4,001838f8-bbca-11e8-b2bc-ac1f6b6435d0,[18]


In [34]:
def load_image(image_id, basepath = "./data/train/", factor = 1):
    image_stack = np.zeros((4,512,512))
    image_stack[0,:,:] = io.imread(basepath + image_id + "_green" + ".png", )
    image_stack[1,:,:] = io.imread(basepath + image_id + "_red" + ".png")
    image_stack[2,:,:] = io.imread(basepath + image_id + "_blue" + ".png")
    image_stack[3,:,:] = io.imread(basepath + image_id + "_yellow" + ".png")

    if factor != 1:
        image_scaled = np.zeros(shape=(4, int(512*factor), int(512*factor)))
        image_scaled[0,:,:] = rescale(images[0,:,:], factor)
        image_scaled[1,:,:] = rescale(images[1,:,:], factor)
        image_scaled[2,:,:] = rescale(images[2,:,:], factor)
        image_scaled[3,:,:] = rescale(images[3,:,:], factor)
        return image_scaled

    return image_stack 

x = load_image("00070df0-bbc3-11e8-b2bc-ac1f6b6435d0")
print np.min(x)
print np.max(x)

0.0
255.0


In [59]:
labels = [0, 27, 5]
temp = [1 if i in labels else 0 for i in range(28)]
print temp

[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
