## Imports and constants

In [0]:
import tensorflow as tf
import numpy as np
import os
import zipfile
import pandas as pd

## Preprocessing stuff

In [0]:
def extract_datazip(zipfile_path=None, validation_start_index=None, train_percent=0.95, extract_path='data', save_filelist=True):
    """
    Extract dataset train and test zip file present in 'zipfile_path' path to train, valid and test directory in 'extract_path' path.
    Creating directory structure:
        # TODO: find directory structure
        
    return:
        list, list, list : list of files in train, valid and test directories
        dict : dict of paths 
    
    """
    
    assert os.path.isfile(os.path.join(zipfile_path, 'train.zip')), "{} file not found".format(os.path.join(zipfile_path,'train.zip'))
    assert os.path.isfile(os.path.join(zipfile_path,'test.zip')), "{} file not found".format(os.path.join(zipfile_path,'test.zip'))
    # create directories
    if not os.path.isdir(extract_path): 
        os.makedirs(extract_path)
        os.makedirs(os.path.join(extract_path,'train'))
        os.makedirs(os.path.join(extract_path,'valid'))
        os.makedirs(os.path.join(extract_path,'test'))
    
    train_zfile = zipfile.ZipFile(os.path.join(zipfile_path,'train.zip'))
    test_zfile = zipfile.ZipFile(os.path.join(zipfile_path,'test.zip'))
    train_filelist = train_zfile.namelist()
    train_filelist = pd.DataFrame(train_filelist, columns=['fileid'])
    train_filelist = train_filelist.sort_values(by='fileid')
    train_filelist = train_filelist.reset_index(drop=True)
    
    filelist = pd.DataFrame(train_filelist.fileid.str.split("_",1).tolist(),columns=["id","color"] )
    
    
    if validation_start_index ==None:
        validation_start_index = int(train_filelist.shape[0]//4*train_percent)*4
    
    if save_filelist:
        save_list_validation_start_index = validation_start_index//4
        if validation_start_index == None:
            save_list_validation_start_index = int(save_list.shape[0]*train_percent)
        
        save_list = pd.DataFrame(train_filelist.fileid.str.split('_', 1).tolist(), columns=['fileid','color']).drop(columns='color').drop_duplicates()        
        save_list[ : save_list_validation_start_index].to_csv(os.path.join(extract_path,'train/train_filelist.csv'), sep=',', index=False)
        save_list[save_list_validation_start_index : ].to_csv(os.path.join(extract_path,'valid/valid_filelist.csv'), sep=',', index=False)
        test_savelist = pd.DataFrame(test_zfile.namelist(), columns=["fileid"]).sort_values(by="fileid").reset_index(drop=True)
        test_savelist = pd.DataFrame(test_savelist.fileid.str.split("_",1).tolist(),columns=["id","color"] ).drop(columns='color').drop_duplicates()
        test_savelist.to_csv(os.path.join(extract_path,'test/test_filelist.csv'), sep=',', index=False)
    
    valid_filelist = np.squeeze(train_filelist[validation_start_index : ].values)
    train_filelist = np.squeeze(train_filelist[ : validation_start_index].values)


    print('Extracting train images at {}'.format(extract_path), end='\t')
    train_zfile.extractall(os.path.join(extract_path, 'train'), train_filelist)
    print('done')
    print('Extracting validation images at {}'.format(extract_path), end='\t')
    train_zfile.extractall(os.path.join(extract_path, 'valid'), valid_filelist)
    print('done')
    print('Extracting test images at {}'.format(extract_path), end='\t')
    test_zfile.extractall(os.path.join(extract_path, 'test'))
    print('done')
    extract_path = os.path.dirname(os.path.abspath(extract_path))
    return {'train': os.path.join(extract_path,'train'), 'valid': os.path.join(extract_path,'valid'), 'test': os.path.join(extract_path,'test')}

In [0]:
def preprocess_label(label_file=None, save_file=None):
    """
    Converts label csv file into onehot format, requires pandas dataframe
    """
    
    new_lbl = pd.DataFrame(data=None, columns=['Id']+ [i for i in range(28)])
    tr_list = pd.read_csv(label_file, sep=',')
    for index, row in tr_list.iterrows():
        n_hot = np.array([0]*28)
        n_hot[list(map(int,row['Target'].split()))] = 1
        new_lbl.loc[index]= [row["Id"]]+ n_hot.tolist()
    if save_file:
        new_lbl.to_csv(save_file, index=False)
    return new_lbl


In [0]:
def split_label(label, validation_start_index=None, train_percent=0.95, save_listfile=True, save_path='data'):
    """
    Split label file into train and validation 
    
    """
    lbl = pd.read_csv(label)
    if validation_start_index == None:
        validation_start_index = int(lbl.shape[0] * train_percent)
    if save_listfile:
        lbl[ : validation_start_index].to_csv(os.path.join(save_path, 'train_label.csv'), index=False)
        lbl[validation_start_index : ].to_csv(os.path.join(save_path, 'valid_label.csv'), index=False)
    return np.squeeze(lbl[ : validation_start_index].values), np.squeeze(lbl[validation_start_index : ].values)

## Make Iterators

In [0]:
def get_batch(img_file_path='data', batch_type=None, lbl_dir_path ='./', image_dim=[512, 512, 1], batch_size=32):
    '''
    Get iterator of dataset
    
    Parameters:
    img_file_path - directory path containing train, valid and test image directories
    batch_type - either 'train', 'valid' or 'test'
    lbl_dir_path - directory path containing train, valid and test label csv file
    image_dim - Three dimensions of image [width, height, channel]
    batch_size - batch size of dataset
    
    Returns:
    train and validation iterator if batch type is train
    else test iterator
    '''
    
    def decode_img(file_path, filename, image_dim):
        r = tf.reshape(tf.image.decode_png(tf.read_file(file_path+'/'+filename + '_red.png'), channels=1), image_dim)
        g = tf.reshape(tf.image.decode_png(tf.read_file(file_path+'/'+filename + '_green.png'), channels=1), image_dim)
        b = tf.reshape(tf.image.decode_png(tf.read_file(file_path+'/'+filename + '_blue.png'), channels=1), image_dim)
        y = tf.reshape(tf.image.decode_png(tf.read_file(file_path+'/'+filename + '_yellow.png'), channels=1), image_dim)
        img = tf.concat([r,g,b,y], axis=2)
        return img/ 255
    
    assert batch_type.lower() in ['train', 'valid','test'], 'type should be train or test'
    if batch_type.lower() in ['train','valid']:
        train_img_path=os.path.join(img_file_path, batch_type)
        train_lbl_path=os.path.join(lbl_dir_path, '')
        tr_lbl_mat = pd.read_csv(train_lbl_path+'/{}_label.csv'.format(batch_type), sep=',').drop('Id', axis=1).values        
        tr_img = tf.data.TextLineDataset(filenames=[train_img_path+'/{}_filelist.csv'.format(batch_type)]).skip(1)
        tr_lbl = tf.data.Dataset.from_tensor_slices(tr_lbl_mat)
        tr_dataset = tf.data.Dataset.zip((tr_img,tr_lbl))
        tr_dataset = tr_dataset.shuffle(buffer_size=50000).map(lambda x,y: (x,decode_img(train_img_path, x, image_dim), y))
        tr_dataset = tr_dataset.batch(batch_size).prefetch(batch_size)
        return tr_dataset.make_initializable_iterator()
    else:
        test_img_path=os.path.join(img_file_path, 'test')
        te_dataset = tf.data.TextLineDataset(filenames=[test_img_path+'/test_filelist.csv']).skip(1)
        te_dataset = te_dataset.map(lambda x: [x,decode_img(test_img_path, x, image_dim)])
        te_dataset = te_dataset.batch(batch_size).prefetch(batch_size)
        return te_dataset.make_initializable_iterator()
    

## Model Definition

In [0]:
class SimpleCNN:
    def __init__(self,X, output_units):
        self.layer = tf.layers.conv2d(inputs=X, filters=16, kernel_size=[5,5], kernel_initializer= tf.initializers.random_uniform(),	)
        self.layer = tf.layers.max_pooling2d(inputs = self.layer, pool_size=[2,2], strides=2)
        self.layer = tf.nn.relu(self.layer)
        
        self.layer = tf.layers.conv2d(inputs=self.layer, filters=32, kernel_size=[3,3], kernel_initializer= tf.initializers.random_uniform(), )
        self.layer = tf.layers.max_pooling2d(inputs = self.layer, pool_size=[2,2], strides=2)
        self.layer = tf.nn.relu(self.layer)
        
        self.layer = tf.layers.conv2d(inputs=self.layer, filters=64, kernel_size=[3,3], kernel_initializer= tf.initializers.random_uniform(), )
        self.layer = tf.layers.max_pooling2d(inputs = self.layer, pool_size=[2,2], strides=2)
        self.layer = tf.nn.relu(self.layer)
        
        self.layer = tf.layers.flatten(inputs = self.layer,)
        print(self.layer)
        self.layer = tf.layers.dense(inputs= self.layer, units = output_units, kernel_initializer=tf.initializers.random_uniform(), )
    
    def get_NN(self):
        return self.layer

In [0]:
# https://arxiv.org/pdf/1608.06993.pdf
class DenseNet:
    
    #DenseNet-121 DenseNet-169 DenseNet-201 DenseNet-264
    nw_types = {121:[6, 12, 24, 16],
                169:[6, 12, 32, 32],
                201:[6, 12, 48, 32],
                264:[6, 12, 64, 48],}
        
    def __init__(self, x, nb_blocks, nw_type, n_class, filters, training):
        self.dropout_rate = 0.2
        self.nb_blocks = nb_blocks
        self.nw_type = nw_type
        self.filters = filters
        self.training = training
        self.n_class = n_class
        self.model = self._dense_net(x)
        
        
    def get_model(self):
        return self.model

    def _bottleneck_layer(self, x, scope):
        with tf.name_scope(scope):
            x = tf.layers.batch_normalization(x,  training=self.training, name=scope+'_0_bn')
            x = tf.nn.relu(x, name=scope+'_0_relu')
            x = tf.layers.conv2d(x, filters=4 * self.filters, kernel_size=[1,1], padding='same', name=scope+'_0_conv')
            x = tf.layers.dropout(x, rate=self.dropout_rate, training=self.training, name=scope+'_0_drop')

            x = tf.layers.batch_normalization(x, training=self.training, name=scope+'_1_bn')
            x = tf.nn.relu(x, name=scope+'_1_relu')
            x = tf.layers.conv2d(x, filters=self.filters, kernel_size=[3,3], padding='same', name=scope+'_1_conv')
            x = tf.layers.dropout(x, rate=self.dropout_rate, training=self.training, name=scope+'_1_drop')
            return x

    def _transition_layer(self, x, scope):
        with tf.name_scope(scope):
            x = tf.layers.batch_normalization(x,  training=self.training, name=scope+'_bn')
            x = tf.nn.relu(x, name=scope+'_relu')
            x = tf.layers.conv2d(x, filters=self.filters, kernel_size=[1,1], padding='same', name=scope+'_conv')
            x = tf.layers.dropout(x, rate=self.dropout_rate, training=self.training, name=scope+'_drop')
            x = tf.layers.average_pooling2d(x, pool_size=[2,2], strides=2, name=scope+'_pool')
            return x
    
    def _global_average_pooling2d(self, inputs, data_format='channels_last', keepdims=False, name='avg_pool'):
        
        assert data_format.lower() in ['channels_last', 'channels_first'], "incorrect dataformat: should be either of ['channels_last', 'channels_first']"
        
        if data_format=='channels_last':
            return tf.reduce_mean(inputs, axis=[1,2], keepdims=keepdims, name=name )
        else:
            return tf.reduce_mean(inputs, axis=[2,3], keepdims=keepdims, name=name )

    def _dense_block(self, input_x, nb_layers, layer_name):
        with tf.name_scope(layer_name):
            layers_concat = list()
            layers_concat.append(input_x)

            x = self._bottleneck_layer(input_x, scope=layer_name + '_block' + str(0))

            layers_concat.append(x)

            for i in range(nb_layers - 1):
                x = tf.concat(layers_concat, axis=3)
                x = self._bottleneck_layer(x, scope=layer_name + '_block' + str(i + 1))
                layers_concat.append(x)

            x = tf.concat(layers_concat, axis=3)

            return x

    def _dense_net(self, input_x):
        x = tf.layers.conv2d(input_x, filters=2 * self.filters, kernel_size=[7,7], strides=2, name='conv1/conv')
        x = tf.layers.max_pooling2d(x, pool_size=[3,3], strides=2, name='max_pool0')
        
        if self.nw_type in self.nw_types.keys():
            
            for i, layers in enumerate(self.nw_types[self.nw_type][:-1]) :
                # 6 -> 12 -> 48
                x = self._dense_block(input_x=x, nb_layers=layers, layer_name='dense_'+str(i))
                x = self._transition_layer(x, scope='trans_'+str(i))
                
            x = self._dense_block(input_x=x, nb_layers=self.nw_types[self.nw_type][-1], layer_name='dense_final')
            
        else:
            # default to 
            x = self._dense_block(input_x=x, nb_layers=6, layer_name='dense_1')
            x = self._transition_layer(x, scope='trans_1')

            x = self._dense_block(input_x=x, nb_layers=12, layer_name='dense_2')
            x = self._transition_layer(x, scope='trans_2')

            x = self._dense_block(input_x=x, nb_layers=24, layer_name='dense_3')
            x = self._transition_layer(x, scope='trans_3')
            x = self._dense_block(input_x=x, nb_layers=16, layer_name='dense_final')
            
        x = self._global_average_pooling2d(x, name='global_avg_pool')
        x = tf.layers.flatten(x, name='flat')
        x = tf.layers.dense(inputs=x, units=self.n_class, name='fully_connected')

        return x

## Run

In [0]:
#constants

kwargs=dict(
    validation_start_partition = 0.9, 
    image_dir ='/mnt/disk2/proteinatlax/data',
    label_dir ='/mnt/disk2/proteinatlax/data/labels',
    batch_size=8,
    lr = 0.01,
    #lr = 0.1,
    #decay_lr = True,
    #decay_rate = 0.96,
    #decay_step = 1000,
    label_size = 28,
    checkpoint_dir = '/mnt/disk2/proteinatlax/model',
    break_patience = 10,
    nw_type=121, 
    filters=12,
    sigmoid_threshold=0.5,
    epochs=1000
)

In [0]:
#path_dict = extract_datazip(zipfile_path='../',extract_path=kwargs['image_dir'])
#processed_label = preprocess_label(label_file='../train.csv', save_file=os.path.join(kwargs['label_dir'],'processed_train.csv'))
#tr_label, va_label = split_label(label=os.path.join(kwargs['label_dir'],'processed_train.csv'), save_path=kwargs['label_dir'])

### Setup graph

In [0]:
#tf graph
graph = tf.Graph()
with graph.as_default():

    tr_summaries_dir       = os.path.join(kwargs['checkpoint_dir'], 'train')
    va_summaries_dir       = os.path.join(kwargs['checkpoint_dir'], 'validation')
    tr_checkpoint_prefix   = os.path.join( kwargs['checkpoint_dir'], 'model.ckpt')
    best_checkpoint_prefix = os.path.join( kwargs['checkpoint_dir'], 'best_model/model.ckpt')
    
    #Dataset
    tr_iter = get_batch( img_file_path=kwargs['image_dir'], batch_type='train', 
                        lbl_dir_path=kwargs['label_dir'], batch_size=kwargs['batch_size'] )
    tr_id, tr_x, tr_y = tr_iter.get_next()
    va_iter = get_batch( img_file_path=kwargs['image_dir'], batch_type='valid', 
                        lbl_dir_path=kwargs['label_dir'], batch_size=kwargs['batch_size'] )
    va_id, va_x, va_y = va_iter.get_next()
    
    #placeholders
    X = tf.placeholder(name='X', shape=[None,tr_x.shape[1],tr_x.shape[2],tr_x.shape[3]], dtype=tf.float32)
    Y = tf.placeholder(name='Y', shape=[None, tr_y.shape[1]], dtype=tf.float32)
    Y = tf.stop_gradient(Y)
    #variables
    
    global_step = tf.Variable(0, name='global_step')
    sig_cond = tf.Variable([kwargs['sigmoid_threshold']]*kwargs['label_size'])
    
    #model
    logit = DenseNet(x=X, nb_blocks=None, nw_type=kwargs['nw_type'], 
                     n_class=kwargs['label_size'], filters=kwargs['filters'], training=True).get_model()
    
    Y_ = tf.nn.sigmoid(logit,)
    Y_ = tf.cast(x = tf.greater(tf.cast(Y_, tf.float32), sig_cond), dtype=tf.int32, name = 'predictions')
    
    loss = tf.reduce_mean( tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(labels=Y, logits=logit), axis=1))
    f1_score, score_update = tf.contrib.metrics.f1_score( labels=Y, predictions=Y_, name='f1_score')
    
    score_op_init = tf.variables_initializer(tf.get_default_graph().get_collection('local_variables',), 
                                                name='metrics_initializer')
    
    if 'decay_lr' in kwargs.keys() and kwargs['decay_lr']:
        lr = tf.train.exponential_decay(kwargs['lr'], global_step=global_step, decay_rate=kwargs['decay_rate'], decay_steps=kwargs['decay_steps'], name='lr_decay' )
        tf.summary.scalar('Learning Rate', lr)
    else: 
        lr = kwargs['lr']
                
    optimizer = tf.train.AdamOptimizer( learning_rate=lr, )
    
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_op = optimizer.minimize(loss, global_step=global_step)
    
    # Model Summaries
    tf.summary.scalar('Loss', loss)
    tf.summary.scalar('F1_Score', f1_score)  
    all_summaries      = tf.summary.merge_all()
    tr_summary_writer  = tf.summary.FileWriter( tr_summaries_dir, graph )
    val_summary_writer = tf.summary.FileWriter( va_summaries_dir, graph )
    tr_saver           = tf.train.Saver(max_to_keep=10)
    best_saver         = tf.train.Saver(max_to_keep=1)
    config_proto       = tf.ConfigProto(allow_soft_placement=True)
    tr_session         = tf.Session(graph=graph, config= config_proto )

## Training loop

In [0]:
# training step
with graph.as_default():
    with tf.name_scope('training_loop'):
        
        tr_session.run(tf.global_variables_initializer())
        tr_saver.save(save_path=tr_checkpoint_prefix, sess=tr_session, global_step=global_step.eval(tr_session))
        
        # naive early stopping :P params
        prev_loss = best_loss = 100000000000.0
        cur_patience = 0 
        
        for epoc in range( kwargs['epochs'] ):
            # one complete pass of training data
            print("epoc: {}".format(epoc) )
            tr_session.run(tr_iter.initializer)
            tr_session.run(score_op_init)
            while True:
                try:
                    #train model
                    tr_Id, tr_X, tr_Y = tr_session.run([tr_id, tr_x, tr_y])
                    act,pred, tr_loss, _, tr_accuracy, _ , tr_summaries = tr_session.run([Y,Y_, loss, score_update, f1_score, 
                                                                                train_op, all_summaries],
                                                                        feed_dict={X:tr_X, Y:tr_Y})
                    #print(act, pred)
                    print("global_step: {:6}, loss: {:13.6f}, accuracy ={:.6f}".format(global_step.eval(tr_session), tr_loss, tr_accuracy), end='\r')
                    
                    tr_summary_writer.add_summary(tr_summaries, global_step.eval(tr_session))

                except tf.errors.OutOfRangeError:
                    tr_saver.save(save_path=tr_checkpoint_prefix, sess=tr_session, 
                                  global_step=global_step.eval(tr_session))
                    break
                    
            # one complete pass of validation data
            tr_session.run(va_iter.initializer)
            tr_session.run(score_op_init)
            while True:
                try:
                    #validate model
                    va_Id, va_X, va_Y = tr_session.run([va_id, va_x, va_y])
                    val_loss, _, val_accuracy, val_summaries = tr_session.run([loss, score_update, 
                                                                               f1_score, all_summaries],
                                                                             feed_dict={X:va_X, Y:va_Y})
                except tf.errors.OutOfRangeError:
                    val_summary_writer.add_summary(val_summaries, global_step.eval(tr_session))
                    print("global_step: {:6}, loss: {:13.6f}, accuracy ={:.6f}, val_loss: {:13.6f}, val_accuracy: {:.6f}".format(global_step.eval(tr_session), 
                                                                   tr_loss, tr_accuracy, val_loss, val_accuracy))
                    break
            # save checkpoint if loss is better than previous best loss
            if float("{:.3f}".format(val_loss)) < float("{:.3f}".format(best_loss)):
                best_loss = val_loss
                cur_patience = 0
                best_saver.save(save_path=best_checkpoint_prefix, sess=tr_session, 
                                global_step=global_step.eval(tr_session))
            else:
                cur_patience += 1

            if cur_patience == kwargs['break_patience']:
                print('\n############ Early stopping ############')
                tr_session.close()
                break

epoc: 0
global_step:   3690, loss:      4.667702, accuracy =0.271346, val_loss:                                  3.603684, val_accuracy: 0.279509
epoc: 1
global_step:   7380, loss:      4.524732, accuracy =0.323440, val_loss:                                  8.385436, val_accuracy: 0.335511
epoc: 2
global_step:  11070, loss:      5.644957, accuracy =0.362442, val_loss:                                  3.277678, val_accuracy: 0.457942
epoc: 3
global_step:  14760, loss:      4.071045, accuracy =0.403178, val_loss:                                  2.428470, val_accuracy: 0.454779
epoc: 4
global_step:  18450, loss:      4.816981, accuracy =0.442238, val_loss:                                  5.741456, val_accuracy: 0.483895
epoc: 5
global_step:  22140, loss:      3.601640, accuracy =0.469573, val_loss:                                  2.313452, val_accuracy: 0.426277
epoc: 6
global_step:  25830, loss:      4.246340, accuracy =0.493176, val_loss:                                  4.693980, v

## Generate prediction

In [0]:

with tf.device("/device:GPU:0"):
        with tf.name_scope('test_loop'):
            with graph.as_default():
                with tf.variable_scope('test_graph'):    
                    te_iter = get_batch( img_file_path=kwargs['image_dir'], batch_type='test', lbl_dir_path=kwargs['label_dir'], batch_size=kwargs['test_batch_size'] )
                    te_img_ids, te_x, te_y = te_iter.get_next()
                    
                    graph.run(te_iter.initializer)
                    while True:
                        try:
                            #validate model
                            predictions = graph.run(Y_, {X:tr_x})
                            all_predictions.append( zip(te_img_ids,predictions) )
                        except tf.errors.OutOfRangeError:
                            with open(os.path.join(checkpoint_dir, 'predictions.csv')) as fp:
                                fp.write(all_predictions)
                            break

