In [1]:
import os
import cv2
import random
import numpy as np
import pandas as pd
import time
from pathlib import Path

from logzero import logger

import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf
tf.enable_eager_execution() # To enable Tensor.numpy() member

In [2]:
## IMPLEMENTED ##
# 1. Modify tile names to get rid of wild/tumor label
# 2. Create csv of tile names to connect to samples 
# Load tiles and create appropriate number of TFRecords
# Script for reading 8 samples at a time into the model (see below)
# Save tfrecords of 800 jpgs each
    # train_set and test_set separately.
# Create tsv-files to read into tf.data.tfrecorddataset.

## TODOS ####
# Move the files into /lustre/nvme/ and modify the inception driver script to read these files


## ON HOLD ##



In [3]:
# Load the sample sheet and build a reproducibly shuffled frame of unique sample names,
# which is later divided into train and test sets.
sample_sheet = pd.read_csv(
    '/lustre/scratch/kiviaho/myoma/myoma-new/sample_sheet_with_mutation_type_tile_path.tsv',
    sep="\t",
)
# dict.fromkeys de-duplicates while keeping first-seen order, unlike set.
unique_samples = list(dict.fromkeys(sample_sheet['Sample']))
df = (
    pd.DataFrame(unique_samples)
    .sample(frac=1, random_state=11)
    .reset_index(drop=True)
)

In [4]:
# Split the shuffled samples: the first 80 % (rounded) go to train, the remainder to test.
train_n = round(0.8 * len(df))
test_n = len(df) - train_n
train_samples = df.iloc[:train_n]
test_samples = df.iloc[train_n:]

In [5]:
# Select the sample-sheet rows belonging to each split, then shuffle the rows
# of each split with a fixed seed and renumber the index.
train_set = (
    sample_sheet[sample_sheet['Sample'].isin(train_samples[0])]
    .sample(frac=1, random_state=11)
    .reset_index(drop=True)
)

test_set = (
    sample_sheet[sample_sheet['Sample'].isin(test_samples[0])]
    .sample(frac=1, random_state=11)
    .reset_index(drop=True)
)

In [6]:
# Create datatypes with tf.train.feature
def _array_feature(value):
    """Returns a single-element bytes_list feature holding ``value.numpy()``."""
    payload = tf.train.BytesList(value=[value.numpy()])
    return tf.train.Feature(bytes_list=payload)
    
def _bytes_feature(value):
    """Returns a single-element bytes_list feature from a string / byte."""
    eager_tensor_type = type(tf.constant(0))
    if isinstance(value, eager_tensor_type):
        # BytesList won't unpack a string from an EagerTensor.
        value = value.numpy()
    payload = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=payload)


In [7]:
# Create one serialization of a datapoint.
# Takes the image (as a tensor) and the encoded label as input
def serialize_example(img, label):
    """
    Builds a tf.train.Example message from one datapoint and returns it serialized,
    ready to be written to a file.

    params:
        img: value handed to _array_feature and stored under the key 'img'
        label: value handed to _bytes_feature and stored under the key 'label'

    returns:
        The output of SerializeToString() on the tf.train.Example message.
    """
    # Map each feature name to its tf.train.Example-compatible data type.
    features = tf.train.Features(
        feature={
            'img': _array_feature(img),
            'label': _bytes_feature(label),
        }
    )
    return tf.train.Example(features=features).SerializeToString()

def np_to_serialized_tensor(arr):
    """Converts ``arr`` into an int16 tensor and returns its serialized form."""
    int16_tensor = tf.constant(arr, dtype=tf.int16)
    return tf.io.serialize_tensor(int16_tensor)



In [8]:
def write_tfrecords_from_csv(sample_set, batch_size, write_path):
    '''
    Writes jpgs into a set of tfrecord-files defined by the first argument.
    Number of jpg's written into a single TFRecord is defined by batch_size.

    Files are named write_path + '<index>.tfrecord' with consecutive indices
    starting from 0. The last file holds the remaining rows when the number of
    rows is not a multiple of batch_size; no empty file is written when it is
    (or when sample_set has no rows).

    params:
        sample_set: A data-frame of shape nx3, with columns 'Sample', 'Type' and 'Tile'.
            'Tile' is opened as a file path and 'Type' is encoded with .encode().
        batch_size: The number of jpg-files to be written into a single TFRecord
        write_path: Write path prefix (string), relative or absolute. The file index
            is appended directly, so include a trailing separator for a directory.

    returns:
        sample_set: A dataframe of shape nx4, with column 'TFRecords' specifying the file
        into which the particular tile has been written into. The column is added to
        the passed-in frame in place and that same frame is returned.

    raises:
        ValueError: if batch_size is not at least 1.
    '''
    batch_size = int(batch_size)
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1, got " + str(batch_size))

    n_rows = len(sample_set)
    # Ceiling division: one extra file only when there is a partial batch.
    n_files = -(-n_rows // batch_size)
    tfrecords = []

    for file_idx in range(n_files):
        if file_idx % 10 == 0:
            logger.info("Writing TFRecord " + str(file_idx) + " of " + str(n_files))

        first_row = file_idx * batch_size
        # Positional slice; the final slice is simply shorter when rows run out.
        subset = sample_set.iloc[first_row:first_row + batch_size]
        filename = write_path + str(file_idx) + '.tfrecord'
        tfrecords.extend([filename] * len(subset))

        with tf.io.TFRecordWriter(filename) as writer:
            for img_path, label in zip(subset['Tile'], subset['Type']):
                # Close each image file promptly instead of leaking the handle.
                with open(img_path, 'rb') as img_file:
                    img_bytes = img_file.read()
                img_tensor = tf.io.serialize_tensor(img_bytes)
                writer.write(serialize_example(img_tensor, label.encode()))

    sample_set['TFRecords'] = tfrecords
    return sample_set



In [10]:
# Try out the function pipeline: write the training tiles into tmp/, 800 per TFRecord.
tfrecord_dir = 'tmp/'
Path(tfrecord_dir).mkdir(parents=True, exist_ok=True)
sample = write_tfrecords_from_csv(train_set, 800, tfrecord_dir)