In [None]:
import tensorflow as tf

In [2]:
import numpy as np
import os
import shutil
import sys
import glob
import tempfile

In [3]:
class MODparser_write():
    """Writer/reader for the GZIP-compressed .tfrecord patch format.

    Every record holds one patch. Each array (``x250``, ``x500``, ``labels``,
    ``doy``, ``year``) is stored as the raw bytes of an int64 array, together
    with an int64 shape vector that is used to restore the array when the
    record is parsed.
    """

    def __init__(self):
        # Parsing schema: the ``*/data`` entries are raw byte strings, the
        # ``*/shape`` entries are fixed-length int64 vectors (rank 4 for the
        # two image stacks, rank 3 for the labels, rank 1 for the dates).
        self.feature_format = {
            'x250/data': tf.FixedLenFeature([], tf.string),
            'x250/shape': tf.FixedLenFeature([4], tf.int64),
            'x500/data': tf.FixedLenFeature([], tf.string),
            'x500/shape': tf.FixedLenFeature([4], tf.int64),
            'dates/doy': tf.FixedLenFeature([], tf.string),
            'dates/year': tf.FixedLenFeature([], tf.string),
            'dates/shape': tf.FixedLenFeature([1], tf.int64),
            'labels/data': tf.FixedLenFeature([], tf.string),
            'labels/shape': tf.FixedLenFeature([3], tf.int64)
        }

    @staticmethod
    def _bytes_feature(array):
        """Wrap the raw bytes of ``array`` in a single-element BytesList feature."""
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[array.tobytes()]))

    @staticmethod
    def _shape_feature(array):
        """Wrap the shape of ``array`` in an Int64List feature."""
        return tf.train.Feature(int64_list=tf.train.Int64List(value=array.shape))

    @staticmethod
    def _decode(feature):
        """Turn a parsed feature dict back into the five int64 tensors.

        Returns:
            Tuple ``(x250, x500, doy, year, labels)``; ``doy`` and ``year``
            are both reshaped with the shared ``dates/shape`` entry.
        """
        def restore(data_key, shape_key):
            flat = tf.decode_raw(feature[data_key], tf.int64)
            # tf.reshape expects an int32 shape; the shape is stored as int64.
            shape = tf.cast(feature[shape_key], tf.int32)
            return tf.reshape(flat, shape)

        x250 = restore('x250/data', 'x250/shape')
        x500 = restore('x500/data', 'x500/shape')
        doy = restore('dates/doy', 'dates/shape')
        year = restore('dates/year', 'dates/shape')
        labels = restore('labels/data', 'labels/shape')

        return x250, x500, doy, year, labels

    def write(self, filename, x250ds, x500ds, doy, year, labelsds):
        """Write one record per patch to a GZIP-compressed TFRecord file.

        Args:
            filename: path of the TFRecord file to create.
            x250ds, x500ds, labelsds: arrays indexed by patch along axis 0;
                ``x250ds.shape[0]`` determines how many records are written.
            doy, year: arrays stored unchanged in every record of the file.

        All arrays are cast to int64 before being serialized.
        """
        # https://stackoverflow.com/questions/39524323/tf-sequenceexample-with-multidimensional-arrays

        options = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.GZIP)
        writer = tf.python_io.TFRecordWriter(filename, options=options)

        try:
            # The dates are shared by all patches, so convert them only once.
            doy = doy.astype(np.int64)
            year = year.astype(np.int64)

            for x in range(x250ds.shape[0]):
                x250 = x250ds[x].astype(np.int64)
                x500 = x500ds[x].astype(np.int64)
                labels = labelsds[x].astype(np.int64)

                feature = {
                    'x250/data': self._bytes_feature(x250),
                    'x250/shape': self._shape_feature(x250),
                    'x500/data': self._bytes_feature(x500),
                    'x500/shape': self._shape_feature(x500),
                    'labels/data': self._bytes_feature(labels),
                    'labels/shape': self._shape_feature(labels),
                    'dates/doy': self._bytes_feature(doy),
                    'dates/year': self._bytes_feature(year),
                    'dates/shape': self._shape_feature(doy)
                }

                example = tf.train.Example(features=tf.train.Features(feature=feature))
                writer.write(example.SerializeToString())
        finally:
            # Close the file even if a patch fails to serialize.
            writer.close()
            sys.stdout.flush()

    def parse_example(self, serialized_example):
        """Parse one serialized example into ``(x250, x500, doy, year, labels)``.

        The example proto can be obtained via
        filename_queue = tf.train.string_input_producer(filenames, num_epochs=None)
        or by passing this function to dataset.map(.)
        """
        feature = tf.parse_single_example(serialized_example, self.feature_format)

        return self._decode(feature)

    def read(self, filenames):
        """Deprecated! Build queue-based ops that read and decode one record.

        Args:
            filenames: a list of TFRecord paths or an existing tf.FIFOQueue
                of file names.

        Returns:
            Tuple of tensors ``(x250, x500, doy, year, labels)``.

        Raises:
            TypeError: if ``filenames`` is neither a list nor a tf.FIFOQueue.
        """
        if isinstance(filenames, list):
            filename_queue = tf.train.string_input_producer(filenames, num_epochs=None)
        elif isinstance(filenames, tf.FIFOQueue):
            filename_queue = filenames
        else:
            # Fail here with a clear message instead of continuing with an
            # unbound filename_queue.
            raise TypeError("please insert either list or tf.FIFOQueue")

        reader = tf.TFRecordReader()
        f, serialized_example = reader.read(filename_queue)

        feature = tf.parse_single_example(serialized_example, features=self.feature_format)

        return self._decode(feature)

    def read_and_return(self, filename):
        """Deprecated! Read one record from ``filename`` and return numpy values.

        Returns:
            The evaluated ``(x250, x500, doy, year, labels)`` tuple produced
            by ``read``.
        """
        # Ops that read and decode a single record from the file.
        feature_op = self.read([filename])

        with tf.Session() as sess:

            # The initializer op has to be run, not merely created.
            sess.run(tf.global_variables_initializer())

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            try:
                return sess.run(feature_op)
            finally:
                # Stop the queue-runner threads before the session closes.
                coord.request_stop()
                coord.join(threads)

In [4]:
def parser_fn(ds_250m, ds_500m, bands_250m, bands_500m, tfiles, psize, timesteps, version, batchsize, project):
    """Re-pack two sets of GZIP TFRecord files into combined per-batch files.

    Both inputs are read batch by batch; every batch is written with
    ``MODparser_write.write`` to ``<project>/<batch index>.gz``.

    Args:
        ds_250m, ds_500m: file name(s) passed to ``tf.data.TFRecordDataset``.
        bands_250m: list of feature names stacked into the ``x250`` tensor.
        bands_500m: list of feature names stacked into the ``x500`` tensor.
        tfiles: total number of records (patches) to process.
        psize: spatial size of a 250 m patch; 500 m patches are parsed with
            ``psize // 2``.
        timesteps: length of the first axis of every parsed feature.
        version: name of the feature used as the label layer.
        batchsize: number of patches written to each output file.
        project: directory that receives the output files.
    """
    parser = MODparser_write()

    ds_250m = tf.data.TFRecordDataset(ds_250m, compression_type='GZIP')
    ds_500m = tf.data.TFRecordDataset(ds_500m, compression_type='GZIP')

    # The 250 m records also carry the dates and the label layer.
    names_250m = list(bands_250m) + ['DOY', 'year', version]
    featuresDict250m = {
        name: tf.FixedLenFeature([timesteps, psize, psize], tf.int64)
        for name in names_250m
    }

    # Integer division keeps the feature shape integral.
    half_psize = psize // 2
    featuresDict500m = {
        name: tf.FixedLenFeature([timesteps, half_psize, half_psize], tf.int64)
        for name in list(bands_500m)
    }

    def parse_tfrecord250m(example_proto):
        feat = tf.parse_single_example(example_proto, featuresDict250m)

        # Stack bands on axis 0, then move the band axis to the end.
        x250 = tf.stack([feat[x] for x in bands_250m], axis=0)
        x250 = tf.transpose(x250, [1, 2, 3, 0])

        # for predictions over all area wo mask
        # The date features are reduced over axes 1 and 2, leaving one value
        # per step of the first axis.
        year = tf.to_float(tf.reduce_mean(feat['year'], axis=[1, 2]))
        DOY = tf.to_float(tf.reduce_mean(feat['DOY'], axis=[1, 2]))

        landcover = feat[version]

        return x250, DOY, year, landcover

    def parse_tfrecord500m(example_proto):
        feat = tf.parse_single_example(example_proto, featuresDict500m)

        x500 = tf.stack([feat[x] for x in bands_500m], axis=0)
        x500 = tf.transpose(x500, [1, 2, 3, 0])

        return x500

    # Map the function over the dataset
    parsedDataset250m = ds_250m.map(parse_tfrecord250m, num_parallel_calls=20).batch(batchsize)
    parsedDataset500m = ds_500m.map(parse_tfrecord500m, num_parallel_calls=20).batch(batchsize)

    iterator250m = parsedDataset250m.make_one_shot_iterator()
    iterator500m = parsedDataset500m.make_one_shot_iterator()

    # Build the get_next ops once; calling get_next() inside the loop would
    # add new ops to the graph on every iteration.
    next_250m = iterator250m.get_next()
    next_500m = iterator500m.get_next()

    # Number of output files = ceil(tfiles / batchsize).
    nfiles = int((tfiles + batchsize - 1) // batchsize)

    filepaths = ["{}/{}.gz".format(project, i) for i in range(nfiles)]

    with tf.Session() as sess:

        for t in range(nfiles):
            x250, DOY, year, labels = sess.run(next_250m)
            x500 = sess.run(next_500m)

            print(x250.shape)
            print(x500.shape)

            # NOTE: the dates of the first patch in the batch are stored for
            # every patch of the batch.
            parser.write(filepaths[t], x250, x500, DOY[0, :], year[0, :], labels)

In [5]:
# Collect the 250 m input tiles, ordered by modification time (oldest first).
fileNames_250m = glob.glob(os.path.join('data', 'tile_p240k0*.gz'))
fileNames_250m.sort(key=os.path.getmtime)

In [6]:
# Collect the 500 m input tiles, ordered by modification time (oldest first).
fileNames_500m = glob.glob(os.path.join('data', 'tile_p120k0*.gz'))
fileNames_500m.sort(key=os.path.getmtime)

In [8]:
options = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.GZIP)

psize = 240
timesteps = 46
version = 'MCD12Q1v6'


def _count_records(path, options):
    """Count the serialized examples stored in one compressed TFRecord file."""
    return sum(1 for _ in tf.python_io.tf_record_iterator(path, options=options))


def _estimate_total_patches(file_names, options):
    """Return ``(first, last, total)`` record counts for a list of files.

    Only the first and the last file are scanned; the total assumes every
    file except the last holds as many records as the first one.
    """
    if not file_names:
        # Give a clear message instead of an IndexError on an empty glob result.
        raise ValueError("no input files found; check the glob pattern and the 'data' directory")

    first = _count_records(file_names[0], options)
    last = _count_records(file_names[-1], options)

    return first, last, (first * (len(file_names) - 1)) + last


n_patches_first, n_patches_last, tfiles_250m = _estimate_total_patches(fileNames_250m, options)

# One output file is written per input-file-sized batch of patches.
batchsize = n_patches_first

print(tfiles_250m)

n_patches_first, n_patches_last, tfiles_500m = _estimate_total_patches(fileNames_500m, options)
print(tfiles_500m)

# Both resolutions must describe the same set of patches.
assert tfiles_250m == tfiles_500m, \
    "250m and 500m datasets differ in size: {} vs {}".format(tfiles_250m, tfiles_500m)

bands_250m = ['red', 'NIR']
bands_500m = ['blue', 'green', 'SWIR1', 'SWIR2','SWIR3']

# directory where the converted records are written
project = 'out/raw/240/data10'
if not os.path.exists(project):
    os.makedirs(project)

parser_fn(fileNames_250m, fileNames_500m, bands_250m, bands_500m, tfiles_250m, psize, timesteps, version, batchsize, project)

28
28
(4, 46, 240, 240, 2)
(4, 46, 120, 120, 5)
(4, 46, 240, 240, 2)
(4, 46, 120, 120, 5)
(4, 46, 240, 240, 2)
(4, 46, 120, 120, 5)
(4, 46, 240, 240, 2)
(4, 46, 120, 120, 5)
(4, 46, 240, 240, 2)
(4, 46, 120, 120, 5)
(4, 46, 240, 240, 2)
(4, 46, 120, 120, 5)
(4, 46, 240, 240, 2)
(4, 46, 120, 120, 5)
