In [1]:
import tensorflow as tf
import pathlib
from Utilities.directories import *

In [2]:
# Define directories to load raw data from and to save serialized data to.
# `data` is not defined in this cell; presumably it is a string path prefix
# provided by the star import of Utilities.directories -- TODO confirm.
root_dir = pathlib.Path(data + "/example_extrect_ma")
# NOTE(review): "tfecord" looks like a typo for "tfrecord". The string is the
# real on-disk path, so confirm the directory name before renaming it.
save_dir = pathlib.Path(data + "/example_extract-tfecord")
# Dataset of the paths matching root_dir/*; the seed is forwarded to
# list_files so that its file-name shuffling is repeatable.
dataset = tf.data.Dataset.list_files(str(root_dir/'*'), seed=10000)

In [8]:
def _bytes_feature(value):
    """Returns a bytes_list from a string / byte.

    :param value: bytes, str, or an eager tensor holding a byte string.
        A str is encoded as UTF-8 because BytesList only accepts bytes.
    :return: tf.train.Feature wrapping a single-element BytesList.
    """
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy() # BytesList won't unpack a string from an EagerTensor.
    if isinstance(value, str):
        # BytesList raises TypeError for text strings; encode them explicitly.
        value = value.encode("utf-8")
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _int64_feature(value):
    """Wrap one bool / enum / int / uint value in an int64_list feature."""
    int_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int_list)

In [9]:
# Sanity check: print the text form of a Feature built from a bytes literal.
print(_bytes_feature(b'some string'))

bytes_list {
  value: "some string"
}



In [10]:
def serialize_example(feature0, feature1):
    """
    Build a tf.train.Example from the two inputs and return it serialized.

    :param feature0: value stored under "feature0" via _bytes_feature.
    :param feature1: value stored under "feature1" via _int64_feature.
    :return: the serialized Example message (SerializeToString output).
    """
    # Map each feature name to its tf.train.Example-compatible wrapper.
    features = tf.train.Features(feature={
        "feature0": _bytes_feature(feature0),
        "feature1": _int64_feature(feature1),
    })
    return tf.train.Example(features=features).SerializeToString()

In [11]:
# Test the serialization
# Serialize one hand-made example; the trailing bare expression makes the
# notebook display the resulting byte string.

serialized_example = serialize_example(feature0=b'test', feature1=15)
serialized_example

b'\n)\n\x14\n\x08feature0\x12\x08\n\x06\n\x04test\n\x11\n\x08feature1\x12\x05\x1a\x03\n\x01\x0f'

In [2]:
"""Adapted from: https://gist.github.com/dschwertfeger/3288e8e1a2d189e5565cc43bb04169a1"""

import math

from multiprocessing import Pool, cpu_count
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm
import os
from Utilities.directories import lexis_data, lexis_abstract
import pathlib
import xml.etree.ElementTree as ET
import xmltodict
import glob

# Random state used when shuffling the file-path table.
_SEED = 2020
# Multiplier applied to the number of files per shard when computing shard counts.
_COMPRESSION_SCALING_FACTOR = 4
_COMPRESSION_LIB = "ZLIB" # compression type passed as `options` to the TFRecord writer

def _bytes_feature(value):
    """Returns a bytes_list from a string / byte.

    :param value: bytes, str, or an eager tensor holding a byte string.
        A str is encoded as UTF-8 because BytesList only accepts bytes.
    :return: tf.train.Feature wrapping a single-element BytesList.
    """
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy() # BytesList won't unpack a string from an EagerTensor.
    if isinstance(value, str):
        # BytesList raises TypeError for text strings; encode them explicitly.
        value = value.encode("utf-8")
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _int64_feature(value):
    """Wrap one bool / enum / int / uint value in an int64_list feature."""
    int_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int_list)

def _parallelize(func, data):
    """
    Apply ``func`` to every item of ``data`` in a process pool with a progress bar.

    Results are consumed and discarded; ``func`` is run for its side effects.

    :param func: callable taking one item of ``data``; it is handed to
        ``Pool.imap_unordered``, so it has to be usable from worker processes.
    :param data: sized collection of work items (``len(data)`` sets the
        progress-bar total).
    :return: None
    """
    # Leave one core free, but never request zero workers: Pool(0) raises
    # ValueError, which is what cpu_count() - 1 yields on a single-core host.
    processes = max(1, cpu_count() - 1)
    with Pool(processes) as pool:
        # We need the enclosing list statement to wait for the iterator to end
        # https://stackoverflow.com/a/45276885/1663506
        list(tqdm(pool.imap_unordered(func, data), total=len(data)))

class TFREcordsConverter:
    """Convert XML files to TFRecords.

    The constructor shuffles the table of file paths and derives the sizes and
    shard counts of the train / validation / test subsets. ``_process_files``
    writes one shard: it parses each XML file, pulls out the English abstract
    and the label, and appends a serialized ``tf.train.Example`` to the shard.
    """

    # When compression is used, resulting TFRecord files are four to five times
    # smaller. So, we can reduce the number of shards by this factor
    _COMPRESSION_SCALING_FACTOR = 4

    def __init__(self, filepaths, output_dir, test_size, val_size):
        """
        :param filepaths: pandas dataframe whose first column holds the file paths
        :param output_dir: directory for the output; created if it does not exist
        :param test_size: fraction of the samples assigned to the test set
        :param val_size: fraction of the samples assigned to the validation set
        :raises ValueError: if a fraction is negative or the two fractions sum
            to more than 1 (the training set would have a negative size)
        """
        # Small tolerance so that float round-off in e.g. 0.9 + 0.1 is accepted.
        if test_size < 0 or val_size < 0 or test_size + val_size > 1 + 1e-9:
            raise ValueError(
                "test_size and val_size must be non-negative fractions that "
                "sum to at most 1, got test_size={} and val_size={}".format(
                    test_size, val_size))

        self.output_dir = output_dir

        # Shuffle data by "sampling" the entire data frame
        self.filepaths = filepaths.sample(frac=1, random_state=_SEED)

        # Calculate number of instances for each sub dataset
        n_samples = len(filepaths)
        self.n_test = math.ceil(test_size * n_samples)
        # Both sizes are rounded up, so cap the validation size to keep the
        # training size from dropping below zero on small inputs.
        self.n_val = min(math.ceil(val_size * n_samples), n_samples - self.n_test)
        self.n_train = n_samples - self.n_test - self.n_val

        # Determine number of shards per sub dataset
        self.n_shards_test = self._n_shards(self.n_test)
        self.n_shards_val = self._n_shards(self.n_val)
        self.n_shards_train = self._n_shards(self.n_train)

        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(self.output_dir, exist_ok=True)

    def __repr__(self):
        return ('{}.{}(output_dir={}, n_shards_train={}, n_shards_test={}, '
                'n_shards_val={}, n_train={}, '
                'n_test={}, n_val={})').format(
            self.__class__.__module__,
            self.__class__.__name__,
            self.output_dir,
            self.n_shards_train,
            self.n_shards_test,
            self.n_shards_val,
            self.n_train,
            self.n_test,
            self.n_val,
        )

    def _n_shards(self, n_samples):
        """
        Compute number of shards for number of samples.

        TFRecords are split into multiple shards. Each shard's size should be
        between 100 MB and 200 MB according to the TensorFlow documentation.

        :param n_samples: int
            The number of samples to split into TFRecord shards.
        :return: int
            The number of shards needed to fit the provided number of samples
            (0 when there are no samples).
        """

        shard_size = 2 * 10**8
        avg_file_size = 1.6 * 10**5 # rough estimation since the file size per document varies a lot.
        # Use the class-level factor so the class does not silently depend on a
        # same-named module global.
        files_per_shard = math.ceil(shard_size / avg_file_size) * self._COMPRESSION_SCALING_FACTOR
        return math.ceil(n_samples / files_per_shard)

    def _process_files(self, shard_data):
        """
        Write TFRecord file.

        Documents without an English abstract are skipped.

        :param shard_data: tuple(str, list)
            A tuple containing the shard path and the list of row positions
            (into the shuffled file-path table) to write to it.
        :return: None
        :raises NotImplementedError: from ``_extract_label`` until label
            extraction is implemented.
        """

        shard_path, indices = shard_data
        # Bind the writer so that records can actually be written to the shard.
        with tf.io.TFRecordWriter(shard_path, options=_COMPRESSION_LIB) as writer:
            for index in indices:
                file_path = self.filepaths.iloc[index, 0] # get the respective filepath
                xml_tree = self._parse_xml(file_path)

                # Extract features
                abstract_text = self._extract_abstract(xml_tree)
                if abstract_text is None:
                    # Nothing to serialize for this document; never reuse the
                    # text of a previously processed file.
                    continue
                label = self._extract_label(xml_tree)

                example = tf.train.Example(features=tf.train.Features(feature={
                    # BytesList only accepts bytes, so encode the text first.
                    'abstract': _bytes_feature(abstract_text.encode("utf-8")),
                    'label': _bytes_feature(label),
                }))
                writer.write(example.SerializeToString())

    @staticmethod
    def _extract_abstract(xml_tree):
        """
        Return the English abstract text of a parsed document, or None.

        Handles both a list of abstract entries and a single entry (a lone
        child element is not wrapped in a list by xmltodict).

        :param xml_tree: dictionary as returned by ``_parse_xml``
        :return: str with the abstract text, or None if the document has no
            abstract entry whose "@lang" is "eng" or the entry has no text
        """
        abstract = xml_tree["lexisnexis-patent-document"].get("abstract")
        if abstract is None:
            return None
        candidates = abstract if isinstance(abstract, list) else [abstract]
        for element in candidates:
            if isinstance(element, dict) and element.get("@lang") == "eng":
                text = TFREcordsConverter._flatten_text(element.get("p"))
                return text or None
        return None

    @staticmethod
    def _flatten_text(node):
        """
        Collapse a parsed text node into one string.

        Repeated elements (lists) are joined with newlines. For a dict only the
        "#text" entry is used, so text nested in child elements is dropped --
        NOTE(review): confirm this is acceptable for abstracts with inline markup.

        :param node: str, list, dict or None
        :return: str (empty if there is no text)
        """
        if node is None:
            return ""
        if isinstance(node, str):
            return node
        if isinstance(node, list):
            return "\n".join(TFREcordsConverter._flatten_text(item) for item in node)
        if isinstance(node, dict):
            return TFREcordsConverter._flatten_text(node.get("#text"))
        return str(node)

    @staticmethod
    def _extract_label(xml_tree):
        """
        Return the label of a parsed document as bytes.

        TODO(review): the original code built the 'label' feature without any
        value, so the source of the label is undefined. Implement this (or
        override it in a subclass) before writing shards.

        :param xml_tree: dictionary as returned by ``_parse_xml``
        :raises NotImplementedError: always, until implemented
        """
        raise NotImplementedError(
            "label extraction is not defined; implement _extract_label")

    @staticmethod
    def _parse_xml(path):
        """
        Parse a single XML file and convert it to a python dictionary.

        :param path: path of the XML file
        :return: dictionary produced by xmltodict for the document
        """

        tree = ET.parse(path)
        root = tree.getroot()
        xml_str = ET.tostring(root)
        xml_tree = xmltodict.parse(xml_str)

        return xml_tree

In [15]:
import os
print(root_dir)
# Read the table of file paths stored next to the raw data.
df = pd.read_csv(root_dir.joinpath("paths.csv"))
n_rows = len(df)
# Mean on-disk size in bytes of the files listed in the first column.
avg_size = sum(map(os.path.getsize, df.iloc[:, 0])) / n_rows
print(avg_size)

E:\MLData\thesis\Datasets\example_extrect_ma
168785.9


[[1 2 3]
 [4 5 6]]
