# Introduction

This is a very simple serialization demo that uses the built-in JSON serializer.

# Necessary Imports

In [1]:
from typing import Any, Dict
import os
import sys

from forte.data.caster import MultiPackBoxer
from forte.data.data_pack import DataPack
from forte.data.multi_pack import MultiPack
from forte.data.readers import OntonotesReader, DirPackReader
from forte.data.readers.deserialize_reader import MultiPackDirectoryReader
from forte.pipeline import Pipeline
from forte.processors.base import MultiPackProcessor, MultiPackWriter
from forte.processors.writers import PackNameJsonPackWriter
from fortex.nltk import NLTKWordTokenizer, NLTKPOSTagger, NLTKSentenceSegmenter
from ft.onto.base_ontology import EntityMention, CrossDocEntityRelation

  from .autonotebook import tqdm as notebook_tqdm


# Functions and class definitions 

In [2]:
class PackCopier(MultiPackProcessor):
    """
    Duplicate one pack of a multi pack into a newly added pack.

    The source pack is looked up with the ``copy_from`` config and the new
    pack is added under the ``copy_to`` config. The text, a derived pack name
    and the span of every ``EntityMention`` are carried over to the new pack.
    """

    @classmethod
    def default_configs(cls) -> Dict[str, Any]:
        """Return the default names of the source and destination packs."""
        defaults: Dict[str, Any] = {}
        defaults["copy_from"] = "default"
        defaults["copy_to"] = "duplicate"
        return defaults

    def _process(self, input_pack: MultiPack):
        """
        Add the copy pack to ``input_pack`` and fill it from the source pack.

        Args:
            input_pack: The multi pack holding the source pack; the copy is
                added to this same multi pack.
        """
        source: DataPack = input_pack.get_pack(self.configs.copy_from)
        target: DataPack = input_pack.add_pack(self.configs.copy_to)

        target.set_text(source.text)

        # Derive the copy's name from the source name when there is one,
        # otherwise fall back to a fixed name.
        source_name = source.pack_name
        target.pack_name = (
            "copy" if source_name is None else source_name + "_copy"
        )

        # Re-create each mention on the new pack using the same span.
        mention: EntityMention
        for mention in source.get(EntityMention):
            EntityMention(target, mention.begin, mention.end)


class ExampleCoreferencer(MultiPackProcessor):
    """
    Create example "coreference" links between two packs of a multi pack.

    Entity mentions of the "default" pack and of the "duplicate" pack are
    paired up in iteration order, and one ``CrossDocEntityRelation`` is added
    to the multi pack for every pair.
    """

    def _process(self, input_pack: MultiPack):
        """
        Link the mentions of the two packs pairwise.

        Args:
            input_pack: The multi pack that contains the "default" and
                "duplicate" packs and that receives the new relations.
        """
        original_pack = input_pack.get_pack("default")
        duplicate_pack = input_pack.get_pack("duplicate")

        # zip stops at the shorter sequence, so unmatched mentions are skipped.
        mention_pairs = zip(
            original_pack.get(EntityMention),
            duplicate_pack.get(EntityMention),
        )

        for parent_mention, child_mention in mention_pairs:
            relation = CrossDocEntityRelation(
                input_pack, parent_mention, child_mention
            )
            relation.rel_type = "coreference"
            input_pack.add_entry(relation)


class ExampleCorefCounter(MultiPackProcessor):
    """
    Count the ``CrossDocEntityRelation`` entries over all processed multi
    packs and print the total when the processor finishes.
    """

    def __init__(self):
        super().__init__()
        # Running total across every multi pack passed to ``_process``.
        self.coref_count = 0

    def _process(self, input_pack: MultiPack):
        """
        Add the number of cross-document relations in ``input_pack`` to the
        running total.
        """
        relations = input_pack.get_entries_of(CrossDocEntityRelation)
        self.coref_count += sum(1 for _relation in relations)

    def finish(self, _):
        """Print the accumulated count; the single argument is ignored."""
        total = self.coref_count
        print(f"Found {total} pairs in the multi packs.")


def pack_example(input_path, output_path):
    """
    Read data with ``OntonotesReader``, run NLTK sentence segmentation,
    tokenization and POS tagging, and serialize the resulting packs.

    Args:
        input_path: The data location handed to the pipeline's ``run`` call.
        output_path: The directory passed to the writer as ``output_dir``.

    Returns:
        None.
    """
    print("Pack serialization example.")

    # Writer settings: write into `output_path`, indent the output by two
    # spaces, and overwrite existing files.
    writer_config = {
        "output_dir": output_path,
        "indent": 2,
        "overwrite": True,
    }

    nlp = Pipeline[DataPack]()
    nlp.set_reader(OntonotesReader())

    # The processors are added in the order in which they should run.
    for processor_cls in (
        NLTKSentenceSegmenter,
        NLTKWordTokenizer,
        NLTKPOSTagger,
    ):
        nlp.add(processor_cls())

    # Serialization step; presumably the writer names each output file after
    # the pack name, as its class name suggests.
    nlp.add(PackNameJsonPackWriter(), writer_config)

    nlp.run(input_path)


def multi_example(input_path, output_path):
    """
    Build multi packs from serialized packs, write them out, and read them
    back to check the result.

    The first pipeline reads packs from ``input_path``, boxes each one into a
    multi pack, duplicates the pack, links entity mentions between the two
    packs, counts the links and writes everything under ``output_path``. The
    second pipeline reloads the written multi packs and counts the links
    again, so that the two printed counts can be compared.

    Args:
        input_path: The directory of serialized packs handed to the first
            pipeline's ``run`` call.
        output_path: The directory passed to the multi pack writer as
            ``output_dir``; its "multi" and "packs" sub-directories are read
            back by the second pipeline.

    Returns:
        None.
    """
    print("Multi Pack serialization example.")

    print(
        "We first read the data, and add multi-packs to them, and then "
        "save the results."
    )

    # --- Stage 1: create the multi packs and serialize them. ---
    writer_config = {
        "output_dir": output_path,
        "indent": 2,
        "overwrite": True,
    }

    coref_pl = Pipeline()
    coref_pl.set_reader(DirPackReader())

    # The components are added in the order in which they should run.
    for component_cls in (
        MultiPackBoxer,
        PackCopier,
        ExampleCoreferencer,
        ExampleCorefCounter,
    ):
        coref_pl.add(component_cls())

    coref_pl.add(MultiPackWriter(), config=writer_config)
    coref_pl.run(input_path)

    print(
        "We can then load the saved results, and see if everything is OK. "
        "We should see the same number of multi packs there. "
    )

    # --- Stage 2: reload what was just written and count again. ---
    reader_config = {
        "multi_pack_dir": os.path.join(output_path, "multi"),
        "data_pack_dir": os.path.join(output_path, "packs"),
    }

    reading_pl = Pipeline()
    reading_pl.set_reader(MultiPackDirectoryReader(), config=reader_config)
    reading_pl.add(ExampleCorefCounter())
    reading_pl.run()

# Dataset path

In [3]:
# Relative path to the sample OntoNotes data used as input for the demo.
data_path = "../../data_samples/ontonotes/00/"

# Serialize

In [4]:
# Output directories of the two stages.
pack_output, multipack_output = "pack_out", "multi_out"

# Stage 1: serialize single packs from the dataset.
pack_example(input_path=data_path, output_path=pack_output)
# Stage 2: the packs written by stage 1 are the input of the multi pack demo.
multi_example(input_path=pack_output, output_path=multipack_output)



Pack serialization example.


[nltk_data] Downloading package punkt to
[nltk_data]     /home/bhaskar.rao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/bhaskar.rao/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Multi Pack serialization example.
We first read the data, and add multi-packs to them, and then save the results.




Found 423 pairs in the multi packs.
We can then load the saved results, and see if everything is OK. We should see the same number of multi packs there. 
Found 423 pairs in the multi packs.
