Update default serialization to the new json method. (#918)
hunterhector committed Jan 19, 2023
1 parent 38c96f5 commit 6e2d6ea
Showing 33 changed files with 713 additions and 511 deletions.
5 changes: 1 addition & 4 deletions docs/conf.py
```diff
@@ -407,7 +407,4 @@ def setup(sphinx):
 
 # Enable hypothesis.is in comments
 # https://sphinx-comments.readthedocs.io/en/latest/hypothesis.html#activate-hypothes-is
-comments_config = {
-    "hypothesis": True
-}
+comments_config = {"hypothesis": True}
```
4 changes: 2 additions & 2 deletions examples/audio/speaker_segmentation_pipeline.py
```diff
@@ -50,7 +50,7 @@ def _process(self, input_pack: DataPack):
         audio_utter: AudioUtterance = AudioUtterance(
             pack=input_pack,
             begin=int(turn.start * input_pack.sample_rate),
-            end=int(turn.end * input_pack.sample_rate)
+            end=int(turn.end * input_pack.sample_rate),
         )
         audio_utter.speaker = speaker
@@ -97,7 +97,7 @@ def _process(self, input_pack: DataPack):
         text_utter: Utterance = Utterance(
             pack=input_pack,
             begin=len(input_pack.text) - len(transcription[0]),
-            end=len(input_pack.text)
+            end=len(input_pack.text),
         )
         text_utter.speaker = audio_utter.speaker
         Link(pack=input_pack, parent=audio_utter, child=text_utter)
```
14 changes: 5 additions & 9 deletions examples/blog_post_examples/ecosystem_script_only.py
```diff
@@ -20,16 +20,12 @@
 from forte.processors.stave import StaveProcessor
 from fortex.spacy import SpacyProcessor
 
-Pipeline[DataPack](
-).set_reader(
-    HTMLReader()
-).add(
-    SpacyProcessor(), config={
-        "processors": ["sentence", "tokenize", "pos", "ner", "dep", "umls_link"]
-    }
-).add(
-    StaveProcessor()
-).run(
+Pipeline[DataPack]().set_reader(HTMLReader()).add(
+    SpacyProcessor(),
+    config={
+        "processors": ["sentence", "tokenize", "pos", "ner", "dep", "umls_link"]
+    },
+).add(StaveProcessor()).run(
     "<body><p>"
     "she does not have SVS syndrome from an axillary vein thrombosis."
     "</p></body>"
```
4 changes: 3 additions & 1 deletion examples/classification/amazon_review_sentiment.py
```diff
@@ -37,7 +37,9 @@
 for pack in pl.process_dataset(csv_path):
     for sent in pack.get(Sentence):
         if (
-            input("Type n for the next documentation and its prediction: ").lower()
+            input(
+                "Type n for the next documentation and its prediction: "
+            ).lower()
             == "n"
         ):
             sent_text = sent.text
```
4 changes: 1 addition & 3 deletions examples/classification/bank_customer_intent.py
```diff
@@ -113,9 +113,7 @@
         "label",
     ],
     "index2class": index2class,
-    "text_fields": [
-        "ft.onto.base_ontology.Body"
-    ],
+    "text_fields": ["ft.onto.base_ontology.Body"],
     "digit_label": False,
     "one_based_index_label": False,
 }
```
4 changes: 2 additions & 2 deletions examples/clinical_pipeline/utterance_searcher.py
```diff
@@ -94,8 +94,8 @@ def _process(self, input_pack: DataPack):
         else:
             links: List[str] = create_links(self.configs.url_stub, answers)
             response_text: str = (
-                    "I found the following results: <br> -- "
-                    + "<br> -- ".join(links)
+                "I found the following results: <br> -- "
+                + "<br> -- ".join(links)
             )
             print(response_text)
```
2 changes: 1 addition & 1 deletion examples/content_rewriter/model/config_model_clean.py
```diff
@@ -10,7 +10,7 @@ def get_embedder_hparams(dimension, name):
         "type": "random_normal_initializer",
         "kwargs": {
             "mean": 0.0,
-            "stddev": dimension ** -0.5,
+            "stddev": dimension**-0.5,
         },
     },
 }
```
4 changes: 1 addition & 3 deletions examples/content_rewriter/rewriter.py
```diff
@@ -121,6 +121,4 @@ def prepare_data(self, context: UtteranceContext, utterance: Utterance):
 
     @classmethod
     def default_configs(cls) -> Dict[str, Any]:
-        return {
-            "model_dir": "content_rewriter/model"
-        }
+        return {"model_dir": "content_rewriter/model"}
```
20 changes: 18 additions & 2 deletions examples/serialization/README.md
```diff
@@ -1,6 +1,22 @@
 This is a very simple serialization demo that uses the built-in JSON serializer.
-Just run the following command in this directory:
+
+First, let's install some simple processors via:
+
+`
+pip install forte.nltk
+`
+
+To ensure you are using the current version of Forte, go to Forte root and install from source:
+
+`
+cd <forte source directory>
+pip install .
+`
+
+Then just run the following command from this example directory:
 
 `
 python serialize_example.py "../../data_samples/ontonotes/00/"
 `
+
+You should be able to see the progress and the serialized content.
```
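
For orientation, here is a minimal sketch of the kind of pipeline this example builds, assuming the standard `OntonotesReader` and the `PackNameJsonPackWriter` touched later in this commit; the `output_dir` key and its value are illustrative, not taken from the example script:

```python
# A minimal sketch, not the actual contents of serialize_example.py:
# read OntoNotes files and write each DataPack back out, using the new
# default `json` serialization introduced by this commit.
from forte.data.data_pack import DataPack
from forte.data.readers import OntonotesReader
from forte.pipeline import Pipeline
from forte.processors.writers import PackNameJsonPackWriter

pl = Pipeline[DataPack]()
pl.set_reader(OntonotesReader())
pl.add(
    PackNameJsonPackWriter(),
    config={
        "output_dir": "output_packs",  # assumed config key, illustrative value
        "indent": 2,                   # pretty-print the serialized JSON
        "serialize_method": "json",    # the new default; stated for clarity
    },
)
pl.run("../../data_samples/ontonotes/00/")
```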
10 changes: 5 additions & 5 deletions examples/wiki_parser/wiki_dump_parse.py
```diff
@@ -94,12 +94,12 @@ def add_wiki_info(
 
     if resume_from_last:
         if not os.path.exists(out_index_path):
-            raise ValueError(f"Configured to do resume but path "
-                             f"{out_index_path} does not exist.")
+            raise ValueError(
+                f"Configured to do resume but path "
+                f"{out_index_path} does not exist."
+            )
 
-        print_progress(
-            f"\nWill resume from last from {out_index_path}", "\n"
-        )
+        print_progress(f"\nWill resume from last from {out_index_path}", "\n")
     pl.set_reader(
         reader,
         config={
```
6 changes: 3 additions & 3 deletions forte/data/base_reader.py
```diff
@@ -107,11 +107,11 @@ def default_configs(cls):
               False.
             - serialize_method: The method used to serialize the data. Current
-              available options are `jsonpickle` and `pickle`. Default is
-              `jsonpickle`.
+              available options are `json`, `jsonpickle` and `pickle`. Default is
+              `json`.
         """
-        return {"zip_pack": False, "serialize_method": "jsonpickle"}
+        return {"zip_pack": False, "serialize_method": "json"}
 
     @staticmethod
     def pack_type():
```
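
Since components inherit these defaults, packs serialized by an older Forte version may need the old method pinned explicitly. A minimal sketch, assuming any reader that inherits the base reader configs shown above (`HTMLReader` is only a stand-in):

```python
# A minimal sketch: pinning the pre-#918 `jsonpickle` behavior via the
# base reader configs shown above. Any reader inheriting them would do;
# `HTMLReader` here is only a stand-in.
from forte.data.data_pack import DataPack
from forte.data.readers import HTMLReader
from forte.pipeline import Pipeline

pl = Pipeline[DataPack]()
pl.set_reader(
    HTMLReader(),
    config={"zip_pack": False, "serialize_method": "jsonpickle"},
)
```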
18 changes: 9 additions & 9 deletions forte/data/readers/deserialize_reader.py
```diff
@@ -74,16 +74,16 @@ def default_configs(cls):
               default value is None.
             - serialize_method: The method used to serialize the data. Current
-              available options are `jsonpickle` and `pickle`. Default is
-              `jsonpickle`.
+              available options are `json`, `jsonpickle` and `pickle`. Default is
+              `json`.
 
         Returns:
             The default configuration of this reader.
         """
         return {
             "zip_pack": False,
             "indent": None,
-            "serialize_method": "jsonpickle",
+            "serialize_method": "json",
         }
@@ -262,13 +262,13 @@ def default_configs(cls):
         Here:
 
             - serialize_method: The method used to serialize the data. Current
-              available options are `jsonpickle` and `pickle`. Default is
-              `jsonpickle`.
+              available options are `json`, `jsonpickle` and `pickle`. Default is
+              `json`.
 
         Returns: The default configuration of this reader.
         """
         return {
-            "serialize_method": "jsonpickle",
+            "serialize_method": "json",
         }
@@ -326,8 +326,8 @@ def default_configs(cls):
             - serialize_method (str): The method used to serialize the data, this
               should be the same as how serialization is done. The current
-              options are `jsonpickle` and `pickle`. The default method
-              is `jsonpickle`.
+              options are `json`, `jsonpickle` and `pickle`. The default method
+              is `json`.
 
             - zip_pack (bool): whether to zip the data pack. The default value is
               False.
@@ -338,7 +338,7 @@ def default_configs(cls):
             "multi_pack_dir": None,
             "data_pack_dir": None,
             "suffix": ".json",
-            "serialize_method": "jsonpickle",
+            "serialize_method": "json",
             "zip_pack": False,
         }
```
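
When deserializing, `serialize_method` has to match whatever the packs were written with. A minimal sketch of reading back json-serialized packs, assuming `MultiPackDirectoryReader` is the class whose configs appear in the last hunk above; directory names are illustrative:

```python
# A minimal sketch: reading packs back with a serialize_method that
# matches how they were written. The class name is an assumption about
# which reader owns the configs shown above; directories are invented.
from forte.data.multi_pack import MultiPack
from forte.data.readers import MultiPackDirectoryReader
from forte.pipeline import Pipeline

pl = Pipeline[MultiPack]()
pl.set_reader(
    MultiPackDirectoryReader(),
    config={
        "multi_pack_dir": "output/multi",
        "data_pack_dir": "output/packs",
        "suffix": ".json",
        "serialize_method": "json",  # must match the writer's setting
        "zip_pack": False,
    },
)
```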
14 changes: 7 additions & 7 deletions forte/processors/base/writers.py
```diff
@@ -49,7 +49,7 @@ def write_pack(
     zip_pack: bool = False,
     overwrite: bool = False,
     drop_record: bool = False,
-    serialize_method: str = "jsonpickle",
+    serialize_method: str = "json",
 ) -> str:
     """
     Write a pack to a path.
@@ -63,8 +63,8 @@ def write_pack(
         overwrite: Whether to overwrite the file if it already exists.
         drop_record: Whether to drop the creation records in the serialization.
         serialize_method: The method used to serialize the data. Current
-            available options are `jsonpickle` and `pickle`.
-            Default is `jsonpickle`.
+            available options are `json`, `jsonpickle` and `pickle`.
+            Default is `json`.
 
     Returns:
         If successfully written, will return the path of the output file.
@@ -144,8 +144,8 @@ def default_configs(cls):
               the default value is False.
 
             - serialize_method: The method used to serialize the data. Current
-              available options are `jsonpickle` and `pickle`. Default is
-              "jsonpickle".
+              available options are `json`, `jsonpickle` and `pickle`. Default is
+              "json".
 
         Returns: The default configuration of this writer.
         """
@@ -154,7 +154,7 @@ def default_configs(cls):
             "zip_pack": False,
             "indent": None,
             "drop_record": False,
-            "serialize_method": "jsonpickle",
+            "serialize_method": "json",
         }
 
     def _process(self, input_pack: DataPack):
@@ -260,5 +260,5 @@ def default_configs(cls) -> Dict[str, Any]:
             "zip_pack": False,
             "indent": None,
             "drop_record": False,
-            "serialize_method": "jsonpickle",
+            "serialize_method": "json",
         }
```
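
Conversely, a writer can keep emitting the old format for downstream consumers that still expect it. A minimal sketch using the writer configs documented above; `output_dir` is an assumed key, and the reader and paths are illustrative:

```python
# A minimal sketch: overriding the new `json` default back to
# `jsonpickle` on a writer, for consumers that still expect the old
# format. `output_dir` is an assumed config key.
from forte.data.data_pack import DataPack
from forte.data.readers import OntonotesReader
from forte.pipeline import Pipeline
from forte.processors.writers import PackIdJsonPackWriter

pl = Pipeline[DataPack]()
pl.set_reader(OntonotesReader())
pl.add(
    PackIdJsonPackWriter(),
    config={
        "output_dir": "legacy_packs",      # assumed key, illustrative value
        "indent": None,                    # compact output
        "drop_record": False,              # keep creation records
        "zip_pack": False,
        "serialize_method": "jsonpickle",  # override the new default
    },
)
pl.run("../../data_samples/ontonotes/00/")
```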
4 changes: 2 additions & 2 deletions forte/processors/writers.py
```diff
@@ -22,7 +22,7 @@
 class PackIdJsonPackWriter(PackWriter):
     """
     A writer implementation that writes data packs to disk. The default
-    serialization uses jsonpickle (readable). The file name of each data pack
+    serialization uses json. The file name of each data pack
     is the auto generated pack id of each pack.
     """
@@ -51,7 +51,7 @@ def default_configs(cls):
 class PackNameJsonPackWriter(PackWriter):
     """
     A writer implementation that writes data packs to disk. The default
-    serialization uses jsonpickle (readable). The file name of
+    serialization uses json. The file name of
     each data pack is the assigned name of each pack.
     """
```
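
The two writers differ only in how output files are named: one uses the auto-generated pack id, the other the pack's assigned name. A hedged illustration of that difference; the name value and the file-name patterns in the comments are invented for the example:

```python
# PackIdJsonPackWriter:   <output_dir>/<pack_id>.json    e.g. output/1842.json
# PackNameJsonPackWriter: <output_dir>/<pack_name>.json  e.g. output/doc_01.json
# (illustrative patterns, not taken from the diff)
from forte.data.data_pack import DataPack

pack = DataPack()
pack.pack_name = "doc_01"  # PackNameJsonPackWriter would name the file after this
print(pack.pack_id)        # PackIdJsonPackWriter would use this auto-generated id
```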
15 changes: 10 additions & 5 deletions setup.py
```diff
@@ -12,7 +12,7 @@
 VERSION_VAR = "VERSION"
 version = {}
 with open(
-        os.path.join(os.path.dirname(os.path.abspath(__file__)), "forte/version.py")
+    os.path.join(os.path.dirname(os.path.abspath(__file__)), "forte/version.py")
 ) as fp:
     exec(fp.read(), version)
 if VERSION_VAR not in version or not version[VERSION_VAR]:
@@ -26,7 +26,7 @@
     version=version[VERSION_VAR],
     url="https://github.com/asyml/forte",
     description="Forte is extensible framework for building composable and "
-                "modularized NLP workflows.",
+    "modularized NLP workflows.",
     long_description=long_description,
     long_description_content_type="text/markdown",
     license="Apache License Version 2.0",
@@ -60,7 +60,12 @@
             "requests",
         ],
         "ir": ["texar-pytorch>=0.1.4", "tensorflow>=1.15.0"],
-        "remote": ["fastapi>=0.65.2, <=0.75.2", "pydantic<=1.9.2", "uvicorn>=0.14.0", "requests"],
+        "remote": [
+            "fastapi>=0.65.2, <=0.75.2",
+            "pydantic<=1.9.2",
+            "uvicorn>=0.14.0",
+            "requests",
+        ],
         "audio_ext": ["soundfile>=0.10.3"],
         "stave": ["stave>=0.0.1.dev12"],
         "models": [
@@ -90,9 +95,9 @@
             "soundfile>=0.10.3",
             "Pillow",
             "requests",
-            "urlpath>=1.2.0"
+            "urlpath>=1.2.0",
         ],
-        "ocr_tutorial": ["Pillow", "requests", "pytesseract"]
+        "ocr_tutorial": ["Pillow", "requests", "pytesseract"],
     },
     entry_points={
         "console_scripts": [
```
27 changes: 13 additions & 14 deletions tests/forte/data/data_pack_test.py
```diff
@@ -318,25 +318,25 @@ def test_get_entries(self):
         with self.assertRaises(ValueError):
             for doc in self.data_pack.get("forte.data.data_pack.DataPack"):
                 print(doc)
 
         # Test get raw entries
 
         # fetching documents
-        primitive_documents = list(self.data_pack.get(Document, get_raw = True))
+        primitive_documents = list(self.data_pack.get(Document, get_raw=True))
        object_documents = list(self.data_pack.get(Document))
 
         self.assertEqual(
             primitive_documents[0],
             {
-                'begin': 0,
-                'end': 228,
-                'payload_idx': 0,
-                'document_class': [],
-                'sentiment': {},
-                'classifications': {},
-                'tid': object_documents[0].tid,
-                'type': 'ft.onto.base_ontology.Document'
-            }
+                "begin": 0,
+                "end": 228,
+                "payload_idx": 0,
+                "document_class": [],
+                "sentiment": {},
+                "classifications": {},
+                "tid": object_documents[0].tid,
+                "type": "ft.onto.base_ontology.Document",
+            },
         )
 
         # fetching groups
@@ -353,12 +353,11 @@ def test_get_entries(self):
                 em_object = self.data_pack.get_entry(em)
                 members.append(em_object.text)
             group_members.append(sorted(members))
 
         self.assertEqual(
             group_members,
-            [["He", "The Indonesian billionaire James Riady", "he"]]
+            [["He", "The Indonesian billionaire James Riady", "he"]],
        )
-
 
     def test_delete_entry(self):
         # test delete entry
```
