Improve tests. Blackify and isortify. Optimize the code.

barseghyanartur · Dec 21, 2022 · 20e0341 · 20e0341
1 parent a1ac470
commit 20e0341
Show file tree

Hide file tree

Showing 10 changed files with 146 additions and 44 deletions.
diff --git a/scripts/demo.py b/scripts/demo.py
@@ -1,12 +1,12 @@
-import pandas as pd
 import sys
 
+import pandas as pd
+
 sys.path.append("../src")
 
 import itnpy
 import itnpy.vocab as vocab
 
-
 if __name__ == "__main__":
     df = vocab.get_dataframe()
     word2number = vocab.get_word2number_dict(df)

diff --git a/scripts/docs.py b/scripts/docs.py
@@ -1,18 +1,19 @@
-import pandas as pd
 import sys
 
+import pandas as pd
+
 sys.path.append("../src")
 
 import itnpy
 import itnpy.vocab as vocab
 
-
 EXAMPLES = [
     "i payed one hundred and twenty seven dollars and twenty six cents",
     "was your birthday on march twenty ninth",
     "my phone number is nine four nine six eight two seventy fourteen",
     "i have a minus one hundred point four three balance",
-    "calling to place an order of three hundred thousand sixty four hundred and eighteen parts",
+    "calling to place an order of three hundred thousand sixty four hundred "
+    "and eighteen parts",
     "my order id is seven eighteen fourteen fifteen nine eight zero",
     "my date of birth is three seven fifty four",
     "what is eighty percent of negative point nine four",

diff --git a/src/itnpy/__init__.py b/src/itnpy/__init__.py
@@ -2,8 +2,38 @@
 
 from itnpy import vocab
 from itnpy._version import __version__
-from itnpy.custom import *
-from itnpy.itn import *
+from itnpy.custom import (
+    postprocess,
+    postprocess_dollar,
+    postprocess_money,
+    preprocess,
+)
+from itnpy.itn import (
+    group_tokens,
+    inverse_normalize_classes,
+    inverse_normalize_numbers,
+    number_length,
+    number_of_trailing_zeros,
+    spokens2digit,
+    tokens2digit,
+)
+
+__all__ = (
+    "__version__",
+    "group_tokens",
+    "inverse_normalize",
+    "inverse_normalize_classes",
+    "inverse_normalize_numbers",
+    "number_length",
+    "number_of_trailing_zeros",
+    "postprocess",
+    "postprocess_dollar",
+    "postprocess_money",
+    "preprocess",
+    "spokens2digit",
+    "tokens2digit",
+    "vocab",
+)
 
 
 def inverse_normalize(

diff --git a/src/itnpy/itn.py b/src/itnpy/itn.py
@@ -44,6 +44,7 @@ def group_tokens(tokens: List[str], mask: List[int]) -> List[dict]:
     groups = []
     start = 0
 
+    value = None
     for i, _ in enumerate(tokens):
         if i:
             if value != mask[i]:

diff --git a/tests/assets/vocab/failing.csv b/tests/assets/vocab/failing.csv
@@ -0,0 +1 @@
+input,output,mistake
diff --git a/tests/assets/vocab/passing.csv b/tests/assets/vocab/passing.csv
@@ -0,0 +1,13 @@
+input,output
+i payed one hundred and twenty seven dollars and twenty six cents, i payed 127$ and 26¢
+was your birthday on march twenty ninth, was your birthday on march 29th
+my phone number is nine four nine six eight two seventy fourteen, my phone number is 9496827014
+i have a minus one hundred point four three balance, i have a -100.43 balance
+calling to place an order of three hundred thousand sixty four hundred and eighteen parts, calling to place an order of 30006418 parts
+my order id is seven eighteen fourteen fifteen nine eight zero, my order id is 7181415980
+my date of birth is three seven fifty four, my date of birth is 3754
+what is eighty percent of negative point nine four, what is 80% of -.94
+seems to cost a thousand or more maybe a few hundred or so, seems to cost a 1000 or more maybe a few 100 or so
+double zero, 00
+triple zero, 000
+quadruple zero, 0000
diff --git a/tests/helpers.py b/tests/helpers.py
@@ -0,0 +1,27 @@
+import pandas as pd
+
+import itnpy.vocab as vocab
+
+__all__ = (
+    "get_word2number_dict",
+    "error_message",
+)
+
+
+def get_word2number_dict(path="assets/vocab.csv"):
+    """Build the word2number dict from the vocab dataframe at ``path``."""
+    return vocab.get_word2number_dict(vocab.get_dataframe(path))
+
+
+def error_message(spoken, written, output):
+    """Render one test case as a small table, headed by the spoken text."""
+    index_label = "[spoken]".upper()
+    record = {
+        index_label: spoken,
+        "[written]".upper(): written,
+        "[output]".upper(): output,
+    }
+    # Index by the spoken text, then transpose so the remaining fields
+    # become rows under a single column headed by that text.
+    table = pd.DataFrame([record]).set_index(index_label).T
+    return "\n" + table.to_string()
diff --git a/tests/test_inverse_normalize_numbers.py b/tests/test_inverse_normalize_numbers.py
@@ -1,30 +1,30 @@
 import pandas as pd
 import pytest
-import sys
-
-sys.path.append("src")
 
 import itnpy
-import itnpy.vocab as vocab
-
-
-def get_word2number_dict(path="assets/vocab.csv"):
-    df = vocab.get_dataframe(path)
-    return vocab.get_word2number_dict(df)
-
 
-def error_message(spoken, written, output):
-    df = [
-        {
-            "[spoken]".upper(): spoken,
-            "[written]".upper(): written,
-            "[output]".upper(): output,
-        }
-    ]
-    df = pd.DataFrame(df)
-    df = df.set_index("[spoken]".upper())
-    df = df.T
-    return "\n" + df.to_string()
+from .helpers import error_message, get_word2number_dict
+
+# import itnpy.vocab as vocab
+
+
+# def get_word2number_dict(path="assets/vocab.csv"):
+#     df = vocab.get_dataframe(path)
+#     return vocab.get_word2number_dict(df)
+#
+#
+# def error_message(spoken, written, output):
+#     df = [
+#         {
+#             "[spoken]".upper(): spoken,
+#             "[written]".upper(): written,
+#             "[output]".upper(): output,
+#         }
+#     ]
+#     df = pd.DataFrame(df)
+#     df = df.set_index("[spoken]".upper())
+#     df = df.T
+#     return "\n" + df.to_string()
 
 
 @pytest.mark.parametrize(

diff --git a/tests/test_tokens2digit.py b/tests/test_tokens2digit.py
@@ -1,24 +1,22 @@
 import pandas as pd
 import pytest
-import sys
-
-sys.path.append("src")
 
 import itnpy
 
+from .helpers import error_message
 
-def error_message(spoken, written, output):
-    df = [
-        {
-            "[spoken]".upper(): spoken,
-            "[written]".upper(): written,
-            "[output]".upper(): output,
-        }
-    ]
-    df = pd.DataFrame(df)
-    df = df.set_index("[spoken]".upper())
-    df = df.T
-    return "\n" + df.to_string()
+# def error_message(spoken, written, output):
+#     df = [
+#         {
+#             "[spoken]".upper(): spoken,
+#             "[written]".upper(): written,
+#             "[output]".upper(): output,
+#         }
+#     ]
+#     df = pd.DataFrame(df)
+#     df = df.set_index("[spoken]".upper())
+#     df = df.T
+#     return "\n" + df.to_string()
 
 
 @pytest.mark.parametrize(

diff --git a/tests/test_vocab.py b/tests/test_vocab.py
@@ -0,0 +1,31 @@
+import pandas as pd
+import pytest
+
+import itnpy
+
+from .helpers import error_message, get_word2number_dict
+
+
+@pytest.mark.parametrize(
+    "path",
+    [
+        "tests/assets/vocab/passing.csv",
+        "tests/assets/vocab/failing.csv",
+    ],
+)
+def test_vocab(path):
+    """Check that each CSV row's input normalizes to its expected output."""
+    df = pd.read_csv(path, dtype={"input": object, "output": object})
+    df = df.fillna("")
+    # --- Get the vocab for converting spoken-form text into written-form text
+    word2number = get_word2number_dict()
+
+    for _, row in df.iterrows():
+        spoken = row["input"].strip()
+        expected = row["output"].strip()
+        # NOTE: This can be modified depending on your needs
+        tokens = itnpy.preprocess(spoken.split(), word2number)
+        # --- Convert spoken-form tokens to written-form tokens, then join
+        digit = " ".join(itnpy.inverse_normalize_numbers(tokens, word2number))
+        # `spoken` is already a string; joining it would space out its chars.
+        assert expected == digit, error_message(spoken, expected, digit)