Skip to content

Commit

Permalink
Improve tests. Blackify and isortify. Optimize the code.
Browse files Browse the repository at this point in the history
  • Loading branch information
barseghyanartur committed Dec 21, 2022
1 parent a1ac470 commit 20e0341
Show file tree
Hide file tree
Showing 10 changed files with 146 additions and 44 deletions.
4 changes: 2 additions & 2 deletions scripts/demo.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import pandas as pd
import sys

import pandas as pd

sys.path.append("../src")

import itnpy
import itnpy.vocab as vocab


if __name__ == "__main__":
df = vocab.get_dataframe()
word2number = vocab.get_word2number_dict(df)
Expand Down
7 changes: 4 additions & 3 deletions scripts/docs.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
import pandas as pd
import sys

import pandas as pd

sys.path.append("../src")

import itnpy
import itnpy.vocab as vocab


EXAMPLES = [
"i payed one hundred and twenty seven dollars and twenty six cents",
"was your birthday on march twenty ninth",
"my phone number is nine four nine six eight two seventy fourteen",
"i have a minus one hundred point four three balance",
"calling to place an order of three hundred thousand sixty four hundred and eighteen parts",
"calling to place an order of three hundred thousand sixty four hundred "
"and eighteen parts",
"my order id is seven eighteen fourteen fifteen nine eight zero",
"my date of birth is three seven fifty four",
"what is eighty percent of negative point nine four",
Expand Down
34 changes: 32 additions & 2 deletions src/itnpy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,38 @@

from itnpy import vocab
from itnpy._version import __version__
from itnpy.custom import *
from itnpy.itn import *
from itnpy.custom import (
postprocess,
postprocess_dollar,
postprocess_money,
preprocess,
)
from itnpy.itn import (
group_tokens,
inverse_normalize_classes,
inverse_normalize_numbers,
number_length,
number_of_trailing_zeros,
spokens2digit,
tokens2digit,
)

__all__ = (
"__version__",
"group_tokens",
"inverse_normalize",
"inverse_normalize_classes",
"inverse_normalize_numbers",
"number_length",
"number_of_trailing_zeros",
"postprocess",
"postprocess_dollar",
"postprocess_money",
"preprocess",
"spokens2digit",
"tokens2digit",
"vocab",
)


def inverse_normalize(
Expand Down
1 change: 1 addition & 0 deletions src/itnpy/itn.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ def group_tokens(tokens: List[str], mask: List[int]) -> List[dict]:
groups = []
start = 0

value = None
for i, _ in enumerate(tokens):
if i:
if value != mask[i]:
Expand Down
1 change: 1 addition & 0 deletions tests/assets/vocab/failing.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
input,output,mistake
13 changes: 13 additions & 0 deletions tests/assets/vocab/passing.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
input,output
i payed one hundred and twenty seven dollars and twenty six cents, i payed 127$ and 26¢
was your birthday on march twenty ninth, was your birthday on march 29th
my phone number is nine four nine six eight two seventy fourteen, my phone number is 9496827014
i have a minus one hundred point four three balance, i have a -100.43 balance
calling to place an order of three hundred thousand sixty four hundred and eighteen parts, calling to place an order of 30006418 parts
my order id is seven eighteen fourteen fifteen nine eight zero, my order id is 7181415980
my date of birth is three seven fifty four, my date of birth is 3754
what is eighty percent of negative point nine four, what is 80% of -.94
seems to cost a thousand or more maybe a few hundred or so, seems to cost a 1000 or more maybe a few 100 or so
double zero, 00
triple zero, 000
quadruple zero, 0000
27 changes: 27 additions & 0 deletions tests/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import pandas as pd

import itnpy.vocab as vocab

__all__ = (
"get_word2number_dict",
"error_message",
)


def get_word2number_dict(path="assets/vocab.csv"):
    """Build the word-to-number mapping from the vocabulary file at *path*.

    The file is loaded with ``vocab.get_dataframe`` and the resulting
    dataframe is passed straight to ``vocab.get_word2number_dict``.

    NOTE(review): the default path is relative; how it is resolved depends
    on ``vocab.get_dataframe`` -- confirm before running from another
    working directory.
    """
    return vocab.get_word2number_dict(vocab.get_dataframe(path))


def error_message(spoken, written, output):
    """Format a failed test case as a small table for assertion messages.

    Args:
        spoken: The spoken-form input; shown in the header row of the table.
        written: Value displayed on the ``[WRITTEN]`` row.
        output: Value displayed on the ``[OUTPUT]`` row.

    Returns:
        A string starting with a newline (so the table begins on its own
        line below pytest's ``AssertionError:`` prefix) followed by the
        rendered table.
    """
    # One-row frame; indexing by the spoken text and transposing yields a
    # vertical layout with one labelled line per value.
    report = pd.DataFrame(
        [
            {
                "[SPOKEN]": spoken,
                "[WRITTEN]": written,
                "[OUTPUT]": output,
            }
        ]
    )
    return "\n" + report.set_index("[SPOKEN]").T.to_string()
44 changes: 22 additions & 22 deletions tests/test_inverse_normalize_numbers.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,30 @@
import pandas as pd
import pytest
import sys

sys.path.append("src")

import itnpy
import itnpy.vocab as vocab


# Load the vocabulary dataframe at `path` and return the word-to-number
# mapping built from it (both steps are delegated to `itnpy.vocab`).
# NOTE(review): this looks like the removed side of the diff -- the same
# module imports `get_word2number_dict` from `.helpers` further down; confirm.
def get_word2number_dict(path="assets/vocab.csv"):
df = vocab.get_dataframe(path)
return vocab.get_word2number_dict(df)


# Render a failed case as a transposed one-row pandas table: the spoken text
# becomes the column header and the [WRITTEN] / [OUTPUT] values become rows.
# The leading "\n" starts the table on its own line.
# NOTE(review): this looks like the removed side of the diff -- the same
# module imports `error_message` from `.helpers` further down; confirm.
def error_message(spoken, written, output):
df = [
{
"[spoken]".upper(): spoken,
"[written]".upper(): written,
"[output]".upper(): output,
}
]
df = pd.DataFrame(df)
df = df.set_index("[spoken]".upper())
df = df.T
return "\n" + df.to_string()
from .helpers import error_message, get_word2number_dict

# import itnpy.vocab as vocab


# def get_word2number_dict(path="assets/vocab.csv"):
# df = vocab.get_dataframe(path)
# return vocab.get_word2number_dict(df)
#
#
# def error_message(spoken, written, output):
# df = [
# {
# "[spoken]".upper(): spoken,
# "[written]".upper(): written,
# "[output]".upper(): output,
# }
# ]
# df = pd.DataFrame(df)
# df = df.set_index("[spoken]".upper())
# df = df.T
# return "\n" + df.to_string()


@pytest.mark.parametrize(
Expand Down
28 changes: 13 additions & 15 deletions tests/test_tokens2digit.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,22 @@
import pandas as pd
import pytest
import sys

sys.path.append("src")

import itnpy

from .helpers import error_message

# Render a failed case as a transposed one-row pandas table: the spoken text
# becomes the column header and the [WRITTEN] / [OUTPUT] values become rows.
# The leading "\n" starts the table on its own line.
# NOTE(review): this looks like the removed side of the diff -- the same
# module imports `error_message` from `.helpers` just above; confirm.
def error_message(spoken, written, output):
df = [
{
"[spoken]".upper(): spoken,
"[written]".upper(): written,
"[output]".upper(): output,
}
]
df = pd.DataFrame(df)
df = df.set_index("[spoken]".upper())
df = df.T
return "\n" + df.to_string()
# def error_message(spoken, written, output):
# df = [
# {
# "[spoken]".upper(): spoken,
# "[written]".upper(): written,
# "[output]".upper(): output,
# }
# ]
# df = pd.DataFrame(df)
# df = df.set_index("[spoken]".upper())
# df = df.T
# return "\n" + df.to_string()


@pytest.mark.parametrize(
Expand Down
31 changes: 31 additions & 0 deletions tests/test_vocab.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import pandas as pd
import pytest

import itnpy

from .helpers import error_message, get_word2number_dict


@pytest.mark.parametrize(
    "path",
    [
        "tests/assets/vocab/passing.csv",
        "tests/assets/vocab/failing.csv",
    ],
)
def test_vocab(path):
    """Check every ``input``/``output`` row of a CSV fixture.

    Each ``input`` cell is split on whitespace, run through
    ``itnpy.preprocess`` and ``itnpy.inverse_normalize_numbers``, re-joined
    with single spaces and compared with the ``output`` cell.

    NOTE(review): the fixture paths are relative, so this presumably expects
    pytest to be started from the repository root -- confirm. A fixture that
    contains only a header row yields no iterations and passes trivially.
    """
    cases = pd.read_csv(path, dtype={"input": object, "output": object})
    # Empty cells are read as NaN; turn them into "" so ``.strip()`` works.
    cases = cases.fillna("")
    # --- Get the vocab for converting spoken-form text into written-form text
    word2number = get_word2number_dict()

    for spoken, expected in zip(cases["input"], cases["output"]):
        # Fixture rows put a space after the comma, hence the strip.
        spoken = spoken.strip()
        expected = expected.strip()
        # NOTE: This can be modified depending on your needs
        tokens = itnpy.preprocess(spoken.split(), word2number)
        # --- Convert spoken-form tokens to written-form tokens
        written_tokens = itnpy.inverse_normalize_numbers(tokens, word2number)
        # --- Convert tokens to string
        actual = " ".join(written_tokens)
        # ``spoken`` is already a string (joining it would insert a space
        # between every character); the expected text goes in the "written"
        # slot and the produced text in the "output" slot.
        assert expected == actual, error_message(spoken, expected, actual)

0 comments on commit 20e0341

Please sign in to comment.