Add some simple suffix rules for Finnish (Sourcery refactored) #25

Closed
wants to merge 3 commits
Binary file added simplemma/data/fi-rules.plzma
Binary file not shown.
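
For context, the new data file is a suffix-to-replacement mapping, pickled and LZMA-compressed (see apply_fi below). A minimal generation sketch follows; the three entries are illustrative placeholders consistent with the tests further down, not the shipped rule set, and the output path simply mirrors the file added in this commit.

import lzma
import pickle

# Illustrative entries only: each key is a word-final suffix, each value the
# string it is rewritten to (see apply_fi below); the real file holds far more.
suffix_rules = {
    "iseksi": "inen",   # kansalaiseksi -> kansalainen
    "oisten": "oinen",  # huokoisten -> huokoinen
    "atteja": "atti",   # kasvatteja -> kasvatti
}

with lzma.open("simplemma/data/fi-rules.plzma", "wb") as filehandle:
    pickle.dump(suffix_rules, filehandle)
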
77 changes: 49 additions & 28 deletions simplemma/rules.py
@@ -1,11 +1,14 @@
"""Simple rules for unknown tokens."""

import lzma
import pickle
import re

from pathlib import Path
from typing import Optional


RULES_LANGS = {"de", "en"}
RULES_LANGS = {"de", "en", "fi"}

ADJ_DE = re.compile(
r"^(.+?)(arm|artig|bar|chig|ell|en|end|erig|ern|esk|fach|fähig|förmig|frei|haft|iert|igt|isch|iv|lich|los|mäßig|reich|rig|sam|sch|schig|selig|voll)(?:er|e?st)?(?:e|em|en|er|es)?$"
@@ -27,6 +30,8 @@
ENDING_CHARS_ADJ_DE = ENDING_CHARS_NN_DE.union({"d", "t"})
ENDING_DE = re.compile(r"(?:e|em|en|er|es)$")

SUFFIX_RULES_FI = None # lazy loading when first needed


def apply_rules(
token: str, langcode: Optional[str], greedy: bool = False
@@ -37,6 +42,8 @@ def apply_rules(
candidate = apply_de(token, greedy)
elif langcode == "en":
candidate = apply_en(token)
elif langcode == "fi":
candidate = apply_fi(token)
return candidate


@@ -51,10 +58,7 @@ def apply_de(token: str, greedy: bool = False) -> Optional[str]:
if match and len(match[0]) > 2:
groups = [g for g in match.groups() if g is not None]
# lemma identified
if not groups:
return token
# apply -en/-e/-n/-s patterns
return token[: -len(groups[0])]
return token[: -len(groups[0])] if groups else token
# -end
if GERUNDIVE_DE.search(token):
return ENDING_DE.sub("er", token)
@@ -66,12 +70,11 @@ def apply_de(token: str, greedy: bool = False) -> Optional[str]:
return PLUR_ORTH_DE.sub(":innen", token)
# normalize without regex
return token[:-3]
# last resort
# if greedy:
# -s → ø
# if token[-1] == "s":
# return token[:-1]
# adjectives
# last resort
# if greedy:
# -s → ø
# if token[-1] == "s":
# return token[:-1]
elif token[0].islower(): # and token[-1] in ENDING_CHARS_ADJ_DE
candidate, alternative = None, None
# general search
@@ -93,7 +96,7 @@ def apply_de(token: str, greedy: bool = False) -> Optional[str]:
if alternative:
if not candidate:
return alternative
if candidate and len(alternative) < len(candidate):
if len(alternative) < len(candidate):
return alternative
return candidate
return None
@@ -105,35 +108,53 @@ def apply_en(token: str) -> Optional[str]:
if token[-1] == "s":
if token.endswith("ies") and len(token) > 7:
if token.endswith("cies"):
return token[:-4] + "cy"
return f"{token[:-4]}cy"
if token.endswith("ries"):
return token[:-4] + "ry"
return f"{token[:-4]}ry"
if token.endswith("ties"):
return token[:-4] + "ty"
return f"{token[:-4]}ty"
if token.endswith("doms"):
return token[:-4] + "dom"
return f"{token[:-4]}dom"
if token.endswith("esses"):
return token[:-5] + "ess"
return f"{token[:-5]}ess"
if token.endswith("isms"):
return token[:-4] + "ism"
return f"{token[:-4]}ism"
if token.endswith("ists"):
return token[:-4] + "ist"
return f"{token[:-4]}ist"
if token.endswith("ments"):
return token[:-5] + "ment"
return f"{token[:-5]}ment"
if token.endswith("nces"):
return token[:-4] + "nce"
return f"{token[:-4]}nce"
if token.endswith("ships"):
return token[:-5] + "ship"
return f"{token[:-5]}ship"
if token.endswith("tions"):
return token[:-5] + "tion"
# verbs
return f"{token[:-5]}tion"
elif token.endswith("ed"):
if token.endswith("ated"):
return token[:-4] + "ate"
return f"{token[:-4]}ate"
if token.endswith("ened"):
return token[:-4] + "en"
return f"{token[:-4]}en"
if token.endswith("fied"):
return token[:-4] + "fy"
return f"{token[:-4]}fy"
if token.endswith("ized"):
return token[:-4] + "ize"
return f"{token[:-4]}ize"
return None


def apply_fi(token: str) -> Optional[str]:
"Apply pre-defined rules for Finnish."
global SUFFIX_RULES_FI

if SUFFIX_RULES_FI is None:
filename = "data/fi-rules.plzma"
filepath = str(Path(__file__).parent / filename)
with lzma.open(filepath, "rb") as filehandle:
SUFFIX_RULES_FI = pickle.load(filehandle)

for length in (6, 5, 4, 3):
if len(token) < length + 2:
continue # token is too short to try suffix rules
suffix = token[-length:]
if suffix in SUFFIX_RULES_FI:
return token[:-length] + SUFFIX_RULES_FI[suffix]
return None
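
The lookup strategy above is worth spelling out: longer suffixes are tried before shorter ones (6 down to 3 characters), and a rule only fires when the token is at least two characters longer than the suffix, so the remaining stem is never emptied. A self-contained sketch of the same idea, using a hypothetical toy rule table rather than the shipped data file:

from typing import Dict, Optional

def lookup_suffix(token: str, rules: Dict[str, str]) -> Optional[str]:
    # Same strategy as apply_fi: longest suffix first, with a minimum-length guard.
    for length in (6, 5, 4, 3):
        if len(token) < length + 2:
            continue  # token too short for this suffix length
        suffix = token[-length:]
        if suffix in rules:
            return token[:-length] + rules[suffix]
    return None

toy_rules = {"iseksi": "inen"}  # hypothetical entry
assert lookup_suffix("kansalaiseksi", toy_rules) == "kansalainen"
assert lookup_suffix("lyhyt", toy_rules) is None  # no suffix matches
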
17 changes: 16 additions & 1 deletion tests/test_rules.py
@@ -3,7 +3,7 @@
import logging
import pytest

from simplemma.rules import apply_rules, apply_de, apply_en
from simplemma.rules import apply_rules, apply_de, apply_en, apply_fi

logging.basicConfig(level=logging.DEBUG)

@@ -74,9 +74,24 @@ def test_apply_en():
# assert apply_en('realised') == 'realise'


def test_apply_fi():
"""Test Finnish rules."""
# doesn't exist
assert apply_fi("Whatawordicantbelieveit") is None
# nouns
assert apply_fi("kansalaiseksi") == "kansalainen"
assert apply_fi("huokoisten") == "huokoinen"
assert apply_fi("kasvatteja") == "kasvatti"


def test_apply_rules():
"""Test rules on all available languages."""
assert apply_rules("Pfifferlinge", "de") == "Pfifferling"
assert apply_rules("Pfifferlinge", "en") is None
assert apply_rules("Pfifferlinge", "fi") is None
assert apply_rules("atonements", "de") is None
assert apply_rules("atonements", "en") == "atonement"
assert apply_rules("atonements", "fi") is None
assert apply_rules("kansalaiseksi", "de") is None
assert apply_rules("kansalaiseksi", "en") is None
assert apply_rules("kansalaiseksi", "fi") == "kansalainen"