Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add some simple suffix rules for Finnish #23

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Binary file added simplemma/data/fi-rules.plzma
Binary file not shown.
28 changes: 27 additions & 1 deletion simplemma/rules.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
"""Simple rules for unknown tokens."""

import lzma
import pickle
import re

from pathlib import Path
from typing import Optional


# Language codes handled by the rule functions in this module
# (presumably consulted by callers before invoking apply_rules -- confirm).
# NOTE(review): this text comes from a diff view, so the old assignment is
# followed by its replacement; the second assignment is the one in effect.
RULES_LANGS = {"de", "en"}
RULES_LANGS = {"de", "en", "fi"}

ADJ_DE = re.compile(
r"^(.+?)(arm|artig|bar|chig|ell|en|end|erig|ern|esk|fach|fähig|förmig|frei|haft|iert|igt|isch|iv|lich|los|mäßig|reich|rig|sam|sch|schig|selig|voll)(?:er|e?st)?(?:e|em|en|er|es)?$"
Expand All @@ -27,6 +30,8 @@
# ENDING_CHARS_NN_DE extended with "d" and "t"
# (presumably the final characters allowed for German adjectives -- confirm against usage).
ENDING_CHARS_ADJ_DE = ENDING_CHARS_NN_DE.union({"d", "t"})
# Matches a trailing "e", "em", "en", "er" or "es" at the end of a string.
ENDING_DE = re.compile(r"(?:e|em|en|er|es)$")

# Finnish suffix table; remains None until apply_fi() reads it from
# data/fi-rules.plzma on its first call.
SUFFIX_RULES_FI = None # lazy loading when first needed


def apply_rules(
token: str, langcode: Optional[str], greedy: bool = False
Expand All @@ -37,6 +42,8 @@ def apply_rules(
candidate = apply_de(token, greedy)
elif langcode == "en":
candidate = apply_en(token)
elif langcode == "fi":
candidate = apply_fi(token)
return candidate


Expand Down Expand Up @@ -137,3 +144,22 @@ def apply_en(token: str) -> Optional[str]:
if token.endswith("ized"):
return token[:-4] + "ize"
return None


def apply_fi(token: str) -> Optional[str]:
    """Lemmatize a Finnish token by rewriting a known word ending.

    The suffix table is read from the bundled ``data/fi-rules.plzma`` file
    (an LZMA-compressed pickle) the first time it is needed and is kept in
    the module-level ``SUFFIX_RULES_FI`` afterwards.

    Returns the token with its ending replaced, or ``None`` when no ending
    of 3 to 6 characters is present in the table.
    """
    global SUFFIX_RULES_FI

    # Fill the module-level table on the first call only.
    if SUFFIX_RULES_FI is None:
        rules_path = Path(__file__).parent / "data/fi-rules.plzma"
        with lzma.open(str(rules_path), "rb") as rules_file:
            SUFFIX_RULES_FI = pickle.load(rules_file)

    token_length = len(token)
    # Try the longest ending first so that it wins over shorter ones.
    for size in range(6, 2, -1):
        # At least two characters must remain in front of the ending.
        if token_length - size < 2:
            continue
        stem, ending = token[:-size], token[-size:]
        if ending in SUFFIX_RULES_FI:
            return stem + SUFFIX_RULES_FI[ending]
    return None
17 changes: 16 additions & 1 deletion tests/test_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import logging
import pytest

from simplemma.rules import apply_rules, apply_de, apply_en
from simplemma.rules import apply_rules, apply_de, apply_en, apply_fi

logging.basicConfig(level=logging.DEBUG)

Expand Down Expand Up @@ -74,9 +74,24 @@ def test_apply_en():
# assert apply_en('realised') == 'realise'


def test_apply_fi():
    """Check the Finnish suffix rules on an unknown word and on inflected nouns."""
    # a made-up word matches no suffix rule
    assert apply_fi("Whatawordicantbelieveit") is None
    # inflected nouns and the lemma each one should be mapped to
    noun_cases = (
        ("kansalaiseksi", "kansalainen"),
        ("huokoisten", "huokoinen"),
        ("kasvatteja", "kasvatti"),
    )
    for inflected, lemma in noun_cases:
        assert apply_fi(inflected) == lemma


def test_apply_rules():
    """Run one sample token per language through every supported rule set."""
    # each token should only be resolved by the rules of its own language
    expectations = (
        ("Pfifferlinge", (("de", "Pfifferling"), ("en", None), ("fi", None))),
        ("atonements", (("de", None), ("en", "atonement"), ("fi", None))),
        ("kansalaiseksi", (("de", None), ("en", None), ("fi", "kansalainen"))),
    )
    for token, per_language in expectations:
        for langcode, expected in per_language:
            outcome = apply_rules(token, langcode)
            if expected is None:
                assert outcome is None
            else:
                assert outcome == expected