Add some simple suffix rules for Finnish (Sourcery refactored) #25

Closed
wants to merge 3 commits
Binary file added simplemma/data/fi-rules.plzma
Binary file not shown.
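
For context, the new data file is a suffix-to-replacement mapping, pickled and LZMA-compressed (see apply_fi below). A minimal generation sketch follows; the three entries are illustrative placeholders consistent with the tests further down, not the shipped rule set, and the output path simply mirrors the file added in this commit.

import lzma
import pickle

# Illustrative entries only: each key is a word-final suffix, each value the
# string it is rewritten to (see apply_fi below); the real file holds far more.
suffix_rules = {
    "iseksi": "inen",   # kansalaiseksi -> kansalainen
    "oisten": "oinen",  # huokoisten -> huokoinen
    "atteja": "atti",   # kasvatteja -> kasvatti
}

with lzma.open("simplemma/data/fi-rules.plzma", "wb") as filehandle:
    pickle.dump(suffix_rules, filehandle)
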
77 changes: 49 additions & 28 deletions simplemma/rules.py
@@ -1,11 +1,14 @@
"""Simple rules for unknown tokens."""

import lzma
import pickle
import re

from pathlib import Path
from typing import Optional


RULES_LANGS = {"de", "en"}
RULES_LANGS = {"de", "en", "fi"}

ADJ_DE = re.compile(
r"^(.+?)(arm|artig|bar|chig|ell|en|end|erig|ern|esk|fach|fähig|förmig|frei|haft|iert|igt|isch|iv|lich|los|mäßig|reich|rig|sam|sch|schig|selig|voll)(?:er|e?st)?(?:e|em|en|er|es)?$"
@@ -27,6 +30,8 @@
ENDING_CHARS_ADJ_DE = ENDING_CHARS_NN_DE.union({"d", "t"})
ENDING_DE = re.compile(r"(?:e|em|en|er|es)$")

SUFFIX_RULES_FI = None # lazy loading when first needed


def apply_rules(
token: str, langcode: Optional[str], greedy: bool = False
@@ -37,6 +42,8 @@ def apply_rules(
candidate = apply_de(token, greedy)
elif langcode == "en":
candidate = apply_en(token)
elif langcode == "fi":
candidate = apply_fi(token)
return candidate


@@ -51,10 +58,7 @@ def apply_de(token: str, greedy: bool = False) -> Optional[str]:
if match and len(match[0]) > 2:
groups = [g for g in match.groups() if g is not None]
# lemma identified
if not groups:
return token
# apply -en/-e/-n/-s patterns
return token[: -len(groups[0])]
return token[: -len(groups[0])] if groups else token
# -end
if GERUNDIVE_DE.search(token):
return ENDING_DE.sub("er", token)
@@ -66,12 +70,11 @@ def apply_de(token: str, greedy: bool = False) -> Optional[str]:
return PLUR_ORTH_DE.sub(":innen", token)
# normalize without regex
return token[:-3]
# last resort
# if greedy:
# -s → ø
# if token[-1] == "s":
# return token[:-1]
# adjectives
# last resort
# if greedy:
# -s → ø
# if token[-1] == "s":
# return token[:-1]
elif token[0].islower(): # and token[-1] in ENDING_CHARS_ADJ_DE
candidate, alternative = None, None
# general search
@@ -93,7 +96,7 @@ def apply_de(token: str, greedy: bool = False) -> Optional[str]:
if alternative:
if not candidate:
return alternative
if candidate and len(alternative) < len(candidate):
if len(alternative) < len(candidate):
return alternative
return candidate
return None
@@ -105,35 +108,53 @@ def apply_en(token: str) -> Optional[str]:
if token[-1] == "s":
if token.endswith("ies") and len(token) > 7:
if token.endswith("cies"):
return token[:-4] + "cy"
return f"{token[:-4]}cy"
if token.endswith("ries"):
return token[:-4] + "ry"
return f"{token[:-4]}ry"
if token.endswith("ties"):
return token[:-4] + "ty"
return f"{token[:-4]}ty"
if token.endswith("doms"):
return token[:-4] + "dom"
return f"{token[:-4]}dom"
if token.endswith("esses"):
return token[:-5] + "ess"
return f"{token[:-5]}ess"
if token.endswith("isms"):
return token[:-4] + "ism"
return f"{token[:-4]}ism"
if token.endswith("ists"):
return token[:-4] + "ist"
return f"{token[:-4]}ist"
if token.endswith("ments"):
return token[:-5] + "ment"
return f"{token[:-5]}ment"
if token.endswith("nces"):
return token[:-4] + "nce"
return f"{token[:-4]}nce"
if token.endswith("ships"):
return token[:-5] + "ship"
return f"{token[:-5]}ship"
if token.endswith("tions"):
return token[:-5] + "tion"
# verbs
return f"{token[:-5]}tion"
elif token.endswith("ed"):
if token.endswith("ated"):
return token[:-4] + "ate"
return f"{token[:-4]}ate"
if token.endswith("ened"):
return token[:-4] + "en"
return f"{token[:-4]}en"
if token.endswith("fied"):
return token[:-4] + "fy"
return f"{token[:-4]}fy"
if token.endswith("ized"):
return token[:-4] + "ize"
return f"{token[:-4]}ize"
return None


def apply_fi(token: str) -> Optional[str]:
"Apply pre-defined rules for Finnish."
global SUFFIX_RULES_FI

if SUFFIX_RULES_FI is None:
filename = "data/fi-rules.plzma"
filepath = str(Path(__file__).parent / filename)
with lzma.open(filepath, "rb") as filehandle:
SUFFIX_RULES_FI = pickle.load(filehandle)

for length in (6, 5, 4, 3):
if len(token) < length + 2:
continue # token is too short to try suffix rules
suffix = token[-length:]
if suffix in SUFFIX_RULES_FI:
return token[:-length] + SUFFIX_RULES_FI[suffix]
return None
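
The lookup strategy above is worth spelling out: longer suffixes are tried before shorter ones (6 down to 3 characters), and a rule only fires when the token is at least two characters longer than the suffix, so the remaining stem is never emptied. A self-contained sketch of the same idea, using a hypothetical toy rule table rather than the shipped data file:

from typing import Dict, Optional

def lookup_suffix(token: str, rules: Dict[str, str]) -> Optional[str]:
    # Same strategy as apply_fi: longest suffix first, with a minimum-length guard.
    for length in (6, 5, 4, 3):
        if len(token) < length + 2:
            continue  # token too short for this suffix length
        suffix = token[-length:]
        if suffix in rules:
            return token[:-length] + rules[suffix]
    return None

toy_rules = {"iseksi": "inen"}  # hypothetical entry
assert lookup_suffix("kansalaiseksi", toy_rules) == "kansalainen"
assert lookup_suffix("lyhyt", toy_rules) is None  # no suffix matches
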
17 changes: 16 additions & 1 deletion tests/test_rules.py
@@ -3,7 +3,7 @@
import logging
import pytest

from simplemma.rules import apply_rules, apply_de, apply_en
from simplemma.rules import apply_rules, apply_de, apply_en, apply_fi

logging.basicConfig(level=logging.DEBUG)

@@ -74,9 +74,24 @@ def test_apply_en():
# assert apply_en('realised') == 'realise'


def test_apply_fi():
"""Test Finnish rules."""
# doesn't exist
assert apply_fi("Whatawordicantbelieveit") is None
# nouns
assert apply_fi("kansalaiseksi") == "kansalainen"
assert apply_fi("huokoisten") == "huokoinen"
assert apply_fi("kasvatteja") == "kasvatti"


def test_apply_rules():
"""Test rules on all available languages."""
assert apply_rules("Pfifferlinge", "de") == "Pfifferling"
assert apply_rules("Pfifferlinge", "en") is None
assert apply_rules("Pfifferlinge", "fi") is None
assert apply_rules("atonements", "de") is None
assert apply_rules("atonements", "en") == "atonement"
assert apply_rules("atonements", "fi") is None
assert apply_rules("kansalaiseksi", "de") is None
assert apply_rules("kansalaiseksi", "en") is None
assert apply_rules("kansalaiseksi", "fi") == "kansalainen"