
mypy: add type annotations (1)
adbar committed Jul 5, 2022
1 parent 904771c commit 28f4487
Showing 4 changed files with 50 additions and 44 deletions.
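
The annotations added in this commit can be checked with a plain mypy run over the package. The snippet below is a minimal sketch and not part of the commit; it assumes mypy is installed and is executed from the repository root, using mypy's documented programmatic entry point.

from mypy import api

# Type-check the package; equivalent to running `mypy simplemma/` on the command line.
stdout, stderr, exit_status = api.run(["simplemma/"])
print(stdout)
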
19 changes: 10 additions & 9 deletions simplemma/langdetect.py
@@ -4,14 +4,15 @@

from collections import Counter
from operator import itemgetter
from typing import List, Optional, Sequence, Tuple

from .simplemma import _load_data, _return_lemma


SPLIT_INPUT = re.compile(r'[^\W\d_]{3,}')


def prepare_text(text):
def prepare_text(text: str) -> List[str]:
"""Extract potential words, scramble them, extract the most frequent,
some of the rest, and return at most 1000 tokens."""
# generator expression to split the text
@@ -27,29 +28,29 @@ def prepare_text(text):
return [item[0] for item in counter.most_common(1000)]


def in_target_language(text, lang=None):
def in_target_language(text: str, lang: Optional[Tuple[str]]=None) -> float:
"""Determine which proportion of the text is in the target language(s)."""
total = 0
in_target = 0
for token in prepare_text(text):
total += 1
langdata = _load_data(lang)
for l in langdata:
candidate = _return_lemma(token, l.dict, greedy=True, lang=l.code)
candidate = _return_lemma(token, l.dict, greedy=True, lang=l.code) # type: ignore
if candidate is not None:
in_target += 1
break
return in_target/total


def _return_default():
def _return_default() -> Sequence[Tuple[str, float]]:
# todo: None if 'unk'?
return [('unk', 1)]


def lang_detector(text, lang=None, extensive=False):
def lang_detector(text: str, lang: Optional[Tuple[str]]=None, extensive: bool=False) -> Sequence[Tuple[Optional[str], float]]:
"""Determine which proportion of the text is in the target language(s)."""
myresults = {}
myresults = {} # Dict[str, float]
tokens = prepare_text(text)
total_tokens = len(tokens)
if total_tokens == 0:
@@ -58,9 +59,9 @@ def lang_detector(text, lang=None, extensive=False):
langdata = _load_data(lang)
for l in langdata:
if extensive is False:
in_target = len(list(filter(None, (_return_lemma(t, l.dict, greedy=False, lang=l.code) for t in tokens))))
in_target = len(list(filter(None, (_return_lemma(t, l.dict, greedy=False, lang=l.code) for t in tokens)))) # type: ignore
else:
in_target = len(list(filter(None, (_return_lemma(t, l.dict, greedy=True, lang=l.code) for t in tokens))))
in_target = len(list(filter(None, (_return_lemma(t, l.dict, greedy=True, lang=l.code) for t in tokens)))) # type: ignore
# compute results
found_ratio = in_target/total_tokens
myresults[l.code] = found_ratio
@@ -70,7 +71,7 @@
results = sorted(myresults.items(), key=itemgetter(1), reverse=True)
# in case of ex-aequo
if extensive is False and results[0][1] == results[1][1]:
results = lang_detector(text, lang=lang, extensive=True)
results = lang_detector(text, lang=lang, extensive=True) # type: ignore
if len(results) > 1 and results[0][1] == results[1][1]:
return _return_default()
return results
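
For reference, a usage sketch of the two public functions annotated above; it is not part of the commit, the sample sentence and language codes are only illustrative, and the exact scores depend on the bundled dictionaries.

from simplemma.langdetect import in_target_language, lang_detector

text = "Dieser Satz ist auf Deutsch geschrieben."
# share of the extracted tokens found in the German dictionary, between 0 and 1
ratio = in_target_language(text, lang=("de",))
# ranked list of (language code, score) pairs for the candidate languages
results = lang_detector(text, lang=("de", "en"))
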
8 changes: 5 additions & 3 deletions simplemma/rules.py
@@ -2,14 +2,16 @@

import re

from typing import Optional


ADJ_DE = re.compile(r'^(.+?)(arm|artig|bar|chig|ell|en|end|erig|ern|fach|frei|haft|iert|igt|isch|iv|lich|los|mäßig|reich|rig|sam|sch|schig|voll)(er|e?st)?(e|em|en|es|er)?$') # ig
# https://de.wiktionary.org/wiki/-ent

ENDING_DE = re.compile(r'(e|em|en|er|es)$')


def apply_rules(token, langcode):
def apply_rules(token: str, langcode: Optional[str]) -> Optional[str]:
'Apply pre-defined rules for certain languages.'
candidate = None
if langcode == 'de':
@@ -19,7 +21,7 @@ def apply_rules(token, langcode):
return candidate


def apply_de(token):
def apply_de(token: str) -> Optional[str]:
'Apply pre-defined rules for German.'
if token[0].isupper() and len(token) > 8:
if ENDING_DE.search(token):
@@ -50,7 +52,7 @@ def apply_de(token):
return None


def apply_en(token):
def apply_en(token: str) -> Optional[str]:
'Apply pre-defined rules for English.'
# nouns
if token.endswith('s'):
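
A usage sketch for the rule helpers above, not part of the commit; the tokens are illustrative and the functions simply return None when no rule applies.

from simplemma.rules import apply_rules

# returns a lemma candidate for German or English tokens, or None if no rule matches
print(apply_rules("Pseudowissenschaften", "de"))
print(apply_rules("computers", "en"))
print(apply_rules("ordinateurs", "fr"))  # no rules defined for this language code, returns None
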
63 changes: 32 additions & 31 deletions simplemma/simplemma.py
@@ -8,6 +8,7 @@

from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, List, Iterator, Optional, Tuple, Union

try:
from .rules import apply_rules
@@ -34,7 +35,7 @@
HYPHENS = {'-', '_'}
PUNCTUATION = {'.', '?', '!', '…', '¿', '¡'}

LANG_DATA = []
LANG_DATA = [] # type: List[LangDict]

#class LangData:
# "Class to store word pairs and relevant information."
@@ -49,23 +50,23 @@ class LangDict:
"Class to store word pairs and relevant information for a single language."
__slots__ = ('code', 'dict')

def __init__(self, langcode=None, langdict=None):
self.code = langcode
self.dict = langdict
def __init__(self, langcode: Optional[str]=None, langdict: Optional[Dict[str, str]]=None):
self.code: Optional[str] = langcode
self.dict: Optional[Dict[str, str]] = langdict


def _determine_path(listpath, langcode):
def _determine_path(listpath: str, langcode: str) -> str:
filename = f'{listpath}/{langcode}.txt'
return str(Path(__file__).parent / filename)


def _load_dict(langcode, listpath='lists', silent=True):
def _load_dict(langcode: str, listpath: str='lists', silent: bool=True) -> Dict[str, str]:
filepath = _determine_path(listpath, langcode)
return _read_dict(filepath, langcode, silent)


def _read_dict(filepath, langcode, silent):
mydict, myadditions, i = {}, [], 0
def _read_dict(filepath: str, langcode: str, silent: bool) -> Dict[str, str]:
mydict, myadditions, i = {}, [], 0 # type: Dict[str, str], List[str], int
leftlimit = 1 if langcode in SAFE_LIMIT else 2
# load data from list
with open(filepath , 'r', encoding='utf-8') as filehandle:
@@ -112,7 +113,7 @@ def _read_dict(filepath, langcode, silent):
return dict(sorted(mydict.items()))


def _pickle_dict(langcode):
def _pickle_dict(langcode: str) -> None:
mydict = _load_dict(langcode)
filename = f'data/{langcode}.plzma'
filepath = str(Path(__file__).parent / filename)
@@ -121,19 +122,19 @@ def _pickle_dict(langcode):
LOGGER.debug('%s %s', langcode, len(mydict))


def _load_pickle(langcode):
def _load_pickle(langcode: str) -> Dict[str, str]:
filename = f'data/{langcode}.plzma'
filepath = str(Path(__file__).parent / filename)
with lzma.open(filepath, 'rb') as filehandle:
return pickle.load(filehandle)
return pickle.load(filehandle) # type: ignore


def _load_data(langs):
def _load_data(langs: Optional[Tuple[str]]) -> List[LangDict]:
"""Decompress und unpickle lemmatization rules.
Takes one or several ISO 639-1 code language code as input.
Returns a list of dictionaries."""
langlist = []
for lang in langs:
for lang in langs: # type: ignore
if lang not in LANGLIST:
LOGGER.error('language not supported: %s', lang)
continue
@@ -142,7 +143,7 @@ def _load_data(langs):
return langlist


def _update_lang_data(lang):
def _update_lang_data(lang: Optional[Union[str, Tuple[str]]]) -> Tuple[str]:
# convert string
if isinstance(lang, str):
lang = (lang,)
@@ -157,7 +158,7 @@ def _update_lang_data(lang):


@lru_cache(maxsize=65536)
def _levenshtein_dist(str1, str2):
def _levenshtein_dist(str1: str, str2: str) -> int:
# inspired by this noticeably faster code:
# https://gist.github.com/p-hash/9e0f9904ce7947c133308fbe48fe032b
if str1 == str2:
@@ -198,7 +199,7 @@ def _levenshtein_dist(str1, str2):
# return True


def _simple_search(token, datadict, initial=False):
def _simple_search(token: str, datadict: Dict[str, str], initial: bool=False) -> Optional[str]:
# beginning of sentence, reverse case
if initial is True:
token = token.lower()
@@ -212,7 +213,7 @@ def _simple_search(token, datadict, initial=False):
return candidate


def _greedy_search(candidate, datadict, steps=1, distance=5):
def _greedy_search(candidate: str, datadict: Dict[str, str], steps: int=1, distance: int=5) -> str:
i = 0
while candidate in datadict and (
len(datadict[candidate]) < len(candidate) and
@@ -225,7 +226,7 @@ def _greedy_search(candidate, datadict, steps=1, distance=5):
return candidate


def _decompose(token, datadict, affixlen=0):
def _decompose(token: str, datadict: Dict[str, str], affixlen: int=0) -> Tuple[Optional[str], Optional[str]]:
candidate, plan_b = None, None
# this only makes sense for languages written from left to right
# AFFIXLEN or MINCOMPLEN can spare time for some languages
@@ -258,7 +259,7 @@ def _decompose(token, datadict, affixlen=0):
# backup: equal length or further candidates accepted
if candidate is None:
# try without capitalizing
newcandidate = _simple_search(part2, datadict)
newcandidate = _simple_search(part2, datadict) # type: ignore
if newcandidate and len(newcandidate) <= len(part2):
candidate = part1 + newcandidate.lower()
# even greedier
@@ -275,7 +276,7 @@ def _decompose(token, datadict, affixlen=0):
return candidate, plan_b


def _dehyphen(token, datadict, greedy):
def _dehyphen(token: str, datadict: Dict[str, str], greedy: bool) -> Optional[str]:
if not '-' in token and not '_' in token:
return None
splitted = HYPHEN_REGEX.split(token)
@@ -287,7 +288,7 @@ def _dehyphen(token, datadict, greedy):
if subcandidate in datadict:
return datadict[subcandidate]
# decompose
subcandidate = _simple_search(splitted[-1], datadict)
subcandidate = _simple_search(splitted[-1], datadict) # type: ignore
# search further
if subcandidate is None and greedy is True:
subcandidate = _affix_search(splitted[-1], datadict)
@@ -298,7 +299,7 @@ def _dehyphen(token, datadict, greedy):
return None


def _affix_search(wordform, datadict, maxlen=AFFIXLEN):
def _affix_search(wordform: str, datadict: Dict[str, str], maxlen: int=AFFIXLEN) -> Optional[str]:
for length in range(maxlen, 1, -1):
candidate, plan_b = _decompose(wordform, datadict, affixlen=length)
if candidate is not None:
@@ -309,7 +310,7 @@ def _affix_search(wordform, datadict, maxlen=AFFIXLEN):
return candidate


def _suffix_search(token, datadict):
def _suffix_search(token: str, datadict: Dict[str, str]) -> Optional[str]:
lastcount = 0
for count in range(MINCOMPLEN, len(token)-MINCOMPLEN+1):
#print(token[-count:], token[:-count], lastpart)
@@ -321,7 +322,7 @@ def _suffix_search(token, datadict):
return None


def _return_lemma(token, datadict, greedy=True, lang=None, initial=False):
def _return_lemma(token: str, datadict: Dict[str, str], greedy: bool=True, lang: Optional[str]=None, initial: bool=False) -> Optional[str]:
# filters
if token.isnumeric():
return token
@@ -357,21 +358,21 @@ def _return_lemma(token, datadict, greedy=True, lang=None, initial=False):
return candidate


def _control_input_type(token):
def _control_input_type(token: Any) -> None:
"Make sure the input is a string of length > 0."
if not isinstance(token, str):
raise TypeError(f'Wrong input type, expected string, got {type(token)}')
if token == '':
raise ValueError('Wrong input type: empty string')


def is_known(token, lang=None):
def is_known(token: str, lang: Optional[Union[str, Tuple[str]]]=None) -> bool:
"""Tell if a token is present in one of the loaded dictionaries.
Case-insensitive, whole word forms only. Returns True or False."""
_control_input_type(token)
_ = _update_lang_data(lang)
for language in LANG_DATA:
if _simple_search(token, language.dict) is not None:
if _simple_search(token, language.dict) is not None: # type: ignore
return True
return False
# suggestion:
@@ -381,7 +382,7 @@ def is_known(token, lang=None):


@lru_cache(maxsize=1048576)
def lemmatize(token, lang=None, greedy=False, silent=True, initial=False):
def lemmatize(token: str, lang: Optional[Union[str, Tuple[str]]]=None, greedy: bool=False, silent: bool=True, initial: bool=False) -> str:
"""Try to reduce a token to its lemma form according to the
language list passed as input.
Returns a string.
@@ -394,7 +395,7 @@ def lemmatize(token, lang=None, greedy=False, silent=True, initial=False):
#if greedy is None:
# greedy = _define_greediness(language)
# determine lemma
candidate = _return_lemma(token, l.dict, greedy=greedy, lang=l.code, initial=initial)
candidate = _return_lemma(token, l.dict, greedy=greedy, lang=l.code, initial=initial) # type: ignore
if candidate is not None:
if i != 1:
LOGGER.debug('%s found in %s', token, l.code)
@@ -407,7 +408,7 @@ def lemmatize(token, lang=None, greedy=False, silent=True, initial=False):
return token


def text_lemmatizer(text, lang=None, greedy=False, silent=True):
def text_lemmatizer(text: str, lang: Optional[Union[str, Tuple[str]]]=None, greedy: bool=False, silent: bool=True) -> List[str]:
"""Convenience function to lemmatize a text using a simple tokenizer.
Returns a list of tokens and lemmata."""
lemmata = []
@@ -424,7 +425,7 @@ def text_lemmatizer(text, lang=None, greedy=False, silent=True):
return lemmata


def lemma_iterator(text, lang=None, greedy=False, silent=True):
def lemma_iterator(text: str, lang: Optional[Union[str, Tuple[str]]]=None, greedy: bool=False, silent: bool=True) -> Iterator[str]:
"""Convenience function to lemmatize a text using a simple tokenizer.
Returns a list of tokens and lemmata."""
last = '.' # beginning is initial
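
A usage sketch of the public API annotated above, not part of the commit; outputs depend on the bundled language data, so the comments only describe the expected behaviour.

from simplemma.simplemma import lemmatize, is_known, text_lemmatizer

# reduce a single token to its lemma, falling back to the token itself if nothing is found
print(lemmatize("Bäume", lang="de"))
# dictionary lookup of a whole word form, case-insensitive
print(is_known("tables", lang="en"))
# tokenize and lemmatize a short text in one call
print(text_lemmatizer("Sie sind Beispiele.", lang="de"))
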
4 changes: 3 additions & 1 deletion simplemma/tokenizer.py
@@ -2,11 +2,13 @@

import re

from typing import Iterator, List, Match, Union


TOKREGEX = re.compile(r'(?:(?:[0-9][0-9.,:%-]*|St\.)[\w_€-]+|https?://[^ ]+|[@#§$]?\w[\w*_-]*|[,;:\.?!¿¡‽⸮…()\[\]–{}—―/‒_“„”⹂‚‘’‛′″‟\'"«»‹›<>=+−×÷•·]+)')


def simple_tokenizer(text, iterate=False):
def simple_tokenizer(text: str, iterate: bool=False) -> Union[Iterator[Match[str]], List[str]]:
"""Simple regular expression adapted from NLTK.
Takes a string as input and returns a list of tokens.
Provided for convenience and educational purposes."""
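
A usage sketch of the tokenizer above, not part of the commit.

from simplemma.tokenizer import simple_tokenizer

# by default, returns a list of token strings
print(simple_tokenizer("Lorem ipsum, dolor sit amet."))
# with iterate=True, yields regex match objects instead of building a list
for match in simple_tokenizer("Lorem ipsum.", iterate=True):
    print(match.group())
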
