From e9bf10662714ebbc7e49c507d6ecc8b2c88563ba Mon Sep 17 00:00:00 2001
From: Sourcery AI <>
Date: Fri, 2 Sep 2022 15:50:16 +0000
Subject: [PATCH] 'Refactored by Sourcery'

---
 shoten/filters.py | 24 ++++++++++++------------
 shoten/shoten.py  |  5 ++---
 2 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/shoten/filters.py b/shoten/filters.py
index 37a2c57..5d4c6c0 100644
--- a/shoten/filters.py
+++ b/shoten/filters.py
@@ -185,7 +185,7 @@ def frequency_filter(vocab: Dict[str, Entry], max_perc: float=50, min_perc: floa
     return vocab
 
 
-def hyphenated_filter(vocab: Dict[str, Entry], perc: float=50, verbose: bool=False) -> Dict[str, Entry]: # threshold in percent
+def hyphenated_filter(vocab: Dict[str, Entry], perc: float=50, verbose: bool=False) -> Dict[str, Entry]:  # threshold in percent
     '''Reduce dict size by deleting hyphenated tokens when the parts are frequent.'''
     deletions, old_len = [], len(vocab)
     myfreqs = np.array(sum_entries(vocab))
@@ -196,7 +196,7 @@ def hyphenated_filter(vocab: Dict[str, Entry], perc: float=50, verbose: bool=Fal
         if firstpart in vocab and sum_entry(vocab[firstpart]) > threshold or \
            secondpart in vocab and sum_entry(vocab[secondpart]) > threshold:
             deletions.append(word)
-    if verbose is True:
+    if verbose:
         print(sorted(deletions))
     for item in deletions:
         del vocab[item]
@@ -255,12 +255,10 @@ def freshness_filter(vocab: Dict[str, Entry], percentage: float=10) -> Dict[str,
         thresh = len(series)*(percentage/100)
         freshnessindex = sum(series[-ceil(thresh):])
         oldnessindex = sum(series[:ceil(thresh)])
-        if oldnessindex < datethreshold:
-            #if oldnessindex < np.percentile(series, percentage):
-            #    continue
-            if freshnessindex < np.percentile(series, percentage):
-                deletions.append(token)
-            # print(vocab[token], freshnessindex, oldnessindex, token)
+        if oldnessindex < datethreshold and freshnessindex < np.percentile(
+            series, percentage
+        ):
+            deletions.append(token)
     for item in deletions:
         del vocab[item]
     print_changes('freshness', old_len, len(vocab))
@@ -326,10 +324,12 @@ def sources_filter(vocab: Dict[str, Entry], myset: Set[str]) -> Dict[str, Entry]
 def wordlist_filter(vocab: Dict[str, Entry], mylist: List[str], keep_words: bool=False) -> Dict[str, Entry]:
     '''Keep or discard words present in the input list.'''
     intersection = set(vocab) & set(mylist)
-    if keep_words is False:
-        deletions = list(intersection)
-    else:
-        deletions = [w for w in vocab if w not in intersection]
+    deletions = (
+        [w for w in vocab if w not in intersection]
+        if keep_words
+        else list(intersection)
+    )
+
     old_len = len(vocab)
     for word in deletions:
         del vocab[word]
diff --git a/shoten/shoten.py b/shoten/shoten.py
index ac9f592..ce90575 100644
--- a/shoten/shoten.py
+++ b/shoten/shoten.py
@@ -36,8 +36,7 @@ def find_files(dirname: str, maxdiff: int) -> Iterator[str]:
     for thepath, _, files in walk(dirname):
         # check for dates in directory names
         if '-' in thepath:
-            match = DATESEARCH.search(thepath)
-            if match:
+            if match := DATESEARCH.search(thepath):
                 thisdiff = calc_timediff(match[0])
                 if thisdiff is not None and thisdiff > maxdiff:
                     continue
@@ -181,7 +180,7 @@ def read_file(filepath: str, *, maxdiff: int=1000, mindiff: int=0, authorregex:
         source = mytree.findtext('.//tei:publisher', namespaces=NSPACE)
         # headings
         headwords = set()
-        if details is True:
+        if details:
            bow = [' '.join(h.itertext()) for h in mytree.xpath('.//tei:fw|.//tei:head', namespaces=NSPACE)]
            headwords = {t for t in simple_tokenizer(' '.join(bow)) if is_relevant_input(t)}
         # process
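
The two recurring rewrites in this patch are the walrus operator (assign-and-test merged into one `if`) and the conditional expression (an if/else assignment collapsed into one statement). The following minimal, self-contained sketch illustrates both patterns; the regex, function names and sample data below are made up for the example and are not taken from shoten, but the transformations mirror the ones applied to find_files and wordlist_filter and are behaviour-preserving.

    import re

    DATEPATTERN = re.compile(r'\d{4}-\d{2}-\d{2}')  # illustrative pattern, not shoten's DATESEARCH

    def first_date(path):
        # walrus operator: assign and test in one step, equivalent to
        # `match = DATEPATTERN.search(path)` followed by `if match:`
        if match := DATEPATTERN.search(path):
            return match[0]
        return None

    def pick(words, wordlist, keep_words=False):
        # conditional expression: equivalent to the four-line if/else assignment it replaces
        intersection = set(words) & set(wordlist)
        return (
            [w for w in words if w not in intersection]
            if keep_words
            else list(intersection)
        )

    assert first_date('corpus/2022-09-02/file.xml') == '2022-09-02'
    assert pick(['a', 'b', 'c'], ['b'], keep_words=True) == ['a', 'c']
    assert pick(['a', 'b', 'c'], ['b']) == ['b']

Both rewrites require Python 3.8 or later because of the walrus operator; the semantics are otherwise identical to the removed lines.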