-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Sourcery refactored main branch #6
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -185,7 +185,7 @@ def frequency_filter(vocab: Dict[str, Entry], max_perc: float=50, min_perc: floa | |
return vocab | ||
|
||
|
||
def hyphenated_filter(vocab: Dict[str, Entry], perc: float=50, verbose: bool=False) -> Dict[str, Entry]: # threshold in percent | ||
def hyphenated_filter(vocab: Dict[str, Entry], perc: float=50, verbose: bool=False) -> Dict[str, Entry]: # threshold in percent | ||
'''Reduce dict size by deleting hyphenated tokens when the parts are frequent.''' | ||
deletions, old_len = [], len(vocab) | ||
myfreqs = np.array(sum_entries(vocab)) | ||
|
@@ -196,7 +196,7 @@ def hyphenated_filter(vocab: Dict[str, Entry], perc: float=50, verbose: bool=Fal | |
if firstpart in vocab and sum_entry(vocab[firstpart]) > threshold or \ | ||
secondpart in vocab and sum_entry(vocab[secondpart]) > threshold: | ||
deletions.append(word) | ||
if verbose is True: | ||
if verbose: | ||
print(sorted(deletions)) | ||
for item in deletions: | ||
del vocab[item] | ||
|
@@ -255,12 +255,10 @@ def freshness_filter(vocab: Dict[str, Entry], percentage: float=10) -> Dict[str, | |
thresh = len(series)*(percentage/100) | ||
freshnessindex = sum(series[-ceil(thresh):]) | ||
oldnessindex = sum(series[:ceil(thresh)]) | ||
if oldnessindex < datethreshold: | ||
#if oldnessindex < np.percentile(series, percentage): | ||
# continue | ||
if freshnessindex < np.percentile(series, percentage): | ||
deletions.append(token) | ||
# print(vocab[token], freshnessindex, oldnessindex, token) | ||
if oldnessindex < datethreshold and freshnessindex < np.percentile( | ||
series, percentage | ||
): | ||
deletions.append(token) | ||
Comment on lines
-258
to
+261
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
This removes the following comments (why?):
|
||
for item in deletions: | ||
del vocab[item] | ||
print_changes('freshness', old_len, len(vocab)) | ||
|
@@ -326,10 +324,12 @@ def sources_filter(vocab: Dict[str, Entry], myset: Set[str]) -> Dict[str, Entry] | |
def wordlist_filter(vocab: Dict[str, Entry], mylist: List[str], keep_words: bool=False) -> Dict[str, Entry]: | ||
'''Keep or discard words present in the input list.''' | ||
intersection = set(vocab) & set(mylist) | ||
if keep_words is False: | ||
deletions = list(intersection) | ||
else: | ||
deletions = [w for w in vocab if w not in intersection] | ||
deletions = ( | ||
[w for w in vocab if w not in intersection] | ||
if keep_words | ||
else list(intersection) | ||
) | ||
|
||
Comment on lines
-329
to
+332
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
old_len = len(vocab) | ||
for word in deletions: | ||
del vocab[word] | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -36,8 +36,7 @@ def find_files(dirname: str, maxdiff: int) -> Iterator[str]: | |
for thepath, _, files in walk(dirname): | ||
# check for dates in directory names | ||
if '-' in thepath: | ||
match = DATESEARCH.search(thepath) | ||
if match: | ||
if match := DATESEARCH.search(thepath): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
thisdiff = calc_timediff(match[0]) | ||
if thisdiff is not None and thisdiff > maxdiff: | ||
continue | ||
|
@@ -181,7 +180,7 @@ def read_file(filepath: str, *, maxdiff: int=1000, mindiff: int=0, authorregex: | |
source = mytree.findtext('.//tei:publisher', namespaces=NSPACE) | ||
# headings | ||
headwords = set() | ||
if details is True: | ||
if details: | ||
Comment on lines
-184
to
+183
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
bow = [' '.join(h.itertext()) for h in mytree.xpath('.//tei:fw|.//tei:head', namespaces=NSPACE)] | ||
headwords = {t for t in simple_tokenizer(' '.join(bow)) if is_relevant_input(t)} | ||
# process | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Function hyphenated_filter refactored with the following changes: (simplify-boolean-comparison)