Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sourcery refactored main branch #6

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 12 additions & 12 deletions shoten/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ def frequency_filter(vocab: Dict[str, Entry], max_perc: float=50, min_perc: floa
return vocab


def hyphenated_filter(vocab: Dict[str, Entry], perc: float=50, verbose: bool=False) -> Dict[str, Entry]: # threshold in percent
def hyphenated_filter(vocab: Dict[str, Entry], perc: float=50, verbose: bool=False) -> Dict[str, Entry]: # threshold in percent
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

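Function hyphenated_filter refactored with the following change:
- The boolean comparison `if verbose is True:` is simplified to the truthiness test `if verbose:` (equivalent for the annotated `bool` argument).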

'''Reduce dict size by deleting hyphenated tokens when the parts are frequent.'''
deletions, old_len = [], len(vocab)
myfreqs = np.array(sum_entries(vocab))
Expand All @@ -196,7 +196,7 @@ def hyphenated_filter(vocab: Dict[str, Entry], perc: float=50, verbose: bool=Fal
if firstpart in vocab and sum_entry(vocab[firstpart]) > threshold or \
secondpart in vocab and sum_entry(vocab[secondpart]) > threshold:
deletions.append(word)
if verbose is True:
if verbose:
print(sorted(deletions))
for item in deletions:
del vocab[item]
Expand Down Expand Up @@ -255,12 +255,10 @@ def freshness_filter(vocab: Dict[str, Entry], percentage: float=10) -> Dict[str,
thresh = len(series)*(percentage/100)
freshnessindex = sum(series[-ceil(thresh):])
oldnessindex = sum(series[:ceil(thresh)])
if oldnessindex < datethreshold:
#if oldnessindex < np.percentile(series, percentage):
# continue
if freshnessindex < np.percentile(series, percentage):
deletions.append(token)
# print(vocab[token], freshnessindex, oldnessindex, token)
if oldnessindex < datethreshold and freshnessindex < np.percentile(
series, percentage
):
deletions.append(token)
Comment on lines -258 to +261
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

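Function freshness_filter refactored with the following change:
- The nested conditions `if oldnessindex < datethreshold:` and `if freshnessindex < np.percentile(series, percentage):` are merged into a single `if ... and ...:` test.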

This removes the following comments (why?):

#    continue
#if oldnessindex < np.percentile(series, percentage):
# print(vocab[token], freshnessindex, oldnessindex, token)

for item in deletions:
del vocab[item]
print_changes('freshness', old_len, len(vocab))
Expand Down Expand Up @@ -326,10 +324,12 @@ def sources_filter(vocab: Dict[str, Entry], myset: Set[str]) -> Dict[str, Entry]
def wordlist_filter(vocab: Dict[str, Entry], mylist: List[str], keep_words: bool=False) -> Dict[str, Entry]:
'''Keep or discard words present in the input list.'''
intersection = set(vocab) & set(mylist)
if keep_words is False:
deletions = list(intersection)
else:
deletions = [w for w in vocab if w not in intersection]
deletions = (
[w for w in vocab if w not in intersection]
if keep_words
else list(intersection)
)

Comment on lines -329 to +332
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

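Function wordlist_filter refactored with the following changes:
- The `if`/`else` statement assigning `deletions` is replaced with a single conditional expression.
- The branches are swapped so the condition tests `keep_words` directly instead of `keep_words is False` (equivalent for the annotated `bool` argument).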

old_len = len(vocab)
for word in deletions:
del vocab[word]
Expand Down
5 changes: 2 additions & 3 deletions shoten/shoten.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,7 @@ def find_files(dirname: str, maxdiff: int) -> Iterator[str]:
for thepath, _, files in walk(dirname):
# check for dates in directory names
if '-' in thepath:
match = DATESEARCH.search(thepath)
if match:
if match := DATESEARCH.search(thepath):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

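Function find_files refactored with the following change:
- The assignment `match = DATESEARCH.search(thepath)` and the following `if match:` are combined into one named expression, `if match := DATESEARCH.search(thepath):` (requires Python 3.8+).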

thisdiff = calc_timediff(match[0])
if thisdiff is not None and thisdiff > maxdiff:
continue
Expand Down Expand Up @@ -181,7 +180,7 @@ def read_file(filepath: str, *, maxdiff: int=1000, mindiff: int=0, authorregex:
source = mytree.findtext('.//tei:publisher', namespaces=NSPACE)
# headings
headwords = set()
if details is True:
if details:
Comment on lines -184 to +183
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function read_file refactored with the following change:
- The boolean comparison `if details is True:` is simplified to the truthiness test `if details:`.

bow = [' '.join(h.itertext()) for h in mytree.xpath('.//tei:fw|.//tei:head', namespaces=NSPACE)]
headwords = {t for t in simple_tokenizer(' '.join(bow)) if is_relevant_input(t)}
# process
Expand Down