-
Notifications
You must be signed in to change notification settings - Fork 0
/
occam.py
162 lines (119 loc) · 5.15 KB
/
occam.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
"""A module for removing embellishing adjectives and adverbs from text.
Functions:
occam
"""
from typing import List, Tuple
import spacy
import click
import math
# Load the spaCy pipeline "de_core_news_sm" once at import time; the module-level
# ``nlp`` object is what ``occam`` calls to turn the input text into a Doc.
nlp = spacy.load("de_core_news_sm")
# CLI entry point: gathers the input text, runs it through the spaCy pipeline ``nlp``,
# removes the selected parts of speech and prints the result or writes it to a file.
# NOTE: click displays the function docstring as the command's --help text, so the
# docstring is deliberately kept to a single line and the details live in comments.
@click.command()
@click.option("--file", "-f", "filename_", default="", help="File with text input.")
@click.option("--text", "-t", default="", help="String with text input.")
@click.option("--output", "-o", default="", help="Path to output file (optional, otherwise print to CLI).")
@click.option("--adj", "-j", is_flag=True, help="Only remove adjectives (default: remove adjectives and adverbs).")
@click.option("--adv", "-v", is_flag=True, help="Only remove adverbs.")
@click.option("--sub", "-s", default="", help="String by which adjectives and adverbs shall be replaced.")
@click.option("--count", "-c", is_flag=True, help="Display the number of removed words.")
def occam(filename_: str, text: str, output: str, adj: bool, adv: bool, sub: str, count: bool):
    """Remove adjectives and adverbs from text."""
    input_text = determine_input_text(filename_, text)
    doc = nlp(input_text)

    # The length handed to remove_removables is compared against *token* indices,
    # so it must be the number of tokens in the Doc, not the number of characters
    # in the raw string (a character count lets Token.nbor() run past the last token).
    token_count = len(doc)

    removables = determine_removables(adj, adv)
    output_text, counter = remove_removables(doc, token_count, removables, sub)

    if count:
        # The summary line reports "words", so punctuation and whitespace tokens
        # are left out of the denominator.
        word_count = sum(
            1 for token in doc if not (token.is_punct or token.is_space)
        )
        print_counter(counter, word_count)

    output_text = output_text.strip()
    show_output_text(output, output_text)
def determine_input_text(filepath_: str, text: str) -> str:
    """Determine whether input was supplied directly in a string or indirectly in a file.

    If both a string and a file path are supplied, the string comes first and the
    file content is appended directly to it (no separator is inserted).

    :param filepath_: str representing the path to a text file (empty if no file supplied by user)
    :param text: str representing the text input (empty if no string supplied by user)
    :return: the input text as a str
    :raises click.BadParameter: if neither the string nor the file yields any text
    """
    parts = []
    if text:
        parts.append(text)
    if filepath_:
        # Decode explicitly as UTF-8 instead of relying on the platform default,
        # which would garble non-ASCII characters (e.g. umlauts) on some systems.
        with open(filepath_, encoding="utf-8") as f:
            parts.append(f.read())

    input_text = "".join(parts)
    if not input_text:
        raise click.BadParameter(
            "You must supply the option --file or --text for text input."
        )
    return input_text
def determine_removables(adj: bool, adv: bool) -> List[str]:
    """Decide if only adjectives, only adverbs or both shall be removed.

    Exactly one flag set selects just that part of speech. No flag set, or both
    flags set, selects both, so that neither request is silently dropped.

    :param adj: bool representing whether adjectives shall be removed
    :param adv: bool representing whether adverbs shall be removed
    :return: a list of part of speech tags representing the POS to be removed
    """
    if adj and not adv:
        return ["ADJ"]
    if adv and not adj:
        return ["ADV"]
    # Neither flag (the default) or both flags: remove adjectives and adverbs.
    return ["ADJ", "ADV"]
def remove_removables(doc, text_length: int, removables: List[str], sub: str) -> Tuple[str, int]:
    """Rebuild the text of doc while dropping or replacing the selected parts of speech.

    Tokens whose ``pos_`` is listed in removables are counted and either left out
    entirely (empty sub) or replaced by sub. All other tokens are copied over
    unchanged. After every emitted piece, ``whitespace_needed`` decides whether a
    single space follows.

    :param doc: the input text as a spacy Doc object
    :param text_length: the input length as an int (forwarded to ``whitespace_needed``)
    :param removables: a List of part of speech tags representing the POS to be removed
    :param sub: str by which the removables shall be replaced (empty if not supplied by user)
    :return: the output text without removables, the number of removed tokens
    """
    pieces = []
    removed = 0
    for position, tok in enumerate(doc):
        if tok.pos_ in removables:
            removed += 1
            if not sub:
                # Dropped without replacement: nothing is emitted, not even a space.
                continue
            piece = sub
        else:
            piece = tok.text
        pieces.append(piece)
        if whitespace_needed(position, text_length, tok):
            pieces.append(" ")
    return "".join(pieces), removed
def whitespace_needed(index: int, text_length: int, token) -> bool:
    """Determine if a token and its successor should be separated by whitespace.

    No whitespace is needed after the last position, after a whitespace token, or
    when the following token is punctuation, whitespace or tagged "X".

    :param index: the int index of token in the input text
    :param text_length: the length of the input text as supplied by the caller
    :param token: the spacy Token object to be considered
    :return: True if whitespace is needed
             False otherwise
    """
    if index >= text_length - 1 or token.pos_ == "SPACE":
        return False
    try:
        following = token.nbor()
    except IndexError:
        # nbor() raises IndexError when there is no next token. This happens for
        # the final token whenever text_length overstates the number of tokens,
        # so the index check above cannot be relied on alone.
        return False
    return following.pos_ not in ("PUNCT", "SPACE", "X")
def print_counter(counter: int, text_length: int):
    """Report on stdout how many words were removed, absolute and as a percentage.

    The percentage is counter / text_length * 100, rounded up to the next integer.
    If either value is not positive, a "nothing to remove" message is printed instead.

    :param counter: int representing the number of removed words
    :param text_length: int representing the overall text length
    """
    nothing_removed = text_length <= 0 or counter <= 0
    if nothing_removed:
        print("-- nothing to remove: Occam would be proud of you :) --")
        return

    share = math.ceil(counter / text_length * 100)
    template = "-- {} words out of {} ({}%) have been removed --"
    print(template.format(counter, text_length, share))
def show_output_text(output: str, output_text: str):
    """Provide the output text to the user, either by printing it or writing it to a file.

    :param output: the path to an output file (empty if not supplied by the user)
    :param output_text: the result str
    """
    if not output:
        print(output_text)
        return
    # Write as UTF-8 explicitly so non-ASCII characters (e.g. umlauts) do not
    # depend on the platform's default encoding.
    with open(output, mode="w", encoding="utf-8") as f:
        f.write(output_text)
# Invoke the click command only when the module is executed as a script.
if __name__ == "__main__":
    occam()