-
Notifications
You must be signed in to change notification settings - Fork 7
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Sourcery refactored master branch #8
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -55,7 +55,7 @@ def regex_or(*items): | |
r"va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)" #TODO: remove obscure country domains? | ||
urlStart2 = r"\b(?:[A-Za-z\d-])+(?:\.[A-Za-z0-9]+){0,3}\." + regex_or(commonTLDs, ccTLDs) + r"(?:\."+ccTLDs+r")?(?=\W|$)" | ||
urlBody = r"(?:[^\.\s<>][^\s<>]*?)?" | ||
urlExtraCrapBeforeEnd = regex_or(punctChars, entity) + "+?" | ||
urlExtraCrapBeforeEnd = f"{regex_or(punctChars, entity)}+?" | ||
urlEnd = r"(?:\.\.+|[<>]|\s|$)" | ||
url = regex_or(urlStart1, urlStart2) + urlBody + "(?=(?:"+urlExtraCrapBeforeEnd+")?"+urlEnd+")" | ||
|
||
|
@@ -102,34 +102,36 @@ def regex_or(*items): | |
s5 = "(?:[.][_]+[.])" | ||
# myleott: in Python the (?i) flag affects the whole expression | ||
#basicface = "(?:(?i)" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5 | ||
basicface = "(?:" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5 | ||
basicface = f"(?:{bfLeft}{bfCenter}{bfRight})|{s3}|{s4}|{s5}" | ||
|
||
eeLeft = r"[\\\ƪԄ\((<>;ヽ\-=~\*]+" | ||
eeRight= u"[\\-=\\);'\u0022<>ʃ)//ノノ丿╯σっµ~\\*]+".encode('utf-8') | ||
eeSymbol = r"[^A-Za-z0-9\s\(\)\*:=-]" | ||
eastEmote = eeLeft + "(?:"+basicface+"|" +eeSymbol+")+" + eeRight | ||
eastEmote = f"{eeLeft}(?:{basicface}|{eeSymbol})+{eeRight}" | ||
|
||
oOEmote = r"(?:[oO]" + bfCenter + r"[oO])" | ||
oOEmote = f"(?:[oO]{bfCenter}[oO])" | ||
|
||
|
||
emoticon = regex_or( | ||
# Standard version :) :( :] :D :P | ||
"(?:>|>)?" + regex_or(normalEyes, wink) + regex_or(noseArea,"[Oo]") + regex_or(tongue+r"(?=\W|$|RT|rt|Rt)", otherMouths+r"(?=\W|$|RT|rt|Rt)", sadMouths, happyMouths), | ||
|
||
# reversed version (: D: use positive lookbehind to remove "(word):" | ||
# because eyes on the right side is more ambiguous with the standard usage of : ; | ||
regex_or("(?<=(?: ))", "(?<=(?:^))") + regex_or(sadMouths,happyMouths,otherMouths) + noseArea + regex_or(normalEyes, wink) + "(?:<|<)?", | ||
|
||
#inspired by http://en.wikipedia.org/wiki/User:Scapler/emoticons#East_Asian_style | ||
eastEmote.replace("2", "1", 1), basicface, | ||
# iOS 'emoji' characters (some smileys, some symbols) [\ue001-\uebbb] | ||
# TODO should try a big precompiled lexicon from Wikipedia, Dan Ramage told me (BTO) he does this | ||
|
||
# myleott: o.O and O.o are two of the biggest sources of differences | ||
# between this and the Java version. One little hack won't hurt... | ||
oOEmote | ||
f"(?:>|>)?{regex_or(normalEyes, wink)}" | ||
+ regex_or(noseArea, "[Oo]") | ||
+ regex_or( | ||
tongue + r"(?=\W|$|RT|rt|Rt)", | ||
otherMouths + r"(?=\W|$|RT|rt|Rt)", | ||
sadMouths, | ||
happyMouths, | ||
), | ||
regex_or("(?<=(?: ))", "(?<=(?:^))") | ||
+ regex_or(sadMouths, happyMouths, otherMouths) | ||
+ noseArea | ||
+ regex_or(normalEyes, wink) | ||
+ "(?:<|<)?", | ||
eastEmote.replace("2", "1", 1), | ||
basicface, | ||
oOEmote, | ||
) | ||
|
||
|
||
Hearts = "(?:<+/?3+)+" #the other hearts are in decorations | ||
|
||
Arrows = regex_or(r"(?:<*[-―—=]*>+|<+[-―—=]*>*)", u"[\u2190-\u21ff]+".encode('utf-8')) | ||
|
@@ -188,11 +190,16 @@ def regex_or(*items): | |
# Note the 'smart quotes' (http://en.wikipedia.org/wiki/Smart_quotes) | ||
#edgePunctChars = r"'\"“”‘’«»{}\(\)\[\]\*&" #add \\p{So}? (symbols) | ||
edgePunctChars = u"'\"“”‘’«»{}\\(\\)\\[\\]\\*&" #add \\p{So}? (symbols) | ||
edgePunct = "[" + edgePunctChars + "]" | ||
edgePunct = f"[{edgePunctChars}]" | ||
notEdgePunct = "[a-zA-Z0-9]" # content characters | ||
offEdge = r"(^|$|:|;|\s|\.|,)" # colon here gets "(hello):" ==> "( hello ):" | ||
EdgePunctLeft = re.compile(offEdge + "("+edgePunct+"+)("+notEdgePunct+")", re.UNICODE) | ||
EdgePunctRight = re.compile("("+notEdgePunct+")("+edgePunct+"+)" + offEdge, re.UNICODE) | ||
EdgePunctLeft = re.compile( | ||
f"{offEdge}({edgePunct}+)({notEdgePunct})", re.UNICODE | ||
) | ||
|
||
EdgePunctRight = re.compile( | ||
f"({notEdgePunct})({edgePunct}+){offEdge}", re.UNICODE | ||
) | ||
|
||
def splitEdgePunct(input): | ||
input = EdgePunctLeft.sub(r"\1\2 \3", input) | ||
|
@@ -230,8 +237,7 @@ def simpleTokenize(text): | |
# has an even length and no indices are the same | ||
indices = [0] | ||
for (first, second) in badSpans: | ||
indices.append(first) | ||
indices.append(second) | ||
indices.extend((first, second)) | ||
Comment on lines -233 to +240
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
indices.append(textLength) | ||
|
||
# Group the indices and map them to their respective portion of the string | ||
|
@@ -271,8 +277,7 @@ def squeezeWhitespace(input): | |
|
||
# Final pass tokenization based on special patterns | ||
def splitToken(token): | ||
m = Contractions.search(token) | ||
if m: | ||
if m := Contractions.search(token): | ||
Comment on lines -274 to +280
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
return [m.group(1), m.group(2)] | ||
return [token] | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,10 +22,7 @@ | |
n = args.number if args.number is not None else len(w) | ||
|
||
def show(feat): | ||
if args.printfeat: | ||
return feat | ||
else: | ||
return repr(feat) | ||
return feat if args.printfeat else repr(feat) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
||
if args.column is not None: | ||
for key in sorted(w, key=lambda x:w[x][args.column], reverse=True)[:n]: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -97,7 +97,7 @@ def ngram_select(doc_count, max_order=MAX_NGRAM_ORDER, tokens_per_order=TOKENS_P | |
# Work out the set of features to compute IG | ||
features = set() | ||
for i in range(1, max_order+1): | ||
d = dict( (k, doc_count[k]) for k in doc_count if len(k) == i) | ||
d = {k: doc_count[k] for k in doc_count if len(k) == i} | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
features |= set(sorted(d, key=d.get, reverse=True)[:tokens_per_order]) | ||
features = sorted(features) | ||
|
||
|
@@ -124,11 +124,7 @@ def ngram_select(doc_count, max_order=MAX_NGRAM_ORDER, tokens_per_order=TOKENS_P | |
if not(args.tokens) and not(args.tokens_per_order): | ||
args.tokens_per_order = TOKENS_PER_ORDER | ||
|
||
if args.features: | ||
feature_path = args.features | ||
else: | ||
feature_path = os.path.join(args.model, 'DFfeats') | ||
|
||
feature_path = args.features or os.path.join(args.model, 'DFfeats') | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Lines
|
||
bucketlist_path = os.path.join(args.model, 'bucketlist') | ||
|
||
# display paths | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -130,7 +130,7 @@ def pass_IG(bucket): | |
else: | ||
# binarized event space | ||
# Compute IG binarized with respect to each event | ||
ig = list() | ||
ig = [] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
for event_id in range(num_event): | ||
num_doc = __dist.sum() | ||
prior = numpy.array((num_doc - __dist[event_id], __dist[event_id]), dtype=float) / num_doc | ||
|
@@ -196,11 +196,7 @@ def read_dist(path): | |
if not(args.domain or args.lang) or (args.domain and args.lang): | ||
parser.error("exactly one of domain(-d) or language (-l) must be specified") | ||
|
||
if args.features: | ||
feature_path = args.features | ||
else: | ||
feature_path = os.path.join(args.model, 'DFfeats') | ||
|
||
feature_path = args.features or os.path.join(args.model, 'DFfeats') | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Lines
|
||
bucketlist_path = os.path.join(args.model, 'bucketlist') | ||
|
||
if not os.path.exists(feature_path): | ||
|
@@ -218,10 +214,9 @@ def read_dist(path): | |
else: | ||
raise ValueError("no event specified") | ||
|
||
if args.weights: | ||
weights_path = args.weights | ||
else: | ||
weights_path = os.path.join(args.model, 'IGweights' + suffix + ('.bin' if args.binarize else '')) | ||
weights_path = args.weights or os.path.join( | ||
args.model, f'IGweights{suffix}' + ('.bin' if args.binarize else '') | ||
) | ||
|
||
# display paths | ||
print("model path:", args.model ) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -88,7 +88,7 @@ def select_LD_features(ig_lang, ig_domain, feats_per_lang, ignore_domain=False): | |
|
||
lang_w_path = os.path.join(args.model, 'IGweights.lang.bin') | ||
domain_w_path = os.path.join(args.model, 'IGweights.domain') | ||
feature_path = args.output if args.output else os.path.join(args.model, 'LDfeats') | ||
feature_path = args.output or os.path.join(args.model, 'LDfeats') | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Lines
|
||
|
||
# display paths | ||
print("model path:", args.model) | ||
|
@@ -97,11 +97,11 @@ def select_LD_features(ig_lang, ig_domain, feats_per_lang, ignore_domain=False): | |
print("feature output path:", feature_path) | ||
|
||
lang_w = read_weights(lang_w_path) | ||
domain_w = read_weights(domain_w_path) if not args.no_domain_ig else None | ||
domain_w = None if args.no_domain_ig else read_weights(domain_w_path) | ||
|
||
features_per_lang = select_LD_features(lang_w, domain_w, args.feats_per_lang, ignore_domain=args.no_domain_ig) | ||
if args.per_lang: | ||
with open(feature_path + '.perlang', 'w') as f: | ||
with open(f'{feature_path}.perlang', 'w') as f: | ||
writer = csv.writer(f) | ||
for i in range(len(features_per_lang)): | ||
writer.writerow(map(repr,features_per_lang[i])) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -58,8 +58,7 @@ | |
def offsets(chunks): | ||
# Work out the path chunk start offsets | ||
chunk_offsets = [0] | ||
for c in chunks: | ||
chunk_offsets.append(chunk_offsets[-1] + len(c)) | ||
chunk_offsets.extend(chunk_offsets[-1] + len(c) for c in chunks) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
return chunk_offsets | ||
|
||
def state_trace(path): | ||
|
@@ -154,8 +153,7 @@ def learn_pc(cm): | |
@returns nb_pc: log(P(C)) | ||
""" | ||
pc = np.log(cm.sum(0)) | ||
nb_pc = array.array('d', pc) | ||
return nb_pc | ||
return array.array('d', pc) | ||
Comment on lines -157 to +156
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
||
def generate_cm(items, num_classes): | ||
""" | ||
|
@@ -185,7 +183,11 @@ def learn_ptc(paths, tk_nextmove, tk_output, cm, temp_path, args): | |
chunksize = min(len(paths) / (mp.cpu_count()*2), args.chunksize) | ||
|
||
# TODO: Set the output dir | ||
b_dirs = [ tempfile.mkdtemp(prefix="train-",suffix='-bucket', dir=temp_path) for i in range(args.buckets) ] | ||
b_dirs = [ | ||
tempfile.mkdtemp(prefix="train-", suffix='-bucket', dir=temp_path) | ||
for _ in range(args.buckets) | ||
] | ||
|
||
Comment on lines -188 to +190
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
||
output_states = set(tk_output) | ||
|
||
|
@@ -240,21 +242,9 @@ def cleanup(): | |
parser.add_argument("--buckets", type=int, metavar='N', help="distribute features into N buckets", default=NUM_BUCKETS) | ||
args = parser.parse_args() | ||
|
||
if args.temp: | ||
temp_path = args.temp | ||
else: | ||
temp_path = os.path.join(args.model, 'buckets') | ||
|
||
if args.scanner: | ||
scanner_path = args.scanner | ||
else: | ||
scanner_path = os.path.join(args.model, 'LDfeats.scanner') | ||
|
||
if args.output: | ||
output_path = args.output | ||
else: | ||
output_path = os.path.join(args.model, 'model') | ||
|
||
temp_path = args.temp or os.path.join(args.model, 'buckets') | ||
scanner_path = args.scanner or os.path.join(args.model, 'LDfeats.scanner') | ||
output_path = args.output or os.path.join(args.model, 'model') | ||
Comment on lines -243 to +247
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Lines
|
||
index_path = os.path.join(args.model, 'paths') | ||
lang_path = os.path.join(args.model, 'lang_index') | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,9 +35,10 @@ def chunk(seq, chunksize): | |
""" | ||
seq_iter = iter(seq) | ||
while True: | ||
chunk = tuple(islice(seq_iter, chunksize)) | ||
if not chunk: break | ||
yield chunk | ||
if chunk := tuple(islice(seq_iter, chunksize)): | ||
yield chunk | ||
else: | ||
break | ||
Comment on lines -38 to +41
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
||
def unmarshal_iter(path): | ||
""" | ||
|
@@ -131,13 +132,10 @@ def MapPool(processes=None, initializer=None, initargs=None, maxtasksperchild=No | |
|
||
if processes > 1: | ||
with closing( mp.Pool(processes, initializer, initargs, maxtasksperchild)) as pool: | ||
f = lambda fn, chunks: pool.imap_unordered(fn, chunks, chunksize=chunksize) | ||
yield f | ||
yield lambda fn, chunks: pool.imap_unordered(fn, chunks, chunksize=chunksize) | ||
else: | ||
if initializer is not None: | ||
initializer(*initargs) | ||
f = imap | ||
yield f | ||
|
||
yield imap | ||
Comment on lines -134 to +139
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
if processes > 1: | ||
pool.join() |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -91,10 +91,10 @@ def __init__(self, root, min_domain=MIN_DOMAIN, proportion=TRAIN_PROP, langs=Non | |
self.domain_index = defaultdict(Enumerator()) | ||
else: | ||
# pre-specified domain set | ||
self.domain_index = dict((k,v) for v,k in enumerate(domains)) | ||
self.domain_index = {k: v for v,k in enumerate(domains)} | ||
|
||
self.coverage_index = defaultdict(set) | ||
self.items = list() | ||
self.items = [] | ||
Comment on lines -94 to +97
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
||
self.index(root) | ||
self.prune_min_domain(self.min_domain) | ||
|
@@ -139,15 +139,14 @@ def prune_min_domain(self, min_domain): | |
for langs in self.coverage_index.values(): | ||
for lang in langs: | ||
lang_domain_count[lang] += 1 | ||
reject_langs = set( l for l in lang_domain_count if lang_domain_count[l] < min_domain) | ||
|
||
# Remove the languages from the indexer | ||
if reject_langs: | ||
if reject_langs := { | ||
l for l in lang_domain_count if lang_domain_count[l] < min_domain | ||
}: | ||
#print "reject (<{0} domains): {1}".format(min_domain, sorted(reject_langs)) | ||
reject_ids = set(self.lang_index[l] for l in reject_langs) | ||
reject_ids = {self.lang_index[l] for l in reject_langs} | ||
|
||
new_lang_index = defaultdict(Enumerator()) | ||
lm = dict() | ||
lm = {} | ||
Comment on lines -142 to +149
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
This removes the following comments (why?):
|
||
for k,v in self.lang_index.items(): | ||
if v not in reject_ids: | ||
new_id = new_lang_index[k] | ||
|
@@ -215,11 +214,7 @@ def paths(self): | |
args = parser.parse_args() | ||
|
||
corpus_name = os.path.basename(args.corpus) | ||
if args.model: | ||
model_dir = args.model | ||
else: | ||
model_dir = os.path.join('.', corpus_name+'.model') | ||
|
||
model_dir = args.model or os.path.join('.', f'{corpus_name}.model') | ||
Comment on lines -218 to +217
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Lines
|
||
makedir(model_dir) | ||
|
||
langs_path = os.path.join(model_dir, 'lang_index') | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
Lines 58-195 refactored with the following changes: (use-fstring-for-concatenation)
This removes the following comments (why?):