Sourcery refactored master branch #8

Closed · wants to merge 1 commit
57 changes: 31 additions & 26 deletions py3langid/examples/_twokenize.py
@@ -55,7 +55,7 @@ def regex_or(*items):
r"va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)" #TODO: remove obscure country domains?
urlStart2 = r"\b(?:[A-Za-z\d-])+(?:\.[A-Za-z0-9]+){0,3}\." + regex_or(commonTLDs, ccTLDs) + r"(?:\."+ccTLDs+r")?(?=\W|$)"
urlBody = r"(?:[^\.\s<>][^\s<>]*?)?"
urlExtraCrapBeforeEnd = regex_or(punctChars, entity) + "+?"
urlExtraCrapBeforeEnd = f"{regex_or(punctChars, entity)}+?"
Lines 58-195 refactored with the following changes:

This removes the following comments ( why? ):

# Standard version  :) :( :] :D :P
# iOS 'emoji' characters (some smileys, some symbols) [\ue001-\uebbb]
#inspired by http://en.wikipedia.org/wiki/User:Scapler/emoticons#East_Asian_style
# TODO should try a big precompiled lexicon from Wikipedia, Dan Ramage told me (BTO) he does this
#          between this and the Java version. One little hack won't hurt...
# reversed version (: D:  use positive lookbehind to remove "(word):"
# myleott: o.O and O.o are two of the biggest sources of differences
# because eyes on the right side is more ambiguous with the standard usage of : ;

urlEnd = r"(?:\.\.+|[<>]|\s|$)"
url = regex_or(urlStart1, urlStart2) + urlBody + "(?=(?:"+urlExtraCrapBeforeEnd+")?"+urlEnd+")"
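Note: the rewrites in this hunk are purely mechanical; concatenation of regex fragments becomes f-string interpolation, and the resulting pattern string is unchanged. A minimal sketch of the equivalence, using illustrative stand-ins rather than the module's actual fragment definitions:

def regex_or(*items):
    # Plausible stand-in for the module's helper: non-capturing group of alternatives.
    return '(?:' + '|'.join(items) + ')'

# Hypothetical fragment values, not the ones defined in _twokenize.py.
punctChars = r"['.?!,:;]"
entity = r"&(?:amp|lt|gt|quot);"

old_style = regex_or(punctChars, entity) + "+?"    # original concatenation
new_style = f"{regex_or(punctChars, entity)}+?"    # refactored f-string
assert old_style == new_style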

@@ -102,34 +102,36 @@ def regex_or(*items):
s5 = "(?:[.][_]+[.])"
# myleott: in Python the (?i) flag affects the whole expression
#basicface = "(?:(?i)" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5
basicface = "(?:" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5
basicface = f"(?:{bfLeft}{bfCenter}{bfRight})|{s3}|{s4}|{s5}"

eeLeft = r"[\\\ƪԄ\((<>;ヽ\-=~\*]+"
eeRight= u"[\\-=\\);'\u0022<>ʃ)//ノノ丿╯σっµ~\\*]+".encode('utf-8')
eeSymbol = r"[^A-Za-z0-9\s\(\)\*:=-]"
eastEmote = eeLeft + "(?:"+basicface+"|" +eeSymbol+")+" + eeRight
eastEmote = f"{eeLeft}(?:{basicface}|{eeSymbol})+{eeRight}"

oOEmote = r"(?:[oO]" + bfCenter + r"[oO])"
oOEmote = f"(?:[oO]{bfCenter}[oO])"


emoticon = regex_or(
# Standard version :) :( :] :D :P
"(?:>|&gt;)?" + regex_or(normalEyes, wink) + regex_or(noseArea,"[Oo]") + regex_or(tongue+r"(?=\W|$|RT|rt|Rt)", otherMouths+r"(?=\W|$|RT|rt|Rt)", sadMouths, happyMouths),

# reversed version (: D: use positive lookbehind to remove "(word):"
# because eyes on the right side is more ambiguous with the standard usage of : ;
regex_or("(?<=(?: ))", "(?<=(?:^))") + regex_or(sadMouths,happyMouths,otherMouths) + noseArea + regex_or(normalEyes, wink) + "(?:<|&lt;)?",

#inspired by http://en.wikipedia.org/wiki/User:Scapler/emoticons#East_Asian_style
eastEmote.replace("2", "1", 1), basicface,
# iOS 'emoji' characters (some smileys, some symbols) [\ue001-\uebbb]
# TODO should try a big precompiled lexicon from Wikipedia, Dan Ramage told me (BTO) he does this

# myleott: o.O and O.o are two of the biggest sources of differences
# between this and the Java version. One little hack won't hurt...
oOEmote
f"(?:>|&gt;)?{regex_or(normalEyes, wink)}"
+ regex_or(noseArea, "[Oo]")
+ regex_or(
tongue + r"(?=\W|$|RT|rt|Rt)",
otherMouths + r"(?=\W|$|RT|rt|Rt)",
sadMouths,
happyMouths,
),
regex_or("(?<=(?: ))", "(?<=(?:^))")
+ regex_or(sadMouths, happyMouths, otherMouths)
+ noseArea
+ regex_or(normalEyes, wink)
+ "(?:<|&lt;)?",
eastEmote.replace("2", "1", 1),
basicface,
oOEmote,
)


Hearts = "(?:<+/?3+)+" #the other hearts are in decorations

Arrows = regex_or(r"(?:<*[-―—=]*>+|<+[-―—=]*>*)", u"[\u2190-\u21ff]+".encode('utf-8'))
@@ -188,11 +190,16 @@ def regex_or(*items):
# Note the 'smart quotes' (http://en.wikipedia.org/wiki/Smart_quotes)
#edgePunctChars = r"'\"“”‘’«»{}\(\)\[\]\*&" #add \\p{So}? (symbols)
edgePunctChars = u"'\"“”‘’«»{}\\(\\)\\[\\]\\*&" #add \\p{So}? (symbols)
edgePunct = "[" + edgePunctChars + "]"
edgePunct = f"[{edgePunctChars}]"
notEdgePunct = "[a-zA-Z0-9]" # content characters
offEdge = r"(^|$|:|;|\s|\.|,)" # colon here gets "(hello):" ==> "( hello ):"
EdgePunctLeft = re.compile(offEdge + "("+edgePunct+"+)("+notEdgePunct+")", re.UNICODE)
EdgePunctRight = re.compile("("+notEdgePunct+")("+edgePunct+"+)" + offEdge, re.UNICODE)
EdgePunctLeft = re.compile(
f"{offEdge}({edgePunct}+)({notEdgePunct})", re.UNICODE
)

EdgePunctRight = re.compile(
f"({notEdgePunct})({edgePunct}+){offEdge}", re.UNICODE
)

def splitEdgePunct(input):
input = EdgePunctLeft.sub(r"\1\2 \3", input)
Expand Down Expand Up @@ -230,8 +237,7 @@ def simpleTokenize(text):
# has an even length and no indices are the same
indices = [0]
for (first, second) in badSpans:
indices.append(first)
indices.append(second)
indices.extend((first, second))
Comment on lines -233 to +240
Function simpleTokenize refactored with the following changes:

indices.append(textLength)

# Group the indices and map them to their respective portion of the string
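The two appends in the loop body collapse into a single extend() with a tuple, the change flagged in the comment above; the resulting list is identical. A quick check with hypothetical spans:

badSpans = [(3, 7), (12, 15)]   # hypothetical (start, end) pairs

a = [0]
for (first, second) in badSpans:
    a.append(first)
    a.append(second)

b = [0]
for (first, second) in badSpans:
    b.extend((first, second))

assert a == b == [0, 3, 7, 12, 15]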
@@ -271,8 +277,7 @@ def squeezeWhitespace(input):

# Final pass tokenization based on special patterns
def splitToken(token):
m = Contractions.search(token)
if m:
if m := Contractions.search(token):
Comment on lines -274 to +280
Function splitToken refactored with the following changes:

return [m.group(1), m.group(2)]
return [token]
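splitToken now binds and tests the match in one assignment expression, which requires Python 3.8 or later; both forms return the same value. A self-contained sketch using a stand-in pattern in place of the module's Contractions regex (the real one differs):

import re

# Illustrative stand-in only; not the Contractions pattern defined in _twokenize.py.
Contractions = re.compile(r"(?i)\b(\w+)('s|'re|'ll|n't)\b")

def splitToken(token):
    if m := Contractions.search(token):     # walrus: assign and test in one step
        return [m.group(1), m.group(2)]
    return [token]

assert splitToken("can't") == ["ca", "n't"]
assert splitToken("token") == ["token"]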

5 changes: 1 addition & 4 deletions py3langid/tools/printfeats.py
@@ -22,10 +22,7 @@
n = args.number if args.number is not None else len(w)

def show(feat):
if args.printfeat:
return feat
else:
return repr(feat)
return feat if args.printfeat else repr(feat)
Function show refactored with the following changes:


if args.column is not None:
for key in sorted(w, key=lambda x:w[x][args.column], reverse=True)[:n]:
8 changes: 2 additions & 6 deletions py3langid/train/DFfeatureselect.py
@@ -97,7 +97,7 @@ def ngram_select(doc_count, max_order=MAX_NGRAM_ORDER, tokens_per_order=TOKENS_P
# Work out the set of features to compute IG
features = set()
for i in range(1, max_order+1):
d = dict( (k, doc_count[k]) for k in doc_count if len(k) == i)
d = {k: doc_count[k] for k in doc_count if len(k) == i}
Function ngram_select refactored with the following changes:

features |= set(sorted(d, key=d.get, reverse=True)[:tokens_per_order])
features = sorted(features)
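The dict() call over a generator of pairs becomes a dict comprehension; the selected features are the same, only the syntax is more direct. An illustrative check with a toy doc_count:

doc_count = {b'a': 3, b'ab': 1, b'b': 2}   # hypothetical n-gram -> document frequency
i = 1

old = dict((k, doc_count[k]) for k in doc_count if len(k) == i)
new = {k: doc_count[k] for k in doc_count if len(k) == i}
assert old == new == {b'a': 3, b'b': 2}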

@@ -124,11 +124,7 @@ def ngram_select(doc_count, max_order=MAX_NGRAM_ORDER, tokens_per_order=TOKENS_P
if not(args.tokens) and not(args.tokens_per_order):
args.tokens_per_order = TOKENS_PER_ORDER

if args.features:
feature_path = args.features
else:
feature_path = os.path.join(args.model, 'DFfeats')

feature_path = args.features or os.path.join(args.model, 'DFfeats')
Lines 127-131 refactored with the following changes:

bucketlist_path = os.path.join(args.model, 'bucketlist')

# display paths
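The if/else default collapses to an `or` expression. Both versions test the truthiness of args.features, so behavior is preserved: None (and an empty string) falls through to the DFfeats default. A sketch with a hypothetical helper and argument namespace:

import os
from argparse import Namespace

def resolve_feature_path(args):   # hypothetical helper, for illustration only
    return args.features or os.path.join(args.model, 'DFfeats')

assert resolve_feature_path(Namespace(features=None, model='mymodel')) == os.path.join('mymodel', 'DFfeats')
assert resolve_feature_path(Namespace(features='custom/feats', model='mymodel')) == 'custom/feats'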
15 changes: 5 additions & 10 deletions py3langid/train/IGweight.py
@@ -130,7 +130,7 @@ def pass_IG(bucket):
else:
# binarized event space
# Compute IG binarized with respect to each event
ig = list()
ig = []
Function pass_IG refactored with the following changes:

for event_id in range(num_event):
num_doc = __dist.sum()
prior = numpy.array((num_doc - __dist[event_id], __dist[event_id]), dtype=float) / num_doc
@@ -196,11 +196,7 @@ def read_dist(path):
if not(args.domain or args.lang) or (args.domain and args.lang):
parser.error("exactly one of domain(-d) or language (-l) must be specified")

if args.features:
feature_path = args.features
else:
feature_path = os.path.join(args.model, 'DFfeats')

feature_path = args.features or os.path.join(args.model, 'DFfeats')
Lines 199-224 refactored with the following changes:

bucketlist_path = os.path.join(args.model, 'bucketlist')

if not os.path.exists(feature_path):
@@ -218,10 +214,9 @@ def read_dist(path):
else:
raise ValueError("no event specified")

if args.weights:
weights_path = args.weights
else:
weights_path = os.path.join(args.model, 'IGweights' + suffix + ('.bin' if args.binarize else ''))
weights_path = args.weights or os.path.join(
args.model, f'IGweights{suffix}' + ('.bin' if args.binarize else '')
)

# display paths
print("model path:", args.model )
6 changes: 3 additions & 3 deletions py3langid/train/LDfeatureselect.py
@@ -88,7 +88,7 @@ def select_LD_features(ig_lang, ig_domain, feats_per_lang, ignore_domain=False):

lang_w_path = os.path.join(args.model, 'IGweights.lang.bin')
domain_w_path = os.path.join(args.model, 'IGweights.domain')
feature_path = args.output if args.output else os.path.join(args.model, 'LDfeats')
feature_path = args.output or os.path.join(args.model, 'LDfeats')
Lines 91-104 refactored with the following changes:


# display paths
print("model path:", args.model)
@@ -97,11 +97,11 @@ def select_LD_features(ig_lang, ig_domain, feats_per_lang, ignore_domain=False):
print("feature output path:", feature_path)

lang_w = read_weights(lang_w_path)
domain_w = read_weights(domain_w_path) if not args.no_domain_ig else None
domain_w = None if args.no_domain_ig else read_weights(domain_w_path)

features_per_lang = select_LD_features(lang_w, domain_w, args.feats_per_lang, ignore_domain=args.no_domain_ig)
if args.per_lang:
with open(feature_path + '.perlang', 'w') as f:
with open(f'{feature_path}.perlang', 'w') as f:
writer = csv.writer(f)
for i in range(len(features_per_lang)):
writer.writerow(map(repr,features_per_lang[i]))
30 changes: 10 additions & 20 deletions py3langid/train/NBtrain.py
@@ -58,8 +58,7 @@
def offsets(chunks):
# Work out the path chunk start offsets
chunk_offsets = [0]
for c in chunks:
chunk_offsets.append(chunk_offsets[-1] + len(c))
chunk_offsets.extend(chunk_offsets[-1] + len(c) for c in chunks)
Function offsets refactored with the following changes:

return chunk_offsets
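The append loop becomes an extend() over a generator expression. The cumulative behavior survives because extend() consumes the generator one item at a time, so chunk_offsets[-1] already includes the previously appended offset when the next item is evaluated. A quick check with illustrative chunks:

chunks = [[1, 2, 3], [4], [5, 6]]   # hypothetical path chunks; only len() matters

offsets_loop = [0]                  # original formulation
for c in chunks:
    offsets_loop.append(offsets_loop[-1] + len(c))

offsets_ext = [0]                   # refactored formulation
offsets_ext.extend(offsets_ext[-1] + len(c) for c in chunks)

assert offsets_loop == offsets_ext == [0, 3, 4, 6]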

def state_trace(path):
Expand Down Expand Up @@ -154,8 +153,7 @@ def learn_pc(cm):
@returns nb_pc: log(P(C))
"""
pc = np.log(cm.sum(0))
nb_pc = array.array('d', pc)
return nb_pc
return array.array('d', pc)
Comment on lines -157 to +156
Function learn_pc refactored with the following changes:


def generate_cm(items, num_classes):
"""
@@ -185,7 +183,11 @@ def learn_ptc(paths, tk_nextmove, tk_output, cm, temp_path, args):
chunksize = min(len(paths) / (mp.cpu_count()*2), args.chunksize)

# TODO: Set the output dir
b_dirs = [ tempfile.mkdtemp(prefix="train-",suffix='-bucket', dir=temp_path) for i in range(args.buckets) ]
b_dirs = [
tempfile.mkdtemp(prefix="train-", suffix='-bucket', dir=temp_path)
for _ in range(args.buckets)
]

Comment on lines -188 to +190
Function learn_ptc refactored with the following changes:


output_states = set(tk_output)

@@ -240,21 +242,9 @@ def cleanup():
parser.add_argument("--buckets", type=int, metavar='N', help="distribute features into N buckets", default=NUM_BUCKETS)
args = parser.parse_args()

if args.temp:
temp_path = args.temp
else:
temp_path = os.path.join(args.model, 'buckets')

if args.scanner:
scanner_path = args.scanner
else:
scanner_path = os.path.join(args.model, 'LDfeats.scanner')

if args.output:
output_path = args.output
else:
output_path = os.path.join(args.model, 'model')

temp_path = args.temp or os.path.join(args.model, 'buckets')
scanner_path = args.scanner or os.path.join(args.model, 'LDfeats.scanner')
output_path = args.output or os.path.join(args.model, 'model')
Comment on lines -243 to +247
Lines 243-257 refactored with the following changes:

index_path = os.path.join(args.model, 'paths')
lang_path = os.path.join(args.model, 'lang_index')

14 changes: 6 additions & 8 deletions py3langid/train/common.py
@@ -35,9 +35,10 @@ def chunk(seq, chunksize):
"""
seq_iter = iter(seq)
while True:
chunk = tuple(islice(seq_iter, chunksize))
if not chunk: break
yield chunk
if chunk := tuple(islice(seq_iter, chunksize)):
yield chunk
else:
break
Comment on lines -38 to +41
Function chunk refactored with the following changes:
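For reference, the refactored generator from the hunk above in runnable form, with the walrus operator (Python 3.8+) and an illustrative usage check:

from itertools import islice

def chunk(seq, chunksize):
    # Yield successive tuples of at most chunksize items; stop at the first empty slice.
    seq_iter = iter(seq)
    while True:
        if chunk := tuple(islice(seq_iter, chunksize)):
            yield chunk
        else:
            break

assert list(chunk(range(7), 3)) == [(0, 1, 2), (3, 4, 5), (6,)]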


def unmarshal_iter(path):
"""
@@ -131,13 +132,10 @@ def MapPool(processes=None, initializer=None, initargs=None, maxtasksperchild=No

if processes > 1:
with closing( mp.Pool(processes, initializer, initargs, maxtasksperchild)) as pool:
f = lambda fn, chunks: pool.imap_unordered(fn, chunks, chunksize=chunksize)
yield f
yield lambda fn, chunks: pool.imap_unordered(fn, chunks, chunksize=chunksize)
else:
if initializer is not None:
initializer(*initargs)
f = imap
yield f

yield imap
Comment on lines -134 to +139
Function MapPool refactored with the following changes:

if processes > 1:
pool.join()
21 changes: 8 additions & 13 deletions py3langid/train/index.py
@@ -91,10 +91,10 @@ def __init__(self, root, min_domain=MIN_DOMAIN, proportion=TRAIN_PROP, langs=Non
self.domain_index = defaultdict(Enumerator())
else:
# pre-specified domain set
self.domain_index = dict((k,v) for v,k in enumerate(domains))
self.domain_index = {k: v for v,k in enumerate(domains)}

self.coverage_index = defaultdict(set)
self.items = list()
self.items = []
Comment on lines -94 to +97
Function CorpusIndexer.__init__ refactored with the following changes:


self.index(root)
self.prune_min_domain(self.min_domain)
@@ -139,15 +139,14 @@ def prune_min_domain(self, min_domain):
for langs in self.coverage_index.values():
for lang in langs:
lang_domain_count[lang] += 1
reject_langs = set( l for l in lang_domain_count if lang_domain_count[l] < min_domain)

# Remove the languages from the indexer
if reject_langs:
if reject_langs := {
l for l in lang_domain_count if lang_domain_count[l] < min_domain
}:
#print "reject (<{0} domains): {1}".format(min_domain, sorted(reject_langs))
reject_ids = set(self.lang_index[l] for l in reject_langs)
reject_ids = {self.lang_index[l] for l in reject_langs}

new_lang_index = defaultdict(Enumerator())
lm = dict()
lm = {}
Comment on lines -142 to +149
Function CorpusIndexer.prune_min_domain refactored with the following changes:

This removes the following comments ( why? ):

# Remove the languages from the indexer

for k,v in self.lang_index.items():
if v not in reject_ids:
new_id = new_lang_index[k]
@@ -215,11 +214,7 @@ def paths(self):
args = parser.parse_args()

corpus_name = os.path.basename(args.corpus)
if args.model:
model_dir = args.model
else:
model_dir = os.path.join('.', corpus_name+'.model')

model_dir = args.model or os.path.join('.', f'{corpus_name}.model')
Comment on lines -218 to +217
Lines 218-222 refactored with the following changes:

makedir(model_dir)

langs_path = os.path.join(model_dir, 'lang_index')