Sourcery refactored master branch #3

Closed · wants to merge 5 commits · showing changes from 1 commit
57 changes: 31 additions & 26 deletions py3langid/examples/_twokenize.py
@@ -55,7 +55,7 @@ def regex_or(*items):
r"va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)" #TODO: remove obscure country domains?
urlStart2 = r"\b(?:[A-Za-z\d-])+(?:\.[A-Za-z0-9]+){0,3}\." + regex_or(commonTLDs, ccTLDs) + r"(?:\."+ccTLDs+r")?(?=\W|$)"
urlBody = r"(?:[^\.\s<>][^\s<>]*?)?"
-urlExtraCrapBeforeEnd = regex_or(punctChars, entity) + "+?"
+urlExtraCrapBeforeEnd = f'{regex_or(punctChars, entity)}+?'
Lines 58-195 refactored with the following changes:

This removes the following comments ( why? ):

# iOS 'emoji' characters (some smileys, some symbols) [\ue001-\uebbb]
# Standard version  :) :( :] :D :P
# myleott: o.O and O.o are two of the biggest sources of differences
# reversed version (: D:  use positive lookbehind to remove "(word):"
# TODO should try a big precompiled lexicon from Wikipedia, Dan Ramage told me (BTO) he does this
#          between this and the Java version. One little hack won't hurt...
#inspired by http://en.wikipedia.org/wiki/User:Scapler/emoticons#East_Asian_style
# because eyes on the right side is more ambiguous with the standard usage of : ;

urlEnd = r"(?:\.\.+|[<>]|\s|$)"
url = regex_or(urlStart1, urlStart2) + urlBody + "(?=(?:"+urlExtraCrapBeforeEnd+")?"+urlEnd+")"
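A quick sketch (not part of the diff) of why this kind of rewrite is safe: string concatenation and the equivalent f-string build the exact same pattern string, so the compiled regex does not change. The `regex_or` helper and the two character classes below are simplified stand-ins for the ones defined in `_twokenize.py`.

# Illustrative only: simplified stand-ins for regex_or, punctChars and entity.
def regex_or(*items):
    return '(?:' + '|'.join(items) + ')'

punctChars = r"['\".?!,:;]"
entity = r"&(?:amp|lt|gt|quot);"

old_style = regex_or(punctChars, entity) + "+?"    # concatenation (before)
new_style = f'{regex_or(punctChars, entity)}+?'    # f-string (after)
assert old_style == new_style                      # identical pattern strings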

@@ -102,34 +102,36 @@ def regex_or(*items):
s5 = "(?:[.][_]+[.])"
# myleott: in Python the (?i) flag affects the whole expression
#basicface = "(?:(?i)" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5
-basicface = "(?:" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5
+basicface = f"(?:{bfLeft}{bfCenter}{bfRight})|{s3}|{s4}|{s5}"

eeLeft = r"[\\\ƪԄ\((<>;ヽ\-=~\*]+"
eeRight= u"[\\-=\\);'\u0022<>ʃ)//ノノ丿╯σっµ~\\*]+".encode('utf-8')
eeSymbol = r"[^A-Za-z0-9\s\(\)\*:=-]"
-eastEmote = eeLeft + "(?:"+basicface+"|" +eeSymbol+")+" + eeRight
+eastEmote = f'{eeLeft}(?:{basicface}|{eeSymbol})+{eeRight}'

-oOEmote = r"(?:[oO]" + bfCenter + r"[oO])"
+oOEmote = f"(?:[oO]{bfCenter}[oO])"


emoticon = regex_or(
-    # Standard version :) :( :] :D :P
-    "(?:>|&gt;)?" + regex_or(normalEyes, wink) + regex_or(noseArea,"[Oo]") + regex_or(tongue+r"(?=\W|$|RT|rt|Rt)", otherMouths+r"(?=\W|$|RT|rt|Rt)", sadMouths, happyMouths),
-
-    # reversed version (: D:  use positive lookbehind to remove "(word):"
-    # because eyes on the right side is more ambiguous with the standard usage of : ;
-    regex_or("(?<=(?: ))", "(?<=(?:^))") + regex_or(sadMouths,happyMouths,otherMouths) + noseArea + regex_or(normalEyes, wink) + "(?:<|&lt;)?",
-
-    #inspired by http://en.wikipedia.org/wiki/User:Scapler/emoticons#East_Asian_style
-    eastEmote.replace("2", "1", 1), basicface,
-    # iOS 'emoji' characters (some smileys, some symbols) [\ue001-\uebbb]
-    # TODO should try a big precompiled lexicon from Wikipedia, Dan Ramage told me (BTO) he does this
-
-    # myleott: o.O and O.o are two of the biggest sources of differences
-    #          between this and the Java version. One little hack won't hurt...
-    oOEmote
+    f"(?:>|&gt;)?{regex_or(normalEyes, wink)}"
+    + regex_or(noseArea, "[Oo]")
+    + regex_or(
+        tongue + r"(?=\W|$|RT|rt|Rt)",
+        otherMouths + r"(?=\W|$|RT|rt|Rt)",
+        sadMouths,
+        happyMouths,
+    ),
+    regex_or("(?<=(?: ))", "(?<=(?:^))")
+    + regex_or(sadMouths, happyMouths, otherMouths)
+    + noseArea
+    + regex_or(normalEyes, wink)
+    + "(?:<|&lt;)?",
+    eastEmote.replace("2", "1", 1),
+    basicface,
+    oOEmote,
)


Hearts = "(?:<+/?3+)+" #the other hearts are in decorations

Arrows = regex_or(r"(?:<*[-―—=]*>+|<+[-―—=]*>*)", u"[\u2190-\u21ff]+".encode('utf-8'))
@@ -188,11 +190,16 @@ def regex_or(*items):
# Note the 'smart quotes' (http://en.wikipedia.org/wiki/Smart_quotes)
#edgePunctChars = r"'\"“”‘’«»{}\(\)\[\]\*&" #add \\p{So}? (symbols)
edgePunctChars = u"'\"“”‘’«»{}\\(\\)\\[\\]\\*&" #add \\p{So}? (symbols)
-edgePunct = "[" + edgePunctChars + "]"
+edgePunct = f"[{edgePunctChars}]"
notEdgePunct = "[a-zA-Z0-9]" # content characters
offEdge = r"(^|$|:|;|\s|\.|,)" # colon here gets "(hello):" ==> "( hello ):"
-EdgePunctLeft = re.compile(offEdge + "("+edgePunct+"+)("+notEdgePunct+")", re.UNICODE)
-EdgePunctRight = re.compile("("+notEdgePunct+")("+edgePunct+"+)" + offEdge, re.UNICODE)
+EdgePunctLeft = re.compile(
+    f'{offEdge}({edgePunct}+)({notEdgePunct})', re.UNICODE
+)
+
+EdgePunctRight = re.compile(
+    f"({notEdgePunct})({edgePunct}+){offEdge}", re.UNICODE
+)

def splitEdgePunct(input):
input = EdgePunctLeft.sub(r"\1\2 \3", input)
@@ -230,8 +237,7 @@ def simpleTokenize(text):
# has an even length and no indices are the same
indices = [0]
for (first, second) in badSpans:
-indices.append(first)
-indices.append(second)
+indices.extend((first, second))
indices.append(textLength)

# Group the indices and map them to their respective portion of the string
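As a side note, `list.extend` with a tuple appends each element in order, so the two `append` calls and the single `extend` produce identical lists. A tiny sketch with made-up spans:

# Made-up (start, end) spans, just to show the equivalence.
bad_spans = [(3, 7), (12, 15)]

a = [0]
for first, second in bad_spans:
    a.append(first)
    a.append(second)

b = [0]
for first, second in bad_spans:
    b.extend((first, second))

assert a == b == [0, 3, 7, 12, 15]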
@@ -271,8 +277,7 @@ def squeezeWhitespace(input):

# Final pass tokenization based on special patterns
def splitToken(token):
-m = Contractions.search(token)
-if m:
+if m := Contractions.search(token):
return [m.group(1), m.group(2)]
return [token]
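The assignment expression above merges the `search` call and the truthiness test into one line (Python 3.8+). A minimal sketch with a stand-in pattern, not the actual `Contractions` regex from `_twokenize.py`:

import re

# Stand-in contraction pattern; the real one in _twokenize.py is more elaborate.
Contractions = re.compile(r"(?i)\b(\w+)('s|'re|'ll|n't)\b")

def splitToken(token):
    if m := Contractions.search(token):    # assign and test in one step
        return [m.group(1), m.group(2)]
    return [token]

assert splitToken("can't") == ["ca", "n't"]
assert splitToken("hello") == ["hello"]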

13 changes: 7 additions & 6 deletions py3langid/langid.py
@@ -152,7 +152,7 @@ def from_pickled_model(cls, pickled_file, *args, **kwargs):
filepath = str(Path(__file__).parent / pickled_file)
with lzma.open(filepath) as filehandle:
nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output = pickle.load(filehandle)
-nb_numfeats = int(len(nb_ptc) / len(nb_pc))
+nb_numfeats = len(nb_ptc) // len(nb_pc)

# reconstruct pc and ptc
nb_pc = np.array(nb_pc)
@@ -165,7 +165,7 @@ def from_modelstring(cls, string, *args, **kwargs):
def from_modelstring(cls, string, *args, **kwargs):
# load data
nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output = pickle.loads(bz2.decompress(base64.b64decode(string)))
-nb_numfeats = int(len(nb_ptc) / len(nb_pc))
+nb_numfeats = len(nb_ptc) // len(nb_pc)

# reconstruct pc and ptc
nb_pc = np.array(nb_pc)
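Hedged aside on this change: `len()` always returns an `int`, so `//` keeps the whole computation in integer arithmetic, whereas the old form detours through a float before truncating. For the sizes involved both give the same value; a sketch with hypothetical dimensions:

# Hypothetical lengths standing in for len(nb_ptc) and len(nb_pc).
total_entries, num_classes = 746_524, 97

old = int(total_entries / num_classes)   # float division, then truncation
new = total_entries // num_classes       # pure integer floor division
assert old == new == 7696

For very large counts the floor-division form is also the safer one, since `int(a / b)` can drift once the quotient exceeds float precision.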
@@ -229,7 +229,7 @@ def set_languages(self, langs=None):
# to speed up processing.
for lang in langs:
if lang not in nb_classes:
-raise ValueError("Unknown language code %s" % lang)
+raise ValueError(f"Unknown language code {lang}")

subset_mask = np.fromiter((l in langs for l in nb_classes), dtype=bool)
self.nb_classes = [c for c in nb_classes if c in langs]
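The error-message change is purely cosmetic; old-style `%` interpolation and the f-string render the same text. A one-line check with a hypothetical code:

lang = "xx"   # hypothetical unknown language code
assert ("Unknown language code %s" % lang) == f"Unknown language code {lang}"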
@@ -409,11 +409,12 @@ def application(environ, start_response):
# Unsupported method
status = '405 Method Not Allowed' # HTTP Status
response = {
-    'responseData': None,
-    'responseStatus': 405,
-    'responseDetails': '%s not allowed' % environ['REQUEST_METHOD']
+    'responseData': None,
+    'responseStatus': 405,
+    'responseDetails': f"{environ['REQUEST_METHOD']} not allowed",
}


if data is not None:
if path == 'detect':
pred, conf = classify(data)
5 changes: 1 addition & 4 deletions py3langid/tools/printfeats.py
@@ -22,10 +22,7 @@
n = args.number if args.number is not None else len(w)

def show(feat):
-    if args.printfeat:
-        return feat
-    else:
-        return repr(feat)
+    return feat if args.printfeat else repr(feat)

if args.column is not None:
for key in sorted(w, key=lambda x:w[x][args.column], reverse=True)[:n]:
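The conditional expression folds the four-line `if`/`else` into a single return without changing behaviour. A sketch where the argparse flag is replaced by a plain parameter:

def show(feat, printfeat=False):
    # printfeat stands in for args.printfeat from the original script
    return feat if printfeat else repr(feat)

assert show("café") == "'café'"                  # repr() adds quotes
assert show("café", printfeat=True) == "café"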
2 changes: 1 addition & 1 deletion py3langid/train/IGweight.py
@@ -131,7 +131,7 @@ def pass_IG(buckets):
else:
# binarized event space
# Compute IG binarized with respect to each event
-ig = list()
+ig = []
for event_id in xrange(num_event):
num_doc = __dist.sum()
prior = numpy.array((num_doc - __dist[event_id], __dist[event_id]), dtype=float) / num_doc
6 changes: 5 additions & 1 deletion py3langid/train/NBtrain.py
@@ -112,7 +112,11 @@ def pass_tokenize(arg):

# Distribute the aggregated counts into buckets
__procname = mp.current_process().name
-__buckets = [gzip.open(os.path.join(p,__procname+'.index'), 'a') for p in __b_dirs]
+__buckets = [
+    gzip.open(os.path.join(p, f'{__procname}.index'), 'a')
+    for p in __b_dirs
+]

bucket_count = len(__buckets)
for doc_id, f_id in term_freq:
bucket_index = hash(f_id) % bucket_count
7 changes: 2 additions & 5 deletions py3langid/train/common.py
@@ -135,13 +135,10 @@ def MapPool(processes=None, initializer=None, initargs=None, maxtasksperchild=No

if processes > 1:
with closing( mp.Pool(processes, initializer, initargs, maxtasksperchild)) as pool:
-f = lambda fn, chunks: pool.imap_unordered(fn, chunks, chunksize=chunksize)
-yield f
+yield lambda fn, chunks: pool.imap_unordered(fn, chunks, chunksize=chunksize)
else:
if initializer is not None:
initializer(*initargs)
-f = imap
-yield f
+yield imap
Comment on lines -138 to +142
Function MapPool refactored with the following changes:

if processes > 1:
pool.join()
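For context, `MapPool` is a context manager that yields a map-like callable: a pool-backed `imap_unordered` when `processes > 1`, otherwise a plain sequential map (`itertools.imap` in the original Python 2 code). A simplified, hypothetical Python 3 sketch of the refactored shape, not the module's actual implementation:

from contextlib import closing, contextmanager
import multiprocessing as mp

@contextmanager
def map_pool(processes=1, chunksize=1):
    """Yield a callable mapper(fn, chunks); parallel if processes > 1."""
    if processes > 1:
        with closing(mp.Pool(processes)) as pool:
            # yield the lambda directly instead of binding it to a name first
            yield lambda fn, chunks: pool.imap_unordered(fn, chunks, chunksize=chunksize)
        pool.join()
    else:
        yield map   # sequential fallback (imap in the Python 2 original)

def square(x):
    return x * x

if __name__ == "__main__":
    with map_pool(processes=2) as mapper:
        print(sorted(mapper(square, range(10))))   # [0, 1, 4, 9, ...]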
13 changes: 4 additions & 9 deletions py3langid/train/index.py
@@ -98,8 +98,7 @@ def __init__(self, root, min_domain=MIN_DOMAIN, proportion=TRAIN_PROP, langs=Non
# root supplied was the root of a directory structure
candidates = []
for dirpath, dirnames, filenames in os.walk(root, followlinks=True):
-    for docname in filenames:
-        candidates.append(os.path.join(dirpath, docname))
+    candidates.extend(os.path.join(dirpath, docname) for docname in filenames)
Function CorpusIndexer.__init__ refactored with the following changes:

else:
# root supplied was a file, interpet as list of paths
candidates = map(str.strip, open(root))
@@ -180,13 +179,9 @@ def prune_min_domain(self, min_domain):
for langs in self.coverage_index.values():
for lang in langs:
lang_domain_count[lang] += 1
-reject_langs = {
-    l
-    for l in lang_domain_count if lang_domain_count[l] < min_domain
-}
-
-# Remove the languages from the indexer
-if reject_langs:
+if reject_langs := {
+    l for l in lang_domain_count if lang_domain_count[l] < min_domain
+}:
Comment on lines -183 to +184
Function CorpusIndexer.prune_min_domain refactored with the following changes:

This removes the following comments ( why? ):

# Remove the languages from the indexer

#print "reject (<{0} domains): {1}".format(min_domain, sorted(reject_langs))
reject_ids = {self.lang_index[l] for l in reject_langs}
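The walrus form builds the rejection set and tests it for emptiness in one expression, mirroring the original comprehension. A self-contained sketch with made-up domain counts:

# Made-up language -> domain-count mapping and threshold.
lang_domain_count = {"en": 5, "fr": 3, "de": 1}
min_domain = 2

if reject_langs := {
    l for l in lang_domain_count if lang_domain_count[l] < min_domain
}:
    print("rejecting:", sorted(reject_langs))   # rejecting: ['de']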

11 changes: 9 additions & 2 deletions py3langid/train/tokenize.py
@@ -174,8 +174,15 @@ def pass_tokenize(chunk_items):

# Output the counts to the relevant bucket files.
__procname = mp.current_process().name
-b_freq_lang = [gzip.open(os.path.join(p,__procname+'.lang'),'a') for p in __b_dirs]
-b_freq_domain = [gzip.open(os.path.join(p,__procname+'.domain'),'a') for p in __b_dirs]
+b_freq_lang = [
+    gzip.open(os.path.join(p, f'{__procname}.lang'), 'a') for p in __b_dirs
+]
+
+b_freq_domain = [
+    gzip.open(os.path.join(p, f'{__procname}.domain'), 'a')
+    for p in __b_dirs
+]


for term in term_lng_freq:
bucket_index = hash(term) % len(b_freq_lang)