Sourcery refactored master branch #3

Closed · wants to merge 5 commits · showing changes from 1 commit
57 changes: 31 additions & 26 deletions py3langid/examples/_twokenize.py
@@ -55,7 +55,7 @@ def regex_or(*items):
r"va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)" #TODO: remove obscure country domains?
urlStart2 = r"\b(?:[A-Za-z\d-])+(?:\.[A-Za-z0-9]+){0,3}\." + regex_or(commonTLDs, ccTLDs) + r"(?:\."+ccTLDs+r")?(?=\W|$)"
urlBody = r"(?:[^\.\s<>][^\s<>]*?)?"
-urlExtraCrapBeforeEnd = regex_or(punctChars, entity) + "+?"
+urlExtraCrapBeforeEnd = f'{regex_or(punctChars, entity)}+?'
Lines 58-195 refactored with the following changes:

This removes the following comments ( why? ):

# iOS 'emoji' characters (some smileys, some symbols) [\ue001-\uebbb]
# Standard version  :) :( :] :D :P
# myleott: o.O and O.o are two of the biggest sources of differences
# reversed version (: D:  use positive lookbehind to remove "(word):"
# TODO should try a big precompiled lexicon from Wikipedia, Dan Ramage told me (BTO) he does this
#          between this and the Java version. One little hack won't hurt...
#inspired by http://en.wikipedia.org/wiki/User:Scapler/emoticons#East_Asian_style
# because eyes on the right side is more ambiguous with the standard usage of : ;

urlEnd = r"(?:\.\.+|[<>]|\s|$)"
url = regex_or(urlStart1, urlStart2) + urlBody + "(?=(?:"+urlExtraCrapBeforeEnd+")?"+urlEnd+")"
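A quick sketch (not part of the diff) of why this kind of rewrite is safe: string concatenation and the equivalent f-string build the exact same pattern string, so the compiled regex does not change. The `regex_or` helper and the two character classes below are simplified stand-ins for the ones defined in `_twokenize.py`.

# Illustrative only: simplified stand-ins for regex_or, punctChars and entity.
def regex_or(*items):
    return '(?:' + '|'.join(items) + ')'

punctChars = r"['\".?!,:;]"
entity = r"&(?:amp|lt|gt|quot);"

old_style = regex_or(punctChars, entity) + "+?"    # concatenation (before)
new_style = f'{regex_or(punctChars, entity)}+?'    # f-string (after)
assert old_style == new_style                      # identical pattern strings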

@@ -102,34 +102,36 @@ def regex_or(*items):
s5 = "(?:[.][_]+[.])"
# myleott: in Python the (?i) flag affects the whole expression
#basicface = "(?:(?i)" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5
-basicface = "(?:" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5
+basicface = f"(?:{bfLeft}{bfCenter}{bfRight})|{s3}|{s4}|{s5}"

eeLeft = r"[\\\ƪԄ\((<>;ヽ\-=~\*]+"
eeRight= u"[\\-=\\);'\u0022<>ʃ)//ノノ丿╯σっµ~\\*]+".encode('utf-8')
eeSymbol = r"[^A-Za-z0-9\s\(\)\*:=-]"
-eastEmote = eeLeft + "(?:"+basicface+"|" +eeSymbol+")+" + eeRight
+eastEmote = f'{eeLeft}(?:{basicface}|{eeSymbol})+{eeRight}'

-oOEmote = r"(?:[oO]" + bfCenter + r"[oO])"
+oOEmote = f"(?:[oO]{bfCenter}[oO])"


emoticon = regex_or(
-    # Standard version :) :( :] :D :P
-    "(?:>|&gt;)?" + regex_or(normalEyes, wink) + regex_or(noseArea,"[Oo]") + regex_or(tongue+r"(?=\W|$|RT|rt|Rt)", otherMouths+r"(?=\W|$|RT|rt|Rt)", sadMouths, happyMouths),
-
-    # reversed version (: D:  use positive lookbehind to remove "(word):"
-    # because eyes on the right side is more ambiguous with the standard usage of : ;
-    regex_or("(?<=(?: ))", "(?<=(?:^))") + regex_or(sadMouths,happyMouths,otherMouths) + noseArea + regex_or(normalEyes, wink) + "(?:<|&lt;)?",
-
-    #inspired by http://en.wikipedia.org/wiki/User:Scapler/emoticons#East_Asian_style
-    eastEmote.replace("2", "1", 1), basicface,
-    # iOS 'emoji' characters (some smileys, some symbols) [\ue001-\uebbb]
-    # TODO should try a big precompiled lexicon from Wikipedia, Dan Ramage told me (BTO) he does this
-
-    # myleott: o.O and O.o are two of the biggest sources of differences
-    #          between this and the Java version. One little hack won't hurt...
-    oOEmote
+    f"(?:>|&gt;)?{regex_or(normalEyes, wink)}"
+    + regex_or(noseArea, "[Oo]")
+    + regex_or(
+        tongue + r"(?=\W|$|RT|rt|Rt)",
+        otherMouths + r"(?=\W|$|RT|rt|Rt)",
+        sadMouths,
+        happyMouths,
+    ),
+    regex_or("(?<=(?: ))", "(?<=(?:^))")
+    + regex_or(sadMouths, happyMouths, otherMouths)
+    + noseArea
+    + regex_or(normalEyes, wink)
+    + "(?:<|&lt;)?",
+    eastEmote.replace("2", "1", 1),
+    basicface,
+    oOEmote,
)


Hearts = "(?:<+/?3+)+" #the other hearts are in decorations

Arrows = regex_or(r"(?:<*[-―—=]*>+|<+[-―—=]*>*)", u"[\u2190-\u21ff]+".encode('utf-8'))
@@ -188,11 +190,16 @@ def regex_or(*items):
# Note the 'smart quotes' (http://en.wikipedia.org/wiki/Smart_quotes)
#edgePunctChars = r"'\"“”‘’«»{}\(\)\[\]\*&" #add \\p{So}? (symbols)
edgePunctChars = u"'\"“”‘’«»{}\\(\\)\\[\\]\\*&" #add \\p{So}? (symbols)
-edgePunct = "[" + edgePunctChars + "]"
+edgePunct = f"[{edgePunctChars}]"
notEdgePunct = "[a-zA-Z0-9]" # content characters
offEdge = r"(^|$|:|;|\s|\.|,)" # colon here gets "(hello):" ==> "( hello ):"
-EdgePunctLeft = re.compile(offEdge + "("+edgePunct+"+)("+notEdgePunct+")", re.UNICODE)
-EdgePunctRight = re.compile("("+notEdgePunct+")("+edgePunct+"+)" + offEdge, re.UNICODE)
+EdgePunctLeft = re.compile(
+    f'{offEdge}({edgePunct}+)({notEdgePunct})', re.UNICODE
+)
+
+EdgePunctRight = re.compile(
+    f"({notEdgePunct})({edgePunct}+){offEdge}", re.UNICODE
+)

def splitEdgePunct(input):
input = EdgePunctLeft.sub(r"\1\2 \3", input)
@@ -230,8 +237,7 @@ def simpleTokenize(text):
# has an even length and no indices are the same
indices = [0]
for (first, second) in badSpans:
-indices.append(first)
-indices.append(second)
+indices.extend((first, second))
indices.append(textLength)

# Group the indices and map them to their respective portion of the string
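As a side note, `list.extend` with a tuple appends each element in order, so the two `append` calls and the single `extend` produce identical lists. A tiny sketch with made-up spans:

# Made-up (start, end) spans, just to show the equivalence.
bad_spans = [(3, 7), (12, 15)]

a = [0]
for first, second in bad_spans:
    a.append(first)
    a.append(second)

b = [0]
for first, second in bad_spans:
    b.extend((first, second))

assert a == b == [0, 3, 7, 12, 15]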
@@ -271,8 +277,7 @@ def squeezeWhitespace(input):

# Final pass tokenization based on special patterns
def splitToken(token):
-m = Contractions.search(token)
-if m:
+if m := Contractions.search(token):
return [m.group(1), m.group(2)]
return [token]
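The assignment expression above merges the `search` call and the truthiness test into one line (Python 3.8+). A minimal sketch with a stand-in pattern, not the actual `Contractions` regex from `_twokenize.py`:

import re

# Stand-in contraction pattern; the real one in _twokenize.py is more elaborate.
Contractions = re.compile(r"(?i)\b(\w+)('s|'re|'ll|n't)\b")

def splitToken(token):
    if m := Contractions.search(token):    # assign and test in one step
        return [m.group(1), m.group(2)]
    return [token]

assert splitToken("can't") == ["ca", "n't"]
assert splitToken("hello") == ["hello"]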

13 changes: 7 additions & 6 deletions py3langid/langid.py
@@ -152,7 +152,7 @@ def from_pickled_model(cls, pickled_file, *args, **kwargs):
filepath = str(Path(__file__).parent / pickled_file)
with lzma.open(filepath) as filehandle:
nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output = pickle.load(filehandle)
-nb_numfeats = int(len(nb_ptc) / len(nb_pc))
+nb_numfeats = len(nb_ptc) // len(nb_pc)

# reconstruct pc and ptc
nb_pc = np.array(nb_pc)
@@ -165,7 +165,7 @@ def from_modelstring(cls, string, *args, **kwargs):
def from_modelstring(cls, string, *args, **kwargs):
# load data
nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output = pickle.loads(bz2.decompress(base64.b64decode(string)))
-nb_numfeats = int(len(nb_ptc) / len(nb_pc))
+nb_numfeats = len(nb_ptc) // len(nb_pc)

# reconstruct pc and ptc
nb_pc = np.array(nb_pc)
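Hedged aside on this change: `len()` always returns an `int`, so `//` keeps the whole computation in integer arithmetic, whereas the old form detours through a float before truncating. For the sizes involved both give the same value; a sketch with hypothetical dimensions:

# Hypothetical lengths standing in for len(nb_ptc) and len(nb_pc).
total_entries, num_classes = 746_524, 97

old = int(total_entries / num_classes)   # float division, then truncation
new = total_entries // num_classes       # pure integer floor division
assert old == new == 7696

For very large counts the floor-division form is also the safer one, since `int(a / b)` can drift once the quotient exceeds float precision.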
@@ -229,7 +229,7 @@ def set_languages(self, langs=None):
# to speed up processing.
for lang in langs:
if lang not in nb_classes:
-raise ValueError("Unknown language code %s" % lang)
+raise ValueError(f"Unknown language code {lang}")

subset_mask = np.fromiter((l in langs for l in nb_classes), dtype=bool)
self.nb_classes = [c for c in nb_classes if c in langs]
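The error-message change is purely cosmetic; old-style `%` interpolation and the f-string render the same text. A one-line check with a hypothetical code:

lang = "xx"   # hypothetical unknown language code
assert ("Unknown language code %s" % lang) == f"Unknown language code {lang}"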
@@ -409,11 +409,12 @@ def application(environ, start_response):
# Unsupported method
status = '405 Method Not Allowed' # HTTP Status
response = {
-    'responseData': None,
-    'responseStatus': 405,
-    'responseDetails': '%s not allowed' % environ['REQUEST_METHOD']
+    'responseData': None,
+    'responseStatus': 405,
+    'responseDetails': f"{environ['REQUEST_METHOD']} not allowed",
}


if data is not None:
if path == 'detect':
pred, conf = classify(data)
5 changes: 1 addition & 4 deletions py3langid/tools/printfeats.py
@@ -22,10 +22,7 @@
n = args.number if args.number is not None else len(w)

def show(feat):
-    if args.printfeat:
-        return feat
-    else:
-        return repr(feat)
+    return feat if args.printfeat else repr(feat)

if args.column is not None:
for key in sorted(w, key=lambda x:w[x][args.column], reverse=True)[:n]:
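The conditional expression folds the four-line `if`/`else` into a single return without changing behaviour. A sketch where the argparse flag is replaced by a plain parameter:

def show(feat, printfeat=False):
    # printfeat stands in for args.printfeat from the original script
    return feat if printfeat else repr(feat)

assert show("café") == "'café'"                  # repr() adds quotes
assert show("café", printfeat=True) == "café"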
2 changes: 1 addition & 1 deletion py3langid/train/IGweight.py
@@ -131,7 +131,7 @@ def pass_IG(buckets):
else:
# binarized event space
# Compute IG binarized with respect to each event
-ig = list()
+ig = []
for event_id in xrange(num_event):
num_doc = __dist.sum()
prior = numpy.array((num_doc - __dist[event_id], __dist[event_id]), dtype=float) / num_doc
6 changes: 5 additions & 1 deletion py3langid/train/NBtrain.py
@@ -112,7 +112,11 @@ def pass_tokenize(arg):

# Distribute the aggregated counts into buckets
__procname = mp.current_process().name
-__buckets = [gzip.open(os.path.join(p,__procname+'.index'), 'a') for p in __b_dirs]
+__buckets = [
+    gzip.open(os.path.join(p, f'{__procname}.index'), 'a')
+    for p in __b_dirs
+]

bucket_count = len(__buckets)
for doc_id, f_id in term_freq:
bucket_index = hash(f_id) % bucket_count
7 changes: 2 additions & 5 deletions py3langid/train/common.py
@@ -135,13 +135,10 @@ def MapPool(processes=None, initializer=None, initargs=None, maxtasksperchild=No

if processes > 1:
with closing( mp.Pool(processes, initializer, initargs, maxtasksperchild)) as pool:
-f = lambda fn, chunks: pool.imap_unordered(fn, chunks, chunksize=chunksize)
-yield f
+yield lambda fn, chunks: pool.imap_unordered(fn, chunks, chunksize=chunksize)
else:
if initializer is not None:
initializer(*initargs)
-f = imap
-yield f
+yield imap
Comment on lines -138 to +142
Function MapPool refactored with the following changes:

if processes > 1:
pool.join()
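For context, `MapPool` is a context manager that yields a map-like callable: a pool-backed `imap_unordered` when `processes > 1`, otherwise a plain sequential map (`itertools.imap` in the original Python 2 code). A simplified, hypothetical Python 3 sketch of the refactored shape, not the module's actual implementation:

from contextlib import closing, contextmanager
import multiprocessing as mp

@contextmanager
def map_pool(processes=1, chunksize=1):
    """Yield a callable mapper(fn, chunks); parallel if processes > 1."""
    if processes > 1:
        with closing(mp.Pool(processes)) as pool:
            # yield the lambda directly instead of binding it to a name first
            yield lambda fn, chunks: pool.imap_unordered(fn, chunks, chunksize=chunksize)
        pool.join()
    else:
        yield map   # sequential fallback (imap in the Python 2 original)

def square(x):
    return x * x

if __name__ == "__main__":
    with map_pool(processes=2) as mapper:
        print(sorted(mapper(square, range(10))))   # [0, 1, 4, 9, ...]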
13 changes: 4 additions & 9 deletions py3langid/train/index.py
@@ -98,8 +98,7 @@ def __init__(self, root, min_domain=MIN_DOMAIN, proportion=TRAIN_PROP, langs=Non
# root supplied was the root of a directory structure
candidates = []
for dirpath, dirnames, filenames in os.walk(root, followlinks=True):
-    for docname in filenames:
-        candidates.append(os.path.join(dirpath, docname))
+    candidates.extend(os.path.join(dirpath, docname) for docname in filenames)
Function CorpusIndexer.__init__ refactored with the following changes:

else:
# root supplied was a file, interpet as list of paths
candidates = map(str.strip, open(root))
@@ -180,13 +179,9 @@ def prune_min_domain(self, min_domain):
for langs in self.coverage_index.values():
for lang in langs:
lang_domain_count[lang] += 1
-reject_langs = {
-    l
-    for l in lang_domain_count if lang_domain_count[l] < min_domain
-}
-
-# Remove the languages from the indexer
-if reject_langs:
+if reject_langs := {
+    l for l in lang_domain_count if lang_domain_count[l] < min_domain
+}:
Comment on lines -183 to +184
Function CorpusIndexer.prune_min_domain refactored with the following changes:

This removes the following comments ( why? ):

# Remove the languages from the indexer

#print "reject (<{0} domains): {1}".format(min_domain, sorted(reject_langs))
reject_ids = {self.lang_index[l] for l in reject_langs}
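The walrus form builds the rejection set and tests it for emptiness in one expression, mirroring the original comprehension. A self-contained sketch with made-up domain counts:

# Made-up language -> domain-count mapping and threshold.
lang_domain_count = {"en": 5, "fr": 3, "de": 1}
min_domain = 2

if reject_langs := {
    l for l in lang_domain_count if lang_domain_count[l] < min_domain
}:
    print("rejecting:", sorted(reject_langs))   # rejecting: ['de']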

11 changes: 9 additions & 2 deletions py3langid/train/tokenize.py
@@ -174,8 +174,15 @@ def pass_tokenize(chunk_items):

# Output the counts to the relevant bucket files.
__procname = mp.current_process().name
-b_freq_lang = [gzip.open(os.path.join(p,__procname+'.lang'),'a') for p in __b_dirs]
-b_freq_domain = [gzip.open(os.path.join(p,__procname+'.domain'),'a') for p in __b_dirs]
+b_freq_lang = [
+    gzip.open(os.path.join(p, f'{__procname}.lang'), 'a') for p in __b_dirs
+]
+
+b_freq_domain = [
+    gzip.open(os.path.join(p, f'{__procname}.domain'), 'a')
+    for p in __b_dirs
+]


for term in term_lng_freq:
bucket_index = hash(term) % len(b_freq_lang)