Sourcery refactored master branch #8

Closed · wants to merge 1 commit
57 changes: 31 additions & 26 deletions py3langid/examples/_twokenize.py
@@ -55,7 +55,7 @@ def regex_or(*items):
r"va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)" #TODO: remove obscure country domains?
urlStart2 = r"\b(?:[A-Za-z\d-])+(?:\.[A-Za-z0-9]+){0,3}\." + regex_or(commonTLDs, ccTLDs) + r"(?:\."+ccTLDs+r")?(?=\W|$)"
urlBody = r"(?:[^\.\s<>][^\s<>]*?)?"
urlExtraCrapBeforeEnd = regex_or(punctChars, entity) + "+?"
urlExtraCrapBeforeEnd = f"{regex_or(punctChars, entity)}+?"
Lines 58-195 refactored with the following changes:

This removes the following comments ( why? ):

# Standard version  :) :( :] :D :P
# iOS 'emoji' characters (some smileys, some symbols) [\ue001-\uebbb]
#inspired by http://en.wikipedia.org/wiki/User:Scapler/emoticons#East_Asian_style
# TODO should try a big precompiled lexicon from Wikipedia, Dan Ramage told me (BTO) he does this
#          between this and the Java version. One little hack won't hurt...
# reversed version (: D:  use positive lookbehind to remove "(word):"
# myleott: o.O and O.o are two of the biggest sources of differences
# because eyes on the right side is more ambiguous with the standard usage of : ;

urlEnd = r"(?:\.\.+|[<>]|\s|$)"
url = regex_or(urlStart1, urlStart2) + urlBody + "(?=(?:"+urlExtraCrapBeforeEnd+")?"+urlEnd+")"
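Note: the rewrites in this hunk are purely mechanical; concatenation of regex fragments becomes f-string interpolation, and the resulting pattern string is unchanged. A minimal sketch of the equivalence, using illustrative stand-ins rather than the module's actual fragment definitions:

def regex_or(*items):
    # Plausible stand-in for the module's helper: non-capturing group of alternatives.
    return '(?:' + '|'.join(items) + ')'

# Hypothetical fragment values, not the ones defined in _twokenize.py.
punctChars = r"['.?!,:;]"
entity = r"&(?:amp|lt|gt|quot);"

old_style = regex_or(punctChars, entity) + "+?"    # original concatenation
new_style = f"{regex_or(punctChars, entity)}+?"    # refactored f-string
assert old_style == new_style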

@@ -102,34 +102,36 @@ def regex_or(*items):
s5 = "(?:[.][_]+[.])"
# myleott: in Python the (?i) flag affects the whole expression
#basicface = "(?:(?i)" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5
basicface = "(?:" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5
basicface = f"(?:{bfLeft}{bfCenter}{bfRight})|{s3}|{s4}|{s5}"

eeLeft = r"[\\\ƪԄ\((<>;ヽ\-=~\*]+"
eeRight= u"[\\-=\\);'\u0022<>ʃ)//ノノ丿╯σっµ~\\*]+".encode('utf-8')
eeSymbol = r"[^A-Za-z0-9\s\(\)\*:=-]"
eastEmote = eeLeft + "(?:"+basicface+"|" +eeSymbol+")+" + eeRight
eastEmote = f"{eeLeft}(?:{basicface}|{eeSymbol})+{eeRight}"

oOEmote = r"(?:[oO]" + bfCenter + r"[oO])"
oOEmote = f"(?:[oO]{bfCenter}[oO])"


emoticon = regex_or(
# Standard version :) :( :] :D :P
"(?:>|&gt;)?" + regex_or(normalEyes, wink) + regex_or(noseArea,"[Oo]") + regex_or(tongue+r"(?=\W|$|RT|rt|Rt)", otherMouths+r"(?=\W|$|RT|rt|Rt)", sadMouths, happyMouths),

# reversed version (: D: use positive lookbehind to remove "(word):"
# because eyes on the right side is more ambiguous with the standard usage of : ;
regex_or("(?<=(?: ))", "(?<=(?:^))") + regex_or(sadMouths,happyMouths,otherMouths) + noseArea + regex_or(normalEyes, wink) + "(?:<|&lt;)?",

#inspired by http://en.wikipedia.org/wiki/User:Scapler/emoticons#East_Asian_style
eastEmote.replace("2", "1", 1), basicface,
# iOS 'emoji' characters (some smileys, some symbols) [\ue001-\uebbb]
# TODO should try a big precompiled lexicon from Wikipedia, Dan Ramage told me (BTO) he does this

# myleott: o.O and O.o are two of the biggest sources of differences
# between this and the Java version. One little hack won't hurt...
oOEmote
f"(?:>|&gt;)?{regex_or(normalEyes, wink)}"
+ regex_or(noseArea, "[Oo]")
+ regex_or(
tongue + r"(?=\W|$|RT|rt|Rt)",
otherMouths + r"(?=\W|$|RT|rt|Rt)",
sadMouths,
happyMouths,
),
regex_or("(?<=(?: ))", "(?<=(?:^))")
+ regex_or(sadMouths, happyMouths, otherMouths)
+ noseArea
+ regex_or(normalEyes, wink)
+ "(?:<|&lt;)?",
eastEmote.replace("2", "1", 1),
basicface,
oOEmote,
)


Hearts = "(?:<+/?3+)+" #the other hearts are in decorations

Arrows = regex_or(r"(?:<*[-―—=]*>+|<+[-―—=]*>*)", u"[\u2190-\u21ff]+".encode('utf-8'))
@@ -188,11 +190,16 @@ def regex_or(*items):
# Note the 'smart quotes' (http://en.wikipedia.org/wiki/Smart_quotes)
#edgePunctChars = r"'\"“”‘’«»{}\(\)\[\]\*&" #add \\p{So}? (symbols)
edgePunctChars = u"'\"“”‘’«»{}\\(\\)\\[\\]\\*&" #add \\p{So}? (symbols)
edgePunct = "[" + edgePunctChars + "]"
edgePunct = f"[{edgePunctChars}]"
notEdgePunct = "[a-zA-Z0-9]" # content characters
offEdge = r"(^|$|:|;|\s|\.|,)" # colon here gets "(hello):" ==> "( hello ):"
EdgePunctLeft = re.compile(offEdge + "("+edgePunct+"+)("+notEdgePunct+")", re.UNICODE)
EdgePunctRight = re.compile("("+notEdgePunct+")("+edgePunct+"+)" + offEdge, re.UNICODE)
EdgePunctLeft = re.compile(
f"{offEdge}({edgePunct}+)({notEdgePunct})", re.UNICODE
)

EdgePunctRight = re.compile(
f"({notEdgePunct})({edgePunct}+){offEdge}", re.UNICODE
)

def splitEdgePunct(input):
input = EdgePunctLeft.sub(r"\1\2 \3", input)
Expand Down Expand Up @@ -230,8 +237,7 @@ def simpleTokenize(text):
# has an even length and no indices are the same
indices = [0]
for (first, second) in badSpans:
indices.append(first)
indices.append(second)
indices.extend((first, second))
Comment on lines -233 to +240
Function simpleTokenize refactored with the following changes:

indices.append(textLength)

# Group the indices and map them to their respective portion of the string
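The two appends in the loop body collapse into a single extend() with a tuple, the change flagged in the comment above; the resulting list is identical. A quick check with hypothetical spans:

badSpans = [(3, 7), (12, 15)]   # hypothetical (start, end) pairs

a = [0]
for (first, second) in badSpans:
    a.append(first)
    a.append(second)

b = [0]
for (first, second) in badSpans:
    b.extend((first, second))

assert a == b == [0, 3, 7, 12, 15]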
@@ -271,8 +277,7 @@ def squeezeWhitespace(input):

# Final pass tokenization based on special patterns
def splitToken(token):
m = Contractions.search(token)
if m:
if m := Contractions.search(token):
Comment on lines -274 to +280
Function splitToken refactored with the following changes:

return [m.group(1), m.group(2)]
return [token]
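splitToken now binds and tests the match in one assignment expression, which requires Python 3.8 or later; both forms return the same value. A self-contained sketch using a stand-in pattern in place of the module's Contractions regex (the real one differs):

import re

# Illustrative stand-in only; not the Contractions pattern defined in _twokenize.py.
Contractions = re.compile(r"(?i)\b(\w+)('s|'re|'ll|n't)\b")

def splitToken(token):
    if m := Contractions.search(token):     # walrus: assign and test in one step
        return [m.group(1), m.group(2)]
    return [token]

assert splitToken("can't") == ["ca", "n't"]
assert splitToken("token") == ["token"]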

5 changes: 1 addition & 4 deletions py3langid/tools/printfeats.py
@@ -22,10 +22,7 @@
n = args.number if args.number is not None else len(w)

def show(feat):
if args.printfeat:
return feat
else:
return repr(feat)
return feat if args.printfeat else repr(feat)
Function show refactored with the following changes:


if args.column is not None:
for key in sorted(w, key=lambda x:w[x][args.column], reverse=True)[:n]:
8 changes: 2 additions & 6 deletions py3langid/train/DFfeatureselect.py
@@ -97,7 +97,7 @@ def ngram_select(doc_count, max_order=MAX_NGRAM_ORDER, tokens_per_order=TOKENS_P
# Work out the set of features to compute IG
features = set()
for i in range(1, max_order+1):
d = dict( (k, doc_count[k]) for k in doc_count if len(k) == i)
d = {k: doc_count[k] for k in doc_count if len(k) == i}
Function ngram_select refactored with the following changes:

features |= set(sorted(d, key=d.get, reverse=True)[:tokens_per_order])
features = sorted(features)
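The dict() call over a generator of pairs becomes a dict comprehension; the selected features are the same, only the syntax is more direct. An illustrative check with a toy doc_count:

doc_count = {b'a': 3, b'ab': 1, b'b': 2}   # hypothetical n-gram -> document frequency
i = 1

old = dict((k, doc_count[k]) for k in doc_count if len(k) == i)
new = {k: doc_count[k] for k in doc_count if len(k) == i}
assert old == new == {b'a': 3, b'b': 2}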

@@ -124,11 +124,7 @@ def ngram_select(doc_count, max_order=MAX_NGRAM_ORDER, tokens_per_order=TOKENS_P
if not(args.tokens) and not(args.tokens_per_order):
args.tokens_per_order = TOKENS_PER_ORDER

if args.features:
feature_path = args.features
else:
feature_path = os.path.join(args.model, 'DFfeats')

feature_path = args.features or os.path.join(args.model, 'DFfeats')
Lines 127-131 refactored with the following changes:

bucketlist_path = os.path.join(args.model, 'bucketlist')

# display paths
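The if/else default collapses to an `or` expression. Both versions test the truthiness of args.features, so behavior is preserved: None (and an empty string) falls through to the DFfeats default. A sketch with a hypothetical helper and argument namespace:

import os
from argparse import Namespace

def resolve_feature_path(args):   # hypothetical helper, for illustration only
    return args.features or os.path.join(args.model, 'DFfeats')

assert resolve_feature_path(Namespace(features=None, model='mymodel')) == os.path.join('mymodel', 'DFfeats')
assert resolve_feature_path(Namespace(features='custom/feats', model='mymodel')) == 'custom/feats'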
15 changes: 5 additions & 10 deletions py3langid/train/IGweight.py
@@ -130,7 +130,7 @@ def pass_IG(bucket):
else:
# binarized event space
# Compute IG binarized with respect to each event
ig = list()
ig = []
Function pass_IG refactored with the following changes:

for event_id in range(num_event):
num_doc = __dist.sum()
prior = numpy.array((num_doc - __dist[event_id], __dist[event_id]), dtype=float) / num_doc
@@ -196,11 +196,7 @@ def read_dist(path):
if not(args.domain or args.lang) or (args.domain and args.lang):
parser.error("exactly one of domain(-d) or language (-l) must be specified")

if args.features:
feature_path = args.features
else:
feature_path = os.path.join(args.model, 'DFfeats')

feature_path = args.features or os.path.join(args.model, 'DFfeats')
Lines 199-224 refactored with the following changes:

bucketlist_path = os.path.join(args.model, 'bucketlist')

if not os.path.exists(feature_path):
@@ -218,10 +214,9 @@ def read_dist(path):
else:
raise ValueError("no event specified")

if args.weights:
weights_path = args.weights
else:
weights_path = os.path.join(args.model, 'IGweights' + suffix + ('.bin' if args.binarize else ''))
weights_path = args.weights or os.path.join(
args.model, f'IGweights{suffix}' + ('.bin' if args.binarize else '')
)

# display paths
print("model path:", args.model )
6 changes: 3 additions & 3 deletions py3langid/train/LDfeatureselect.py
@@ -88,7 +88,7 @@ def select_LD_features(ig_lang, ig_domain, feats_per_lang, ignore_domain=False):

lang_w_path = os.path.join(args.model, 'IGweights.lang.bin')
domain_w_path = os.path.join(args.model, 'IGweights.domain')
feature_path = args.output if args.output else os.path.join(args.model, 'LDfeats')
feature_path = args.output or os.path.join(args.model, 'LDfeats')
Lines 91-104 refactored with the following changes:


# display paths
print("model path:", args.model)
@@ -97,11 +97,11 @@ def select_LD_features(ig_lang, ig_domain, feats_per_lang, ignore_domain=False):
print("feature output path:", feature_path)

lang_w = read_weights(lang_w_path)
domain_w = read_weights(domain_w_path) if not args.no_domain_ig else None
domain_w = None if args.no_domain_ig else read_weights(domain_w_path)

features_per_lang = select_LD_features(lang_w, domain_w, args.feats_per_lang, ignore_domain=args.no_domain_ig)
if args.per_lang:
with open(feature_path + '.perlang', 'w') as f:
with open(f'{feature_path}.perlang', 'w') as f:
writer = csv.writer(f)
for i in range(len(features_per_lang)):
writer.writerow(map(repr,features_per_lang[i]))
30 changes: 10 additions & 20 deletions py3langid/train/NBtrain.py
@@ -58,8 +58,7 @@
def offsets(chunks):
# Work out the path chunk start offsets
chunk_offsets = [0]
for c in chunks:
chunk_offsets.append(chunk_offsets[-1] + len(c))
chunk_offsets.extend(chunk_offsets[-1] + len(c) for c in chunks)
Function offsets refactored with the following changes:

return chunk_offsets
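The append loop becomes an extend() over a generator expression. The cumulative behavior survives because extend() consumes the generator one item at a time, so chunk_offsets[-1] already includes the previously appended offset when the next item is evaluated. A quick check with illustrative chunks:

chunks = [[1, 2, 3], [4], [5, 6]]   # hypothetical path chunks; only len() matters

offsets_loop = [0]                  # original formulation
for c in chunks:
    offsets_loop.append(offsets_loop[-1] + len(c))

offsets_ext = [0]                   # refactored formulation
offsets_ext.extend(offsets_ext[-1] + len(c) for c in chunks)

assert offsets_loop == offsets_ext == [0, 3, 4, 6]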

def state_trace(path):
Expand Down Expand Up @@ -154,8 +153,7 @@ def learn_pc(cm):
@returns nb_pc: log(P(C))
"""
pc = np.log(cm.sum(0))
nb_pc = array.array('d', pc)
return nb_pc
return array.array('d', pc)
Comment on lines -157 to +156
Function learn_pc refactored with the following changes:


def generate_cm(items, num_classes):
"""
@@ -185,7 +183,11 @@ def learn_ptc(paths, tk_nextmove, tk_output, cm, temp_path, args):
chunksize = min(len(paths) / (mp.cpu_count()*2), args.chunksize)

# TODO: Set the output dir
b_dirs = [ tempfile.mkdtemp(prefix="train-",suffix='-bucket', dir=temp_path) for i in range(args.buckets) ]
b_dirs = [
tempfile.mkdtemp(prefix="train-", suffix='-bucket', dir=temp_path)
for _ in range(args.buckets)
]

Comment on lines -188 to +190
Function learn_ptc refactored with the following changes:


output_states = set(tk_output)

@@ -240,21 +242,9 @@ def cleanup():
parser.add_argument("--buckets", type=int, metavar='N', help="distribute features into N buckets", default=NUM_BUCKETS)
args = parser.parse_args()

if args.temp:
temp_path = args.temp
else:
temp_path = os.path.join(args.model, 'buckets')

if args.scanner:
scanner_path = args.scanner
else:
scanner_path = os.path.join(args.model, 'LDfeats.scanner')

if args.output:
output_path = args.output
else:
output_path = os.path.join(args.model, 'model')

temp_path = args.temp or os.path.join(args.model, 'buckets')
scanner_path = args.scanner or os.path.join(args.model, 'LDfeats.scanner')
output_path = args.output or os.path.join(args.model, 'model')
Comment on lines -243 to +247
Lines 243-257 refactored with the following changes:

index_path = os.path.join(args.model, 'paths')
lang_path = os.path.join(args.model, 'lang_index')

14 changes: 6 additions & 8 deletions py3langid/train/common.py
@@ -35,9 +35,10 @@ def chunk(seq, chunksize):
"""
seq_iter = iter(seq)
while True:
chunk = tuple(islice(seq_iter, chunksize))
if not chunk: break
yield chunk
if chunk := tuple(islice(seq_iter, chunksize)):
yield chunk
else:
break
Comment on lines -38 to +41
Function chunk refactored with the following changes:
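For reference, the refactored generator from the hunk above in runnable form, with the walrus operator (Python 3.8+) and an illustrative usage check:

from itertools import islice

def chunk(seq, chunksize):
    # Yield successive tuples of at most chunksize items; stop at the first empty slice.
    seq_iter = iter(seq)
    while True:
        if chunk := tuple(islice(seq_iter, chunksize)):
            yield chunk
        else:
            break

assert list(chunk(range(7), 3)) == [(0, 1, 2), (3, 4, 5), (6,)]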


def unmarshal_iter(path):
"""
@@ -131,13 +132,10 @@ def MapPool(processes=None, initializer=None, initargs=None, maxtasksperchild=No

if processes > 1:
with closing( mp.Pool(processes, initializer, initargs, maxtasksperchild)) as pool:
f = lambda fn, chunks: pool.imap_unordered(fn, chunks, chunksize=chunksize)
yield f
yield lambda fn, chunks: pool.imap_unordered(fn, chunks, chunksize=chunksize)
else:
if initializer is not None:
initializer(*initargs)
f = imap
yield f

yield imap
Comment on lines -134 to +139
Function MapPool refactored with the following changes:

if processes > 1:
pool.join()
21 changes: 8 additions & 13 deletions py3langid/train/index.py
@@ -91,10 +91,10 @@ def __init__(self, root, min_domain=MIN_DOMAIN, proportion=TRAIN_PROP, langs=Non
self.domain_index = defaultdict(Enumerator())
else:
# pre-specified domain set
self.domain_index = dict((k,v) for v,k in enumerate(domains))
self.domain_index = {k: v for v,k in enumerate(domains)}

self.coverage_index = defaultdict(set)
self.items = list()
self.items = []
Comment on lines -94 to +97
Function CorpusIndexer.__init__ refactored with the following changes:


self.index(root)
self.prune_min_domain(self.min_domain)
@@ -139,15 +139,14 @@ def prune_min_domain(self, min_domain):
for langs in self.coverage_index.values():
for lang in langs:
lang_domain_count[lang] += 1
reject_langs = set( l for l in lang_domain_count if lang_domain_count[l] < min_domain)

# Remove the languages from the indexer
if reject_langs:
if reject_langs := {
l for l in lang_domain_count if lang_domain_count[l] < min_domain
}:
#print "reject (<{0} domains): {1}".format(min_domain, sorted(reject_langs))
reject_ids = set(self.lang_index[l] for l in reject_langs)
reject_ids = {self.lang_index[l] for l in reject_langs}

new_lang_index = defaultdict(Enumerator())
lm = dict()
lm = {}
Comment on lines -142 to +149
Function CorpusIndexer.prune_min_domain refactored with the following changes:

This removes the following comments ( why? ):

# Remove the languages from the indexer

for k,v in self.lang_index.items():
if v not in reject_ids:
new_id = new_lang_index[k]
@@ -215,11 +214,7 @@ def paths(self):
args = parser.parse_args()

corpus_name = os.path.basename(args.corpus)
if args.model:
model_dir = args.model
else:
model_dir = os.path.join('.', corpus_name+'.model')

model_dir = args.model or os.path.join('.', f'{corpus_name}.model')
Comment on lines -218 to +217
Lines 218-222 refactored with the following changes:

makedir(model_dir)

langs_path = os.path.join(model_dir, 'lang_index')