From b186a968dc7e0ac52f3303d59758f65f5a2a3d98 Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Tue, 22 Feb 2022 18:49:10 +0100
Subject: [PATCH 01/32] Fixed bug in guess mode

---
 codext/__common__.py | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/codext/__common__.py b/codext/__common__.py
index 89522e8..9b7936f 100644
--- a/codext/__common__.py
+++ b/codext/__common__.py
@@ -1243,11 +1243,9 @@ def __develop(encodings):
 
 
 def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, codec_categories, exclude, result, found=(),
-            stop=True, show=False, scoring_heuristic=False, extended=False, debug=False, regex=False):
+            stop=True, show=False, scoring_heuristic=False, extended=False, debug=False):
     """ Perform a breadth-first tree search using a ranking logic to select and prune the list of codecs. """
     if depth > min_depth and stop_func(input):
-        if regex:
-            stop = True
         if not stop and (show or debug) and found not in result:
             s = repr(input)
             s = s[2:-1] if s.startswith("b'") and s.endswith("'") else s
@@ -1288,7 +1286,7 @@ def expand(items, descr=None, transform=None):
         if debug:
             print("[*] Depth %0{}d/%d: %s".format(len(str(max_depth))) % (depth+1, max_depth, encoding))
         __guess(input, new_input, stop_func, depth+1, max_depth, min_depth, codec_categories, exclude, result,
-                found + (encoding, ), stop, show, scoring_heuristic, extended, debug, regex)
+                found + (encoding, ), stop, show, scoring_heuristic, extended, debug)
 
 
 def __rank(prev_input, input, prev_encoding, codecs, heuristic=False, extended=False, yield_score=False):
@@ -1371,8 +1369,8 @@ def __score(prev_input, input, prev_encoding, codec, heuristic=False, extended=F
                         expf = expf(f, encoding)
                     except TypeError:
                         expf = expf(f)
-                elif isinstance(expf, (int, float)):
-                    epxf = f - .1 <= expf <= f + .1
+                if isinstance(expf, (int, float)):
+                    expf = (f - .1 <= expf <= f + .1)
                 elif isinstance(expf, (tuple, list)) and len(expf) == 2:
                     expf = f - expf[1] <= expf[0] <= expf[1] + .1
                 s += [-1., .1][expf]
@@ -1414,17 +1412,15 @@ def guess(input, stop_func=stopfunc.default, min_depth=0, max_depth=5, codec_cat
     if len(found) > 0:
         for encoding in found:
             input = decode(input, encoding)
-    regex = False
     if isinstance(stop_func, string_types):
         stop_func = stopfunc.regex(stop_func)
-        regex = True
     result = {}
     if len(input) > 0:
         try:
             # breadth-first search
             for d in range(max_depth):
                 __guess("", input, stop_func, 0, d+1, min_depth, codec_categories, exclude, result, tuple(found), stop,
-                        show, scoring_heuristic, extended, debug, regex)
+                        show, scoring_heuristic, extended, debug)
                 if stop and len(result) > 0:
                     return result
         except KeyboardInterrupt:

From b3253142fa93bc5d089788c26e58512940fbf367 Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Wed, 23 Feb 2022 00:03:22 +0100
Subject: [PATCH 02/32] Fixed minor issues

---
 codext/__common__.py | 12 ++++++++----
 codext/__init__.py   |  4 ++--
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/codext/__common__.py b/codext/__common__.py
index 9b7936f..35d1fc5 100644
--- a/codext/__common__.py
+++ b/codext/__common__.py
@@ -1303,9 +1303,13 @@ class _Text(object):
     __slots__ = ["entropy", "lcharset", "len", "padding", "printables"]
     
     def __init__(self, text, pad_char=None):
+        c = text[-1]
+        last_char = c if isinstance(c, int) else ord(c)
+        self.padding = pad_char is not None and last_char == ord(pad_char)
+        if self.padding:
+            text = text.rstrip(pad_char)
         self.len = len(text)
         self.lcharset = len(set(text))
-        self.padding = pad_char is not None and text[-1] in [pad_char, b(pad_char)]
         self.printables = float(len([c for c in text if (chr(c) if isinstance(c, int) else c) in printable])) / self.len
         self.entropy = entropy(text)
 
@@ -1363,16 +1367,16 @@ def __score(prev_input, input, prev_encoding, codec, heuristic=False, extended=F
                     s += .1
             expf = sc.get('expansion_factor', 1.)
             if expf:
-                f = float(len(new_input)) / obj.len
+                f = obj.len / float(len(new_input))  # expansion while encoding => at decoding: 1/f
                 if isinstance(expf, type(lambda: None)):
                     try:  # this case allows to consider the current encoding name from the current codec
                         expf = expf(f, encoding)
                     except TypeError:
                         expf = expf(f)
                 if isinstance(expf, (int, float)):
-                    expf = (f - .1 <= expf <= f + .1)
+                    expf = (1/f - .1 <= 1/expf <= 1/f + .1)
                 elif isinstance(expf, (tuple, list)) and len(expf) == 2:
-                    expf = f - expf[1] <= expf[0] <= expf[1] + .1
+                    expf = 1/f - expf[1] <= 1/expf[0] <= 1/f + expf[1]
                 s += [-1., .1][expf]
             # afterwards, if the input text has an entropy close to the expected one, give a bonus weighted on the
             #  number of input characters to take bad entropies of shorter strings into account
diff --git a/codext/__init__.py b/codext/__init__.py
index 3b98af4..0fa49d5 100644
--- a/codext/__init__.py
+++ b/codext/__init__.py
@@ -88,7 +88,7 @@ def main():
         "echo -en \"test\" | codext encode base64 gzip | codext guess gzip -c base",
     ])
     parser = argparse.ArgumentParser(description=descr, epilog=examples, formatter_class=argparse.RawTextHelpFormatter)
-    sparsers = parser.add_subparsers(dest="command", help="command to be executed")
+    sparsers = parser.add_subparsers(dest="command", required=True, help="command to be executed")
     parser.add_argument("-i", "--input-file", dest="infile", help="input file (if none, take stdin as input)")
     parser.add_argument("-o", "--output-file", dest="outfile", help="output file (if none, display result to stdout)")
     parser.add_argument("-s", "--strip-newlines", action="store_true", dest="strip",
@@ -140,7 +140,7 @@ def main():
     search = sparsers.add_parser("search", help="search for codecs")
     search.add_argument("pattern", nargs="+", help="encoding pattern to search")
     listi = sparsers.add_parser("list", help="list items")
-    lsparsers = listi.add_subparsers(dest="type", help="type of item to be listed")
+    lsparsers = listi.add_subparsers(dest="type", required=True, help="type of item to be listed")
     liste = lsparsers.add_parser("encodings", help="list encodings")
     liste.add_argument("category", nargs="*", help="selected categories")
     listm = lsparsers.add_parser("macros", help="list macros")

From 3cf1f5dbd67b78a771b55885d5e2c20eaf31fd81 Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Wed, 23 Feb 2022 00:03:29 +0100
Subject: [PATCH 03/32] Refined tests

---
 tests/test_base.py   | 14 ++++++++------
 tests/test_common.py |  4 ++--
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/tests/test_base.py b/tests/test_base.py
index 33eff65..7b3dae0 100644
--- a/tests/test_base.py
+++ b/tests/test_base.py
@@ -44,6 +44,7 @@ def test_codec_base1(self):
         for i in range(3):
             self.assertIsNotNone(codecs.encode(i * C, "base1"))
         self.assertRaises(ValueError, codecs.encode, 4 * C, "unary")
+        self.assertEqual(codecs.decode("AAAAA", "base1"), "\x05")
     
     def test_codec_base2(self):
         STR = "test"
@@ -181,7 +182,7 @@ def test_codec_base62(self):
             self.assertEqual(codecs.decode(b(b62), enc), b(STR))
     
     def test_codec_base64(self):
-        for b64, enc in zip(["dGhpcyBpcyBhIHRlc3QK", "T6XfSo1fSo1X87HbStG="], ["base64", "base64-inv"]):
+        for b64, enc in zip(["dGhpcyBpcyBhIHRlc3Q=", "T6XfSo1fSo1X87HbStG="], ["base64", "base64-inv"]):
             self.assertEqual(codecs.encode(STR, enc), b64)
             self.assertEqual(codecs.encode(b(STR), enc), b(b64))
             self.assertEqual(codecs.decode(b64, enc), STR)
@@ -224,11 +225,12 @@ def test_base_main(self):
         tfile = "test-base-main.txt"
         with open(tfile, 'w') as f:
             f.write("This is a long test string for the sake of causing line wrapping based on default parameters.")
-        sys.argv = [tmp[0], tfile]
-        for m in main32, main64url:
-            self.assertEqual(m(), 0)
-        sys.argv = [tmp[0], tfile, "-d"]
-        self.assertEqual(main2(), 1)
+        for swap_arg in [[], ["-s"]]:
+            sys.argv = [tmp[0], tfile] + swap_arg
+            for m in main32, main64url:
+                self.assertEqual(m(), 0)
+            sys.argv = [tmp[0], tfile, "-d"] + swap_arg
+            self.assertEqual(main2(), 1)
         os.remove(tfile)
         sys.argv[:] = tmp
 
diff --git a/tests/test_common.py b/tests/test_common.py
index ec57aaa..a35abfd 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -150,7 +150,7 @@ def test_guess_decode(self):
         self.assertIsNone(codext.stopfunc._reload_lang())
         _l = lambda d: list(d.items())[0][1] if len(d) > 0 else None
         codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1),
-                   "test", no_error=True, bonus_func=lambda *a: True, penalty=-.5)
+                   "test", padding_char="=", no_error=True, bonus_func=lambda *a: True, penalty=-.5)
         self.assertIn("test-codec", codext.list_encodings("test"))
         self.assertEqual(codext.decode("TEST=", "test"), "TEST")
         self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, codec_categories="test", max_depth=2,
@@ -204,7 +204,7 @@ def test_guess_decode(self):
     
     def test_rank_input(self):
         codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1),
-                   "test", no_error=True, bonus_func=lambda *a: True, penalty=-.5)
+                   "test", padding_char="=", no_error=True, bonus_func=lambda *a: True, penalty=-.5)
         STR = "This is a test string !"
         ENC = codext.encode(STR, "base64")
         self.assertTrue(len(codext.rank(ENC)) > 20)

From cb6656845940db1386a563e08ea13ac918f51bb5 Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Wed, 23 Feb 2022 00:03:46 +0100
Subject: [PATCH 04/32] New release

---
 codext/VERSION.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/codext/VERSION.txt b/codext/VERSION.txt
index 6b89d58..81f3632 100644
--- a/codext/VERSION.txt
+++ b/codext/VERSION.txt
@@ -1 +1 @@
-1.12.2
+1.12.3

From 544e1cc39a7b4e793864087d3d2cf4cc77d73038 Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Sat, 26 Feb 2022 16:01:45 +0100
Subject: [PATCH 05/32] Fixed multiple base codecs

---
 codext/base/base100.py | 2 +-
 codext/base/base122.py | 4 ++--
 codext/base/base91.py  | 4 ++--
 codext/base/baseN.py   | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/codext/base/base100.py b/codext/base/base100.py
index db0b3c9..f5faa1d 100755
--- a/codext/base/base100.py
+++ b/codext/base/base100.py
@@ -37,7 +37,7 @@ def base100_encode(input, errors="strict"):
         return bytes(r), len(input)
     
     def base100_decode(input, errors="strict"):
-        input = b(input)
+        input = b(_stripl(input, True, True))
         if errors == "ignore":
             input = input.replace(b"\n", "")
         if len(input) % 4 != 0:
diff --git a/codext/base/base122.py b/codext/base/base122.py
index 33a42ad..f580ff8 100755
--- a/codext/base/base122.py
+++ b/codext/base/base122.py
@@ -98,9 +98,9 @@ def _get_7bits(currB, bob, B, decoded):
                 currB, bob = _get_7bits(currB, bob, input[i] & 127, r)
             else:
                 currB, bob = _get_7bits(currB, bob, input[i], r)
-        return "".join(map(chr, r)), len(input)
+        return "".join(map(chr, r)).rstrip("\0"), len(input)
 
 
 add("base122", base122_encode, base122_decode, r"^base[-_]?122$", expansion_factor=1.085)
-main122 = main(122, "<http://blog.kevinalbs.com/base122>")
+main122 = main(122, "<http://blog.kevinalbs.com/base122>", wrap=False)
 
diff --git a/codext/base/base91.py b/codext/base/base91.py
index 6f0d6ec..21a21d5 100755
--- a/codext/base/base91.py
+++ b/codext/base/base91.py
@@ -72,7 +72,7 @@ def encode(text, errors="strict"):
 def base91_decode(mode):
     b91 = {c: i for i, c in enumerate(_get_charset(B91, mode))}
     def decode(text, errors="strict"):
-        t, s, bits, alt = b(text), "", "", re.search(r'[-_]alt(ernate)?$', mode) is not None
+        t, s, bits, alt = b(_stripl(text, True, True)), "", "", re.search(r'[-_]alt(ernate)?$', mode) is not None
         ehandler = handle_error("base91", errors, decode=True)
         for i in range(0, len(t), 2):
             try:
@@ -103,7 +103,7 @@ def decode(text, errors="strict"):
                 bits = bits[8:]
         elif not alt and len(bits) > 0 and not set(bits) == {"0"}:
             s += chr(int(bits, 2))
-        return s, len(t)
+        return s.rstrip("\0"), len(t)
     return decode
 
 
diff --git a/codext/base/baseN.py b/codext/base/baseN.py
index f935bf9..3c63453 100755
--- a/codext/base/baseN.py
+++ b/codext/base/baseN.py
@@ -82,7 +82,7 @@
     r'[-_]?(fl|flickr|short[-]?url|url)$': "123456789abcdefghijkmnopqrstuvwxyzABCDEFGHJKLMNPQRSTUVWXYZ",
 }
 base(B58, r"^base[-_]?58(|[-_](bc|bitcoin|rp|ripple|fl|flickr|short[-]?url|url))$",
-     guess=["base58", "base58-ripple", "base58-flickr"])
+     guess=["base58-bitcoin", "base58-ripple", "base58-flickr"])
 main58bc = main(58, "<https://en.bitcoinwiki.org/wiki/Base58>", "bitcoin")
 main58rp = main(58, "<https://en.bitcoinwiki.org/wiki/Base58>", "ripple")
 main58fl = main(58, "<https://en.bitcoinwiki.org/wiki/Base58>", "flickr")
@@ -119,7 +119,7 @@
 
 B128 = {r'': "".join(chr(i) for i in range(128))}
 base(B128, r"^base[-_]?128$", padding_char="=")
-main128 = main(128, None, False)
+main128 = main(128, None, False, wrap=False)
 
 
 # generic base encodings, to be added after all others as they have the precedence

From 95f4b80825e101e3e44cc7b5d961a993c5736cce Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Sat, 26 Feb 2022 16:02:39 +0100
Subject: [PATCH 06/32] New release

---
 codext/VERSION.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/codext/VERSION.txt b/codext/VERSION.txt
index 81f3632..89c881b 100644
--- a/codext/VERSION.txt
+++ b/codext/VERSION.txt
@@ -1 +1 @@
-1.12.3
+1.12.4

From f8bd7b741c181b2789f4cf4785e10ac0e67c237c Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Sun, 27 Feb 2022 20:24:29 +0100
Subject: [PATCH 07/32] Fixed codec: shift

---
 codext/crypto/shift.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/codext/crypto/shift.py b/codext/crypto/shift.py
index 89ca992..599e60d 100755
--- a/codext/crypto/shift.py
+++ b/codext/crypto/shift.py
@@ -19,12 +19,12 @@
 
 
 def ord_shift_decode(i):
-    return ord_shift_encode(-i)
+    return ord_shift_encode(-int(i))
 
 
 def ord_shift_encode(i):
     def encode(text, errors="strict"):
-        r = "".join(chr((ord(c) + i) % 256) for c in text)
+        r = "".join(chr((ord(c) + int(i)) % 256) for c in text)
         return r, len(r)
     return encode
 

From d9cc79ae047cf9d384fd0068006d43b52a73771a Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Sun, 27 Feb 2022 20:24:42 +0100
Subject: [PATCH 08/32] Fixed codec: scytale

---
 codext/crypto/scytale.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/codext/crypto/scytale.py b/codext/crypto/scytale.py
index 32e6e96..7490241 100755
--- a/codext/crypto/scytale.py
+++ b/codext/crypto/scytale.py
@@ -17,7 +17,7 @@
     'enc(scytale2|scytale-2|scytale_2)':   {'this is a test': "ti satshsi  et"},
     'enc(scytale5|scytale-5|scytale_5)':   {'this is a test': "tithsei ssat  "},
 }
-__guess__ = ["scytale-%d" % i for i in range(10)]
+__guess__ = ["scytale-%d" % i for i in range(1, 10)]
 
 
 PADDING_CHAR = ""

From 1e31eab38491fc5768383a21332a06f974c5f1e6 Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Sun, 27 Feb 2022 20:25:27 +0100
Subject: [PATCH 09/32] Fixed bug in base

---
 codext/base/_base.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/codext/base/_base.py b/codext/base/_base.py
index 05aaed0..fce8b9a 100755
--- a/codext/base/_base.py
+++ b/codext/base/_base.py
@@ -191,7 +191,11 @@ def _decode(input, errors="strict"):
     kwargs['printables_rate'] = float(len([c for c in cs if c in printable])) / len(cs)
     kwargs['expansion_factor'] = kwargs.pop('expansion_factor', (EXPANSION_FACTOR(n), .05))
     n = "base{}".format(n) if name is None else name
-    kwargs['guess'] = kwargs.get('guess', [n])
+    try:
+        g = [n, n + "-inv"] if "[-_]inv(erted)?$" in charset.keys() else [n]
+    except AttributeError:
+        g = [n]
+    kwargs['guess'] = kwargs.get('guess', g)
     add(n, encode, decode, pattern, entropy=nb, **kwargs)
 
 

From d09dd0be29ffb3ce1a42c5e42eb1c58e0e3e5faf Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Sun, 27 Feb 2022 20:25:41 +0100
Subject: [PATCH 10/32] Improved unbase tool

---
 codext/base/__init__.py | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/codext/base/__init__.py b/codext/base/__init__.py
index 5859f6b..8c0d220 100755
--- a/codext/base/__init__.py
+++ b/codext/base/__init__.py
@@ -19,7 +19,7 @@ def main():
 With no FILE, or when FILE is -, read standard input.
 
 Optional arguments:
-  -e, --extended        also consider generic base codecs while guess-decoding
+  -E, --extended        also consider generic base codecs while guess-decoding
   -f, --stop-function   set the result chceking function (default: text)
                          format: printables|text|flag|lang_[bigram]
   -M, --max-depth       maximum codec search depth (default: 5)
@@ -36,28 +36,23 @@ def main():
 """
     parser = ArgumentParser(description=descr, formatter_class=RawTextHelpFormatter, add_help=False)
     parser.format_help = MethodType(lambda s: s.description, parser)
+    group = parser.add_mutually_exclusive_group()
     parser.add_argument("file", nargs="?")
-    parser.add_argument("-e", "--extended", action="store_true")
-    parser.add_argument("-f", "--stop-function", default="text")
+    parser.add_argument("-E", "--extended", action="store_true")
+    group.add_argument("-f", "--stop-function", default="text")
     parser.add_argument("-M", "--max-depth", type=int, default=10)
     parser.add_argument("-m", "--min-depth", type=int, default=0)
-    parser.add_argument("-p", "--pattern")
+    group.add_argument("-p", "--pattern")
     parser.add_argument("-s", "--show", action="store_true")
     parser.add_argument("--help", action="help")
     parser.add_argument("--version", action="version")
     parser.add_argument("--verbose", action="store_true")
     parser.version = "CodExt " + __version__
     args = parser.parse_args()
-    excl, s = [["base%d-generic" % i for i in range(2, 256)], []][args.extended], args.stop_function
-    if re.match(r"lang_[a-z]{2}$", s) and all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)):
-        stopfunc._reload_lang(stopfunc.LANG_BACKEND)
-    #TODO: validate args.stop_function
-    #TODO: make --stop-function and --pattern mutually exclusive
-    sfunc = getattr(stopfunc, s, s)
-    c = _input(args.file)
+    c, e = _input(args.file), [["base%d-generic" % i for i in range(2, 256)], []][args.extended]
     c = c.rstrip("\r\n") if isinstance(c, str) else c.rstrip(b"\r\n")
-    r = codecs.guess(c, sfunc, 0, args.max_depth, exclude=tuple(excl), codec_categories="base",
-                     stop=False, show=args.verbose, scoring_heuristic=False, debug=args.verbose)
+    r = codecs.guess(c, stopfunc._validate(args.stop_function), 0, args.max_depth, "base", tuple(e), stop=False,
+                     show=args.verbose, debug=args.verbose)
     if len(r) == 0:
         print("Could not decode :-(")
         return 0

From 0d132317fc4d214311d1f81071bbf916437d3532 Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Sun, 27 Feb 2022 20:26:07 +0100
Subject: [PATCH 11/32] Improved codext tool

---
 codext/__init__.py | 115 +++++++++++++++++++++++++--------------------
 1 file changed, 65 insertions(+), 50 deletions(-)

diff --git a/codext/__init__.py b/codext/__init__.py
index 0fa49d5..692ab48 100644
--- a/codext/__init__.py
+++ b/codext/__init__.py
@@ -33,12 +33,23 @@
           pattern=r"^uu(?:[-_]encode|codec)?$", add_to_codecs=True, category="native")
 
 
-def __literal_eval(o):
-    """ Non-failing ast.literal_eval alias function. """
-    try:
-        return literal_eval(str(o))
-    except ValueError:
-        return literal_eval("'" + str(o) + "'")
+def __format_list(items, include=True):
+    if items is None:
+        return
+    d = {-1: list_encodings() if include else []}
+    for n, i in enumerate(items):
+        try:
+            depth, i = i.split(":")
+            depth = int(depth.strip().replace("~", "-"))
+            if depth < 0:
+                depth = -1
+        except ValueError:
+            if n == 0:
+                d[-1] = []
+            depth = -1
+        d.setdefault(depth, [])
+        d[depth].append(i.strip())
+    return d
 
 
 def __print_tabular(lst, space=4):
@@ -70,6 +81,19 @@ def __print_tabular(lst, space=4):
 
 def main():
     import argparse, os
+
+    class _CustomFormatter(argparse.RawTextHelpFormatter):
+        def __init__(self, prog, **kwargs):
+            kwargs['max_help_position'] = 32
+            super(_CustomFormatter, self).__init__(prog, **kwargs)
+        
+        def _format_action_invocation(self, action):
+            if not action.option_strings:
+                metavar, = self._metavar_formatter(action, action.dest)(1)
+                return metavar
+            else:
+                return ", ".join(action.option_strings)
+    
     descr = "Codecs Extension (CodExt) {}\n\nAuthor   : {} ({})\nCopyright: {}\nLicense  : {}\nSource   : {}\n" \
             "\nThis tool allows to encode/decode input strings/files with an extended set of codecs.\n\n" \
             .format(__version__, __author__, __email__, __copyright__, __license__, __source__)
@@ -87,62 +111,68 @@ def main():
         "echo -en \"test\" | codext encode base64 gzip | codext guess",
         "echo -en \"test\" | codext encode base64 gzip | codext guess gzip -c base",
     ])
-    parser = argparse.ArgumentParser(description=descr, epilog=examples, formatter_class=argparse.RawTextHelpFormatter)
-    sparsers = parser.add_subparsers(dest="command", required=True, help="command to be executed")
+    kw = {'formatter_class': _CustomFormatter}
+    parser = argparse.ArgumentParser(description=descr, epilog=examples, **kw)
+    kw2 = {'required': True} if PY3 else {}
+    sparsers = parser.add_subparsers(dest="command", help="command to be executed", **kw2)
     parser.add_argument("-i", "--input-file", dest="infile", help="input file (if none, take stdin as input)")
     parser.add_argument("-o", "--output-file", dest="outfile", help="output file (if none, display result to stdout)")
     parser.add_argument("-s", "--strip-newlines", action="store_true", dest="strip",
                         help="strip newlines from input (default: False)")
-    encode = sparsers.add_parser("encode", help="encode input using the specified codecs")
+    encode = sparsers.add_parser("encode", help="encode input using the specified codecs", **kw)
     encode.add_argument("encoding", nargs="+", help="list of encodings to apply")
     encode.add_argument("-e", "--errors", default="strict", choices=["ignore", "leave", "replace", "strict"],
                         help="error handling (default: strict)")
-    decode = sparsers.add_parser("decode", help="decode input using the specified codecs")
+    decode = sparsers.add_parser("decode", help="decode input using the specified codecs", **kw)
     decode.add_argument("encoding", nargs="+", help="list of encodings to apply")
     decode.add_argument("-e", "--errors", default="strict", choices=["ignore", "leave", "replace", "strict"],
                         help="error handling (default: strict)")
-    guess = sparsers.add_parser("guess", help="try guessing the decoding codecs")
+    guess = sparsers.add_parser("guess", help="try guessing the decoding codecs", **kw)
     guess.add_argument("encoding", nargs="*", help="list of known encodings to apply (default: none)")
-    guess.add_argument("-c", "--codec-categories", nargs="*", help="codec categories to be included in the search ; "
-                                                                   "format: string|tuple")
-    guess.add_argument("-d", "--min-depth", default=0, type=int, help="minimum codec search depth before triggering "
-                                                                "results (default: 0)")
-    guess.add_argument("-D", "--max-depth", default=5, type=int, help="maximum codec search depth (default: 5)")
-    guess.add_argument("-e", "--exclude-codecs", nargs="*", help="codecs to be explicitely not used ; "
-                                                                 "format: string|tuple")
+    guess.add_argument("-e", "--exclude", nargs="*", action="extend", metavar="CAT|COD|ENC",
+                       help="categories, codecs and encodings to be explicitely not used ;\n "
+                            "format: [category|codec|encoding] OR depth:[category|codec|encoding]")
     guess.add_argument("-E", "--extended", action="store_true",
                        help="while using the scoring heuristic, also consider null scores (default: False)")
     lng = "lang_%s" % LANG
     def_func = lng if getattr(stopfunc, lng, None) else "text"
-    guess.add_argument("-f", "--stop-function", default=def_func, help="result checking function (default: %s) ; "
-                       "format: printables|text|flag|lang_[bigram]|[regex]\nNB: [regex] is case-sensitive ; add -i to "
-                       "force it as case-insensitive or add '(?i)' in front of the expression" % def_func)
-    guess.add_argument("-i", "--case-insensitive", dest="icase", action="store_true",
-                       help="while using the regex stop function, set it as case-insensitive (default: False)")
+    guess.add_argument("-f", "--stop-function", default=def_func, metavar="FUNC", help="result checking function "
+                       "(default: %s) ; format: printables|text|flag|lang_[bigram]|[regex]\nNB: [regex] is case-"
+                       "sensitive ; add -i to force it as case-insensitive or add '(?i)' in front of the expression"
+                       % def_func)
     guess.add_argument("-H", "--no-heuristic", action="store_true", help="DO NOT use the scoring heuristic ; slows down"
                        " the search but may be more accurate (default: False)")
+    guess.add_argument("-i", "--include", nargs="*", action="extend", metavar="CAT|COD|ENC",
+                       help="categories, codecs and encodings to be explicitely used ;\n "
+                            "format: [category|codec|encoding] OR depth:[category|codec|encoding]")
+    guess.add_argument("-I", "--case-insensitive", dest="icase", action="store_true",
+                       help="while using the regex stop function, set it as case-insensitive (default: False)")
     if len(stopfunc.LANG_BACKENDS) > 0:
         _lb = stopfunc.LANG_BACKEND
         guess.add_argument("-l", "--lang-backend", default=_lb, choices=stopfunc.LANG_BACKENDS + ["none"],
                            help="natural language detection backend (default: %s)" % _lb)
+    guess.add_argument("-m", "--min-depth", default=0, type=int, metavar="INT",
+                       help="minimum codec search depth before triggering results (default: 0)")
+    guess.add_argument("-M", "--max-depth", default=5, type=int, metavar="INT",
+                       help="maximum codec search depth (default: 5)")
     guess.add_argument("-s", "--do-not-stop", action="store_true",
                        help="do not stop if a valid output is found (default: False)")
     guess.add_argument("-v", "--verbose", action="store_true",
                        help="show guessing information and steps (default: False)")
-    rank = sparsers.add_parser("rank", help="rank the most probable encodings based on the given input")
-    rank.add_argument("-c", "--codec-categories", help="codec categories to be included in the search ; "
-                                                       "format: string|tuple|list(strings|tuples)")
-    rank.add_argument("-e", "--exclude-codecs", help="codecs to be explicitely not used ; "
-                                                     "format: string|tuple|list(strings|tuples)")
+    rank = sparsers.add_parser("rank", help="rank the most probable encodings based on the given input", **kw)
+    rank.add_argument("-c", "--codec-categories", nargs="*", action="extend", metavar="CATEGORY",
+                      help="codec categories to be included in the search ; format: string|tuple|list(strings|tuples)")
+    rank.add_argument("-e", "--exclude-codecs", nargs="*", action="extend", metavar="CODEC",
+                      help="codecs to be explicitely not used ; format: string|tuple|list(strings|tuples)")
     rank.add_argument("-E", "--extended", action="store_true",
                       help="while using the scoring heuristic, also consider null scores (default: False)")
     rank.add_argument("-l", "--limit", type=int, default=10, help="limit the number of displayed results")
     search = sparsers.add_parser("search", help="search for codecs")
     search.add_argument("pattern", nargs="+", help="encoding pattern to search")
     listi = sparsers.add_parser("list", help="list items")
-    lsparsers = listi.add_subparsers(dest="type", required=True, help="type of item to be listed")
+    lsparsers = listi.add_subparsers(dest="type", help="type of item to be listed", **kw2)
     liste = lsparsers.add_parser("encodings", help="list encodings")
-    liste.add_argument("category", nargs="*", help="selected categories")
+    liste.add_argument("category", nargs="+", help="selected categories")
     listm = lsparsers.add_parser("macros", help="list macros")
     addm = sparsers.add_parser("add-macro", help="add a macro to the registry")
     addm.add_argument("name", help="macro's name")
@@ -150,15 +180,7 @@ def main():
     remm = sparsers.add_parser("remove-macro", help="remove a macro from the registry")
     remm.add_argument("name", help="macro's name")
     args = parser.parse_args()
-    try:
-        args.codec_categories = _lst(map(__literal_eval, args.codec_categories))
-    except (AttributeError, TypeError):
-        pass
-    try:
-        args.exclude_codecs = _lst(map(__literal_eval, args.exclude_codecs))
-    except (AttributeError, TypeError):
-        pass
-    #print(args.codec_categories, args.exclude_codecs)
+    args.include, args.exclude = __format_list(args.include), __format_list(args.exclude, False)
     try:
         # if a search pattern is given, only handle it
         if args.command == "search":
@@ -211,17 +233,9 @@ def main():
                all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)):
                 stopfunc._reload_lang(lb)
             r = codecs.guess(c,
-                             getattr(stopfunc, s, ["", "(?i)"][args.icase] + s),
-                             args.min_depth,
-                             args.max_depth,
-                             args.codec_categories,
-                             args.exclude_codecs,
-                             args.encoding,
-                             not args.do_not_stop,
-                             True,  # show
-                             not args.no_heuristic,
-                             args.extended,
-                             args.verbose)
+                             getattr(stopfunc, s, ["", "(?i)"][args.icase] + s), args.min_depth, args.max_depth,
+                             args.include, args.exclude, args.encoding, not args.do_not_stop, True,  # show
+                             not args.no_heuristic, args.extended, args.verbose)
             for i, o in enumerate(r.items()):
                 e, out = o
                 if len(e) > 0:
@@ -238,6 +252,7 @@ def main():
                 s = "[+] %.5f: %s" % (i[0], e)
                 print(s if len(s) <= 80 else s[:77] + "...")
     except Exception as e:
+        raise e
         m = str(e)
         print("codext: " + m[0].lower() + m[1:])
 

From ad01045ad33093658cc19fe48b7deb6bddfc4c47 Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Sun, 27 Feb 2022 20:26:18 +0100
Subject: [PATCH 12/32] Improved guess performance

---
 .coveragerc          |   7 +-
 codext/__common__.py | 382 +++++++++++++++++++++++++------------------
 tests/test_common.py |  22 +--
 3 files changed, 237 insertions(+), 174 deletions(-)

diff --git a/.coveragerc b/.coveragerc
index 0baf7fa..4ccc970 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -11,7 +11,7 @@ exclude_lines =
     def main\(\)\:
     def __stdin_pipe\(\)\:
     for line in __stdin_pipe\(\)\:
-    def __literal_eval\(o\)\:
+    def __format_list\(items, include\=True\)\:
     def __print_tabular\(lst, space\=4\)\:
     except ImportError:
     except NameError:
@@ -20,3 +20,8 @@ exclude_lines =
     if PY3
     def encode\(self, input, final\=False\)\:
     def decode\(self, input, final\=False\)\:
+    def _detect\(text\)\:
+    def _lang\(lang\)\:
+    if stopfunc\.LANG_BACKEND\:
+    def _validate\(stop_function, lang_backend\=\"none\"\)\:
+    except KeyboardInterrupt\:
diff --git a/codext/__common__.py b/codext/__common__.py
index 35d1fc5..e45fb1e 100644
--- a/codext/__common__.py
+++ b/codext/__common__.py
@@ -45,6 +45,7 @@
 CODECS_REGISTRY = None
 CODECS_OVERWRITTEN = []
 CODECS_CATEGORIES = ["native", "custom"]
+CODECS_CACHE = {}
 LANG = getlocale()
 if LANG:
     LANG = (LANG[0] or "")[:2].lower()
@@ -674,17 +675,16 @@ def list_categories():
     # particular category, hardcoded from base/_base.py
     c += ["base-generic"]
     return c
+list_categories()
 
 
 def list_encodings(*categories):
     """ Get a list of all codecs. """
-    # first, determine the list of valid categories
-    valid_categories = list_categories()
-    # then, if "non-native" is in the input list, extend the list with the whole categories but "native"
+    # if "non-native" is in the input list, extend the list with the whole categories but "native"
     categories, exclude = list(categories), []
     for c in categories[:]:
         if c == "non-native":
-            for c in valid_categories:
+            for c in CODECS_CATEGORIES:
                 if c == "native" or c in categories:
                     continue
                 categories.append(c)
@@ -714,7 +714,7 @@ def list_encodings(*categories):
         if (len(categories) == 0 or c in categories) and c not in exclude:
             enc.append(name)
     for category in categories:
-        if category not in valid_categories:
+        if category not in CODECS_CATEGORIES:
             raise ValueError("Category '%s' does not exist" % category)
     return sorted(list(set(enc)), key=_human_keys)
 
@@ -1226,23 +1226,22 @@ def _load_lang_backend(backend=None):
 stopfunc._reload_lang = _load_lang_backend
 
 
-def __develop(encodings):
-    """ Private method for developing the input list of encodings, trying to extend it with every encoding name. """
-    enc = []
-    for e in (encodings or []):
-        try:
-            ci = lookup(e, False)
-            g = ci.parameters['guess']
-        except:
-            g = [e]
-        if e in g:  # e.g. "rot-1" => ["rot-1", "rot-2", ...] ; only "rot-1" is to be selected
-            enc.append(e)
-        else:       # e.g. "rot"   => ["rot-1", "rot-2", ...] ; all the "rot-N" shall be selected
-            enc.extend(g)
-    return enc
+def _validate(stop_function, lang_backend="none"):
+    s, lb = stop_function, lang_backend
+    if isinstance(s, string_types):
+        if re.match(r"lang_[a-z]{2}$", s) and lb != "none" and \
+           all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)):
+            stopfunc._reload_lang(lb)
+        f = getattr(stopfunc, s, None)
+        if f:
+            return f
+    elif not isinstance(s, FunctionType):
+        raise ValueError("Bad stop function")
+    return s
+stopfunc._validate = _validate
 
 
-def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, codec_categories, exclude, result, found=(),
+def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, encodings, codecs, result, found=(),
             stop=True, show=False, scoring_heuristic=False, extended=False, debug=False):
     """ Perform a breadth-first tree search using a ranking logic to select and prune the list of codecs. """
     if depth > min_depth and stop_func(input):
@@ -1254,47 +1253,60 @@ def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, codec_cat
         result[found] = input
     if depth >= max_depth or len(result) > 0 and stop:
         return
-    # compute included and excluded codecs for this depth
-    def expand(items, descr=None, transform=None):
-        items = items or []
-        # format 1: when string, take it as the only items at any depth
-        if isinstance(items, string_types):
-            r = (items, )
-        # format 2: when tuple, consider it as a list of items at any depth
-        elif isinstance(items, tuple):
-            r = items
-        # format 3: when list, consider it as the list of tuples of items with the order number corresponding to the
-        #            applicable depth
-        elif isinstance(items, list):
-            try:
-                r = items[depth] or ()
-                if isinstance(r, string_types):
-                    r = (r, )
-            except IndexError:
-                r = ()
-        else:
-            raise ValueError("Bad %sformat %s" % (["%s " % descr, ""][descr is None], items))
-        return r if transform is None else transform(*r)
-    # parse valid encodings, expanding included/excluded codecs
-    c, e = expand(codec_categories, "codec_categories", list_encodings), __develop(expand(exclude, "exclude"))
     prev_enc = found[-1] if len(found) > 0 else ""
-    for new_input, encoding in __rank(prev_input, input, prev_enc, c, scoring_heuristic, extended):
+    e = encodings.get(depth, encodings.get(-1, []))
+    for new_input, encoding in __rank(prev_input, input, prev_enc, e, codecs, scoring_heuristic, extended):
         if len(result) > 0 and stop:
             return
-        if encoding in e:
-            continue
         if debug:
             print("[*] Depth %0{}d/%d: %s".format(len(str(max_depth))) % (depth+1, max_depth, encoding))
-        __guess(input, new_input, stop_func, depth+1, max_depth, min_depth, codec_categories, exclude, result,
+        __guess(input, new_input, stop_func, depth+1, max_depth, min_depth, encodings, codecs, result,
                 found + (encoding, ), stop, show, scoring_heuristic, extended, debug)
 
 
-def __rank(prev_input, input, prev_encoding, codecs, heuristic=False, extended=False, yield_score=False):
+def __make_encodings_dict(include, exclude):
+    """ Process encodings inclusion and exclusion lists, listing categories and developing codecs' lists of possible
+         encoding names. It also creates a cache with the CodecInfo objects for improving performance. """
+    codecs = {}
+    def _develop(d, keep=True):
+        d = d or {}
+        for k, v in d.items():
+            l, cc = [], [e for e in v if e in CODECS_CATEGORIES]
+            for enc in (list_encodings(*cc) if len(cc) > 0 or keep else [] + \
+                        [e for e in v if e not in CODECS_CATEGORIES]):
+                try:
+                    g = lookup(enc, False).parameters['guess']
+                except:
+                    g = [enc]
+                if enc in g and not keep:  # e.g. "rot-1" => ["rot-1", "rot-2", ...] ; only "rot-1" is to be selected
+                    l.append(enc)
+                else:                      # e.g. "rot"   => ["rot-1", "rot-2", ...] ; all the "rot-N" shall be selected
+                    l.extend(g)
+            d[k] = l
+            if keep:
+                for e in l:
+                    # cache newly loaded CodecInfo objects
+                    ci = lookup(e, False)
+                    n = ci.name
+                    if n in CODECS_CACHE:
+                        ci = CODECS_CACHE[n]  # keep the cached object
+                    else:
+                        CODECS_CACHE[n] = ci  # cache the new object
+                    codecs[e] = ci
+        return d
+    exclude = _develop(exclude, False)
+    return {k: [x for x in v if x not in exclude.get(k, [])] for k, v in _develop(include).items()}, codecs
+
+
+def __rank(prev_input, input, prev_encoding, encodings, codecs, heuristic=False, extended=False, yield_score=False):
     """ Filter valid encodings and rank them by relevance. """
     ranking = {}
-    for codec in codecs:
-        for score, new_input, encoding in __score(prev_input, input, prev_encoding, codec, heuristic, extended):
-            ranking[encoding] = (score, new_input)
+    for encoding in encodings:
+        try:
+            score, new = __score(prev_input, input, prev_encoding, encoding, codecs.get(encoding), heuristic, extended)
+        except TypeError:
+            continue
+        ranking[encoding] = (score, new)
     for encoding, result in sorted(ranking.items(), key=lambda x: -x[1][0]):
         yield result if yield_score else result[1], encoding
 
@@ -1304,7 +1316,7 @@ class _Text(object):
     
     def __init__(self, text, pad_char=None):
         c = text[-1]
-        last_char = c if isinstance(c, int) else ord(c)
+        pad_char, last_char = (b(pad_char), c) if isinstance(c, int) else (pad_char, ord(c))
         self.padding = pad_char is not None and last_char == ord(pad_char)
         if self.padding:
             text = text.rstrip(pad_char)
@@ -1314,136 +1326,182 @@ def __init__(self, text, pad_char=None):
         self.entropy = entropy(text)
 
 
-def __score(prev_input, input, prev_encoding, codec, heuristic=False, extended=False):
+def __score(prev_input, input, prev_encoding, encoding, codec, heuristic=False, extended=False):
     """ Score relevant encodings given an input. """
-    obj, ci = None, lookup(codec, False)  # NB: lookup(...) won't fail as the codec value comes from list_encodings(...)
-    sc = ci.parameters.get('scoring', {})
-    no_error, transitive = ci.parameters.get('no_error', False), sc.get('transitive', False)
-    for encoding in ci.parameters.get('guess', [codec]):
-        # ignore encodings that fail to decode with their default errors handling value
-        try:
-            new_input = decode(input, encoding)
-        except:
-            continue
-        # ignore encodings that give an output identical to the input (identity transformation) or to the previous input
-        if len(new_input) == 0 or prev_input is not None and b(input) == b(new_input) or b(prev_input) == b(new_input):
-            continue
-        # ignore encodings that transitively give the same output (identity transformation by chaining twice a same
-        #  codec (e.g. rot-15 is equivalent to rot-3 and rot-12 or rot-6 and rot-9)
-        if transitive and prev_encoding:
-            ci_prev = lookup(prev_encoding, False)
-            if ci_prev.parameters['name'] == ci.parameters['name']:
-                continue
-        # compute input's characteristics only once and only if the control flow reaches this point
-        pad = sc.get('padding_char')
-        if obj is None:
-            obj = _Text(input, pad)
-        if heuristic:
-            # from here, the goal (e.g. if the input is Base32) is to rank candidate encodings (e.g. multiple base
-            #  codecs) so that we can put the right one as early as possible and eventually exclude bad candidates
-            s = -sc.get('penalty', .0)
-            # first, apply a bonus if the length of input text's charset is exactly the same as encoding's charset ;
-            #  on the contrary, if the length of input text's charset is strictly greater, give a penalty
-            lcs = sc.get('len_charset', 256)
-            if isinstance(lcs, type(lambda: None)):
-                lcs = int(lcs(encoding))
-            if (pad and obj.padding and lcs + 1 >= obj.lcharset) or lcs >= obj.lcharset:
-                s += max(.0, round(.6 * (.99 ** (lcs - obj.lcharset)), 5) - .1)
-            elif (pad and obj.padding and lcs + 1 < obj.lcharset) or lcs < obj.lcharset:
-                s -= .2  # this can occur for encodings with no_error set to True
-            # then, take padding into account, giving a bonus if padding is to be encountered and effectively present,
-            #  or a penalty when it should not be encountered but it is present
-            if pad and obj.padding:
-                s += .2  # when padding is encountered while it is legitimate, it could be a good indication => bonus
-            elif not pad and obj.padding:
-                s -= .1  # it could arise a padding character is encountered while not being padding => small penalty
-            # give a bonus when the rate of printable characters is greater or equal than expected and a penalty when
-            #  lower only for codecs that DO NOT tolerate errors (otherwise, the printables rate can be biased)
-            if not no_error:
-                pr = sc.get('printables_rate', 0)
-                if isinstance(pr, type(lambda: None)):
-                    pr = float(pr(obj.printables))
-                if obj.printables - pr <= .05:
-                    s += .1
-            expf = sc.get('expansion_factor', 1.)
-            if expf:
-                f = obj.len / float(len(new_input))  # expansion while encoding => at decoding: 1/f
-                if isinstance(expf, type(lambda: None)):
-                    try:  # this case allows to consider the current encoding name from the current codec
-                        expf = expf(f, encoding)
-                    except TypeError:
-                        expf = expf(f)
-                if isinstance(expf, (int, float)):
-                    expf = (1/f - .1 <= 1/expf <= 1/f + .1)
-                elif isinstance(expf, (tuple, list)) and len(expf) == 2:
-                    expf = 1/f - expf[1] <= 1/expf[0] <= 1/f + expf[1]
-                s += [-1., .1][expf]
-            # afterwards, if the input text has an entropy close to the expected one, give a bonus weighted on the
-            #  number of input characters to take bad entropies of shorter strings into account
-            entr = sc.get('entropy', {})
-            entr = entr.get(encoding, entr.get('default')) if isinstance(entr, dict) else entr
-            if isinstance(entr, type(lambda: None)):
+    obj = None
+    sc = codec.parameters.get('scoring', {})
+    no_error, transitive = codec.parameters.get('no_error', False), sc.get('transitive', False)
+    # ignore encodings that fail to decode with their default errors handling value
+    try:
+        new_input = decode(input, encoding)
+    except:
+        return
+    # ignore encodings that give an output identical to the input (identity transformation) or to the previous input
+    if len(new_input) == 0 or prev_input is not None and b(input) == b(new_input) or b(prev_input) == b(new_input):
+        return
+    # ignore encodings that transitively give the same output (identity transformation by chaining twice a same
+    #  codec (e.g. rot-15 is equivalent to rot-3 and rot-12 or rot-6 and rot-9)
+    if transitive and prev_encoding:
+        ci_prev = lookup(prev_encoding, False)
+        if ci_prev.parameters['name'] == codec.parameters['name']:
+            return
+    # compute input's characteristics only once and only if the control flow reaches this point
+    pad = sc.get('padding_char')
+    if obj is None:
+        obj = _Text(input, pad)
+    if heuristic:
+        # from here, the goal (e.g. if the input is Base32) is to rank candidate encodings (e.g. multiple base
+        #  codecs) so that we can put the right one as early as possible and eventually exclude bad candidates
+        s = -sc.get('penalty', .0)
+        # first, apply a bonus if the length of input text's charset is exactly the same as encoding's charset ;
+        #  on the contrary, if the length of input text's charset is strictly greater, give a penalty
+        lcs = sc.get('len_charset', 256)
+        if isinstance(lcs, type(lambda: None)):
+            lcs = int(lcs(encoding))
+        if (pad and obj.padding and lcs + 1 >= obj.lcharset) or lcs >= obj.lcharset:
+            s += max(.0, round(.6 * (.99 ** (lcs - obj.lcharset)), 5) - .1)
+        elif (pad and obj.padding and lcs + 1 < obj.lcharset) or lcs < obj.lcharset:
+            s -= .2  # this can occur for encodings with no_error set to True
+        # then, take padding into account, giving a bonus if padding is to be encountered and effectively present,
+        #  or a penalty when it should not be encountered but it is present
+        if pad and obj.padding:
+            s += .2  # when padding is encountered while it is legitimate, it could be a good indication => bonus
+        elif not pad and obj.padding:
+            s -= .1  # it could arise a padding character is encountered while not being padding => small penalty
+        # give a bonus when the rate of printable characters is greater or equal than expected and a penalty when
+        #  lower only for codecs that DO NOT tolerate errors (otherwise, the printables rate can be biased)
+        if not no_error:
+            pr = sc.get('printables_rate', 0)
+            if isinstance(pr, type(lambda: None)):
+                pr = float(pr(obj.printables))
+            if obj.printables - pr <= .05:
+                s += .1
+        expf = sc.get('expansion_factor', 1.)
+        if expf:
+            f = obj.len / float(len(new_input))  # expansion while encoding => at decoding: 1/f
+            if isinstance(expf, type(lambda: None)):
                 try:  # this case allows to consider the current encoding name from the current codec
-                    entr = entr(obj.entropy, encoding)
+                    expf = expf(f, encoding)
                 except TypeError:
-                    entr = entr(obj.entropy)
-            if entr is not None:
-                # use a quadratic heuristic to compute a weight for the entropy delta, aligned on (100w,.1) and (200w,1)
-                d_entr = min(4e-05 * obj.len**2 - .003 * obj.len, 1) * abs(entr - entropy(new_input))
-                if d_entr <= .5:
-                    s += .5 - d_entr
-            # finally, if relevant, apply a custom bonus (e.g. when a regex pattern is matched)
-            bonus = sc.get('bonus_func')
-            if bonus is not None:
-                if isinstance(bonus, type(lambda: None)):
-                    bonus = bonus(obj, ci, encoding)
-                if bonus:
-                    s += .2
-        else:
-            s = 1.
-        # exclude negative (and eventually null) scores as they are (hopefully) not relevant
-        if extended and s >= .0 or not extended and s > .0:
-            yield s, new_input, encoding
+                    expf = expf(f)
+            if isinstance(expf, (int, float)):
+                expf = (1/f - .1 <= 1/expf <= 1/f + .1)
+            elif isinstance(expf, (tuple, list)) and len(expf) == 2:
+                expf = 1/f - expf[1] <= 1/expf[0] <= 1/f + expf[1]
+            s += [-1., .1][expf]
+        # afterwards, if the input text has an entropy close to the expected one, give a bonus weighted on the
+        #  number of input characters to take bad entropies of shorter strings into account
+        entr = sc.get('entropy', {})
+        entr = entr.get(encoding, entr.get('default')) if isinstance(entr, dict) else entr
+        if isinstance(entr, type(lambda: None)):
+            try:  # this case allows to consider the current encoding name from the current codec
+                entr = entr(obj.entropy, encoding)
+            except TypeError:
+                entr = entr(obj.entropy)
+        if entr is not None:
+            # use a quadratic heuristic to compute a weight for the entropy delta, aligned on (100w,.1) and (200w,1)
+            d_entr = min(4e-05 * obj.len**2 - .003 * obj.len, 1) * abs(entr - entropy(new_input))
+            if d_entr <= .5:
+                s += .5 - d_entr
+        # finally, if relevant, apply a custom bonus (e.g. when a regex pattern is matched)
+        bonus = sc.get('bonus_func')
+        if bonus is not None:
+            if isinstance(bonus, type(lambda: None)):
+                bonus = bonus(obj, codec, encoding)
+            if bonus:
+                s += .2
+    else:
+        s = 1.
+    # exclude negative (and eventually null) scores as they are (hopefully) not relevant
+    if extended and s >= .0 or not extended and s > .0:
+        return s, new_input
 
 
-def guess(input, stop_func=stopfunc.default, min_depth=0, max_depth=5, codec_categories=None, exclude=None, found=(),
+def guess(input, stop_func=stopfunc.default, min_depth=0, max_depth=5, include=None, exclude=None, found=(),
           stop=True, show=False, scoring_heuristic=True, extended=False, debug=False):
-    """ Try decoding without the knowledge of the encoding(s). """
+    """ Try decoding without the knowledge of the encoding(s).
+    
+    :param input:             input text to be guessed
+    :param stop_func:         function defining the stop condition
+    :param min_depth:         minimum search depth
+    :param max_depth:         maximum search depth
+    :param include:           inclusion item OR list with category, codec or encoding names OR dictionary with lists per
+                               depth (nothing means include every encoding)
+    :param exclude:           exclusion item OR list with category, codec or encoding names OR dictionary with lists per
+                               depth (nothing means exclude no encoding)
+    :param found:             tuple of already found encodings
+    :param stop:              whether to stop or not when a valid solution is found
+    :param show:              whether to immediately show once a solution is found
+    :param scoring_heuristic: whether to apply the scoring heuristic during the search (if disabled, all scores are 1.,
+                               meaning that every non-failing encoding will be considered with no order of precedence)
+    :param extended:          whether to also consider null scores with the heuristic
+    :param debug:             whether to show each attempt at each depth during computation
+    """
+    if len(input) == 0:
+        return ""
+    # check for min and max depths
     if max_depth <= 0:
         raise ValueError("Depth must be a non-null positive integer")
     if min_depth > max_depth:
         raise ValueError("Min depth shall be less than or equal to the max depth")
+    # take the tuple of found encodings into account
     if len(found) > 0:
         for encoding in found:
             input = decode(input, encoding)
+    # handle the stop function as a regex if a string was given
     if isinstance(stop_func, string_types):
         stop_func = stopfunc.regex(stop_func)
+    # reformat include and exclude arguments ; supported formats:
+    for n, l in zip(["inc", "exc"], [include, exclude]):
+        if l is None:
+            if n == "inc":
+                include = l = {-1: CODECS_CATEGORIES}
+            else:
+                exclude = l = {}
+        #  "category" OR "enc_name" OR whatever => means a single item for all depths
+        if isinstance(l, string_types):
+            if n == "inc":
+                include = l = {-1: [l]}
+            else:
+                exclude = l = {-1: [l]}
+        #  ["enc_name1", "enc_name2", ...] => means for all depths
+        if isinstance(l, (list, tuple)):
+            if n == "inc":
+                include = l = {-1: l}
+            else:
+                exclude = l = {-1: l}
+        #  {-1: [...], 2: [...], ...}      => means predefined depths with their lists of in-/excluded encodings
+        if not isinstance(l, dict) or not all(isinstance(k, int) for k in l.keys()):
+            raise ValueError("Include argument shall be a list or a dictionary with integer keys")
+    # precompute encodings lists per depth and cache the related CodecInfo objects
+    encodings, codecs = __make_encodings_dict(include, exclude)
     result = {}
-    if len(input) > 0:
-        try:
-            # breadth-first search
-            for d in range(max_depth):
-                __guess("", input, stop_func, 0, d+1, min_depth, codec_categories, exclude, result, tuple(found), stop,
-                        show, scoring_heuristic, extended, debug)
-                if stop and len(result) > 0:
-                    return result
-        except KeyboardInterrupt:
-            pass
+    try:
+        # breadth-first search
+        for d in range(max_depth):
+            __guess("", input, stop_func, 0, d+1, min_depth, encodings, codecs, result, tuple(found), stop, show,
+                    scoring_heuristic, extended, debug)
+            if stop and len(result) > 0:
+                break
+    except KeyboardInterrupt:
+        pass
+    CODECS_CACHE = {}
     return result
 codecs.guess = guess
 
 
-def rank(input, extended=False, limit=-1, codec_categories=None, exclude=None):
-    """ Rank the most probable encodings based on the given input. """
-    if isinstance(codec_categories, string_types):
-        codec_categories = (codec_categories, )
-    codecs = list_encodings(*(codec_categories or ()))
-    for e in __develop(exclude):
-        try:
-            codecs.remove(e)
-        except ValueError:
-            pass
-    r = list(__rank(None, input, "", codecs, True, extended, True))
+def rank(input, extended=False, limit=-1, include=None, exclude=None):
+    """ Rank the most probable encodings based on the given input.
+    
+    :param input:    input text to be evaluated
+    :param extended: whether to consider null scores too (NB: negative scores are not output !)
+    :param limit:    number of encodings to be returned (-1 means all of them)
+    :param include:  inclusion list with category, codec or encoding names (nothing means include every encoding)
+    :param exclude:  exclusion list with category, codec or encoding names (nothing means exclude no encoding)
+    """
+    encodings, codecs = __make_encodings_dict({0: include or CODECS_CATEGORIES}, {0: exclude or []})
+    r = list(__rank(None, input, "", encodings[0], codecs, True, extended, True))
+    CODECS_CACHE = {}
     return r[:limit] if len(r) > 1 else r
 codecs.rank = rank
 
diff --git a/tests/test_common.py b/tests/test_common.py
index a35abfd..6eddd7e 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -148,14 +148,15 @@ def test_encode_multiple_rounds(self):
     
     def test_guess_decode(self):
         self.assertIsNone(codext.stopfunc._reload_lang())
+        self.assertIsNotNone(codext.stopfunc._validate("flag"))
         _l = lambda d: list(d.items())[0][1] if len(d) > 0 else None
         codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1),
                    "test", padding_char="=", no_error=True, bonus_func=lambda *a: True, penalty=-.5)
         self.assertIn("test-codec", codext.list_encodings("test"))
         self.assertEqual(codext.decode("TEST=", "test"), "TEST")
-        self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, codec_categories="test", max_depth=2,
+        self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, include="test", max_depth=2,
                                            scoring_heuristic=False).items())[0][1], "TEST")
-        self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, codec_categories=["test", "base"],
+        self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, include=["test", "base"],
                                            max_depth=2).items())[0][1], "TEST")
         STR = "This is a test"
         self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", max_depth=1)))
@@ -163,12 +164,12 @@ def test_guess_decode(self):
         self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", 0, 1, "base", scoring_heuristic=True,
                                               exclude=["base100"])))
         self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", 0, 1, ["base", "crypto"])))
-        self.assertEqual(len(codext.guess("NOT THE ENCODED TEST STRING", "a test", max_depth=1, exclude=[None])), 0)
+        self.assertEqual(len(codext.guess("NOT THE ENCODED TEST STRING", "a test", max_depth=1, exclude=None)), 0)
         self.assertIn("F1@9", _l(codext.guess("VGVzdCBGMUA5ICE=", codext.stopfunc.flag, max_depth=1, stop=False,
                                               show=True)))
-        self.assertEqual(len(codext.guess("VGhpcyBpcyBhIHRlc3Q=", " a test", max_depth=1, codec_categories="base",
+        self.assertEqual(len(codext.guess("VGhpcyBpcyBhIHRlc3Q=", " a test", max_depth=1, include="base",
                                           exclude=("base64", "base64-url"))), 0)
-        self.assertEqual(len(codext.guess("VGhpcyBpcyBhIHRlc3Q=", " a test", max_depth=1, codec_categories="base",
+        self.assertEqual(len(codext.guess("VGhpcyBpcyBhIHRlc3Q=", " a test", max_depth=1, include="base",
                                           scoring_heuristic=True, exclude=("base64", "base64-url", "atbash"))), 0)
         self.assertRaises(ValueError, codext.guess, STR, max_depth=0)
         self.assertRaises(ValueError, codext.guess, STR, exclude=42)
@@ -198,8 +199,7 @@ def test_guess_decode(self):
                                 self.assertEqual(encoding, found_encodings[0])
         txt = "".join(chr(i) for i in range(256))
         b64 = codext.encode(txt, "base64")
-        self.assertEqual(txt, _l(codext.guess(b64, "0123456789", max_depth=1, scoring_heuristic=True,
-                                              codec_categories="base")))
+        self.assertEqual(txt, _l(codext.guess(b64, "0123456789", max_depth=1, scoring_heuristic=True, include="base")))
         self.assertRaises(ValueError, codext.stopfunc._reload_lang, "DOES_NOT_EXIST")
     
     def test_rank_input(self):
@@ -210,10 +210,10 @@ def test_rank_input(self):
         self.assertTrue(len(codext.rank(ENC)) > 20)
         self.assertEqual(len(codext.rank(ENC, limit=20)), 20)
         self.assertEqual(codext.rank(ENC, exclude=["rot"])[0][1], "base64")
-        self.assertEqual(codext.rank(ENC, codec_categories="base")[0][0][1], STR)
-        self.assertEqual(codext.rank(ENC, codec_categories=["base"])[0][0][1], STR)
-        self.assertIsNotNone(codext.rank(ENC, codec_categories=["base"], exclude=["does_not_exist"])[0][0][1], STR)
-        self.assertIsNotNone(codext.rank("TEST=", codec_categories=["test", "base"])[0][0][1], "TEST")
+        self.assertEqual(codext.rank(ENC, include="base")[0][0][1], STR)
+        self.assertEqual(codext.rank(ENC, include=["base"])[0][0][1], STR)
+        self.assertIsNotNone(codext.rank(ENC, include=["base"], exclude=["does_not_exist"])[0][0][1], STR)
+        self.assertIsNotNone(codext.rank("TEST=", include=["test", "base"])[0][0][1], "TEST")
     
     def test_handle_macros(self):
         MACRO = "test-macro-f2ca1bb6c7e907d06dafe4687e579fce76b37e4e93b7605022da52e6ccc26fd2"

From c79e2bdf6633df8e1c643afeea728fc6033315b9 Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Sun, 27 Feb 2022 20:32:16 +0100
Subject: [PATCH 13/32] Fixed codec: baudot

---
 codext/binary/baudot.py | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/codext/binary/baudot.py b/codext/binary/baudot.py
index ae5fc32..a57e1ea 100755
--- a/codext/binary/baudot.py
+++ b/codext/binary/baudot.py
@@ -10,9 +10,9 @@
 from ..__common__ import *
 
 
-__CODES = ["ccitt1", "ccitt2", "eu", "ita1", "ita2", "ita2_us", "murray", "uk"]
+__CODES = ["ccitt1", "ccitt2", "eu", "ita1", "ita2", "ita2_us"]
 if PY3:
-    __CODES.extend(["ita2_meteo", "mtk2"])
+    __CODES.extend(["ita2_meteo", "mtk2", "murray", "uk"])
 __guess__     = ["baudot%s-{}-{}".format(x, y) for x in __CODES for y in ["lsb", "msb"]]
 __examples1__ = {
     'enc(baudot-BAD_ALPHABET)': None,
@@ -51,7 +51,7 @@
 
 
 PATTERN = r"^baudot%s([-_](?:ccitt1|ccitt2|eu|fr|ita1|ita2|ita2[-_](?:us" + (r"|meteo" if PY3 else r"") + r")" + \
-          (r"|mtk2" if PY3 else r"") + r"|murray|uk|us_tty)(?:[-_](?:lsb|msb))?)?$"
+          (r"|mtk2|murray|uk" if PY3 else r"") + r"|us_tty)(?:[-_](?:lsb|msb))?)?$"
 # reserved character
 RES_CHR = "\xff"
 
@@ -116,20 +116,22 @@
         "\x003\n- '87\r\xff4Ю,Э:(5+)2Щ6019?Ш\xff./=\xff",
     ]
 # Murray code ; NB: not all fractions are supported (source: https://en.wikipedia.org/wiki/Baudot_code)
-MURRAY = [
-    "00100", "11011",
-    " E\xffA\xffSIU\nDRJNFCKTZLWHYPQOBF\xffMXV\x7f", 
-    "\x003\xff\xff\xff'87\n²4\xff-\u215f(\xff5./2\xff6019?\xff\xff,£)*" if PY3 else \
-        "\x003\xff\xff\xff'87\n²4\xff-\u215f(\xff5./2\xff6019?\xff\xff,$)*",
-]
+if PY3:
+    MURRAY = [
+        "00100", "11011",
+        " E\xffA\xffSIU\nDRJNFCKTZLWHYPQOBF\xffMXV\x7f", 
+        "\x003\xff\xff\xff'87\n²4\xff-\u215f(\xff5./2\xff6019?\xff\xff,£)*" if PY3 else \
+            "\x003\xff\xff\xff'87\n²4\xff-\u215f(\xff5./2\xff6019?\xff\xff,$)*",
+    ]
 # English Baudot ; NB: not all fractions are supported (sources: https://fr.qwe.wiki/wiki/Baudot_code
 #                                                                https://en.wikipedia.org/wiki/Baudot_code)
-UK = [
-    "10000", "01000",
-    "\x00AE/YUIO\xffJGHBCFD -XZSTWV\x7fKMLRQNP", 
-    "\x0012\u215f34\xff5 67\xb989\xff0\xff.\xff:\xff²?'\x7f()=-/£+" if PY3 else \
-        "\x0012\xff34\xff5 67\xb989\xff0\xff.\xff:\xff²?'\x7f()=-/$+",
-]
+if PY3:
+    UK = [
+        "10000", "01000",
+        "\x00AE/YUIO\xffJGHBCFD -XZSTWV\x7fKMLRQNP", 
+        "\x0012\u215f34\xff5 67\xb989\xff0\xff.\xff:\xff²?'\x7f()=-/£+" if PY3 else \
+            "\x0012\xff34\xff5 67\xb989\xff0\xff.\xff:\xff²?'\x7f()=-/$+",
+    ]
 
 
 def _bits_from_tape(tape, trans={'*': "1", ' ': "0"}):

From 3076c9ff0a1182caf5a361e54e9d99887b7f5348 Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Sun, 27 Feb 2022 21:06:40 +0100
Subject: [PATCH 14/32] New release

---
 codext/VERSION.txt | 2 +-
 codext/__init__.py | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/codext/VERSION.txt b/codext/VERSION.txt
index 89c881b..feaae22 100644
--- a/codext/VERSION.txt
+++ b/codext/VERSION.txt
@@ -1 +1 @@
-1.12.4
+1.13.0
diff --git a/codext/__init__.py b/codext/__init__.py
index 692ab48..486dd2f 100644
--- a/codext/__init__.py
+++ b/codext/__init__.py
@@ -180,7 +180,8 @@ def _format_action_invocation(self, action):
     remm = sparsers.add_parser("remove-macro", help="remove a macro from the registry")
     remm.add_argument("name", help="macro's name")
     args = parser.parse_args()
-    args.include, args.exclude = __format_list(args.include), __format_list(args.exclude, False)
+    if args.command in ["guess", "rank"]:
+        args.include, args.exclude = __format_list(args.include), __format_list(args.exclude, False)
     try:
         # if a search pattern is given, only handle it
         if args.command == "search":

From 803e211c1922dd8e98a0d77eb466ec4d8eeb7e75 Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Mon, 28 Feb 2022 09:22:40 +0100
Subject: [PATCH 15/32] Added codec: base11

---
 codext/base/baseN.py |  7 ++++++-
 docs/enc/base.md     | 31 +++++++++++--------------------
 2 files changed, 17 insertions(+), 21 deletions(-)

diff --git a/codext/base/baseN.py b/codext/base/baseN.py
index 3c63453..cf4abe4 100755
--- a/codext/base/baseN.py
+++ b/codext/base/baseN.py
@@ -39,10 +39,15 @@
 
 
 B10 = {r'': "0123456789"}
-base(B10, r"^(?:base[-_]?10|int(?:eger)?)$")
+base(B10, r"^(?:base[-_]?10|int(?:eger)?|dec(?:imal)?)$")
 main10 = main(10)
 
 
+B11 = {r'': "0123456789a", r'[-_]inv(erted)?$': "a0123456789"}
+base(B11, r"^base[-_]?11(|[-_]inv(?:erted)?)$")
+main11 = main(11)
+
+
 B16 = {'': digits + "ABCDEF", '[-_]inv(erted)?$': "ABCDEF" + digits}
 base2n(B16, r"^(?:base[-_]?16|hex)(|[-_]inv(?:erted)?)$", expansion_factor=2.)
 main16 = main(16, "RFC 4648")
diff --git a/docs/enc/base.md b/docs/enc/base.md
index 73b78ff..757965e 100644
--- a/docs/enc/base.md
+++ b/docs/enc/base.md
@@ -12,12 +12,12 @@ Common base encodings with N a power of 2:
 
 **Codec** | **Conversions** | **Aliases** | **Comment**
 :---: | :---: | --- | ---
-`base2` | text <-> Base2 encoded text | `(base[-_]?2|bin)-inv(erted)?` | 
-`base4` | text <-> Base4 encoded text | `base[-_]?4-inv(erted)` | charset: `1234`
-`base8` | text <-> Base8 encoded text | `base[-_]?8-inv(erted)` | charset: `abcdefgh`
+`base2` | text <-> Base2 encoded text | `(base[-_]?2|bin)-inv(erted)?` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_AB`)
+`base4` | text <-> Base4 encoded text | `base[-_]?4-inv(erted)` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_6VC9`)
+`base8` | text <-> Base8 encoded text | `base[-_]?8-inv(erted)` | Charset: `abcdefgh` ; Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_A5c96T7x`)
 `base16` | text <-> Base16 encoded text | `base[-_]?16-inv(erted)` | 
-`base32` | text <-> Base32 encoded text | `base[-_]?32-inv(erted)` | 
-`zbase32` | text <-> ZBase32 encoded text | `z[-_]?base[-_]?32` | human-oriented Base32
+`base32` | text <-> Base32 encoded text | `base[-_]?32-inv(erted)`, `base32-crockford`, `base32_geohash`, ... | Also supports Base32 Crockford, Geohash and Hex
+`zbase32` | text <-> ZBase32 encoded text | `z[-_]?base[-_]?32` | Human-oriented Base32
 `base64` | text <-> Base64 encoded text | `base[-_]?64-inv(erted)` | 
 
 !!! note "Aliases"
@@ -62,10 +62,12 @@ Note that for `base64`, it overwrites the native `base64_codec` to also support
 
 **Codec** | **Conversions** | **Aliases** | **Comment**
 :---: | :---: | --- | ---
-`base3` | text <-> Base3 encoded text | `base[-_]?36(|[-_]inv(erted)?)` | 
+`base3` | text <-> Base3 encoded text | `base[-_]?3(|[-_]inv(erted)?)` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_C2Z`)
+`base10` | text <-> Base10 encoded text | `base[-_]?10|int(?:eger)?|dec(?:imal)?` | 
+`base11` | text <-> Base11 encoded text | `base[-_]?11(|[-_]inv(erted)?)` | 
 `base36` | text <-> Base36 encoded text | `base[-_]?36(|[-_]inv(erted)?)` | 
 `base45` | text <-> Base45 encoded text | `base[-_]?45(|[-_]inv(erted)?)` | 
-`base58` | text <-> Base58 encoded text | `base[-_]?58(|[-_](bc|bitcoin|rp|ripple|fl|flickr|short[-]?url|url))` | supports Bitcoin, Ripple and short URL
+`base58` | text <-> Base58 encoded text | `base[-_]?58(|[-_](bc|bitcoin|rp|ripple|fl|flickr|short[-]?url|url))` | Supports Bitcoin, Ripple and short URL
 `base62` | text <-> Base62 encoded text | `base[-_]?62(|[-_]inv(erted)?)` | 
 `base63` | text <-> Base63 encoded text | `base[-_]?63(|[-_]inv(erted)?)` | 
 `base91` | text <-> Base91 encoded text | `base[-_]?91(|[-_]inv(erted)?)` | 
@@ -131,11 +133,7 @@ This encoding implements various different versions of Base85.
 
 **Codec** | **Conversions** | **Aliases** | **Comment**
 :---: | :---: | --- | ---
-`base85` | text <-> ascii85 | `ascii85` | 
-`base85` | text <-> z85 | `z85`, `base85-zeromq` | 
-`base85` | text <-> base85-ipv6 | `base85-ipv6`, `base85-rfc1924` | 
-`base85` | text <-> base85-adobe | `base85-adobe` | 
-`base85` | text <-> base85-btoa | `base85-btoa`, `base85-xbtoa` | 
+`base85` | text <-> ascii85 | `(base[-_]?85(?:|[-_](?:adobe|x?btoa|ipv6|rfc1924|xml|z(?:eromq)?))|z85|ascii85)` | 
 
 ```python
 >>> codext.encode("this is a test", "ascii85")
@@ -156,16 +154,9 @@ This encoding implements various different versions of Base85.
 
 **Codec** | **Conversions** | **Aliases** | **Comment**
 :---: | :---: | --- | ---
-`base85` | text <-> Base85 encoded text | `base[-_]?85` | Python 3 only (relies on `base64` module)
 `base100` | text <-> Base100 encoded text | `base[-_]?100|emoji` | Python 3 only
 `base122` | text <-> Base122 encoded text | `base[-_]?122` | Python 3 only
-
-```python
->>> codecs.encode("this is a test", "base85")
-'bZBXFAZc?TVIXv6b94'
->>> codecs.decode("bZBXFAZc?TVIXv6b94", "base85")
-'this is a test'
-```
+`base128` | text <-> Base128 encoded text | `base[-_]?128` | Relies on the ASCII charset
 
 ```python
 >>> codecs.encode("this is a test", "base100")

From b37e8a15b9b1f309a5229433e65318f16bd47b3c Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Mon, 28 Feb 2022 09:22:54 +0100
Subject: [PATCH 16/32] Improved docs about crypto

---
 README.md          |  1 +
 docs/enc/crypto.md | 13 ++++---------
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 0b69bfb..2ce70be 100644
--- a/README.md
+++ b/README.md
@@ -219,6 +219,7 @@ o
 - [X] `base4`: conversion to quarternary (with a variant with a reversed alphabet)
 - [X] `base8`: simple conversion to octal (with a variant with a reversed alphabet)
 - [X] `base10`: simple conversion to decimal
+- [X] `base11`: conversion to digits with a "*a*"
 - [X] `base16`: simple conversion to hexadecimal (with a variant holding an alphabet with digits and letters inverted)
 - [X] `base26`: conversion to alphabet letters
 - [X] `base32`: classical conversion according to the RFC4648 with all its variants ([zbase32](https://philzimmermann.com/docs/human-oriented-base-32-encoding.txt), extended hexadecimal, [geohash](https://en.wikipedia.org/wiki/Geohash), [Crockford](https://www.crockford.com/base32.html))
diff --git a/docs/enc/crypto.md b/docs/enc/crypto.md
index 974f49d..e59ab0f 100644
--- a/docs/enc/crypto.md
+++ b/docs/enc/crypto.md
@@ -152,9 +152,8 @@ This is a dynamic encoding, that is, it can be called with an integer to define
 
 **Codec** | **Conversions** | **Aliases** | **Comment**
 :---: | :---: | --- | ---
-`rot` | text <-> rot(1) ciphertext | `rot1`, `rot-1`, `rot_1`, `caesar1` | 
-`rot` | text <-> rot(X) ciphertext | ... | 
-`rot` | text <-> rot(25) ciphertext | `rot25`, `rot-25`, `rot_25`, `caesar25` | 
+`rot` | text <-> rot(N) ciphertext | `rot1`, `rot-1`, `rot_25`, `caesar13` | Dynamic ROT parameter N ; integer that belongs to [1, 26[
+`rot47` | text <-> rot47 ciphertext |  | 
 
 ```python
 >>> codext.encode("this is a test", "rot-15")
@@ -173,9 +172,7 @@ This is a dynamic encoding, that is, it can be called with an integer to define
 
 **Codec** | **Conversions** | **Aliases** | **Comment**
 :---: | :---: | --- | ---
-`shift` | text <-> shift(1) ciphertext | `shift1`, `shift-1`, `shift_1` | 
-`shift` | text <-> shift(X) ciphertext | ... | 
-`shift` | text <-> shift(255) ciphertext | `shift255`, `shift-255`, `shift_255` | 
+`shift` | text <-> shift(N) ciphertext | `shift1`, `shift-158`, `shift_255` | Dynamic shift parameter N ; integer that belongs to [1, 256[
 
 ```python
 >>> codext.encode("this is a test", "shift-3")
@@ -194,9 +191,7 @@ This is a dynamic encoding, that is, it can be called with an integer to define
 
 **Codec** | **Conversions** | **Aliases** | **Comment**
 :---: | :---: | --- | ---
-`xor` | text <-> XOR(1) ciphertext | `XOR1`, `xor1`, `xor-1`, `xor_1` | 
-`xor` | text <-> XOR(X) ciphertext | ... | 
-`xor` | text <-> XOR(255) ciphertext | `XOR255`, `xor255`, `xor-255`, `xor_255` | 
+`xor` | text <-> XOR(N) ciphertext | `XOR1`, `xor22`, `xor-158`, `xor_255` | Dynamic XOR parameter N ; integer that belongs to [1, 256[
 
 ```python
 >>> codext.encode("this is a test", "xor-10")

From d64c4f34b4c96b59976105680cbd35232be33e17 Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Mon, 28 Feb 2022 09:23:29 +0100
Subject: [PATCH 17/32] Applied minor improvement

---
 codext/__common__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/codext/__common__.py b/codext/__common__.py
index e45fb1e..94ad19f 100644
--- a/codext/__common__.py
+++ b/codext/__common__.py
@@ -1333,7 +1333,7 @@ def __score(prev_input, input, prev_encoding, encoding, codec, heuristic=False,
     no_error, transitive = codec.parameters.get('no_error', False), sc.get('transitive', False)
     # ignore encodings that fail to decode with their default errors handling value
     try:
-        new_input = decode(input, encoding)
+        new_input = codec.decode(input)[0]
     except:
         return
     # ignore encodings that give an output identical to the input (identity transformation) or to the previous input

From e3092151c35da08d048a92c8dc03269ee95aed5c Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Mon, 28 Feb 2022 18:40:33 +0100
Subject: [PATCH 18/32] New release

---
 codext/VERSION.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/codext/VERSION.txt b/codext/VERSION.txt
index feaae22..b50dd27 100644
--- a/codext/VERSION.txt
+++ b/codext/VERSION.txt
@@ -1 +1 @@
-1.13.0
+1.13.1

From df8ff0fe870315c15eee63428516b47a1a74628b Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Sat, 12 Mar 2022 23:20:40 +0100
Subject: [PATCH 19/32] Fixed codec: uu

---
 codext/__common__.py      |  3 ++-
 codext/__init__.py        |  6 -----
 codext/others/__init__.py |  1 +
 codext/others/uuencode.py | 47 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 50 insertions(+), 7 deletions(-)
 create mode 100644 codext/others/uuencode.py

diff --git a/codext/__common__.py b/codext/__common__.py
index 94ad19f..0c7ec17 100644
--- a/codext/__common__.py
+++ b/codext/__common__.py
@@ -1312,9 +1312,10 @@ def __rank(prev_input, input, prev_encoding, encodings, codecs, heuristic=False,
 
 
 class _Text(object):
-    __slots__ = ["entropy", "lcharset", "len", "padding", "printables"]
+    __slots__ = ["entropy", "lcharset", "len", "padding", "printables", "text"]
     
     def __init__(self, text, pad_char=None):
+        self.text = text
         c = text[-1]
         pad_char, last_char = (b(pad_char), c) if isinstance(c, int) else (pad_char, ord(c))
         self.padding = pad_char is not None and last_char == ord(pad_char)
diff --git a/codext/__init__.py b/codext/__init__.py
index 486dd2f..661357a 100644
--- a/codext/__init__.py
+++ b/codext/__init__.py
@@ -27,12 +27,6 @@
 reset()
 
 
-# overwritten native codec
-add("uu", lambda i, e="strict": orig_lookup("uu").encode(b(i), e),
-          lambda i, e="strict": orig_lookup("uu").decode(b(i), e),
-          pattern=r"^uu(?:[-_]encode|codec)?$", add_to_codecs=True, category="native")
-
-
 def __format_list(items, include=True):
     if items is None:
         return
diff --git a/codext/others/__init__.py b/codext/others/__init__.py
index 22d6830..aa7ffa2 100755
--- a/codext/others/__init__.py
+++ b/codext/others/__init__.py
@@ -2,4 +2,5 @@
 from .dna import *
 from .letters import *
 from .markdown import *
+from .uuencode import *
 
diff --git a/codext/others/uuencode.py b/codext/others/uuencode.py
new file mode 100644
index 0000000..5377493
--- /dev/null
+++ b/codext/others/uuencode.py
@@ -0,0 +1,47 @@
+# -*- coding: UTF-8 -*-
+"""UU Codec - UU content encoding, relying on the native uu package.
+
+This codec:
+- en/decodes strings from str to str
+- en/decodes strings from bytes to bytes
+- decodes file content to str (read)
+- encodes file content from str to bytes (write)
+"""
+from io import BytesIO
+from uu import decode as _dec, encode as _enc
+
+from ..__common__ import *
+
+
+__examples__ = {
+    'enc(uu|uu_codec)': {'this is a test': "begin 666 -\n.=&AI<R!I<R!A('1E<W0 \n \nend\n"},
+    'dec(uu-encode)':   {'.=&AI<R!I<R!A(\'1E<W0 ': "this is a test"},
+    'dec(uuencode)':    {'.=&AI<R!I<R!A(\'1E<W0`': "this is a test"},
+    'dec(uu-codec)':    {'begin 666 -\n.=&AI<R!I<R!A(\'1E<W0`': "this is a test"},
+    'dec(uu_codec)':    {'\n.=&AI<R!I<R!A(\'1E<W0`\n\n\n`\nend': "this is a test"},
+}
+
+
+def uu_encode(text, errors="strict"):
+    out = BytesIO()
+    _enc(BytesIO(b(text)), out)
+    return out.getvalue(), len(text)
+
+
+def uu_decode(text, errors="strict"):
+    t = b(text).strip(b"\n")
+    if not re.match(b"^begin [1-7]{3} .*$", t.split(b"\n")[0]):
+        t = b"begin 666 -\n" + t
+    if not re.match(b"^end$", t.split(b"\n")[-1]):
+        t += [b"", b"`"][t[-1] == b"`"] + b"\nend"
+    out = BytesIO()
+    _dec(BytesIO(t), out, quiet=True)
+    out = out.getvalue()
+    while out.endswith(b"\x00" * 42):
+        out = out[:-42]
+    return out, len(text)
+
+
+add("uu", uu_encode, uu_decode, pattern=r"^uu(?:[-_]?encode|[-_]codec)?$",
+    bonus_func=lambda o, *a: re.match(b"^begin [1-7]{3} .*\n.*\nend$", b(o.text).strip(b"\n"), re.M))
+

From 96239cd2bbd5d2d7afcfd323726bea43b9c8233d Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Sat, 12 Mar 2022 23:20:52 +0100
Subject: [PATCH 20/32] New release

---
 codext/VERSION.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/codext/VERSION.txt b/codext/VERSION.txt
index b50dd27..61ce01b 100644
--- a/codext/VERSION.txt
+++ b/codext/VERSION.txt
@@ -1 +1 @@
-1.13.1
+1.13.2

From 837e91a4eb427b926089550509c7cea91f6accab Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Mon, 28 Mar 2022 08:03:38 +0200
Subject: [PATCH 21/32] New release

---
 codext/VERSION.txt     | 2 +-
 codext/common/dummy.py | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/codext/VERSION.txt b/codext/VERSION.txt
index 61ce01b..01b7568 100644
--- a/codext/VERSION.txt
+++ b/codext/VERSION.txt
@@ -1 +1 @@
-1.13.2
+1.13.3
diff --git a/codext/common/dummy.py b/codext/common/dummy.py
index f2dd2fb..7f4be19 100755
--- a/codext/common/dummy.py
+++ b/codext/common/dummy.py
@@ -35,7 +35,11 @@ def code(input, errors="strict"):
 reverse = lambda i, e="strict": (i[::-1], len(i))
 add("reverse", reverse, reverse)
 
-word_reverse = lambda i, e="strict": (" ".join(w[::-1] for w in i.split()), len(i))
+_revl = lambda i, wd=False: "".join((" ".join(w[::-1] for w in l.split()) if wd else l[::-1]) \
+                                    if not re.match(r"(\r?\n)", l) else l for l in re.split(r"(\r?\n)", i))
+line_reverse = lambda i, e="strict": (_revl(i), len(i))
+add("reverse-lines", line_reverse, line_reverse, r"^reverse[-_]lines$")
+word_reverse = lambda i, e="strict": (_revl(i, True), len(i))
 add("reverse-words", word_reverse, word_reverse, r"^reverse[-_]words$")
 
 strip_spaces = lambda i, e="strict": (i.replace(" ", ""), len(i))

From 564aa5a3743da63ab8b2de8564e9efe41288296b Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Tue, 29 Mar 2022 01:04:03 +0200
Subject: [PATCH 22/32] Refactored codec: uu

---
 codext/others/uuencode.py | 49 +++++++++++++++++++++++----------------
 1 file changed, 29 insertions(+), 20 deletions(-)

diff --git a/codext/others/uuencode.py b/codext/others/uuencode.py
index 5377493..a2f2fb6 100644
--- a/codext/others/uuencode.py
+++ b/codext/others/uuencode.py
@@ -7,39 +7,48 @@
 - decodes file content to str (read)
 - encodes file content from str to bytes (write)
 """
-from io import BytesIO
-from uu import decode as _dec, encode as _enc
+from binascii import a2b_uu as _dec, b2a_uu as _enc
 
 from ..__common__ import *
 
 
 __examples__ = {
     'enc(uu|uu_codec)': {'this is a test': "begin 666 -\n.=&AI<R!I<R!A('1E<W0 \n \nend\n"},
-    'dec(uu-encode)':   {'.=&AI<R!I<R!A(\'1E<W0 ': "this is a test"},
-    'dec(uuencode)':    {'.=&AI<R!I<R!A(\'1E<W0`': "this is a test"},
-    'dec(uu-codec)':    {'begin 666 -\n.=&AI<R!I<R!A(\'1E<W0`': "this is a test"},
-    'dec(uu_codec)':    {'\n.=&AI<R!I<R!A(\'1E<W0`\n\n\n`\nend': "this is a test"},
+    'dec(uu-encode)':   {'.=&AI<R!I<R!A(\'1E<W0 ': "this is a test", '.=&AI<R!I<R!A(\'1E<W0`': "this is a test"},
+    'dec(uu-codec)':    {'begin 666 -\n.=&AI<R!I<R!A(\'1E<W0`': None, '.=&AI<R!I<R!A(\'1E<W0`\n\n\n`\nend': None},
+    'dec(uu_codec)':    {'begin 777 test.txt\n.=&AI<R!I<R!A(\'1E<W0`\n\n\n`\nend': "this is a test"},
+    'enc-dec(uu)':      ["@random{512,1024,2048}"],
 }
 
 
 def uu_encode(text, errors="strict"):
-    out = BytesIO()
-    _enc(BytesIO(b(text)), out)
-    return out.getvalue(), len(text)
+    r, t = b"begin 666 -\n", b(text)
+    for i in range(0, len(t), 45):
+        r += _enc(t[i:i+45])
+    return r + b" \nend\n", len(text)
 
 
 def uu_decode(text, errors="strict"):
-    t = b(text).strip(b"\n")
-    if not re.match(b"^begin [1-7]{3} .*$", t.split(b"\n")[0]):
-        t = b"begin 666 -\n" + t
-    if not re.match(b"^end$", t.split(b"\n")[-1]):
-        t += [b"", b"`"][t[-1] == b"`"] + b"\nend"
-    out = BytesIO()
-    _dec(BytesIO(t), out, quiet=True)
-    out = out.getvalue()
-    while out.endswith(b"\x00" * 42):
-        out = out[:-42]
-    return out, len(text)
+    h = handle_error("uu", "strict", decode=True, kind="token", item="line")
+    lines = b(text).strip(b" \t\r\n\f").split(b"\n")
+    start, end = re.match(b"^begin [1-7]{3} .*$", lines[0]), re.match(b"^end$", lines[-1])
+    if start and end:
+        lines = lines[1:-1]
+    elif not start and not end:
+        pass
+    else:
+        if errors == "ignore":
+            lines = lines[1:] if start else lines[:-1]
+        elif end:
+            h(lines[0], 0)
+        elif start:
+            h(lines[-1], len(lines)-1)
+    while len(lines) > 0 and lines[-1].strip(b" \t\r\n\f") in [b"", b"`"]:
+        lines = lines[:-1]
+    r = b""
+    for l in lines:
+        r += _dec(l.strip(b" \t\r\n\f"))
+    return r, len(text)
 
 
 add("uu", uu_encode, uu_decode, pattern=r"^uu(?:[-_]?encode|[-_]codec)?$",

From fb292e998df7ef2eb0193cf52b0c1b2cd84ab614 Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Tue, 29 Mar 2022 01:05:03 +0200
Subject: [PATCH 23/32] Improved guessing and ranking

---
 codext/__common__.py | 130 +++++++++++++++++++++----------------------
 codext/__init__.py   |  12 ++--
 2 files changed, 72 insertions(+), 70 deletions(-)

diff --git a/codext/__common__.py b/codext/__common__.py
index 0c7ec17..ea1281a 100644
--- a/codext/__common__.py
+++ b/codext/__common__.py
@@ -744,6 +744,10 @@ def remove(name):
             json.dump(PERS_MACROS, f, indent=2)
     except KeyError:
         pass
+    try:
+        del CODECS_CACHE[name]
+    except KeyError:
+        pass
     for s in ["En", "De"]:
         try:
             delattr(builtins, "%s%scodeError" % (name.capitalize(), s))
@@ -864,6 +868,7 @@ def _handle_error(token, position, output="", eename=None):
         """
         if errors == "strict":
             msg = "'%s' codec can't %scode %s '%s' in %s %d"
+            token = ensure_str(token)
             token = token[:7] + "..." if len(token) > 10 else token
             err = getattr(builtins, exc)(msg % (eename or ename, ["en", "de"][decode], kind, token, item, position))
             err.output = output
@@ -968,36 +973,37 @@ def __register(search_function):
 codecs.register = __register
 
 
-def search(encoding_regex):
+def search(encoding_regex, extended=True):
     """ Function similar to lookup but allows to search for an encoding based on a regex instead. It searches this way
          into the local registry but also tries a simple lookup with the original lookup function. """
     matches = []
-    for search_function in __codecs_registry:
+    for search_function in CODECS_OVERWRITTEN + __codecs_registry:
         n = search_function.__name__
         for name in [n, n.replace("_", "-")]:
             if re.search(encoding_regex, name):
-                matches.append(name)
+                matches.append(n.replace("_", "-"))
                 continue
-        # in some cases, encoding_regex can match a generated string that uses a particular portion of its generating
-        #  pattern ; e.g. we expect encoding_regex="uu_" to find "uu" and "uu_codec" while it can also find "morse" or
-        #  "atbash" very rarely because of their dynamic patterns and the limited number of randomly generated strings
-        # so, we can use a qualified majority voting to ensure we do not get a "junk" encoding in the list of matches ;
-        #  executing 5 times the string generation for a given codec but adding the codec to the list of matches only
-        #  if we get at least 3 matches ensures that we consider up to 2 failures that could be stochastic, therefore
-        #  drastically decreasing the probability to get a "junk" encoding in the matches list
-        c = 0
-        for i in range(5):
-            for s in generate_strings_from_regex(search_function.__pattern__):
-                if re.search(encoding_regex, s):
-                    c += 1
+        if extended:
+            # in some cases, encoding_regex can match a generated string that uses a particular portion of its
+            #  generating pattern ; e.g. we expect encoding_regex="uu_" to find "uu" and "uu_codec" while it can also
+            #  find "morse" or "atbash" very rarely because of their dynamic patterns and the limited number of randomly
+            #  generated strings
+            # so, we can use a qualified majority voting to ensure we do not get a "junk" encoding in the list of
+            #  matches ; executing 5 times the string generation for a given codec but adding the codec to the list of
+            #  matches only if we get at least 3 matches ensures that we consider up to 2 failures that could be
+            #  stochastic, therefore drastically decreasing the probability to get a "junk" encoding in the matches list
+            c = 0
+            for i in range(5):
+                for s in generate_strings_from_regex(search_function.__pattern__):
+                    if re.search(encoding_regex, s):
+                        c += 1
+                        break
+                if c >= 3:
+                    matches.append(n)
                     break
-            if c >= 3:
-                matches.append(n)
-                break
     for s, n in ALIASES.items():
         if re.search(encoding_regex, s) or re.search(encoding_regex, n):
             matches.append(n)
-            break
     return sorted(list(set(matches)), key=_human_keys)
 codecs.search = search
 
@@ -1241,7 +1247,7 @@ def _validate(stop_function, lang_backend="none"):
 stopfunc._validate = _validate
 
 
-def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, encodings, codecs, result, found=(),
+def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, encodings, result, found=(),
             stop=True, show=False, scoring_heuristic=False, extended=False, debug=False):
     """ Perform a breadth-first tree search using a ranking logic to select and prune the list of codecs. """
     if depth > min_depth and stop_func(input):
@@ -1255,58 +1261,53 @@ def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, encodings
         return
     prev_enc = found[-1] if len(found) > 0 else ""
     e = encodings.get(depth, encodings.get(-1, []))
-    for new_input, encoding in __rank(prev_input, input, prev_enc, e, codecs, scoring_heuristic, extended):
+    for new_input, encoding in __rank(prev_input, input, prev_enc, e, scoring_heuristic, extended):
         if len(result) > 0 and stop:
             return
         if debug:
             print("[*] Depth %0{}d/%d: %s".format(len(str(max_depth))) % (depth+1, max_depth, encoding))
-        __guess(input, new_input, stop_func, depth+1, max_depth, min_depth, encodings, codecs, result,
-                found + (encoding, ), stop, show, scoring_heuristic, extended, debug)
+        __guess(input, new_input, stop_func, depth+1, max_depth, min_depth, encodings, result, found + (encoding, ),
+                stop, show, scoring_heuristic, extended, debug)
 
 
 def __make_encodings_dict(include, exclude):
     """ Process encodings inclusion and exclusion lists, listing categories and developping codecs' lists of possible
          encoding names. It also creates a cache with the CodecInfo objects for improving performance. """
-    codecs = {}
     def _develop(d, keep=True):
         d = d or {}
         for k, v in d.items():
             l, cc = [], [e for e in v if e in CODECS_CATEGORIES]
-            for enc in (list_encodings(*cc) if len(cc) > 0 or keep else [] + \
+            # list from in-scope categories and then everything that is not a category
+            for enc in ((list_encodings(*cc) if len(cc) > 0 or keep else []) + \
                         [e for e in v if e not in CODECS_CATEGORIES]):
-                try:
-                    g = lookup(enc, False).parameters['guess']
-                except:
-                    g = [enc]
-                if enc in g and not keep:  # e.g. "rot-1" => ["rot-1", "rot-2", ...] ; only "rot-1" is to be selected
+                g = []
+                for e in (search(enc, False) or [enc]):
+                    try:
+                        ci = lookup(e, False)
+                        g.extend(ci.parameters['guess'])
+                    except:
+                        pass
+                if enc in g:  # e.g. "rot-1" => ["rot-1", "rot-2", ...] ; only "rot-1" is to be selected
                     l.append(enc)
-                else:                      # e.g. "rot"   => ["rot-1", "rot-2", ...] ; all the "rot-N" shall be selected
+                else:         # e.g. "rot"   => ["rot-1", "rot-2", ...] ; all the "rot-N" shall be selected
                     l.extend(g)
-            d[k] = l
-            if keep:
-                for e in l:
-                    # cache newly loaded CodecInfo objects
-                    ci = lookup(e, False)
-                    n = ci.name
-                    if n in CODECS_CACHE:
-                        ci = CODECS_CACHE[n]  # keep the cached object
-                    else:
-                        CODECS_CACHE[n] = ci  # cache the new object
-                    codecs[e] = ci
+            d[k] = list(set(l))
         return d
     exclude = _develop(exclude, False)
-    return {k: [x for x in v if x not in exclude.get(k, [])] for k, v in _develop(include).items()}, codecs
+    return {k: [x for x in v if x not in exclude.get(k, [])] for k, v in _develop(include).items()}
 
 
-def __rank(prev_input, input, prev_encoding, encodings, codecs, heuristic=False, extended=False, yield_score=False):
+def __rank(prev_input, input, prev_encoding, encodings, heuristic=False, extended=False, yield_score=False):
     """ Filter valid encodings and rank them by relevance. """
     ranking = {}
-    for encoding in encodings:
+    for e in encodings:
         try:
-            score, new = __score(prev_input, input, prev_encoding, encoding, codecs.get(encoding), heuristic, extended)
-        except TypeError:
-            continue
-        ranking[encoding] = (score, new)
+            codec = CODECS_CACHE[e]
+        except KeyError:
+            CODECS_CACHE[e] = codec = lookup(e, False)
+        t = __score(prev_input, input, prev_encoding, e, codec, heuristic, extended)
+        if t:
+            ranking[e] = t
     for encoding, result in sorted(ranking.items(), key=lambda x: -x[1][0]):
         yield result if yield_score else result[1], encoding
 
@@ -1315,16 +1316,16 @@ class _Text(object):
     __slots__ = ["entropy", "lcharset", "len", "padding", "printables", "text"]
     
     def __init__(self, text, pad_char=None):
-        self.text = text
-        c = text[-1]
-        pad_char, last_char = (b(pad_char), c) if isinstance(c, int) else (pad_char, ord(c))
-        self.padding = pad_char is not None and last_char == ord(pad_char)
+        self.text = ensure_str(text)
+        c = self.text[-1]
+        pad_char, last_char = (chr(pad_char), chr(c)) if isinstance(c, int) else (pad_char, c)
+        self.padding = pad_char is not None and last_char == pad_char
         if self.padding:
             text = text.rstrip(pad_char)
-        self.len = len(text)
-        self.lcharset = len(set(text))
-        self.printables = float(len([c for c in text if (chr(c) if isinstance(c, int) else c) in printable])) / self.len
-        self.entropy = entropy(text)
+        self.len = len(self.text)
+        self.lcharset = len(set(self.text))
+        self.printables = float(len([c for c in self.text if c in printable])) / self.len
+        self.entropy = entropy(self.text)
 
 
 def __score(prev_input, input, prev_encoding, encoding, codec, heuristic=False, extended=False):
@@ -1386,13 +1387,14 @@ def __score(prev_input, input, prev_encoding, encoding, codec, heuristic=False,
                 except TypeError:
                     expf = expf(f)
             if isinstance(expf, (int, float)):
+                tmp = expf
                 expf = (1/f - .1 <= 1/expf <= 1/f + .1)
             elif isinstance(expf, (tuple, list)) and len(expf) == 2:
                 expf = 1/f - expf[1] <= 1/expf[0] <= 1/f + expf[1]
             s += [-1., .1][expf]
         # afterwards, if the input text has an entropy close to the expected one, give a bonus weighted on the
         #  number of input characters to take bad entropies of shorter strings into account
-        entr = sc.get('entropy', {})
+        entr = sc.get('entropy', lambda e: e)
         entr = entr.get(encoding, entr.get('default')) if isinstance(entr, dict) else entr
         if isinstance(entr, type(lambda: None)):
             try:  # this case allows to consider the current encoding name from the current codec
@@ -1401,7 +1403,7 @@ def __score(prev_input, input, prev_encoding, encoding, codec, heuristic=False,
                 entr = entr(obj.entropy)
         if entr is not None:
             # use a quadratic heuristic to compute a weight for the entropy delta, aligned on (100w,.1) and (200w,1)
-            d_entr = min(4e-05 * obj.len**2 - .003 * obj.len, 1) * abs(entr - entropy(new_input))
+            d_entr = min(5.958194e-06 * obj.len**2 - .002381 * obj.len, 1) * abs(entr - entropy(new_input))
             if d_entr <= .5:
                 s += .5 - d_entr
         # finally, if relevant, apply a custom bonus (e.g. when a regex pattern is matched)
@@ -1475,12 +1477,11 @@ def guess(input, stop_func=stopfunc.default, min_depth=0, max_depth=5, include=N
         if not isinstance(l, dict) or not all(isinstance(k, int) for k in l.keys()):
             raise ValueError("Include argument shall be a list or a dictionary with integer keys")
     # precompute encodings lists per depth and cache the related CodecInfo objects
-    encodings, codecs = __make_encodings_dict(include, exclude)
-    result = {}
+    encodings, result = __make_encodings_dict(include, exclude), {}
     try:
         # breadth-first search
         for d in range(max_depth):
-            __guess("", input, stop_func, 0, d+1, min_depth, encodings, codecs, result, tuple(found), stop, show,
+            __guess("", input, stop_func, 0, d+1, min_depth, encodings, result, tuple(found), stop, show,
                     scoring_heuristic, extended, debug)
             if stop and len(result) > 0:
                 break
@@ -1500,9 +1501,8 @@ def rank(input, extended=False, limit=-1, include=None, exclude=None):
     :param include:  inclusion list with category, codec or encoding names (nothing means include every encoding)
     :param exclude:  exclusion list with category, codec or encoding names (nothing means exclude no encoding)
     """
-    encodings, codecs = __make_encodings_dict({0: include or CODECS_CATEGORIES}, {0: exclude or []})
-    r = list(__rank(None, input, "", encodings[0], codecs, True, extended, True))
-    CODECS_CACHE = {}
+    encodings = __make_encodings_dict({-1: include or CODECS_CATEGORIES}, {-1: exclude or []})
+    r = list(__rank(None, input, "", encodings[-1], True, extended, True))
     return r[:limit] if len(r) > 1 else r
 codecs.rank = rank
 
diff --git a/codext/__init__.py b/codext/__init__.py
index 661357a..f95abb8 100644
--- a/codext/__init__.py
+++ b/codext/__init__.py
@@ -154,12 +154,14 @@ def _format_action_invocation(self, action):
     guess.add_argument("-v", "--verbose", action="store_true",
                        help="show guessing information and steps (default: False)")
     rank = sparsers.add_parser("rank", help="rank the most probable encodings based on the given input", **kw)
-    rank.add_argument("-c", "--codec-categories", nargs="*", action="extend", metavar="CATEGORY",
-                      help="codec categories to be included in the search ; format: string|tuple|list(strings|tuples)")
-    rank.add_argument("-e", "--exclude-codecs", nargs="*", action="extend", metavar="CODEC",
-                      help="codecs to be explicitely not used ; format: string|tuple|list(strings|tuples)")
+    rank.add_argument("-e", "--exclude", nargs="*", action="extend", metavar="CAT|COD|ENC",
+                      help="categories, codecs and encodings to be explicitely not used ;\n "
+                           "format: [category|codec|encoding] OR depth:[category|codec|encoding]")
     rank.add_argument("-E", "--extended", action="store_true",
                       help="while using the scoring heuristic, also consider null scores (default: False)")
+    rank.add_argument("-i", "--include", nargs="*", action="extend", metavar="CAT|COD|ENC",
+                      help="categories, codecs and encodings to be explicitely used ;\n "
+                           "format: [category|codec|encoding] OR depth:[category|codec|encoding]")
     rank.add_argument("-l", "--limit", type=int, default=10, help="limit the number of displayed results")
     search = sparsers.add_parser("search", help="search for codecs")
     search.add_argument("pattern", nargs="+", help="encoding pattern to search")
@@ -243,7 +245,7 @@ def _format_action_invocation(self, action):
             if len(r) == 0:
                 print("Could not decode :-(")
         elif args.command == "rank":
-            for i, e in codecs.rank(c, args.extended, args.limit, args.codec_categories, args.exclude_codecs):
+            for i, e in codecs.rank(c, args.extended, args.limit, args.include, args.exclude):
                 s = "[+] %.5f: %s" % (i[0], e)
                 print(s if len(s) <= 80 else s[:77] + "...")
     except Exception as e:

From b4a29503392cfdbb89f6e58d48441c9f60e8e6e4 Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Tue, 29 Mar 2022 01:05:19 +0200
Subject: [PATCH 24/32] Refined case codecs

---
 codext/common/cases.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/codext/common/cases.py b/codext/common/cases.py
index 65fbdf2..8aa87e4 100644
--- a/codext/common/cases.py
+++ b/codext/common/cases.py
@@ -20,20 +20,20 @@
 
 capitalize = lambda i, e="strict": (i.capitalize(), len(i))
 uncapitalize = lambda i, e="strict": (i[0].lower() + i[1:] if len(i) > 0 else "", len(i))
-add("capitalize", capitalize, uncapitalize)
+add("capitalize", capitalize, uncapitalize, penalty=.2)
 
 lowercase, uppercase = lambda i, e="strict": (i.lower(), len(i)), lambda i, e="strict": (i.upper(), len(i))
-add("uppercase", uppercase, lowercase, r"^upper(?:case)?$")
-add("lowercase", lowercase, uppercase, r"^lower(?:case)?$")
+add("uppercase", uppercase, lowercase, r"^upper(?:case)?$", penalty=.2)
+add("lowercase", lowercase, uppercase, r"^lower(?:case)?$", penalty=.2)
 
 slugify = lambda i, e="strict", d="-": (re.sub(r"[^0-9a-z]+", d, i.lower()).strip(d), len(i))
 add("slugify", lambda i, e="strict": slugify(i, e), None, r"^(?:slug(?:ify)?|kebab(?:[-_]?case)?)$")
 add("snakecase", lambda i, e="strict": slugify(i, e, "_"), None, r"^snake(?:[-_]?case)?$")
 
 swapcase = lambda i, e="strict": (i.swapcase(), len(i))
-add("swapcase", swapcase, swapcase, r"^(?:swap(?:[-_]?case)?|invert(?:case)?)$")
+add("swapcase", swapcase, swapcase, r"^(?:swap(?:[-_]?case)?|invert(?:case)?)$", penalty=.2)
 
 title = lambda i, e="strict": (i.title(), len(i))
 untitle = lambda i, e="strict": (" ".join(w[0].lower() + w[1:] if len(w) > 0 else "" for w in i.split()), len(i))
-add("title", title, untitle)
+add("title", title, untitle, penalty=.2)
 

From 224b2d005eb3020b9470ab8b6cf182bcd0a49b89 Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Tue, 29 Mar 2022 01:05:44 +0200
Subject: [PATCH 25/32] Refined tests/test_common

---
 tests/test_common.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/test_common.py b/tests/test_common.py
index 6eddd7e..934155f 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -151,7 +151,7 @@ def test_guess_decode(self):
         self.assertIsNotNone(codext.stopfunc._validate("flag"))
         _l = lambda d: list(d.items())[0][1] if len(d) > 0 else None
         codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1),
-                   "test", padding_char="=", no_error=True, bonus_func=lambda *a: True, penalty=-.5)
+                   "^test(?:_codec)?$", padding_char="=", no_error=True, bonus_func=lambda *a: True, penalty=-.5)
         self.assertIn("test-codec", codext.list_encodings("test"))
         self.assertEqual(codext.decode("TEST=", "test"), "TEST")
         self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, include="test", max_depth=2,
@@ -203,13 +203,15 @@ def test_guess_decode(self):
         self.assertRaises(ValueError, codext.stopfunc._reload_lang, "DOES_NOT_EXIST")
     
     def test_rank_input(self):
+        codext.remove("test_codec")
+        self.assertRaises(LookupError, codext.encode, "TEST", "test")
         codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1),
-                   "test", padding_char="=", no_error=True, bonus_func=lambda *a: True, penalty=-.5)
+                   "^test(?:_codec)?$", padding_char="=", no_error=True, penalty=1.)
         STR = "This is a test string !"
         ENC = codext.encode(STR, "base64")
         self.assertTrue(len(codext.rank(ENC)) > 20)
         self.assertEqual(len(codext.rank(ENC, limit=20)), 20)
-        self.assertEqual(codext.rank(ENC, exclude=["rot"])[0][1], "base64")
+        self.assertIn(codext.rank(ENC, exclude=["rot"])[0][1], ["base64", "base64-url"])
         self.assertEqual(codext.rank(ENC, include="base")[0][0][1], STR)
         self.assertEqual(codext.rank(ENC, include=["base"])[0][0][1], STR)
         self.assertIsNotNone(codext.rank(ENC, include=["base"], exclude=["does_not_exist"])[0][0][1], STR)

From dcbeba184f89ac7fd31bf7b4dfbca2fc1809ee70 Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Tue, 29 Mar 2022 01:31:45 +0200
Subject: [PATCH 26/32] Fixed scoring for compression codecs

---
 codext/__common__.py            | 4 ++--
 codext/compressions/__init__.py | 6 ++++++
 codext/compressions/gzipp.py    | 2 +-
 codext/compressions/lz77.py     | 2 +-
 codext/compressions/pkzip.py    | 6 +++---
 5 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/codext/__common__.py b/codext/__common__.py
index ea1281a..f65d210 100644
--- a/codext/__common__.py
+++ b/codext/__common__.py
@@ -1402,8 +1402,8 @@ def __score(prev_input, input, prev_encoding, encoding, codec, heuristic=False,
             except TypeError:
                 entr = entr(obj.entropy)
         if entr is not None:
-            # use a quadratic heuristic to compute a weight for the entropy delta, aligned on (100w,.1) and (200w,1)
-            d_entr = min(5.958194e-06 * obj.len**2 - .002381 * obj.len, 1) * abs(entr - entropy(new_input))
+            # use a quadratic heuristic to compute a weight for the entropy delta, aligned on (256,.2) and (512,1)
+            d_entr = min(3.04575e-06 * obj.len**2 + .000394 * obj.len, 1) * abs(entr - entropy(new_input))
             if d_entr <= .5:
                 s += .5 - d_entr
         # finally, if relevant, apply a custom bonus (e.g. when a regex pattern is matched)
diff --git a/codext/compressions/__init__.py b/codext/compressions/__init__.py
index 37f1fa5..606a1dc 100755
--- a/codext/compressions/__init__.py
+++ b/codext/compressions/__init__.py
@@ -4,3 +4,9 @@
 from .lz78 import *
 from .pkzip import *
 
+
+for e in list_encodings("compression"):  # every encoding name returned for the "compression" category
+    ci = lookup(e, False)
+    ci.parameters['scoring']['entropy'] = 7.9  # same value as the per-codec entropy=7.9 arguments dropped by this patch
+    ci.parameters['scoring']['expansion_factor'] = lambda f: f  # identity: the factor f is returned as is
+
diff --git a/codext/compressions/gzipp.py b/codext/compressions/gzipp.py
index da52b5a..14e65bc 100755
--- a/codext/compressions/gzipp.py
+++ b/codext/compressions/gzipp.py
@@ -40,5 +40,5 @@ def gzip_decompress(data, errors="strict"):
     return r, len(r)
 
 
-add("gzip", gzip_compress, gzip_decompress, entropy=7.9)
+add("gzip", gzip_compress, gzip_decompress)
 
diff --git a/codext/compressions/lz77.py b/codext/compressions/lz77.py
index 662f02c..bdfcf13 100644
--- a/codext/compressions/lz77.py
+++ b/codext/compressions/lz77.py
@@ -70,5 +70,5 @@ def lz77_decompress(input, errors="strict"):
     return out, len(out)
 
 
-add("lz77", lz77_compress, lz77_decompress, entropy=7.9)
+add("lz77", lz77_compress, lz77_decompress)
 
diff --git a/codext/compressions/pkzip.py b/codext/compressions/pkzip.py
index ebbcbce..47d9cd5 100755
--- a/codext/compressions/pkzip.py
+++ b/codext/compressions/pkzip.py
@@ -46,11 +46,11 @@ def _decode(data, errors="strict"):
 
 
     add("pkzip_deflate", pkzip_encode(8), pkzip_decode(8, "deflate"), r"(?:(?:pk)?zip[-_])?deflate",
-        entropy=7.9, examples=__examples1__, guess=["deflate"])
+        examples=__examples1__, guess=["deflate"])
 
     add("pkzip_bzip2", pkzip_encode(12), pkzip_decode(12, "bzip2"), r"(?:(?:pk)?zip[-_])?bz(?:ip)?2",
-        entropy=7.9, examples=__examples2__, guess=["bz2"])
+        examples=__examples2__, guess=["bz2"])
 
     add("pkzip_lzma", pkzip_encode(14), pkzip_decode(14, "lzma"), r"(?:(?:pk)?zip[-_])?lzma",
-        entropy=7.9, examples=__examples3__, guess=["lzma"])
+        examples=__examples3__, guess=["lzma"])
 

From a1b41fab747e36bf2eaf7f3037715f9fc7a28ddf Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Tue, 29 Mar 2022 08:31:49 +0200
Subject: [PATCH 27/32] Fixed minor issues

---
 codext/__common__.py | 4 ++--
 tests/test_common.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/codext/__common__.py b/codext/__common__.py
index f65d210..41cb5b2 100644
--- a/codext/__common__.py
+++ b/codext/__common__.py
@@ -1308,7 +1308,7 @@ def __rank(prev_input, input, prev_encoding, encodings, heuristic=False, extende
         t = __score(prev_input, input, prev_encoding, e, codec, heuristic, extended)
         if t:
             ranking[e] = t
-    for encoding, result in sorted(ranking.items(), key=lambda x: -x[1][0]):
+    for encoding, result in sorted(ranking.items(), key=lambda x: (-x[1][0], x[0])):
         yield result if yield_score else result[1], encoding
 
 
@@ -1403,7 +1403,7 @@ def __score(prev_input, input, prev_encoding, encoding, codec, heuristic=False,
                 entr = entr(obj.entropy)
         if entr is not None:
             # use a quadratic heuristic to compute a weight for the entropy delta, aligned on (256,.2) and (512,1)
-            d_entr = min(3.04575e-06 * obj.len**2 + .000394 * obj.len, 1) * abs(entr - entropy(new_input))
+            d_entr = min(3.04575e-06 * obj.len**2 + .000394 * obj.len, 1) * abs(entr - obj.entropy)
             if d_entr <= .5:
                 s += .5 - d_entr
         # finally, if relevant, apply a custom bonus (e.g. when a regex pattern is matched)
diff --git a/tests/test_common.py b/tests/test_common.py
index 934155f..8bbf410 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -211,7 +211,7 @@ def test_rank_input(self):
         ENC = codext.encode(STR, "base64")
         self.assertTrue(len(codext.rank(ENC)) > 20)
         self.assertEqual(len(codext.rank(ENC, limit=20)), 20)
-        self.assertIn(codext.rank(ENC, exclude=["rot"])[0][1], ["base64", "base64-url"])
+        self.assertIn(codext.rank(ENC, exclude=["rot"])[0][1], ["base64", "base64-url", "base64-inv"])
         self.assertEqual(codext.rank(ENC, include="base")[0][0][1], STR)
         self.assertEqual(codext.rank(ENC, include=["base"])[0][0][1], STR)
         self.assertIsNotNone(codext.rank(ENC, include=["base"], exclude=["does_not_exist"])[0][0][1], STR)

From 3d7f43dea12c40e330b6be8b8fb5011d3c8ee13b Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Tue, 29 Mar 2022 08:33:03 +0200
Subject: [PATCH 28/32] New release

---
 codext/VERSION.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/codext/VERSION.txt b/codext/VERSION.txt
index 01b7568..80138e7 100644
--- a/codext/VERSION.txt
+++ b/codext/VERSION.txt
@@ -1 +1 @@
-1.13.3
+1.13.4

From 281ca1bacbbc0891f7e2987bbbd161f507823bd3 Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Wed, 7 Sep 2022 20:36:19 +0200
Subject: [PATCH 29/32] Added codec: tokenize

---
 codext/common/dummy.py | 12 +++++++++++-
 docs/manipulations.md  | 10 +++++++++-
 tests/test_manual.py   |  2 ++
 3 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/codext/common/dummy.py b/codext/common/dummy.py
index 7f4be19..b45c023 100755
--- a/codext/common/dummy.py
+++ b/codext/common/dummy.py
@@ -22,7 +22,7 @@ def code(input, errors="strict"):
 # important note:                                              ^
 #                                           using "{2}" here instead will break the codec
 #  this is due to the fact the codext.__common__.generate_string_from_regex DOES NOT handle ASSERT_NOT (?!) and will
-#   faill to generate a valid instance in lookup(...) when an encoding name is to be generated to get the CodecInfo
+#   fail to generate a valid instance in lookup(...) when an encoding name is to be generated to get the CodecInfo
 
 
 def substitute(token, replacement):
@@ -45,3 +45,13 @@ def code(input, errors="strict"):
 strip_spaces = lambda i, e="strict": (i.replace(" ", ""), len(i))
 add("strip-spaces", strip_spaces, strip_spaces, guess=None)
 
+def tokenize(n):  # codec function factory ; n is the encoding name, e.g. "tokenize-2", "tokenize_2" or "tokenize2"
+    size = int(n[len("tokenize"):].lstrip("-_"))  # token length taken from the end of the encoding name
+    def code(input, errors="strict"):
+        total = len(input)
+        if size > total:  # a token longer than the whole input is reported as an unknown encoding
+            raise LookupError("unknown encoding: %s" % n)
+        return " ".join(input[k:k+size] for k in range(0, total, size)), total
+    return code
+add("tokenize", tokenize, tokenize, r"^(tokenize[-_]?[1-9][0-9]*)$", guess=None)
+
diff --git a/docs/manipulations.md b/docs/manipulations.md
index 7962278..8857ca7 100644
--- a/docs/manipulations.md
+++ b/docs/manipulations.md
@@ -43,11 +43,12 @@ These transformation functions are simple string transformations.
 
 **Codec** | **Conversions** | **Aliases** | **Comment**
 :---: | :---: | --- | ---
-`replace` | text <-> text with single-char replaced |  | 
+`replace` | text <-> text with multi-chars replaced |  | parametrized with a _string_ and its _replacement_
 `reverse` | text <-> reversed text |  | 
 `reverse-words` | text <-> reversed words |  | same as `reverse` but not on the whole text, only on the words (text split by whitespace)
 `strip-spaces` | text <-> all whitespaces stripped |  | 
 `substitute` | text <-> text with token substituted |  | 
+`tokenize` | text <-> text split in tokens of length N |  | parametrized with _N_
 
 As in the previous section, these transformations have no interest while using them in Python but well while using `codext` from the terminal (see [*CLI tool*](cli.html)).
 
@@ -58,6 +59,13 @@ $ echo -en "test string" | codext encode reverse-words | codext encode reverse r
 string_test
 ```
 
+Another example:
+
+```sh
+$ echo -en "3132333435" | codext encode tokenize-2
+31 32 33 34 35
+```
+
 Or using encodings chaining:
 
 ```sh
diff --git a/tests/test_manual.py b/tests/test_manual.py
index 4211df7..64b1843 100644
--- a/tests/test_manual.py
+++ b/tests/test_manual.py
@@ -100,6 +100,8 @@ def test_codec_dummy_str_manips(self):
         self.assertEqual(codecs.decode(STR.replace("i", "1"), "replace-1i"), STR)
         self.assertEqual(codecs.encode(STR, "substitute-this/that"), STR.replace("this", "that"))
         self.assertEqual(codecs.decode(STR.replace("this", "that"), "substitute-that/this"), STR)
+        self.assertEqual(codecs.encode(STR, "tokenize-2"), "th is  i s  a  te st")
+        self.assertRaises(LookupError, codecs.encode, STR, "tokenize-200")
     
     def test_codec_hash_functions(self):
         STR = b"This is a test string!"

From 4792a99b3a3780765b80c68f0bbcb46da27a2f7b Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Sun, 11 Sep 2022 19:13:41 +0200
Subject: [PATCH 30/32] Fixed minor bugs

---
 codext/__common__.py    | 26 +++++++++++++++-----------
 tests/test_generated.py | 12 +++++++++---
 2 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/codext/__common__.py b/codext/__common__.py
index 41cb5b2..9d9400c 100644
--- a/codext/__common__.py
+++ b/codext/__common__.py
@@ -109,10 +109,11 @@ def __new__(cls, name):
         for action, examples in (self.codecs[0].parameters.get('examples', {}) or {'enc-dec(': ["T3st str!"]}).items():
             if re.match(r"enc(-dec)?\(", action):
                 for e in (examples.keys() if action.startswith("enc(") else examples or []):
-                    rd = re.match(r"\@random(?:\{(\d+(?:,(\d+))*?)\})?$", e)
+                    rd = re.match(r"\@(i?)random(?:\{(\d+(?:,(\d+))*?)\})?$", e)
                     if rd:
-                        for n in (rd.group(1) or "512").split(","):
-                            self.encode("".join(chr(randint(0, 255)) for i in range(int(n))))
+                        for n in (rd.group(2) or "512").split(","):
+                            s = "".join(chr(randint(0, 255)) for i in range(int(n)))
+                            self.encode(s.lower() if rd.group(1) else s)
                         continue
                     self.encode(e)
         
@@ -1276,10 +1277,9 @@ def __make_encodings_dict(include, exclude):
     def _develop(d, keep=True):
         d = d or {}
         for k, v in d.items():
-            l, cc = [], [e for e in v if e in CODECS_CATEGORIES]
+            l, cc, sc = [], [e for e in v if e in CODECS_CATEGORIES], [e for e in v if e not in CODECS_CATEGORIES]
             # list from in-scope categories and then everything that is not a category
-            for enc in ((list_encodings(*cc) if len(cc) > 0 or keep else []) + \
-                        [e for e in v if e not in CODECS_CATEGORIES]):
+            for enc in ((list_encodings(*cc) if (len(cc) > 0 or keep) and len(sc) == 0 else []) + sc):
                 g = []
                 for e in (search(enc, False) or [enc]):
                     try:
@@ -1293,8 +1293,8 @@ def _develop(d, keep=True):
                     l.extend(g)
             d[k] = list(set(l))
         return d
-    exclude = _develop(exclude, False)
-    return {k: [x for x in v if x not in exclude.get(k, [])] for k, v in _develop(include).items()}
+    _excl, _incl = _develop(exclude, False), _develop(include)
+    return {k: [x for x in v if x not in _excl.get(k, [])] for k, v in _incl.items()}
 
 
 def __rank(prev_input, input, prev_encoding, encodings, heuristic=False, extended=False, yield_score=False):
@@ -1304,7 +1304,10 @@ def __rank(prev_input, input, prev_encoding, encodings, heuristic=False, extende
         try:
             codec = CODECS_CACHE[e]
         except KeyError:
-            CODECS_CACHE[e] = codec = lookup(e, False)
+            try:
+                CODECS_CACHE[e] = codec = lookup(e, False)
+            except LookupError:
+                continue
         t = __score(prev_input, input, prev_encoding, e, codec, heuristic, extended)
         if t:
             ranking[e] = t
@@ -1321,7 +1324,7 @@ def __init__(self, text, pad_char=None):
         pad_char, last_char = (chr(pad_char), chr(c)) if isinstance(c, int) else (pad_char, c)
         self.padding = pad_char is not None and last_char == pad_char
         if self.padding:
-            text = text.rstrip(pad_char)
+            text = text.rstrip(b(pad_char) if isinstance(text, bytes) else pad_char)
         self.len = len(self.text)
         self.lcharset = len(set(self.text))
         self.printables = float(len([c for c in self.text if c in printable])) / self.len
@@ -1501,7 +1504,8 @@ def rank(input, extended=False, limit=-1, include=None, exclude=None):
     :param include:  inclusion list with category, codec or encoding names (nothing means include every encoding)
     :param exclude:  exclusion list with category, codec or encoding names (nothing means exclude no encoding)
     """
-    encodings = __make_encodings_dict({-1: include or CODECS_CATEGORIES}, {-1: exclude or []})
+    encodings = __make_encodings_dict(include if isinstance(include, dict) else {-1: include or CODECS_CATEGORIES},
+                                      exclude if isinstance(exclude, dict) else {-1: exclude or []})
     r = list(__rank(None, input, "", encodings[-1], True, extended, True))
     return r[:limit] if len(r) > 1 else r
 codecs.rank = rank
diff --git a/tests/test_generated.py b/tests/test_generated.py
index 6b89129..614562f 100644
--- a/tests/test_generated.py
+++ b/tests/test_generated.py
@@ -36,6 +36,11 @@ def _template(self):
                 for ename in m.groups():
                     if ename is None:
                         continue
+                    # buggy generated encoding names
+                    try:
+                        lookup(ename)
+                    except LookupError:
+                        continue
                     # erroneous encoding name test
                     if examples is None:
                         self.assertRaises(LookupError, f1, "test", ename)
@@ -72,11 +77,12 @@ def _template(self):
                     # examples validation tests
                     if k.startswith("enc-dec") and isinstance(examples, list):
                         for e in examples[:]:
-                            rd = re.match(r"\@random(?:\{(\d+(?:,(\d+))*?)\})?$", e)
+                            rd = re.match(r"\@(i?)random(?:\{(\d+(?:,(\d+))*?)\})?$", e)
                             if rd:
                                 examples.remove(e)
-                                for n in (rd.group(1) or "512").split(","):
-                                    examples.append("".join(chr(randint(0, 255)) for i in range(int(n))))
+                                for n in (rd.group(2) or "512").split(","):
+                                    s = "".join(chr(randint(0, 255)) for i in range(int(n)))
+                                    examples.append(s.lower() if rd.group(1) else s)
                         for s in [""] + examples:
                             self.assertEqual(icdec(f2(icenc(f1(s, ename)), ename)), icdec(s))
                             self.assertEqual(icdec(f2(icenc(f1(b(s), ename)), ename)), b(icdec(s)))

From b4e1eb66fb8764df992cc6434f0e69a6eedbd9b5 Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Sun, 11 Sep 2022 19:13:57 +0200
Subject: [PATCH 31/32] Added codec: kbshift

---
 codext/others/__init__.py |  1 +
 codext/others/kbshift.py  | 66 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+)
 create mode 100755 codext/others/kbshift.py

diff --git a/codext/others/__init__.py b/codext/others/__init__.py
index aa7ffa2..3bbf102 100755
--- a/codext/others/__init__.py
+++ b/codext/others/__init__.py
@@ -1,5 +1,6 @@
 # -*- coding: UTF-8 -*-
 from .dna import *
+from .kbshift import *
 from .letters import *
 from .markdown import *
 from .uuencode import *
diff --git a/codext/others/kbshift.py b/codext/others/kbshift.py
new file mode 100755
index 0000000..2bd0991
--- /dev/null
+++ b/codext/others/kbshift.py
@@ -0,0 +1,66 @@
+# -*- coding: UTF-8 -*-
+"""Keyboard-Shift Codec - keyboard line shifting content encoding.
+
+This codec:
+- en/decodes strings from str to str
+- en/decodes strings from bytes to bytes
+- decodes file content to str (read)
+- encodes file content from str to bytes (write)
+"""
+from ..__common__ import *
+
+
+LAYOUTS = {
+    'ansi':      "~!@#$%^&*()_+\n`1234567890-=\nqwertyuiop{}|\n[]\\\nasdfghjkl:\"\n;'\nzxcvbnm<>\n,./",
+    'azerty':    "azertyuiop\nqsdfghjklm\nwxcvbn",
+    'azerty-be': "³1234567890°_\n²&é\"'(§è!çà)-\n|@#^{}\nazertyuiop$\n€[]\n¨*\nqsdfghjklm%£\nùµ\n´`\n>wxcvbn?./+\n<,;:=\n\\~",
+    'azerty-fr': "1234567890°+\n²&é\"'(-è_çà)=\n~#{[|`\\^@]}\nazertyuiop¨£\nqsdfghjklm%µ\nù*\n>wxcvbn?./§\n<,;:!",
+    'dvorak':    "~!@#$%^&*(){}\n`1234567890[]\n\"<>pyfgcrl?+|\n',./=\\\naoeuidhtns_\n-\n:qjkxbmwvz\n;",
+    'qwerty':    "qwertyuiop\nasdfghjkl\nzxcvbnm",
+    'qwerty-us': "~!@#$%^&*()_+\n`1234567890-=\nqwertyuiop{}|\n[]\\\nasdfghjkl:\"\n;,\nzxcvbnm<>?\n./",
+}
+# group the layout names per length of their longest keyboard row
+__per_len = {}
+for k, s in LAYOUTS.items():
+    i = max(map(len, s.split("\n")))
+    __per_len.setdefault(i, []).append(k)
+
+
+__examples__ = {"enc-dec(kbshift_%s_%d)" % (kb, n): ["@irandom{256,512}"] for n in range(1, 10) for kb in LAYOUTS}
+# only shifts 1..mlen-1 are generated: 0 and mlen do not match the pattern registered at the end of this module
+__guess__ = []
+for mlen, kbs in __per_len.items():
+    __guess__.extend("kbshift-%s-%d" % (k, i) for k in kbs for i in range(1, mlen))
+
+
+def _kbshift(text, keyboard="azerty", n=1, decode=False):
+    # each character is rotated by n positions within its layout row: backwards when encoding, forwards when decoding
+    offset, shifted = (n if decode else -n), []
+    for char in text:
+        low = char.lower()  # rows are searched with the lowercased character, hence the input case is not preserved
+        # only the first row holding the character is considered
+        row = next((r for r in LAYOUTS[keyboard].splitlines() if low in r), None)
+        # characters that are not found in any row are left untouched
+        shifted.append(char if row is None else row[(row.index(low) + offset) % len(row)])
+    return "".join(shifted)
+
+
+def kbshift_encode(scheme):
+    layout, digits = re.match(r"^(.*?)[-_]?(\d+)$", scheme or "azerty-1").groups()  # no scheme => "azerty", shift 1
+    def encode(text, errors="strict"):
+        shifted = _kbshift(ensure_str(text), keyboard=layout, n=int(digits), decode=False)
+        return shifted, len(shifted)
+    return encode
+
+
+def kbshift_decode(scheme):
+    layout, digits = re.match(r"^(.*?)[-_]?(\d+)$", scheme or "azerty-1").groups()  # no scheme => "azerty", shift 1
+    def decode(text, errors="strict"):
+        restored = _kbshift(ensure_str(text), keyboard=layout, n=int(digits), decode=True)
+        return restored, len(restored)
+    return decode
+
+
+add("kbshift", kbshift_encode, kbshift_decode, entropy=lambda e: e,printables_rate=lambda pr: pr, transitive=True,
+    pattern=r"^kbshift(?:|[-_]((?:az|qw)erty[-_]?[1-9]|(?:ansi|azerty-(?:be|fr)|dvorak|qwerty-us)[-_]?(?:[1-9]|1[0-2])))$")
+

From cd234d5d97867f1470b45499694f3776aa74569b Mon Sep 17 00:00:00 2001
From: dhondta <alexandre.dhondt@gmail.com>
Date: Mon, 12 Sep 2022 21:53:13 +0200
Subject: [PATCH 32/32] New release

---
 codext/VERSION.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/codext/VERSION.txt b/codext/VERSION.txt
index 80138e7..850e742 100644
--- a/codext/VERSION.txt
+++ b/codext/VERSION.txt
@@ -1 +1 @@
-1.13.4
+1.14.0