bbhunter · pull · Feb 22, 2022 · Feb 22, 2022 · Feb 22, 2022 · Feb 22, 2022
diff --git a/.coveragerc b/.coveragerc
@@ -11,7 +11,7 @@ exclude_lines =
     def main\(\)\:
     def __stdin_pipe\(\)\:
     for line in __stdin_pipe\(\)\:
-    def __literal_eval\(o\)\:
+    def __format_list\(items, include\=True\)\:
     def __print_tabular\(lst, space\=4\)\:
     except ImportError:
     except NameError:
@@ -20,3 +20,8 @@ exclude_lines =
     if PY3
     def encode\(self, input, final\=False\)\:
     def decode\(self, input, final\=False\)\:
+    def _detect\(text\)\:
+    def _lang\(lang\)\:
+    if stopfunc\.LANG_BACKEND\:
+    def _validate\(stop_function, lang_backend\=\"none\"\)\:
+    except KeyboardInterrupt\:
diff --git a/README.md b/README.md
@@ -219,6 +219,7 @@ o
 - [X] `base4`: conversion to quarternary (with a variant with a reversed alphabet)
 - [X] `base8`: simple conversion to octal (with a variant with a reversed alphabet)
 - [X] `base10`: simple conversion to decimal
+- [X] `base11`: conversion to the decimal digits plus the letter "*a*"
 - [X] `base16`: simple conversion to hexadecimal (with a variant holding an alphabet with digits and letters inverted)
 - [X] `base26`: conversion to alphabet letters
 - [X] `base32`: classical conversion according to the RFC4648 with all its variants ([zbase32](https://philzimmermann.com/docs/human-oriented-base-32-encoding.txt), extended hexadecimal, [geohash](https://en.wikipedia.org/wiki/Geohash), [Crockford](https://www.crockford.com/base32.html))

diff --git a/codext/VERSION.txt b/codext/VERSION.txt
@@ -1 +1 @@
-1.12.2
+1.14.0
diff --git a/codext/__common__.py b/codext/__common__.py
diff --git a/codext/__init__.py b/codext/__init__.py
@@ -27,18 +27,23 @@
 reset()
 
 
-# overwritten native codec
-add("uu", lambda i, e="strict": orig_lookup("uu").encode(b(i), e),
-          lambda i, e="strict": orig_lookup("uu").decode(b(i), e),
-          pattern=r"^uu(?:[-_]encode|codec)?$", add_to_codecs=True, category="native")
-
-
-def __literal_eval(o):
-    """ Non-failing ast.literal_eval alias function. """
-    try:
-        return literal_eval(str(o))
-    except ValueError:
-        return literal_eval("'" + str(o) + "'")
+def __format_list(items, include=True):
+    if items is None:
+        return
+    d = {-1: list_encodings() if include else []}
+    for n, i in enumerate(items):
+        try:
+            depth, i = i.split(":")
+            depth = int(depth.strip().replace("~", "-"))
+            if depth < 0:
+                depth = -1
+        except ValueError:
+            if n == 0:
+                d[-1] = []
+            depth = -1
+        d.setdefault(depth, [])
+        d[depth].append(i.strip())
+    return d
 
 
 def __print_tabular(lst, space=4):
@@ -70,6 +75,19 @@ def __print_tabular(lst, space=4):
 
 def main():
     import argparse, os
+
+    class _CustomFormatter(argparse.RawTextHelpFormatter):
+        def __init__(self, prog, **kwargs):
+            kwargs['max_help_position'] = 32
+            super(_CustomFormatter, self).__init__(prog, **kwargs)
+
+        def _format_action_invocation(self, action):
+            if not action.option_strings:
+                metavar, = self._metavar_formatter(action, action.dest)(1)
+                return metavar
+            else:
+                return ", ".join(action.option_strings)
+
     descr = "Codecs Extension (CodExt) {}\n\nAuthor   : {} ({})\nCopyright: {}\nLicense  : {}\nSource   : {}\n" \
             "\nThis tool allows to encode/decode input strings/files with an extended set of codecs.\n\n" \
             .format(__version__, __author__, __email__, __copyright__, __license__, __source__)
@@ -87,78 +105,79 @@ def main():
         "echo -en \"test\" | codext encode base64 gzip | codext guess",
         "echo -en \"test\" | codext encode base64 gzip | codext guess gzip -c base",
     ])
-    parser = argparse.ArgumentParser(description=descr, epilog=examples, formatter_class=argparse.RawTextHelpFormatter)
-    sparsers = parser.add_subparsers(dest="command", help="command to be executed")
+    kw = {'formatter_class': _CustomFormatter}
+    parser = argparse.ArgumentParser(description=descr, epilog=examples, **kw)
+    kw2 = {'required': True} if PY3 else {}
+    sparsers = parser.add_subparsers(dest="command", help="command to be executed", **kw2)
     parser.add_argument("-i", "--input-file", dest="infile", help="input file (if none, take stdin as input)")
     parser.add_argument("-o", "--output-file", dest="outfile", help="output file (if none, display result to stdout)")
     parser.add_argument("-s", "--strip-newlines", action="store_true", dest="strip",
                         help="strip newlines from input (default: False)")
-    encode = sparsers.add_parser("encode", help="encode input using the specified codecs")
+    encode = sparsers.add_parser("encode", help="encode input using the specified codecs", **kw)
     encode.add_argument("encoding", nargs="+", help="list of encodings to apply")
     encode.add_argument("-e", "--errors", default="strict", choices=["ignore", "leave", "replace", "strict"],
                         help="error handling (default: strict)")
-    decode = sparsers.add_parser("decode", help="decode input using the specified codecs")
+    decode = sparsers.add_parser("decode", help="decode input using the specified codecs", **kw)
     decode.add_argument("encoding", nargs="+", help="list of encodings to apply")
     decode.add_argument("-e", "--errors", default="strict", choices=["ignore", "leave", "replace", "strict"],
                         help="error handling (default: strict)")
-    guess = sparsers.add_parser("guess", help="try guessing the decoding codecs")
+    guess = sparsers.add_parser("guess", help="try guessing the decoding codecs", **kw)
     guess.add_argument("encoding", nargs="*", help="list of known encodings to apply (default: none)")
-    guess.add_argument("-c", "--codec-categories", nargs="*", help="codec categories to be included in the search ; "
-                                                                   "format: string|tuple")
-    guess.add_argument("-d", "--min-depth", default=0, type=int, help="minimum codec search depth before triggering "
-                                                                "results (default: 0)")
-    guess.add_argument("-D", "--max-depth", default=5, type=int, help="maximum codec search depth (default: 5)")
-    guess.add_argument("-e", "--exclude-codecs", nargs="*", help="codecs to be explicitely not used ; "
-                                                                 "format: string|tuple")
+    guess.add_argument("-e", "--exclude", nargs="*", action="extend", metavar="CAT|COD|ENC",
+                       help="categories, codecs and encodings to be explicitely not used ;\n "
+                            "format: [category|codec|encoding] OR depth:[category|codec|encoding]")
     guess.add_argument("-E", "--extended", action="store_true",
                        help="while using the scoring heuristic, also consider null scores (default: False)")
     lng = "lang_%s" % LANG
     def_func = lng if getattr(stopfunc, lng, None) else "text"
-    guess.add_argument("-f", "--stop-function", default=def_func, help="result checking function (default: %s) ; "
-                       "format: printables|text|flag|lang_[bigram]|[regex]\nNB: [regex] is case-sensitive ; add -i to "
-                       "force it as case-insensitive or add '(?i)' in front of the expression" % def_func)
-    guess.add_argument("-i", "--case-insensitive", dest="icase", action="store_true",
-                       help="while using the regex stop function, set it as case-insensitive (default: False)")
+    guess.add_argument("-f", "--stop-function", default=def_func, metavar="FUNC", help="result checking function "
+                       "(default: %s) ; format: printables|text|flag|lang_[bigram]|[regex]\nNB: [regex] is case-"
+                       "sensitive ; add -i to force it as case-insensitive or add '(?i)' in front of the expression"
+                       % def_func)
     guess.add_argument("-H", "--no-heuristic", action="store_true", help="DO NOT use the scoring heuristic ; slows down"
                        " the search but may be more accurate (default: False)")
+    guess.add_argument("-i", "--include", nargs="*", action="extend", metavar="CAT|COD|ENC",
+                       help="categories, codecs and encodings to be explicitely used ;\n "
+                            "format: [category|codec|encoding] OR depth:[category|codec|encoding]")
+    guess.add_argument("-I", "--case-insensitive", dest="icase", action="store_true",
+                       help="while using the regex stop function, set it as case-insensitive (default: False)")
     if len(stopfunc.LANG_BACKENDS) > 0:
         _lb = stopfunc.LANG_BACKEND
         guess.add_argument("-l", "--lang-backend", default=_lb, choices=stopfunc.LANG_BACKENDS + ["none"],
                            help="natural language detection backend (default: %s)" % _lb)
+    guess.add_argument("-m", "--min-depth", default=0, type=int, metavar="INT",
+                       help="minimum codec search depth before triggering results (default: 0)")
+    guess.add_argument("-M", "--max-depth", default=5, type=int, metavar="INT",
+                       help="maximum codec search depth (default: 5)")
     guess.add_argument("-s", "--do-not-stop", action="store_true",
                        help="do not stop if a valid output is found (default: False)")
     guess.add_argument("-v", "--verbose", action="store_true",
                        help="show guessing information and steps (default: False)")
-    rank = sparsers.add_parser("rank", help="rank the most probable encodings based on the given input")
-    rank.add_argument("-c", "--codec-categories", help="codec categories to be included in the search ; "
-                                                       "format: string|tuple|list(strings|tuples)")
-    rank.add_argument("-e", "--exclude-codecs", help="codecs to be explicitely not used ; "
-                                                     "format: string|tuple|list(strings|tuples)")
+    rank = sparsers.add_parser("rank", help="rank the most probable encodings based on the given input", **kw)
+    rank.add_argument("-e", "--exclude", nargs="*", action="extend", metavar="CAT|COD|ENC",
+                      help="categories, codecs and encodings to be explicitely not used ;\n "
+                           "format: [category|codec|encoding] OR depth:[category|codec|encoding]")
     rank.add_argument("-E", "--extended", action="store_true",
                       help="while using the scoring heuristic, also consider null scores (default: False)")
+    rank.add_argument("-i", "--include", nargs="*", action="extend", metavar="CAT|COD|ENC",
+                      help="categories, codecs and encodings to be explicitely used ;\n "
+                           "format: [category|codec|encoding] OR depth:[category|codec|encoding]")
     rank.add_argument("-l", "--limit", type=int, default=10, help="limit the number of displayed results")
     search = sparsers.add_parser("search", help="search for codecs")
     search.add_argument("pattern", nargs="+", help="encoding pattern to search")
     listi = sparsers.add_parser("list", help="list items")
-    lsparsers = listi.add_subparsers(dest="type", help="type of item to be listed")
+    lsparsers = listi.add_subparsers(dest="type", help="type of item to be listed", **kw2)
     liste = lsparsers.add_parser("encodings", help="list encodings")
-    liste.add_argument("category", nargs="*", help="selected categories")
+    liste.add_argument("category", nargs="+", help="selected categories")
     listm = lsparsers.add_parser("macros", help="list macros")
     addm = sparsers.add_parser("add-macro", help="add a macro to the registry")
     addm.add_argument("name", help="macro's name")
     addm.add_argument("encoding", nargs="+", help="list of encodings to chain")
     remm = sparsers.add_parser("remove-macro", help="remove a macro from the registry")
     remm.add_argument("name", help="macro's name")
     args = parser.parse_args()
-    try:
-        args.codec_categories = _lst(map(__literal_eval, args.codec_categories))
-    except (AttributeError, TypeError):
-        pass
-    try:
-        args.exclude_codecs = _lst(map(__literal_eval, args.exclude_codecs))
-    except (AttributeError, TypeError):
-        pass
-    #print(args.codec_categories, args.exclude_codecs)
+    if args.command in ["guess", "rank"]:
+        args.include, args.exclude = __format_list(args.include), __format_list(args.exclude, False)
     try:
         # if a search pattern is given, only handle it
         if args.command == "search":
@@ -211,17 +230,9 @@ def main():
                all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)):
                 stopfunc._reload_lang(lb)
             r = codecs.guess(c,
-                             getattr(stopfunc, s, ["", "(?i)"][args.icase] + s),
-                             args.min_depth,
-                             args.max_depth,
-                             args.codec_categories,
-                             args.exclude_codecs,
-                             args.encoding,
-                             not args.do_not_stop,
-                             True,  # show
-                             not args.no_heuristic,
-                             args.extended,
-                             args.verbose)
+                             getattr(stopfunc, s, ["", "(?i)"][args.icase] + s), args.min_depth, args.max_depth,
+                             args.include, args.exclude, args.encoding, not args.do_not_stop, True,  # show
+                             not args.no_heuristic, args.extended, args.verbose)
             for i, o in enumerate(r.items()):
                 e, out = o
                 if len(e) > 0:
@@ -234,10 +245,11 @@ def main():
             if len(r) == 0:
                 print("Could not decode :-(")
         elif args.command == "rank":
-            for i, e in codecs.rank(c, args.extended, args.limit, args.codec_categories, args.exclude_codecs):
+            for i, e in codecs.rank(c, args.extended, args.limit, args.include, args.exclude):
                 s = "[+] %.5f: %s" % (i[0], e)
                 print(s if len(s) <= 80 else s[:77] + "...")
     except Exception as e:
+        raise e
         m = str(e)
         print("codext: " + m[0].lower() + m[1:])
 
diff --git a/codext/base/__init__.py b/codext/base/__init__.py
@@ -19,7 +19,7 @@ def main():
 With no FILE, or when FILE is -, read standard input.
 
 Optional arguments:
-  -e, --extended        also consider generic base codecs while guess-decoding
+  -E, --extended        also consider generic base codecs while guess-decoding
   -f, --stop-function   set the result chceking function (default: text)
                          format: printables|text|flag|lang_[bigram]
   -M, --max-depth       maximum codec search depth (default: 5)
@@ -36,28 +36,23 @@ def main():
 """
     parser = ArgumentParser(description=descr, formatter_class=RawTextHelpFormatter, add_help=False)
     parser.format_help = MethodType(lambda s: s.description, parser)
+    group = parser.add_mutually_exclusive_group()
     parser.add_argument("file", nargs="?")
-    parser.add_argument("-e", "--extended", action="store_true")
-    parser.add_argument("-f", "--stop-function", default="text")
+    parser.add_argument("-E", "--extended", action="store_true")
+    group.add_argument("-f", "--stop-function", default="text")
     parser.add_argument("-M", "--max-depth", type=int, default=10)
     parser.add_argument("-m", "--min-depth", type=int, default=0)
-    parser.add_argument("-p", "--pattern")
+    group.add_argument("-p", "--pattern")
     parser.add_argument("-s", "--show", action="store_true")
     parser.add_argument("--help", action="help")
     parser.add_argument("--version", action="version")
     parser.add_argument("--verbose", action="store_true")
     parser.version = "CodExt " + __version__
     args = parser.parse_args()
-    excl, s = [["base%d-generic" % i for i in range(2, 256)], []][args.extended], args.stop_function
-    if re.match(r"lang_[a-z]{2}$", s) and all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)):
-        stopfunc._reload_lang(stopfunc.LANG_BACKEND)
-    #TODO: validate args.stop_function
-    #TODO: make --stop-function and --pattern mutually exclusive
-    sfunc = getattr(stopfunc, s, s)
-    c = _input(args.file)
+    c, e = _input(args.file), [["base%d-generic" % i for i in range(2, 256)], []][args.extended]
     c = c.rstrip("\r\n") if isinstance(c, str) else c.rstrip(b"\r\n")
-    r = codecs.guess(c, sfunc, 0, args.max_depth, exclude=tuple(excl), codec_categories="base",
-                     stop=False, show=args.verbose, scoring_heuristic=False, debug=args.verbose)
+    r = codecs.guess(c, stopfunc._validate(args.stop_function), 0, args.max_depth, "base", tuple(e), stop=False,
+                     show=args.verbose, debug=args.verbose)
     if len(r) == 0:
         print("Could not decode :-(")
         return 0

diff --git a/codext/base/_base.py b/codext/base/_base.py
@@ -191,7 +191,11 @@ def _decode(input, errors="strict"):
     kwargs['printables_rate'] = float(len([c for c in cs if c in printable])) / len(cs)
     kwargs['expansion_factor'] = kwargs.pop('expansion_factor', (EXPANSION_FACTOR(n), .05))
     n = "base{}".format(n) if name is None else name
-    kwargs['guess'] = kwargs.get('guess', [n])
+    try:
+        g = [n, n + "-inv"] if "[-_]inv(erted)?$" in charset.keys() else [n]
+    except AttributeError:
+        g = [n]
+    kwargs['guess'] = kwargs.get('guess', g)
     add(n, encode, decode, pattern, entropy=nb, **kwargs)
 
 

diff --git a/codext/base/base100.py b/codext/base/base100.py
@@ -37,7 +37,7 @@ def base100_encode(input, errors="strict"):
         return bytes(r), len(input)
 
     def base100_decode(input, errors="strict"):
-        input = b(input)
+        input = b(_stripl(input, True, True))
         if errors == "ignore":
             input = input.replace(b"\n", "")
         if len(input) % 4 != 0:

diff --git a/codext/base/base122.py b/codext/base/base122.py
@@ -98,9 +98,9 @@ def _get_7bits(currB, bob, B, decoded):
                 currB, bob = _get_7bits(currB, bob, input[i] & 127, r)
             else:
                 currB, bob = _get_7bits(currB, bob, input[i], r)
-        return "".join(map(chr, r)), len(input)
+        return "".join(map(chr, r)).rstrip("\0"), len(input)
 
 
 add("base122", base122_encode, base122_decode, r"^base[-_]?122$", expansion_factor=1.085)
-main122 = main(122, "<http://blog.kevinalbs.com/base122>")
+main122 = main(122, "<http://blog.kevinalbs.com/base122>", wrap=False)
 
diff --git a/codext/base/base91.py b/codext/base/base91.py
@@ -72,7 +72,7 @@ def encode(text, errors="strict"):
 def base91_decode(mode):
     b91 = {c: i for i, c in enumerate(_get_charset(B91, mode))}
     def decode(text, errors="strict"):
-        t, s, bits, alt = b(text), "", "", re.search(r'[-_]alt(ernate)?$', mode) is not None
+        t, s, bits, alt = b(_stripl(text, True, True)), "", "", re.search(r'[-_]alt(ernate)?$', mode) is not None
         ehandler = handle_error("base91", errors, decode=True)
         for i in range(0, len(t), 2):
             try:
@@ -103,7 +103,7 @@ def decode(text, errors="strict"):
                 bits = bits[8:]
         elif not alt and len(bits) > 0 and not set(bits) == {"0"}:
             s += chr(int(bits, 2))
-        return s, len(t)
+        return s.rstrip("\0"), len(t)
     return decode