From b186a968dc7e0ac52f3303d59758f65f5a2a3d98 Mon Sep 17 00:00:00 2001 From: dhondta Date: Tue, 22 Feb 2022 18:49:10 +0100 Subject: [PATCH 01/32] Fixed bug in guess mode --- codext/__common__.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/codext/__common__.py b/codext/__common__.py index 89522e8..9b7936f 100644 --- a/codext/__common__.py +++ b/codext/__common__.py @@ -1243,11 +1243,9 @@ def __develop(encodings): def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, codec_categories, exclude, result, found=(), - stop=True, show=False, scoring_heuristic=False, extended=False, debug=False, regex=False): + stop=True, show=False, scoring_heuristic=False, extended=False, debug=False): """ Perform a breadth-first tree search using a ranking logic to select and prune the list of codecs. """ if depth > min_depth and stop_func(input): - if regex: - stop = True if not stop and (show or debug) and found not in result: s = repr(input) s = s[2:-1] if s.startswith("b'") and s.endswith("'") else s @@ -1288,7 +1286,7 @@ def expand(items, descr=None, transform=None): if debug: print("[*] Depth %0{}d/%d: %s".format(len(str(max_depth))) % (depth+1, max_depth, encoding)) __guess(input, new_input, stop_func, depth+1, max_depth, min_depth, codec_categories, exclude, result, - found + (encoding, ), stop, show, scoring_heuristic, extended, debug, regex) + found + (encoding, ), stop, show, scoring_heuristic, extended, debug) def __rank(prev_input, input, prev_encoding, codecs, heuristic=False, extended=False, yield_score=False): @@ -1371,8 +1369,8 @@ def __score(prev_input, input, prev_encoding, codec, heuristic=False, extended=F expf = expf(f, encoding) except TypeError: expf = expf(f) - elif isinstance(expf, (int, float)): - epxf = f - .1 <= expf <= f + .1 + if isinstance(expf, (int, float)): + expf = (f - .1 <= expf <= f + .1) elif isinstance(expf, (tuple, list)) and len(expf) == 2: expf = f - expf[1] <= expf[0] <= expf[1] + .1 s += [-1., .1][expf] @@ -1414,17 +1412,15 @@ def guess(input, stop_func=stopfunc.default, min_depth=0, max_depth=5, codec_cat if len(found) > 0: for encoding in found: input = decode(input, encoding) - regex = False if isinstance(stop_func, string_types): stop_func = stopfunc.regex(stop_func) - regex = True result = {} if len(input) > 0: try: # breadth-first search for d in range(max_depth): __guess("", input, stop_func, 0, d+1, min_depth, codec_categories, exclude, result, tuple(found), stop, - show, scoring_heuristic, extended, debug, regex) + show, scoring_heuristic, extended, debug) if stop and len(result) > 0: return result except KeyboardInterrupt: From b3253142fa93bc5d089788c26e58512940fbf367 Mon Sep 17 00:00:00 2001 From: dhondta Date: Wed, 23 Feb 2022 00:03:22 +0100 Subject: [PATCH 02/32] Fixed minor issues --- codext/__common__.py | 12 ++++++++---- codext/__init__.py | 4 ++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/codext/__common__.py b/codext/__common__.py index 9b7936f..35d1fc5 100644 --- a/codext/__common__.py +++ b/codext/__common__.py @@ -1303,9 +1303,13 @@ class _Text(object): __slots__ = ["entropy", "lcharset", "len", "padding", "printables"] def __init__(self, text, pad_char=None): + c = text[-1] + last_char = c if isinstance(c, int) else ord(c) + self.padding = pad_char is not None and last_char == ord(pad_char) + if self.padding: + text = text.rstrip(pad_char) self.len = len(text) self.lcharset = len(set(text)) - self.padding = pad_char is not None and text[-1] in [pad_char, b(pad_char)] self.printables = float(len([c for c in text if (chr(c) if isinstance(c, int) else c) in printable])) / self.len self.entropy = entropy(text) @@ -1363,16 +1367,16 @@ def __score(prev_input, input, prev_encoding, codec, heuristic=False, extended=F s += .1 expf = sc.get('expansion_factor', 1.) if expf: - f = float(len(new_input)) / obj.len + f = obj.len / float(len(new_input)) # expansion while encoding => at decoding: 1/f if isinstance(expf, type(lambda: None)): try: # this case allows to consider the current encoding name from the current codec expf = expf(f, encoding) except TypeError: expf = expf(f) if isinstance(expf, (int, float)): - expf = (f - .1 <= expf <= f + .1) + expf = (1/f - .1 <= 1/expf <= 1/f + .1) elif isinstance(expf, (tuple, list)) and len(expf) == 2: - expf = f - expf[1] <= expf[0] <= expf[1] + .1 + expf = 1/f - expf[1] <= 1/expf[0] <= 1/f + expf[1] s += [-1., .1][expf] # afterwards, if the input text has an entropy close to the expected one, give a bonus weighted on the # number of input characters to take bad entropies of shorter strings into account diff --git a/codext/__init__.py b/codext/__init__.py index 3b98af4..0fa49d5 100644 --- a/codext/__init__.py +++ b/codext/__init__.py @@ -88,7 +88,7 @@ def main(): "echo -en \"test\" | codext encode base64 gzip | codext guess gzip -c base", ]) parser = argparse.ArgumentParser(description=descr, epilog=examples, formatter_class=argparse.RawTextHelpFormatter) - sparsers = parser.add_subparsers(dest="command", help="command to be executed") + sparsers = parser.add_subparsers(dest="command", required=True, help="command to be executed") parser.add_argument("-i", "--input-file", dest="infile", help="input file (if none, take stdin as input)") parser.add_argument("-o", "--output-file", dest="outfile", help="output file (if none, display result to stdout)") parser.add_argument("-s", "--strip-newlines", action="store_true", dest="strip", @@ -140,7 +140,7 @@ def main(): search = sparsers.add_parser("search", help="search for codecs") search.add_argument("pattern", nargs="+", help="encoding pattern to search") listi = sparsers.add_parser("list", help="list items") - lsparsers = listi.add_subparsers(dest="type", help="type of item to be listed") + lsparsers = listi.add_subparsers(dest="type", required=True, help="type of item to be listed") liste = lsparsers.add_parser("encodings", help="list encodings") liste.add_argument("category", nargs="*", help="selected categories") listm = lsparsers.add_parser("macros", help="list macros") From 3cf1f5dbd67b78a771b55885d5e2c20eaf31fd81 Mon Sep 17 00:00:00 2001 From: dhondta Date: Wed, 23 Feb 2022 00:03:29 +0100 Subject: [PATCH 03/32] Refined tests --- tests/test_base.py | 14 ++++++++------ tests/test_common.py | 4 ++-- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/tests/test_base.py b/tests/test_base.py index 33eff65..7b3dae0 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -44,6 +44,7 @@ def test_codec_base1(self): for i in range(3): self.assertIsNotNone(codecs.encode(i * C, "base1")) self.assertRaises(ValueError, codecs.encode, 4 * C, "unary") + self.assertEqual(codecs.decode("AAAAA", "base1"), "\x05") def test_codec_base2(self): STR = "test" @@ -181,7 +182,7 @@ def test_codec_base62(self): self.assertEqual(codecs.decode(b(b62), enc), b(STR)) def test_codec_base64(self): - for b64, enc in zip(["dGhpcyBpcyBhIHRlc3QK", "T6XfSo1fSo1X87HbStG="], ["base64", "base64-inv"]): + for b64, enc in zip(["dGhpcyBpcyBhIHRlc3Q=", "T6XfSo1fSo1X87HbStG="], ["base64", "base64-inv"]): self.assertEqual(codecs.encode(STR, enc), b64) self.assertEqual(codecs.encode(b(STR), enc), b(b64)) self.assertEqual(codecs.decode(b64, enc), STR) @@ -224,11 +225,12 @@ def test_base_main(self): tfile = "test-base-main.txt" with open(tfile, 'w') as f: f.write("This is a long test string for the sake of causing line wrapping based on default parameters.") - sys.argv = [tmp[0], tfile] - for m in main32, main64url: - self.assertEqual(m(), 0) - sys.argv = [tmp[0], tfile, "-d"] - self.assertEqual(main2(), 1) + for swap_arg in [[], ["-s"]]: + sys.argv = [tmp[0], tfile] + swap_arg + for m in main32, main64url: + self.assertEqual(m(), 0) + sys.argv = [tmp[0], tfile, "-d"] + swap_arg + self.assertEqual(main2(), 1) os.remove(tfile) sys.argv[:] = tmp diff --git a/tests/test_common.py b/tests/test_common.py index ec57aaa..a35abfd 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -150,7 +150,7 @@ def test_guess_decode(self): self.assertIsNone(codext.stopfunc._reload_lang()) _l = lambda d: list(d.items())[0][1] if len(d) > 0 else None codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1), - "test", no_error=True, bonus_func=lambda *a: True, penalty=-.5) + "test", padding_char="=", no_error=True, bonus_func=lambda *a: True, penalty=-.5) self.assertIn("test-codec", codext.list_encodings("test")) self.assertEqual(codext.decode("TEST=", "test"), "TEST") self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, codec_categories="test", max_depth=2, @@ -204,7 +204,7 @@ def test_guess_decode(self): def test_rank_input(self): codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1), - "test", no_error=True, bonus_func=lambda *a: True, penalty=-.5) + "test", padding_char="=", no_error=True, bonus_func=lambda *a: True, penalty=-.5) STR = "This is a test string !" ENC = codext.encode(STR, "base64") self.assertTrue(len(codext.rank(ENC)) > 20) From cb6656845940db1386a563e08ea13ac918f51bb5 Mon Sep 17 00:00:00 2001 From: dhondta Date: Wed, 23 Feb 2022 00:03:46 +0100 Subject: [PATCH 04/32] New release --- codext/VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codext/VERSION.txt b/codext/VERSION.txt index 6b89d58..81f3632 100644 --- a/codext/VERSION.txt +++ b/codext/VERSION.txt @@ -1 +1 @@ -1.12.2 +1.12.3 From 544e1cc39a7b4e793864087d3d2cf4cc77d73038 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sat, 26 Feb 2022 16:01:45 +0100 Subject: [PATCH 05/32] Fixed multiple base codecs --- codext/base/base100.py | 2 +- codext/base/base122.py | 4 ++-- codext/base/base91.py | 4 ++-- codext/base/baseN.py | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/codext/base/base100.py b/codext/base/base100.py index db0b3c9..f5faa1d 100755 --- a/codext/base/base100.py +++ b/codext/base/base100.py @@ -37,7 +37,7 @@ def base100_encode(input, errors="strict"): return bytes(r), len(input) def base100_decode(input, errors="strict"): - input = b(input) + input = b(_stripl(input, True, True)) if errors == "ignore": input = input.replace(b"\n", "") if len(input) % 4 != 0: diff --git a/codext/base/base122.py b/codext/base/base122.py index 33a42ad..f580ff8 100755 --- a/codext/base/base122.py +++ b/codext/base/base122.py @@ -98,9 +98,9 @@ def _get_7bits(currB, bob, B, decoded): currB, bob = _get_7bits(currB, bob, input[i] & 127, r) else: currB, bob = _get_7bits(currB, bob, input[i], r) - return "".join(map(chr, r)), len(input) + return "".join(map(chr, r)).rstrip("\0"), len(input) add("base122", base122_encode, base122_decode, r"^base[-_]?122$", expansion_factor=1.085) -main122 = main(122, "") +main122 = main(122, "", wrap=False) diff --git a/codext/base/base91.py b/codext/base/base91.py index 6f0d6ec..21a21d5 100755 --- a/codext/base/base91.py +++ b/codext/base/base91.py @@ -72,7 +72,7 @@ def encode(text, errors="strict"): def base91_decode(mode): b91 = {c: i for i, c in enumerate(_get_charset(B91, mode))} def decode(text, errors="strict"): - t, s, bits, alt = b(text), "", "", re.search(r'[-_]alt(ernate)?$', mode) is not None + t, s, bits, alt = b(_stripl(text, True, True)), "", "", re.search(r'[-_]alt(ernate)?$', mode) is not None ehandler = handle_error("base91", errors, decode=True) for i in range(0, len(t), 2): try: @@ -103,7 +103,7 @@ def decode(text, errors="strict"): bits = bits[8:] elif not alt and len(bits) > 0 and not set(bits) == {"0"}: s += chr(int(bits, 2)) - return s, len(t) + return s.rstrip("\0"), len(t) return decode diff --git a/codext/base/baseN.py b/codext/base/baseN.py index f935bf9..3c63453 100755 --- a/codext/base/baseN.py +++ b/codext/base/baseN.py @@ -82,7 +82,7 @@ r'[-_]?(fl|flickr|short[-]?url|url)$': "123456789abcdefghijkmnopqrstuvwxyzABCDEFGHJKLMNPQRSTUVWXYZ", } base(B58, r"^base[-_]?58(|[-_](bc|bitcoin|rp|ripple|fl|flickr|short[-]?url|url))$", - guess=["base58", "base58-ripple", "base58-flickr"]) + guess=["base58-bitcoin", "base58-ripple", "base58-flickr"]) main58bc = main(58, "", "bitcoin") main58rp = main(58, "", "ripple") main58fl = main(58, "", "flickr") @@ -119,7 +119,7 @@ B128 = {r'': "".join(chr(i) for i in range(128))} base(B128, r"^base[-_]?128$", padding_char="=") -main128 = main(128, None, False) +main128 = main(128, None, False, wrap=False) # generic base encodings, to be added after all others as they have the precedence From 95f4b80825e101e3e44cc7b5d961a993c5736cce Mon Sep 17 00:00:00 2001 From: dhondta Date: Sat, 26 Feb 2022 16:02:39 +0100 Subject: [PATCH 06/32] New release --- codext/VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codext/VERSION.txt b/codext/VERSION.txt index 81f3632..89c881b 100644 --- a/codext/VERSION.txt +++ b/codext/VERSION.txt @@ -1 +1 @@ -1.12.3 +1.12.4 From f8bd7b741c181b2789f4cf4785e10ac0e67c237c Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 27 Feb 2022 20:24:29 +0100 Subject: [PATCH 07/32] Fixed codec: shift --- codext/crypto/shift.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/codext/crypto/shift.py b/codext/crypto/shift.py index 89ca992..599e60d 100755 --- a/codext/crypto/shift.py +++ b/codext/crypto/shift.py @@ -19,12 +19,12 @@ def ord_shift_decode(i): - return ord_shift_encode(-i) + return ord_shift_encode(-int(i)) def ord_shift_encode(i): def encode(text, errors="strict"): - r = "".join(chr((ord(c) + i) % 256) for c in text) + r = "".join(chr((ord(c) + int(i)) % 256) for c in text) return r, len(r) return encode From d9cc79ae047cf9d384fd0068006d43b52a73771a Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 27 Feb 2022 20:24:42 +0100 Subject: [PATCH 08/32] Fixed codec: scytale --- codext/crypto/scytale.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codext/crypto/scytale.py b/codext/crypto/scytale.py index 32e6e96..7490241 100755 --- a/codext/crypto/scytale.py +++ b/codext/crypto/scytale.py @@ -17,7 +17,7 @@ 'enc(scytale2|scytale-2|scytale_2)': {'this is a test': "ti satshsi et"}, 'enc(scytale5|scytale-5|scytale_5)': {'this is a test': "tithsei ssat "}, } -__guess__ = ["scytale-%d" % i for i in range(10)] +__guess__ = ["scytale-%d" % i for i in range(1, 10)] PADDING_CHAR = "" From 1e31eab38491fc5768383a21332a06f974c5f1e6 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 27 Feb 2022 20:25:27 +0100 Subject: [PATCH 09/32] Fixed bug in base --- codext/base/_base.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/codext/base/_base.py b/codext/base/_base.py index 05aaed0..fce8b9a 100755 --- a/codext/base/_base.py +++ b/codext/base/_base.py @@ -191,7 +191,11 @@ def _decode(input, errors="strict"): kwargs['printables_rate'] = float(len([c for c in cs if c in printable])) / len(cs) kwargs['expansion_factor'] = kwargs.pop('expansion_factor', (EXPANSION_FACTOR(n), .05)) n = "base{}".format(n) if name is None else name - kwargs['guess'] = kwargs.get('guess', [n]) + try: + g = [n, n + "-inv"] if "[-_]inv(erted)?$" in charset.keys() else [n] + except AttributeError: + g = [n] + kwargs['guess'] = kwargs.get('guess', g) add(n, encode, decode, pattern, entropy=nb, **kwargs) From d09dd0be29ffb3ce1a42c5e42eb1c58e0e3e5faf Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 27 Feb 2022 20:25:41 +0100 Subject: [PATCH 10/32] Improved unbase tool --- codext/base/__init__.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/codext/base/__init__.py b/codext/base/__init__.py index 5859f6b..8c0d220 100755 --- a/codext/base/__init__.py +++ b/codext/base/__init__.py @@ -19,7 +19,7 @@ def main(): With no FILE, or when FILE is -, read standard input. Optional arguments: - -e, --extended also consider generic base codecs while guess-decoding + -E, --extended also consider generic base codecs while guess-decoding -f, --stop-function set the result chceking function (default: text) format: printables|text|flag|lang_[bigram] -M, --max-depth maximum codec search depth (default: 5) @@ -36,28 +36,23 @@ def main(): """ parser = ArgumentParser(description=descr, formatter_class=RawTextHelpFormatter, add_help=False) parser.format_help = MethodType(lambda s: s.description, parser) + group = parser.add_mutually_exclusive_group() parser.add_argument("file", nargs="?") - parser.add_argument("-e", "--extended", action="store_true") - parser.add_argument("-f", "--stop-function", default="text") + parser.add_argument("-E", "--extended", action="store_true") + group.add_argument("-f", "--stop-function", default="text") parser.add_argument("-M", "--max-depth", type=int, default=10) parser.add_argument("-m", "--min-depth", type=int, default=0) - parser.add_argument("-p", "--pattern") + group.add_argument("-p", "--pattern") parser.add_argument("-s", "--show", action="store_true") parser.add_argument("--help", action="help") parser.add_argument("--version", action="version") parser.add_argument("--verbose", action="store_true") parser.version = "CodExt " + __version__ args = parser.parse_args() - excl, s = [["base%d-generic" % i for i in range(2, 256)], []][args.extended], args.stop_function - if re.match(r"lang_[a-z]{2}$", s) and all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)): - stopfunc._reload_lang(stopfunc.LANG_BACKEND) - #TODO: validate args.stop_function - #TODO: make --stop-function and --pattern mutually exclusive - sfunc = getattr(stopfunc, s, s) - c = _input(args.file) + c, e = _input(args.file), [["base%d-generic" % i for i in range(2, 256)], []][args.extended] c = c.rstrip("\r\n") if isinstance(c, str) else c.rstrip(b"\r\n") - r = codecs.guess(c, sfunc, 0, args.max_depth, exclude=tuple(excl), codec_categories="base", - stop=False, show=args.verbose, scoring_heuristic=False, debug=args.verbose) + r = codecs.guess(c, stopfunc._validate(args.stop_function), 0, args.max_depth, "base", tuple(e), stop=False, + show=args.verbose, debug=args.verbose) if len(r) == 0: print("Could not decode :-(") return 0 From 0d132317fc4d214311d1f81071bbf916437d3532 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 27 Feb 2022 20:26:07 +0100 Subject: [PATCH 11/32] Improved codext tool --- codext/__init__.py | 115 +++++++++++++++++++++++++-------------------- 1 file changed, 65 insertions(+), 50 deletions(-) diff --git a/codext/__init__.py b/codext/__init__.py index 0fa49d5..692ab48 100644 --- a/codext/__init__.py +++ b/codext/__init__.py @@ -33,12 +33,23 @@ pattern=r"^uu(?:[-_]encode|codec)?$", add_to_codecs=True, category="native") -def __literal_eval(o): - """ Non-failing ast.literal_eval alias function. """ - try: - return literal_eval(str(o)) - except ValueError: - return literal_eval("'" + str(o) + "'") +def __format_list(items, include=True): + if items is None: + return + d = {-1: list_encodings() if include else []} + for n, i in enumerate(items): + try: + depth, i = i.split(":") + depth = int(depth.strip().replace("~", "-")) + if depth < 0: + depth = -1 + except ValueError: + if n == 0: + d[-1] = [] + depth = -1 + d.setdefault(depth, []) + d[depth].append(i.strip()) + return d def __print_tabular(lst, space=4): @@ -70,6 +81,19 @@ def __print_tabular(lst, space=4): def main(): import argparse, os + + class _CustomFormatter(argparse.RawTextHelpFormatter): + def __init__(self, prog, **kwargs): + kwargs['max_help_position'] = 32 + super(_CustomFormatter, self).__init__(prog, **kwargs) + + def _format_action_invocation(self, action): + if not action.option_strings: + metavar, = self._metavar_formatter(action, action.dest)(1) + return metavar + else: + return ", ".join(action.option_strings) + descr = "Codecs Extension (CodExt) {}\n\nAuthor : {} ({})\nCopyright: {}\nLicense : {}\nSource : {}\n" \ "\nThis tool allows to encode/decode input strings/files with an extended set of codecs.\n\n" \ .format(__version__, __author__, __email__, __copyright__, __license__, __source__) @@ -87,62 +111,68 @@ def main(): "echo -en \"test\" | codext encode base64 gzip | codext guess", "echo -en \"test\" | codext encode base64 gzip | codext guess gzip -c base", ]) - parser = argparse.ArgumentParser(description=descr, epilog=examples, formatter_class=argparse.RawTextHelpFormatter) - sparsers = parser.add_subparsers(dest="command", required=True, help="command to be executed") + kw = {'formatter_class': _CustomFormatter} + parser = argparse.ArgumentParser(description=descr, epilog=examples, **kw) + kw2 = {'required': True} if PY3 else {} + sparsers = parser.add_subparsers(dest="command", help="command to be executed", **kw2) parser.add_argument("-i", "--input-file", dest="infile", help="input file (if none, take stdin as input)") parser.add_argument("-o", "--output-file", dest="outfile", help="output file (if none, display result to stdout)") parser.add_argument("-s", "--strip-newlines", action="store_true", dest="strip", help="strip newlines from input (default: False)") - encode = sparsers.add_parser("encode", help="encode input using the specified codecs") + encode = sparsers.add_parser("encode", help="encode input using the specified codecs", **kw) encode.add_argument("encoding", nargs="+", help="list of encodings to apply") encode.add_argument("-e", "--errors", default="strict", choices=["ignore", "leave", "replace", "strict"], help="error handling (default: strict)") - decode = sparsers.add_parser("decode", help="decode input using the specified codecs") + decode = sparsers.add_parser("decode", help="decode input using the specified codecs", **kw) decode.add_argument("encoding", nargs="+", help="list of encodings to apply") decode.add_argument("-e", "--errors", default="strict", choices=["ignore", "leave", "replace", "strict"], help="error handling (default: strict)") - guess = sparsers.add_parser("guess", help="try guessing the decoding codecs") + guess = sparsers.add_parser("guess", help="try guessing the decoding codecs", **kw) guess.add_argument("encoding", nargs="*", help="list of known encodings to apply (default: none)") - guess.add_argument("-c", "--codec-categories", nargs="*", help="codec categories to be included in the search ; " - "format: string|tuple") - guess.add_argument("-d", "--min-depth", default=0, type=int, help="minimum codec search depth before triggering " - "results (default: 0)") - guess.add_argument("-D", "--max-depth", default=5, type=int, help="maximum codec search depth (default: 5)") - guess.add_argument("-e", "--exclude-codecs", nargs="*", help="codecs to be explicitely not used ; " - "format: string|tuple") + guess.add_argument("-e", "--exclude", nargs="*", action="extend", metavar="CAT|COD|ENC", + help="categories, codecs and encodings to be explicitely not used ;\n " + "format: [category|codec|encoding] OR depth:[category|codec|encoding]") guess.add_argument("-E", "--extended", action="store_true", help="while using the scoring heuristic, also consider null scores (default: False)") lng = "lang_%s" % LANG def_func = lng if getattr(stopfunc, lng, None) else "text" - guess.add_argument("-f", "--stop-function", default=def_func, help="result checking function (default: %s) ; " - "format: printables|text|flag|lang_[bigram]|[regex]\nNB: [regex] is case-sensitive ; add -i to " - "force it as case-insensitive or add '(?i)' in front of the expression" % def_func) - guess.add_argument("-i", "--case-insensitive", dest="icase", action="store_true", - help="while using the regex stop function, set it as case-insensitive (default: False)") + guess.add_argument("-f", "--stop-function", default=def_func, metavar="FUNC", help="result checking function " + "(default: %s) ; format: printables|text|flag|lang_[bigram]|[regex]\nNB: [regex] is case-" + "sensitive ; add -i to force it as case-insensitive or add '(?i)' in front of the expression" + % def_func) guess.add_argument("-H", "--no-heuristic", action="store_true", help="DO NOT use the scoring heuristic ; slows down" " the search but may be more accurate (default: False)") + guess.add_argument("-i", "--include", nargs="*", action="extend", metavar="CAT|COD|ENC", + help="categories, codecs and encodings to be explicitely used ;\n " + "format: [category|codec|encoding] OR depth:[category|codec|encoding]") + guess.add_argument("-I", "--case-insensitive", dest="icase", action="store_true", + help="while using the regex stop function, set it as case-insensitive (default: False)") if len(stopfunc.LANG_BACKENDS) > 0: _lb = stopfunc.LANG_BACKEND guess.add_argument("-l", "--lang-backend", default=_lb, choices=stopfunc.LANG_BACKENDS + ["none"], help="natural language detection backend (default: %s)" % _lb) + guess.add_argument("-m", "--min-depth", default=0, type=int, metavar="INT", + help="minimum codec search depth before triggering results (default: 0)") + guess.add_argument("-M", "--max-depth", default=5, type=int, metavar="INT", + help="maximum codec search depth (default: 5)") guess.add_argument("-s", "--do-not-stop", action="store_true", help="do not stop if a valid output is found (default: False)") guess.add_argument("-v", "--verbose", action="store_true", help="show guessing information and steps (default: False)") - rank = sparsers.add_parser("rank", help="rank the most probable encodings based on the given input") - rank.add_argument("-c", "--codec-categories", help="codec categories to be included in the search ; " - "format: string|tuple|list(strings|tuples)") - rank.add_argument("-e", "--exclude-codecs", help="codecs to be explicitely not used ; " - "format: string|tuple|list(strings|tuples)") + rank = sparsers.add_parser("rank", help="rank the most probable encodings based on the given input", **kw) + rank.add_argument("-c", "--codec-categories", nargs="*", action="extend", metavar="CATEGORY", + help="codec categories to be included in the search ; format: string|tuple|list(strings|tuples)") + rank.add_argument("-e", "--exclude-codecs", nargs="*", action="extend", metavar="CODEC", + help="codecs to be explicitely not used ; format: string|tuple|list(strings|tuples)") rank.add_argument("-E", "--extended", action="store_true", help="while using the scoring heuristic, also consider null scores (default: False)") rank.add_argument("-l", "--limit", type=int, default=10, help="limit the number of displayed results") search = sparsers.add_parser("search", help="search for codecs") search.add_argument("pattern", nargs="+", help="encoding pattern to search") listi = sparsers.add_parser("list", help="list items") - lsparsers = listi.add_subparsers(dest="type", required=True, help="type of item to be listed") + lsparsers = listi.add_subparsers(dest="type", help="type of item to be listed", **kw2) liste = lsparsers.add_parser("encodings", help="list encodings") - liste.add_argument("category", nargs="*", help="selected categories") + liste.add_argument("category", nargs="+", help="selected categories") listm = lsparsers.add_parser("macros", help="list macros") addm = sparsers.add_parser("add-macro", help="add a macro to the registry") addm.add_argument("name", help="macro's name") @@ -150,15 +180,7 @@ def main(): remm = sparsers.add_parser("remove-macro", help="remove a macro from the registry") remm.add_argument("name", help="macro's name") args = parser.parse_args() - try: - args.codec_categories = _lst(map(__literal_eval, args.codec_categories)) - except (AttributeError, TypeError): - pass - try: - args.exclude_codecs = _lst(map(__literal_eval, args.exclude_codecs)) - except (AttributeError, TypeError): - pass - #print(args.codec_categories, args.exclude_codecs) + args.include, args.exclude = __format_list(args.include), __format_list(args.exclude, False) try: # if a search pattern is given, only handle it if args.command == "search": @@ -211,17 +233,9 @@ def main(): all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)): stopfunc._reload_lang(lb) r = codecs.guess(c, - getattr(stopfunc, s, ["", "(?i)"][args.icase] + s), - args.min_depth, - args.max_depth, - args.codec_categories, - args.exclude_codecs, - args.encoding, - not args.do_not_stop, - True, # show - not args.no_heuristic, - args.extended, - args.verbose) + getattr(stopfunc, s, ["", "(?i)"][args.icase] + s), args.min_depth, args.max_depth, + args.include, args.exclude, args.encoding, not args.do_not_stop, True, # show + not args.no_heuristic, args.extended, args.verbose) for i, o in enumerate(r.items()): e, out = o if len(e) > 0: @@ -238,6 +252,7 @@ def main(): s = "[+] %.5f: %s" % (i[0], e) print(s if len(s) <= 80 else s[:77] + "...") except Exception as e: + raise e m = str(e) print("codext: " + m[0].lower() + m[1:]) From ad01045ad33093658cc19fe48b7deb6bddfc4c47 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 27 Feb 2022 20:26:18 +0100 Subject: [PATCH 12/32] Improved guess performance --- .coveragerc | 7 +- codext/__common__.py | 382 +++++++++++++++++++++++++------------------ tests/test_common.py | 22 +-- 3 files changed, 237 insertions(+), 174 deletions(-) diff --git a/.coveragerc b/.coveragerc index 0baf7fa..4ccc970 100644 --- a/.coveragerc +++ b/.coveragerc @@ -11,7 +11,7 @@ exclude_lines = def main\(\)\: def __stdin_pipe\(\)\: for line in __stdin_pipe\(\)\: - def __literal_eval\(o\)\: + def __format_list\(items, include\=True\)\: def __print_tabular\(lst, space\=4\)\: except ImportError: except NameError: @@ -20,3 +20,8 @@ exclude_lines = if PY3 def encode\(self, input, final\=False\)\: def decode\(self, input, final\=False\)\: + def _detect\(text\)\: + def _lang\(lang\)\: + if stopfunc\.LANG_BACKEND\: + def _validate\(stop_function, lang_backend\=\"none\"\)\: + except KeyboardInterrupt\: diff --git a/codext/__common__.py b/codext/__common__.py index 35d1fc5..e45fb1e 100644 --- a/codext/__common__.py +++ b/codext/__common__.py @@ -45,6 +45,7 @@ CODECS_REGISTRY = None CODECS_OVERWRITTEN = [] CODECS_CATEGORIES = ["native", "custom"] +CODECS_CACHE = {} LANG = getlocale() if LANG: LANG = (LANG[0] or "")[:2].lower() @@ -674,17 +675,16 @@ def list_categories(): # particular category, hardcoded from base/_base.py c += ["base-generic"] return c +list_categories() def list_encodings(*categories): """ Get a list of all codecs. """ - # first, determine the list of valid categories - valid_categories = list_categories() - # then, if "non-native" is in the input list, extend the list with the whole categories but "native" + # if "non-native" is in the input list, extend the list with the whole categories but "native" categories, exclude = list(categories), [] for c in categories[:]: if c == "non-native": - for c in valid_categories: + for c in CODECS_CATEGORIES: if c == "native" or c in categories: continue categories.append(c) @@ -714,7 +714,7 @@ def list_encodings(*categories): if (len(categories) == 0 or c in categories) and c not in exclude: enc.append(name) for category in categories: - if category not in valid_categories: + if category not in CODECS_CATEGORIES: raise ValueError("Category '%s' does not exist" % category) return sorted(list(set(enc)), key=_human_keys) @@ -1226,23 +1226,22 @@ def _load_lang_backend(backend=None): stopfunc._reload_lang = _load_lang_backend -def __develop(encodings): - """ Private method for developing the input list of encodings, trying to extend it with every encoding name. """ - enc = [] - for e in (encodings or []): - try: - ci = lookup(e, False) - g = ci.parameters['guess'] - except: - g = [e] - if e in g: # e.g. "rot-1" => ["rot-1", "rot-2", ...] ; only "rot-1" is to be selected - enc.append(e) - else: # e.g. "rot" => ["rot-1", "rot-2", ...] ; all the "rot-N" shall be selected - enc.extend(g) - return enc +def _validate(stop_function, lang_backend="none"): + s, lb = stop_function, lang_backend + if isinstance(s, string_types): + if re.match(r"lang_[a-z]{2}$", s) and lb != "none" and \ + all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)): + stopfunc._reload_lang(lb) + f = getattr(stopfunc, s, None) + if f: + return f + elif not isinstance(s, FunctionType): + raise ValueError("Bad stop function") + return s +stopfunc._validate = _validate -def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, codec_categories, exclude, result, found=(), +def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, encodings, codecs, result, found=(), stop=True, show=False, scoring_heuristic=False, extended=False, debug=False): """ Perform a breadth-first tree search using a ranking logic to select and prune the list of codecs. """ if depth > min_depth and stop_func(input): @@ -1254,47 +1253,60 @@ def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, codec_cat result[found] = input if depth >= max_depth or len(result) > 0 and stop: return - # compute included and excluded codecs for this depth - def expand(items, descr=None, transform=None): - items = items or [] - # format 1: when string, take it as the only items at any depth - if isinstance(items, string_types): - r = (items, ) - # format 2: when tuple, consider it as a list of items at any depth - elif isinstance(items, tuple): - r = items - # format 3: when list, consider it as the list of tuples of items with the order number corresponding to the - # applicable depth - elif isinstance(items, list): - try: - r = items[depth] or () - if isinstance(r, string_types): - r = (r, ) - except IndexError: - r = () - else: - raise ValueError("Bad %sformat %s" % (["%s " % descr, ""][descr is None], items)) - return r if transform is None else transform(*r) - # parse valid encodings, expanding included/excluded codecs - c, e = expand(codec_categories, "codec_categories", list_encodings), __develop(expand(exclude, "exclude")) prev_enc = found[-1] if len(found) > 0 else "" - for new_input, encoding in __rank(prev_input, input, prev_enc, c, scoring_heuristic, extended): + e = encodings.get(depth, encodings.get(-1, [])) + for new_input, encoding in __rank(prev_input, input, prev_enc, e, codecs, scoring_heuristic, extended): if len(result) > 0 and stop: return - if encoding in e: - continue if debug: print("[*] Depth %0{}d/%d: %s".format(len(str(max_depth))) % (depth+1, max_depth, encoding)) - __guess(input, new_input, stop_func, depth+1, max_depth, min_depth, codec_categories, exclude, result, + __guess(input, new_input, stop_func, depth+1, max_depth, min_depth, encodings, codecs, result, found + (encoding, ), stop, show, scoring_heuristic, extended, debug) -def __rank(prev_input, input, prev_encoding, codecs, heuristic=False, extended=False, yield_score=False): +def __make_encodings_dict(include, exclude): + """ Process encodings inclusion and exclusion lists, listing categories and developping codecs' lists of possible + encoding names. It also creates a cache with the CodecInfo objects for improving performance. """ + codecs = {} + def _develop(d, keep=True): + d = d or {} + for k, v in d.items(): + l, cc = [], [e for e in v if e in CODECS_CATEGORIES] + for enc in (list_encodings(*cc) if len(cc) > 0 or keep else [] + \ + [e for e in v if e not in CODECS_CATEGORIES]): + try: + g = lookup(enc, False).parameters['guess'] + except: + g = [enc] + if enc in g and not keep: # e.g. "rot-1" => ["rot-1", "rot-2", ...] ; only "rot-1" is to be selected + l.append(enc) + else: # e.g. "rot" => ["rot-1", "rot-2", ...] ; all the "rot-N" shall be selected + l.extend(g) + d[k] = l + if keep: + for e in l: + # cache newly loaded CodecInfo objects + ci = lookup(e, False) + n = ci.name + if n in CODECS_CACHE: + ci = CODECS_CACHE[n] # keep the cached object + else: + CODECS_CACHE[n] = ci # cache the new object + codecs[e] = ci + return d + exclude = _develop(exclude, False) + return {k: [x for x in v if x not in exclude.get(k, [])] for k, v in _develop(include).items()}, codecs + + +def __rank(prev_input, input, prev_encoding, encodings, codecs, heuristic=False, extended=False, yield_score=False): """ Filter valid encodings and rank them by relevance. """ ranking = {} - for codec in codecs: - for score, new_input, encoding in __score(prev_input, input, prev_encoding, codec, heuristic, extended): - ranking[encoding] = (score, new_input) + for encoding in encodings: + try: + score, new = __score(prev_input, input, prev_encoding, encoding, codecs.get(encoding), heuristic, extended) + except TypeError: + continue + ranking[encoding] = (score, new) for encoding, result in sorted(ranking.items(), key=lambda x: -x[1][0]): yield result if yield_score else result[1], encoding @@ -1304,7 +1316,7 @@ class _Text(object): def __init__(self, text, pad_char=None): c = text[-1] - last_char = c if isinstance(c, int) else ord(c) + pad_char, last_char = (b(pad_char), c) if isinstance(c, int) else (pad_char, ord(c)) self.padding = pad_char is not None and last_char == ord(pad_char) if self.padding: text = text.rstrip(pad_char) @@ -1314,136 +1326,182 @@ def __init__(self, text, pad_char=None): self.entropy = entropy(text) -def __score(prev_input, input, prev_encoding, codec, heuristic=False, extended=False): +def __score(prev_input, input, prev_encoding, encoding, codec, heuristic=False, extended=False): """ Score relevant encodings given an input. """ - obj, ci = None, lookup(codec, False) # NB: lookup(...) won't fail as the codec value comes from list_encodings(...) - sc = ci.parameters.get('scoring', {}) - no_error, transitive = ci.parameters.get('no_error', False), sc.get('transitive', False) - for encoding in ci.parameters.get('guess', [codec]): - # ignore encodings that fail to decode with their default errors handling value - try: - new_input = decode(input, encoding) - except: - continue - # ignore encodings that give an output identical to the input (identity transformation) or to the previous input - if len(new_input) == 0 or prev_input is not None and b(input) == b(new_input) or b(prev_input) == b(new_input): - continue - # ignore encodings that transitively give the same output (identity transformation by chaining twice a same - # codec (e.g. rot-15 is equivalent to rot-3 and rot-12 or rot-6 and rot-9) - if transitive and prev_encoding: - ci_prev = lookup(prev_encoding, False) - if ci_prev.parameters['name'] == ci.parameters['name']: - continue - # compute input's characteristics only once and only if the control flow reaches this point - pad = sc.get('padding_char') - if obj is None: - obj = _Text(input, pad) - if heuristic: - # from here, the goal (e.g. if the input is Base32) is to rank candidate encodings (e.g. multiple base - # codecs) so that we can put the right one as early as possible and eventually exclude bad candidates - s = -sc.get('penalty', .0) - # first, apply a bonus if the length of input text's charset is exactly the same as encoding's charset ; - # on the contrary, if the length of input text's charset is strictly greater, give a penalty - lcs = sc.get('len_charset', 256) - if isinstance(lcs, type(lambda: None)): - lcs = int(lcs(encoding)) - if (pad and obj.padding and lcs + 1 >= obj.lcharset) or lcs >= obj.lcharset: - s += max(.0, round(.6 * (.99 ** (lcs - obj.lcharset)), 5) - .1) - elif (pad and obj.padding and lcs + 1 < obj.lcharset) or lcs < obj.lcharset: - s -= .2 # this can occur for encodings with no_error set to True - # then, take padding into account, giving a bonus if padding is to be encountered and effectively present, - # or a penalty when it should not be encountered but it is present - if pad and obj.padding: - s += .2 # when padding is encountered while it is legitimate, it could be a good indication => bonus - elif not pad and obj.padding: - s -= .1 # it could arise a padding character is encountered while not being padding => small penalty - # give a bonus when the rate of printable characters is greater or equal than expected and a penalty when - # lower only for codecs that DO NOT tolerate errors (otherwise, the printables rate can be biased) - if not no_error: - pr = sc.get('printables_rate', 0) - if isinstance(pr, type(lambda: None)): - pr = float(pr(obj.printables)) - if obj.printables - pr <= .05: - s += .1 - expf = sc.get('expansion_factor', 1.) - if expf: - f = obj.len / float(len(new_input)) # expansion while encoding => at decoding: 1/f - if isinstance(expf, type(lambda: None)): - try: # this case allows to consider the current encoding name from the current codec - expf = expf(f, encoding) - except TypeError: - expf = expf(f) - if isinstance(expf, (int, float)): - expf = (1/f - .1 <= 1/expf <= 1/f + .1) - elif isinstance(expf, (tuple, list)) and len(expf) == 2: - expf = 1/f - expf[1] <= 1/expf[0] <= 1/f + expf[1] - s += [-1., .1][expf] - # afterwards, if the input text has an entropy close to the expected one, give a bonus weighted on the - # number of input characters to take bad entropies of shorter strings into account - entr = sc.get('entropy', {}) - entr = entr.get(encoding, entr.get('default')) if isinstance(entr, dict) else entr - if isinstance(entr, type(lambda: None)): + obj = None + sc = codec.parameters.get('scoring', {}) + no_error, transitive = codec.parameters.get('no_error', False), sc.get('transitive', False) + # ignore encodings that fail to decode with their default errors handling value + try: + new_input = decode(input, encoding) + except: + return + # ignore encodings that give an output identical to the input (identity transformation) or to the previous input + if len(new_input) == 0 or prev_input is not None and b(input) == b(new_input) or b(prev_input) == b(new_input): + return + # ignore encodings that transitively give the same output (identity transformation by chaining twice a same + # codec (e.g. rot-15 is equivalent to rot-3 and rot-12 or rot-6 and rot-9) + if transitive and prev_encoding: + ci_prev = lookup(prev_encoding, False) + if ci_prev.parameters['name'] == codec.parameters['name']: + return + # compute input's characteristics only once and only if the control flow reaches this point + pad = sc.get('padding_char') + if obj is None: + obj = _Text(input, pad) + if heuristic: + # from here, the goal (e.g. if the input is Base32) is to rank candidate encodings (e.g. multiple base + # codecs) so that we can put the right one as early as possible and eventually exclude bad candidates + s = -sc.get('penalty', .0) + # first, apply a bonus if the length of input text's charset is exactly the same as encoding's charset ; + # on the contrary, if the length of input text's charset is strictly greater, give a penalty + lcs = sc.get('len_charset', 256) + if isinstance(lcs, type(lambda: None)): + lcs = int(lcs(encoding)) + if (pad and obj.padding and lcs + 1 >= obj.lcharset) or lcs >= obj.lcharset: + s += max(.0, round(.6 * (.99 ** (lcs - obj.lcharset)), 5) - .1) + elif (pad and obj.padding and lcs + 1 < obj.lcharset) or lcs < obj.lcharset: + s -= .2 # this can occur for encodings with no_error set to True + # then, take padding into account, giving a bonus if padding is to be encountered and effectively present, + # or a penalty when it should not be encountered but it is present + if pad and obj.padding: + s += .2 # when padding is encountered while it is legitimate, it could be a good indication => bonus + elif not pad and obj.padding: + s -= .1 # it could arise a padding character is encountered while not being padding => small penalty + # give a bonus when the rate of printable characters is greater or equal than expected and a penalty when + # lower only for codecs that DO NOT tolerate errors (otherwise, the printables rate can be biased) + if not no_error: + pr = sc.get('printables_rate', 0) + if isinstance(pr, type(lambda: None)): + pr = float(pr(obj.printables)) + if obj.printables - pr <= .05: + s += .1 + expf = sc.get('expansion_factor', 1.) + if expf: + f = obj.len / float(len(new_input)) # expansion while encoding => at decoding: 1/f + if isinstance(expf, type(lambda: None)): try: # this case allows to consider the current encoding name from the current codec - entr = entr(obj.entropy, encoding) + expf = expf(f, encoding) except TypeError: - entr = entr(obj.entropy) - if entr is not None: - # use a quadratic heuristic to compute a weight for the entropy delta, aligned on (100w,.1) and (200w,1) - d_entr = min(4e-05 * obj.len**2 - .003 * obj.len, 1) * abs(entr - entropy(new_input)) - if d_entr <= .5: - s += .5 - d_entr - # finally, if relevant, apply a custom bonus (e.g. when a regex pattern is matched) - bonus = sc.get('bonus_func') - if bonus is not None: - if isinstance(bonus, type(lambda: None)): - bonus = bonus(obj, ci, encoding) - if bonus: - s += .2 - else: - s = 1. - # exclude negative (and eventually null) scores as they are (hopefully) not relevant - if extended and s >= .0 or not extended and s > .0: - yield s, new_input, encoding + expf = expf(f) + if isinstance(expf, (int, float)): + expf = (1/f - .1 <= 1/expf <= 1/f + .1) + elif isinstance(expf, (tuple, list)) and len(expf) == 2: + expf = 1/f - expf[1] <= 1/expf[0] <= 1/f + expf[1] + s += [-1., .1][expf] + # afterwards, if the input text has an entropy close to the expected one, give a bonus weighted on the + # number of input characters to take bad entropies of shorter strings into account + entr = sc.get('entropy', {}) + entr = entr.get(encoding, entr.get('default')) if isinstance(entr, dict) else entr + if isinstance(entr, type(lambda: None)): + try: # this case allows to consider the current encoding name from the current codec + entr = entr(obj.entropy, encoding) + except TypeError: + entr = entr(obj.entropy) + if entr is not None: + # use a quadratic heuristic to compute a weight for the entropy delta, aligned on (100w,.1) and (200w,1) + d_entr = min(4e-05 * obj.len**2 - .003 * obj.len, 1) * abs(entr - entropy(new_input)) + if d_entr <= .5: + s += .5 - d_entr + # finally, if relevant, apply a custom bonus (e.g. when a regex pattern is matched) + bonus = sc.get('bonus_func') + if bonus is not None: + if isinstance(bonus, type(lambda: None)): + bonus = bonus(obj, codec, encoding) + if bonus: + s += .2 + else: + s = 1. + # exclude negative (and eventually null) scores as they are (hopefully) not relevant + if extended and s >= .0 or not extended and s > .0: + return s, new_input -def guess(input, stop_func=stopfunc.default, min_depth=0, max_depth=5, codec_categories=None, exclude=None, found=(), +def guess(input, stop_func=stopfunc.default, min_depth=0, max_depth=5, include=None, exclude=None, found=(), stop=True, show=False, scoring_heuristic=True, extended=False, debug=False): - """ Try decoding without the knowledge of the encoding(s). """ + """ Try decoding without the knowledge of the encoding(s). + + :param input: input text to be guessed + :param stop_func: function defining the stop condition + :param min_depth: minimum search depth + :param max_depth: maximum search depth + ;param include: inclusion item OR list with category, codec or encoding names OR dictionary with lists per + depth (nothing means include every encoding) + :param exclude: exclusion item OR list with category, codec or encoding names OR dictionary with lists per + depth (nothing means exclude no encoding) + :param found: tuple of already found encodings + :param stop: whether to stop or not when a valid solution is found + :param show: whether to immediately show once a solution is found + :param scoring_heuristic: whether to apply the scoring heuristic during the search (if disabled, all scores are 1., + meaning that every non-failing encoding will be considered with no order of precedence) + :param extended: whether to also consider null scores with the heuristic + :param debug: whether to show each attempt at each depth during computation + """ + if len(input) == 0: + return "" + # check for min and max depths if max_depth <= 0: raise ValueError("Depth must be a non-null positive integer") if min_depth > max_depth: raise ValueError("Min depth shall be less than or equal to the max depth") + # take the tuple of found encodings into account if len(found) > 0: for encoding in found: input = decode(input, encoding) + # handle the stop function as a regex if a string was given if isinstance(stop_func, string_types): stop_func = stopfunc.regex(stop_func) + # reformat include and exclude arguments ; supported formats: + for n, l in zip(["inc", "exc"], [include, exclude]): + if l is None: + if n == "inc": + include = l = {-1: CODECS_CATEGORIES} + else: + exclude = l = {} + # "category" OR "enc_name" OR whatever => means a single item for all depths + if isinstance(l, string_types): + if n == "inc": + include = l = {-1: [l]} + else: + exclude = l = {-1: [l]} + # ["enc_name1", "enc_name2", ...] => means for all depths + if isinstance(l, (list, tuple)): + if n == "inc": + include = l = {-1: l} + else: + exclude = l = {-1: l} + # {-1: [...], 2: [...], ...} => means prefedined depths with their lists of in-/excluded encodings + if not isinstance(l, dict) or not all(isinstance(k, int) for k in l.keys()): + raise ValueError("Include argument shall be a list or a dictionary with integer keys") + # precompute encodings lists per depth and cache the related CodecInfo objects + encodings, codecs = __make_encodings_dict(include, exclude) result = {} - if len(input) > 0: - try: - # breadth-first search - for d in range(max_depth): - __guess("", input, stop_func, 0, d+1, min_depth, codec_categories, exclude, result, tuple(found), stop, - show, scoring_heuristic, extended, debug) - if stop and len(result) > 0: - return result - except KeyboardInterrupt: - pass + try: + # breadth-first search + for d in range(max_depth): + __guess("", input, stop_func, 0, d+1, min_depth, encodings, codecs, result, tuple(found), stop, show, + scoring_heuristic, extended, debug) + if stop and len(result) > 0: + break + except KeyboardInterrupt: + pass + CODECS_CACHE = {} return result codecs.guess = guess -def rank(input, extended=False, limit=-1, codec_categories=None, exclude=None): - """ Rank the most probable encodings based on the given input. """ - if isinstance(codec_categories, string_types): - codec_categories = (codec_categories, ) - codecs = list_encodings(*(codec_categories or ())) - for e in __develop(exclude): - try: - codecs.remove(e) - except ValueError: - pass - r = list(__rank(None, input, "", codecs, True, extended, True)) +def rank(input, extended=False, limit=-1, include=None, exclude=None): + """ Rank the most probable encodings based on the given input. + + :param input: input text to be evaluated + :param extended: whether to consider null scores too (NB: negative scores are not output !) + :param limit: number of encodings to be returned (-1 means all of them) + :param include: inclusion list with category, codec or encoding names (nothing means include every encoding) + :param exclude: exclusion list with category, codec or encoding names (nothing means exclude no encoding) + """ + encodings, codecs = __make_encodings_dict({0: include or CODECS_CATEGORIES}, {0: exclude or []}) + r = list(__rank(None, input, "", encodings[0], codecs, True, extended, True)) + CODECS_CACHE = {} return r[:limit] if len(r) > 1 else r codecs.rank = rank diff --git a/tests/test_common.py b/tests/test_common.py index a35abfd..6eddd7e 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -148,14 +148,15 @@ def test_encode_multiple_rounds(self): def test_guess_decode(self): self.assertIsNone(codext.stopfunc._reload_lang()) + self.assertIsNotNone(codext.stopfunc._validate("flag")) _l = lambda d: list(d.items())[0][1] if len(d) > 0 else None codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1), "test", padding_char="=", no_error=True, bonus_func=lambda *a: True, penalty=-.5) self.assertIn("test-codec", codext.list_encodings("test")) self.assertEqual(codext.decode("TEST=", "test"), "TEST") - self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, codec_categories="test", max_depth=2, + self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, include="test", max_depth=2, scoring_heuristic=False).items())[0][1], "TEST") - self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, codec_categories=["test", "base"], + self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, include=["test", "base"], max_depth=2).items())[0][1], "TEST") STR = "This is a test" self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", max_depth=1))) @@ -163,12 +164,12 @@ def test_guess_decode(self): self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", 0, 1, "base", scoring_heuristic=True, exclude=["base100"]))) self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", 0, 1, ["base", "crypto"]))) - self.assertEqual(len(codext.guess("NOT THE ENCODED TEST STRING", "a test", max_depth=1, exclude=[None])), 0) + self.assertEqual(len(codext.guess("NOT THE ENCODED TEST STRING", "a test", max_depth=1, exclude=None)), 0) self.assertIn("F1@9", _l(codext.guess("VGVzdCBGMUA5ICE=", codext.stopfunc.flag, max_depth=1, stop=False, show=True))) - self.assertEqual(len(codext.guess("VGhpcyBpcyBhIHRlc3Q=", " a test", max_depth=1, codec_categories="base", + self.assertEqual(len(codext.guess("VGhpcyBpcyBhIHRlc3Q=", " a test", max_depth=1, include="base", exclude=("base64", "base64-url"))), 0) - self.assertEqual(len(codext.guess("VGhpcyBpcyBhIHRlc3Q=", " a test", max_depth=1, codec_categories="base", + self.assertEqual(len(codext.guess("VGhpcyBpcyBhIHRlc3Q=", " a test", max_depth=1, include="base", scoring_heuristic=True, exclude=("base64", "base64-url", "atbash"))), 0) self.assertRaises(ValueError, codext.guess, STR, max_depth=0) self.assertRaises(ValueError, codext.guess, STR, exclude=42) @@ -198,8 +199,7 @@ def test_guess_decode(self): self.assertEqual(encoding, found_encodings[0]) txt = "".join(chr(i) for i in range(256)) b64 = codext.encode(txt, "base64") - self.assertEqual(txt, _l(codext.guess(b64, "0123456789", max_depth=1, scoring_heuristic=True, - codec_categories="base"))) + self.assertEqual(txt, _l(codext.guess(b64, "0123456789", max_depth=1, scoring_heuristic=True, include="base"))) self.assertRaises(ValueError, codext.stopfunc._reload_lang, "DOES_NOT_EXIST") def test_rank_input(self): @@ -210,10 +210,10 @@ def test_rank_input(self): self.assertTrue(len(codext.rank(ENC)) > 20) self.assertEqual(len(codext.rank(ENC, limit=20)), 20) self.assertEqual(codext.rank(ENC, exclude=["rot"])[0][1], "base64") - self.assertEqual(codext.rank(ENC, codec_categories="base")[0][0][1], STR) - self.assertEqual(codext.rank(ENC, codec_categories=["base"])[0][0][1], STR) - self.assertIsNotNone(codext.rank(ENC, codec_categories=["base"], exclude=["does_not_exist"])[0][0][1], STR) - self.assertIsNotNone(codext.rank("TEST=", codec_categories=["test", "base"])[0][0][1], "TEST") + self.assertEqual(codext.rank(ENC, include="base")[0][0][1], STR) + self.assertEqual(codext.rank(ENC, include=["base"])[0][0][1], STR) + self.assertIsNotNone(codext.rank(ENC, include=["base"], exclude=["does_not_exist"])[0][0][1], STR) + self.assertIsNotNone(codext.rank("TEST=", include=["test", "base"])[0][0][1], "TEST") def test_handle_macros(self): MACRO = "test-macro-f2ca1bb6c7e907d06dafe4687e579fce76b37e4e93b7605022da52e6ccc26fd2" From c79e2bdf6633df8e1c643afeea728fc6033315b9 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 27 Feb 2022 20:32:16 +0100 Subject: [PATCH 13/32] Fixed codec: baudot --- codext/binary/baudot.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/codext/binary/baudot.py b/codext/binary/baudot.py index ae5fc32..a57e1ea 100755 --- a/codext/binary/baudot.py +++ b/codext/binary/baudot.py @@ -10,9 +10,9 @@ from ..__common__ import * -__CODES = ["ccitt1", "ccitt2", "eu", "ita1", "ita2", "ita2_us", "murray", "uk"] +__CODES = ["ccitt1", "ccitt2", "eu", "ita1", "ita2", "ita2_us"] if PY3: - __CODES.extend(["ita2_meteo", "mtk2"]) + __CODES.extend(["ita2_meteo", "mtk2", "murray", "uk"]) __guess__ = ["baudot%s-{}-{}".format(x, y) for x in __CODES for y in ["lsb", "msb"]] __examples1__ = { 'enc(baudot-BAD_ALPHABET)': None, @@ -51,7 +51,7 @@ PATTERN = r"^baudot%s([-_](?:ccitt1|ccitt2|eu|fr|ita1|ita2|ita2[-_](?:us" + (r"|meteo" if PY3 else r"") + r")" + \ - (r"|mtk2" if PY3 else r"") + r"|murray|uk|us_tty)(?:[-_](?:lsb|msb))?)?$" + (r"|mtk2|murray|uk" if PY3 else r"") + r"|us_tty)(?:[-_](?:lsb|msb))?)?$" # reserved character RES_CHR = "\xff" @@ -116,20 +116,22 @@ "\x003\n- '87\r\xff4Ю,Э:(5+)2Щ6019?Ш\xff./=\xff", ] # Murray code ; NB: not all fractions are supported (source: https://en.wikipedia.org/wiki/Baudot_code) -MURRAY = [ - "00100", "11011", - " E\xffA\xffSIU\nDRJNFCKTZLWHYPQOBF\xffMXV\x7f", - "\x003\xff\xff\xff'87\n²4\xff-\u215f(\xff5./2\xff6019?\xff\xff,£)*" if PY3 else \ - "\x003\xff\xff\xff'87\n²4\xff-\u215f(\xff5./2\xff6019?\xff\xff,$)*", -] +if PY3: + MURRAY = [ + "00100", "11011", + " E\xffA\xffSIU\nDRJNFCKTZLWHYPQOBF\xffMXV\x7f", + "\x003\xff\xff\xff'87\n²4\xff-\u215f(\xff5./2\xff6019?\xff\xff,£)*" if PY3 else \ + "\x003\xff\xff\xff'87\n²4\xff-\u215f(\xff5./2\xff6019?\xff\xff,$)*", + ] # English Baudot ; NB: not all fractions are supported (sources: https://fr.qwe.wiki/wiki/Baudot_code # https://en.wikipedia.org/wiki/Baudot_code) -UK = [ - "10000", "01000", - "\x00AE/YUIO\xffJGHBCFD -XZSTWV\x7fKMLRQNP", - "\x0012\u215f34\xff5 67\xb989\xff0\xff.\xff:\xff²?'\x7f()=-/£+" if PY3 else \ - "\x0012\xff34\xff5 67\xb989\xff0\xff.\xff:\xff²?'\x7f()=-/$+", -] +if PY3: + UK = [ + "10000", "01000", + "\x00AE/YUIO\xffJGHBCFD -XZSTWV\x7fKMLRQNP", + "\x0012\u215f34\xff5 67\xb989\xff0\xff.\xff:\xff²?'\x7f()=-/£+" if PY3 else \ + "\x0012\xff34\xff5 67\xb989\xff0\xff.\xff:\xff²?'\x7f()=-/$+", + ] def _bits_from_tape(tape, trans={'*': "1", ' ': "0"}): From 3076c9ff0a1182caf5a361e54e9d99887b7f5348 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 27 Feb 2022 21:06:40 +0100 Subject: [PATCH 14/32] New release --- codext/VERSION.txt | 2 +- codext/__init__.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/codext/VERSION.txt b/codext/VERSION.txt index 89c881b..feaae22 100644 --- a/codext/VERSION.txt +++ b/codext/VERSION.txt @@ -1 +1 @@ -1.12.4 +1.13.0 diff --git a/codext/__init__.py b/codext/__init__.py index 692ab48..486dd2f 100644 --- a/codext/__init__.py +++ b/codext/__init__.py @@ -180,7 +180,8 @@ def _format_action_invocation(self, action): remm = sparsers.add_parser("remove-macro", help="remove a macro from the registry") remm.add_argument("name", help="macro's name") args = parser.parse_args() - args.include, args.exclude = __format_list(args.include), __format_list(args.exclude, False) + if args.command in ["guess", "rank"]: + args.include, args.exclude = __format_list(args.include), __format_list(args.exclude, False) try: # if a search pattern is given, only handle it if args.command == "search": From 803e211c1922dd8e98a0d77eb466ec4d8eeb7e75 Mon Sep 17 00:00:00 2001 From: dhondta Date: Mon, 28 Feb 2022 09:22:40 +0100 Subject: [PATCH 15/32] Added codec: base11 --- codext/base/baseN.py | 7 ++++++- docs/enc/base.md | 31 +++++++++++-------------------- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/codext/base/baseN.py b/codext/base/baseN.py index 3c63453..cf4abe4 100755 --- a/codext/base/baseN.py +++ b/codext/base/baseN.py @@ -39,10 +39,15 @@ B10 = {r'': "0123456789"} -base(B10, r"^(?:base[-_]?10|int(?:eger)?)$") +base(B10, r"^(?:base[-_]?10|int(?:eger)?|dec(?:imal)?)$") main10 = main(10) +B11 = {r'': "0123456789a", r'[-_]inv(erted)?$': "a0123456789"} +base(B11, r"^base[-_]?11(|[-_]inv(?:erted)?)$") +main11 = main(11) + + B16 = {'': digits + "ABCDEF", '[-_]inv(erted)?$': "ABCDEF" + digits} base2n(B16, r"^(?:base[-_]?16|hex)(|[-_]inv(?:erted)?)$", expansion_factor=2.) main16 = main(16, "RFC 4648") diff --git a/docs/enc/base.md b/docs/enc/base.md index 73b78ff..757965e 100644 --- a/docs/enc/base.md +++ b/docs/enc/base.md @@ -12,12 +12,12 @@ Common base encodings with N a power of 2: **Codec** | **Conversions** | **Aliases** | **Comment** :---: | :---: | --- | --- -`base2` | text <-> Base2 encoded text | `(base[-_]?2|bin)-inv(erted)?` | -`base4` | text <-> Base4 encoded text | `base[-_]?4-inv(erted)` | charset: `1234` -`base8` | text <-> Base8 encoded text | `base[-_]?8-inv(erted)` | charset: `abcdefgh` +`base2` | text <-> Base2 encoded text | `(base[-_]?2|bin)-inv(erted)?` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_AB`) +`base4` | text <-> Base4 encoded text | `base[-_]?4-inv(erted)` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_6VC9`) +`base8` | text <-> Base8 encoded text | `base[-_]?8-inv(erted)` | Charset: `abcdefgh` ; Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_A5c96T7x`) `base16` | text <-> Base16 encoded text | `base[-_]?16-inv(erted)` | -`base32` | text <-> Base32 encoded text | `base[-_]?32-inv(erted)` | -`zbase32` | text <-> ZBase32 encoded text | `z[-_]?base[-_]?32` | human-oriented Base32 +`base32` | text <-> Base32 encoded text | `base[-_]?32-inv(erted)`, `base32-crockford`, `base32_geohash`, ... | Also supports Base32 Crockford, Geohash and Hex +`zbase32` | text <-> ZBase32 encoded text | `z[-_]?base[-_]?32` | Human-oriented Base32 `base64` | text <-> Base64 encoded text | `base[-_]?64-inv(erted)` | !!! note "Aliases" @@ -62,10 +62,12 @@ Note that for `base64`, it overwrites the native `base64_codec` to also support **Codec** | **Conversions** | **Aliases** | **Comment** :---: | :---: | --- | --- -`base3` | text <-> Base3 encoded text | `base[-_]?36(|[-_]inv(erted)?)` | +`base3` | text <-> Base3 encoded text | `base[-_]?3(|[-_]inv(erted)?)` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_C2Z`) +`base10` | text <-> Base10 encoded text | `base[-_]?10|int(?:eger)?|dec(?:imal)?` | +`base11` | text <-> Base11 encoded text | `base[-_]?11(|[-_]inv(erted)?)` | `base36` | text <-> Base36 encoded text | `base[-_]?36(|[-_]inv(erted)?)` | `base45` | text <-> Base45 encoded text | `base[-_]?45(|[-_]inv(erted)?)` | -`base58` | text <-> Base58 encoded text | `base[-_]?58(|[-_](bc|bitcoin|rp|ripple|fl|flickr|short[-]?url|url))` | supports Bitcoin, Ripple and short URL +`base58` | text <-> Base58 encoded text | `base[-_]?58(|[-_](bc|bitcoin|rp|ripple|fl|flickr|short[-]?url|url))` | Supports Bitcoin, Ripple and short URL `base62` | text <-> Base62 encoded text | `base[-_]?62(|[-_]inv(erted)?)` | `base63` | text <-> Base63 encoded text | `base[-_]?63(|[-_]inv(erted)?)` | `base91` | text <-> Base91 encoded text | `base[-_]?91(|[-_]inv(erted)?)` | @@ -131,11 +133,7 @@ This encoding implements various different versions of Base85. **Codec** | **Conversions** | **Aliases** | **Comment** :---: | :---: | --- | --- -`base85` | text <-> ascii85 | `ascii85` | -`base85` | text <-> z85 | `z85`, `base85-zeromq` | -`base85` | text <-> base85-ipv6 | `base85-ipv6`, `base85-rfc1924` | -`base85` | text <-> base85-adobe | `base85-adobe` | -`base85` | text <-> base85-btoa | `base85-btoa`, `base85-xbtoa` | +`base85` | text <-> ascii85 | `(base[-_]?85(?:|[-_](?:adobe|x?btoa|ipv6|rfc1924|xml|z(?:eromq)?))|z85|ascii85)` | ```python >>> codext.encode("this is a test", "ascii85") @@ -156,16 +154,9 @@ This encoding implements various different versions of Base85. **Codec** | **Conversions** | **Aliases** | **Comment** :---: | :---: | --- | --- -`base85` | text <-> Base85 encoded text | `base[-_]?85` | Python 3 only (relies on `base64` module) `base100` | text <-> Base100 encoded text | `base[-_]?100|emoji` | Python 3 only `base122` | text <-> Base122 encoded text | `base[-_]?122` | Python 3 only - -```python ->>> codecs.encode("this is a test", "base85") -'bZBXFAZc?TVIXv6b94' ->>> codecs.decode("bZBXFAZc?TVIXv6b94", "base85") -'this is a test' -``` +`base128` | text <-> Base128 encoded text | `base[-_]?128` | Relies on the ASCII charset ```python >>> codecs.encode("this is a test", "base100") From b37e8a15b9b1f309a5229433e65318f16bd47b3c Mon Sep 17 00:00:00 2001 From: dhondta Date: Mon, 28 Feb 2022 09:22:54 +0100 Subject: [PATCH 16/32] Improved docs about crypto --- README.md | 1 + docs/enc/crypto.md | 13 ++++--------- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 0b69bfb..2ce70be 100644 --- a/README.md +++ b/README.md @@ -219,6 +219,7 @@ o - [X] `base4`: conversion to quarternary (with a variant with a reversed alphabet) - [X] `base8`: simple conversion to octal (with a variant with a reversed alphabet) - [X] `base10`: simple conversion to decimal +- [X] `base11`: conversion to digits with a "*a*" - [X] `base16`: simple conversion to hexadecimal (with a variant holding an alphabet with digits and letters inverted) - [X] `base26`: conversion to alphabet letters - [X] `base32`: classical conversion according to the RFC4648 with all its variants ([zbase32](https://philzimmermann.com/docs/human-oriented-base-32-encoding.txt), extended hexadecimal, [geohash](https://en.wikipedia.org/wiki/Geohash), [Crockford](https://www.crockford.com/base32.html)) diff --git a/docs/enc/crypto.md b/docs/enc/crypto.md index 974f49d..e59ab0f 100644 --- a/docs/enc/crypto.md +++ b/docs/enc/crypto.md @@ -152,9 +152,8 @@ This is a dynamic encoding, that is, it can be called with an integer to define **Codec** | **Conversions** | **Aliases** | **Comment** :---: | :---: | --- | --- -`rot` | text <-> rot(1) ciphertext | `rot1`, `rot-1`, `rot_1`, `caesar1` | -`rot` | text <-> rot(X) ciphertext | ... | -`rot` | text <-> rot(25) ciphertext | `rot25`, `rot-25`, `rot_25`, `caesar25` | +`rot` | text <-> rot(1) ciphertext | `rot1`, `rot-1`, `rot_25`, `caesar13` | Dynamic ROT parameter ; belongs to [1, 26[ +`rot47` | text <-> rot47 ciphertext | | ```python >>> codext.encode("this is a test", "rot-15") @@ -173,9 +172,7 @@ This is a dynamic encoding, that is, it can be called with an integer to define **Codec** | **Conversions** | **Aliases** | **Comment** :---: | :---: | --- | --- -`shift` | text <-> shift(1) ciphertext | `shift1`, `shift-1`, `shift_1` | -`shift` | text <-> shift(X) ciphertext | ... | -`shift` | text <-> shift(255) ciphertext | `shift255`, `shift-255`, `shift_255` | +`shift` | text <-> shift(1) ciphertext | `shift1`, `shift-158`, `shift_255` | Dynamic shift parameter ; belongs to [1, 256[ ```python >>> codext.encode("this is a test", "shift-3") @@ -194,9 +191,7 @@ This is a dynamic encoding, that is, it can be called with an integer to define **Codec** | **Conversions** | **Aliases** | **Comment** :---: | :---: | --- | --- -`xor` | text <-> XOR(1) ciphertext | `XOR1`, `xor1`, `xor-1`, `xor_1` | -`xor` | text <-> XOR(X) ciphertext | ... | -`xor` | text <-> XOR(255) ciphertext | `XOR255`, `xor255`, `xor-255`, `xor_255` | +`xor` | text <-> XOR(1) ciphertext | `XOR1`, `xor22`, `xor-158`, `xor_255` | Dynamic XOR parameter ; belongs to [1, 256[ ```python >>> codext.encode("this is a test", "xor-10") From d64c4f34b4c96b59976105680cbd35232be33e17 Mon Sep 17 00:00:00 2001 From: dhondta Date: Mon, 28 Feb 2022 09:23:29 +0100 Subject: [PATCH 17/32] Applied minor improvement --- codext/__common__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codext/__common__.py b/codext/__common__.py index e45fb1e..94ad19f 100644 --- a/codext/__common__.py +++ b/codext/__common__.py @@ -1333,7 +1333,7 @@ def __score(prev_input, input, prev_encoding, encoding, codec, heuristic=False, no_error, transitive = codec.parameters.get('no_error', False), sc.get('transitive', False) # ignore encodings that fail to decode with their default errors handling value try: - new_input = decode(input, encoding) + new_input = codec.decode(input)[0] except: return # ignore encodings that give an output identical to the input (identity transformation) or to the previous input From e3092151c35da08d048a92c8dc03269ee95aed5c Mon Sep 17 00:00:00 2001 From: dhondta Date: Mon, 28 Feb 2022 18:40:33 +0100 Subject: [PATCH 18/32] New release --- codext/VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codext/VERSION.txt b/codext/VERSION.txt index feaae22..b50dd27 100644 --- a/codext/VERSION.txt +++ b/codext/VERSION.txt @@ -1 +1 @@ -1.13.0 +1.13.1 From df8ff0fe870315c15eee63428516b47a1a74628b Mon Sep 17 00:00:00 2001 From: dhondta Date: Sat, 12 Mar 2022 23:20:40 +0100 Subject: [PATCH 19/32] Fixed codec: uu --- codext/__common__.py | 3 ++- codext/__init__.py | 6 ----- codext/others/__init__.py | 1 + codext/others/uuencode.py | 47 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 50 insertions(+), 7 deletions(-) create mode 100644 codext/others/uuencode.py diff --git a/codext/__common__.py b/codext/__common__.py index 94ad19f..0c7ec17 100644 --- a/codext/__common__.py +++ b/codext/__common__.py @@ -1312,9 +1312,10 @@ def __rank(prev_input, input, prev_encoding, encodings, codecs, heuristic=False, class _Text(object): - __slots__ = ["entropy", "lcharset", "len", "padding", "printables"] + __slots__ = ["entropy", "lcharset", "len", "padding", "printables", "text"] def __init__(self, text, pad_char=None): + self.text = text c = text[-1] pad_char, last_char = (b(pad_char), c) if isinstance(c, int) else (pad_char, ord(c)) self.padding = pad_char is not None and last_char == ord(pad_char) diff --git a/codext/__init__.py b/codext/__init__.py index 486dd2f..661357a 100644 --- a/codext/__init__.py +++ b/codext/__init__.py @@ -27,12 +27,6 @@ reset() -# overwritten native codec -add("uu", lambda i, e="strict": orig_lookup("uu").encode(b(i), e), - lambda i, e="strict": orig_lookup("uu").decode(b(i), e), - pattern=r"^uu(?:[-_]encode|codec)?$", add_to_codecs=True, category="native") - - def __format_list(items, include=True): if items is None: return diff --git a/codext/others/__init__.py b/codext/others/__init__.py index 22d6830..aa7ffa2 100755 --- a/codext/others/__init__.py +++ b/codext/others/__init__.py @@ -2,4 +2,5 @@ from .dna import * from .letters import * from .markdown import * +from .uuencode import * diff --git a/codext/others/uuencode.py b/codext/others/uuencode.py new file mode 100644 index 0000000..5377493 --- /dev/null +++ b/codext/others/uuencode.py @@ -0,0 +1,47 @@ +# -*- coding: UTF-8 -*- +"""UU Codec - UU content encoding, relying on the native uu package. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from io import BytesIO +from uu import decode as _dec, encode as _enc + +from ..__common__ import * + + +__examples__ = { + 'enc(uu|uu_codec)': {'this is a test': "begin 666 -\n.=&AI Date: Sat, 12 Mar 2022 23:20:52 +0100 Subject: [PATCH 20/32] New release --- codext/VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codext/VERSION.txt b/codext/VERSION.txt index b50dd27..61ce01b 100644 --- a/codext/VERSION.txt +++ b/codext/VERSION.txt @@ -1 +1 @@ -1.13.1 +1.13.2 From 837e91a4eb427b926089550509c7cea91f6accab Mon Sep 17 00:00:00 2001 From: dhondta Date: Mon, 28 Mar 2022 08:03:38 +0200 Subject: [PATCH 21/32] New release --- codext/VERSION.txt | 2 +- codext/common/dummy.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/codext/VERSION.txt b/codext/VERSION.txt index 61ce01b..01b7568 100644 --- a/codext/VERSION.txt +++ b/codext/VERSION.txt @@ -1 +1 @@ -1.13.2 +1.13.3 diff --git a/codext/common/dummy.py b/codext/common/dummy.py index f2dd2fb..7f4be19 100755 --- a/codext/common/dummy.py +++ b/codext/common/dummy.py @@ -35,7 +35,11 @@ def code(input, errors="strict"): reverse = lambda i, e="strict": (i[::-1], len(i)) add("reverse", reverse, reverse) -word_reverse = lambda i, e="strict": (" ".join(w[::-1] for w in i.split()), len(i)) +_revl = lambda i, wd=False: "".join((" ".join(w[::-1] for w in l.split()) if wd else l[::-1]) \ + if not re.match(r"(\r?\n)", l) else l for l in re.split(r"(\r?\n)", i)) +line_reverse = lambda i, e="strict": (_revl(i), len(i)) +add("reverse-lines", line_reverse, line_reverse, r"^reverse[-_]lines$") +word_reverse = lambda i, e="strict": (_revl(i, True), len(i)) add("reverse-words", word_reverse, word_reverse, r"^reverse[-_]words$") strip_spaces = lambda i, e="strict": (i.replace(" ", ""), len(i)) From 564aa5a3743da63ab8b2de8564e9efe41288296b Mon Sep 17 00:00:00 2001 From: dhondta Date: Tue, 29 Mar 2022 01:04:03 +0200 Subject: [PATCH 22/32] Refactored codec: uu --- codext/others/uuencode.py | 49 +++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/codext/others/uuencode.py b/codext/others/uuencode.py index 5377493..a2f2fb6 100644 --- a/codext/others/uuencode.py +++ b/codext/others/uuencode.py @@ -7,39 +7,48 @@ - decodes file content to str (read) - encodes file content from str to bytes (write) """ -from io import BytesIO -from uu import decode as _dec, encode as _enc +from binascii import a2b_uu as _dec, b2a_uu as _enc from ..__common__ import * __examples__ = { 'enc(uu|uu_codec)': {'this is a test': "begin 666 -\n.=&AI 0 and lines[-1].strip(b" \t\r\n\f") in [b"", b"`"]: + lines = lines[:-1] + r = b"" + for l in lines: + r += _dec(l.strip(b" \t\r\n\f")) + return r, len(text) add("uu", uu_encode, uu_decode, pattern=r"^uu(?:[-_]?encode|[-_]codec)?$", From fb292e998df7ef2eb0193cf52b0c1b2cd84ab614 Mon Sep 17 00:00:00 2001 From: dhondta Date: Tue, 29 Mar 2022 01:05:03 +0200 Subject: [PATCH 23/32] Improved guessing and ranking --- codext/__common__.py | 130 +++++++++++++++++++++---------------------- codext/__init__.py | 12 ++-- 2 files changed, 72 insertions(+), 70 deletions(-) diff --git a/codext/__common__.py b/codext/__common__.py index 0c7ec17..ea1281a 100644 --- a/codext/__common__.py +++ b/codext/__common__.py @@ -744,6 +744,10 @@ def remove(name): json.dump(PERS_MACROS, f, indent=2) except KeyError: pass + try: + del CODECS_CACHE[name] + except KeyError: + pass for s in ["En", "De"]: try: delattr(builtins, "%s%scodeError" % (name.capitalize(), s)) @@ -864,6 +868,7 @@ def _handle_error(token, position, output="", eename=None): """ if errors == "strict": msg = "'%s' codec can't %scode %s '%s' in %s %d" + token = ensure_str(token) token = token[:7] + "..." if len(token) > 10 else token err = getattr(builtins, exc)(msg % (eename or ename, ["en", "de"][decode], kind, token, item, position)) err.output = output @@ -968,36 +973,37 @@ def __register(search_function): codecs.register = __register -def search(encoding_regex): +def search(encoding_regex, extended=True): """ Function similar to lookup but allows to search for an encoding based on a regex instead. It searches this way into the local registry but also tries a simple lookup with the original lookup function. """ matches = [] - for search_function in __codecs_registry: + for search_function in CODECS_OVERWRITTEN + __codecs_registry: n = search_function.__name__ for name in [n, n.replace("_", "-")]: if re.search(encoding_regex, name): - matches.append(name) + matches.append(n.replace("_", "-")) continue - # in some cases, encoding_regex can match a generated string that uses a particular portion of its generating - # pattern ; e.g. we expect encoding_regex="uu_" to find "uu" and "uu_codec" while it can also find "morse" or - # "atbash" very rarely because of their dynamic patterns and the limited number of randomly generated strings - # so, we can use a qualified majority voting to ensure we do not get a "junk" encoding in the list of matches ; - # executing 5 times the string generation for a given codec but adding the codec to the list of matches only - # if we get at least 3 matches ensures that we consider up to 2 failures that could be stochastic, therefore - # drastically decreasing the probability to get a "junk" encoding in the matches list - c = 0 - for i in range(5): - for s in generate_strings_from_regex(search_function.__pattern__): - if re.search(encoding_regex, s): - c += 1 + if extended: + # in some cases, encoding_regex can match a generated string that uses a particular portion of its + # generating pattern ; e.g. we expect encoding_regex="uu_" to find "uu" and "uu_codec" while it can also + # find "morse" or "atbash" very rarely because of their dynamic patterns and the limited number of randomly + # generated strings + # so, we can use a qualified majority voting to ensure we do not get a "junk" encoding in the list of + # matches ; executing 5 times the string generation for a given codec but adding the codec to the list of + # matches only if we get at least 3 matches ensures that we consider up to 2 failures that could be + # stochastic, therefore drastically decreasing the probability to get a "junk" encoding in the matches list + c = 0 + for i in range(5): + for s in generate_strings_from_regex(search_function.__pattern__): + if re.search(encoding_regex, s): + c += 1 + break + if c >= 3: + matches.append(n) break - if c >= 3: - matches.append(n) - break for s, n in ALIASES.items(): if re.search(encoding_regex, s) or re.search(encoding_regex, n): matches.append(n) - break return sorted(list(set(matches)), key=_human_keys) codecs.search = search @@ -1241,7 +1247,7 @@ def _validate(stop_function, lang_backend="none"): stopfunc._validate = _validate -def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, encodings, codecs, result, found=(), +def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, encodings, result, found=(), stop=True, show=False, scoring_heuristic=False, extended=False, debug=False): """ Perform a breadth-first tree search using a ranking logic to select and prune the list of codecs. """ if depth > min_depth and stop_func(input): @@ -1255,58 +1261,53 @@ def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, encodings return prev_enc = found[-1] if len(found) > 0 else "" e = encodings.get(depth, encodings.get(-1, [])) - for new_input, encoding in __rank(prev_input, input, prev_enc, e, codecs, scoring_heuristic, extended): + for new_input, encoding in __rank(prev_input, input, prev_enc, e, scoring_heuristic, extended): if len(result) > 0 and stop: return if debug: print("[*] Depth %0{}d/%d: %s".format(len(str(max_depth))) % (depth+1, max_depth, encoding)) - __guess(input, new_input, stop_func, depth+1, max_depth, min_depth, encodings, codecs, result, - found + (encoding, ), stop, show, scoring_heuristic, extended, debug) + __guess(input, new_input, stop_func, depth+1, max_depth, min_depth, encodings, result, found + (encoding, ), + stop, show, scoring_heuristic, extended, debug) def __make_encodings_dict(include, exclude): """ Process encodings inclusion and exclusion lists, listing categories and developping codecs' lists of possible encoding names. It also creates a cache with the CodecInfo objects for improving performance. """ - codecs = {} def _develop(d, keep=True): d = d or {} for k, v in d.items(): l, cc = [], [e for e in v if e in CODECS_CATEGORIES] - for enc in (list_encodings(*cc) if len(cc) > 0 or keep else [] + \ + # list from in-scope categories and then everything that is not a category + for enc in ((list_encodings(*cc) if len(cc) > 0 or keep else []) + \ [e for e in v if e not in CODECS_CATEGORIES]): - try: - g = lookup(enc, False).parameters['guess'] - except: - g = [enc] - if enc in g and not keep: # e.g. "rot-1" => ["rot-1", "rot-2", ...] ; only "rot-1" is to be selected + g = [] + for e in (search(enc, False) or [enc]): + try: + ci = lookup(e, False) + g.extend(ci.parameters['guess']) + except: + pass + if enc in g: # e.g. "rot-1" => ["rot-1", "rot-2", ...] ; only "rot-1" is to be selected l.append(enc) - else: # e.g. "rot" => ["rot-1", "rot-2", ...] ; all the "rot-N" shall be selected + else: # e.g. "rot" => ["rot-1", "rot-2", ...] ; all the "rot-N" shall be selected l.extend(g) - d[k] = l - if keep: - for e in l: - # cache newly loaded CodecInfo objects - ci = lookup(e, False) - n = ci.name - if n in CODECS_CACHE: - ci = CODECS_CACHE[n] # keep the cached object - else: - CODECS_CACHE[n] = ci # cache the new object - codecs[e] = ci + d[k] = list(set(l)) return d exclude = _develop(exclude, False) - return {k: [x for x in v if x not in exclude.get(k, [])] for k, v in _develop(include).items()}, codecs + return {k: [x for x in v if x not in exclude.get(k, [])] for k, v in _develop(include).items()} -def __rank(prev_input, input, prev_encoding, encodings, codecs, heuristic=False, extended=False, yield_score=False): +def __rank(prev_input, input, prev_encoding, encodings, heuristic=False, extended=False, yield_score=False): """ Filter valid encodings and rank them by relevance. """ ranking = {} - for encoding in encodings: + for e in encodings: try: - score, new = __score(prev_input, input, prev_encoding, encoding, codecs.get(encoding), heuristic, extended) - except TypeError: - continue - ranking[encoding] = (score, new) + codec = CODECS_CACHE[e] + except KeyError: + CODECS_CACHE[e] = codec = lookup(e, False) + t = __score(prev_input, input, prev_encoding, e, codec, heuristic, extended) + if t: + ranking[e] = t for encoding, result in sorted(ranking.items(), key=lambda x: -x[1][0]): yield result if yield_score else result[1], encoding @@ -1315,16 +1316,16 @@ class _Text(object): __slots__ = ["entropy", "lcharset", "len", "padding", "printables", "text"] def __init__(self, text, pad_char=None): - self.text = text - c = text[-1] - pad_char, last_char = (b(pad_char), c) if isinstance(c, int) else (pad_char, ord(c)) - self.padding = pad_char is not None and last_char == ord(pad_char) + self.text = ensure_str(text) + c = self.text[-1] + pad_char, last_char = (chr(pad_char), chr(c)) if isinstance(c, int) else (pad_char, c) + self.padding = pad_char is not None and last_char == pad_char if self.padding: text = text.rstrip(pad_char) - self.len = len(text) - self.lcharset = len(set(text)) - self.printables = float(len([c for c in text if (chr(c) if isinstance(c, int) else c) in printable])) / self.len - self.entropy = entropy(text) + self.len = len(self.text) + self.lcharset = len(set(self.text)) + self.printables = float(len([c for c in self.text if c in printable])) / self.len + self.entropy = entropy(self.text) def __score(prev_input, input, prev_encoding, encoding, codec, heuristic=False, extended=False): @@ -1386,13 +1387,14 @@ def __score(prev_input, input, prev_encoding, encoding, codec, heuristic=False, except TypeError: expf = expf(f) if isinstance(expf, (int, float)): + tmp = expf expf = (1/f - .1 <= 1/expf <= 1/f + .1) elif isinstance(expf, (tuple, list)) and len(expf) == 2: expf = 1/f - expf[1] <= 1/expf[0] <= 1/f + expf[1] s += [-1., .1][expf] # afterwards, if the input text has an entropy close to the expected one, give a bonus weighted on the # number of input characters to take bad entropies of shorter strings into account - entr = sc.get('entropy', {}) + entr = sc.get('entropy', lambda e: e) entr = entr.get(encoding, entr.get('default')) if isinstance(entr, dict) else entr if isinstance(entr, type(lambda: None)): try: # this case allows to consider the current encoding name from the current codec @@ -1401,7 +1403,7 @@ def __score(prev_input, input, prev_encoding, encoding, codec, heuristic=False, entr = entr(obj.entropy) if entr is not None: # use a quadratic heuristic to compute a weight for the entropy delta, aligned on (100w,.1) and (200w,1) - d_entr = min(4e-05 * obj.len**2 - .003 * obj.len, 1) * abs(entr - entropy(new_input)) + d_entr = min(5.958194e-06 * obj.len**2 - .002381 * obj.len, 1) * abs(entr - entropy(new_input)) if d_entr <= .5: s += .5 - d_entr # finally, if relevant, apply a custom bonus (e.g. when a regex pattern is matched) @@ -1475,12 +1477,11 @@ def guess(input, stop_func=stopfunc.default, min_depth=0, max_depth=5, include=N if not isinstance(l, dict) or not all(isinstance(k, int) for k in l.keys()): raise ValueError("Include argument shall be a list or a dictionary with integer keys") # precompute encodings lists per depth and cache the related CodecInfo objects - encodings, codecs = __make_encodings_dict(include, exclude) - result = {} + encodings, result = __make_encodings_dict(include, exclude), {} try: # breadth-first search for d in range(max_depth): - __guess("", input, stop_func, 0, d+1, min_depth, encodings, codecs, result, tuple(found), stop, show, + __guess("", input, stop_func, 0, d+1, min_depth, encodings, result, tuple(found), stop, show, scoring_heuristic, extended, debug) if stop and len(result) > 0: break @@ -1500,9 +1501,8 @@ def rank(input, extended=False, limit=-1, include=None, exclude=None): :param include: inclusion list with category, codec or encoding names (nothing means include every encoding) :param exclude: exclusion list with category, codec or encoding names (nothing means exclude no encoding) """ - encodings, codecs = __make_encodings_dict({0: include or CODECS_CATEGORIES}, {0: exclude or []}) - r = list(__rank(None, input, "", encodings[0], codecs, True, extended, True)) - CODECS_CACHE = {} + encodings = __make_encodings_dict({-1: include or CODECS_CATEGORIES}, {-1: exclude or []}) + r = list(__rank(None, input, "", encodings[-1], True, extended, True)) return r[:limit] if len(r) > 1 else r codecs.rank = rank diff --git a/codext/__init__.py b/codext/__init__.py index 661357a..f95abb8 100644 --- a/codext/__init__.py +++ b/codext/__init__.py @@ -154,12 +154,14 @@ def _format_action_invocation(self, action): guess.add_argument("-v", "--verbose", action="store_true", help="show guessing information and steps (default: False)") rank = sparsers.add_parser("rank", help="rank the most probable encodings based on the given input", **kw) - rank.add_argument("-c", "--codec-categories", nargs="*", action="extend", metavar="CATEGORY", - help="codec categories to be included in the search ; format: string|tuple|list(strings|tuples)") - rank.add_argument("-e", "--exclude-codecs", nargs="*", action="extend", metavar="CODEC", - help="codecs to be explicitely not used ; format: string|tuple|list(strings|tuples)") + rank.add_argument("-e", "--exclude", nargs="*", action="extend", metavar="CAT|COD|ENC", + help="categories, codecs and encodings to be explicitely not used ;\n " + "format: [category|codec|encoding] OR depth:[category|codec|encoding]") rank.add_argument("-E", "--extended", action="store_true", help="while using the scoring heuristic, also consider null scores (default: False)") + rank.add_argument("-i", "--include", nargs="*", action="extend", metavar="CAT|COD|ENC", + help="categories, codecs and encodings to be explicitely used ;\n " + "format: [category|codec|encoding] OR depth:[category|codec|encoding]") rank.add_argument("-l", "--limit", type=int, default=10, help="limit the number of displayed results") search = sparsers.add_parser("search", help="search for codecs") search.add_argument("pattern", nargs="+", help="encoding pattern to search") @@ -243,7 +245,7 @@ def _format_action_invocation(self, action): if len(r) == 0: print("Could not decode :-(") elif args.command == "rank": - for i, e in codecs.rank(c, args.extended, args.limit, args.codec_categories, args.exclude_codecs): + for i, e in codecs.rank(c, args.extended, args.limit, args.include, args.exclude): s = "[+] %.5f: %s" % (i[0], e) print(s if len(s) <= 80 else s[:77] + "...") except Exception as e: From b4a29503392cfdbb89f6e58d48441c9f60e8e6e4 Mon Sep 17 00:00:00 2001 From: dhondta Date: Tue, 29 Mar 2022 01:05:19 +0200 Subject: [PATCH 24/32] Refined case codecs --- codext/common/cases.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/codext/common/cases.py b/codext/common/cases.py index 65fbdf2..8aa87e4 100644 --- a/codext/common/cases.py +++ b/codext/common/cases.py @@ -20,20 +20,20 @@ capitalize = lambda i, e="strict": (i.capitalize(), len(i)) uncapitalize = lambda i, e="strict": (i[0].lower() + i[1:] if len(i) > 0 else "", len(i)) -add("capitalize", capitalize, uncapitalize) +add("capitalize", capitalize, uncapitalize, penalty=.2) lowercase, uppercase = lambda i, e="strict": (i.lower(), len(i)), lambda i, e="strict": (i.upper(), len(i)) -add("uppercase", uppercase, lowercase, r"^upper(?:case)?$") -add("lowercase", lowercase, uppercase, r"^lower(?:case)?$") +add("uppercase", uppercase, lowercase, r"^upper(?:case)?$", penalty=.2) +add("lowercase", lowercase, uppercase, r"^lower(?:case)?$", penalty=.2) slugify = lambda i, e="strict", d="-": (re.sub(r"[^0-9a-z]+", d, i.lower()).strip(d), len(i)) add("slugify", lambda i, e="strict": slugify(i, e), None, r"^(?:slug(?:ify)?|kebab(?:[-_]?case)?)$") add("snakecase", lambda i, e="strict": slugify(i, e, "_"), None, r"^snake(?:[-_]?case)?$") swapcase = lambda i, e="strict": (i.swapcase(), len(i)) -add("swapcase", swapcase, swapcase, r"^(?:swap(?:[-_]?case)?|invert(?:case)?)$") +add("swapcase", swapcase, swapcase, r"^(?:swap(?:[-_]?case)?|invert(?:case)?)$", penalty=.2) title = lambda i, e="strict": (i.title(), len(i)) untitle = lambda i, e="strict": (" ".join(w[0].lower() + w[1:] if len(w) > 0 else "" for w in i.split()), len(i)) -add("title", title, untitle) +add("title", title, untitle, penalty=.2) From 224b2d005eb3020b9470ab8b6cf182bcd0a49b89 Mon Sep 17 00:00:00 2001 From: dhondta Date: Tue, 29 Mar 2022 01:05:44 +0200 Subject: [PATCH 25/32] Refined tests/test_common --- tests/test_common.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/test_common.py b/tests/test_common.py index 6eddd7e..934155f 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -151,7 +151,7 @@ def test_guess_decode(self): self.assertIsNotNone(codext.stopfunc._validate("flag")) _l = lambda d: list(d.items())[0][1] if len(d) > 0 else None codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1), - "test", padding_char="=", no_error=True, bonus_func=lambda *a: True, penalty=-.5) + "^test(?:_codec)?$", padding_char="=", no_error=True, bonus_func=lambda *a: True, penalty=-.5) self.assertIn("test-codec", codext.list_encodings("test")) self.assertEqual(codext.decode("TEST=", "test"), "TEST") self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, include="test", max_depth=2, @@ -203,13 +203,15 @@ def test_guess_decode(self): self.assertRaises(ValueError, codext.stopfunc._reload_lang, "DOES_NOT_EXIST") def test_rank_input(self): + codext.remove("test_codec") + self.assertRaises(LookupError, codext.encode, "TEST", "test") codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1), - "test", padding_char="=", no_error=True, bonus_func=lambda *a: True, penalty=-.5) + "^test(?:_codec)?$", padding_char="=", no_error=True, penalty=1.) STR = "This is a test string !" ENC = codext.encode(STR, "base64") self.assertTrue(len(codext.rank(ENC)) > 20) self.assertEqual(len(codext.rank(ENC, limit=20)), 20) - self.assertEqual(codext.rank(ENC, exclude=["rot"])[0][1], "base64") + self.assertIn(codext.rank(ENC, exclude=["rot"])[0][1], ["base64", "base64-url"]) self.assertEqual(codext.rank(ENC, include="base")[0][0][1], STR) self.assertEqual(codext.rank(ENC, include=["base"])[0][0][1], STR) self.assertIsNotNone(codext.rank(ENC, include=["base"], exclude=["does_not_exist"])[0][0][1], STR) From dcbeba184f89ac7fd31bf7b4dfbca2fc1809ee70 Mon Sep 17 00:00:00 2001 From: dhondta Date: Tue, 29 Mar 2022 01:31:45 +0200 Subject: [PATCH 26/32] Fixed scoring for compression codecs --- codext/__common__.py | 4 ++-- codext/compressions/__init__.py | 6 ++++++ codext/compressions/gzipp.py | 2 +- codext/compressions/lz77.py | 2 +- codext/compressions/pkzip.py | 6 +++--- 5 files changed, 13 insertions(+), 7 deletions(-) diff --git a/codext/__common__.py b/codext/__common__.py index ea1281a..f65d210 100644 --- a/codext/__common__.py +++ b/codext/__common__.py @@ -1402,8 +1402,8 @@ def __score(prev_input, input, prev_encoding, encoding, codec, heuristic=False, except TypeError: entr = entr(obj.entropy) if entr is not None: - # use a quadratic heuristic to compute a weight for the entropy delta, aligned on (100w,.1) and (200w,1) - d_entr = min(5.958194e-06 * obj.len**2 - .002381 * obj.len, 1) * abs(entr - entropy(new_input)) + # use a quadratic heuristic to compute a weight for the entropy delta, aligned on (256,.2) and (512,1) + d_entr = min(3.04575e-06 * obj.len**2 + .000394 * obj.len, 1) * abs(entr - entropy(new_input)) if d_entr <= .5: s += .5 - d_entr # finally, if relevant, apply a custom bonus (e.g. when a regex pattern is matched) diff --git a/codext/compressions/__init__.py b/codext/compressions/__init__.py index 37f1fa5..606a1dc 100755 --- a/codext/compressions/__init__.py +++ b/codext/compressions/__init__.py @@ -4,3 +4,9 @@ from .lz78 import * from .pkzip import * + +for e in list_encodings("compression"): + ci = lookup(e, False) + ci.parameters['scoring']['entropy'] = 7.9 + ci.parameters['scoring']['expansion_factor'] = lambda f: f + diff --git a/codext/compressions/gzipp.py b/codext/compressions/gzipp.py index da52b5a..14e65bc 100755 --- a/codext/compressions/gzipp.py +++ b/codext/compressions/gzipp.py @@ -40,5 +40,5 @@ def gzip_decompress(data, errors="strict"): return r, len(r) -add("gzip", gzip_compress, gzip_decompress, entropy=7.9) +add("gzip", gzip_compress, gzip_decompress) diff --git a/codext/compressions/lz77.py b/codext/compressions/lz77.py index 662f02c..bdfcf13 100644 --- a/codext/compressions/lz77.py +++ b/codext/compressions/lz77.py @@ -70,5 +70,5 @@ def lz77_decompress(input, errors="strict"): return out, len(out) -add("lz77", lz77_compress, lz77_decompress, entropy=7.9) +add("lz77", lz77_compress, lz77_decompress) diff --git a/codext/compressions/pkzip.py b/codext/compressions/pkzip.py index ebbcbce..47d9cd5 100755 --- a/codext/compressions/pkzip.py +++ b/codext/compressions/pkzip.py @@ -46,11 +46,11 @@ def _decode(data, errors="strict"): add("pkzip_deflate", pkzip_encode(8), pkzip_decode(8, "deflate"), r"(?:(?:pk)?zip[-_])?deflate", - entropy=7.9, examples=__examples1__, guess=["deflate"]) + examples=__examples1__, guess=["deflate"]) add("pkzip_bzip2", pkzip_encode(12), pkzip_decode(12, "bzip2"), r"(?:(?:pk)?zip[-_])?bz(?:ip)?2", - entropy=7.9, examples=__examples2__, guess=["bz2"]) + examples=__examples2__, guess=["bz2"]) add("pkzip_lzma", pkzip_encode(14), pkzip_decode(14, "lzma"), r"(?:(?:pk)?zip[-_])?lzma", - entropy=7.9, examples=__examples3__, guess=["lzma"]) + examples=__examples3__, guess=["lzma"]) From a1b41fab747e36bf2eaf7f3037715f9fc7a28ddf Mon Sep 17 00:00:00 2001 From: dhondta Date: Tue, 29 Mar 2022 08:31:49 +0200 Subject: [PATCH 27/32] Fixed minor issues --- codext/__common__.py | 4 ++-- tests/test_common.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/codext/__common__.py b/codext/__common__.py index f65d210..41cb5b2 100644 --- a/codext/__common__.py +++ b/codext/__common__.py @@ -1308,7 +1308,7 @@ def __rank(prev_input, input, prev_encoding, encodings, heuristic=False, extende t = __score(prev_input, input, prev_encoding, e, codec, heuristic, extended) if t: ranking[e] = t - for encoding, result in sorted(ranking.items(), key=lambda x: -x[1][0]): + for encoding, result in sorted(ranking.items(), key=lambda x: (-x[1][0], x[0])): yield result if yield_score else result[1], encoding @@ -1403,7 +1403,7 @@ def __score(prev_input, input, prev_encoding, encoding, codec, heuristic=False, entr = entr(obj.entropy) if entr is not None: # use a quadratic heuristic to compute a weight for the entropy delta, aligned on (256,.2) and (512,1) - d_entr = min(3.04575e-06 * obj.len**2 + .000394 * obj.len, 1) * abs(entr - entropy(new_input)) + d_entr = min(3.04575e-06 * obj.len**2 + .000394 * obj.len, 1) * abs(entr - obj.entropy) if d_entr <= .5: s += .5 - d_entr # finally, if relevant, apply a custom bonus (e.g. when a regex pattern is matched) diff --git a/tests/test_common.py b/tests/test_common.py index 934155f..8bbf410 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -211,7 +211,7 @@ def test_rank_input(self): ENC = codext.encode(STR, "base64") self.assertTrue(len(codext.rank(ENC)) > 20) self.assertEqual(len(codext.rank(ENC, limit=20)), 20) - self.assertIn(codext.rank(ENC, exclude=["rot"])[0][1], ["base64", "base64-url"]) + self.assertIn(codext.rank(ENC, exclude=["rot"])[0][1], ["base64", "base64-url", "base64-inv"]) self.assertEqual(codext.rank(ENC, include="base")[0][0][1], STR) self.assertEqual(codext.rank(ENC, include=["base"])[0][0][1], STR) self.assertIsNotNone(codext.rank(ENC, include=["base"], exclude=["does_not_exist"])[0][0][1], STR) From 3d7f43dea12c40e330b6be8b8fb5011d3c8ee13b Mon Sep 17 00:00:00 2001 From: dhondta Date: Tue, 29 Mar 2022 08:33:03 +0200 Subject: [PATCH 28/32] New release --- codext/VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codext/VERSION.txt b/codext/VERSION.txt index 01b7568..80138e7 100644 --- a/codext/VERSION.txt +++ b/codext/VERSION.txt @@ -1 +1 @@ -1.13.3 +1.13.4 From 281ca1bacbbc0891f7e2987bbbd161f507823bd3 Mon Sep 17 00:00:00 2001 From: dhondta Date: Wed, 7 Sep 2022 20:36:19 +0200 Subject: [PATCH 29/32] Added codec: tokenize --- codext/common/dummy.py | 12 +++++++++++- docs/manipulations.md | 10 +++++++++- tests/test_manual.py | 2 ++ 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/codext/common/dummy.py b/codext/common/dummy.py index 7f4be19..b45c023 100755 --- a/codext/common/dummy.py +++ b/codext/common/dummy.py @@ -22,7 +22,7 @@ def code(input, errors="strict"): # important note: ^ # using "{2}" here instead will break the codec # this is due to the fact the codext.__common__.generate_string_from_regex DOES NOT handle ASSERT_NOT (?!) and will -# faill to generate a valid instance in lookup(...) when an encoding name is to be generated to get the CodecInfo +# fail to generate a valid instance in lookup(...) when an encoding name is to be generated to get the CodecInfo def substitute(token, replacement): @@ -45,3 +45,13 @@ def code(input, errors="strict"): strip_spaces = lambda i, e="strict": (i.replace(" ", ""), len(i)) add("strip-spaces", strip_spaces, strip_spaces, guess=None) +def tokenize(n): + tlen = int(n[8:].lstrip("-_")) + def code(input, errors="strict"): + l = len(input) + if tlen > l: + raise LookupError("unknown encoding: %s" % n) + return " ".join(input[i:i+tlen] for i in range(0, l, tlen)), l + return code +add("tokenize", tokenize, tokenize, r"^(tokenize[-_]?[1-9][0-9]*)$", guess=None) + diff --git a/docs/manipulations.md b/docs/manipulations.md index 7962278..8857ca7 100644 --- a/docs/manipulations.md +++ b/docs/manipulations.md @@ -43,11 +43,12 @@ These transformation functions are simple string transformations. **Codec** | **Conversions** | **Aliases** | **Comment** :---: | :---: | --- | --- -`replace` | text <-> text with single-char replaced | | +`replace` | text <-> text with multi-chars replaced | | parametrized with a _string_ and its _replacement_ `reverse` | text <-> reversed text | | `reverse-words` | text <-> reversed words | | same as `reverse` but not on the whole text, only on the words (text split by whitespace) `strip-spaces` | text <-> all whitespaces stripped | | `substitute` | text <-> text with token substituted | | +`tokenize` | text <-> text split in tokens of length N | | parametrized with _N_ As in the previous section, these transformations have no interest while using them in Python but well while using `codext` from the terminal (see [*CLI tool*](cli.html)). @@ -58,6 +59,13 @@ $ echo -en "test string" | codext encode reverse-words | codext encode reverse r string_test ``` +Another example: + +```sh +$ echo -en "3132333435" | codext encode tokenize-2 +31 32 33 34 35 +``` + Or using encodings chaining: ```sh diff --git a/tests/test_manual.py b/tests/test_manual.py index 4211df7..64b1843 100644 --- a/tests/test_manual.py +++ b/tests/test_manual.py @@ -100,6 +100,8 @@ def test_codec_dummy_str_manips(self): self.assertEqual(codecs.decode(STR.replace("i", "1"), "replace-1i"), STR) self.assertEqual(codecs.encode(STR, "substitute-this/that"), STR.replace("this", "that")) self.assertEqual(codecs.decode(STR.replace("this", "that"), "substitute-that/this"), STR) + self.assertEqual(codecs.encode(STR, "tokenize-2"), "th is i s a te st") + self.assertRaises(LookupError, codecs.encode, STR, "tokenize-200") def test_codec_hash_functions(self): STR = b"This is a test string!" From 4792a99b3a3780765b80c68f0bbcb46da27a2f7b Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 11 Sep 2022 19:13:41 +0200 Subject: [PATCH 30/32] Fixed minor bugs --- codext/__common__.py | 26 +++++++++++++++----------- tests/test_generated.py | 12 +++++++++--- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/codext/__common__.py b/codext/__common__.py index 41cb5b2..9d9400c 100644 --- a/codext/__common__.py +++ b/codext/__common__.py @@ -109,10 +109,11 @@ def __new__(cls, name): for action, examples in (self.codecs[0].parameters.get('examples', {}) or {'enc-dec(': ["T3st str!"]}).items(): if re.match(r"enc(-dec)?\(", action): for e in (examples.keys() if action.startswith("enc(") else examples or []): - rd = re.match(r"\@random(?:\{(\d+(?:,(\d+))*?)\})?$", e) + rd = re.match(r"\@(i?)random(?:\{(\d+(?:,(\d+))*?)\})?$", e) if rd: - for n in (rd.group(1) or "512").split(","): - self.encode("".join(chr(randint(0, 255)) for i in range(int(n)))) + for n in (rd.group(2) or "512").split(","): + s = "".join(chr(randint(0, 255)) for i in range(int(n))) + self.encode(s.lower() if rd.group(1) else s) continue self.encode(e) @@ -1276,10 +1277,9 @@ def __make_encodings_dict(include, exclude): def _develop(d, keep=True): d = d or {} for k, v in d.items(): - l, cc = [], [e for e in v if e in CODECS_CATEGORIES] + l, cc, sc = [], [e for e in v if e in CODECS_CATEGORIES], [e for e in v if e not in CODECS_CATEGORIES] # list from in-scope categories and then everything that is not a category - for enc in ((list_encodings(*cc) if len(cc) > 0 or keep else []) + \ - [e for e in v if e not in CODECS_CATEGORIES]): + for enc in ((list_encodings(*cc) if (len(cc) > 0 or keep) and len(sc) == 0 else []) + sc): g = [] for e in (search(enc, False) or [enc]): try: @@ -1293,8 +1293,8 @@ def _develop(d, keep=True): l.extend(g) d[k] = list(set(l)) return d - exclude = _develop(exclude, False) - return {k: [x for x in v if x not in exclude.get(k, [])] for k, v in _develop(include).items()} + _excl, _incl = _develop(exclude, False), _develop(include) + return {k: [x for x in v if x not in _excl.get(k, [])] for k, v in _incl.items()} def __rank(prev_input, input, prev_encoding, encodings, heuristic=False, extended=False, yield_score=False): @@ -1304,7 +1304,10 @@ def __rank(prev_input, input, prev_encoding, encodings, heuristic=False, extende try: codec = CODECS_CACHE[e] except KeyError: - CODECS_CACHE[e] = codec = lookup(e, False) + try: + CODECS_CACHE[e] = codec = lookup(e, False) + except LookupError: + continue t = __score(prev_input, input, prev_encoding, e, codec, heuristic, extended) if t: ranking[e] = t @@ -1321,7 +1324,7 @@ def __init__(self, text, pad_char=None): pad_char, last_char = (chr(pad_char), chr(c)) if isinstance(c, int) else (pad_char, c) self.padding = pad_char is not None and last_char == pad_char if self.padding: - text = text.rstrip(pad_char) + text = text.rstrip(b(pad_char) if isinstance(text, bytes) else pad_char) self.len = len(self.text) self.lcharset = len(set(self.text)) self.printables = float(len([c for c in self.text if c in printable])) / self.len @@ -1501,7 +1504,8 @@ def rank(input, extended=False, limit=-1, include=None, exclude=None): :param include: inclusion list with category, codec or encoding names (nothing means include every encoding) :param exclude: exclusion list with category, codec or encoding names (nothing means exclude no encoding) """ - encodings = __make_encodings_dict({-1: include or CODECS_CATEGORIES}, {-1: exclude or []}) + encodings = __make_encodings_dict(include if isinstance(include, dict) else {-1: include or CODECS_CATEGORIES}, + exclude if isinstance(exclude, dict) else {-1: exclude or []}) r = list(__rank(None, input, "", encodings[-1], True, extended, True)) return r[:limit] if len(r) > 1 else r codecs.rank = rank diff --git a/tests/test_generated.py b/tests/test_generated.py index 6b89129..614562f 100644 --- a/tests/test_generated.py +++ b/tests/test_generated.py @@ -36,6 +36,11 @@ def _template(self): for ename in m.groups(): if ename is None: continue + # buggy generated encoding names + try: + lookup(ename) + except LookupError: + continue # erroneous encoding name test if examples is None: self.assertRaises(LookupError, f1, "test", ename) @@ -72,11 +77,12 @@ def _template(self): # examples validation tests if k.startswith("enc-dec") and isinstance(examples, list): for e in examples[:]: - rd = re.match(r"\@random(?:\{(\d+(?:,(\d+))*?)\})?$", e) + rd = re.match(r"\@(i?)random(?:\{(\d+(?:,(\d+))*?)\})?$", e) if rd: examples.remove(e) - for n in (rd.group(1) or "512").split(","): - examples.append("".join(chr(randint(0, 255)) for i in range(int(n)))) + for n in (rd.group(2) or "512").split(","): + s = "".join(chr(randint(0, 255)) for i in range(int(n))) + examples.append(s.lower() if rd.group(1) else s) for s in [""] + examples: self.assertEqual(icdec(f2(icenc(f1(s, ename)), ename)), icdec(s)) self.assertEqual(icdec(f2(icenc(f1(b(s), ename)), ename)), b(icdec(s))) From b4e1eb66fb8764df992cc6434f0e69a6eedbd9b5 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 11 Sep 2022 19:13:57 +0200 Subject: [PATCH 31/32] Added codec: kbshift --- codext/others/__init__.py | 1 + codext/others/kbshift.py | 66 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100755 codext/others/kbshift.py diff --git a/codext/others/__init__.py b/codext/others/__init__.py index aa7ffa2..3bbf102 100755 --- a/codext/others/__init__.py +++ b/codext/others/__init__.py @@ -1,5 +1,6 @@ # -*- coding: UTF-8 -*- from .dna import * +from .kbshift import * from .letters import * from .markdown import * from .uuencode import * diff --git a/codext/others/kbshift.py b/codext/others/kbshift.py new file mode 100755 index 0000000..2bd0991 --- /dev/null +++ b/codext/others/kbshift.py @@ -0,0 +1,66 @@ +# -*- coding: UTF-8 -*- +"""Keyboard-Shift Codec - keyboard line shifting content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +LAYOUTS = { + 'ansi': "~!@#$%^&*()_+\n`1234567890-=\nqwertyuiop{}|\n[]\\\nasdfghjkl:\"\n;'\nzxcvbnm<>\n,./", + 'azerty': "azertyuiop\nqsdfghjklm\nwxcvbn", + 'azerty-be': "³1234567890°_\n²&é\"'(§è!çà)-\n|@#^{}\nazertyuiop$\n€[]\n¨*\nqsdfghjklm%£\nùµ\n´`\n>wxcvbn?./+\n<,;:=\n\\~", + 'azerty-fr': "1234567890°+\n²&é\"'(-è_çà)=\n~#{[|`\\^@]}\nazertyuiop¨£\nqsdfghjklm%µ\nù*\n>wxcvbn?./§\n<,;:!", + 'dvorak': "~!@#$%^&*(){}\n`1234567890[]\n\"<>pyfgcrl?+|\n',./=\\\naoeuidhtns_\n-\n:qjkxbmwvz\n;", + 'qwerty': "qwertyuiop\nasdfghjkl\nzxcvbnm", + 'qwerty-us': "~!@#$%^&*()_+\n`1234567890-=\nqwertyuiop{}|\n[]\\\nasdfghjkl:\"\n;,\nzxcvbnm<>?\n./", +} +__per_len = {} +for k, s in LAYOUTS.items(): + i = max(map(len, s.split("\n"))) + __per_len.setdefault(i, []) + __per_len[i].append(k) + + +__examples__ = {"enc-dec(kbshift_%s_%d)" % (kb, n): ["@irandom{256,512}"] for n in range(10) for kb in LAYOUTS.keys()} +__guess__ = [] +for mlen, kbs in __per_len.items(): + for k in kbs: + __guess__.extend(["kbshift-%s-%d" % (k, i+1) for i in range(mlen)]) + + +def _kbshift(text, keyboard="azerty", n=1, decode=False): + r = "" + for c in text: + nc = None + for l in LAYOUTS[keyboard].splitlines(): + if c.lower() in l: + nc = l[(l.index(c.lower()) + [-1, 1][decode] * n) % len(l)] + break + r += c if nc is None else nc + return r + + +def kbshift_encode(scheme): + kb, shift = re.match(r"^(.*?)[-_]?(\d+)$", scheme or "azerty-1").groups() + def encode(text, errors="strict"): + r = _kbshift(ensure_str(text), kb, int(shift)) + return r, len(r) + return encode + + +def kbshift_decode(scheme): + kb, shift = re.match(r"^(.*?)[-_]?(\d+)$", scheme or "azerty-1").groups() + def decode(text, errors="strict"): + r = _kbshift(ensure_str(text), kb, int(shift), True) + return r, len(r) + return decode + + +add("kbshift", kbshift_encode, kbshift_decode, entropy=lambda e: e,printables_rate=lambda pr: pr, transitive=True, + pattern=r"^kbshift(?:|[-_]((?:az|qw)erty[-_]?[1-9]|(?:ansi|azerty-(?:be|fr)|dvorak|qwerty-us)[-_]?(?:[1-9]|1[0-2])))$") + From cd234d5d97867f1470b45499694f3776aa74569b Mon Sep 17 00:00:00 2001 From: dhondta Date: Mon, 12 Sep 2022 21:53:13 +0200 Subject: [PATCH 32/32] New release --- codext/VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codext/VERSION.txt b/codext/VERSION.txt index 80138e7..850e742 100644 --- a/codext/VERSION.txt +++ b/codext/VERSION.txt @@ -1 +1 @@ -1.13.4 +1.14.0