Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
b186a96
Fixed bug in guess mode
dhondta Feb 22, 2022
b325314
Fixed minor issues
dhondta Feb 22, 2022
3cf1f5d
Refined tests
dhondta Feb 22, 2022
cb66568
New release
dhondta Feb 22, 2022
544e1cc
Fixed multiple base codecs
dhondta Feb 26, 2022
95f4b80
New release
dhondta Feb 26, 2022
f8bd7b7
Fixed codec: shift
dhondta Feb 27, 2022
d9cc79a
Fixed codec: scytale
dhondta Feb 27, 2022
1e31eab
Fixed bug in base
dhondta Feb 27, 2022
d09dd0b
Improved unbase tool
dhondta Feb 27, 2022
0d13231
Improved codext tool
dhondta Feb 27, 2022
ad01045
Improved guess performance
dhondta Feb 27, 2022
c79e2bd
Fixed codec: baudot
dhondta Feb 27, 2022
3076c9f
New release
dhondta Feb 27, 2022
803e211
Added codec: base11
dhondta Feb 28, 2022
b37e8a1
Improved docs about crypto
dhondta Feb 28, 2022
d64c4f3
Applied minor improvement
dhondta Feb 28, 2022
e309215
New release
dhondta Feb 28, 2022
df8ff0f
Fixed codec: uu
dhondta Mar 12, 2022
96239cd
New release
dhondta Mar 12, 2022
837e91a
New release
dhondta Mar 28, 2022
564aa5a
Refactored codec: uu
dhondta Mar 28, 2022
fb292e9
Improved guessing and ranking
dhondta Mar 28, 2022
b4a2950
Refined case codecs
dhondta Mar 28, 2022
224b2d0
Refined tests/test_common
dhondta Mar 28, 2022
dcbeba1
Fixed scoring for compression codecs
dhondta Mar 28, 2022
a1b41fa
Fixed minor issues
dhondta Mar 29, 2022
3d7f43d
New release
dhondta Mar 29, 2022
281ca1b
Added codec: tokenize
dhondta Sep 7, 2022
4792a99
Fixed minor bugs
dhondta Sep 11, 2022
b4e1eb6
Added codec: kbshift
dhondta Sep 11, 2022
cd234d5
New release
dhondta Sep 12, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion .coveragerc
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ exclude_lines =
def main\(\)\:
def __stdin_pipe\(\)\:
for line in __stdin_pipe\(\)\:
def __literal_eval\(o\)\:
def __format_list\(items, include\=True\)\:
def __print_tabular\(lst, space\=4\)\:
except ImportError:
except NameError:
Expand All @@ -20,3 +20,8 @@ exclude_lines =
if PY3
def encode\(self, input, final\=False\)\:
def decode\(self, input, final\=False\)\:
def _detect\(text\)\:
def _lang\(lang\)\:
if stopfunc\.LANG_BACKEND\:
def _validate\(stop_function, lang_backend\=\"none\"\)\:
except KeyboardInterrupt\:
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,7 @@ o
- [X] `base4`: conversion to quaternary (with a variant with a reversed alphabet)
- [X] `base8`: simple conversion to octal (with a variant with a reversed alphabet)
- [X] `base10`: simple conversion to decimal
- [X] `base11`: conversion to digits with a "*a*"
- [X] `base16`: simple conversion to hexadecimal (with a variant holding an alphabet with digits and letters inverted)
- [X] `base26`: conversion to alphabet letters
- [X] `base32`: classical conversion according to the RFC4648 with all its variants ([zbase32](https://philzimmermann.com/docs/human-oriented-base-32-encoding.txt), extended hexadecimal, [geohash](https://en.wikipedia.org/wiki/Geohash), [Crockford](https://www.crockford.com/base32.html))
Expand Down
2 changes: 1 addition & 1 deletion codext/VERSION.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.12.2
1.14.0
465 changes: 264 additions & 201 deletions codext/__common__.py

Large diffs are not rendered by default.

126 changes: 69 additions & 57 deletions codext/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,18 +27,23 @@
reset()


# overwritten native codec
add("uu", lambda i, e="strict": orig_lookup("uu").encode(b(i), e),
lambda i, e="strict": orig_lookup("uu").decode(b(i), e),
pattern=r"^uu(?:[-_]encode|codec)?$", add_to_codecs=True, category="native")


def __literal_eval(o):
    """ Non-failing ast.literal_eval alias function.

    :param o: object whose string form is evaluated as a Python literal
    :return:  the evaluated literal or, when str(o) is not a valid literal, its plain string form
    """
    s = str(o)
    try:
        return literal_eval(s)
    except (ValueError, SyntaxError):
        # not a literal on its own (e.g. a bare name, or text holding a space) ; retry it as a quoted string
        pass
    try:
        return literal_eval("'" + s + "'")
    except (ValueError, SyntaxError):
        # quoting failed too (e.g. the input itself holds a quote) ; hand back the raw string instead of raising
        return s
def __format_list(items, include=True):
    """ Group command-line items per search depth.

    Each item is either "name" or "depth:name" ; in the depth part, "~" stands for a minus sign and any negative
     depth is folded into the key -1. Items without a usable depth also go under the key -1.

    :param items:   list of item strings or None
    :param include: when True, the -1 entry starts from list_encodings() ; otherwise it starts empty
    :return:        None if items is None, else a dictionary mapping depth to the list of stripped names
    """
    if items is None:
        return None
    by_depth = {-1: list_encodings() if include else []}
    for index, raw in enumerate(items):
        level, name = -1, raw
        try:
            # unpacking fails (ValueError) unless there is exactly one colon, leaving name untouched
            prefix, name = raw.split(":")
            # a non-numeric prefix fails here, after name was already reduced to the part behind the colon
            level = max(int(prefix.strip().replace("~", "-")), -1)
        except ValueError:
            # a first item given without a valid depth resets the -1 entry
            if index == 0:
                by_depth[-1] = []
        by_depth.setdefault(level, []).append(name.strip())
    return by_depth


def __print_tabular(lst, space=4):
Expand Down Expand Up @@ -70,6 +75,19 @@ def __print_tabular(lst, space=4):

def main():
import argparse, os

class _CustomFormatter(argparse.RawTextHelpFormatter):
    """ Raw-text help formatter with a fixed help position and a compact option display. """

    def __init__(self, prog, **kwargs):
        # always override the help column, whatever the caller passed
        kwargs['max_help_position'] = 32
        super(_CustomFormatter, self).__init__(prog, **kwargs)

    def _format_action_invocation(self, action):
        """ Return the invocation text of an action: the option strings joined with ", " for an option, or the
             metavar for a positional argument. """
        if action.option_strings:
            return ", ".join(action.option_strings)
        # positional argument: a single metavar is expected from the formatter
        metavar, = self._metavar_formatter(action, action.dest)(1)
        return metavar

descr = "Codecs Extension (CodExt) {}\n\nAuthor : {} ({})\nCopyright: {}\nLicense : {}\nSource : {}\n" \
"\nThis tool allows to encode/decode input strings/files with an extended set of codecs.\n\n" \
.format(__version__, __author__, __email__, __copyright__, __license__, __source__)
Expand All @@ -87,78 +105,79 @@ def main():
"echo -en \"test\" | codext encode base64 gzip | codext guess",
"echo -en \"test\" | codext encode base64 gzip | codext guess gzip -c base",
])
parser = argparse.ArgumentParser(description=descr, epilog=examples, formatter_class=argparse.RawTextHelpFormatter)
sparsers = parser.add_subparsers(dest="command", help="command to be executed")
kw = {'formatter_class': _CustomFormatter}
parser = argparse.ArgumentParser(description=descr, epilog=examples, **kw)
kw2 = {'required': True} if PY3 else {}
sparsers = parser.add_subparsers(dest="command", help="command to be executed", **kw2)
parser.add_argument("-i", "--input-file", dest="infile", help="input file (if none, take stdin as input)")
parser.add_argument("-o", "--output-file", dest="outfile", help="output file (if none, display result to stdout)")
parser.add_argument("-s", "--strip-newlines", action="store_true", dest="strip",
help="strip newlines from input (default: False)")
encode = sparsers.add_parser("encode", help="encode input using the specified codecs")
encode = sparsers.add_parser("encode", help="encode input using the specified codecs", **kw)
encode.add_argument("encoding", nargs="+", help="list of encodings to apply")
encode.add_argument("-e", "--errors", default="strict", choices=["ignore", "leave", "replace", "strict"],
help="error handling (default: strict)")
decode = sparsers.add_parser("decode", help="decode input using the specified codecs")
decode = sparsers.add_parser("decode", help="decode input using the specified codecs", **kw)
decode.add_argument("encoding", nargs="+", help="list of encodings to apply")
decode.add_argument("-e", "--errors", default="strict", choices=["ignore", "leave", "replace", "strict"],
help="error handling (default: strict)")
guess = sparsers.add_parser("guess", help="try guessing the decoding codecs")
guess = sparsers.add_parser("guess", help="try guessing the decoding codecs", **kw)
guess.add_argument("encoding", nargs="*", help="list of known encodings to apply (default: none)")
guess.add_argument("-c", "--codec-categories", nargs="*", help="codec categories to be included in the search ; "
"format: string|tuple")
guess.add_argument("-d", "--min-depth", default=0, type=int, help="minimum codec search depth before triggering "
"results (default: 0)")
guess.add_argument("-D", "--max-depth", default=5, type=int, help="maximum codec search depth (default: 5)")
guess.add_argument("-e", "--exclude-codecs", nargs="*", help="codecs to be explicitely not used ; "
"format: string|tuple")
guess.add_argument("-e", "--exclude", nargs="*", action="extend", metavar="CAT|COD|ENC",
help="categories, codecs and encodings to be explicitely not used ;\n "
"format: [category|codec|encoding] OR depth:[category|codec|encoding]")
guess.add_argument("-E", "--extended", action="store_true",
help="while using the scoring heuristic, also consider null scores (default: False)")
lng = "lang_%s" % LANG
def_func = lng if getattr(stopfunc, lng, None) else "text"
guess.add_argument("-f", "--stop-function", default=def_func, help="result checking function (default: %s) ; "
"format: printables|text|flag|lang_[bigram]|[regex]\nNB: [regex] is case-sensitive ; add -i to "
"force it as case-insensitive or add '(?i)' in front of the expression" % def_func)
guess.add_argument("-i", "--case-insensitive", dest="icase", action="store_true",
help="while using the regex stop function, set it as case-insensitive (default: False)")
guess.add_argument("-f", "--stop-function", default=def_func, metavar="FUNC", help="result checking function "
"(default: %s) ; format: printables|text|flag|lang_[bigram]|[regex]\nNB: [regex] is case-"
"sensitive ; add -i to force it as case-insensitive or add '(?i)' in front of the expression"
% def_func)
guess.add_argument("-H", "--no-heuristic", action="store_true", help="DO NOT use the scoring heuristic ; slows down"
" the search but may be more accurate (default: False)")
guess.add_argument("-i", "--include", nargs="*", action="extend", metavar="CAT|COD|ENC",
help="categories, codecs and encodings to be explicitely used ;\n "
"format: [category|codec|encoding] OR depth:[category|codec|encoding]")
guess.add_argument("-I", "--case-insensitive", dest="icase", action="store_true",
help="while using the regex stop function, set it as case-insensitive (default: False)")
if len(stopfunc.LANG_BACKENDS) > 0:
_lb = stopfunc.LANG_BACKEND
guess.add_argument("-l", "--lang-backend", default=_lb, choices=stopfunc.LANG_BACKENDS + ["none"],
help="natural language detection backend (default: %s)" % _lb)
guess.add_argument("-m", "--min-depth", default=0, type=int, metavar="INT",
help="minimum codec search depth before triggering results (default: 0)")
guess.add_argument("-M", "--max-depth", default=5, type=int, metavar="INT",
help="maximum codec search depth (default: 5)")
guess.add_argument("-s", "--do-not-stop", action="store_true",
help="do not stop if a valid output is found (default: False)")
guess.add_argument("-v", "--verbose", action="store_true",
help="show guessing information and steps (default: False)")
rank = sparsers.add_parser("rank", help="rank the most probable encodings based on the given input")
rank.add_argument("-c", "--codec-categories", help="codec categories to be included in the search ; "
"format: string|tuple|list(strings|tuples)")
rank.add_argument("-e", "--exclude-codecs", help="codecs to be explicitely not used ; "
"format: string|tuple|list(strings|tuples)")
rank = sparsers.add_parser("rank", help="rank the most probable encodings based on the given input", **kw)
rank.add_argument("-e", "--exclude", nargs="*", action="extend", metavar="CAT|COD|ENC",
help="categories, codecs and encodings to be explicitely not used ;\n "
"format: [category|codec|encoding] OR depth:[category|codec|encoding]")
rank.add_argument("-E", "--extended", action="store_true",
help="while using the scoring heuristic, also consider null scores (default: False)")
rank.add_argument("-i", "--include", nargs="*", action="extend", metavar="CAT|COD|ENC",
help="categories, codecs and encodings to be explicitely used ;\n "
"format: [category|codec|encoding] OR depth:[category|codec|encoding]")
rank.add_argument("-l", "--limit", type=int, default=10, help="limit the number of displayed results")
search = sparsers.add_parser("search", help="search for codecs")
search.add_argument("pattern", nargs="+", help="encoding pattern to search")
listi = sparsers.add_parser("list", help="list items")
lsparsers = listi.add_subparsers(dest="type", help="type of item to be listed")
lsparsers = listi.add_subparsers(dest="type", help="type of item to be listed", **kw2)
liste = lsparsers.add_parser("encodings", help="list encodings")
liste.add_argument("category", nargs="*", help="selected categories")
liste.add_argument("category", nargs="+", help="selected categories")
listm = lsparsers.add_parser("macros", help="list macros")
addm = sparsers.add_parser("add-macro", help="add a macro to the registry")
addm.add_argument("name", help="macro's name")
addm.add_argument("encoding", nargs="+", help="list of encodings to chain")
remm = sparsers.add_parser("remove-macro", help="remove a macro from the registry")
remm.add_argument("name", help="macro's name")
args = parser.parse_args()
try:
args.codec_categories = _lst(map(__literal_eval, args.codec_categories))
except (AttributeError, TypeError):
pass
try:
args.exclude_codecs = _lst(map(__literal_eval, args.exclude_codecs))
except (AttributeError, TypeError):
pass
#print(args.codec_categories, args.exclude_codecs)
if args.command in ["guess", "rank"]:
args.include, args.exclude = __format_list(args.include), __format_list(args.exclude, False)
try:
# if a search pattern is given, only handle it
if args.command == "search":
Expand Down Expand Up @@ -211,17 +230,9 @@ def main():
all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)):
stopfunc._reload_lang(lb)
r = codecs.guess(c,
getattr(stopfunc, s, ["", "(?i)"][args.icase] + s),
args.min_depth,
args.max_depth,
args.codec_categories,
args.exclude_codecs,
args.encoding,
not args.do_not_stop,
True, # show
not args.no_heuristic,
args.extended,
args.verbose)
getattr(stopfunc, s, ["", "(?i)"][args.icase] + s), args.min_depth, args.max_depth,
args.include, args.exclude, args.encoding, not args.do_not_stop, True, # show
not args.no_heuristic, args.extended, args.verbose)
for i, o in enumerate(r.items()):
e, out = o
if len(e) > 0:
Expand All @@ -234,10 +245,11 @@ def main():
if len(r) == 0:
print("Could not decode :-(")
elif args.command == "rank":
for i, e in codecs.rank(c, args.extended, args.limit, args.codec_categories, args.exclude_codecs):
for i, e in codecs.rank(c, args.extended, args.limit, args.include, args.exclude):
s = "[+] %.5f: %s" % (i[0], e)
print(s if len(s) <= 80 else s[:77] + "...")
except Exception as e:
raise e
m = str(e)
print("codext: " + m[0].lower() + m[1:])

21 changes: 8 additions & 13 deletions codext/base/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def main():
With no FILE, or when FILE is -, read standard input.

Optional arguments:
-e, --extended also consider generic base codecs while guess-decoding
-E, --extended also consider generic base codecs while guess-decoding
-f, --stop-function set the result chceking function (default: text)
format: printables|text|flag|lang_[bigram]
-M, --max-depth maximum codec search depth (default: 5)
Expand All @@ -36,28 +36,23 @@ def main():
"""
parser = ArgumentParser(description=descr, formatter_class=RawTextHelpFormatter, add_help=False)
parser.format_help = MethodType(lambda s: s.description, parser)
group = parser.add_mutually_exclusive_group()
parser.add_argument("file", nargs="?")
parser.add_argument("-e", "--extended", action="store_true")
parser.add_argument("-f", "--stop-function", default="text")
parser.add_argument("-E", "--extended", action="store_true")
group.add_argument("-f", "--stop-function", default="text")
parser.add_argument("-M", "--max-depth", type=int, default=10)
parser.add_argument("-m", "--min-depth", type=int, default=0)
parser.add_argument("-p", "--pattern")
group.add_argument("-p", "--pattern")
parser.add_argument("-s", "--show", action="store_true")
parser.add_argument("--help", action="help")
parser.add_argument("--version", action="version")
parser.add_argument("--verbose", action="store_true")
parser.version = "CodExt " + __version__
args = parser.parse_args()
excl, s = [["base%d-generic" % i for i in range(2, 256)], []][args.extended], args.stop_function
if re.match(r"lang_[a-z]{2}$", s) and all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)):
stopfunc._reload_lang(stopfunc.LANG_BACKEND)
#TODO: validate args.stop_function
#TODO: make --stop-function and --pattern mutually exclusive
sfunc = getattr(stopfunc, s, s)
c = _input(args.file)
c, e = _input(args.file), [["base%d-generic" % i for i in range(2, 256)], []][args.extended]
c = c.rstrip("\r\n") if isinstance(c, str) else c.rstrip(b"\r\n")
r = codecs.guess(c, sfunc, 0, args.max_depth, exclude=tuple(excl), codec_categories="base",
stop=False, show=args.verbose, scoring_heuristic=False, debug=args.verbose)
r = codecs.guess(c, stopfunc._validate(args.stop_function), 0, args.max_depth, "base", tuple(e), stop=False,
show=args.verbose, debug=args.verbose)
if len(r) == 0:
print("Could not decode :-(")
return 0
Expand Down
6 changes: 5 additions & 1 deletion codext/base/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,11 @@ def _decode(input, errors="strict"):
kwargs['printables_rate'] = float(len([c for c in cs if c in printable])) / len(cs)
kwargs['expansion_factor'] = kwargs.pop('expansion_factor', (EXPANSION_FACTOR(n), .05))
n = "base{}".format(n) if name is None else name
kwargs['guess'] = kwargs.get('guess', [n])
try:
g = [n, n + "-inv"] if "[-_]inv(erted)?$" in charset.keys() else [n]
except AttributeError:
g = [n]
kwargs['guess'] = kwargs.get('guess', g)
add(n, encode, decode, pattern, entropy=nb, **kwargs)


Expand Down
2 changes: 1 addition & 1 deletion codext/base/base100.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def base100_encode(input, errors="strict"):
return bytes(r), len(input)

def base100_decode(input, errors="strict"):
input = b(input)
input = b(_stripl(input, True, True))
if errors == "ignore":
input = input.replace(b"\n", "")
if len(input) % 4 != 0:
Expand Down
4 changes: 2 additions & 2 deletions codext/base/base122.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,9 +98,9 @@ def _get_7bits(currB, bob, B, decoded):
currB, bob = _get_7bits(currB, bob, input[i] & 127, r)
else:
currB, bob = _get_7bits(currB, bob, input[i], r)
return "".join(map(chr, r)), len(input)
return "".join(map(chr, r)).rstrip("\0"), len(input)


add("base122", base122_encode, base122_decode, r"^base[-_]?122$", expansion_factor=1.085)
main122 = main(122, "<http://blog.kevinalbs.com/base122>")
main122 = main(122, "<http://blog.kevinalbs.com/base122>", wrap=False)

4 changes: 2 additions & 2 deletions codext/base/base91.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def encode(text, errors="strict"):
def base91_decode(mode):
b91 = {c: i for i, c in enumerate(_get_charset(B91, mode))}
def decode(text, errors="strict"):
t, s, bits, alt = b(text), "", "", re.search(r'[-_]alt(ernate)?$', mode) is not None
t, s, bits, alt = b(_stripl(text, True, True)), "", "", re.search(r'[-_]alt(ernate)?$', mode) is not None
ehandler = handle_error("base91", errors, decode=True)
for i in range(0, len(t), 2):
try:
Expand Down Expand Up @@ -103,7 +103,7 @@ def decode(text, errors="strict"):
bits = bits[8:]
elif not alt and len(bits) > 0 and not set(bits) == {"0"}:
s += chr(int(bits, 2))
return s, len(t)
return s.rstrip("\0"), len(t)
return decode


Expand Down
Loading