/
hexload.py
113 lines (100 loc) · 4.64 KB
/
hexload.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import inspect
import operator
import re
from typing import Dict, List, Type
from refinery.units.sinks import HexViewer
from refinery.lib.patterns import make_hexline_pattern
def regex(cls: Type) -> re.Pattern:
    """
    Class decorator that turns the docstring of the decorated class into a compiled pattern.

    The name bound by the decorated ``class`` statement ends up referring to the compiled
    regular expression rather than to a class. The docstring is read via `inspect.getdoc`,
    which removes the common leading indentation, so the pattern text can be indented like
    any other docstring and inline flags on its first line remain at the very start of the
    compiled expression.

    Raises:
        TypeError: if no docstring is available for `cls`, for example because docstrings
            have been stripped by the interpreter.
    """
    pattern = inspect.getdoc(cls)
    if pattern is None:
        # re.compile(None) would also raise a TypeError, but with a message that does not
        # mention the docstring at all; fail with the actual cause instead.
        raise TypeError(F'cannot build a pattern from {cls!r}: it has no docstring')
    return re.compile(pattern)
class hexload(HexViewer):
    """
    Convert hex dumps back to the original data and vice versa. All options of this unit apply
    to its reverse operation where binary data is converted to a readable hexdump format.
    The default mode of the unit expects the input data to contain a readable hexdump and
    converts it back to binary.
    """

    # The docstring of the class below is not documentation, it is the pattern source: the
    # `regex` decorator compiles it, so `_ENCODED_BYTES` is a compiled pattern and not a class.
    # One match covers one run of hex-encoded bytes that is delimited by whitespace or by the
    # boundaries of the searched string.
    @regex
    class _ENCODED_BYTES:
        R"""
        (?ix)(?:^|(?<=\s))                      # encoded byte patches must be prefixed by white space
        (?:
            (?:                                 # separated chunks of hex data
                [a-f0-9]{2}                     # hexadecimal chunk; single byte (two hexadecimal letters)
                \s{1,2}                         # encoded byte followed by whitespace
                (?:                             # at least one more encoded byte
                    [a-f0-9]{2}                 # followed by more encoded bytes
                    (?:\s{1,2}[a-f0-9]{2})*     # unless it was just a single byte
                )?
            )
            | (?:[a-f0-9]{4}\s{1,2}             # 2-byte chunks
                (?:[a-f0-9]{4}
                (?:\s{1,2}[a-f0-9]{4})*)?)
            | (?:[a-f0-9]{8}\s{1,2}             # 4-byte chunks
                (?:[a-f0-9]{8}
                (?:\s{1,2}[a-f0-9]{8})*)?)
            | (?:(?:[a-f0-9]{2})+)              # continuous line of hexadecimal characters
        )(?=\s|$)                               # terminated by a whitespace or line end
        """

    def __init__(self, blocks=1, dense=False, expand=False, narrow=False, width=0):
        # All options are forwarded unchanged to the HexViewer base class.
        super().__init__(blocks=blocks, dense=dense, expand=expand, narrow=narrow, width=width)
        # NOTE(review): this pattern is not referenced by any other method of this class; it is
        # kept because code outside of this class may rely on the attribute - confirm.
        self._hexline_pattern = re.compile(F'{make_hexline_pattern(1)}(?:[\r\n]|$)', flags=re.MULTILINE)

    def process(self, data: bytearray):
        """
        Decode the hexdump contained in the input and yield the recovered bytes as one chunk.

        The column that holds the hex bytes is located heuristically: every run of hex-encoded
        bytes is collected for each line, only start offsets shared by all lines are kept, and
        among those the offset with the largest median run length is selected. The same column
        is then cut out of every line and decoded. Text to the right of that column is treated
        as the preview and is only used to emit a warning when its size does not fit the number
        of decoded bytes.

        Empty and whitespace-only lines are skipped. Nothing is yielded when the input has no
        other lines or when no bytes were decoded.

        Raises:
            ValueError: if there is no start offset that is common to all lines.
        """
        # Blank lines contain no hex run at all; left in, they would empty the intersection of
        # offsets computed below and make an otherwise well-formed dump fail.
        lines = [
            line for line in data.decode(self.codec).splitlines(keepends=False)
            if line.strip()
        ]
        if not lines:
            return

        decoded_bytes = bytearray()

        # One mapping per line: start offset of each hex run -> length of that run.
        encoded_byte_matches: List[Dict[int, int]] = []
        for line in lines:
            matches: Dict[int, int] = {}
            encoded_byte_matches.append(matches)
            for match in self._ENCODED_BYTES.finditer(line):
                a, b = match.span()
                matches[a] = b - a

        # Only offsets at which every single line starts a hex run are candidates.
        it = iter(encoded_byte_matches)
        offsets = set(next(it).keys())
        for matches in it:
            offsets.intersection_update(matches.keys())
        if not offsets:
            raise ValueError('unable to determine the position of the hex bytes in this dump')

        # For each candidate offset, gather the run lengths over all lines in sorted order so
        # that the entry at the midpoint is the median length.
        lengths: Dict[int, List[int]] = {offset: [] for offset in offsets}
        for matches in encoded_byte_matches:
            for offset in lengths:
                lengths[offset].append(matches[offset])
        for offset in lengths:
            lengths[offset].sort()
        midpoint = len(encoded_byte_matches) // 2

        # The candidate with the longest median run is taken to be the hex column.
        offset, length = max(
            ((offset, lengths[offset][midpoint]) for offset in lengths),
            key=operator.itemgetter(1))
        end = offset + length

        # Loop invariant: matches the leading part of a slice that consists of hex digits and
        # whitespace only.
        hex_prefix = re.compile(r'^[\sA-Fa-f0-9]+')
        line_count = len(lines)

        for k, line in enumerate(lines, 1):
            encoded_line = line[offset:end]
            onlyhex = hex_prefix.search(encoded_line)
            if not onlyhex:
                self.log_warn(F'ignoring line without hexadecimal data: {line}')
                continue
            if onlyhex.group(0) != encoded_line:
                if k != line_count:
                    self.log_warn(F'ignoring line with mismatching hex data length: {line}')
                    continue
                # Only the final line is allowed to hold fewer bytes than the others; its
                # leading hexadecimal part is decoded and the rest of the slice is dropped.
                encoded_line = onlyhex.group(0)
            self.log_debug(F'decoding: {encoded_line.strip()}')
            decoded_line = bytes.fromhex(encoded_line)
            decoded_bytes.extend(decoded_line)
            # Everything after the hex column is considered to be the text preview.
            txt = line[end:]
            txt_stripped = txt.strip()
            if not txt_stripped:
                continue
            # The preview may have lost leading or trailing blanks, so any size between the
            # stripped and the unstripped preview length is accepted.
            if len(decoded_line) not in range(len(txt_stripped), len(txt) + 1):
                self.log_warn(F'preview size {len(txt_stripped)} does not match decoding: {line}')

        if decoded_bytes:
            yield decoded_bytes

    def reverse(self, data):
        """
        Render the input as a hexdump and yield each line of it encoded with the unit codec.

        The layout is obtained from the inherited `_get_metrics` helper based on the input
        length. When no width was given, the metrics are asked to fit the available width.
        """
        metrics = self._get_metrics(len(data))
        if not self.args.width:
            # NOTE(review): presumably adapts the number of columns to the output width and may
            # pick more columns than the default - confirm against HexViewer.
            metrics.fit_to_width(allow_increase=True)
        for line in self.hexdump(data, metrics):
            yield line.encode(self.codec)