-
Notifications
You must be signed in to change notification settings - Fork 62
/
email.py
119 lines (103 loc) · 4.44 KB
/
email.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
from email.parser import BytesParser
from functools import partial
from collections import defaultdict
from refinery.units.formats import PathExtractorUnit, UnpackResult
from refinery.units.pattern.mimewords import mimewords
from refinery.lib.mime import file_extension
from refinery.lib.tools import NoLogging, isbuffer
class xtmail(PathExtractorUnit):
    """
    Extract files and body from EMail messages. The unit supports both the Outlook message format
    and regular MIME documents.
    """

    def _get_headparts(self, head):
        """
        Yield the header information of a message as two virtual files.

        `head` is an iterable of `(key, value)` header pairs. The following items are produced:

        - `headers.txt`: one `key: value` line per pair, using the values exactly as given.
        - `headers.json`: a JSON object of the headers after folding and mimewords decoding.
          A header that occurs once maps to a single value; a header that occurs several
          times maps to the list of its non-empty values.

        Both payloads are wrapped in callables, so they are only rendered when requested.
        """
        # The pairs are consumed twice: by the loop below and, later, by the deferred
        # headers.txt callback. Pinning them to a list keeps one-shot iterables working.
        head = list(head)
        mw = mimewords()
        # Strips two decorator layers off the unit's process method and binds the instance
        # manually. NOTE(review): presumably this reaches the plain string implementation so
        # that header values do not have to be converted to bytes first - confirm.
        mw = partial(mw.process.__wrapped__.__wrapped__, mw)
        jh = defaultdict(list)
        for key, value in head:
            # Unfold continuation lines: join all physical lines after removing their
            # leading whitespace, then decode the result.
            jh[key].append(mw(''.join(t.lstrip() for t in value.splitlines(False))))
        jh = {k: v[0] if len(v) == 1 else [t for t in v if t] for k, v in jh.items()}
        yield UnpackResult('headers.txt',
            lambda h=head: '\n'.join(F'{k}: {v}' for k, v in h).encode(self.codec))
        yield UnpackResult('headers.json',
            lambda jsn=jh: json.dumps(jsn, indent=4).encode(self.codec))

    @PathExtractorUnit.Requires('extract_msg', optional=False)
    def _extract_msg():
        # Imports the Outlook message parser on demand. It is used below in the form
        # self._extract_msg(data), i.e. the attribute evaluates to the Message class itself.
        from extract_msg.message import Message
        return Message

    def _get_parts_outlook(self, data):
        """
        Parse `data` as an Outlook message and yield its components.

        The headers are yielded first, followed by `body.txt` and/or `body.htm` when the
        respective body is non-empty, followed by all attachments. Attachments that are
        themselves messages are emitted as `attachments/msg_<n>.txt` / `.htm` and their own
        attachments are visited recursively. Any exception raised by the parser propagates
        to the caller.
        """
        def ensure_bytes(data):
            # Bodies can be text or bytes; text is encoded with the unit's codec.
            return data if isinstance(data, bytes) else data.encode(self.codec)

        def make_message(name, msg):
            # Emit the plain text and HTML bodies of a message under the given base name.
            with NoLogging:
                htm = msg.htmlBody
                txt = msg.body
            if txt:
                yield UnpackResult(F'{name}.txt', ensure_bytes(txt))
            if htm:
                yield UnpackResult(F'{name}.htm', ensure_bytes(htm))

        def attachments(msg):
            # Depth-first walk over all attachments; anything that is not of type 'data'
            # has its data object searched for further attachments.
            for attachment in getattr(msg, 'attachments', ()):
                yield attachment
                if attachment.type == 'data':
                    continue
                yield from attachments(attachment.data)

        msgcount = 0

        with NoLogging:
            msg = self._extract_msg(bytes(data))

        yield from self._get_headparts(msg.header.items())
        yield from make_message('body', msg)

        for attachment in attachments(msg):
            self.log_debug(attachment)
            if attachment.type == 'msg':
                msgcount += 1
                yield from make_message(F'attachments/msg_{msgcount:d}', attachment.data)
                continue
            if not isbuffer(attachment.data):
                self.log_warn(F'unknown attachment of type {attachment.type}, please report this!')
                continue
            path = attachment.longFilename or attachment.shortFilename
            yield UnpackResult(F'attachments/{path}', attachment.data)

    def _get_parts_regular(self, data):
        """
        Parse `data` as a regular MIME document and yield its components.

        The headers are yielded first. Every part returned by walking the message is then
        emitted as `attachments/<filename>` when it has a file name, and as `body.<ext>`
        otherwise, where the extension is derived from the content type with `txt` as the
        fallback. Parts whose content is empty or cannot be obtained are skipped; a warning
        is logged when an error is known.
        """
        msg = BytesParser().parsebytes(data)

        yield from self._get_headparts(msg.items())

        for k, part in enumerate(msg.walk()):
            path = part.get_filename()
            elog = None
            if path is None:
                extension = file_extension(part.get_content_type(), 'txt')
                path = F'body.{extension}'
            else:
                path = F'attachments/{path}'
            try:
                data = part.get_payload(decode=True)
            except Exception as decode_error:
                # Keep the message now: the exception name is unbound once this clause ends
                # and a nested handler must not be able to shadow it.
                decode_elog = str(decode_error)
                try:
                    data = part.get_payload(decode=False)
                except Exception as raw_error:
                    elog = str(raw_error)
                    data = None
                else:
                    from refinery import carve
                    self.log_warn(F'manually decoding part {k}, data might be corrupted: {path}')
                    if isinstance(data, str):
                        data = data.encode('latin1')
                    if isbuffer(data):
                        carved = data | carve('b64', stripspace=True, single=True, decode=True)
                        # A bare next() would raise StopIteration when nothing is carved;
                        # inside a generator that becomes a RuntimeError and would abort the
                        # extraction of all remaining parts.
                        data = next(carved, None)
                        if data is None:
                            elog = F'{decode_elog}; no base64 data found in the raw payload'
                    else:
                        # Raw payload is not a buffer, so there is nothing to carve from.
                        elog = decode_elog
                        data = None
            if not data:
                if elog is not None:
                    self.log_warn(F'could not get content of message part {k}: {elog!s}')
                continue
            yield UnpackResult(path, data)

    def unpack(self, data):
        """
        Yield all components of the message in `data`.

        The input is first treated as an Outlook message. If that raises any exception, a
        debug message is logged and the input is parsed as a regular MIME document instead.
        Items already yielded by the Outlook parser before the failure are not retracted.
        """
        try:
            yield from self._get_parts_outlook(data)
        except Exception:
            self.log_debug('failed parsing input as Outlook message')
            yield from self._get_parts_regular(data)