In [1]:
import email
import email.message
import email.utils
import html
import json
import mailbox
import mimetypes
from datetime import datetime, timezone


In [2]:
# Path of the exported conversation to convert.
fn = 'messages/inbox/anna_d0ta1k5dmg/message_1.json'
# Use a context manager so the file handle is closed deterministically, and
# read as UTF-8 explicitly instead of relying on the platform default encoding.
with open(fn, encoding='utf-8') as json_file:
    jdata = json.load(json_file)

In [3]:
def utfdemangle(s):
    """Undo the mojibake found in the exported JSON strings.

    The export stores every UTF-8 byte as its own code point, so the text is
    mapped back to bytes via latin1 and those bytes are then decoded as UTF-8.
    """
    raw_bytes = s.encode('latin1')
    return raw_bytes.decode('utf8')

In [4]:
def group_messages(jdata, threshold_ms=60000):
    """Group messages according to sender and time.

    Messages sent by the same sender shortly after each other end up in one
    group, so they can later be rendered as multiple paragraphs of one mail.

    Parameters
    ----------
    jdata : dict
        Parsed conversation; ``jdata['messages']`` is a list of dicts that
        each carry ``'sender_name'`` and ``'timestamp_ms'``.
    threshold_ms : int, optional
        Largest gap (in milliseconds) between two consecutive messages of the
        same sender that still keeps them in the same group.

    Returns
    -------
    list of dict
        Groups in chronological order, each with the keys ``'sender_name'``,
        ``'timestamp_ms'`` (timestamp of the group's first message) and
        ``'messages'`` (the original message dicts). Empty input gives ``[]``.
    """
    # sorted() builds a new list, so the caller's jdata['messages'] keeps its order.
    rawmsgs = sorted(jdata['messages'], key=lambda m: m['timestamp_ms'])

    groups = []
    group = None
    oldts = 0
    for msg in rawmsgs:
        starts_new_group = (
            group is None
            or msg['sender_name'] != group['sender_name']
            or abs(msg['timestamp_ms'] - oldts) > threshold_ms
        )
        if starts_new_group:
            group = {
                'sender_name': msg['sender_name'],
                'timestamp_ms': msg['timestamp_ms'],
                'messages': [],
            }
            # Append right away: no trailing flush is needed after the loop,
            # and an empty conversation never produces a None entry.
            groups.append(group)

        group['messages'].append(msg)
        oldts = msg['timestamp_ms']

    return groups

In [5]:
# Group the loaded conversation, then display how many messages each group holds.
mgroups = group_messages(jdata)
[len(grp['messages']) for grp in mgroups]


[1, 1, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1]

In [6]:
def format_email(msg):
    """Build an e-mail message from one message group.

    Parameters
    ----------
    msg : dict
        A group as produced by ``group_messages``: ``'sender_name'``,
        ``'timestamp_ms'`` and ``'messages'``. Each entry of ``'messages'``
        may carry ``'content'`` (text) and/or ``'photos'`` (a list of dicts
        with a ``'uri'`` that is opened as a file path).

    Returns
    -------
    email.message.EmailMessage
        A message with a plain-text and an HTML alternative; photos are
        attached as related parts of the HTML alternative and referenced
        from it through ``cid:`` URLs.

    Raises
    ------
    OSError
        If a photo file cannot be opened.
    """
    m = email.message.EmailMessage()
    m['From'] = utfdemangle(msg['sender_name'])

    # timestamp_ms is milliseconds since the epoch; build an aware UTC datetime
    # so the Date header carries an explicit +0000 offset (utcfromtimestamp is
    # deprecated and yields a naive value, which is rendered as -0000).
    ts = msg['timestamp_ms']
    dt = datetime.fromtimestamp(ts / 1000, tz=timezone.utc)
    m['Date'] = email.utils.format_datetime(dt)

    plaintext = ''
    htmltext = '<html><body>\n'
    attachments = {}
    for paragraph in msg['messages']:
        if 'content' in paragraph:
            text = utfdemangle(paragraph['content'])
            plaintext += text + '\n'
            htmltext += '<p>' + html.escape(text) + '</p>\n'
        if 'photos' in paragraph:
            # Terminate the line so following paragraphs do not run into it.
            plaintext += str(len(paragraph['photos'])) + ' Photos\n'
            for photo in paragraph['photos']:
                image_cid = email.utils.make_msgid(domain='facebookexporter.invalid')
                # make_msgid() wraps the id in <...>; a cid: URL takes the bare id.
                htmlcid = image_cid[1:-1]
                uri = photo['uri']
                htmltext += f'<p> <img src="cid:{htmlcid}"> </p>\n'
                attachments[image_cid] = uri
    htmltext += '</body> </html>\n'
    m.set_content(plaintext)
    m.add_alternative(htmltext)

    # Attach images, if there are any.
    if attachments:
        # Payload index 1 is the HTML alternative added above; add_related()
        # converts that part to multipart/related in place.
        html_part = m.get_payload()[1]
        for cid, fn in attachments.items():
            # guess_type() returns None for unknown extensions; fall back to a
            # generic binary type instead of failing on None.split().
            ctype = mimetypes.guess_type(fn)[0] or 'application/octet-stream'
            maintype, subtype = ctype.split('/', 1)
            with open(fn, 'rb') as img:
                image_bytes = img.read()
            html_part.add_related(image_bytes,
                                  maintype=maintype,
                                  subtype=subtype,
                                  cid=cid)
    return m


In [7]:
# Render one sample group as message source text.
onemail = str(format_email(mgroups[3]))

In [9]:
# Store the rendered message as UTF-8 encoded bytes in test.eml.
with open('test.eml', mode='wb') as eml_file:
    eml_file.write(onemail.encode('utf-8'))