In [1]:
import json
import pandas
import numpy
import os
from collections import defaultdict

In [4]:
root = '../DataExport_2022-01-12'

In [5]:
with open(os.path.join(root, 'result.json'), encoding='utf8') as handle:
    data = json.load(handle)

In [6]:
chats_list = data['chats']['list']

In [7]:
saved_messages_chats = [chat for chat in chats_list if chat['type'] == 'saved_messages']
assert len(saved_messages_chats) == 1
chat = saved_messages_chats[0]['messages']

# Manual schema exploration

In [8]:
len(chat)

22075

In [9]:
all_keys = set()
for msg in chat:
    for key in msg.keys():
        all_keys.add(key)
        
key_vals = {key: [] for key in all_keys}
for msg in chat:
    for key in msg.keys():
        key_vals[key].append(msg[key])

In [12]:
all_keys

{'action',
 'actor',
 'actor_id',
 'date',
 'duration_seconds',
 'edited',
 'file',
 'forwarded_from',
 'from',
 'from_id',
 'height',
 'id',
 'media_type',
 'mime_type',
 'performer',
 'photo',
 'poll',
 'reply_to_message_id',
 'saved_from',
 'sticker_emoji',
 'text',
 'thumbnail',
 'title',
 'type',
 'via_bot',
 'width'}

In [13]:
key_vals['action']  # ignore

['clear_history']

In [15]:
# key_vals['actor']  # ignore

In [11]:
# key_vals['actor_id']  # ignore

In [16]:
# [date_str[:10] for date_str in key_vals['date']]  # use, cut time

In [17]:
# key_vals['duration_seconds']  # ignore

In [18]:
# key_vals['edited']  # ignore

In [30]:
# key_vals['file']  # use, relative asset link (though maybe rename the asset to avoid clashes when merging asset folders)

In [32]:
# key_vals['forwarded_from']  # ignore

In [34]:
# key_vals['from']  # ignore

In [36]:
# key_vals['from_id']  # ignore

In [39]:
# key_vals['height']  # ignore

In [41]:
# key_vals['id']  # ignore

In [43]:
set(key_vals['media_type'])  # ignore

{'animation', 'audio_file', 'sticker', 'video_file', 'video_message'}

In [45]:
# key_vals['mime_type']  # file format, ignore

In [47]:
# key_vals['performer']  # song performer, ignore

In [49]:
# key_vals['photo']  # use, direct relative asset link, photos folder

In [51]:
# key_vals['poll']  # ignore

In [53]:
# key_vals['reply_to_message_id']  # ignore

In [55]:
# key_vals['saved_from']  # ignore

In [58]:
# key_vals['sticker_emoji']  # ignore

In [71]:
# key_vals['text']  # TODO, investigate below

In [61]:
# key_vals['thumbnail']  # ignore

In [63]:
# key_vals['title']  # song title, ignore

In [65]:
set(key_vals['type'])

{'message', 'service'}

In [67]:
# key_vals['via_bot']  # ignore

In [69]:
# key_vals['width']  # ignore

In [73]:
set([type(entry) for entry in key_vals['text']])

{list, str}

In [74]:
len([entry for entry in key_vals['text'] if type(entry) == str])

13185

In [9]:
list_msgs = [entry for entry in key_vals['text'] if type(entry) == list]

In [10]:
list_msgs_key_types = set()
for msg_list in list_msgs:
    for entry in msg_list:
        list_msgs_key_types.add(type(entry))
print(list_msgs_key_types)

{<class 'str'>, <class 'dict'>}


In [12]:
list_msgs_keys = set()
list_msgs_vals = defaultdict(list)
for msg_list in list_msgs:
    for entry in msg_list:
        if type(entry) == dict:
            for key, val in entry.items():
                list_msgs_keys.add(key)
                list_msgs_vals[key].append(val)

In [89]:
list_msgs_keys

{'href', 'language', 'text', 'type'}

In [94]:
# list_msgs_vals['href']  # list of urls, merge with the text

In [98]:
# list_msgs_vals['language']  # practically empty, ignore

In [100]:
# list_msgs_vals['text']  # mix of urls, plain text, hashtags, mentions, terminal commands, phones and emails, etc, merge with the text

In [14]:
set([type(entry) for entry in list_msgs_vals['text']])

{str}

In [15]:
attachment_types = list(set(list_msgs_vals['type']))

In [16]:
attachment_results = {}
for cur_type in attachment_types:
    cur_type_results = []
    list_msgs_keys = set()
    list_msgs_vals = defaultdict(list)
    for msg_list in list_msgs:
        for entry in msg_list:
            if type(entry) == dict:
                if entry['type'] == cur_type:
                    cur_type_results.append((entry['text'], 'href' in entry.keys()))
    #                 print(entry['text'])
    #                 print('href' in entry.keys())
    attachment_results[cur_type] = cur_type_results

In [114]:
attachment_types

['phone',
 'italic',
 'code',
 'bank_card',
 'text_link',
 'cashtag',
 'underline',
 'email',
 'bold',
 'pre',
 'mention',
 'link',
 'hashtag']

In [118]:
# attachment_results['phone']  # just insert them into the text

In [120]:
# attachment_results['italic']  # apply markdown formatting and insert

In [18]:
# attachment_results['code']  # apply markdown formatting and insert

In [124]:
# attachment_results['bank_card']  # just insert into the text

In [149]:
# attachment_results['text_link']  # insert both text and href concatenated, href is there iff type == 'text_link'

In [151]:
# # simple check
# for msg_list in list_msgs:
#     for entry in msg_list:
#         if type(entry) == dict and 'href' in entry:
#             print(entry)

In [128]:
# attachment_results['cashtag']  # looks like $PWD, hashtag plus uppercase. Put this into the "code" markdown construct to avoid creating hashtags.

In [130]:
# attachment_results['underline']  # just insert (markdown weirdly doesn't support this)

In [133]:
# attachment_results['email']  # just insert

In [135]:
# attachment_results['bold']  # apply markdown formatting and insert

In [137]:
# attachment_results['pre']  # designate this as "code" and then insert

In [139]:
# attachment_results['mention']  # mentions, can simply insert them as they are in markdown

In [142]:
# attachment_results['link']  # just insert

In [144]:
# attachment_results['hashtag']  # first filter them with a whitelist of tags I want to keep, then if whitelisted insert w/o changes and remove the hash sign otherwise