## Import Telegram JSON file

In [5]:
import json
import pandas as pd

# Load the Telegram export and keep only its list of message records.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding (which would mangle or reject
# non-ASCII message text on some systems).
with open('./data/telegram_export.json', 'r', encoding='utf-8') as export_file:
    messages = json.load(export_file)['messages']

# Flatten the message records into a DataFrame, one row per message.
# The file is no longer needed at this point, so this runs after it is closed.
messages_df = pd.json_normalize(data=messages)

## Tokenised words grouped by senders

In [29]:
from pyhanlp import *
from pymaybe import maybe
import re

# Module-level lexical analyzer shared by every tokenise_words() call.
# NOTE(review): PerceptronLexicalAnalyzer is presumably supplied by the
# `from pyhanlp import *` star import above -- confirm.
analyzer = PerceptronLexicalAnalyzer()

# Raw strings so that `\w` reaches the regex engine untouched; a non-raw
# '\w' is an invalid string escape and triggers a warning on modern Python.
_WORD_PATTERN = re.compile(r'\w+')
_LATIN_PATTERN = re.compile(r'[a-z]+', re.IGNORECASE)


# split string into string[] by its words
def tokenise_words(body):
    """Split ``body`` into a list of upper-cased word tokens.

    The text is first reduced to its ``\\w+`` runs (dropping punctuation),
    upper-cased and re-joined with single spaces, then segmented by the
    module-level ``analyzer``.  Tokens shorter than two characters are
    discarded as meaningless.

    Args:
        body: The text to tokenise (a ``str``).

    Returns:
        A list of tokens in the order the analyzer produced them.  If a
        ``UnicodeDecodeError`` occurs while walking the analyzer output,
        ``body`` is printed and the tokens collected so far are returned.
    """
    # `str.join` always returns a string, so no None-guard is needed here.
    cleaned_body = ' '.join(w.upper() for w in _WORD_PATTERN.findall(body))
    tokenised_chinese_words = analyzer.analyze(cleaned_body).toWordArray()
    word_list = []

    try:
        for word in tokenised_chinese_words:
            if _LATIN_PATTERN.search(word):
                # english words: a token may still hold several
                # whitespace-separated words
                candidates = word.split()
            elif _WORD_PATTERN.search(word):
                # all other words
                candidates = [word]
            else:
                # not matching punctuation marks
                continue

            # drop meaningless (single-character) words
            word_list.extend(c for c in candidates if len(c) >= 2)
    except UnicodeDecodeError:
        # Best effort: report the offending text and keep what we have.
        print(body)

    return word_list

def extract_message(message, combined_word=''):
    """Return the plain text of ``message`` appended to ``combined_word``.

    ``message`` may be:

    * a ``str`` -- used as is;
    * a ``list`` of segments -- every segment is extracted recursively and
      the results are concatenated in order (an empty list yields ``''``);
    * any container with a ``'text'`` entry (e.g. a dict or a DataFrame
      row) -- its ``'text'`` value is extracted recursively.

    Args:
        message: The message, message segment, or text value to extract.
        combined_word: Prefix placed in front of the extracted text.

    Returns:
        ``combined_word`` followed by the extracted text.

    Raises:
        TypeError: If ``message`` has none of the supported shapes.
    """
    if isinstance(message, str):
        return combined_word + message

    if isinstance(message, list):
        # Join *all* segments; returning from inside the loop would keep
        # only the first one and return None for an empty list.
        return combined_word + ''.join(
            extract_message(message_segment) for message_segment in message
        )

    try:
        has_text = 'text' in message
    except TypeError:
        # Objects that do not support membership tests (numbers, None, ...).
        has_text = False

    if has_text:
        return combined_word + extract_message(message['text'])

    raise TypeError('{!r} cannot be extracted'.format(message))

# returns summary[sender][word] = count object
# only words with min_count <= count <= max_count are returned
def summarise_words(words_df, min_count=5, max_count=1000, file=None):
    """Count how often each sender uses each word.

    Every row of ``words_df`` is turned into text with ``extract_message``;
    the texts of each sender (the row's ``'from'`` value) are joined with
    single spaces, tokenised with ``tokenise_words`` and counted.

    Args:
        words_df: DataFrame of messages; each row needs a ``'from'`` entry
            and must be accepted by ``extract_message``.
        min_count: Smallest count (inclusive) a word needs to be reported.
        max_count: Largest count (inclusive) a word may have to be reported.
        file: Optional writable text file.  When given, one
            ``sender,word,count`` line per reported word is written to it
            instead of being collected in the returned dict.

    Returns:
        ``{sender: {word: count}}`` when ``file`` is ``None`` (words ordered
        by descending count; senders without any token are omitted),
        otherwise ``file.name``.
    """
    from collections import Counter, defaultdict

    # Collect each sender's message bodies in a list and join once; this
    # keeps the first message of every sender and avoids repeated string
    # concatenation.
    bodies_by_sender = defaultdict(list)
    for _, message in words_df.iterrows():
        bodies_by_sender[message['from']].append(extract_message(message))

    filtered_summary = {}
    for sender, bodies in bodies_by_sender.items():
        counts = Counter(tokenise_words(' '.join(bodies)))
        if not counts:
            # Senders that produced no tokens are left out of the summary.
            continue

        filtered_summary[sender] = {}
        # most_common() orders by descending count, ties in first-seen order.
        for word, count in counts.most_common():
            if not min_count <= count <= max_count:
                continue
            if file is not None:
                file.write('{sender},{word},{count}\n'.format(
                    sender=sender, word=word, count=count))
            else:
                filtered_summary[sender][word] = count

    return filtered_summary if file is None else file.name

In [30]:
ten_messages_df = messages_df
once_a_day = 720
once_a_week = 100

# Mode 'w' truncates any previous output, so a separate "clear the file"
# step is unnecessary.  The context manager closes the file even if
# summarise_words raises, and UTF-8 is set explicitly so non-ASCII words
# can be written regardless of the platform's default encoding.
with open('output.csv', 'w', encoding='utf-8') as output_file:
    # Only words counted between once_a_week and once_a_day times
    # (both inclusive) are written, as `sender,word,count` lines.
    word_summary = summarise_words(
        ten_messages_df,
        min_count=once_a_week,
        max_count=once_a_day,
        file=output_file,
    )

- TODO: work out how to map pandas DataFrame column values (see https://www.google.com/search?q=pandas+dataframe+map+column+values)
- TODO: map `messages[0].text` to tokens using `HanLP`

## Top 10 words from each sender