In [None]:
import json

# Loading

In [None]:
# Load the chat export. The encoding is passed explicitly because open() otherwise
# falls back to the platform's locale encoding, which is not guaranteed to be UTF-8.
with open('msgs.json', encoding='utf-8') as msgs_file:
    msgs = json.load(msgs_file)

# Combine the 'chats' list with the 'left_chats' list into one new list. Nothing in
# the loaded dict is mutated, and an export without a 'left_chats' section is
# treated as having no left chats instead of raising KeyError.
msgs = msgs['chats']['list'] + msgs.get('left_chats', {}).get('list', [])

In [None]:
# Number of chat entries loaded (the 'chats' list plus the 'left_chats' list).
len(msgs)

# Preparing

In [None]:
# Discard chats whose type is 'saved_messages', then map every remaining chat's
# name to its list of messages.
# NOTE(review): chats sharing the same 'name' collapse into one dict entry (the
# later one wins) — confirm that names are unique in the export.
msgs = [chat for chat in msgs if chat['type'] != 'saved_messages']
threads = dict((chat['name'], chat['messages']) for chat in msgs)

print(f'Threads: {len(threads)}')

In [None]:
# Total number of raw message records, summed over all threads.
msgs_count = sum(len(messages) for messages in threads.values())
print(f'Messages: {msgs_count}')

In [None]:
# Distinct values of the 'type' field across every message of every thread.
msgs_type = {
    message['type']
    for messages in threads.values()
    for message in messages
}
print(f'Message types: {msgs_type}')

In [None]:
def merge_thread_messages(msgs):
    """Merge consecutive messages written by the same sender into single strings.

    Only records whose 'type' is 'message', that carry a 'from' key, and whose
    'text' is a plain string are considered; everything else is skipped and does
    not interrupt a run of messages from the same sender.

    Args:
        msgs: iterable of message dicts belonging to one thread, in order.

    Returns:
        A list of strings, one per uninterrupted run of messages by one sender,
        with the texts of the run joined by single spaces. Empty when no record
        qualifies.
    """
    merged_msgs = []

    last_writer = None
    run_texts = []  # texts of the run currently being accumulated
    for msg in msgs:
        if msg.get('type') != 'message':
            continue
        if 'from' not in msg:
            continue
        text = msg.get('text')
        if not isinstance(text, str):
            continue

        # `run_texts` doubles as the "a run is open" flag, so a sender value of
        # None in the data cannot be confused with "no run started yet".
        if run_texts and msg['from'] == last_writer:
            run_texts.append(text)
        else:
            # Sender changed: emit the finished run (if any) and start a new one.
            if run_texts:
                merged_msgs.append(' '.join(run_texts))
            last_writer = msg['from']
            run_texts = [text]

    # Flush the trailing run; without this the last merged message is lost.
    if run_texts:
        merged_msgs.append(' '.join(run_texts))

    return merged_msgs
        
# Apply the per-thread merge to every thread, keeping the thread-name keys.
merged_msgs = {
    thread_name: merge_thread_messages(thread_msgs)
    for thread_name, thread_msgs in threads.items()
}
# The figure printed here is the number of dict entries, i.e. threads.
print(f'Messages length: {len(merged_msgs)}')

In [None]:
# Flatten the per-thread lists into one list of merged message strings;
# thread boundaries are dropped from here on.
msgs = [
    merged_text
    for thread_texts in merged_msgs.values()
    for merged_text in thread_texts
]

## Normalizing

In [None]:
from hazm import Normalizer, WordTokenizer, Lemmatizer

In [None]:
normalizer = Normalizer()

# Pass every merged message through the hazm normalizer, one string at a time.
msgs = list(map(normalizer.normalize, msgs))

In [None]:
import codecs

def stopwords_list(stopwords_file):
    """Read a UTF-8 text file and return its lines with surrounding whitespace removed.

    Args:
        stopwords_file: path of the file to read, one entry per line.

    Returns:
        A list with one stripped string per line of the file, in file order.
        Blank lines are kept as empty strings.
    """
    with codecs.open(stopwords_file, encoding='utf8') as handle:
        return [line.strip() for line in handle]

# Held as a set so the `w not in stopwords` filter applied to every token is a
# hash lookup rather than a list scan; duplicate lines in the file collapse.
stopwords = set(stopwords_list("stopwords.dat"))

In [None]:
word_tokenizer = WordTokenizer()
lemmatizer = Lemmatizer()

# For each message: tokenise, drop tokens found in the stopword set, then
# lemmatise the survivors. The stopword test is applied to the raw token,
# before lemmatisation.
msgs = [
    [
        lemmatizer.lemmatize(token)
        for token in word_tokenizer.tokenize(sentence)
        if token not in stopwords
    ]
    for sentence in msgs
]

# Word2Vec

In [None]:
import gensim

In [None]:
# Train word embeddings on the lemmatised token lists.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names (gensim 4 calls
# them `vector_size` and `epochs`) — confirm which gensim version is pinned.
# No seed is set, so results are not expected to be reproducible across runs.
model = gensim.models.Word2Vec(
    msgs,
    size=100,      # embedding dimensionality
    sg=1,          # 1 selects skip-gram
    iter=20,       # passes over the corpus
    min_count=30,  # ignore words occurring fewer than 30 times in total
    workers=6,     # worker threads
)

# Visualize

In [None]:
from network_viz import visualize_notebook, visualize

In [None]:
# Give every vocabulary word its enumeration index; the code that follows only
# relies on the keys (iteration and membership tests).
# NOTE(review): `model.wv.vocab` is the gensim 3.x vocabulary attribute — confirm
# the pinned gensim version still provides it.
words_dict = {word: idx for idx, word in enumerate(model.wv.vocab)}
print(f'Words: {len(words_dict.keys())}')

# Build the similarity graph: for every vocabulary word, link it to each of its
# nearest neighbours (library-default neighbour count) whose similarity is at
# least 0.55. `edge_count` tallies, per word, how many edge endpoints it occupies.
edges = []
edge_count = {}
for word in words_dict:
    for neighbour, score in model.wv.similar_by_word(word):
        # The `break` relies on neighbours arriving in decreasing similarity
        # order: once one falls under the threshold, the rest are skipped too.
        if score < 0.55:
            break
        if neighbour in words_dict:
            for endpoint in (word, neighbour):
                edge_count[endpoint] = edge_count.get(endpoint, 0) + 1
            # Every edge carries a constant weight of 1.
            edges.append((word, neighbour, 1))

# Keep only edges whose two endpoints each take part in at least 3 edge
# endpoints overall; counts are those tallied before this filtering.
new_edges = [
    (src, dst, weight)
    for src, dst, weight in edges
    if edge_count[src] >= 3 and edge_count[dst] >= 3
]
edges = new_edges

# Render the filtered graph to an HTML string and save it next to the notebook.
html = visualize(edges, size=800)
# The edge labels are vocabulary words from the corpus, so the file is written as
# UTF-8 explicitly; the locale default encoding may be unable to encode them.
with open('output.html', 'w', encoding='utf-8') as output_file:
    output_file.write(html)