In [2]:
# -*- coding: utf-8 -*-

import os, json, re, random
from os.path import join, dirname, basename, split, splitext
from collections import Counter, defaultdict, OrderedDict

import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

import general as ge

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

%matplotlib inline

In [3]:
# Load the novel through the project helper and keep only its first
# WORD_LIMIT entries so the downstream steps stay small.
# NOTE(review): ge.read_file is assumed to return a pair whose second item is
# the token sequence -- confirm against general.py.
SOURCE_TEXT = '1813 PRIDE AND PREJUDICE.txt'
WORD_LIMIT = 40000

_, all_words = ge.read_file(SOURCE_TEXT, use_zip=0, zf='none')
origianl_words = all_words[:WORD_LIMIT]
lenght_original_text = len(origianl_words)

In [21]:
# --- Vocabulary ---------------------------------------------------------
# The vocabulary is every Porter stem of a non-stopword token that occurs at
# least `threshold` times in the loaded text.
stops = stopwords.words('english')
stop_set = frozenset(stops)  # hashed lookup instead of scanning the list per token
words_0stop = [w for w in origianl_words if w not in stop_set]
words_0stop_stem = [porter.stem(w) for w in words_0stop]
c = Counter(words_0stop_stem)
threshold = 4
words_0stop_stem_threshold = [w for w, count in c.items() if count >= threshold]
# Kept as a list: its order defines the row order of the term-by-window matrix.
final_vocab = words_0stop_stem_threshold
final_vocab_set = frozenset(final_vocab)  # fast membership test for the window filter

# Stems of *all* tokens (stopwords included) so window boundaries are measured
# on the unfiltered text.
words_stem = [porter.stem(w) for w in origianl_words]

# --- Windows ------------------------------------------------------------
window_length = 200
# Integer ceiling division: the last window may be shorter than window_length.
# (`/` yields a float on Python 3, which range() and np.zeros() reject.)
num_windows = (len(words_stem) + window_length - 1) // window_length

# For each consecutive window, keep only the stems that belong to the vocabulary.
finalwords_inwindows = []
for start in range(0, len(words_stem), window_length):
    window_words = words_stem[start:start + window_length]
    finalwords_inwindows.append([w for w in window_words if w in final_vocab_set])

In [22]:
# Term-by-window count matrix: m[r, k] is the number of times the vocabulary
# entry final_vocab[r] occurs in window k.

# Resolve each vocabulary entry to its row once, instead of calling
# final_vocab.index(word) (a full list scan) for every word of every window.
vocab_row = {}
for row, term in enumerate(final_vocab):
    vocab_row.setdefault(term, row)  # first occurrence wins, exactly like list.index

m = np.zeros([len(final_vocab), num_windows])
for ind_window, finalwords_inwindow in enumerate(finalwords_inwindows):
    # A per-window Counter under its own expression, so the global vocabulary
    # counter `c` is not overwritten by this cell.
    for word, count in Counter(finalwords_inwindow).items():
        m[vocab_row[word], ind_window] = count

In [24]:
# Reduced SVD of the term-by-window matrix, m = u . diag(s) . v.
u, s, v = np.linalg.svd(m, full_matrices=False)

# Scale the rows of v by the singular values; column k of the result describes
# window k. Absolute values are taken so every component is non-negative
# before the entropy-based distance is applied.
# NOTE(review): an earlier note said "use the first 200 dimensions" after
# plotting; no truncation happens here -- every dimension returned by the
# reduced SVD is kept.
scaled_v = np.diag(s).dot(v)
window_vectors = np.abs(scaled_v)

In [25]:
# Shannon entropy and Jensen-Shannon divergence
def shannon_entropy(dis):
    """Return sum(-p * log(p)) over the components of `dis` (natural log).

    Parameters
    ----------
    dis : 1-D array-like of numbers
        Typically a discrete distribution. The values are used as given; they
        are NOT normalised to sum to one.

    Returns
    -------
    float
        The entropy in nats. Zero components contribute nothing (the usual
        0 * log(0) = 0 convention) instead of producing NaN. Negative
        components still yield NaN, so invalid input remains visible.
    """
    values = np.asarray(dis, dtype=float)
    # Drop exact zeros before taking the log: 0 * log(0) evaluates to NaN.
    nonzero = values[values != 0.0]
    return -np.sum(nonzero * np.log(nonzero))


def jensen_shannon_divergence(dis, dis1):
    """Return H(0.5*dis + 0.5*dis1) - 0.5*H(dis) - 0.5*H(dis1).

    Parameters
    ----------
    dis, dis1 : 1-D array-likes of equal length
        Lists, tuples and numpy arrays are all accepted. The inputs are not
        normalised, so the result is the Jensen-Shannon divergence in the
        strict sense only when each input sums to one.

    Returns
    -------
    float
        The divergence in nats, using `shannon_entropy` for H.
    """
    p = np.asarray(dis, dtype=float)
    q = np.asarray(dis1, dtype=float)
    mixture_entropy = shannon_entropy(0.5 * p + 0.5 * q)
    return mixture_entropy - 0.5 * shannon_entropy(p) - 0.5 * shannon_entropy(q)

In [27]:
# Symmetric matrix of pairwise divergences between windows. Only the strict
# upper triangle is computed; the lower triangle is filled in by adding the
# transpose, and the diagonal stays at zero.
window_count = window_vectors.shape[1]
upper = np.zeros([num_windows, num_windows])
for first in range(window_count - 1):
    first_vec = window_vectors[:, first]
    for second in range(first + 1, window_count):
        second_vec = window_vectors[:, second]
        upper[first, second] = jensen_shannon_divergence(first_vec, second_vec)
distances = upper + upper.T

In [28]:
def information(i, j, m):
    """Return the share of row i's total information carried by entry (i, j).

    Each entry p contributes -p * log(p) (natural log). The result is the
    contribution of m[i, j] divided by the summed contribution of every entry
    in row i.

    Parameters
    ----------
    i, j : int
        Row and column index.
    m : 2-D array-like of numbers
        The values are used as given (no normalisation).

    Returns
    -------
    float
        The ratio described above. Zero entries contribute nothing, so
        m[i, j] == 0 gives 0 rather than NaN. A row whose total information
        is zero gives 0.0 instead of a division by zero.
    """
    row = np.asarray(m, dtype=float)[i, :]

    # Denominator: skip exact zeros, because 0 * log(0) evaluates to NaN.
    nonzero = row[row != 0.0]
    row_total = -np.sum(nonzero * np.log(nonzero))

    # Numerator: apply the same 0 * log(0) = 0 convention to the single entry.
    pij = row[j]
    Iij = 0.0 if pij == 0.0 else -pij * np.log(pij)

    if row_total == 0.0:
        # No entry of the row carries information, so the share is defined as 0.
        return 0.0
    return Iij / row_total

In [29]:
# Total information along the text's own order: the path that visits the
# windows consecutively, 0 -> 1 -> ... -> num_windows - 1.
path_entropy = sum(
    (information(node, node + 1, distances) for node in range(num_windows - 1)),
    0.0,
)

In [31]:
# Baseline: total information along randomly ordered paths, each visiting
# every window exactly once. The loop runs 50 times (an older comment said 20).
# NOTE(review): `random` is not seeded anywhere visible, so these numbers
# change on every run -- consider seeding it in a configuration cell.
num_random_paths = 50
num_nodes = distances.shape[1]

infos = []
for trial in range(num_random_paths):
    # A uniformly random visiting order. list(...) is required because a
    # Python 3 range object cannot be mutated; shuffle replaces the quadratic
    # sample-one-then-remove loop and draws from the same distribution.
    path = list(range(num_nodes))
    random.shuffle(path)

    # Sum the information of every consecutive step along the path.
    informations = 0.0
    for node, node1 in zip(path, path[1:]):
        informations += information(node, node1, distances)
    infos.append(informations)