In [1]:
pip install POT

Collecting POT
  Downloading POT-0.7.0-cp37-cp37m-manylinux2010_x86_64.whl (430 kB)
[?25l[K     |▊                               | 10 kB 21.8 MB/s eta 0:00:01[K     |█▌                              | 20 kB 23.1 MB/s eta 0:00:01[K     |██▎                             | 30 kB 26.1 MB/s eta 0:00:01[K     |███                             | 40 kB 28.2 MB/s eta 0:00:01[K     |███▉                            | 51 kB 29.7 MB/s eta 0:00:01[K     |████▋                           | 61 kB 30.9 MB/s eta 0:00:01[K     |█████▎                          | 71 kB 29.9 MB/s eta 0:00:01[K     |██████                          | 81 kB 29.1 MB/s eta 0:00:01[K     |██████▉                         | 92 kB 30.8 MB/s eta 0:00:01[K     |███████▋                        | 102 kB 30.1 MB/s eta 0:00:01[K     |████████▍                       | 112 kB 30.1 MB/s eta 0:00:01[K     |█████████▏                      | 122 kB 30.1 MB/s eta 0:00:01[K     |██████████                      | 133 kB 30.1

In [2]:
import numpy as np
import ot

In [3]:
def calc_Pc(items):
    """Turn the counts stored in ``items`` into relative frequencies.

    Args:
        items: Sequence of pairs whose second element (index 1) is a
            numeric count.

    Returns:
        A list with one value per pair, in input order: that pair's count
        divided by the sum of all counts. An empty input yields ``[]``.

    Raises:
        ZeroDivisionError: If ``items`` is non-empty and plain-number counts
            sum to zero.
    """
    count_sum = sum(pair[1] for pair in items)
    return [pair[1] / count_sum for pair in items]

def calc_Pt(items, corpus):
    """Compute relative frequencies for ``items`` using counts from ``corpus``.

    Only the first element of each pair is used; the count is looked up in
    ``corpus`` rather than taken from the pair itself.

    Args:
        items: Sequence of pairs whose first element (index 0) is a key of
            ``corpus``.
        corpus: Mapping from key to numeric count.

    Returns:
        A list with one value per pair, in input order: the corpus count of
        that pair's key divided by the sum of the corpus counts of all keys
        in ``items``. An empty input yields ``[]``.

    Raises:
        KeyError: If a key is missing from a plain ``dict`` corpus.
        ZeroDivisionError: If plain-number counts sum to zero.
    """
    denominator = sum(corpus[pair[0]] for pair in items)
    return [corpus[pair[0]] / denominator for pair in items]

def calc_dist_matrix(chars, tokens):
    """Build the character-to-token cost matrix.

    Entry ``[i, j]`` is ``log(len(token_j))`` when the first element of
    ``chars[i]`` is contained in the first element of ``tokens[j]``, and the
    fixed penalty ``1e2`` otherwise.

    Args:
        chars: Sequence of entries whose first element (index 0) is the
            character (or substring) to look for.
        tokens: Sequence of entries whose first element (index 0) is the
            token text.

    Returns:
        A NumPy array of shape ``(len(chars), len(tokens))``.
    """
    n_chars, n_tokens = len(chars), len(tokens)
    dist = np.zeros((n_chars, n_tokens))
    for r, char_entry in enumerate(chars):
        for c, token_entry in enumerate(tokens):
            if char_entry[0] not in token_entry[0]:
                # Token does not contain the character: assign the penalty cost.
                dist[r, c] = 1e2
                continue
            dist[r, c] = np.log(len(token_entry[0]))
    return dist

def write_vocab(chars, tokens, opt_matrix):
    """Collect the tokens whose column in ``opt_matrix`` has a non-zero entry.

    Column ``j`` of ``opt_matrix`` corresponds to ``tokens[j]``. A token is
    kept as soon as any row holds a non-zero value in its column; tokens are
    returned in column order.

    Args:
        chars: Not used by this function; kept so existing callers that pass
            it positionally keep working.
        tokens: Sequence of entries whose first element (index 0) is the
            token text.
        opt_matrix: Row-indexable two-dimensional matrix (list of lists or
            array) with one column per token.

    Returns:
        List of token texts that have at least one non-zero entry in their
        column. Returns ``[]`` when the matrix has no rows.
    """
    # An empty matrix has no first row to measure the column count from.
    if len(opt_matrix) == 0:
        return []
    n_cols = len(opt_matrix[0])
    return [
        tokens[j][0]
        for j in range(n_cols)
        # any() stops at the first non-zero entry, like the original break.
        if any(row[j] != 0 for row in opt_matrix)
    ]

def calc_alv(vocab):
    """Return the average length of the entries in ``vocab``.

    Args:
        vocab: Sized iterable of objects supporting ``len()`` (e.g. strings).

    Returns:
        Sum of the entry lengths divided by the number of entries.

    Raises:
        ZeroDivisionError: If ``vocab`` is empty.
    """
    return sum(map(len, vocab)) / len(vocab)

def calc_entropy(matrix, alv):
    """Compute the entropy of ``matrix`` entries, scaled by ``1 / alv``.

    The result is ``-sum(p * log(p)) / alv`` over every entry ``p`` of the
    matrix. Entries that are exactly zero contribute nothing (the limit of
    ``p * log(p)`` as ``p -> 0`` is 0); evaluating them directly would give
    ``0 * -inf = nan`` and turn the whole result into NaN.

    Args:
        matrix: Iterable of rows, each an iterable of numeric entries.
        alv: Divisor applied to the summed entropy.

    Returns:
        The summed entropy divided by ``alv``.
    """
    entropy = 0
    for row in matrix:
        for value in row:
            if value == 0:
                # Skip exact zeros so a single empty cell cannot poison the sum.
                continue
            entropy -= value * np.log(value)
    return entropy / alv

In [4]:
def VOLT(L, C, S, Dc):
    """Select a vocabulary from several candidate sizes via Sinkhorn transport.

    For every size in ``S`` the first ``size`` entries of ``L`` form the
    candidate token set. A transport matrix between the character
    distribution and the candidate token distribution is computed with
    ``ot.sinkhorn``, a vocabulary is read off that matrix, and its
    length-normalised entropy is recorded. The vocabulary returned is the one
    whose entropy rose the most compared with the preceding candidate in
    ``S``.

    Args:
        L: Mapping of token -> value; only its iteration order and keys are
            used here (the first ``size`` items form each candidate set).
        C: Mapping of character -> count, normalised into the source
            distribution.
        S: Iterable of candidate sizes, each used as a slice bound on the
            items of ``L``.
        Dc: Mapping of token -> count used to build the target distribution
            (presumably corpus token frequencies; verify against caller).

    Returns:
        The selected vocabulary as a list of tokens. With a single candidate
        that candidate's vocabulary is returned; with no candidates (empty
        ``S``) an empty list is returned instead of raising ``IndexError``.
    """
    char_items = list(C.items())
    token_items = list(L.items())
    Pc = calc_Pc(char_items)

    vocabularies = []
    for size in S:
        candidates = token_items[:size]
        Pt = calc_Pt(candidates, Dc)
        D = calc_dist_matrix(char_items, candidates)
        # The fourth positional argument (1.0) is the Sinkhorn regularisation term.
        optimal_matrix = ot.sinkhorn(Pc, Pt, D, 1.0, method='sinkhorn')
        vocab = write_vocab(char_items, candidates, optimal_matrix)
        entropy = calc_entropy(optimal_matrix, calc_alv(vocab))
        vocabularies.append((entropy, vocab))

    # Nothing to choose from when S produced no candidates.
    if not vocabularies:
        return []

    best_vocab = vocabularies[0][1]
    # Sentinel floor: only entropy differences above -1e6 can replace the default.
    best_diff = -1e6
    for i in range(1, len(vocabularies)):
        diff_entropy = vocabularies[i][0] - vocabularies[i - 1][0]
        if diff_entropy > best_diff:
            best_vocab = vocabularies[i][1]
            best_diff = diff_entropy
    return best_vocab