# Dummy word generator with estimated probability distribution over the alphabet
- corpus
- count char frequency
- index

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
import string
from models import DummyWordGenerator

In [3]:
dummy_gen = DummyWordGenerator(alphabet=string.ascii_lowercase, special_chars=" .")

In [4]:
dummy_gen.generate

['v', 'e', 'e', 'p']

In [5]:
vowels = {'a', 'e', 'i', 'o', 'u'}
p = [5 if x in vowels else 1 for x in string.ascii_lowercase + " ."]
p_gen = DummyWordGenerator(
    alphabet=string.ascii_lowercase, 
    special_chars=" .", p=p)

In [6]:
p_gen.generate[:5]

['a', 'e', 'm', 'm', 'e']

## Massive generation of words

In [7]:
size = 1000
dummy_list = [x for x in dummy_gen.words(size)]
p_list = [x for x in p_gen.words(size)]

In [8]:
dummy_list[:5]

['hvupfjhjfg', 'cxefbpnzcrrtnfa', 'rfkpc', 'bdrdhvrydwggdit', 'ahwz']

In [9]:
p_list[:5]

['t', '', 'xiljyyiaacoaey', 'zfvrm', 'ooaioifcafteoaatauirupzpl']

**Q1 how can we evaluate the quality of the generator?**

In [10]:
from index import WordIndex

In [11]:
local_file = '/Users/flint/Data/daily-dialogue/EMNLP_dataset/dialogues_text.txt'
with open(local_file, 'r') as infile:
    raw = infile.read()
sentences = [x.strip() for x in raw.split('__eou__')]

In [12]:
W = WordIndex(sentences)

In [13]:
correct_dummy = [w for w in dummy_list if len(w) > 1 and w in W.index]
correct_p = [w for w in p_list if len(w) > 1 and w in W.index]

In [14]:
len(correct_dummy) / len(dummy_list), len(correct_p) / len(p_list)

(0.011, 0.01)

In [15]:
correct_dummy[:5]

['up', 'we', 'on', 'far', 'ti']

**Q2 how to compute p is a more reliable way?**

In [16]:
from collections import defaultdict

$p(c) = \frac{count(c)}{\sum\limits_{i \in W} count(c_i)}$

In [26]:
from index import UnigramIndex, WordIndex

In [36]:
U = UnigramIndex.from_word_index(W, lower_case=True)
p_corpus = [U[c] for c in string.ascii_lowercase + " ."]

In [37]:
corpus_gen = DummyWordGenerator(
    alphabet=string.ascii_lowercase, special_chars=" .", p=p_corpus)

In [38]:
corpus_list = [x for x in corpus_gen.words(size)]

In [39]:
correct_p = [w for w in corpus_list if len(w) > 1 and w in W.index]

In [40]:
len(correct_p) / len(corpus_list)

0.015

In [41]:
correct_p[:5]

['to', 'lo', 'la', 'en', 'is']

In [92]:
def index_word(word, index):
    for j, c in enumerate(word):
        if j == 0:
            s = '#s'
        else:
            s = word[j-1]
        index[s][c] += 1

In [93]:
i = defaultdict(lambda: defaultdict(lambda: 0))
index_word('thing', index=i)
index_word('think', index=i)
index_word('the', index=i)

In [95]:
for k, v in i.items():
    print(k)
    for x, y in v.items():
        print('\t', x, y)

#s
	 t 3
t
	 h 3
h
	 i 2
	 e 1
i
	 n 2
n
	 g 1
	 k 1


In [82]:
for k, v in i['th'].items():
    print(k, v)

i 2
e 1


In [89]:
def count(idx, s, last_char=None):
    if last_char is not None:
        return idx[s][c]
    else:
        continuations = idx[s]
        return sum(continuations.values())

In [90]:
count(i, 'th', last_char=None)

3

In [96]:
count(i, 'h', last_char='e') / count(i, 'h', last_char=None)

0.0

In [88]:
i['thin']['g']

1

**Q3 how can we take into account the sequence of chars in real words?**

$p(c) = \frac{count(c)}{\sum\limits_{i \in A}count(c_i)}$

$$
p(c_i \mid c_1, c_2, \dots, c_{i-1}) = \frac{count(c_1, c_2, \dots, c_{i-1}, c_i)}{count(c_1, c_2, \dots, c_{i-1})}
$$

$$
p(c_i \mid c_{i-k}, \dots, c_{i-1}) = \frac{count(c_{i-k}, \dots, c_{i-1}, c_i)}{count(c_{i-k}, \dots, c_{i-1})}
$$

In [64]:
k = defaultdict(lambda: defaultdict(lambda: 0))

In [68]:
k['thin']['g']

0