# Dummy word generator with estimated probability distribution over the alphabet
- corpus
- count char frequency
- index

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
import string
from models import DummyWordGenerator

In [3]:
# Baseline generator: lowercase ASCII letters plus the two special symbols
# " " and "."; no explicit weights `p` are supplied here.
dummy_gen = DummyWordGenerator(
    alphabet=string.ascii_lowercase,
    special_chars=" .",
)

In [6]:
# Show a sample of symbols from the generator built without explicit weights.
# `generate` is read as an attribute, not called -- presumably a property;
# confirm in models.py.
dummy_gen.generate

['l', 'i', 'g', 'y', 'q', 'x', 'h', 'm', 'g', 'v']

In [7]:
# Hand-made weights: every vowel gets weight 5, every other symbol weight 1.
# The list has one entry per symbol, in the order a-z followed by " " and ".".
# NOTE(review): these are raw weights, not probabilities -- presumably the
# generator normalises them; confirm in models.py.
vowels = set("aeiou")
p = [
    5 if symbol in vowels else 1
    for symbol in string.ascii_lowercase + " ."
]
p_gen = DummyWordGenerator(
    alphabet=string.ascii_lowercase,
    special_chars=" .",
    p=p,
)

In [9]:
# First five symbols sampled from the vowel-weighted generator.
p_gen.generate[:5]

['r', 's', 'x', 'o', 'z']

## Massive generation of words

In [10]:
# Value passed to each generator's `words(...)`; reused by later cells.
size = 10000
# Collect everything each generator's `words(size)` iterates over into a list.
dummy_list = list(dummy_gen.words(size))
p_list = list(p_gen.words(size))

In [12]:
# Preview the first five words from the generator built without weights.
dummy_list[:5]

['usbd',
 'rjynrhfbffdkustyfqvfbqw',
 'tcnfqkn',
 'vbzel',
 'mjcgaawnkztgwgenayrdfcnsz']

In [13]:
# Preview the first five words from the vowel-weighted generator.
p_list[:5]

['uoq', 'ceeemebuahyuaeojawmkregiuoiugntda', 'pqiooajiie', 'eiuuauu', 'my']

**Q1 how can we evaluate the quality of the generator?**

In [15]:
from index import WordIndex

In [14]:
# Load the dialogue corpus and split it into utterances.
# NOTE(review): this is a machine-specific absolute path; point it at your own
# copy of the dataset before running the notebook elsewhere.
local_file = '/Users/flint/Data/daily-dialogue/EMNLP_dataset/dialogues_text.txt'
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# locale default (assumes the file is UTF-8 encoded -- confirm for your copy).
with open(local_file, 'r', encoding='utf-8') as infile:
    raw = infile.read()
# Pieces are delimited by the '__eou__' marker; surrounding whitespace
# (including newlines) is stripped from each piece.
sentences = [x.strip() for x in raw.split('__eou__')]

In [16]:
# Build the word index over the corpus sentences. Later cells read `W.index`
# as a mapping from word to count.
W = WordIndex(sentences)

In [22]:
# A generated string counts as "correct" when it is longer than one character
# and appears as a key of the corpus index. Duplicates are kept, so the lists
# hold one entry per generated occurrence.
correct_dummy = [
    word for word in dummy_list
    if len(word) > 1 and word in W.index
]
correct_p = [
    word for word in p_list
    if len(word) > 1 and word in W.index
]

In [23]:
# Fraction of generated words found in the corpus index, as a pair:
# (generator without weights, vowel-weighted generator).
len(correct_dummy) / len(dummy_list), len(correct_p) / len(p_list)

(0.0123, 0.0118)

In [24]:
# Preview the first five generated words that were found in the corpus index.
correct_dummy[:5]

['bo', 'be', 'new', 'bo', 'na']

**Q2 how to compute p in a more reliable way?**

In [27]:
from collections import defaultdict

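$p(c) = \frac{count(c)}{\sum\limits_{c_i \in \Sigma} count(c_i)}$, where $\Sigma$ is the set of symbols (the lowercase letters plus the special characters) and $count(\cdot)$ is the number of occurrences of a symbol in the corpus index $W$.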

In [36]:
# Character frequencies estimated from the corpus index: every character of a
# (lower-cased) index key contributes that key's count.
index_chars = defaultdict(int)
for word, count in W.index.items():
    for c in word.lower():
        index_chars[c] += count
# The space symbol is assigned the sum of all counts in the index, replacing
# anything accumulated for ' ' in the loop above.
# NOTE(review): '.' receives no explicit count, so its weight comes only from
# index keys that contain a '.' (zero if there are none) -- confirm intended.
index_chars[' '] = sum(W.index.values())

In [37]:
# Corpus-derived weights, one per symbol, in the order a-z followed by " " and
# ".". Symbols absent from `index_chars` get the defaultdict's 0.
p_corpus = [
    index_chars[symbol]
    for symbol in string.ascii_lowercase + " ."
]

In [39]:
# Generator configured like the earlier ones, but weighted by the character
# counts estimated from the corpus.
corpus_gen = DummyWordGenerator(
    alphabet=string.ascii_lowercase,
    special_chars=" .",
    p=p_corpus,
)

In [40]:
# Collect the words from the corpus-weighted generator, using the same `size`
# as the earlier runs.
corpus_list = list(corpus_gen.words(size))

In [47]:
# Same "correct word" filter as before, applied to the corpus-weighted output.
# NOTE(review): this rebinds `correct_p`, discarding the list computed earlier
# for the vowel-weighted generator; re-running the earlier ratio cell after
# this one would mix the two experiments.
correct_p = [
    word for word in corpus_list
    if len(word) > 1 and word in W.index
]

In [48]:
# Fraction of corpus-weighted generated words found in the corpus index.
len(correct_p) / len(corpus_list)

0.0074

In [49]:
# Preview the first five corpus-weighted generated words found in the index.
correct_p[:5]

['ant', 'tom', 'tile', 'set', 'non']

**Q3 how can we take into account the sequence of chars in real words?**