# Dummy word generator with estimated probability distribution over the alphabet
- corpus
- count char frequency
- index

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
import string
from models import DummyWordGenerator

In [3]:
# Baseline generator: lowercase letters plus the special characters " .",
# with no explicit per-symbol weights passed in.
dummy_gen = DummyWordGenerator(
    alphabet=string.ascii_lowercase,
    special_chars=" .",
)

In [4]:
# Inspect one raw draw: `generate` is read as an attribute (no call) and
# displays a list of single characters.
dummy_gen.generate

['p',
 'z',
 'b',
 'f',
 'u',
 'r',
 'y',
 'x',
 's',
 'c',
 'p',
 'k',
 'q',
 'l',
 'c',
 'm',
 'r',
 'y',
 'h',
 'f',
 'p',
 'r',
 'o',
 'w',
 'c',
 'g',
 'l',
 'g',
 'b',
 'y',
 'x',
 's',
 'w',
 'k',
 't',
 'o',
 'i',
 'w',
 'r',
 'p',
 'z',
 'l',
 'f',
 'm',
 'i',
 'x',
 's',
 'b',
 'p',
 'k',
 't',
 'y',
 'z',
 'n',
 'j',
 'k',
 'q',
 'b',
 'z',
 'g',
 'l',
 'x']

In [5]:
# Hand-made weights: one entry per symbol (a-z followed by " ."),
# 5 for vowels and 1 for everything else.
vowels = set("aeiou")
symbols = string.ascii_lowercase + " ."
p = [1 if ch not in vowels else 5 for ch in symbols]
# Generator over the same alphabet, this time with the per-symbol weights.
p_gen = DummyWordGenerator(alphabet=string.ascii_lowercase, special_chars=" .", p=p)

In [6]:
# First five symbols of a draw from the vowel-weighted generator.
p_gen.generate[:5]

['a', 'n', 'u', 'o', 'u']

## Massive generation of words

In [7]:
# Number of words requested from each generator.
size = 1000
# Materialise both outputs as lists so they can be sliced and reused below.
dummy_list = list(dummy_gen.words(size))
p_list = list(p_gen.words(size))

In [8]:
# Peek at the first five words from the unweighted generator.
dummy_list[:5]

['', 'gsvxqyirpbug', 'mhcnkybgtjhshms', 'pidxcbohcpqbe', 'ytlb']

In [9]:
# Peek at the first five words from the vowel-weighted generator.
p_list[:5]

['neixqhacgevoeimaiqeibisuakijrliwdeevaiq',
 'zbjojsvahaourxxooieadipntu',
 'utdswooigwkbuuejiwuoixeaezkhomeoe',
 'ada',
 'eaiueyeoluuxosmkudotkeergaojeoeeqgssriuiekoyueeoueeuzakvsaioaxtoohaeoau']

**Q1: How can we evaluate the quality of the generator?**

In [10]:
from index import WordIndex

In [11]:
# Read the dialogue corpus and split it into sentences.
# NOTE(review): machine-specific absolute path — adjust it to wherever the
# dataset lives locally (ideally a configurable data directory).
local_file = '/Users/flint/Data/daily-dialogue/EMNLP_dataset/dialogues_text.txt'
# Explicit encoding: without it open() falls back to the platform locale,
# which can fail or mis-decode the text on non-UTF-8 systems.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open(local_file, 'r', encoding='utf-8') as infile:
    raw = infile.read()
# Split on the '__eou__' delimiter (presumably "end of utterance") and trim
# the whitespace/newlines left around each piece.
sentences = [x.strip() for x in raw.split('__eou__')]

In [12]:
# Build the word index over the corpus sentences; its `index` attribute is
# what the membership checks below test against.
W = WordIndex(sentences)

In [13]:
def _in_index(words):
    """Return the words longer than one character that are present in ``W.index``."""
    return [w for w in words if len(w) > 1 and w in W.index]


# Generated words that also occur in the corpus index, per generator.
correct_dummy = _in_index(dummy_list)
correct_p = _in_index(p_list)

In [14]:
# Fraction of generated words that passed the index check, for the
# unweighted and the vowel-weighted generator respectively.
len(correct_dummy) / len(dummy_list), len(correct_p) / len(p_list)

(0.009, 0.011)

In [15]:
# A few of the unweighted generator's words that were found in the index.
correct_dummy[:5]

['vs', 'kg', 'xx', 'pong', 'vol']

**Q2: How can we compute p in a more reliable way?**

In [16]:
from collections import defaultdict

$p(c) = \frac{count(c)}{\sum\limits_{i \in A} count(c_i)}$, where $A$ is the set of symbols (the lowercase letters plus the special characters).

In [17]:
from index import UnigramIndex, WordIndex

In [18]:
# Unigram index derived from the word index, built with lower_case=True.
U = UnigramIndex.from_word_index(W, lower_case=True)
# One looked-up value per symbol, in the same a-z then " ." order used for
# the hand-made weights earlier in the notebook.
symbols = string.ascii_lowercase + " ."
p_corpus = [U[ch] for ch in symbols]

In [19]:
# Generator whose per-symbol weights come from the corpus (p_corpus).
corpus_gen = DummyWordGenerator(
    alphabet=string.ascii_lowercase,
    special_chars=" .",
    p=p_corpus,
)

In [20]:
# Materialise `size` words from the corpus-weighted generator.
corpus_list = list(corpus_gen.words(size))

In [21]:
# Corpus-weighted words (longer than one character) that occur in W.index.
# NOTE(review): this rebinds `correct_p`, which earlier held the matches for
# p_list; the cells below read the new value.
correct_p = [
    word
    for word in corpus_list
    if len(word) > 1 and word in W.index
]

In [22]:
# Fraction of corpus-weighted words that passed the index check.
len(correct_p) / len(corpus_list)

0.064

In [23]:
# A few of the corpus-weighted generator's words that were found in the index.
correct_p[:5]

['ect', 'en', 'on', 'tor', 'me']

**Q3 how can we take into account the sequence of chars in real words?**

$p(c) = \frac{count(c)}{\sum\limits_{i \in A}count(c_i)}$

$$
p(c_i \mid c_1, c_2, \dots, c_{i-1}) = \frac{count(c_1, c_2, \dots, c_{i-1}, c_i)}{count(c_1, c_2, \dots, c_{i-1})}
$$

$$
p(c_i \mid c_{i-k}, \dots, c_{i-1}) = \frac{count(c_{i-k}, \dots, c_{i-1}, c_i)}{count(c_{i-k}, \dots, c_{i-1})}
$$

### N-Gram indexing

In [24]:
from index import BiGramIndex

In [25]:
# Build the bigram index from the existing word index W.
b = BiGramIndex.from_word_index(W)

In [28]:
# Usage: subscript the index with a pair of symbols; `frequency` takes a single symbol.
# NOTE(review): presumably the count of the pair ('a', 'b') and the count of
# 'a' on its own — confirm against the BiGramIndex implementation.
b[('a', 'b')], b.frequency('a')

(385, 12909)

### Other options for bidimensional (sparse) index

#### Numpy matrix

#### Pandas DataFrame

## Markov generation process