# Dummy word generator with estimated probability distribution over the alphabet
- corpus
- count char frequency
- index

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
import string
from models import DummyWordGenerator

In [3]:
# Baseline generator: no `p` is supplied here, unlike the weighted generators built later.
# NOTE(review): presumably this means uniform sampling over the symbols — confirm in models.DummyWordGenerator.
dummy_gen = DummyWordGenerator(alphabet=string.ascii_lowercase, special_chars=" .")

In [4]:
# Hand-crafted weights, one per symbol of ascii_lowercase + " .": 5 for a vowel, 1 otherwise.
# NOTE(review): these are raw weights, not probabilities — presumably normalised
# inside DummyWordGenerator; confirm.
vowels = set("aeiou")
p = [5 if symbol in vowels else 1 for symbol in string.ascii_lowercase + " ."]
p_gen = DummyWordGenerator(
    alphabet=string.ascii_lowercase,
    special_chars=" .",
    p=p,
)

In [5]:
# Peek at the first five symbols produced by the weighted generator.
# NOTE(review): `generate` is subscripted, not called — presumably a property
# returning a sequence; confirm in models.DummyWordGenerator.
p_gen.generate[:5]

['e', 'e', 'y', 'r', 'u']

## Massive generation of words

In [6]:
# Materialise the output of `words(size)` for both generators so the two can be
# compared on equal footing (presumably `size` is the number of words — confirm).
size = 1000
dummy_list = list(dummy_gen.words(size))
p_list = list(p_gen.words(size))

In [7]:
# First five words from the generator built without `p`.
dummy_list[:5]

['vexvudbgeyqektyvxadirx',
 'axosf',
 'muxldlxvjvucghn',
 'tvumsaezdeqbwzztx',
 'gzyajtyjslcgabpgmgaelpqj']

In [8]:
# First five words from the vowel-weighted generator.
p_list[:5]

['udugieiejofii',
 'eaauewkedjktoeheeiuneqraylv',
 'ho',
 'rbnogbsiqomayadcpiusiartpeopercazqdio',
 'oneluzufzudoyuaou']

**Q1 how can we evaluate the quality of the generator?**

In [9]:
from index import WordIndex

In [10]:
# Load the dialogue corpus and split it into utterances on the `__eou__` marker.
# NOTE(review): absolute, machine-specific path — point this at your local copy
# of the dataset (ideally via a configurable data directory).
local_file = '/Users/flint/Data/daily-dialogue/EMNLP_dataset/dialogues_text.txt'
# Decode explicitly as UTF-8: the corpus contains non-ASCII characters (e.g. curly
# quotes), and the default encoding of open() depends on the platform/locale.
with open(local_file, 'r', encoding='utf-8') as infile:
    raw = infile.read()
# Surrounding whitespace (including newlines between dialogues) is stripped from
# each utterance; empty strings left by the split are kept.
sentences = [x.strip() for x in raw.split('__eou__')]

In [11]:
# Index the corpus utterances; `W.index` is used below for word-membership checks.
W = WordIndex(sentences)

In [12]:
# Keep only generated strings longer than one character that also appear in the
# corpus word index; single characters are excluded from the evaluation.
correct_dummy = [
    word for word in dummy_list
    if len(word) > 1 and word in W.index
]
correct_p = [
    word for word in p_list
    if len(word) > 1 and word in W.index
]

In [13]:
# Fraction of generated strings that are corpus words (length > 1), for the
# unweighted and the vowel-weighted generator respectively.
len(correct_dummy) / len(dummy_list), len(correct_p) / len(p_list)

(0.006, 0.007)

In [14]:
# First five generated strings that were found in the corpus word index.
correct_dummy[:5]

['mo', 'by', 'th', 'mg', 'pa']

**Q2 how to compute p in a more reliable way?**

In [15]:
from collections import defaultdict

$p(c) = \frac{count(c)}{\sum\limits_{i \in A} count(c_i)}$

In [16]:
from index import UnigramIndex, WordIndex

In [17]:
# Character-level statistics derived from the word index, with lower-casing enabled.
U = UnigramIndex.from_word_index(W, lower_case=True)
# One entry per symbol, in the same order as ascii_lowercase + " .".
# NOTE(review): whether U[...] yields raw counts or probabilities is defined in
# index.UnigramIndex — confirm.
p_corpus = [U[symbol] for symbol in string.ascii_lowercase + " ."]

In [18]:
# Same generator as before, now driven by the per-symbol values estimated from the corpus.
corpus_gen = DummyWordGenerator(
    alphabet=string.ascii_lowercase,
    special_chars=" .",
    p=p_corpus,
)

In [19]:
# Materialise the output of the corpus-driven generator, using the same `size` as before.
corpus_list = list(corpus_gen.words(size))

In [20]:
# Same filter as for the earlier generators: length > 1 and present in the corpus index.
# NOTE(review): this rebinds `correct_p`, discarding the vowel-weighted results
# stored under that name in an earlier cell.
correct_p = [
    word for word in corpus_list
    if len(word) > 1 and word in W.index
]

In [21]:
# Fraction of corpus-driven generated strings that are corpus words (length > 1).
len(correct_p) / len(corpus_list)

0.055

In [22]:
# First five matches; at this point `correct_p` holds the corpus-driven results.
correct_p[:5]

['ou', 'ai', 'tee', 'ai', 'pi']

**Q3 how can we take into account the sequence of chars in real words?**

$p(c) = \frac{count(c)}{\sum\limits_{i \in A}count(c_i)}$

$$
p(c_i \mid c_1, c_2, \dots, c_{i-1}) = \frac{count(c_1, c_2, \dots, c_{i-1}, c_i)}{count(c_1, c_2, \dots, c_{i-1})}
$$

$$
p(c_i \mid c_{i-k}, \dots, c_{i-1}) = \frac{count(c_{i-k}, \dots, c_{i-1}, c_i)}{count(c_{i-k}, \dots, c_{i-1})}
$$

### N-Gram indexing

In [23]:
from index import BiGramIndex

In [24]:
# Character bigram statistics built from the word index.
# NOTE(review): unlike the unigram index, no `lower_case` flag is passed here, and
# the DataFrame view of `b.index` further down shows upper-case keys such as 'T'
# and 'X' — confirm this asymmetry is intended.
b = BiGramIndex.from_word_index(W)

In [25]:
# Usage
# Usage: index with a (first, second) tuple to get the stored value for that pair;
# `frequency(char)` gives the total stored for `char` (checked against the sum of
# b.index['a'] in the next cell).
b[('a', 'b')], b.frequency('a')

(8926, 342007)

In [26]:
# Sanity check: summing every value stored under 'a' should reproduce b.frequency('a').
sum(b.index['a'].values())

342007

In [27]:
# Relative frequency of the pair ('a', 'b') with respect to everything stored
# under 'a' (all symbols in the index, not only the sampling alphabet).
b[('a', 'b')] / b.frequency('a')

0.026098881017055205

In [28]:
import numpy as np

In [29]:
# Sampling alphabet: the 26 lowercase letters followed by the two special symbols
# (space and full stop), which the sampling loop treats as end-of-word markers.
special = [' ', '.']
alphabet = list(string.ascii_lowercase) + special

In [30]:
def conditional(alphabet, previous, bindex):
    """Return the distribution of the next symbol given the previous one.

    Parameters
    ----------
    alphabet : sequence
        Candidate next symbols; the result has one entry per element, in the
        same order.
    previous : hashable
        The symbol that precedes the one being sampled.
    bindex : mapping-like
        Object supporting ``bindex[(previous, c)]`` and returning a
        non-negative count for that pair.

    Returns
    -------
    numpy.ndarray
        Float array of ``len(alphabet)`` values summing to 1 (empty when
        ``alphabet`` is empty).
    """
    counts = np.array([bindex[(previous, c)] for c in alphabet], dtype=float)
    if counts.size == 0:
        return counts
    total = counts.sum()
    if total == 0:
        # `previous` was never followed by any symbol of `alphabet`: dividing
        # by zero would give NaNs, which np.random.choice rejects. Fall back
        # to a uniform distribution so sampling can continue.
        return np.full(counts.size, 1.0 / counts.size)
    return counts / total

In [31]:
# Distribution over `alphabet` for the symbol following 'a'; entries follow the
# order of `alphabet`, so the last two correspond to ' ' and '.'.
conditional(alphabet, 'a', b)

array([3.51032181e-05, 2.61109437e-02, 3.05105471e-02, 2.84862615e-02,
       2.63274136e-04, 9.10635983e-03, 1.57730460e-02, 5.16894886e-03,
       2.86237491e-02, 6.93288557e-04, 2.44289145e-02, 8.04507253e-02,
       2.65175560e-02, 1.80913210e-01, 3.83210131e-04, 1.49919994e-02,
       2.34021454e-05, 9.93538083e-02, 6.64591676e-02, 1.49525083e-01,
       8.80213194e-03, 4.96973810e-02, 4.32647163e-03, 1.73468403e-03,
       5.00893669e-02, 1.58257008e-03, 9.58493370e-02, 9.94591179e-05])

In [32]:
# Stored value for 'a' followed by a space.
b[('a', ' ')]

32766

In [40]:
# Sample one word symbol by symbol: the first symbol is drawn uniformly from
# `alphabet`, every later one from the bigram-conditional distribution given the
# symbol just emitted. Drawing a special symbol ends the word; it is not appended,
# so an empty string is printed if the very first draw is special.
word = []
previous = None
while True:
    if previous is not None:
        c = np.random.choice(
            alphabet,
            p=conditional(alphabet=alphabet, previous=previous, bindex=b),
        )
    else:
        c = np.random.choice(alphabet)
    if c in special:
        break
    word.append(c)
    previous = c
print("".join(word))

hangoman


In [34]:
# Last symbol appended to the sampled word before a special symbol ended it.
# NOTE(review): the recorded output comes from an earlier run (execution count 34)
# than the sampling cell above (execution count 40), so it need not match the
# printed word — re-run top to bottom to refresh.
previous

'e'

### Other options for bidimensional (sparse) index

#### Numpy matrix

In [43]:
# Dense square matrix over `alphabet`: row = outer key of `b.index`, column = key of
# the nested mapping stored under it. Symbols outside `alphabet` are ignored and
# pairs absent from the index stay at 0.
m = np.zeros((len(alphabet), len(alphabet)))
for c, following in b.index.items():
    if c not in alphabet:
        continue
    i = alphabet.index(c)
    for k, value in following.items():
        if k not in alphabet:
            continue
        j = alphabet.index(k)
        m[i, j] = value

In [45]:
# Column sums: for each symbol of `alphabet`, the total stored across all rows,
# i.e. how often it appears as the nested (second) key within `alphabet`.
m.sum(axis=0)

array([2.316440e+05, 2.143100e+04, 5.938200e+04, 1.080270e+05,
       4.907410e+05, 3.932200e+04, 6.814400e+04, 1.506530e+05,
       2.040500e+05, 1.589000e+03, 5.056800e+04, 1.571050e+05,
       6.069700e+04, 2.430080e+05, 3.274480e+05, 4.227400e+04,
       8.790000e+02, 2.192240e+05, 1.722720e+05, 2.453510e+05,
       1.525850e+05, 3.953500e+04, 3.169100e+04, 7.016000e+03,
       8.412600e+04, 2.231000e+03, 1.251469e+06, 7.210000e+03])

#### Pandas DataFrame

In [48]:
# Dict-of-dicts to DataFrame: outer keys of `b.index` become columns, nested keys
# become rows; pairs missing from the index are filled with 0. Unlike the numpy
# matrix, this keeps every symbol in the index, not only those in `alphabet`.
df = pd.DataFrame(b.index).fillna(0)

In [49]:
# Display the full bigram table (pandas truncates the rendering of large frames).
df

Unnamed: 0,T,h,e,k,i,t,c,n,s,.,...,“,”,*,,@,_,\,、,+,#
h,16574.0,40.0,354.0,34.0,11.0,96882.0,16326.0,63.0,11168.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
e,404.0,82514.0,20975.0,18981.0,8909.0,30113.0,14441.0,25614.0,29357.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
a,557.0,56243.0,32635.0,928.0,3313.0,14500.0,19556.0,5695.0,6042.0,8.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
o,846.0,18044.0,2480.0,89.0,9865.0,45838.0,19970.0,16608.0,17122.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
u,182.0,1516.0,169.0,22.0,229.0,4984.0,3292.0,2372.0,7401.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
:,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
",",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Markov generation process