# Examples of text corpora from which to get data

## Dialogues
A high-quality multi-turn dialog dataset.
> Li, Y., Su, H., Shen, X., Li, W., Cao, Z., & Niu, S. (2017). Dailydialog: A manually labelled multi-turn dialogue dataset. arXiv preprint arXiv:1710.03957. [download](https://aclanthology.org/I17-1099/)

In [1]:
# Location of the DailyDialog text dump; utterances in it are separated by '__eou__'.
local_file = '/Users/flint/Data/daily-dialogue/EMNLP_dataset/dialogues_text.txt'
# Decode explicitly as UTF-8: the corpus contains non-ASCII characters (e.g.
# curly quotes), and the platform default encoding is not UTF-8 everywhere.
with open(local_file, 'r', encoding='utf-8') as infile:
    raw = infile.read()
# One entry per '__eou__'-delimited chunk, with surrounding whitespace removed.
# NOTE(review): whatever follows the final '__eou__' (likely an empty string)
# is kept as a last entry -- filter it out if that is not wanted.
sentences = [x.strip() for x in raw.split('__eou__')]

In [2]:
# Show the first two utterances together with the total number of entries.
sentences[:2], len(sentences)

(['The kitchen stinks .', "I'll throw out the garbage ."], 102981)

- input: list of strings
- tokenization
- storage: index

In [3]:
import nltk
from collections import defaultdict

In [4]:
# Tokenize one sample utterance to see how NLTK splits it
# (the contraction "I'll" comes back as two tokens).
nltk.word_tokenize("I'll throw out the garbage .")

['I', "'ll", 'throw', 'out', 'the', 'garbage', '.']

## Indexing

In [5]:
from collections import defaultdict

In [19]:
# Word-frequency index: lower-cased token -> number of occurrences in the corpus.
I = defaultdict(int)
for utterance in sentences:
    for token in nltk.word_tokenize(utterance):
        I[token.lower()] += 1

In [21]:
# Live view over the keys of the word index, i.e. the distinct lower-cased tokens.
vocabulary = I.keys()

## Check generator

In [31]:
import string
from wordgenerators import DummyWordGenerator, END_CHAR
import numpy as np

In [25]:
# Build the project's DummyWordGenerator over the 26 lowercase ASCII letters.
# NOTE(review): presumably it draws characters from this alphabet -- confirm in wordgenerators.
d = DummyWordGenerator(string.ascii_lowercase)

In [27]:
# Draw 1000 strings from the dummy generator ...
words = [d.generate() for _ in range(1000)]
# ... and keep those that are real tokens of the corpus vocabulary.
valid = [candidate for candidate in words if candidate in vocabulary]

In [28]:
# Fraction of the generated strings that occur in the vocabulary.
len(valid) / len(words)

0.037

In [29]:
# First five generated strings that were found in the vocabulary.
valid[:5]

['f', 'c', 'r', 'e', 'h']

## Char Index
$$
p(c) = \frac{count(c)}{\sum\limits_{i} count(c_i)}
$$

In [37]:
import pandas as pd

In [32]:
# Character unigram counts, weighted by how often each word occurs in the corpus.
U = defaultdict(int)
for token, count in I.items():
    for char in token:
        U[char] += count
    # Each occurrence of the word also contributes one END_CHAR.
    U[END_CHAR] += count

In [38]:
# Character counts as a pandas Series indexed by character.
Us = pd.Series(U)

In [43]:
def p(w):
    """Return the relative frequency of symbol ``w`` in the unigram counts ``Us``.

    Implements p(c) = count(c) / sum_i count(c_i). ``Us`` is read when the
    function is called, so the result reflects the Series' current contents.
    """
    return Us[w] / Us.sum()

In [44]:
# Unigram probability of the character 'a'.
p('a')

0.05562784943599184

## N-gram Index

$$
p(c_i \mid c_1, \dots, c_{i-1}) = \frac{count(c_1, \dots, c_i)}{\sum\limits_{c'} count(c_1, \dots, c_{i-1}, c')} = \frac{count(c_1, \dots, c_i)}{count(c_1, \dots, c_{i-1})}
$$

$$
p(c_i \mid c_{i-n}, \dots, c_{i-1}) = \frac{count(c_{i-n}, \dots, c_i)}{count(c_{i-n}, \dots, c_{i-1})}
$$

In [60]:
# Flat bigram index: (previous_char, char) -> count, where '' stands for the
# start of a word.
N = defaultdict(int)
for w, freq in I.items():
    previous = ''
    for c in w:
        # Weight each bigram by the word's corpus frequency, consistent with
        # the unigram counts in U.
        N[(previous, c)] += freq
        previous = c

```
{
    a : {
        t: v(a, t),
        s: v(a, s),
        ...
    },
    ...
}
```

In [87]:
# Nested bigram index: N[previous_char][next_char] -> count.
# The inner default of 0 lets counts be accumulated with += directly.
N = defaultdict(lambda: defaultdict(int))
# Known symbols, seeded with the two markers: '' and END_CHAR.
alphabet = {'', END_CHAR}

In [88]:
# Populate the nested bigram index and collect every symbol that occurs.
# Work on a copy so the cell also runs when `alphabet` is already a list,
# i.e. when this cell is executed a second time.
# NOTE: re-running without re-creating N adds the counts a second time.
symbols = set(alphabet)
for w, freq in I.items():
    if not w:
        # An empty token has no characters (w[-1] would raise IndexError).
        continue
    previous = ''  # '' is the context used for the first character of a word
    for c in w:
        symbols.add(c)
        N[previous][c] += freq
        previous = c
    # Close the word: transition from its last character to END_CHAR.
    N[previous][END_CHAR] += freq
# Sort so every symbol keeps the same position on every run; the iteration
# order of a set of strings can differ between interpreter sessions.
alphabet = sorted(symbols)

In [74]:
# p(t | a): share of all counts following 'a' that belong to 't'.
p_ta = N['a']['t'] / sum(N['a'].values())

In [96]:
def get_prob(previous, alphabet, bindex):
    """Return the next-symbol distribution that follows ``previous``.

    Parameters
    ----------
    previous : hashable
        Context symbol, i.e. the key looked up in ``bindex``.
    alphabet : list
        Ordered symbols; position ``k`` of the result refers to ``alphabet[k]``.
    bindex : mapping
        Nested counts, ``bindex[previous][symbol] -> count``.

    Returns
    -------
    numpy.ndarray
        Vector of length ``len(alphabet)`` holding the relative frequency of
        each symbol after ``previous``. It is all zeros when ``previous`` has
        no recorded successors.

    Raises
    ------
    ValueError
        If a successor symbol is missing from ``alphabet``.
    """
    p = np.zeros(len(alphabet))
    # .get() avoids inserting an empty entry when bindex is a defaultdict.
    successors = bindex.get(previous, {})
    # Sum once instead of once per successor.
    total = sum(successors.values())
    if total == 0:
        return p
    for s, count in successors.items():
        p[alphabet.index(s)] = count / total
    return p

In [97]:
def bigram_gen(bindex, alphabet, top_k=1000, previous_char='', rng=None):
    """Sample one word, character by character, from a bigram index.

    Parameters
    ----------
    bindex : mapping
        Nested counts, ``bindex[previous][symbol] -> count``.
    alphabet : list
        Ordered symbols; passed to ``get_prob`` and sampled from.
    top_k : int
        Maximum number of characters to generate.
    previous_char : str
        Context used for the first character ('' by default).
    rng : optional
        Object with a NumPy-style ``choice(a, p=...)`` method, e.g.
        ``np.random.default_rng(seed)``, for reproducible sampling. When
        omitted, the global ``np.random`` state is used.

    Returns
    -------
    str
        The generated characters joined together; END_CHAR is not included.
    """
    sampler = np.random if rng is None else rng
    word = []
    for _ in range(top_k):
        p = get_prob(previous_char, alphabet, bindex)
        if not p.any():
            # Dead end: this context has no successor counts, so there is no
            # valid distribution to sample from (choice() would raise).
            break
        # str() turns the NumPy string scalar returned by choice() into a plain str.
        next_char = str(sampler.choice(alphabet, p=p))
        if next_char == END_CHAR:
            break
        previous_char = next_char
        word.append(next_char)
    return "".join(word)

In [107]:
# Draw a single sample word from the bigram model.
bigram_gen(N, alphabet)

'odaherr'

In [108]:
# Draw 1000 words from the bigram model ...
words = [bigram_gen(N, alphabet) for _ in range(1000)]
# ... and keep those that are real tokens of the corpus vocabulary.
valid = [candidate for candidate in words if candidate in vocabulary]

In [109]:
# Fraction of the bigram-generated words that occur in the vocabulary.
len(valid) / len(words)

0.524

In [112]:
# Valid generated words longer than three characters.
[x for x in valid if len(x) > 3]

['ther',
 'cred',
 'that',
 'meng',
 'wore',
 'wire',
 'ream',
 'that',
 'care',
 'wang',
 'dork',
 'dome',
 'tans',
 'there']

## Multilevel indexes

$$
\begin{bmatrix}
1.3 & 3 & \dots & 0 \\
0 & 0 & \dots & 4 \\
\end{bmatrix}
$$

In [119]:
# Bigram counts as a table: after the transpose, rows are the previous symbol
# and columns the next symbol; pairs that were never observed are filled with 0.
Ndf = pd.DataFrame(N).T.fillna(0)

In [120]:
# First rows of the bigram count table.
Ndf.head()

Unnamed: 0,t,k,s,.,i,',o,g,d,",",...,”,*,,@,\,、,+,#,#E,_
,162451.0,8608.0,74545.0,117890.0,121593.0,28542.0,52245.0,29304.0,42260.0,54565.0,...,91.0,5.0,1.0,9.0,6.0,3.0,10.0,1.0,0.0,0.0
t,6078.0,0.0,6266.0,477.0,23886.0,10.0,46703.0,65.0,58.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,154954.0,1.0
h,9079.0,4.0,384.0,107.0,29124.0,2.0,22977.0,0.0,373.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25903.0,0.0
e,20908.0,1726.0,36610.0,696.0,3437.0,10.0,2497.0,1940.0,21389.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,221970.0,0.0
k,53.0,9.0,3732.0,132.0,6810.0,2.0,198.0,70.0,17.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24836.0,0.0


In [131]:
# Rows 2-9 and the columns at positions 2, 3 and 6, selected purely by position.
Ndf.iloc[2:10, [2, 3, 6]]

Unnamed: 0,s,.,o
h,384.0,107.0,22977.0
e,36610.0,696.0,2497.0
k,3732.0,132.0,198.0
i,35310.0,13.0,9867.0
c,311.0,13.0,21982.0
n,6938.0,215.0,21261.0
s,8588.0,521.0,19879.0
.,251.0,3479.0,68.0


In [133]:
# Label-based lookup: the count rows for the previous symbols 'a' and 's'.
Ndf.loc[['a', 's']]

Unnamed: 0,t,k,s,.,i,',o,g,d,",",...,”,*,,@,\,、,+,#,#E,_
a,51425.0,8359.0,23126.0,38.0,9878.0,110.0,132.0,5449.0,9842.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33778.0,1.0
s,31490.0,1967.0,8588.0,521.0,13072.0,15.0,19879.0,42.0,318.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,115570.0,2.0
