# Example of Markov chain to model sequences of characters

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

## Get list of words and model them as sequences of characters

In [2]:
import nltk
from collections import defaultdict

In [3]:
W = list(nltk.corpus.gutenberg.words('melville-moby_dick.txt'))

In [4]:
np.random.shuffle(W)
sample = [x for x in W[:20000] if len(x) > 1]

## Full chain

In [5]:
import string

In [6]:
I = defaultdict(lambda: defaultdict(lambda: 0.00001))
alphabet = set()
for word in tqdm(sample):
    seq = list(word) + ['#E']
    for i, c in enumerate(seq):
        if c not in string.punctuation:
            alphabet.add(c)
            I[tuple(seq[:i])][c] += 1
A = list(alphabet)

  0%|          | 0/16389 [00:00<?, ?it/s]

In [7]:
def P(sequence, pindex):
    key = tuple(sequence)
    data = pindex[key]
    N = sum([data[a] for a in A])
    return [data[a] / N for a in A]

In [8]:
short = pd.Series(dict([(A[i], s) for i, s in enumerate(P('g', I))]))
medium = pd.Series(dict([(A[i], s) for i, s in enumerate(P('ga', I))]))
long = pd.Series(dict([(A[i], s) for i, s in enumerate(P('gam', I))]))

In [100]:
selchar = ['r', 'o', 'e', 'a', 'l', 'm', '#E']

In [101]:
round(short.sort_values(ascending=False), 2).loc[selchar]

r     0.28
o     0.20
e     0.17
a     0.10
l     0.10
m     0.00
#E    0.00
dtype: float64

In [102]:
round(medium.sort_values(ascending=False), 2).loc[selchar]

r     0.07
o     0.00
e     0.00
a     0.00
l     0.19
m     0.15
#E    0.00
dtype: float64

In [103]:
round(long.sort_values(ascending=False), 2).loc[selchar]

r     0.00
o     0.00
e     0.75
a     0.00
l     0.00
m     0.00
#E    0.25
dtype: float64

### Generate a word

In [12]:
def generate(A, I, start=''):
    word = start
    for i in range(20):
        new_char = np.random.choice(A, p=P(word, I))
        if new_char == '#E':
            break
        else:
            word += new_char
    return word

In [13]:
word_stats = defaultdict(lambda: 0)
for i in range(1000):
    word_stats[generate(A, I)] += 1

In [14]:
pd.Series(word_stats).sort_values(ascending=False)

the          69
of           32
and          27
in           24
to           13
             ..
stood         1
palpably      1
stuff         1
can           1
conversed     1
Length: 575, dtype: int64

### Evaluate words probability

In [15]:
word = 'the'
prob = 0
for i, c in enumerate(word):
    x = P("".join(word[:i]), I)[A.index(c)]
    prob += np.log(x)
prob

-2.510840339213326

## Approximate by the Markov assumption using n=3

In [16]:
I3 = defaultdict(lambda: defaultdict(lambda: 0.00001))
for word in tqdm(sample):
    seq = list(word) + ['#E']
    for i, c in enumerate(seq):
        if c not in string.punctuation:
            I3[tuple(seq[:i][-3:])][c] += 1

  0%|          | 0/16389 [00:00<?, ?it/s]

In [17]:
def generate3(A, I, start=''):
    word = start
    for i in range(20):
        if len(word) < 3:
            new_char = np.random.choice(A, p=P(word, I))
        else:
            new_char = np.random.choice(A, p=P(word[-3:], I))
        if new_char == '#E':
            break
        else:
            word += new_char
    return word

In [18]:
word_stats = defaultdict(lambda: 0)
for i in range(1000):
    word_stats[generate(A, I3)] += 1

In [19]:
pd.Series(word_stats).sort_values(ascending=False)

the                     55
of                      29
and                     28
to                      26
in                      20
                        ..
windczaT                 1
nownO5fFFinCq7uFSaKe     1
WhalMtyuZK3EVCsxYXNJ     1
feelb                    1
thre5cnIDr51p7wDV1Fr     1
Length: 690, dtype: int64

In [20]:
word = 'the'
prob = 0
for i, c in enumerate(word):
    x = P("".join(word[:i]), I3)[A.index(c)]
    prob += np.log(x)
prob

-2.510840339213326

## Work with sentences
Example of sentences and syntactic parser are taken from the [Stanford Natural Language Inference Corpus](https://nlp.stanford.edu/projects/snli/)

In [29]:
import os
import re

In [24]:
data = pd.read_csv('/Users/flint/Data/snli/snli_1.0_train.csv')

In [26]:
sentences = list(data.sentence1.values)
parser = list(data.sentence1_parse.values)

In [27]:
sentences[0]

'A person on a horse jumps over a broken down airplane.'

In [28]:
parser[0]

'(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))'

In [38]:
def get_pos(parser_text):
    pattern = re.compile('(\((?:\(??[^\(]*?\)))')
    data = re.findall(pattern, parser_text)
    pos = []
    for item in data:
        i = item.replace('(', '').replace(')', '').split()[0]
        pos.append(i)
    return pos

In [43]:
pos_sequences = [get_pos(x) for x in parser]

### Text as a sequence of POS tags
See the [Tag list](https://gist.github.com/nlothian/9240750) for reference

In [52]:
pos_index = defaultdict(lambda: defaultdict(lambda: 0.00001))
pos_list = set()
for pos_seq in tqdm(pos_sequences):
    for i, pos in enumerate(pos_seq):
        pos_list.add(pos)
        pos_index[" ".join(pos_seq[:i])][pos] += 1

  0%|          | 0/550152 [00:00<?, ?it/s]

In [54]:
def prob_pos(seq):
    series = pd.Series(dict(
        [(pos_tag, pos_index[seq][pos_tag]) for pos_tag in pos_list]))
    return series / series.sum()

In [57]:
round(prob_pos('').sort_values(ascending=False).head(6), 2)

DT     0.67
CD     0.17
NNP    0.05
NNS    0.04
JJ     0.04
NN     0.01
dtype: float64

In [61]:
round(prob_pos('DT JJ').sort_values(ascending=False).head(6), 2)

NN     0.84
JJ     0.09
CC     0.03
NNS    0.02
,      0.01
NNP    0.00
dtype: float64

In [65]:
round(prob_pos('DT JJ NN IN DT JJ').sort_values(ascending=False).head(6), 2)

NN     0.87
JJ     0.06
CC     0.05
,      0.01
NNS    0.01
NNP    0.00
dtype: float64

### Text as a sequence of words

In [78]:
words_index = defaultdict(lambda: defaultdict(lambda: 0.00001))
words_list = set()
for sentence in tqdm(sentences):
    word_seq = sentence.lower().split()
    for i, word in enumerate(word_seq):
        words_list.add(word)
        words_index[" ".join(word_seq[:i])][word] += 1

  0%|          | 0/550152 [00:00<?, ?it/s]

In [80]:
def prob_word(seq):
    series = pd.Series(dict(
        [(word, words_index[seq][word]) for word in words_list]))
    return series / series.sum()

In [82]:
round(prob_word('').sort_values(ascending=False).head(6), 2)

a         0.59
two       0.11
the       0.04
an        0.03
three     0.03
people    0.02
dtype: float64

In [84]:
round(prob_word('a person in').sort_values(ascending=False).head(6), 2)

a        0.70
blue     0.03
black    0.03
an       0.03
red      0.02
the      0.02
dtype: float64

In [89]:
round(prob_word('a person in a blue').sort_values(ascending=False).head(6), 2)

shirt     0.37
jacket    0.20
suit      0.08
hat       0.07
kayak     0.03
outfit    0.03
dtype: float64