# Example of Markov chain to model sequences of words

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
import nltk
from collections import defaultdict

## Work with sentences
Example of sentences are taken from the [Stanford Natural Language Inference Corpus](https://nlp.stanford.edu/projects/snli/)

In [3]:
import os
import re

In [4]:
data = pd.read_csv('/Users/flint/Data/snli/snli_1.0_train.csv')

In [6]:
data.head(2)

Unnamed: 0,gold_label,sentence1_binary_parse,sentence2_binary_parse,sentence1_parse,sentence2_parse,sentence1,sentence2,captionID,pairID,label1,label2,label3,label4,label5
0,neutral,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,( ( A person ) ( ( is ( ( training ( his horse...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,3416050480.jpg#4,3416050480.jpg#4r1n,neutral,,,,
1,contradiction,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,( ( A person ) ( ( ( ( is ( at ( a diner ) ) )...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",3416050480.jpg#4,3416050480.jpg#4r1c,contradiction,,,,


In [7]:
sentences = list(data.sentence1.values) + list(data.sentence2.values)

In [8]:
len(sentences)

1100304

### Text as a sequence of words
No markov assumption: $p(w_n \mid w_1, w_2, \dots, w_{n-1})$

In [10]:
words_index = defaultdict(lambda: defaultdict(lambda: 0.00001))
words_list = set()
for sentence in tqdm(sentences):
    if not pd.isnull(sentence):
        word_seq = sentence.lower().split()
        for i, word in enumerate(word_seq):
            words_list.add(word)
            words_index[" ".join(word_seq[:i])][word] += 1

  0%|          | 0/1100304 [00:00<?, ?it/s]

In [14]:
len(words_index)

2482276

In [15]:
def prob_word(seq):
    series = pd.Series(dict(
        [(word, words_index[seq][word]) for word in words_list]))
    return series / series.sum()

In [16]:
round(prob_word('').sort_values(ascending=False).head(6), 2)

a         0.50
the       0.16
two       0.09
there     0.03
people    0.03
an        0.02
dtype: float64

In [17]:
round(prob_word('a person in').sort_values(ascending=False).head(6), 2)

a        0.68
blue     0.03
black    0.03
the      0.03
an       0.03
red      0.02
dtype: float64

In [18]:
round(prob_word('a person in a blue').sort_values(ascending=False).head(6), 2)

shirt     0.37
jacket    0.18
suit      0.08
hat       0.07
outfit    0.03
kayak     0.03
dtype: float64

### Markov chain (n=3)
We compute $p(w_n \mid w_{n-2}, w_{n-1})$

In [23]:
markov_index = defaultdict(lambda: defaultdict(lambda: 0.00001))
markov_list = set()
for sentence in tqdm(sentences):
    if not pd.isnull(sentence):
        word_seq = sentence.lower().split()
        for i, word in enumerate(word_seq):
            markov_list.add(word)
            markov_index[" ".join(word_seq[i-2:i])][word] += 1

  0%|          | 0/1100304 [00:00<?, ?it/s]

In [24]:
len(markov_index)

420667

In [37]:
def markov_word(seq):
    components = seq.split()
    sub = " ".join(components[-2:])
    series = pd.Series(dict(
        [(word, markov_index[sub][word]) for word in markov_list]))
    return series / series.sum()

In [38]:
print(round(markov_word('').sort_values(ascending=False).head(6), 2), '\n')
print(round(markov_word('a person in').sort_values(ascending=False).head(6), 2), '\n')
print(round(markov_word('a person in a blue').sort_values(ascending=False).head(6), 2), '\n')

a         0.25
man       0.09
the       0.08
two       0.05
woman     0.04
people    0.03
dtype: float64 

a        0.56
the      0.11
blue     0.04
black    0.03
red      0.02
an       0.02
dtype: float64 

shirt     0.24
and       0.06
jacket    0.05
hat       0.03
dress     0.02
shirt,    0.02
dtype: float64 



## Generate a sentence

In [61]:
def generate(n, prefix='', generator=prob_word):
    sent = prefix
    for i in range(n):
        p = generator(sent)
        candidates, dist = list(p.keys()), p.values
        next_word = np.random.choice(candidates, p=dist)
        sent += " {}".format(next_word)
    return sent

In [62]:
print(generate(n=10, generator=prob_word))
print(generate(n=10, generator=markov_word))

 a big, clift. holloween. fifteen flashing sews handshakes muggers noone
 some dried antarctica. strange-shaped photobooth spirt boarding containg severely scarf.


## Estimate a sentence probability

In [65]:
test = 'this is a sentence that is generated'
tester = markov_word

sequence = test.split()
prob = 0
for i, w in enumerate(sequence):
    x = tester(sentence[:i])[w]
    prob += np.log(x)
prob

-71.96128542424196

In [67]:
test = 'a non the generated very next best next next'
tester = markov_word

sequence = test.split()
prob = 0
for i, w in enumerate(sequence):
    x = tester(sentence[:i])[w]
    prob += np.log(x)
prob

-88.88840705953086