## Learning N-gram Models
Speech and Language Processing. Daniel Jurafsky & James H. Martin. Copyright ⃝c 2014. All rights reserved. Draft of September 1, 2014.

### Ch.4 N-Gram Modeling

In [583]:
#meta 8/9/2018
#reading the chapter

In [584]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer


### 0. Data source

In [585]:
#tiny minicorpus from the chapter - all text as 1 record
#$actodo handle separators later
minicorpus = ["<s> I am Sam </s> <s> Sam I am </s> <s> I do not like green eggs and ham </s>"]

#not minicorpus = ["<s> I am Sam </s>",  "<s> Sam I am </s>", "<s> I do not like green eggs and ham </s>"]

### Start with Unigrams

In [586]:
#get frequency counts
vectorizer = CountVectorizer(token_pattern=r'(?u)[\<[\/]*]?\b\w+\b[\>*]?', ngram_range=(1, 1))
#print(type(vectorizer))
print('Successful regex for this vocab - unigrams with index')

vectorizer.fit(minicorpus)
print(vectorizer.vocabulary_)

unigrams = list(vectorizer.vocabulary_)
unigrams


Successful regex for this vocab - unigrams with index
{'<s>': 1, 'i': 8, 'am': 2, 'sam': 11, '</s>': 0, 'do': 4, 'not': 10, 'like': 9, 'green': 6, 'eggs': 5, 'and': 3, 'ham': 7}


['<s>',
 'i',
 'am',
 'sam',
 '</s>',
 'do',
 'not',
 'like',
 'green',
 'eggs',
 'and',
 'ham']

### Continue with Bigrams
Building unigram x unigram matrix results in bigram probabilities.  So need to generate frequency counts for bigrams.

In [587]:
#need unigrams and bigrams 
vectorizer = CountVectorizer(token_pattern=r'(?u)[\<[\/]*]?\b\w+\b[\>*]?', ngram_range=(1, 2))
#dtm with 1 document
dtm = vectorizer.fit_transform(minicorpus) #class 'scipy.sparse.csr.csr_matrix'
print ('DTM type ', type(dtm))

vocab = vectorizer.get_feature_names() #class list
vocab_total=len(vocab)
print('Successful vocab - with bigrams: ', vocab_total)

vocab #class list

DTM type  <class 'scipy.sparse.csr.csr_matrix'>
Successful vocab - with bigrams:  28


['</s>',
 '</s> <s>',
 '<s>',
 '<s> i',
 '<s> sam',
 'am',
 'am </s>',
 'am sam',
 'and',
 'and ham',
 'do',
 'do not',
 'eggs',
 'eggs and',
 'green',
 'green eggs',
 'ham',
 'ham </s>',
 'i',
 'i am',
 'i do',
 'like',
 'like green',
 'not',
 'not like',
 'sam',
 'sam </s>',
 'sam i']

In [588]:
dtm

<1x28 sparse matrix of type '<class 'numpy.int64'>'
	with 28 stored elements in Compressed Sparse Row format>

In [589]:
#query dtm - only works with 1 row;
#if multiple rows, there's no instance of '</s> <s>'
print ('DTM type ', type(dtm))
ngram_value = '</s> <s>'
#ngram_value = 'am sam'
ngram_idx = list(vocab).index(ngram_value)
print ('Query dtm: how many times an n-gram occurs in the text')
dtm[0,ngram_idx]

DTM type  <class 'scipy.sparse.csr.csr_matrix'>
Query dtm: how many times an n-gram occurs in the text


2

#### From sparse matrix into NumPy array  
NumPy arrays supports a greater variety of operations than a list

In [590]:
#convert from current format, sparse matrix, into a normal numpy array 
print ('DTM type before: ', type(dtm))
dtm = dtm.toarray()
print ('DTM type after', type(dtm))
dtm

DTM type before:  <class 'scipy.sparse.csr.csr_matrix'>
DTM type after <class 'numpy.ndarray'>


array([[3, 2, 3, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 2, 1, 1,
        1, 1, 1, 2, 1, 1]], dtype=int64)

In [591]:
#convert python list storing vocab into numpy array
vocab = np.array(vocab)
vocab

array(['</s>', '</s> <s>', '<s>', '<s> i', '<s> sam', 'am', 'am </s>',
       'am sam', 'and', 'and ham', 'do', 'do not', 'eggs', 'eggs and',
       'green', 'green eggs', 'ham', 'ham </s>', 'i', 'i am', 'i do',
       'like', 'like green', 'not', 'not like', 'sam', 'sam </s>',
       'sam i'], dtype='<U10')

In [592]:
#query dtm
ngram_idx = list(vocab).index(ngram_value)
dtm[0,ngram_idx]

2

#### Using NumPy indexing is more natural

In [593]:
dtm[0,vocab == ngram_value]

array([2], dtype=int64)

#### Print frequency counts (aka dtm)

In [594]:
#print dtm frequency counts
df = pd.DataFrame(dtm,columns = vocab)
df

Unnamed: 0,</s>,</s> <s>,<s>,<s> i,<s> sam,am,am </s>,am sam,and,and ham,...,i,i am,i do,like,like green,not,not like,sam,sam </s>,sam i
0,3,2,3,2,1,2,1,1,1,1,...,3,2,1,1,1,1,1,2,1,1


### Calculate some bigram probabilities from this corpus
Didn't think I could at this point.  Thought I had to build unigram to bigram matrix first.  

Wrong.  I have everything at this point.  Probably not efficient, but sufficient.  Worry about efficiency after figure out stats.


In [595]:
#P(I|<s>) = .67  
#P(Sam|<s>) = .33 
#P(am|I) = .67 
#P(</s>|Sam) = 0.5 
#P(Sam|am)  = .5 
#P(do|I) = .33

In [596]:
#calculate some bigram probabilities - 
#P(I|<s>) = .67 

n_gram_of = '<s> i'
n_gram_given = '<s>'
p_bigram = dtm[0,list(vocab).index(n_gram_of)] / dtm[0,list(vocab).index(n_gram_given)]

p_bigram

0.6666666666666666

In [597]:
#P(Sam|<s>) = .33 

n_gram_of = '<s> sam'
n_gram_given = '<s>'
p_bigram = dtm[0,list(vocab).index(n_gram_of)] / dtm[0,list(vocab).index(n_gram_given)]

p_bigram


0.3333333333333333

In [598]:
#P(am|I) = .67 

n_gram_of = 'i am'
n_gram_given = 'i'
p_bigram = dtm[0,list(vocab).index(n_gram_of)] / dtm[0,list(vocab).index(n_gram_given)]

p_bigram

0.6666666666666666

In [599]:
#P(</s>|Sam) = .5 
n_gram_of = 'sam </s>'
n_gram_given = 'sam'
p_bigram = dtm[0,list(vocab).index(n_gram_of)] / dtm[0,list(vocab).index(n_gram_given)]

p_bigram


0.5

In [600]:
#P(Sam|am)  = .5 

n_gram_of = 'am sam'
n_gram_given = 'am'
p_bigram = dtm[0,list(vocab).index(n_gram_of)] / dtm[0,list(vocab).index(n_gram_given)]

p_bigram 

0.5

In [601]:
#P(do|I) = .33

n_gram_of = 'i do'
n_gram_given = 'i'
p_bigram = dtm[0,list(vocab).index(n_gram_of)] / dtm[0,list(vocab).index(n_gram_given)]

p_bigram 

0.3333333333333333

In [602]:
#P(not|do)  = 1 

n_gram_of = 'do not'
n_gram_given = 'do'
p_bigram = dtm[0,list(vocab).index(n_gram_of)] / dtm[0,list(vocab).index(n_gram_given)]

p_bigram 

1.0

In [603]:
#P(<s>|</s>)  = .67

n_gram_of = '</s> <s>'
n_gram_given = '</s>'
p_bigram = dtm[0,list(vocab).index(n_gram_of)] / dtm[0,list(vocab).index(n_gram_given)]

p_bigram 

0.6666666666666666

#### Build bigram probability matrix
manually

In [604]:
#print unigrams for reference
unigrams

['<s>',
 'i',
 'am',
 'sam',
 '</s>',
 'do',
 'not',
 'like',
 'green',
 'eggs',
 'and',
 'ham']

In [605]:
bigram_freq_mx = np.zeros(shape = (len(unigrams),len(unigrams)))
bigram_freq_mx

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [606]:
#populate bigram matrix - manually
#i rows, j columns

cnt_i = 0
#loop over rows
for i in unigrams:
    
    cnt_j = 0    
    #loop over columns
    for j in unigrams:
        bigram = i+' '+j
        print (bigram)
        
        try:
            prob = dtm[0,list(vocab).index(bigram)] / dtm[0,list(vocab).index(i)]
            print (prob)
            bigram_freq_mx[cnt_i,cnt_j]=round(prob, 2)
            
        except ValueError:
            print('none')
        finally:
            cnt_j+=1
            #print('----', j, cnt_j)
    
    cnt_i+=1
        

<s> <s>
none
<s> i
0.6666666666666666
<s> am
none
<s> sam
0.3333333333333333
<s> </s>
none
<s> do
none
<s> not
none
<s> like
none
<s> green
none
<s> eggs
none
<s> and
none
<s> ham
none
i <s>
none
i i
none
i am
0.6666666666666666
i sam
none
i </s>
none
i do
0.3333333333333333
i not
none
i like
none
i green
none
i eggs
none
i and
none
i ham
none
am <s>
none
am i
none
am am
none
am sam
0.5
am </s>
0.5
am do
none
am not
none
am like
none
am green
none
am eggs
none
am and
none
am ham
none
sam <s>
none
sam i
0.5
sam am
none
sam sam
none
sam </s>
0.5
sam do
none
sam not
none
sam like
none
sam green
none
sam eggs
none
sam and
none
sam ham
none
</s> <s>
0.6666666666666666
</s> i
none
</s> am
none
</s> sam
none
</s> </s>
none
</s> do
none
</s> not
none
</s> like
none
</s> green
none
</s> eggs
none
</s> and
none
</s> ham
none
do <s>
none
do i
none
do am
none
do sam
none
do </s>
none
do do
none
do not
1.0
do like
none
do green
none
do eggs
none
do and
none
do ham
none
not <s>
none
not i
none
not a

In [607]:
bigram_freq_mx

array([[0.  , 0.67, 0.  , 0.33, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  ],
       [0.  , 0.  , 0.67, 0.  , 0.  , 0.33, 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  ],
       [0.  , 0.  , 0.  , 0.5 , 0.5 , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  ],
       [0.  , 0.5 , 0.  , 0.  , 0.5 , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  ],
       [0.67, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 1.  , 0.  , 0.  , 0.  , 0.  ,
        0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 1.  , 0.  , 0.  , 0.  ,
        0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 1.  , 0.  , 0.  ,
        0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 1.  , 0.  ,
        0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 1.  ,
        0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        1.  ],
       [0.  , 0.  , 0

#### findings
At run time will only be interested in one row of the matrix, therefore no need to pre-calculate the entire matrix beforehand.  
Next step: compute one row of the matrix - still manually

In [608]:
#normalize frequency counts
dtm_normalized = dtm/dtm.sum()
print(dtm_normalized)

[[0.07692308 0.05128205 0.07692308 0.05128205 0.02564103 0.05128205
  0.02564103 0.02564103 0.02564103 0.02564103 0.02564103 0.02564103
  0.02564103 0.02564103 0.02564103 0.02564103 0.02564103 0.02564103
  0.07692308 0.05128205 0.02564103 0.02564103 0.02564103 0.02564103
  0.02564103 0.05128205 0.02564103 0.02564103]]


In [635]:
#compute prob of all bigrams 
start_token = 'am'
#Q: given 'am', what are all bigrams and their probabilites?
#P(Sam|am)  = .5 

#work with all vocab
print(vocab)
#--check if vocab contains a unigram token 'am' and if yes, where
start_token in vocab #True
#vocab.index(start_token)

#--check if vocab contains bigram token(s) starting with 'am' 
#for term in vocab:
#    print (term.startswith(start_token))

#--check if vocab contains tokens startubg with 'am' and if yes, where
#no loop, check if vocab contains bigram token(s) starting with 'am' 
#refer to https://docs.scipy.org/doc/numpy-1.13.0/reference/routines.char.html
#https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.core.defchararray.startswith.html#numpy.core.defchararray.startswith
np.core.defchararray.startswith(vocab, start_token) #returns boolean array

print('Found bigrams with ')
bigrams_idx = np.where(np.core.defchararray.startswith(vocab, start_token + ' '))
bigrams_idx[0].tolist()

['</s>' '</s> <s>' '<s>' '<s> i' '<s> sam' 'am' 'am </s>' 'am sam' 'and'
 'and ham' 'do' 'do not' 'eggs' 'eggs and' 'green' 'green eggs' 'ham'
 'ham </s>' 'i' 'i am' 'i do' 'like' 'like green' 'not' 'not like' 'sam'
 'sam </s>' 'sam i']
Found bigrams with 


[6, 7]

In [642]:
#calc prob of relevant tokens
#vocab[np.where(np.core.defchararray.startswith(vocab, start_token + ' '))]
np.where(np.core.defchararray.startswith(vocab, start_token + ' '),0,-1)
dtm[0, bigrams_idx[0].tolist()]
bigrams_prob = dtm[0, bigrams_idx[0].tolist()]/vocab_total

print("Bigrams and their probabilities: ")
print(vocab[bigrams_idx])
bigrams_prob.tolist()


Bigrams and their probabilities: 
['am </s>' 'am sam']


[0.03571428571428571, 0.03571428571428571]

### Continue with Bigrams

In [None]:
#need unigrams and bigrams 
vectorizer = CountVectorizer(token_pattern=r'(?u)[\<[\/]*]?\b\w+\b[\>*]?', ngram_range=(1, 2))
dtm = vectorizer.fit_transform(minicorpus) #class 'scipy.sparse.csr.csr_matrix'
print ('DTM type ', type(dtm))

vocab = vectorizer.get_feature_names() #class list
print('Successful vocab - bigrams')

vocab #class list

### Xtra

#### Word Counts with CountVectorizer
https://machinelearningmastery.com/prepare-text-data-machine-learning-scikit-learn/

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# list of text documents
text = ["The quick brown fox jumped over the lazy dog."]
# create the transform
vectorizer = CountVectorizer()
# tokenize and build vocab
vectorizer.fit(text)
# summarize
print(vectorizer.vocabulary_)
# encode document
vector = vectorizer.transform(text)
# summarize encoded vector
print(vector.shape)
print(type(vector))
print(vector.toarray())