In [39]:
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range, input
import numpy as np
import os
import sys
sys.path.append(os.path.abspath('..'))

In [40]:
import nltk
# `brown` must be imported explicitly; without this the cell only works when
# the name happens to be left over in the kernel from an earlier session.
from nltk.corpus import brown

# fetch the Brown corpus (no-op when it is already present locally)
nltk.download('brown')
# preview the corpus tokens
brown.words()

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\chunx\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [41]:
from rnn_class.brown import get_sentences_with_word2idx_limit_vocab, get_sentences_with_word2idx

In [42]:
def get_bigram_probs(sentences, V, start_idx, end_idx, smoothing=1):
  """Estimate a smoothed bigram transition matrix from indexed sentences.

  Args:
    sentences: iterable of sentences, each a sequence of integer word
      indexes (without explicit START/END markers).
    V: vocabulary size; the result has shape (V, V).
    start_idx: index used as the "previous word" of each first word.
    end_idx: index used as the "next word" of each last word.
    smoothing: pseudo-count every cell of the matrix starts with.

  Returns:
    A (V, V) float array whose entry [prev, cur] is the estimated
    p(cur | prev); each row is normalized to sum to 1.
  """
  # every (previous, current) pair starts with `smoothing` pseudo-counts
  counts = np.full((V, V), smoothing, dtype=float)

  for words in sentences:
    # an empty sentence contributes no transitions at all
    if len(words) == 0:
      continue

    # walk the sentence while remembering the previous token, so that the
    # first word is counted as START -> word
    previous = start_idx
    for current in words:
      counts[previous, current] += 1
      previous = current

    # close the sentence: last word -> END
    counts[previous, end_idx] += 1

  # turn each row of counts into a conditional distribution
  row_totals = counts.sum(axis=1, keepdims=True)
  counts /= row_totals
  return counts

In [43]:
if __name__ == '__main__':
  # Demo driver: fit a bigram model, then repeatedly print the score of a
  # real sentence, a random "fake" sentence of the same length, and a
  # sentence typed by the user.

  # load data
  # sentences are already converted to sequences of word indexes
  # limit the vocab size if run out of memory
  sentences, word2idx = get_sentences_with_word2idx_limit_vocab(1000)
  # sentences, word2idx = get_sentences_with_word2idx()

  # vocab size
  V = len(word2idx)
  print("Vocab size:", V)

  # treat beginning and end of sentence as bigrams, START -> first word, last word -> END
  start_idx = word2idx['START']
  end_idx = word2idx['END']

  # a matrix where:
  # row = last word
  # col = current word
  # value at [row, col] = p(current word | last word)
  bigram_probs = get_bigram_probs(sentences, V, start_idx, end_idx, smoothing=0.1)


  def get_score(sentence):
    """Return the length-normalized log-probability of `sentence`.

    `sentence` is a sequence of word indexes without START/END markers.
    The transitions START -> first word and last word -> END are included,
    so len(sentence) + 1 log-probabilities are summed and the sum is
    divided by len(sentence) + 1. An empty sentence is scored as the
    single transition START -> END rather than raising IndexError.
    """
    score = 0
    previous = start_idx
    for current in sentence:
      score += np.log(bigram_probs[previous, current])
      previous = current
    # closing transition: last word (or START, if empty) -> END
    score += np.log(bigram_probs[previous, end_idx])

    # normalize the score by the number of transitions
    return score / (len(sentence) + 1)


  # a function to map word indexes back to real words
  idx2word = dict((v, k) for k, v in iteritems(word2idx))

  def get_words(sentence):
    """Join the words for a sequence of word indexes with single spaces."""
    return ' '.join(idx2word[i] for i in sentence)


  # when sampling a fake sentence, never draw the start or end token:
  # uniform distribution over all other indexes
  sample_probs = np.ones(V)
  sample_probs[start_idx] = 0
  sample_probs[end_idx] = 0
  sample_probs /= sample_probs.sum()

  # test model on real and fake sentences
  while True:
    # real sentence, picked uniformly at random
    real_idx = np.random.choice(len(sentences))
    real = sentences[real_idx]

    # fake sentence of the same length
    fake = np.random.choice(V, size=len(real), p=sample_probs)

    print("REAL:", get_words(real), "SCORE:", get_score(real))
    print("FAKE:", get_words(fake), "SCORE:", get_score(fake))

    # input your own sentence
    custom = input("Enter your sentence:\n")
    tokens = custom.lower().split()

    if not tokens:
      # blank input used to fall through to get_score([]) and crash
      print("Sorry, no words were entered")
    elif any(token not in word2idx for token in tokens):
      # every token must exist in word2idx, otherwise it cannot be scored
      print("Sorry, the words are not in the vocabulary")
    else:
      # convert sentence into list of indexes
      custom_idx = [word2idx[token] for token in tokens]
      print("SCORE:", get_score(custom_idx))

    # anything other than an explicit "n"/"no" (the default is Y) continues
    cont = input("Continue? [Y/n]")
    if cont.strip().lower() in ('n', 'no'):
      break

START inf
END inf
man inf
paris inf
britain inf
england inf
king inf
woman inf
rome inf
london inf
queen inf
italy inf
france inf
the 69971
, 58334
. 49346
of 36412
and 28853
to 26158
a 23195
in 21337
that 10594
is 10109
was 9815
he 9548
for 9489
`` 8837
'' 8789
it 8760
with 7289
as 7253
his 6996
on 6741
be 6377
; 5566
at 5372
by 5306
i 5164
this 5145
had 5133
? 4693
not 4610
are 4394
but 4381
from 4370
or 4206
have 3942
an 3740
they 3620
which 3561
-- 3432
one 3292
you 3286
were 3284
her 3036
all 3001
she 2860
there 2728
would 2714
their 2669
we 2652
him 2619
been 2472
) 2466
has 2437
( 2435
when 2331
who 2252
will 2245
more 2215
if 2198
no 2139
out 2097
so 1985
said 1961
what 1908
up 1890
its 1858
about 1815
: 1795
into 1791
than 1790
them 1788
can 1772
only 1748
other 1702
new 1635
some 1618
could 1601
time 1598
! 1596
these 1573
two 1412
may 1402
then 1380
do 1363
first 1361
any 1344
my 1318
now 1314
such 1303
like 1292
our 1252
over 1236
me 1181
even 1170
most 1159
made 1125
also 

steps 119
test 119
chief 119
reported 119
served 119
based 119
main 119
determined 119
image 119
decision 119
window 119
religion 119
aj 118
gun 118
responsibility 118
middle 118
europe 118
british 118
character 118
learned 117
horse 117
writing 117
appear 117
s. 117
account 117
ones 116
serious 116
activity 116
types 116
green 116
length 116
lived 115
audience 115
letters 115
returned 115
obtained 115
nuclear 115
specific 115
corner 115
forward 115
activities 115
slowly 115
doubt 114
6 114
justice 114
moving 114
latter 114
gives 114
straight 114
hit 114
plane 114
quality 114
design 114
obviously 114
operation 113
plans 113
shot 113
seven 113
a. 113
choice 113
poor 113
staff 113
function 113
figures 113
parts 113
stay 113
saying 113
include 113
15 113
born 113
pattern 113
30 112
cars 112
whatever 112
sun 112
faith 111
pool 111
hospital 110
corps 110
wish 110
lack 110
completely 110
heavy 110
waiting 110
speak 110
ball 110
standard 110
extent 110
visit 109
democratic 109
firm 109
income