In [1]:
from nltk.corpus import gutenberg
from nltk.model import *
from nltk.probability import LidstoneProbDist
import pprint
import sys

class NullDevice():
    def write(self, s):
        pass

use_chars = True

#sys.stderr = NullDevice()
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.02)

t = None
if use_chars:
    t = [ [ c for c in ' '.join(sent) ] for sent in gutenberg.sents('carroll-alice.txt') ]
else:
    t = [ x for x in gutenberg.sents('carroll-alice.txt') ]

def ngram_gen(n, t, len=100):
    print("\n%d-gram model\n---------------\n" % (n))
    m = NgramModel(n, t, True, True, estimator)
    #print m.entropy("if you cannot mean what you say then at least say what you mean".split())
    sep = '' if use_chars else ' '
    if n == 1:
        # there is a bug in nltk NgramModel when n=1. it just produces a sequence of . characters
        genstring = sep.join([ m[()].generate() for i in range(len) ])
    else:
        genstring = sep.join(m.generate(len))
    print(genstring)
    return genstring

genstring_list = [ngram_gen(n, t) for n in range(1,10)]

ModuleNotFoundError: No module named 'nltk.model'

In [19]:
# Code modified by Anoop Sarkar from original replacement for the NLTK NGramModel by Roger Levy
# http://idiom.ucsd.edu/~rlevy/teaching/2015winter/lign165/code/NgramModel.py
# http://idiom.ucsd.edu/~rlevy/teaching/2015winter/lign165/lectures/lecture11/lecture11_ngrams_in_Python.pdf

import nltk, random, collections, math
from math import log
from collections import Counter
from nltk.corpus import gutenberg

use_chars = False
beginToken = "<s>"
boundaryToken = "</s>"

# we can get replicable behavior by initializing the random number generator with random.seed()
random.seed(1)

train = None
if use_chars:
    train = [ [ c for c in ' '.join(sent) ] for sent in gutenberg.sents('carroll-alice.txt') ]
else:
    train = [ x for x in gutenberg.sents('carroll-alice.txt') ]

def ngrams(n,sentences,beginToken=beginToken,boundaryToken=boundaryToken,verbose=False):
    c = {}
    q = []
    for i in range(n-1):
        q.append(beginToken)
    for sentence in sentences:
        for w in sentence + [boundaryToken]:
            context_gram = stringify_context(q)
            #print "ngrams(w):", w
            #print "context_gram:", context_gram
            if verbose:
                print(q)
                print(context_gram)
                print(w)
            if not context_gram in c:
                c[context_gram] = Counter()
            c[context_gram][w] += 1
            q.pop(0)
            q.append(w)
    return(c)

def stringify_context(context):
    return(" ".join(context))
    
class NgramModel:
    def __init__(self, training_sentences, n=2, smoothing='none',verbose=False):
        self.n = n
        train = ngrams(n,training_sentences,verbose=verbose)
        self.probs = {}
        if smoothing=='none':
            for context_gram in list(train.keys()):
                N = sum(train[context_gram].values())
                self.probs[context_gram] = Counter({k:v/N for k,v in list(train[context_gram].items())})
                if verbose:
                    print([(context_gram,k,v) for k,v in list(train[context_gram].items())])

    def prob(self,word,context):
        """takes a word string and a context which is a list of word strings, and returns the probability of the word"""
        c = stringify_context(context)
        return(self.probs[c][word])

    def scoreSentence(self,sentence,addBeginToken=False,verbose=False):
        result = 0
        if len(sentence) < self.n:
            return float("NaN")
        context = []
        if addBeginToken:
            for i in range(self.n-1):
                context.append(beginToken)
        else:
            for i in range(self.n-1):
                w = sentence.pop(0)
                context.append(w)
        for w in sentence + [boundaryToken]:
            if verbose:
                print((context,w,self.prob(w,context)))            
            lp = log(self.prob(w,context))
            result = result + lp
            context.pop(0)
            context.append(w)
            #context = [w]
        return result

    def generateSentence(self,verbose=False,goryDetails=False):
        context = []
        for i in range(self.n-1):
            context.append(beginToken)
        result = []
        w = None
        while not w == boundaryToken:
            r = random.random() # returns a random float between 0 and 1
            x = 0
            c = self.probs[stringify_context(context)] # this will be a Counter
            for k,v in list(c.items()):
                x = x + v
                if goryDetails:
                    print((r,context,x,k,v))
                if x > r: # choose this word
                    w = k
                    result.append(w)
                    context.pop(0)
                    context.append(w)
                    #context = [w]
                    break
            if verbose:
                print(w)
        #result.pop() # drop the boundary token
        return result

class BigramModel:
    def __init__(self, training_sentences, smoothing='none'):
        train = ngrams(2,training_sentences)
        self.probs = {}
        if smoothing=='none':
            for context_gram in list(train.keys()):
                N = sum(train[context_gram].values())
                self.probs[context_gram] = Counter({k:v/N for k,v in list(train[context_gram].items())})

    def prob(self,word,context):
        """takes a word string and a context which is a list of word strings, and returns the probability of the word"""
        c = stringify_context(context)
        return(self.probs[c][word])

    def scoreSentence(self,sentence,verbose=False):
        context = [boundaryToken]
        result = 0
        for w in sentence + [boundaryToken]:
            lp = log(self.prob(w,context))
            result = result + lp
            if verbose:
                print((context,w,lp))
            context = [w]
        return result

    def generateSentence(self,verbose=False,goryDetails=False):
        context = [boundaryToken]
        result = []
        w = None
        while not w == boundaryToken:
            r = random.random() # returns a random float between 0 and 1
            x = 0
            c = self.probs[stringify_context(context)] # this will be a Counter
            for k,v in list(c.items()):
                x = x + v
                if goryDetails:
                    print((r,context,x,k,v))
                if x > r: # choose this word
                    w = k
                    result.append(w)
                    context = [w]
                    break
            if verbose:
                print(w)
        result.pop() # drop the boundary token
        return result

#ng=ngrams(3,train[3:4],boundaryToken="</s>",verbose=False)
#print ng
print(train[4:6])


[['So', 'she', 'was', 'considering', 'in', 'her', 'own', 'mind', '(', 'as', 'well', 'as', 'she', 'could', ',', 'for', 'the', 'hot', 'day', 'made', 'her', 'feel', 'very', 'sleepy', 'and', 'stupid', '),', 'whether', 'the', 'pleasure', 'of', 'making', 'a', 'daisy', '-', 'chain', 'would', 'be', 'worth', 'the', 'trouble', 'of', 'getting', 'up', 'and', 'picking', 'the', 'daisies', ',', 'when', 'suddenly', 'a', 'White', 'Rabbit', 'with', 'pink', 'eyes', 'ran', 'close', 'by', 'her', '.'], ['There', 'was', 'nothing', 'so', 'VERY', 'remarkable', 'in', 'that', ';', 'nor', 'did', 'Alice', 'think', 'it', 'so', 'VERY', 'much', 'out', 'of', 'the', 'way', 'to', 'hear', 'the', 'Rabbit', 'say', 'to', 'itself', ',', "'", 'Oh', 'dear', '!']]


In [22]:
m = NgramModel(train, n=3, verbose=False)
test = "That white rabbit".split()
m.scoreSentence(test, verbose=False)
#m.generateSentence(verbose=False, goryDetails=False)
#m = BigramModel(train)
#test_sentence = ['That','white','rabbit']
#m.scoreSentence(test_sentence,verbose=True)
#m.generateSentence(verbose=True)

KeyError: 'That white'

In [30]:
out = m.generateSentence(verbose=False, goryDetails=False)
print(" ".join(out))

[ Alice ' s a vegetable . </s>


In [3]:
from IPython.core.display import HTML


def css_styling():
    styles = open("../css/notebook.css", "r").read()
    return HTML(styles)
css_styling()