In [8]:
from nltk.corpus import gutenberg
from nltk.model import *
from nltk.probability import LidstoneProbDist
import pprint
import sys

class NullDevice():
    def write(self, s):
        pass

use_chars = True

#sys.stderr = NullDevice()
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.02)

t = None
if use_chars:
    t = [ [ c for c in ' '.join(sent) ] for sent in gutenberg.sents('carroll-alice.txt') ]
else:
    t = [ x for x in gutenberg.sents('carroll-alice.txt') ]

def ngram_gen(n, t, len=100):
    print "\n%d-gram model\n---------------\n" % (n)
    m = NgramModel(n, t, True, True, estimator)
    #print m.entropy("if you cannot mean what you say then at least say what you mean".split())
    sep = '' if use_chars else ' '
    if n == 1:
        # there is a bug in nltk NgramModel when n=1. it just produces a sequence of . characters
        genstring = sep.join([ m[()].generate() for i in range(len) ])
    else:
        genstring = sep.join(m.generate(len))
    print genstring
    return genstring

genstring_list = map(lambda n: ngram_gen(n, t), range(1,10))



1-gram model
---------------

 apoA itb  ai g WcHTdi  cnngmoduwFt ,ieiwsei !epe i eyhn, l  ps''tAqif wearw i e ot   n Yl tda'e eri

2-gram model
---------------

'' we ' thtthish sherlozeve ars Fope Allid heatowoutorye me aleathe lad , and Mofraly , OCrg ll t f

3-gram model
---------------

Shesten .'' shent to hat was nockind wrep as eyethe ple MING Alithead upok hat whe Haremble .'

4-gram model
---------------

She Duch a VERY unease said it deciden a bird began older , wentily such they ' You were , poing to 

5-gram model
---------------

' crumbling it would , and as in a thing , ' Pray in he repeaten on sea !' exclamourn ; so make 

6-gram model
---------------

' Oh , YOU like the fifteenth , as there , you been .''These would this time whom she

7-gram model
---------------

And yet you if you ' re looked up eager with one flapper across the first witness , she heads off .

8-gram model
---------------

As for pulling me out among the people about her hand , in the middle , 

In [4]:
# Code modified by Anoop Sarkar from original replacement for the NLTK NGramModel by Roger Levy
# http://idiom.ucsd.edu/~rlevy/teaching/2015winter/lign165/code/NgramModel.py
# http://idiom.ucsd.edu/~rlevy/teaching/2015winter/lign165/lectures/lecture11/lecture11_ngrams_in_Python.pdf

import nltk, random, collections, math
from math import log
from collections import Counter
from nltk.corpus import gutenberg

use_chars = False
# we can get replicable behavior by initializing the random number generator with random.seed()
random.seed(1)

train = None
if use_chars:
    train = [ [ c for c in ' '.join(sent) ] for sent in gutenberg.sents('carroll-alice.txt') ]
else:
    train = [ x for x in gutenberg.sents('carroll-alice.txt') ]

beginToken = "<s>"
boundaryToken = "</s>"
def ngrams(n,sentences,boundaryToken=boundaryToken,verbose=False):
    c = {}
    q = []
    for i in range(n-1):
        q.append(beginToken)
    for sentence in sentences:
        for w in sentence + [boundaryToken]:
            context_gram = stringify_context(q)
            print "ngrams(w):", w
            print "context_gram:", context_gram
            if verbose:
                print(q)
                print(context_gram)
                print(w)
            if not context_gram in c:
                c[context_gram] = Counter()
            c[context_gram][w] += 1
            q.pop(0)
            q.append(w)
    return(c)


def stringify_context(context):
    return(" ".join(context))
    
class NgramModel:
    def __init__(self, training_sentences, n=2, smoothing='none'):
        self.n = n
        train = ngrams(n,training_sentences)
        self.probs = {}
        if smoothing=='none':
            for context_gram in train.keys():
                N = sum(train[context_gram].values())
                self.probs[context_gram] = Counter({k:v/N for k,v in train[context_gram].items()})

    def prob(self,word,context):
        """takes a word string and a context which is a list of word strings, and returns the probability of the word"""
        c = stringify_context(context)
        return(self.probs[c][word])

    def scoreSentence(self,sentence,verbose=False):
        context = []
        for i in range(self.n-1):
            context.append(beginToken)
        result = 0
        for w in sentence + [boundaryToken]:
            lp = log(self.prob(w,context))
            result = result + lp
            if verbose:
                print(context,w,lp)
            context = [w]
        return result

    def generateSentence(self,verbose=False,goryDetails=False):
        context = []
        for i in range(self.n-1):
            context.append(beginToken)
        result = []
        w = None
        while not w == boundaryToken:
            r = random.random() # returns a random float between 0 and 1
            x = 0
            c = self.probs[stringify_context(context)] # this will be a Counter
            for k,v in c.items():
                x = x + v
                if goryDetails:
                    print(r,context,x,k,v)
                if x > r: # choose this word
                    w = k
                    result.append(w)
                    context.pop(0)
                    context.append(w)
                    #context = [w]
                    break
            if verbose:
                print(w)
        #result.pop() # drop the boundary token
        return result

class BigramModel:
    def __init__(self, training_sentences, smoothing='none'):
        train = ngrams(2,training_sentences)
        self.probs = {}
        if smoothing=='none':
            for context_gram in train.keys():
                N = sum(train[context_gram].values())
                self.probs[context_gram] = Counter({k:v/N for k,v in train[context_gram].items()})

    def prob(self,word,context):
        """takes a word string and a context which is a list of word strings, and returns the probability of the word"""
        c = stringify_context(context)
        return(self.probs[c][word])

    def scoreSentence(self,sentence,verbose=False):
        context = [boundaryToken]
        result = 0
        for w in sentence + [boundaryToken]:
            lp = log(self.prob(w,context))
            result = result + lp
            if verbose:
                print(context,w,lp)
            context = [w]
        return result

    def generateSentence(self,verbose=False,goryDetails=False):
        context = [boundaryToken]
        result = []
        w = None
        while not w == boundaryToken:
            r = random.random() # returns a random float between 0 and 1
            x = 0
            c = self.probs[stringify_context(context)] # this will be a Counter
            for k,v in c.items():
                x = x + v
                if goryDetails:
                    print(r,context,x,k,v)
                if x > r: # choose this word
                    w = k
                    result.append(w)
                    context = [w]
                    break
            if verbose:
                print(w)
        result.pop() # drop the boundary token
        return result

#ng=ngrams(3,train[3:4],boundaryToken="</s>",verbose=False)
#print ng
m = NgramModel(train)
m.generateSentence(verbose=False, goryDetails=True)
#m = BigramModel(train)
#test_sentence = ['the','white','rabbit']
#m.scoreSentence(test_sentence,verbose=True)
#m.generateSentence(verbose=True)

 ngrams(w): [
context_gram: <s>
ngrams(w): Alice
context_gram: [
ngrams(w): '
context_gram: Alice
ngrams(w): s
context_gram: '
ngrams(w): Adventures
context_gram: s
ngrams(w): in
context_gram: Adventures
ngrams(w): Wonderland
context_gram: in
ngrams(w): by
context_gram: Wonderland
ngrams(w): Lewis
context_gram: by
ngrams(w): Carroll
context_gram: Lewis
ngrams(w): 1865
context_gram: Carroll
ngrams(w): ]
context_gram: 1865
ngrams(w): </s>
context_gram: ]
ngrams(w): CHAPTER
context_gram: </s>
ngrams(w): I
context_gram: CHAPTER
ngrams(w): .
context_gram: I
ngrams(w): </s>
context_gram: .
ngrams(w): Down
context_gram: </s>
ngrams(w): the
context_gram: Down
ngrams(w): Rabbit
context_gram: the
ngrams(w): -
context_gram: Rabbit
ngrams(w): Hole
context_gram: -
ngrams(w): </s>
context_gram: Hole
ngrams(w): Alice
context_gram: </s>
ngrams(w): was
context_gram: Alice
ngrams(w): beginning
context_gram: was
ngrams(w): to
context_gram: beginning
ngrams(w): get
context_gram: to
ngrams(w): very
context

ValueError: I/O operation on closed file

In [None]:
from IPython.core.display import HTML


def css_styling():
    styles = open("../css/notebook.css", "r").read()
    return HTML(styles)
css_styling()