In [8]:
from nltk.corpus import gutenberg
from nltk.model import *
from nltk.probability import LidstoneProbDist
import pprint
import sys

class NullDevice():
    def write(self, s):
        pass

use_chars = True

#sys.stderr = NullDevice()
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.02)

t = None
if use_chars:
    t = [ [ c for c in ' '.join(sent) ] for sent in gutenberg.sents('carroll-alice.txt') ]
else:
    t = [ x for x in gutenberg.sents('carroll-alice.txt') ]

def ngram_gen(n, t, len=100):
    print "\n%d-gram model\n---------------\n" % (n)
    m = NgramModel(n, t, True, True, estimator)
    #print m.entropy("if you cannot mean what you say then at least say what you mean".split())
    sep = '' if use_chars else ' '
    if n == 1:
        # there is a bug in nltk NgramModel when n=1. it just produces a sequence of . characters
        genstring = sep.join([ m[()].generate() for i in range(len) ])
    else:
        genstring = sep.join(m.generate(len))
    print genstring
    return genstring

genstring_list = map(lambda n: ngram_gen(n, t), range(1,10))



1-gram model
---------------

 apoA itb  ai g WcHTdi  cnngmoduwFt ,ieiwsei !epe i eyhn, l  ps''tAqif wearw i e ot   n Yl tda'e eri

2-gram model
---------------

'' we ' thtthish sherlozeve ars Fope Allid heatowoutorye me aleathe lad , and Mofraly , OCrg ll t f

3-gram model
---------------

Shesten .'' shent to hat was nockind wrep as eyethe ple MING Alithead upok hat whe Haremble .'

4-gram model
---------------

She Duch a VERY unease said it deciden a bird began older , wentily such they ' You were , poing to 

5-gram model
---------------

' crumbling it would , and as in a thing , ' Pray in he repeaten on sea !' exclamourn ; so make 

6-gram model
---------------

' Oh , YOU like the fifteenth , as there , you been .''These would this time whom she

7-gram model
---------------

And yet you if you ' re looked up eager with one flapper across the first witness , she heads off .

8-gram model
---------------

As for pulling me out among the people about her hand , in the middle , 

In [24]:
# Code modified by Anoop Sarkar from original replacement for the NLTK NGramModel by Roger Levy
# http://idiom.ucsd.edu/~rlevy/teaching/2015winter/lign165/code/NgramModel.py
# http://idiom.ucsd.edu/~rlevy/teaching/2015winter/lign165/lectures/lecture11/lecture11_ngrams_in_Python.pdf

import nltk, random, collections, math
from math import log
from collections import Counter
from nltk.corpus import gutenberg

use_chars = False
beginToken = "<s>"
boundaryToken = "</s>"

# we can get replicable behavior by initializing the random number generator with random.seed()
random.seed(1)

train = None
if use_chars:
    train = [ [ c for c in ' '.join(sent) ] for sent in gutenberg.sents('carroll-alice.txt') ]
else:
    train = [ x for x in gutenberg.sents('carroll-alice.txt') ]

def ngrams(n,sentences,beginToken=beginToken,boundaryToken=boundaryToken,verbose=False):
    c = {}
    q = []
    for i in range(n-1):
        q.append(beginToken)
    for sentence in sentences:
        for w in sentence + [boundaryToken]:
            context_gram = stringify_context(q)
            #print "ngrams(w):", w
            #print "context_gram:", context_gram
            if verbose:
                print(q)
                print(context_gram)
                print(w)
            if not context_gram in c:
                c[context_gram] = Counter()
            c[context_gram][w] += 1
            q.pop(0)
            q.append(w)
    return(c)

def stringify_context(context):
    return(" ".join(context))
    
class NgramModel:
    def __init__(self, training_sentences, n=2, smoothing='none',verbose=False):
        self.n = n
        train = ngrams(n,training_sentences,verbose=verbose)
        self.probs = {}
        if smoothing=='none':
            for context_gram in train.keys():
                N = sum(train[context_gram].values())
                self.probs[context_gram] = Counter({k:v/N for k,v in train[context_gram].items()})
                print [(context_gram,k,v) for k,v in train[context_gram].items()]

    def prob(self,word,context):
        """takes a word string and a context which is a list of word strings, and returns the probability of the word"""
        c = stringify_context(context)
        return(self.probs[c][word])

    def scoreSentence(self,sentence,addBeginToken=False,verbose=False):
        result = 0
        if len(sentence) < self.n:
            return float("NaN")
        context = []
        if addBeginToken:
            for i in range(self.n-1):
                context.append(beginToken)
        else:
            for i in range(self.n-1):
                w = sentence.pop(0)
                context.append(w)
        for w in sentence + [boundaryToken]:
            if verbose:
                print(context,w,self.prob(w,context))            
            lp = log(self.prob(w,context))
            result = result + lp
            context.pop(0)
            context.append(w)
            #context = [w]
        return result

    def generateSentence(self,verbose=False,goryDetails=False):
        context = []
        for i in range(self.n-1):
            context.append(beginToken)
        result = []
        w = None
        while not w == boundaryToken:
            r = random.random() # returns a random float between 0 and 1
            x = 0
            c = self.probs[stringify_context(context)] # this will be a Counter
            for k,v in c.items():
                x = x + v
                if goryDetails:
                    print(r,context,x,k,v)
                if x > r: # choose this word
                    w = k
                    result.append(w)
                    context.pop(0)
                    context.append(w)
                    #context = [w]
                    break
            if verbose:
                print(w)
        #result.pop() # drop the boundary token
        return result

class BigramModel:
    def __init__(self, training_sentences, smoothing='none'):
        train = ngrams(2,training_sentences)
        self.probs = {}
        if smoothing=='none':
            for context_gram in train.keys():
                N = sum(train[context_gram].values())
                self.probs[context_gram] = Counter({k:v/N for k,v in train[context_gram].items()})

    def prob(self,word,context):
        """takes a word string and a context which is a list of word strings, and returns the probability of the word"""
        c = stringify_context(context)
        return(self.probs[c][word])

    def scoreSentence(self,sentence,verbose=False):
        context = [boundaryToken]
        result = 0
        for w in sentence + [boundaryToken]:
            lp = log(self.prob(w,context))
            result = result + lp
            if verbose:
                print(context,w,lp)
            context = [w]
        return result

    def generateSentence(self,verbose=False,goryDetails=False):
        context = [boundaryToken]
        result = []
        w = None
        while not w == boundaryToken:
            r = random.random() # returns a random float between 0 and 1
            x = 0
            c = self.probs[stringify_context(context)] # this will be a Counter
            for k,v in c.items():
                x = x + v
                if goryDetails:
                    print(r,context,x,k,v)
                if x > r: # choose this word
                    w = k
                    result.append(w)
                    context = [w]
                    break
            if verbose:
                print(w)
        result.pop() # drop the boundary token
        return result

#ng=ngrams(3,train[3:4],boundaryToken="</s>",verbose=False)
#print ng
print train[4:6]
m = NgramModel(train, n=3, verbose=False)
test_sentence = "and making faces".split()
m.scoreSentence(test_sentence, verbose=True)
#m.generateSentence(verbose=False, goryDetails=False)
#m = BigramModel(train)
#test_sentence = ['That','white','rabbit']
#m.scoreSentence(test_sentence,verbose=True)
#m.generateSentence(verbose=True)

[[u'So', u'she', u'was', u'considering', u'in', u'her', u'own', u'mind', u'(', u'as', u'well', u'as', u'she', u'could', u',', u'for', u'the', u'hot', u'day', u'made', u'her', u'feel', u'very', u'sleepy', u'and', u'stupid', u'),', u'whether', u'the', u'pleasure', u'of', u'making', u'a', u'daisy', u'-', u'chain', u'would', u'be', u'worth', u'the', u'trouble', u'of', u'getting', u'up', u'and', u'picking', u'the', u'daisies', u',', u'when', u'suddenly', u'a', u'White', u'Rabbit', u'with', u'pink', u'eyes', u'ran', u'close', u'by', u'her', u'.'], [u'There', u'was', u'nothing', u'so', u'VERY', u'remarkable', u'in', u'that', u';', u'nor', u'did', u'Alice', u'think', u'it', u'so', u'VERY', u'much', u'out', u'of', u'the', u'way', u'to', u'hear', u'the', u'Rabbit', u'say', u'to', u'itself', u',', u"'", u'Oh', u'dear', u'!']]
[(u'her escape', u';', 1)]
[(u'jury ,"', u'Said', 1)]
[(u'Alice and', u'all', 1)]
[(u'table set', u'out', 1)]
[(u'</s> Who', u'Stole', 1), (u'</s> Who', u'ever', 1), (u'</s>

ValueError: math domain error

In [None]:
from IPython.core.display import HTML


def css_styling():
    styles = open("../css/notebook.css", "r").read()
    return HTML(styles)
css_styling()