In [1]:
from string import punctuation
from math import sqrt
from collections import Counter

### Load the text

In [2]:
# Read the whole book into memory. The `with` block closes the file handle as
# soon as the read finishes instead of leaving it open for the kernel's lifetime.
# 'rU' asks for universal-newline translation (a Python 2 mode flag).
with open('data/pandp12.txt', 'rU') as book_file:
    text = book_file.read()

In [3]:
# preview the first 250 characters of the raw file contents
text[:250]

'The Project Gutenberg EBook of Pride and Prejudice, by Jane Austen\n(#8 in our series by Jane Austen)\n\nCopyright laws are changing all over the world. Be sure to check the\ncopyright laws for your country before downloading or redistributing\nthis or an'

### Clean it up a bit

In [4]:
# Normalise the text in three passes:
#   1. turn the "--" em-dash into a space so the words on either side separate
#   2. put a space before possessive "'s" to match Penn Treebank tokenization
#   3. lowercase everything
text = text.replace('--', ' ')
text = text.replace("'s", " 's")
text = text.lower()

In [5]:
# the same 250-character preview, to inspect the text at this point
text[:250]

'the project gutenberg ebook of pride and prejudice, by jane austen\n(#8 in our series by jane austen)\n\ncopyright laws are changing all over the world. be sure to check the\ncopyright laws for your country before downloading or redistributing\nthis or an'

In [21]:
# show the characters in string.punctuation (passed to str.strip() when tokenizing)
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [20]:
# Sanity-check the tokenizer on the first 250 characters: split on whitespace
# and trim punctuation from both ends of each token.
for w in text[:250].split():
    # the parenthesised single-argument form prints the same on Python 2 and 3
    print(w.strip(punctuation))

the
project
gutenberg
ebook
of
pride
and
prejudice
by
jane
austen
8
in
our
series
by
jane
austen
copyright
laws
are
changing
all
over
the
world
be
sure
to
check
the
copyright
laws
for
your
country
before
downloading
or
redistributing
this
or
an


### Tokenize the text into words

In [22]:
# Split on whitespace and strip leading/trailing punctuation from each token.
# Tokens made up only of punctuation strip down to '' and are dropped here,
# so they are not counted as a word or paired into bigrams later.
words = [w for w in (tok.strip(punctuation) for tok in text.split()) if w]

In [7]:
# first ten tokens
words[:10]

['the',
 'project',
 'gutenberg',
 'ebook',
 'of',
 'pride',
 'and',
 'prejudice',
 'by',
 'jane']

In [8]:
# count how many times each token occurs
word_counts = Counter(words)

In [9]:
# ten most frequent words, highest count first
# (Counter.most_common gives the same ranking as sorting the items by count
# in descending order, and runs on both Python 2 and 3)
word_counts.most_common(10)

[('the', 4421),
 ('to', 4208),
 ('of', 3669),
 ('and', 3627),
 ('her', 2227),
 ('i', 2068),
 ('a', 1980),
 ('in', 1891),
 ('was', 1847),
 ('she', 1710)]

In [23]:
# Preview only: the first 50 adjacent (word, next word) pairs.
# Slicing the inputs before zipping avoids pairing up the entire book just to
# keep 50 items, and list(...) keeps the result a real list on Python 3 too.
bigrams = list(zip(words[:50], words[1:51]))

In [25]:
# first 250 characters of the text, for reference
text[:250]

'the project gutenberg ebook of pride and prejudice, by jane austen\n(#8 in our series by jane austen)\n\ncopyright laws are changing all over the world. be sure to check the\ncopyright laws for your country before downloading or redistributing\nthis or an'

In [24]:
# show the (word, next word) pairs currently bound to `bigrams`
bigrams

[('the', 'project'),
 ('project', 'gutenberg'),
 ('gutenberg', 'ebook'),
 ('ebook', 'of'),
 ('of', 'pride'),
 ('pride', 'and'),
 ('and', 'prejudice'),
 ('prejudice', 'by'),
 ('by', 'jane'),
 ('jane', 'austen'),
 ('austen', '8'),
 ('8', 'in'),
 ('in', 'our'),
 ('our', 'series'),
 ('series', 'by'),
 ('by', 'jane'),
 ('jane', 'austen'),
 ('austen', 'copyright'),
 ('copyright', 'laws'),
 ('laws', 'are'),
 ('are', 'changing'),
 ('changing', 'all'),
 ('all', 'over'),
 ('over', 'the'),
 ('the', 'world'),
 ('world', 'be'),
 ('be', 'sure'),
 ('sure', 'to'),
 ('to', 'check'),
 ('check', 'the'),
 ('the', 'copyright'),
 ('copyright', 'laws'),
 ('laws', 'for'),
 ('for', 'your'),
 ('your', 'country'),
 ('country', 'before'),
 ('before', 'downloading'),
 ('downloading', 'or'),
 ('or', 'redistributing'),
 ('redistributing', 'this'),
 ('this', 'or'),
 ('or', 'any'),
 ('any', 'other'),
 ('other', 'project'),
 ('project', 'gutenberg'),
 ('gutenberg', 'ebook'),
 ('ebook', 'this'),
 ('this', 'header'),
 ('

In [10]:

# Count every adjacent (word, next word) pair in the book.
# Built straight from `words` rather than from `bigrams`, because `bigrams`
# holds only a 50-pair preview when the notebook is run top to bottom.
bigram_counts = Counter(zip(words, words[1:]))

In [26]:
# ten most frequent bigrams, highest count first
bigram_counts.most_common(10)

[(('of', 'the'), 481),
 (('to', 'be'), 443),
 (('in', 'the'), 387),
 (('i', 'am'), 303),
 (('mr', 'darcy'), 273),
 (('of', 'her'), 263),
 (('to', 'the'), 260),
 (('it', 'was'), 251),
 (('of', 'his'), 235),
 (('she', 'was'), 212)]

In [None]:
# TODO: top 100 most frequent words

In [12]:
def t_test(c1, c2, c12, N):
    """Score a bigram with a t-statistic computed from raw counts.

    c1, c2 -- occurrence counts of the first and second word
    c12    -- occurrence count of the two words as an adjacent pair
    N      -- sample size used to turn the counts into relative frequencies

    Returns (p12 - p1 * p2) / sqrt(p12 / N), where each p is the matching
    count divided by N. Raises ZeroDivisionError when c12 or N is 0.
    """
    total = float(N)  # force true division even when every count is an int
    observed = c12 / total
    expected = (c1 / total) * (c2 / total)
    return (observed - expected) / sqrt(observed / total)

In [14]:
# Recount from scratch so this cell does not depend on earlier counters.
nwords = len(words)
unigrams = Counter(words)
bigrams = Counter(zip(words, words[1:]))

# Overwrite each bigram's raw count with its t-score. Iterating over a snapshot
# of the items hands us the count directly; only existing keys are reassigned,
# so the set of keys never changes.
for pair, pair_count in list(bigrams.items()):
    first, second = pair
    bigrams[pair] = t_test(unigrams[first], unigrams[second], pair_count, nwords)

In [15]:
# twenty bigrams with the highest t-score
# (`bigrams` is a Counter whose values are now t-scores, so most_common
# ranks by score)
bigrams.most_common(20)

[(('to', 'be'), 19.0264392287896),
 (('i', 'am'), 17.10521371395349),
 (('mr', 'darcy'), 16.363425622278),
 (('in', 'the'), 16.268102644685825),
 (('of', 'the'), 16.007155720962686),
 (('it', 'was'), 14.390799075237354),
 (('had', 'been'), 13.797458735403026),
 (('she', 'had'), 13.1917759293514),
 (('it', 'is'), 13.101618826588656),
 (('of', 'his'), 12.892911957502116),
 (('she', 'was'), 12.822598081459958),
 (('i', 'have'), 12.683147729522101),
 (('could', 'not'), 12.45202803287958),
 (('mrs', 'bennet'), 12.297568664899833),
 (('of', 'her'), 12.181283546068963),
 (('mr', 'collins'), 12.154912957988389),
 (('that', 'he'), 12.0257680592388),
 (('he', 'had'), 11.947387253999926),
 (('such', 'a'), 11.522630179552497),
 (('have', 'been'), 11.315621429018746)]

In [None]:
# fifty most frequent bigrams by raw count, for comparison with the t-score ranking
bigram_counts.most_common(50)