In [1]:
import re
from itertools import izip

## Load the corpus and create train and test splits

In [2]:
# Read the whole corpus, dropping the first 1355 characters of the file
# (presumably the front matter before the novel's opening sentence -- confirm
# against pandp12.txt if the file is ever replaced).
with open('pandp12.txt', 'r') as f:
    text = f.read()[1355:]
# Floor division keeps the split index an integer even when true division is
# in effect (Python 3 or `from __future__ import division`); a float index
# would make the slices below raise TypeError.
split_idx = len(text) // 2
# First half is the training reference, second half is held out for testing.
train_Y = text[:split_idx]
test_Y = text[split_idx:]

In [3]:
# Model input: the same text with all casing removed. The *_Y strings keep the
# original casing and act as the reference the system tries to restore.
train_X, test_X = train_Y.lower(), test_Y.lower()

In [4]:
# Peek at the start of the lowercased input the system works from.
print train_X[:150]

it is a truth universally acknowledged, that a single man in
possession of a good fortune, must be in want of a wife.

however little known the fee


In [5]:
# Same span of the reference text, with the original casing to be recovered.
print train_Y[:150]

It is a truth universally acknowledged, that a single man in
possession of a good fortune, must be in want of a wife.

However little known the fee


## Measure progress using character-level accuracy (counted as the number of mismatched characters; lower is better)

In [6]:
def evaluate(x, y):
    """Count the positions at which two equal-length sequences differ.

    Args:
        x: system output (any sequence; strings are compared per character).
        y: reference sequence of the same length as ``x``.

    Returns:
        The number of indices ``i`` for which ``x[i] != y[i]``; 0 means the
        two sequences are identical.

    Raises:
        AssertionError: if ``x`` and ``y`` have different lengths.
    """
    assert len(x) == len(y)
    # Each mismatching pair contributes one error to the total.
    return sum(1 for predicted, expected in izip(x, y) if predicted != expected)

## Baseline = do nothing

In [7]:
# Baseline: characters where the untouched lowercase input differs from the reference.
evaluate(train_X, train_Y)

6749

In [8]:
# Same baseline expressed as a fraction of all training characters (error rate);
# float() forces true division under Python 2.
evaluate(train_X, train_Y) / float(len(train_X))

0.018852639900331578

## Capitalize start of sentences

In [9]:
# Upper-case every "period, single space, lowercase letter" sequence in the
# lowercased training text, i.e. capitalize the letter that follows ". ".
system = re.sub(r'(\. [a-z])', lambda match: match.group().upper(), train_X)

In [10]:
# Spot-check the first 100 characters of the system output.
system[:100]

'it is a truth universally acknowledged, that a single man in\r\npossession of a good fortune, must be '

In [11]:
# First 100 characters of the lowercased input, for side-by-side comparison.
train_X[:100]

'it is a truth universally acknowledged, that a single man in\r\npossession of a good fortune, must be '

In [12]:
# Errors remaining after capitalizing the letter that follows ". ".
evaluate(system, train_Y)

6104

## Capitalize start of new lines

In [13]:
# Step 1: capitalize the letter that follows ". ".
system = re.sub(r'(\. [a-z])', lambda match: match.group().upper(), train_X)
# Step 2: capitalize the first letter after one or more CRLF line breaks.
system = re.sub(r'((\r\n)+[a-z])', lambda match: match.group().upper(), system)

In [14]:
# Errors after additionally capitalizing the first letter after any run of line breaks.
evaluate(system, train_Y)

9666

In [15]:
# Inspect the first 100 characters of the output to see what the line-break rule changed.
system[:100]

'it is a truth universally acknowledged, that a single man in\r\nPossession of a good fortune, must be '

In [16]:
# Reference text for the same 100-character span.
train_Y[:100]

'It is a truth universally acknowledged, that a single man in\r\npossession of a good fortune, must be '

In [17]:
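# Capitalizing after every line break made things worse: the text is
# hard-wrapped, so most line breaks fall in the middle of a sentence.
# We actually need to find multiple consecutive line breaks (a blank line).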

In [18]:
# Step 1: capitalize the letter that follows ". ".
system = re.sub(r'(\. [a-z])', lambda match: match.group().upper(), train_X)
# Step 2: capitalize a letter only when at least two CRLF line breaks precede it.
system = re.sub(r'((\r\n)(\r\n)+[a-z])', lambda match: match.group().upper(), system)

In [19]:
# Errors after capitalizing only when two or more line breaks precede the letter.
evaluate(system, train_Y)

5693

## Find person names (and other proper nouns) with a gazetteer

In [20]:
from collections import Counter

In [21]:
# Frequency of whitespace-separated tokens in the cased training text, split by
# whether the token's first character is upper- or lowercase. Tokens keep any
# attached punctuation because only str.split() is applied.
upper_counts = Counter(token for token in train_Y.split() if token[0].isupper())
lower_counts = Counter(token for token in train_Y.split() if token[0].islower())

In [22]:
# Ten most frequent capitalized tokens as (token, count) pairs, highest first.
# Counter.most_common replaces the hand-written sort and avoids the
# tuple-unpacking lambda, which is Python 2-only syntax.
upper_counts.most_common(10)

[('I', 869),
 ('Mr.', 512),
 ('Elizabeth', 210),
 ('Miss', 184),
 ('Mrs.', 174),
 ('She', 144),
 ('The', 139),
 ('Darcy', 118),
 ('But', 106),
 ('Bingley', 106)]

In [23]:
# Gazetteer: capitalized tokens seen more than once in training whose lowercased
# form never occurs among the tokens that start with a lowercase letter.
gazeteer = [
    word
    for word, count in upper_counts.iteritems()
    if count > 1 and lower_counts[word.lower()] == 0
]

In [24]:
# Inspect the gazetteer; entries are raw whitespace-split tokens, so variants
# with trailing punctuation appear as separate entries.
gazeteer

['Darcy!',
 'Hunsford,',
 'Darcy,',
 'Darcy.',
 'Phillips,',
 'Darcy;',
 'Tuesday,',
 'Mrs.',
 "Elizabeth's",
 'Charlotte',
 'Wickham;',
 "Darcy's",
 'Wickham,',
 "James's",
 'Wickham.',
 'Jane',
 'Collins',
 "Lydia's",
 "Maria's",
 'Darcy,"',
 'Jane,"',
 'Hertfordshire;',
 'Collins.',
 'Jane;',
 'Hertfordshire,',
 'Hertfordshire.',
 'Collins,',
 'Lizzy',
 'Jane,',
 'Jane.',
 'Bingley,"',
 'St.',
 'Pemberley',
 'Eliza,',
 'Monday,',
 'Maria,',
 'Kitty,',
 'Mr.',
 'Christmas',
 'Lucas',
 'Collins,"',
 "Collins's",
 'England;',
 'Fitzwilliam',
 'Phillips',
 "Fitzwilliam's",
 "Bourgh's",
 'Caroline,',
 'Lodge,',
 'Eliza',
 'Saturday',
 'Caroline',
 'Elizabeth',
 'Eliza."',
 'William',
 'Bennet,"',
 'Jenkinson',
 'Lucases',
 'Charles,',
 "Caroline's",
 'George',
 'Fitzwilliam,',
 "Bennet's",
 'Jenkinson,',
 'Charlotte.',
 'Charlotte,',
 'Elizabeth.',
 'Elizabeth,',
 'Monday',
 'Elizabeth;',
 'Saturday.',
 'Saturday,',
 'Hurst',
 "Catherine's",
 'Gardiners',
 'Bourgh',
 'Bingley;',
 'Bingle

In [25]:
# Start again from the plain lowercased training text and restore the casing of
# every gazetteer entry that occurs with a single space on both sides.
system = train_X
for word in gazeteer:
    system = system.replace(' %s ' % word.lower(), ' %s ' % (word[0].upper() + word[1:]))

In [26]:
# Errors on the training text after applying only the gazetteer replacements.
evaluate(system, train_Y)

3715

## How well do we generalize?

In [27]:
# Baseline on the held-out half: lowercase input vs. its reference text.
evaluate(test_X, test_Y)

7343

In [28]:
# Apply the gazetteer built from the training half to the held-out test half,
# then count the characters that still differ from the test reference.
system = test_X
for word in gazeteer:
    system = system.replace(' %s ' % word.lower(), ' %s ' % (word[0].upper() + word[1:]))
evaluate(system, test_Y)

4999