In [1]:
import nltk

In [2]:
from __future__ import division

In [3]:
import re, pprint


### Simple Approcahes to Tokenization

In [4]:
raw = """'When I'M a Duchess,' she said to herself, (not in a very hopeful tone
... though), 'I won't have any pepper in my kitchen AT ALL. Soup does very
... well without--Maybe it's always pepper that makes people hot-tempered,'..."""


In [5]:
re.split(r' ',raw)

["'When",
 "I'M",
 'a',
 "Duchess,'",
 'she',
 'said',
 'to',
 'herself,',
 '(not',
 'in',
 'a',
 'very',
 'hopeful',
 'tone\nthough),',
 "'I",
 "won't",
 'have',
 'any',
 'pepper',
 'in',
 'my',
 'kitchen',
 'AT',
 'ALL.',
 'Soup',
 'does',
 'very\nwell',
 'without--Maybe',
 "it's",
 'always',
 'pepper',
 'that',
 'makes',
 'people',
 "hot-tempered,'..."]

In [6]:
re.split(r'[ \t\n]+',raw)

["'When",
 "I'M",
 'a',
 "Duchess,'",
 'she',
 'said',
 'to',
 'herself,',
 '(not',
 'in',
 'a',
 'very',
 'hopeful',
 'tone',
 'though),',
 "'I",
 "won't",
 'have',
 'any',
 'pepper',
 'in',
 'my',
 'kitchen',
 'AT',
 'ALL.',
 'Soup',
 'does',
 'very',
 'well',
 'without--Maybe',
 "it's",
 'always',
 'pepper',
 'that',
 'makes',
 'people',
 "hot-tempered,'..."]

In [8]:
 re.split(r'\W+', raw)

['',
 'When',
 'I',
 'M',
 'a',
 'Duchess',
 'she',
 'said',
 'to',
 'herself',
 'not',
 'in',
 'a',
 'very',
 'hopeful',
 'tone',
 'though',
 'I',
 'won',
 't',
 'have',
 'any',
 'pepper',
 'in',
 'my',
 'kitchen',
 'AT',
 'ALL',
 'Soup',
 'does',
 'very',
 'well',
 'without',
 'Maybe',
 'it',
 's',
 'always',
 'pepper',
 'that',
 'makes',
 'people',
 'hot',
 'tempered',
 '']

In [9]:
re.findall(r'\w+|\S\w*', raw)


["'When",
 'I',
 "'M",
 'a',
 'Duchess',
 ',',
 "'",
 'she',
 'said',
 'to',
 'herself',
 ',',
 '(not',
 'in',
 'a',
 'very',
 'hopeful',
 'tone',
 'though',
 ')',
 ',',
 "'I",
 'won',
 "'t",
 'have',
 'any',
 'pepper',
 'in',
 'my',
 'kitchen',
 'AT',
 'ALL',
 '.',
 'Soup',
 'does',
 'very',
 'well',
 'without',
 '-',
 '-Maybe',
 'it',
 "'s",
 'always',
 'pepper',
 'that',
 'makes',
 'people',
 'hot',
 '-tempered',
 ',',
 "'",
 '.',
 '.',
 '.']

In [10]:
 print (re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*", raw))


["'", 'When', "I'M", 'a', 'Duchess', ',', "'", 'she', 'said', 'to', 'herself', ',', '(', 'not', 'in', 'a', 'very', 'hopeful', 'tone', 'though', ')', ',', "'", 'I', "won't", 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL', '.', 'Soup', 'does', 'very', 'well', 'without', '--', 'Maybe', "it's", 'always', 'pepper', 'that', 'makes', 'people', 'hot-tempered', ',', "'", '...']


### Sentence Segmentation

In [16]:
len(nltk.corpus.brown.words()) / len(nltk.corpus.brown.sents())

20.250994070456922

In [17]:
sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')

In [18]:
text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')


In [20]:
sents = sent_tokenizer.tokenize(text)


In [21]:
pprint.pprint(sents[171:181])


['In the wild events which were to follow this girl had no\n'
 'part at all; he never saw her again until all his tale was over.',
 'And yet, in some indescribable way, she kept recurring like a\n'
 'motive in music through all his mad adventures afterwards, and the\n'
 'glory of her strange hair ran like a red thread through those dark\n'
 'and ill-drawn tapestries of the night.',
 'For what followed was so\nimprobable, that it might well have been a dream.',
 'When Syme went out into the starlit street, he found it for the\n'
 'moment empty.',
 'Then he realised (in some odd way) that the silence\n'
 'was rather a living silence than a dead one.',
 'Directly outside the\n'
 'door stood a street lamp, whose gleam gilded the leaves of the tree\n'
 'that bent out over the fence behind him.',
 'About a foot from the\n'
 'lamp-post stood a figure almost as rigid and motionless as the\n'
 'lamp-post itself.',
 'The tall hat and long frock coat were black; the\n'
 'face, in an abrupt shadow

### Word Segmentation

In [22]:
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
seg2 = "0100100100100001001001000010100100010010000100010010000"


In [23]:
def segment(text, segs):
    words = []
    last = 0
    for i in range(len(segs)):
        if segs[i] == '1':
            words.append(text[last:i+1])
            last = i+1
    words.append(text[last:])
    return words


In [24]:
segment(text, seg1)


['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']

In [25]:
def evaluate(text, segs):
    words = segment(text, segs)
    text_size = len(words)
    lexicon_size = len(' '.join(list(set(words))))
    return text_size + lexicon_size

In [27]:
seg3 = "0000100100000011001000000110000100010000001100010000001"
segment(text, seg3)

['doyou',
 'see',
 'thekitt',
 'y',
 'see',
 'thedogg',
 'y',
 'doyou',
 'like',
 'thekitt',
 'y',
 'like',
 'thedogg',
 'y']

In [28]:
evaluate(text,seg3)

46

In [29]:
evaluate(text,seg2)

47

In [31]:
evaluate(text,seg1)

63

### From lists to strings

In [32]:
silly = ['We', 'called', 'him', 'Tortoise', 'because', 'he', 'taught', 'us', '.']
' '.join(silly)

'We called him Tortoise because he taught us .'

In [33]:
 ';'.join(silly)


'We;called;him;Tortoise;because;he;taught;us;.'

In [34]:
 ''.join(silly)


'WecalledhimTortoisebecausehetaughtus.'

### String Formats

In [35]:
word = 'cat'
sentence = """hello world"""

In [37]:
fdist = nltk.FreqDist(['dog', 'cat', 'dog', 'cat', 'dog', 'snake', 'dog', 'cat'])
for word in fdist:
    print (word, '->', fdist[word], ';')

dog -> 4 ;
cat -> 3 ;
snake -> 1 ;


In [40]:
for word in fdist:
    print ('%s->%d;' % (word, fdist[word]))

dog->4;
cat->3;
snake->1;


In [41]:
"%s wants a %s %s" % ("Lee", "sandwich", "for lunch")

'Lee wants a sandwich for lunch'

### Lining Things Up

In [42]:
count, total = 3205, 9375
"accuracy for %d words: %2.4f%%" % (total, 100 * count / total)

'accuracy for 9375 words: 34.1867%'

In [43]:
def tabulate(cfdist, words, categories):
    print ('%-16s' % 'Category')
    for word in words: # column headings
        print ('%6s' % word)
    print
    for category in categories:
        print ('%-16s' % category) # row heading
        for word in words: # for each word
            print ('%6d' % cfdist[category][word]) # print table cell
        print 

In [44]:
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist((genre, word)
for genre in brown.categories()
for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
tabulate(cfd, modals, genres)


Category        
   can
 could
   may
 might
  must
  will
news            
    93
    86
    66
    38
    50
   389
religion        
    82
    59
    78
    12
    54
    71
hobbies         
   268
    58
   131
    22
    83
   264
science_fiction 
    16
    49
     4
    12
     8
    16
romance         
    74
   193
    11
    51
    45
    43
humor           
    16
    30
     8
     8
     9
    13


### Text Wrapping

In [45]:
saying = ['After', 'all', 'is', 'said', 'and', 'done', ',','more', 'is', 'said', 'than', 'done', '.']
for word in saying:
    print (word, '(' + str(len(word)) + '),')


After (5),
all (3),
is (2),
said (4),
and (3),
done (4),
, (1),
more (4),
is (2),
said (4),
than (4),
done (4),
. (1),


In [46]:
from textwrap import fill
format = '%s (%d),'

In [47]:
pieces = [format % (word, len(word)) for word in saying]
output = ' '.join(pieces)
wrapped = fill(output)
print (wrapped)


After (5), all (3), is (2), said (4), and (3), done (4), , (1), more
(4), is (2), said (4), than (4), done (4), . (1),
