# Chapter 7

In [1]:
import nltk, re, numpy

In [2]:
# Tokenize a document into sentences and words, then POS-tag all the words
def ie_preprocess(document):
    """Sentence-split, word-tokenize and POS-tag ``document``.

    The previous version carried stray ``[1]`` / ``[2]`` subscripts (callout
    markers copied along with the code), which silently indexed into the
    token lists, and it never returned its result.

    Args:
        document: raw text as a single string.

    Returns:
        A list with one entry per sentence; each entry is the value returned
        by ``nltk.pos_tag`` for that sentence's word tokens.
    """
    sentences = nltk.sent_tokenize(document)
    tokenized = [nltk.word_tokenize(sent) for sent in sentences]
    return [nltk.pos_tag(tokens) for tokens in tokenized]

### Noun Phrase Chunking

In [3]:
# Hand-tagged example sentence, as (word, POS) pairs, to chunk with a regex grammar
sentence = [
    ("the", "DT"),
    ("little", "JJ"),
    ("yellow", "JJ"),
    ("dog", "NN"),
    ("barked", "VBD"),
    ("at", "IN"),
    ("the", "DT"),
    ("cat", "NN"),
]

In [4]:
# Grammar with a single NP rule: an optional determiner (DT), followed by any number
# of adjectives (JJ), followed by one noun (NN)
grammar = "NP: {<DT>?<JJ>*<NN>}"

In [5]:
# Build a chunk parser from the grammar string and chunk the example sentence
cp = nltk.RegexpParser(grammar)
result = cp.parse(sentence)

In [6]:
# Show the chunked sentence as a bracketed tree
print(result)

(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))


In [7]:
# Draw the tree printed above (opens a new window)
result.draw()

In [8]:
# Tag patterns may use regex-style wildcards inside the angle brackets.
# NOTE(review): this string is a bare tag pattern; the other grammars in this notebook
# wrap the pattern as "LABEL: {...}" before passing it to nltk.RegexpParser.
grammar = "<DT>?<JJ.*>*<NN.*>+"  # optional determiner, any adjectives (JJ*), then one or more nouns (NN*)

In [9]:
# When candidate chunks overlap, the leftmost match takes priority: in the output
# "money market" is chunked and "fund" is left outside the NP
nouns = [("money", "NN"), ("market", "NN"), ("fund", "NN")]
grammar = "NP: {<NN><NN>}  # Chunk two consecutive nouns"
cp = nltk.RegexpParser(grammar)
print(cp.parse(nouns))

(S (NP money/NN market/NN) fund/NN)


### Exploring Text

In [10]:
# Grammar format is "LABEL: {tag pattern}"; this rule chunks verb + TO + verb sequences
cp = nltk.RegexpParser('CHUNK: {<V.*> <TO> <V.*>}')
brown = nltk.corpus.brown  # corpus reader whose tagged sentences are searched in the next cell

In [11]:
# Chunk every tagged Brown sentence and print each subtree labelled CHUNK
for sent in brown.tagged_sents():
    tree = cp.parse(sent)
    for subtree in tree.subtrees(filter=lambda node: node.label() == 'CHUNK'):
        print(subtree)

(CHUNK combined/VBN to/TO achieve/VB)
(CHUNK continue/VB to/TO place/VB)
(CHUNK serve/VB to/TO protect/VB)
(CHUNK wanted/VBD to/TO wait/VB)
(CHUNK allowed/VBN to/TO place/VB)
(CHUNK expected/VBN to/TO become/VB)
(CHUNK expected/VBN to/TO approve/VB)
(CHUNK expected/VBN to/TO make/VB)
(CHUNK intends/VBZ to/TO make/VB)
(CHUNK seek/VB to/TO set/VB)
(CHUNK like/VB to/TO see/VB)
(CHUNK designed/VBN to/TO provide/VB)
(CHUNK get/VB to/TO hear/VB)
(CHUNK expects/VBZ to/TO tell/VB)
(CHUNK expected/VBN to/TO give/VB)
(CHUNK prefer/VB to/TO pay/VB)
(CHUNK required/VBN to/TO obtain/VB)
(CHUNK permitted/VBN to/TO teach/VB)
(CHUNK designed/VBN to/TO reduce/VB)
(CHUNK Asked/VBN to/TO elaborate/VB)
(CHUNK got/VBN to/TO go/VB)
(CHUNK raised/VBN to/TO pay/VB)
(CHUNK scheduled/VBN to/TO go/VB)
(CHUNK cut/VBN to/TO meet/VB)
(CHUNK needed/VBN to/TO meet/VB)
(CHUNK hastened/VBD to/TO add/VB)
(CHUNK found/VBN to/TO prevent/VB)
(CHUNK continue/VB to/TO insist/VB)
(CHUNK compelled/VBN to/TO make/VB)
(CHUNK mad

In [2]:
# "Chinking" removes the specified tag sequences from a chunk: everything is first
# chunked as one NP, then runs of VBD/IN are chinked back out of it
grammar = r"""
  NP:
    {<.*>+}          # Chunk everything
    }<VBD|IN>+{      # Chink sequences of VBD and IN
  """
sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"),
       ("dog", "NN"), ("barked", "VBD"), ("at", "IN"),  ("the", "DT"), ("cat", "NN")]
cp = nltk.RegexpParser(grammar)
print(cp.parse(sentence))

(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))


### IOB Tagging (Inside, Outside, Begin): encoding chunks as per-token tags
The sentence: 
```
We saw the yellow dog.
```
Would be tagged like this:
```
We PRP B-NP
saw VBD O
the DT B-NP
yellow JJ I-NP
dog NN I-NP
```

In [None]:
# Chunk structure in IOB format: one token per line, written as "word POS chunk-tag"
text = '''
he PRP B-NP
accepted VBD B-VP
the DT B-NP
position NN I-NP
of IN B-PP
vice NN B-NP
chairman NN I-NP
of IN B-PP
Carlyle NNP B-NP
Group NNP I-NP
, , O
a DT B-NP
merchant NN I-NP
banking NN I-NP
concern NN I-NP
. . O
'''
# Build a tree from the IOB-tagged text, requesting only the NP chunk type, and draw it
nltk.chunk.conllstr2tree(text, chunk_types=['NP']).draw()

In [17]:
# Print one chunked sentence (index 99) from the CoNLL-2000 training file
from nltk.corpus import conll2000
print(conll2000.chunked_sents('train.txt')[99])

(S
  (PP Over/IN)
  (NP a/DT cup/NN)
  (PP of/IN)
  (NP coffee/NN)
  ,/,
  (NP Mr./NNP Stone/NNP)
  (VP told/VBD)
  (NP his/PRP$ story/NN)
  ./.)


In [18]:
# Same sentence restricted to NP chunks; in the output the PP/VP tokens appear unchunked
print(conll2000.chunked_sents('train.txt', chunk_types=['NP'])[99])

(S
  Over/IN
  (NP a/DT cup/NN)
  of/IN
  (NP coffee/NN)
  ,/,
  (NP Mr./NNP Stone/NNP)
  told/VBD
  (NP his/PRP$ story/NN)
  ./.)


In [4]:
# Baseline: evaluate a parser built from an empty grammar (no chunk rules at all)
# against the NP chunks of the CoNLL-2000 test file
from nltk.corpus import conll2000
cp = nltk.RegexpParser("")
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
print(cp.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  43.4%
    Precision:      0.0%
    Recall:         0.0%
    F-Measure:      0.0%


In [20]:
# Chunk any run of tags whose name starts with C, D, J, N or P (e.g. CD, DT, JJ, NN, PRP),
# i.e. POS tags that are common inside NP chunks
grammar = r"NP: {<[CDJNP].*>+}"
cp = nltk.RegexpParser(grammar)
print(cp.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  87.7%
    Precision:     70.6%
    Recall:        67.8%
    F-Measure:     69.2%


In [5]:
# Chunker that chooses each token's IOB chunk tag from that token's POS tag
class UnigramChunker(nltk.ChunkParserI):
    """NP chunker backed by a unigram tagger that maps POS tags to IOB chunk tags."""

    def __init__(self, train_sents):
        """Train on chunk trees.

        Each tree is flattened to (word, pos, chunk) triples; only the
        (pos, chunk) pairs are kept as training data for the tagger.
        """
        pos_chunk_sents = []
        for sent in train_sents:
            triples = nltk.chunk.tree2conlltags(sent)
            pos_chunk_sents.append([(pos, chunk) for _word, pos, chunk in triples])
        self.tagger = nltk.UnigramTagger(pos_chunk_sents)

    def parse(self, sentence):
        """Chunk a sentence given as (word, pos) pairs and return the resulting tree."""
        pos_sequence = [pos for (_word, pos) in sentence]
        # The tagger sees POS tags as its "words" and emits chunk tags as its "tags"
        chunktags = [chunktag for (_pos, chunktag) in self.tagger.tag(pos_sequence)]
        conlltags = [
            (word, pos, chunktag)
            for (word, pos), chunktag in zip(sentence, chunktags)
        ]
        return nltk.chunk.conlltags2tree(conlltags)

In [6]:
# Load the CoNLL-2000 NP-chunk test and training sentences, then train the unigram chunker
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
unigram_chunker = UnigramChunker(train_sents)

In [7]:
# Score the unigram chunker on the test sentences
print(unigram_chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  92.9%
    Precision:     79.9%
    Recall:        86.8%
    F-Measure:     83.2%


In [8]:
# Every distinct POS tag in the training data, paired with the chunk tag
# the unigram chunker's tagger assigns to it
postags = sorted({pos for sent in train_sents for (_word, pos) in sent.leaves()})
print(unigram_chunker.tagger.tag(postags))

[('#', 'B-NP'), ('$', 'B-NP'), ("''", 'O'), ('(', 'O'), (')', 'O'), (',', 'O'), ('.', 'O'), (':', 'O'), ('CC', 'O'), ('CD', 'I-NP'), ('DT', 'B-NP'), ('EX', 'B-NP'), ('FW', 'I-NP'), ('IN', 'O'), ('JJ', 'I-NP'), ('JJR', 'B-NP'), ('JJS', 'I-NP'), ('MD', 'O'), ('NN', 'I-NP'), ('NNP', 'I-NP'), ('NNPS', 'I-NP'), ('NNS', 'I-NP'), ('PDT', 'B-NP'), ('POS', 'B-NP'), ('PRP', 'B-NP'), ('PRP$', 'B-NP'), ('RB', 'O'), ('RBR', 'O'), ('RBS', 'B-NP'), ('RP', 'O'), ('SYM', 'O'), ('TO', 'O'), ('UH', 'O'), ('VB', 'O'), ('VBD', 'O'), ('VBG', 'O'), ('VBN', 'O'), ('VBP', 'O'), ('VBZ', 'O'), ('WDT', 'B-NP'), ('WP', 'B-NP'), ('WP$', 'B-NP'), ('WRB', 'O'), ('``', 'O')]


In [2]:
# Chunker backed by a bigram tagger: the chunk tag is chosen from the token's POS tag
# together with the preceding token's context
class BigramChunker(nltk.ChunkParserI):
    """NP chunker backed by ``nltk.BigramTagger`` trained on (pos, chunk) pairs."""

    def __init__(self, train_sents):
        """Train on chunk trees.

        Each tree is flattened to (word, pos, chunk) triples; only the
        (pos, chunk) pairs are kept as training data for the bigram tagger.
        """
        pos_chunk_sents = []
        for sent in train_sents:
            triples = nltk.chunk.tree2conlltags(sent)
            pos_chunk_sents.append([(pos, chunk) for _word, pos, chunk in triples])
        self.tagger = nltk.BigramTagger(pos_chunk_sents)

    def parse(self, sentence):
        """Chunk a sentence given as (word, pos) pairs and return the resulting tree."""
        pos_sequence = [pos for (_word, pos) in sentence]
        # The tagger sees POS tags as its "words" and emits chunk tags as its "tags"
        chunktags = [chunktag for (_pos, chunktag) in self.tagger.tag(pos_sequence)]
        conlltags = [
            (word, pos, chunktag)
            for (word, pos), chunktag in zip(sentence, chunktags)
        ]
        return nltk.chunk.conlltags2tree(conlltags)

In [26]:
# Train the bigram chunker and score it on the test sentences
bigram_chunker = BigramChunker(train_sents)
print(bigram_chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  93.3%
    Precision:     82.3%
    Recall:        86.8%
    F-Measure:     84.5%


### Classifier-Based Chunkers

In [9]:
# Feature extractor, version 1: the only feature is the current token's POS tag
def npchunk_features(sentence, i, history):
    """Return the feature dict for token ``i`` of ``sentence``.

    ``sentence`` is a sequence of (word, pos) pairs; ``history`` is accepted
    for interface compatibility and is not used here.
    """
    _word, pos_tag = sentence[i]
    features = {"pos": pos_tag}
    return features

In [10]:
# A maximum-entropy (MaxEnt) classifier is used as an alternative to naive Bayes
# because it does not assume statistical independence of the features
class ConsecutiveNPChunkTagger(nltk.TaggerI):
    """Assign an IOB chunk tag to each (word, pos) token, one token at a time.

    Features come from the module-level ``npchunk_features`` function (whichever
    definition is current when the method runs). It also receives ``history``,
    the chunk tags already assigned to the preceding tokens of the sentence.
    """

    def __init__(self, train_sents):
        """Train the classifier.

        Args:
            train_sents: iterable of sentences, each a sequence of
                ((word, pos), chunktag) pairs.
        """
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (_token, tag) in enumerate(tagged_sent):
                featureset = npchunk_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        # NOTE(review): the recorded runs of this notebook show IIS training being
        # stopped with a keyboard interrupt, so it is evidently slow here.
        # Alternative classifier: nltk.NaiveBayesClassifier.train(train_set)
        self.classifier = nltk.MaxentClassifier.train(train_set, algorithm='IIS', trace=0)

    def tag(self, sentence):
        """Tag ``sentence`` (a sequence of (word, pos) pairs).

        Returns:
            A list of ((word, pos), chunktag) pairs. A list is returned rather
            than a bare ``zip`` object, because under Python 3 ``zip`` is a
            one-shot iterator that cannot be indexed, measured or re-read.
        """
        history = []
        for i, _token in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            history.append(self.classifier.classify(featureset))
        return list(zip(sentence, history))

class ConsecutiveNPChunker(nltk.ChunkParserI):
    """Chunk parser that delegates the tagging to ``ConsecutiveNPChunkTagger``."""

    def __init__(self, train_sents):
        """Convert chunk trees to ((word, pos), chunk) sequences and train the tagger."""
        tagged_sents = []
        for sent in train_sents:
            triples = nltk.chunk.tree2conlltags(sent)
            tagged_sents.append([((w, t), c) for (w, t, c) in triples])
        self.tagger = ConsecutiveNPChunkTagger(tagged_sents)

    def parse(self, sentence):
        """Tag one (word, pos) sentence and rebuild a chunk tree from the result."""
        conlltags = [
            (word, pos, chunk)
            for ((word, pos), chunk) in self.tagger.tag(sentence)
        ]
        return nltk.chunk.conlltags2tree(conlltags)

In [11]:
# Train the classifier-based chunker on the training sentences
# (slow: the recorded run was stopped with a keyboard interrupt)
chunker = ConsecutiveNPChunker(train_sents)

      Training stopped: keyboard interrupt


In [13]:
# Evaluate the MaxEnt-based chunker on the test sentences
print(chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  92.9%
    Precision:     79.9%
    Recall:        86.8%
    F-Measure:     83.2%


In [12]:
# Feature extractor, version 2: current POS tag plus the previous token's POS tag
def npchunk_features(sentence, i, history):
    """Return ``{"pos", "prevpos"}`` features for token ``i``.

    ``sentence`` is a sequence of (word, pos) pairs. For the first token the
    previous POS is the sentinel ``"<START>"``. ``history`` is not used.
    """
    _word, pos = sentence[i]
    features = {"pos": pos, "prevpos": "<START>"}
    if i != 0:
        _prevword, features["prevpos"] = sentence[i - 1]
    return features

In [13]:
# Retrain and evaluate the chunker with the added prevpos feature
chunker = ConsecutiveNPChunker(train_sents)  # slow: the recorded run was interrupted by hand
print(chunker.evaluate(test_sents))

      Training stopped: keyboard interrupt
ChunkParse score:
    IOB Accuracy:  93.5%
    Precision:     82.3%
    Recall:        87.4%
    F-Measure:     84.7%


In [14]:
# Feature extractor, version 3: adds the current word itself as a feature
def npchunk_features(sentence, i, history):
    """Return ``{"pos", "word", "prevpos"}`` features for token ``i``.

    ``sentence`` is a sequence of (word, pos) pairs. For the first token the
    previous POS is the sentinel ``"<START>"``. ``history`` is not used.
    """
    word, pos = sentence[i]
    features = {"pos": pos, "word": word, "prevpos": "<START>"}
    if i != 0:
        _prevword, features["prevpos"] = sentence[i - 1]
    return features

In [15]:
# Retrain and evaluate the chunker with the current-word feature added
chunker = ConsecutiveNPChunker(train_sents)  # slow: the recorded run was interrupted by hand
print(chunker.evaluate(test_sents))

      Training stopped: keyboard interrupt
ChunkParse score:
    IOB Accuracy:  94.5%
    Precision:     85.0%
    Recall:        89.6%
    F-Measure:     87.2%


In [19]:
# Collect the POS tags seen since the most recent determiner before position i
def tags_since_dt(sentence, i):
    """Return the distinct POS tags after the last ``DT`` in ``sentence[:i]``.

    The tags are sorted and joined with ``+``. If there is no determiner before
    position ``i``, every tag in ``sentence[:i]`` is included.
    """
    preceding = [pos for _word, pos in sentence[:i]]
    since_dt = set()
    # Walk backwards and stop at the nearest determiner
    for pos in reversed(preceding):
        if pos == 'DT':
            break
        since_dt.add(pos)
    return '+'.join(sorted(since_dt))

# Feature extractor, version 4: all previous features plus the next POS tag,
# POS bigrams, and the tags seen since the last determiner
def npchunk_features(sentence, i, history):
    """Return the full feature dict for token ``i`` of ``sentence``.

    ``sentence`` is a sequence of (word, pos) pairs. ``"<START>"`` and
    ``"<END>"`` stand in for the missing neighbour at the sentence edges.
    ``history`` is not used.
    """
    word, pos = sentence[i]
    prevpos, nextpos = "<START>", "<END>"
    if i != 0:
        _prevword, prevpos = sentence[i - 1]
    if i != len(sentence) - 1:
        _nextword, nextpos = sentence[i + 1]
    features = dict(pos=pos, word=word, prevpos=prevpos, nextpos=nextpos)
    features["prevpos+pos"] = "%s+%s" % (prevpos, pos)
    features["pos+nextpos"] = "%s+%s" % (pos, nextpos)
    features["tags-since-dt"] = tags_since_dt(sentence, i)
    return features

In [20]:
# Retrain and evaluate the chunker with the extended feature set
chunker = ConsecutiveNPChunker(train_sents)  # slow: the recorded run was interrupted by hand
print(chunker.evaluate(test_sents))

      Training stopped: keyboard interrupt
ChunkParse score:
    IOB Accuracy:  96.0%
    Precision:     88.5%
    Recall:        91.3%
    F-Measure:     89.9%


### Recursion in Linguistic Structure

In [4]:
# Cascaded grammar: rules refer to chunks produced by other rules
# (PP uses NP; VP uses NP/PP/CLAUSE; CLAUSE uses NP and VP)
grammar = r"""
  NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
  PP: {<IN><NP>}               # Chunk prepositions followed by NP
  VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
  CLAUSE: {<NP><VP>}           # Chunk NP, VP
  """
cp = nltk.RegexpParser(grammar)
# Example sentence as (word, POS) pairs
sentence = [("Mary", "NN"), ("saw", "VBD"), ("the", "DT"), ("cat", "NN"),
    ("sit", "VB"), ("on", "IN"), ("the", "DT"), ("mat", "NN")]

In [15]:
# Parse with the cascaded grammar; note in the output that "saw" is not chunked into a VP
print(cp.parse(sentence))

(S
  (NP Mary/NN)
  saw/VBD
  (CLAUSE
    (NP the/DT cat/NN)
    (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))


In [16]:
# Draw the same parse as a tree
cp.parse(sentence).draw()

In [5]:
# A more deeply nested example ("John thinks Mary saw the cat sit on the mat")
sentence = [("John", "NNP"), ("thinks", "VBZ"), ("Mary", "NN"),
            ("saw", "VBD"), ("the", "DT"), ("cat", "NN"), ("sit", "VB"),
            ("on", "IN"), ("the", "DT"), ("mat", "NN")]

In [6]:
# loop=2 asks the parser to run through its patterns twice, so chunks created late in
# the first pass (e.g. CLAUSE) can be picked up by earlier rules (e.g. VP) on the second
# NOTE(review): description of `loop` follows the NLTK RegexpParser docs — confirm
cp = nltk.RegexpParser(grammar, loop=2)
#print(cp.parse(sentence))
cp.parse(sentence).draw()

In [30]:
# Build a tree by hand: label 'NP' with the single leaf 'Alice'
tree1 = nltk.Tree('NP', ['Alice'])
print(tree1)

(NP Alice)


In [31]:
# Children are passed as a list; here two leaves under 'NP'
tree2 = nltk.Tree('NP', ['the', 'rabbit'])
print(tree2)

(NP the rabbit)


In [32]:
# Trees nest: tree3 takes tree2 as a child, and tree4 joins tree1 and tree3 under 'S'
tree3 = nltk.Tree('VP', ['chased', tree2])
tree4 = nltk.Tree('S', [tree1, tree3])

In [33]:
# Show the full nested tree in bracketed form
print(tree4)

(S (NP Alice) (VP chased (NP the rabbit)))


In [34]:
# Index into the tree to reach its second child, then read that subtree's label
tree4[1].label()

'VP'

In [35]:
# List the leaves (words) of the tree from left to right
tree4.leaves()

['Alice', 'chased', 'the', 'rabbit']

In [36]:
# Draw the hand-built tree graphically
tree4.draw()

In [7]:
# Recursively print a tree in bracketed form, visiting children left to right
def traverse(t):
    """Print ``t`` and its descendants on one line, separated by spaces.

    Anything without a ``label()`` method is treated as a leaf and printed
    as-is; a labelled node is printed as ``( LABEL child ... )``.
    """
    try:
        label = t.label()
    except AttributeError:
        # Leaf: no label() method
        print(t, end=" ")
        return
    print('(', label, end=" ")
    for child in t:
        traverse(child)
    print(')', end=" ")

In [12]:
# Parse a bracketed string into a Tree and traverse it
t = nltk.Tree.fromstring('(S (NP Alice) (VP chased (NP the rabbit)))')
traverse(t)

( S ( NP Alice ) ( VP chased ( NP the rabbit ) ) ) 

### Named Entity Recognition

In [45]:
# Use NLTK's built-in named-entity chunker on a POS-tagged Treebank sentence.
# With binary=True every entity is labelled simply NE (see the output).

# For reference, copied from the NLTK source:
# _BINARY_NE_CHUNKER = 'chunkers/maxent_ne_chunker/english_ace_binary.pickle'
# _MULTICLASS_NE_CHUNKER = 'chunkers/maxent_ne_chunker/english_ace_multiclass.pickle'
# def ne_chunk(tagged_tokens, binary=False):


sent = nltk.corpus.treebank.tagged_sents()[22]
print(nltk.ne_chunk(sent, binary=True))

(S
  The/DT
  (NE U.S./NNP)
  is/VBZ
  one/CD
  of/IN
  the/DT
  few/JJ
  industrialized/VBN
  nations/NNS
  that/WDT
  *T*-7/-NONE-
  does/VBZ
  n't/RB
  have/VB
  a/DT
  higher/JJR
  standard/NN
  of/IN
  regulation/NN
  for/IN
  the/DT
  smooth/JJ
  ,/,
  needle-like/JJ
  fibers/NNS
  such/JJ
  as/IN
  crocidolite/NN
  that/WDT
  *T*-1/-NONE-
  are/VBP
  classified/VBN
  *-5/-NONE-
  as/IN
  amphobiles/NNS
  ,/,
  according/VBG
  to/TO
  (NE Brooke/NNP)
  T./NNP
  Mossman/NNP
  ,/,
  a/DT
  professor/NN
  of/IN
  pathlogy/NN
  at/IN
  the/DT
  (NE University/NNP)
  of/IN
  (NE Vermont/NNP College/NNP)
  of/IN
  (NE Medicine/NNP)
  ./.)


In [46]:
# Without binary=True, entities get category labels (PERSON, ORGANIZATION, GPE in the output)
print(nltk.ne_chunk(sent))

(S
  The/DT
  (GPE U.S./NNP)
  is/VBZ
  one/CD
  of/IN
  the/DT
  few/JJ
  industrialized/VBN
  nations/NNS
  that/WDT
  *T*-7/-NONE-
  does/VBZ
  n't/RB
  have/VB
  a/DT
  higher/JJR
  standard/NN
  of/IN
  regulation/NN
  for/IN
  the/DT
  smooth/JJ
  ,/,
  needle-like/JJ
  fibers/NNS
  such/JJ
  as/IN
  crocidolite/NN
  that/WDT
  *T*-1/-NONE-
  are/VBP
  classified/VBN
  *-5/-NONE-
  as/IN
  amphobiles/NNS
  ,/,
  according/VBG
  to/TO
  (PERSON Brooke/NNP T./NNP Mossman/NNP)
  ,/,
  a/DT
  professor/NN
  of/IN
  pathlogy/NN
  at/IN
  the/DT
  (ORGANIZATION University/NNP)
  of/IN
  (PERSON Vermont/NNP College/NNP)
  of/IN
  (GPE Medicine/NNP)
  ./.)


In [47]:
# Inspect the (word, POS) pairs that were passed to ne_chunk
sent

[('The', 'DT'),
 ('U.S.', 'NNP'),
 ('is', 'VBZ'),
 ('one', 'CD'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('few', 'JJ'),
 ('industrialized', 'VBN'),
 ('nations', 'NNS'),
 ('that', 'WDT'),
 ('*T*-7', '-NONE-'),
 ('does', 'VBZ'),
 ("n't", 'RB'),
 ('have', 'VB'),
 ('a', 'DT'),
 ('higher', 'JJR'),
 ('standard', 'NN'),
 ('of', 'IN'),
 ('regulation', 'NN'),
 ('for', 'IN'),
 ('the', 'DT'),
 ('smooth', 'JJ'),
 (',', ','),
 ('needle-like', 'JJ'),
 ('fibers', 'NNS'),
 ('such', 'JJ'),
 ('as', 'IN'),
 ('crocidolite', 'NN'),
 ('that', 'WDT'),
 ('*T*-1', '-NONE-'),
 ('are', 'VBP'),
 ('classified', 'VBN'),
 ('*-5', '-NONE-'),
 ('as', 'IN'),
 ('amphobiles', 'NNS'),
 (',', ','),
 ('according', 'VBG'),
 ('to', 'TO'),
 ('Brooke', 'NNP'),
 ('T.', 'NNP'),
 ('Mossman', 'NNP'),
 (',', ','),
 ('a', 'DT'),
 ('professor', 'NN'),
 ('of', 'IN'),
 ('pathlogy', 'NN'),
 ('at', 'IN'),
 ('the', 'DT'),
 ('University', 'NNP'),
 ('of', 'IN'),
 ('Vermont', 'NNP'),
 ('College', 'NNP'),
 ('of', 'IN'),
 ('Medicine', 'NNP'),
 ('.', '

In [25]:
# Relation extraction: look for (X, filler, Y) triples where the text between the two
# entities matches a regular expression.
# IN:        filler contains the word "in", unless "in" is followed by text containing
#            a word ending in -ing (negative lookahead)
# ALLIN:     filler contains the word "in", with no further restriction
# LOCCHANGE: filler contains the word "to"
# NOTE: the loop below passes ALLIN; IN and LOCCHANGE are defined but not used in this cell.
IN = re.compile(r'.*\bin\b(?!\b.+ing)')
ALLIN = re.compile(r'.*\bin\b')
LOCCHANGE = re.compile(r'.*\bto\b')
for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
    # Print each ORG/LOC relation found in this document
    for rel in nltk.sem.extract_rels('ORG', 'LOC', doc,
                                     corpus='ieer', pattern = ALLIN):
        print(nltk.sem.rtuple(rel))

[ORG: 'WHYY'] 'in' [LOC: 'Philadelphia']
[ORG: 'McGlashan &AMP; Sarrail'] 'firm in' [LOC: 'San Mateo']
[ORG: 'Freedom Forum'] 'in' [LOC: 'Arlington']
[ORG: 'Brookings Institution'] ', the research group in' [LOC: 'Washington']
[ORG: 'Idealab'] ', a self-described business incubator based in' [LOC: 'Los Angeles']
[ORG: 'Open Text'] ', based in' [LOC: 'Waterloo']
[ORG: 'WGBH'] 'in' [LOC: 'Boston']
[ORG: 'Bastille Opera'] 'in' [LOC: 'Paris']
[ORG: 'Omnicom'] 'in' [LOC: 'New York']
[ORG: 'DDB Needham'] 'in' [LOC: 'New York']
[ORG: 'Kaplan Thaler Group'] 'in' [LOC: 'New York']
[ORG: 'BBDO South'] 'in' [LOC: 'Atlanta']
[ORG: 'Georgia-Pacific'] 'in' [LOC: 'Atlanta']


In [14]:
# Print the parsed text tree of every document in the IEER file (very long output)
for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
    print(doc.text)

(DOCUMENT
  For
  almost
  (DURATION 20 years)
  ,
  since
  its
  debut
  in
  (DATE 1979)
  ,
  (PERSON Bob Edwards)
  has
  presided
  over
  the
  (ORGANIZATION National Public Radio)
  news
  magazine
  ``Morning
  Edition.''
  But
  from
  the
  start,
  the
  soothing,
  avuncular
  tone
  that
  is
  (PERSON Edwards)
  '
  trademark
  raised
  certain
  questions.
  ``Isn't
  that
  man
  dangerous?''
  a
  (MEASURE 10-year)
  -old
  in
  the
  school
  car
  pool
  I
  was
  driving
  asked
  in
  (DATE 1980)
  .
  I
  couldn't
  imagine
  what
  could
  be
  less
  dangerous.
  ``What
  are
  you
  talking
  about?''
  I
  asked.
  ``It's
  early
  in
  the
  morning,''
  she
  said.
  ``There
  are
  all
  these
  people
  driving
  around.
  He's
  going
  to
  make
  them
  all
  go
  back
  to
  sleep.''
  Like
  a
  number
  of
  other
  high-profile
  (ORGANIZATION NPR)
  news-magazine
  hosts
  or
  news
  readers,
  (PERSON Edwards)
  conveys
  a
  distinct
  sense
  

In [15]:
# Handle on NLTK's gazetteers corpus reader
gazette = nltk.corpus.gazetteers

In [17]:
# Exploratory: list the attributes and methods available on the corpus reader
dir(gazette)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 '_encoding',
 '_fileids',
 '_get_root',
 '_root',
 '_tagset',
 '_unload',
 'abspath',
 'abspaths',
 'encoding',
 'ensure_loaded',
 'fileids',
 'open',
 'raw',
 'readme',
 'root',
 'unicode_repr',
 'words']

In [18]:
# Entries returned by the corpus reader when no file id is given
g_words = gazette.words()

In [22]:
# Number of entries returned by gazette.words()
len(g_words)

1211

In [31]:
# Check whether 'Birmingham' appears in the US cities gazetteer file
[name for name in gazette.words('uscities.txt') if name == 'Birmingham']

['Birmingham']

In [34]:
# List the files that make up the gazetteers corpus
gazette.fileids()

['caprovinces.txt',
 'countries.txt',
 'isocountries.txt',
 'mexstates.txt',
 'nationalities.txt',
 'uscities.txt',
 'usstateabbrev.txt',
 'usstates.txt']

In [35]:
# Peek at the first five entries of each gazetteer file
for fid in gazette.fileids():
    print(fid, gazette.words(fid)[:5])

caprovinces.txt ['Alberta', 'British Columbia', 'Manitoba', 'New Brunswick', 'Newfoundland and Labrador']
countries.txt ['Abkhazia', 'Afghanistan', 'Akrotiri', 'Akrotiri and Dhekelia', 'Aland']
isocountries.txt ['Afghanistan', 'Ĺland Islands', 'Albania', 'Algeria', 'American Samoa']
mexstates.txt ['Aguascalientes', 'Baja California', 'Baja California Sur', 'Campeche', 'Chiapas']
nationalities.txt ['Afghan', 'Albanian', 'Algerian', 'American', 'Andorran']
uscities.txt ['Birmingham', 'Huntsville', 'Mobile', 'Montgomery', 'Anchorage']
usstateabbrev.txt ['Ala.', 'Alas.', 'A.R.', 'Ariz.', 'Ark.']
usstates.txt ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California']


In [3]:
from nltk.chunk import ChunkParserI
from nltk.corpus import gazetteers
from nltk.chunk.util import conlltags2tree, tree2conlltags
from nltk.ch

In [4]:
# Untagged example sentence as a list of word strings
# (note: this rebinds `sentence`, which held (word, POS) pairs in earlier cells)
sentence = "Albany NY is much worse than Boston MA".split(" ")
print(sentence)

['Albany', 'NY', 'is', 'much', 'worse', 'than', 'Boston', 'MA']


In [None]:
class LocationChunker(ChunkParserI):
    """Work-in-progress chunker intended to use the gazetteers corpus to find locations.

    The attributes used to be assigned as ``self.…`` directly in the class
    body, where ``self`` does not exist, so defining the class raised
    ``NameError``. They are now set per instance in ``__init__``.
    """

    def __init__(self):
        # Every gazetteer entry, held in a set for fast membership tests
        self.locations = set(gazetteers.words())
        # NOTE(review): presumably how many following tokens to consider when
        # matching multi-word names — confirm once the tagging logic exists
        self.max_lookahead = 1

    def get_iob_tags(self, tagged_sents):
        """Placeholder: IOB tagging is not implemented yet, so this returns None."""
        # TODO: implement the gazetteer lookup over ``tagged_sents``
        return None
        

In [5]:
# Exploratory: inspect which modules are currently loaded in the kernel (very long output)
import sys
sys.modules

{'IPython': <module 'IPython' from '/usr/local/lib/python3.4/site-packages/IPython/__init__.py'>,
 'IPython.core': <module 'IPython.core' from '/usr/local/lib/python3.4/site-packages/IPython/core/__init__.py'>,
 'IPython.core.alias': <module 'IPython.core.alias' from '/usr/local/lib/python3.4/site-packages/IPython/core/alias.py'>,
 'IPython.core.application': <module 'IPython.core.application' from '/usr/local/lib/python3.4/site-packages/IPython/core/application.py'>,
 'IPython.core.autocall': <module 'IPython.core.autocall' from '/usr/local/lib/python3.4/site-packages/IPython/core/autocall.py'>,
 'IPython.core.builtin_trap': <module 'IPython.core.builtin_trap' from '/usr/local/lib/python3.4/site-packages/IPython/core/builtin_trap.py'>,
 'IPython.core.compilerop': <module 'IPython.core.compilerop' from '/usr/local/lib/python3.4/site-packages/IPython/core/compilerop.py'>,
 'IPython.core.completer': <module 'IPython.core.completer' from '/usr/local/lib/python3.4/site-packages/IPython/cor

In [7]:
# List the parsed documents available in the IEER file NYT_19980315
nltk.corpus.ieer.parsed_docs('NYT_19980315')

[<IEERDocument NYT19980315.0063: 'PUBLIC RADIO HOSTS DROP IN AND MAYBE STAY TOO LONG'>, <IEERDocument NYT19980315.0064: 'IN CYBERSPACE, IS THERE LAW WHERE THERE IS NO LAND?'>, ...]