In [1]:
import nltk, re, pprint

## 1 Chunking:

#### Phrase chunking and tag patterns using regex:

In [2]:
# A POS-tagged sentence: a list of (word, tag) pairs.
sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"),
    ("dog", "NN"), ("barked", "VBD"), ("at", "IN"),  ("the", "DT"), ("cat", "NN")]

grammar = "NP: {<DT>?<JJ>*<NN>}" # tag pattern: optional DT, any number of JJ, then one NN

cp = nltk.RegexpParser(grammar) # regular-expression chunk parser built from the grammar
result = cp.parse(sentence)
# Printed as a bracketed tree; `result` is reused by the drawing cell that follows.
print(result)

(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))


In [3]:
# Draw the chunk tree produced by the previous cell.
# NOTE(review): presumably opens a separate GUI window — confirm this works in a headless kernel / Run All.
result.draw()

#### Tag pattern combination:

In [4]:
# Two chunk rules under the same NP label; the '#' annotations inside the
# string are part of the grammar text itself, not Python comments.
grammar = r"""
    NP: {<DT|PP\$>?<JJ>*<NN.*>} # determiner/ possesive, adjective and noun
        {<NNP>+}                # proper nouns
"""
cp = nltk.RegexpParser(grammar)
# Example sentence with a proper noun (NNP) and a possessive pronoun (PP$).
sentence = [("Rapunzel", "NNP"), ("let", "VBD"), ("down", "RP"),
                 ("her", "PP$"), ("long", "JJ"), ("golden", "JJ"), ("hair", "NN")]
# `result` is overwritten here and drawn by the next cell.
result = cp.parse(sentence)

In [5]:
# Draw the tree for the "Rapunzel" sentence parsed in the previous cell.
# NOTE(review): presumably opens a separate GUI window — confirm this works in a headless kernel / Run All.
result.draw()

##### Overlapping matches:

In [6]:
# Three consecutive nouns, but the rule only chunks pairs.
nouns = [("money", "NN"), ("market", "NN"), ("fund", "NN")]
grammar = "NP: {<NN>{2}}  # Chunk two consecutive nouns"
cp = nltk.RegexpParser(grammar)
# The printed tree chunks "money market" and leaves "fund" outside:
# the possible overlapping match "market fund" is not produced.
print(cp.parse(nouns))

(S (NP money/NN market/NN) fund/NN)


#### Exploring text corpora:

Search text corpora using a tag pattern:

In [7]:
# Tag pattern: a verb tag (V.*), then TO, then another verb tag.
grammar = 'CHUNK: {<V.*> <TO> <V.*>}'

cp = nltk.RegexpParser(grammar)
# `brown` is also used by a later cell.
brown = nltk.corpus.brown

# Only the first 50 tagged sentences are scanned.
for x in brown.tagged_sents()[:50]:
    tree = cp.parse(x)
    for y in tree.subtrees():
        # subtrees() also yields nodes with other labels; print CHUNK nodes only.
        if y.label() == 'CHUNK': print(y)

(CHUNK combined/VBN to/TO achieve/VB)
(CHUNK continue/VB to/TO place/VB)
(CHUNK serve/VB to/TO protect/VB)
(CHUNK wanted/VBD to/TO wait/VB)


In [8]:
def find_chunks(grammar, tagged_sents):
    """Print every chunk that ``grammar`` finds in ``tagged_sents``.

    Args:
        grammar: a RegexpParser grammar string. The text before the first
            ':' (with surrounding whitespace removed) is taken as the chunk
            label; only subtrees carrying that label are printed.
        tagged_sents: iterable of POS-tagged sentences, each a sequence of
            (word, tag) pairs.

    Returns:
        None. Matching subtrees are written with ``print``.
    """
    cp = nltk.RegexpParser(grammar)
    # Derive the label once instead of once per subtree, and strip it so
    # that indented or multi-line grammar strings (e.g. starting with a
    # newline) still compare equal to the labels on the parsed tree.
    label = grammar.split(':')[0].strip()
    for sent in tagged_sents:
        tree = cp.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() == label:
                print(subtree)
                
# `brown` was bound in the earlier corpus-search cell; reuse its first 50 sentences.
tagged_sents = brown.tagged_sents()[:50]
# Runs of four or more tokens whose tag starts with N.
grammar = "NOUNS: {<N.*>{4,}}"

find_chunks(grammar, tagged_sents)

(NOUNS Court/NN-TL Judge/NN-TL Durwood/NP Pye/NP)
(NOUNS Mayor-nominate/NN-TL Ivan/NP Allen/NP Jr./NP)
(NOUNS Georgia's/NP$ automobile/NN title/NN law/NN)
(NOUNS State/NN-TL Welfare/NN-TL Department's/NN$-TL handling/NN)
(NOUNS Fulton/NP-TL Tax/NN-TL Commissioner's/NN$-TL Office/NN-TL)
(NOUNS Mayor/NN-TL William/NP B./NP Hartsfield/NP)
(NOUNS Mrs./NP J./NP M./NP Cheshire/NP)
(NOUNS E./NP Pelham/NP Rd./NN-TL Aj/NN)
(NOUNS
  State/NN-TL
  Party/NN-TL
  Chairman/NN-TL
  James/NP
  W./NP
  Dorsey/NP)
(NOUNS Texas/NP Sen./NN-TL John/NP Tower/NP)


#### Chinking:

In [9]:
# Chinking: first chunk the whole sentence, then cut the VBD/IN tokens out of it.
# The '#' annotations inside the string belong to the grammar text.
grammar = r"""
    NP:
        {<.*>+}       # Chunk everything
        }<VBD|IN>+{   # Chink sequences of VBD and IN
"""
# Same example sentence as in the first chunking cell.
sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"),
       ("dog", "NN"), ("barked", "VBD"), ("at", "IN"),  ("the", "DT"), ("cat", "NN")]
cp = nltk.RegexpParser(grammar)
print(cp.parse(sentence))

(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))


## 2 Developing and evaluating chunkers:

In [10]:
# One token per line: word, POS tag, chunk tag.
# Chunk tags follow the IOB scheme (B-XX begins a chunk, I-XX continues it, O is outside).
text = '''
he PRP B-NP
accepted VBD B-VP
the DT B-NP
position NN I-NP
of IN B-PP
vice NN B-NP
chairman NN I-NP
of IN B-PP
Carlyle NNP B-NP
Group NNP I-NP
, , O
a DT B-NP
merchant NN I-NP
banking NN I-NP
concern NN I-NP
. . O
'''

In [11]:
# Build a chunk tree from the IOB text, restricted to NP and PP chunk types, and draw it.
# NOTE(review): draw() presumably opens a separate GUI window — confirm for headless runs.
nltk.chunk.conllstr2tree(text, chunk_types=['NP', 'PP']).draw()

#### Simple evaluation and baselines:

In [12]:
from nltk.corpus import conll2000
cp = nltk.RegexpParser("") # baseline: empty grammar, so no chunks are built and every token counts as 'O'
# NP-only view of the CoNLL-2000 test split; `test_sents` is reused by later cells.
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
print(cp.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  43.4%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%


In [13]:
# Naive NP rule: any run of tags starting with C, D, J, N or P.
grammar = r"NP: {<[CDJNP].*>+}"
cp = nltk.RegexpParser(grammar)
# Uses `test_sents` from the previous evaluation cell.
print(cp.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  87.7%%
    Precision:     70.6%%
    Recall:        67.8%%
    F-Measure:     69.2%%


#### Unigram tagger:

In [14]:
class UnigramChunker(nltk.ChunkParserI):
    """Chunker that assigns each token a chunk tag from its POS tag.

    A ``nltk.UnigramTagger`` is trained on (POS tag, chunk tag) pairs, so
    the words themselves are never used as features.
    """

    def __init__(self, train_sents):
        """Train on ``train_sents``, an iterable of chunk trees."""
        train_data = []
        for chunk_tree in train_sents:
            triples = nltk.chunk.tree2conlltags(chunk_tree)
            # Drop the word; keep (POS tag, chunk tag) as the tagger's (token, tag).
            train_data.append([(pos, iob) for _word, pos, iob in triples])
        self.tagger = nltk.UnigramTagger(train_data)

    def parse(self, sentence):
        """Chunk a POS-tagged ``sentence`` of (word, pos) pairs; return a tree."""
        pos_sequence = [pos for _word, pos in sentence]
        iob_sequence = [iob for _pos, iob in self.tagger.tag(pos_sequence)]
        # Re-attach the words to obtain (word, pos, chunk tag) triples.
        conlltags = [
            (word, pos, iob)
            for (word, pos), iob in zip(sentence, iob_sequence)
        ]
        return nltk.chunk.conlltags2tree(conlltags)

In [63]:
# NP-only views of the CoNLL-2000 train/test splits; both names are reused by later cells.
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
# Note the spelling "chuncker": later cells refer to this exact name.
unigram_chuncker = UnigramChunker(train_sents)
print(unigram_chuncker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  92.9%%
    Precision:     79.9%%
    Recall:        86.8%%
    F-Measure:     83.2%%


In [64]:
# Apply the CoNLL-trained chunker to the first Brown sentence.
# NOTE(review): the output shows Brown tags such as AT and NP-TL, which do not appear in the
# tag list the training data yields further below — likely why most tokens come back as 'O'.
sentence = nltk.corpus.brown.tagged_sents()[0]
nltk.chunk.tree2conlltags(unigram_chuncker.parse(sentence))

[('The', 'AT', 'O'),
 ('Fulton', 'NP-TL', 'O'),
 ('County', 'NN-TL', 'O'),
 ('Grand', 'JJ-TL', 'O'),
 ('Jury', 'NN-TL', 'O'),
 ('said', 'VBD', 'O'),
 ('Friday', 'NR', 'O'),
 ('an', 'AT', 'O'),
 ('investigation', 'NN', 'B-NP'),
 ('of', 'IN', 'O'),
 ("Atlanta's", 'NP$', 'O'),
 ('recent', 'JJ', 'B-NP'),
 ('primary', 'NN', 'I-NP'),
 ('election', 'NN', 'I-NP'),
 ('produced', 'VBD', 'O'),
 ('``', '``', 'O'),
 ('no', 'AT', 'O'),
 ('evidence', 'NN', 'B-NP'),
 ("''", "''", 'O'),
 ('that', 'CS', 'O'),
 ('any', 'DTI', 'O'),
 ('irregularities', 'NNS', 'B-NP'),
 ('took', 'VBD', 'O'),
 ('place', 'NN', 'B-NP'),
 ('.', '.', 'O')]

In [32]:
# Every distinct POS tag in the training trees, sorted; `postags` is reused by a later cell.
postags = sorted(set(pos for sent in train_sents
                    for word, pos in sent.leaves()))
# Show which chunk tag the unigram model assigns to each POS tag.
print(unigram_chuncker.tagger.tag(postags))

[('#', 'B-NP'), ('$', 'B-NP'), ("''", 'O'), ('(', 'O'), (')', 'O'), (',', 'O'), ('.', 'O'), (':', 'O'), ('CC', 'O'), ('CD', 'I-NP'), ('DT', 'B-NP'), ('EX', 'B-NP'), ('FW', 'I-NP'), ('IN', 'O'), ('JJ', 'I-NP'), ('JJR', 'B-NP'), ('JJS', 'I-NP'), ('MD', 'O'), ('NN', 'I-NP'), ('NNP', 'I-NP'), ('NNPS', 'I-NP'), ('NNS', 'I-NP'), ('PDT', 'B-NP'), ('POS', 'B-NP'), ('PRP', 'B-NP'), ('PRP$', 'B-NP'), ('RB', 'O'), ('RBR', 'O'), ('RBS', 'B-NP'), ('RP', 'O'), ('SYM', 'O'), ('TO', 'O'), ('UH', 'O'), ('VB', 'O'), ('VBD', 'O'), ('VBG', 'O'), ('VBN', 'O'), ('VBP', 'O'), ('VBZ', 'O'), ('WDT', 'B-NP'), ('WP', 'B-NP'), ('WP$', 'B-NP'), ('WRB', 'O'), ('``', 'O')]


#### Bigram tagger:

In [51]:
class BigramChunker(nltk.ChunkParserI):
    """Chunker that assigns chunk tags from POS tags with a bigram tagger.

    A ``nltk.BigramTagger`` is trained on (POS tag, chunk tag) pairs and
    backs off to a ``nltk.UnigramTagger`` trained on the same data.
    """

    def __init__(self, train_sents):
        """Train on ``train_sents``, an iterable of chunk trees."""
        train_data = []
        for chunk_tree in train_sents:
            triples = nltk.chunk.tree2conlltags(chunk_tree)
            # Drop the word; keep (POS tag, chunk tag) as the tagger's (token, tag).
            train_data.append([(pos, iob) for _word, pos, iob in triples])
        unigram_backoff = nltk.UnigramTagger(train_data)
        self.tagger = nltk.BigramTagger(train_data, backoff=unigram_backoff)

    def parse(self, sentence):
        """Chunk a POS-tagged ``sentence`` of (word, pos) pairs; return a tree."""
        pos_sequence = [pos for _word, pos in sentence]
        iob_sequence = [iob for _pos, iob in self.tagger.tag(pos_sequence)]
        # Re-attach the words to obtain (word, pos, chunk tag) triples.
        conlltags = [
            (word, pos, iob)
            for (word, pos), iob in zip(sentence, iob_sequence)
        ]
        return nltk.chunk.conlltags2tree(conlltags)

In [52]:
# Train and score the bigram chunker on the same splits as the unigram chunker.
# Relies on `train_sents` / `test_sents` from the unigram training cell.
bigram_chuncker = BigramChunker(train_sents)
print(bigram_chuncker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  93.4%%
    Precision:     82.3%%
    Recall:        87.0%%
    F-Measure:     84.6%%


In [53]:
# Tag the sorted POS-tag list as if it were one sentence (`postags` comes from the earlier cell).
# Since the bigram tagger conditions on the preceding tag, the result depends on that ordering.
print(bigram_chuncker.tagger.tag(postags))

[('#', 'B-NP'), ('$', 'I-NP'), ("''", 'O'), ('(', 'O'), (')', 'O'), (',', 'O'), ('.', 'O'), (':', 'O'), ('CC', 'O'), ('CD', 'B-NP'), ('DT', 'I-NP'), ('EX', 'B-NP'), ('FW', 'I-NP'), ('IN', 'O'), ('JJ', 'B-NP'), ('JJR', 'I-NP'), ('JJS', 'I-NP'), ('MD', 'O'), ('NN', 'B-NP'), ('NNP', 'I-NP'), ('NNPS', 'I-NP'), ('NNS', 'I-NP'), ('PDT', 'I-NP'), ('POS', 'B-NP'), ('PRP', 'B-NP'), ('PRP$', 'I-NP'), ('RB', 'O'), ('RBR', 'O'), ('RBS', 'B-NP'), ('RP', 'O'), ('SYM', 'O'), ('TO', 'O'), ('UH', 'O'), ('VB', 'O'), ('VBD', 'O'), ('VBG', 'O'), ('VBN', 'O'), ('VBP', 'O'), ('VBZ', 'O'), ('WDT', 'B-NP'), ('WP', 'B-NP'), ('WP$', 'B-NP'), ('WRB', 'O'), ('``', 'O')]


In [79]:
#nltk.chunk.tree2conlltags(bigram_chuncker.parse(sentence))

#### Classifier-based chunkers:

In [73]:
class ConsecutiveNPChunkTagger(nltk.TaggerI):
    """Left-to-right chunk tagger driven by a ``nltk.MaxentClassifier``.

    Feature extraction is delegated to a module-level function
    ``npchunk_features(sentence, i, history)``, which must be defined
    before this class is instantiated or used.
    """

    def __init__(self, train_sents):
        """Train the classifier.

        Args:
            train_sents: iterable of sentences, each a list of
                ((word, pos), chunk_tag) pairs.
        """
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            # Chunk tags already seen in this sentence, offered to the
            # feature extractor as left context.
            history = []
            for i, (_token, tag) in enumerate(tagged_sent):
                featureset = npchunk_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        self.classifier = nltk.MaxentClassifier.train(train_set, trace=0)

    def tag(self, sentence):
        """Tag ``sentence`` (a sequence of (word, pos) pairs).

        Returns:
            A list of ((word, pos), chunk_tag) pairs. A real list is
            returned rather than a lazy ``zip`` object so the result can be
            indexed, measured and iterated more than once.
        """
        history = []
        for i, _token in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            # Predictions are fed back as history for the following tokens.
            history.append(self.classifier.classify(featureset))
        return list(zip(sentence, history))

class ConsecutiveNPChunker(nltk.ChunkParserI):
    """Chunk parser that wraps a ``ConsecutiveNPChunkTagger``."""

    def __init__(self, train_sents):
        """Convert chunk trees to tagger training data and train the tagger.

        Args:
            train_sents: iterable of chunk trees.
        """
        tagged_sents = []
        for chunk_tree in train_sents:
            triples = nltk.chunk.tree2conlltags(chunk_tree)
            # Regroup (word, pos, chunk) as ((word, pos), chunk) for the tagger.
            tagged_sents.append(
                [((word, pos), iob) for (word, pos, iob) in triples]
            )
        self.tagger = ConsecutiveNPChunkTagger(tagged_sents)

    def parse(self, sentence):
        """Chunk a POS-tagged ``sentence`` and return the resulting tree."""
        conlltags = [
            (word, pos, iob)
            for ((word, pos), iob) in self.tagger.tag(sentence)
        ]
        return nltk.chunk.conlltags2tree(conlltags)

In [74]:
def npchunk_features(sentence, i, history):
    """Return the features for token ``i``: only its POS tag.

    Args:
        sentence: sequence of (word, pos) pairs.
        i: index of the token to describe.
        history: chunk tags assigned so far; not used by this extractor.

    Returns:
        A dict with the single key ``"pos"``.
    """
    _word, pos_tag = sentence[i]
    features = {}
    features["pos"] = pos_tag
    return features
# Train the classifier-based chunker with the POS-only features and score it.
# Relies on `train_sents` / `test_sents` from the unigram training cell.
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  92.9%%
    Precision:     79.9%%
    Recall:        86.8%%
    F-Measure:     83.2%%


In [80]:
# Rebuild, for inspection, the same ((word, pos), chunk_tag) structure that
# ConsecutiveNPChunker.__init__ passes to its tagger.
# Note: this rebinds `tagged_sents`, which earlier held a slice of Brown sentences.
tagged_sents = [[((w,t),c) for (w,t,c) in
                         nltk.chunk.tree2conlltags(sent)]
                        for sent in train_sents]

In [87]:
# First training item: a ((word, pos), chunk_tag) pair.
tagged_sents[0][0]

(('Confidence', 'NN'), 'B-NP')

In [90]:
# untag() drops the chunk tag from each pair, leaving the (word, pos) tokens.
untagged_sent = nltk.tag.untag(tagged_sents[0])
untagged_sent[0]

('Confidence', 'NN')

In [None]:
#for x in tagged_sents:
#    for i in enumerate(x):
#        print(i)

In [108]:
#for x in tagged_sents:
#    for i, (word, tag) in enumerate(x):
#        print(i)
#        print((word, tag))

In [104]:
def npchunk_features(sentence, i, history):
    """Describe token ``i`` of ``sentence`` by its POS tag alone.

    ``sentence`` is a sequence of (word, pos) pairs. ``history`` (the chunk
    tags assigned so far) is accepted but ignored. This repeats the
    definition given in an earlier cell of this notebook.
    """
    _word, pos_tag = sentence[i]
    return dict(pos=pos_tag)
# Retrain and rescore the classifier-based chunker; the printed scores match the earlier run.
# Relies on `train_sents` / `test_sents` from the unigram training cell.
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  92.9%%
    Precision:     79.9%%
    Recall:        86.8%%
    F-Measure:     83.2%%


## 3 Named entity recognition (NER):

In [130]:
# Run NLTK's named-entity chunker on one tagged Treebank sentence and show the first five nodes.
sent = nltk.corpus.treebank.tagged_sents()[22]
print(nltk.ne_chunk(sent, binary=False)[0:5]) # 'binary=True' to display all named entities as NE

[('The', 'DT'), Tree('GPE', [('U.S.', 'NNP')]), ('is', 'VBZ'), ('one', 'CD'), ('of', 'IN')]


## 4 Relation extraction:

In [138]:
# Filler pattern between the two entities: any text, then the word "in",
# not followed by text containing "ing" (negative lookahead).
IN = re.compile(r'.*\bin\b(?!\b.+ing)')
# Print each ORG ... LOC pair from one IEER document whose intervening text matches IN.
for x in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
    for y in nltk.sem.extract_rels('ORG', 'LOC', x,
                                  corpus='ieer', pattern=IN):
        print(nltk.sem.rtuple(y))

[ORG: 'WHYY'] 'in' [LOC: 'Philadelphia']
[ORG: 'McGlashan &AMP; Sarrail'] 'firm in' [LOC: 'San Mateo']
[ORG: 'Freedom Forum'] 'in' [LOC: 'Arlington']
[ORG: 'Brookings Institution'] ', the research group in' [LOC: 'Washington']
[ORG: 'Idealab'] ', a self-described business incubator based in' [LOC: 'Los Angeles']
[ORG: 'Open Text'] ', based in' [LOC: 'Waterloo']
[ORG: 'WGBH'] 'in' [LOC: 'Boston']
[ORG: 'Bastille Opera'] 'in' [LOC: 'Paris']
[ORG: 'Omnicom'] 'in' [LOC: 'New York']
[ORG: 'DDB Needham'] 'in' [LOC: 'New York']
[ORG: 'Kaplan Thaler Group'] 'in' [LOC: 'New York']
[ORG: 'BBDO South'] 'in' [LOC: 'Atlanta']
[ORG: 'Georgia-Pacific'] 'in' [LOC: 'Atlanta']


In [146]:
from nltk.corpus import conll2002
# Verbose-mode regex over "word/TAG" text: one of four verb forms, then
# anything, then "van/Prep". The '#' annotations inside the string are
# regex comments (ignored under re.VERBOSE), not Python comments.
vnv = """
(
is/V|    # 3rd sing present and
was/V|   # past forms of the verb zijn ('be')
werd/V|  # and also present
wordt/V  # past of worden ('become)
)
.*       # followed by anything
van/Prep # followed by van ('of')
"""
VAN = re.compile(vnv, re.VERBOSE)
# Print each PER ... ORG pair in the Dutch training data whose intervening text matches VAN.
for x in conll2002.chunked_sents('ned.train'):
    for y in nltk.sem.extract_rels('PER', 'ORG', x,
                                  corpus='conll2002', pattern=VAN):
        #print(nltk.sem.clause(y, relsym='VAN'))
        # lcon/rcon=True add the left and right context shown in the output.
        print(nltk.rtuple(y, lcon=True, rcon=True))

...'')[PER: "Cornet/V d'Elzius/N"] 'is/V op/Prep dit/Pron ogenblik/N kabinetsadviseur/N van/Prep staatssecretaris/N voor/Prep' [ORG: 'Buitenlandse/N Handel/N'](''...
...'')[PER: 'Johan/N Rottiers/N'] 'is/V informaticacoördinator/N van/Prep het/Art' [ORG: 'Kardinaal/N Van/N Roey/N Instituut/N']('in/Prep'...
...'Door/Prep rugproblemen/N van/Prep zangeres/N')[PER: 'Annie/N Lennox/N'] 'wordt/V het/Art concert/N van/Prep' [ORG: 'Eurythmics/N']('vandaag/Adv in/Prep'...
