In [2]:
import nltk

In [3]:
simpleSentence = "Bangalore is the capital of karnataka"

In [5]:
wordsInSentence = nltk.word_tokenize(simpleSentence)

In [6]:
print(wordsInSentence)

['Bangalore', 'is', 'the', 'capital', 'of', 'karnataka']


In [10]:
partsOfSpeechTags = nltk.pos_tag(wordsInSentence)

In [11]:
partsOfSpeechTags

[('Bangalore', 'NNP'),
 ('is', 'VBZ'),
 ('the', 'DT'),
 ('capital', 'NN'),
 ('of', 'IN'),
 ('karnataka', 'NN')]

#### Writing your Own Tagger

In [12]:
def learnDefaultTagger(simpleSentence):
    wordsInSentence = nltk.word_tokenize(simpleSentence)
    tagger = nltk.DefaultTagger("NN")
    posEnabledTags = tagger.tag(wordsInSentence)
    print(posEnabledTags)

In [15]:
def learnRETagger(simpleSentence):
    customPatterns = [
        (r'.*ing$','ADJECTIVE'),
        (r'.*ly$','ADVERB'),
        (r'.*ion$','NOUN'),
        (r'.*ate |.*en|is$','VERB'),
        (r'.^an$$','INDEFINITE-ARTICLE'),
        (r'.^(with|on|at)$','PREPOSITION'),
        (r'.^\-?[0-9]+(\.[0-9]+)$','NUMBER'),
        (r'.*$',None)        
    ]
    wordsInSentence = nltk.word_tokenize(simpleSentence)
    tagger = nltk.RegexpTagger(customPatterns)
    posEnabledTags = tagger.tag(wordsInSentence)
    print(posEnabledTags)

In [23]:
def learnLookupTagger(simpleSentence):
    mapping  = {
        '.':'.','place':'NN','on':'IN','earth':'NN',
        'Mysore':'NNP','is':'VBZ','an':'DT','amazing':'JJ'        
    }
    wordsInSentence = nltk.word_tokenize(simpleSentence)
    tagger = nltk.UnigramTagger(model=mapping)
    posEnabledTags = tagger.tag(wordsInSentence)
    print(posEnabledTags)

In [24]:
if __name__ == '__main__':
    testSentence = "Mysore is an amazing place on earth, I have visited Mysore 10 times."
    learnDefaultTagger(testSentence)
    print("\n")
    learnRETagger(testSentence)
    print("\n")
    learnLookupTagger(testSentence)
    

[('Mysore', 'NN'), ('is', 'NN'), ('an', 'NN'), ('amazing', 'NN'), ('place', 'NN'), ('on', 'NN'), ('earth', 'NN'), (',', 'NN'), ('I', 'NN'), ('have', 'NN'), ('visited', 'NN'), ('Mysore', 'NN'), ('10', 'NN'), ('times', 'NN'), ('.', 'NN')]


[('Mysore', None), ('is', 'VERB'), ('an', None), ('amazing', 'ADJECTIVE'), ('place', None), ('on', None), ('earth', None), (',', None), ('I', None), ('have', None), ('visited', None), ('Mysore', None), ('10', None), ('times', None), ('.', None)]


[('Mysore', 'NNP'), ('is', 'VBZ'), ('an', 'DT'), ('amazing', 'JJ'), ('place', 'NN'), ('on', 'IN'), ('earth', 'NN'), (',', None), ('I', None), ('have', None), ('visited', None), ('Mysore', 'NNP'), ('10', None), ('times', None), ('.', '.')]


### Training your own Tagger

In [8]:
import pickle,nltk

In [9]:
def sampleData():
    return [
        "Bangalore is the capital of Karnataka.",
        "Steve Jobs was the CEO of Apple",
        "iPhone was Invented by Apple",
        "Books can be purchased in Market Iphone"
    ]


In [10]:
def buildDictionary():
    dictionary = {}
    for sent in sampleData():
        partsOfSpeechTags = nltk.pos_tag(nltk.word_tokenize(sent))
        for tag in partsOfSpeechTags:
            value = tag[0]
            pos = tag[1]
            dictionary[value] = pos
    return dictionary

In [11]:
def saveMyTagger(tagger,fileName):
    fileHandle = open(fileName,"wb")
    pickle.dump(tagger,fileHandle)
    fileHandle.close()

In [12]:
def saveMyTraining(fileName):
    tagger = nltk.UnigramTagger(model=buildDictionary())
    saveMyTagger(tagger,fileName)

In [13]:
def loadMyTagger(fileName):
    return pickle.load(open(fileName,"rb"))

In [14]:
sentence = "Iphone is purchased by Stevejobs in Bangalore Market"
fileName = "myTagger.pickle"

In [15]:
saveMyTraining(fileName)

In [16]:
myTagger = loadMyTagger(fileName)

In [17]:
print(myTagger.tag(nltk.word_tokenize(sentence)))

[('Iphone', 'NNP'), ('is', 'VBZ'), ('purchased', 'VBN'), ('by', 'IN'), ('Stevejobs', None), ('in', 'IN'), ('Bangalore', 'NNP'), ('Market', 'NNP')]


#### Learning to write your own grammar 

In [63]:
import string

In [64]:
from nltk.parse.generate import generate

In [65]:
del productions
productions = [
    "ROOT -> WORD",
    "WORD -> ' '",
    "WORD -> NUMBER LETTER",
    "WORD -> LETTER NUMBER"
]


In [66]:
digits = list(string.digits)

In [67]:
digits

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

In [68]:
for digit in digits[:4]:
    productions.append("NUMBER  -> '{w}'".format(w=digit))

In [69]:
letters = "' | '".join(list(string.ascii_lowercase)[:4])
productions.append("LETTER  -> '{w}'".format(w=letters))

In [70]:
letters

"a' | 'b' | 'c' | 'd"

In [71]:
grammarString = "\n".join(productions)

In [72]:
grammar = nltk.CFG.fromstring(grammarString)

In [73]:
print(grammar)

Grammar with 12 productions (start state = ROOT)
    ROOT -> WORD
    WORD -> ' '
    WORD -> NUMBER LETTER
    WORD -> LETTER NUMBER
    NUMBER -> '0'
    NUMBER -> '1'
    NUMBER -> '2'
    NUMBER -> '3'
    LETTER -> 'a'
    LETTER -> 'b'
    LETTER -> 'c'
    LETTER -> 'd'


In [76]:
for sentence in generate(grammar, n=6,depth=6):
    #print(sentence)
    palindrome = "".join(sentence).replace(" ","")
    print("Generated Word : {}, Size : {}".format(palindrome,len(palindrome)))

Generated Word : , Size : 0
Generated Word : 0a, Size : 2
Generated Word : 0b, Size : 2
Generated Word : 0c, Size : 2
Generated Word : 0d, Size : 2
Generated Word : 1a, Size : 2


### CFG is a special type of CFG in which the sum of all the probabilities for the non terminal tokens should be equal to 1.

In [19]:
from nltk.parse.generate import generate

In [24]:
productions  = [
    "ROOT -> WORD [1.0]",
    "WORD -> P1 [0.25]",
    "WORD -> P1 P2 [0.25]",
    "WORD -> P1 P2 P3 [0.25]",
    "WORD -> P1 P2 P3 P4 [0.25]",
    "P1 -> 'A' [1.0]",
    "P2 -> 'B' [0.5]",
    "P2 -> 'C' [0.5]",
    "P3 -> 'D' [0.3]",
    "P3 -> 'E' [0.3]",
    "P3 -> 'F' [0.4]",
    "P4 -> 'G' [0.9]",
    "P4 -> 'H' [0.1]",
]

In [25]:
grammarString = "\n".join(productions)

In [26]:
grammar = nltk.PCFG.fromstring(grammarString)

In [27]:
print(grammar)

Grammar with 13 productions (start state = ROOT)
    ROOT -> WORD [1.0]
    WORD -> P1 [0.25]
    WORD -> P1 P2 [0.25]
    WORD -> P1 P2 P3 [0.25]
    WORD -> P1 P2 P3 P4 [0.25]
    P1 -> 'A' [1.0]
    P2 -> 'B' [0.5]
    P2 -> 'C' [0.5]
    P3 -> 'D' [0.3]
    P3 -> 'E' [0.3]
    P3 -> 'F' [0.4]
    P4 -> 'G' [0.9]
    P4 -> 'H' [0.1]


In [29]:
for sentence in generate(grammar, n = 100, depth=5):
    palindrome = "".join(sentence).replace(" ","")
    print("String : {},Size : {}".format(palindrome,len(palindrome)))

String : A,Size : 1
String : AB,Size : 2
String : AC,Size : 2
String : ABD,Size : 3
String : ABE,Size : 3
String : ABF,Size : 3
String : ACD,Size : 3
String : ACE,Size : 3
String : ACF,Size : 3
String : ABDG,Size : 4
String : ABDH,Size : 4
String : ABEG,Size : 4
String : ABEH,Size : 4
String : ABFG,Size : 4
String : ABFH,Size : 4
String : ACDG,Size : 4
String : ACDH,Size : 4
String : ACEG,Size : 4
String : ACEH,Size : 4
String : ACFG,Size : 4
String : ACFH,Size : 4


###### Recursive CFGs are a special types of CFGs, where the tokens on the left hand side are present on the right hand side of a production rule. Palindromes are the best examples of recursive CFG.

#### Writing a recursive CFG

In [33]:
import string
productions = [
    "ROOT -> WORD",
    "WORD -> ' '"
]

In [34]:
alphabets = list(string.digits)

In [36]:
for alphabet in alphabets:
    productions.append("WORD -> '{w}' WORD '{w}'".format(w=alphabet))

In [37]:
grammarString = "\n".join(productions)

In [38]:
grammar = nltk.CFG.fromstring(grammarString)

In [39]:
print(grammar)

Grammar with 12 productions (start state = ROOT)
    ROOT -> WORD
    WORD -> ' '
    WORD -> '0' WORD '0'
    WORD -> '1' WORD '1'
    WORD -> '2' WORD '2'
    WORD -> '3' WORD '3'
    WORD -> '4' WORD '4'
    WORD -> '5' WORD '5'
    WORD -> '6' WORD '6'
    WORD -> '7' WORD '7'
    WORD -> '8' WORD '8'
    WORD -> '9' WORD '9'


In [41]:
for sentence in generate(grammar, n = 10, depth=5):
    palindrome = "".join(sentence).replace(" ","")
    print("Palindrome : {},Size : {}".format(palindrome,len(palindrome)))

Palindrome : ,Size : 0
Palindrome : 00,Size : 2
Palindrome : 0000,Size : 4
Palindrome : 0110,Size : 4
Palindrome : 0220,Size : 4
Palindrome : 0330,Size : 4
Palindrome : 0440,Size : 4
Palindrome : 0550,Size : 4
Palindrome : 0660,Size : 4
Palindrome : 0770,Size : 4


This section taught us that part of speech tagging forms the basis of any further syntactic analysis, and grammars can be formed and deformed using part of speech tags and chunks. We have learned to use and write our own POS taggers and grammars, awesome. In the next section we'll see chunking, sentence parse, and dependencies.

## chunker

In [42]:
import nltk


In [43]:
text = "Lalbagh Botanical Gradens isa well known botanical garden in Bengaluru, India."

In [44]:
sentences = nltk.sent_tokenize(text)

In [45]:
print(sentences)

['Lalbagh Botanical Gradens isa well known botanical garden in Bengaluru, India.']


In [47]:
for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    tags = nltk.pos_tag(words)
    chunks = nltk.ne_chunk(tags)
    print(chunks)
    

(S
  (PERSON Lalbagh/NNP)
  (PERSON Botanical/NNP Gradens/NNP)
  isa/RB
  well/RB
  known/VBN
  botanical/JJ
  garden/NN
  in/IN
  (GPE Bengaluru/NNP)
  ,/,
  (GPE India/NNP)
  ./.)


## Write own simple chunker

In [48]:
text = "Ravi is the CEO of a company. He is very powerful public speaker also."


In [80]:
grammar = "\n".join([
    'NP: {<DT>*<NNP>}',
    'NP: {<JJ>*<NN>}',
    'NP: {<NNP>+}',
])

In [81]:
sentences = nltk.sent_tokenize(text)

In [82]:
for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    tags = nltk.pos_tag(words)
    chunkparser = nltk.RegexpParser(grammar)
    result = chunkparser.parse(tags)
    print(result)

(S
  (NP Ravi/NNP)
  is/VBZ
  (NP the/DT CEO/NNP)
  of/IN
  a/DT
  (NP company/NN)
  ./.)
(S
  He/PRP
  is/VBZ
  very/RB
  (NP powerful/JJ public/JJ speaker/NN)
  also/RB
  ./.)


## Training a Chunker

In [53]:
from nltk.corpus import conll2000
from nltk.corpus import treebank_chunk

In [54]:
def mySimpleChunker():
    grammar = 'NP: {<NNP>+}'
    return nltk.RegexpParser(grammar)

In [55]:
def test_nothing(data):
    cp = nltk.RegexpParser("")
    print(cp.evaluate(data))

In [59]:
def test_mysimplechunker(data):
    schunker = mySimpleChunker()
    print(schunker.evaluate(data))

In [60]:
datasets = [
    conll2000.chunked_sents('test.txt',chunk_types=['NP']),
    treebank_chunk.chunked_sents()
]


In [61]:
for dataset in datasets:
    test_nothing(dataset[:50])
    test_mysimplechunker(dataset[:50])

ChunkParse score:
    IOB Accuracy:  38.6%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%
ChunkParse score:
    IOB Accuracy:  48.2%%
    Precision:     71.1%%
    Recall:        17.2%%
    F-Measure:     27.7%%
ChunkParse score:
    IOB Accuracy:  45.0%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%
ChunkParse score:
    IOB Accuracy:  50.7%%
    Precision:     51.9%%
    Recall:         8.8%%
    F-Measure:     15.1%%


  return [tok for tok in self._regexp.split(text) if tok]


In [134]:
def RDParserExample(grammar,textlist):
    parser = nltk.parse.RecursiveDescentParser(grammar)
    for text in textlist:
        sentence = nltk.word_tokenize(text)
        for tree in parser.parse(sentence):
            print(tree)
            tree.draw()
        

In [135]:
grammar = nltk.CFG.fromstring("""
S -> NP VP
NP -> NNP VBZ
VP -> IN NNP | DT NN IN NNP
NNP -> 'Tajmahal' | 'Agra' | 'Bangalore' | 'Karnataka'
VBZ -> 'is'
IN -> 'in' | 'of'
DT -> 'the'
NN -> 'capital'
""")

In [136]:
text = [
    "Tajmahal is in Agra",
    "Bangalore is the capital of Karnataka"
]

In [137]:
RDParserExample(grammar,text)

(S (NP (NNP Tajmahal) (VBZ is)) (VP (IN in) (NNP Agra)))
(S
  (NP (NNP Bangalore) (VBZ is))
  (VP (DT the) (NN capital) (IN of) (NNP Karnataka)))


## Shift Reduce parser

Shift reduce parsers are special types of parsers that parse the input text from left to right on a single line sentences and top to bottom on multi-line sentences.

In [138]:
import nltk

def SRParserExample(grammar, textlist):
    parser = nltk.parse.ShiftReduceParser(grammar)
    for text in textlist:
        sentence = nltk.word_tokenize(text)
        for tree in parser.parse(sentence):
            print(tree)
            tree.draw()

In [142]:
text = [
    "Tajmahal is in Agra",
    "Bangalore is the capital of Karnataka"
]

In [143]:
grammar = nltk.CFG.fromstring("""
S -> NP VP
NP -> NNP VBZ
VP -> IN NNP | DT NN IN NNP
NNP -> 'Tajmahal' | 'Agra' | 'Bangalore' | 'Karnataka'
VBZ -> 'is'
IN -> 'in' | 'of'
DT -> 'the'
NN -> 'capital'
""")

In [144]:
SRParserExample(grammar, text)

(S (NP (NNP Tajmahal) (VBZ is)) (VP (IN in) (NNP Agra)))


## Parsing dependency grammar and projective dependency¶

In [145]:
grammar = nltk.grammar.DependencyGrammar.fromstring("""
'savings' -> 'small'
'yield' -> 'savings'
'gains' -> 'large'
'yield' -> 'gains'
""")

In [146]:
sentence = 'small savings yield large gains'

In [147]:
dp = nltk.parse.ProjectiveDependencyParser(grammar)

In [149]:
for t in sorted(dp.parse(sentence.split())):
    print(t)
    t.draw()

(yield (savings small) (gains large))


## Parsing a chart

In [150]:
from nltk.grammar import CFG

In [151]:
from nltk.parse.chart import ChartParser, BU_LC_STRATEGY

In [152]:
grammar = CFG.fromstring("""
S -> T1 T4
T1 -> NNP VBZ
T2 -> DT NN
T3 -> IN NNP
T4 -> T3 | T2 T3
NNP -> 'Tajmahal' | 'Agra' | 'Bangalore' | 'Karnataka'
VBZ -> 'is'
IN -> 'in' | 'of'
DT -> 'the'
NN -> 'capital'
""")

In [153]:
cp = ChartParser(grammar, BU_LC_STRATEGY, trace=True)

In [154]:
sentence = "Bangalore is the capital of Karnataka"

In [155]:
tokens = sentence.split()

In [156]:
chart = cp.chart_parse(tokens)

|.Bangal.  is  . the  .capita.  of  .Karnat.|
|[------]      .      .      .      .      .| [0:1] 'Bangalore'
|.      [------]      .      .      .      .| [1:2] 'is'
|.      .      [------]      .      .      .| [2:3] 'the'
|.      .      .      [------]      .      .| [3:4] 'capital'
|.      .      .      .      [------]      .| [4:5] 'of'
|.      .      .      .      .      [------]| [5:6] 'Karnataka'
|[------]      .      .      .      .      .| [0:1] NNP -> 'Bangalore' *
|[------>      .      .      .      .      .| [0:1] T1 -> NNP * VBZ
|.      [------]      .      .      .      .| [1:2] VBZ -> 'is' *
|[-------------]      .      .      .      .| [0:2] T1 -> NNP VBZ *
|[------------->      .      .      .      .| [0:2] S  -> T1 * T4
|.      .      [------]      .      .      .| [2:3] DT -> 'the' *
|.      .      [------>      .      .      .| [2:3] T2 -> DT * NN
|.      .      .      [------]      .      .| [3:4] NN -> 'capital' *
|.      .      [-------------]      .      .| [2:

In [158]:
parses = list(chart.parses(grammar.start()))

In [159]:
print("Total Edges :", len(chart.edges()))

Total Edges : 24


In [160]:
for tree in parses: print(tree)

(S
  (T1 (NNP Bangalore) (VBZ is))
  (T4 (T2 (DT the) (NN capital)) (T3 (IN of) (NNP Karnataka))))


In [161]:
tree.draw()