#**7. Information Extraction**
1. Part-of-Speech Tagging
2. Chunking
3. Chinking
4. Named Entity Recognition
5. Relation Extraction


In [None]:
import nltk
import string
import re

# Fetch the NLTK resources used by the cells below: the "punkt" tokenizer
# data and the averaged-perceptron POS tagger.
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

#**Part of Speech Tagging**
The part of speech explains how a word is used in a sentence. The same word can have different contexts and semantic meanings depending on the sentence it appears in. Basic natural language processing models like bag-of-words fail to identify these relations between words. Hence, we use part-of-speech tagging to assign each word its part-of-speech tag based on its context in the data. It is also used to extract relationships between words.

In [None]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag
  
# convert text into word_tokens with their tags
def pos_tagging(text):
    """Tokenize *text* into words and return the result of ``pos_tag`` on them."""
    return pos_tag(word_tokenize(text))
  
pos_tagging('i dont quit i dont run i never go back on my word Thats my ninja way!!! Uzumaki Naruto!')

# Tag legend: PRP = personal pronoun, RB = adverb, VBD = verb (past tense),
# DT = determiner, NN = noun.

[('i', 'JJ'),
 ('dont', 'VBP'),
 ('quit', 'NN'),
 ('i', 'NN'),
 ('dont', 'VBP'),
 ('run', 'VBN'),
 ('i', 'RB'),
 ('never', 'RB'),
 ('go', 'VBP'),
 ('back', 'RB'),
 ('on', 'IN'),
 ('my', 'PRP$'),
 ('word', 'NN'),
 ('Thats', 'NNPS'),
 ('my', 'PRP$'),
 ('ninja', 'JJ'),
 ('way', 'NN'),
 ('!', '.'),
 ('!', '.'),
 ('!', '.'),
 ('Uzumaki', 'JJ'),
 ('Naruto', 'NN'),
 ('!', '.')]

#**Chunking**
Chunking is the process of extracting phrases from unstructured text and adding more structure to it. It is also known as shallow parsing. It is done on top of part-of-speech tagging. It groups words into “chunks”, mainly noun phrases. Chunking is done using regular expressions.

**RULE: "Tag Noun, verb (past tense), adjective, and coordinating conjunction from the sentence."**

In [None]:
from nltk.tokenize import word_tokenize 
from nltk import pos_tag
from nltk import RegexpParser

  

In [None]:
# Sample sentence for the chunking example.
sentence = "The Fourth Hokage Minato is YellowFlash of Konoha"

In [None]:
# Chunk grammar for RegexpParser: an NP is an optional determiner (DT),
# any number of adjectives (JJ), then a single token tagged NN.
# NOTE(review): <NN> does not cover NNP. The recorded tags for the sample
# sentence are all NNP and the recorded parse output shows no NP chunk --
# confirm whether a broader pattern such as <NN.*> was intended.
grammar = (
    "\n"
    "    NP: {<DT>?<JJ>*<NN>} # NP\n"
    "    "
)

In [None]:
# Build the chunk parser from `grammar`, then POS-tag the tokenized sample
# sentence; the trailing bare name displays the tagged tokens in the notebook.
chunkParser = RegexpParser(grammar)
tagged = pos_tag(word_tokenize(sentence))
tagged

[('The', 'DT'),
 ('Fourth', 'NNP'),
 ('Hokage', 'NNP'),
 ('Minato', 'NNP'),
 ('is', 'VBZ'),
 ('YellowFlash', 'NNP'),
 ('of', 'IN'),
 ('Konoha', 'NNP')]

In [None]:
# Run the chunk parser over the POS-tagged tokens and keep the result in `tree`.
tree = chunkParser.parse(tagged)

In [None]:
# Print every subtree yielded by tree.subtrees(), one after another.
print(*tree.subtrees(), sep="\n")
# tree.draw()

(S
  The/DT
  Fourth/NNP
  Hokage/NNP
  Minato/NNP
  is/VBZ
  YellowFlash/NNP
  of/IN
  Konoha/NNP)


#**Chinking**
Chinking is the process of removing a sequence of tokens from a chunk. If the matching sequence of tokens spans an entire chunk, then the whole chunk is removed; if the sequence of tokens appears in the middle of the chunk, these tokens are removed, leaving two chunks where there was only one before. If the sequence is at the periphery of the chunk, these tokens are removed, and a smaller chunk remains.
Sometimes it is easier to define what we want to exclude from a chunk. We can define a chink to be a sequence of tokens that is not included in a chunk.

**RULE: This means we're removing from the chunk one or more verbs, prepositions, determiners, or the word 'to'.**

In [None]:
from nltk.tokenize import word_tokenize 
from nltk import pos_tag
from nltk import RegexpParser


In [None]:
# Sample sentence for the chinking example.
sentence = "The Seventh Hokage Naruto the son of Minato and Kushina of Konoha village and having Six Sage Path power with kurama."

In [None]:
# Chink grammar for RegexpParser: the first rule ({<.*>+}) chunks every
# token; the second rule (}...{) chinks out runs of tokens whose tag is
# VB plus at most one more character, IN, DT or TO.
chinkgra = (
    "\n"
    "    Chunk: {<.*>+}\n"
    "    }<VB.?|IN|DT|TO>+{\n"
    "    "
)

In [None]:
# Build the chink parser from `chinkgra`, then POS-tag the tokenized sample
# sentence; the trailing bare name displays the tagged tokens in the notebook.
# NOTE(review): no visible cell calls chinkParser.parse(tagged), so the
# chinked tree is never produced here -- confirm whether that step is missing.
chinkParser = RegexpParser(chinkgra)
tagged = pos_tag(word_tokenize(sentence))
tagged


[('The', 'DT'),
 ('Seventh', 'NNP'),
 ('Hokage', 'NNP'),
 ('Naruto', 'NNP'),
 ('the', 'DT'),
 ('son', 'NN'),
 ('of', 'IN'),
 ('Minato', 'NNP'),
 ('and', 'CC'),
 ('Kushina', 'NNP'),
 ('of', 'IN'),
 ('Konoha', 'NNP'),
 ('village', 'NN'),
 ('and', 'CC'),
 ('having', 'VBG'),
 ('Six', 'NNP'),
 ('Sage', 'NNP'),
 ('Path', 'NNP'),
 ('power', 'NN'),
 ('with', 'IN'),
 ('kurama', 'NN'),
 ('.', '.')]

#**Named Entity Recognition:**
Named Entity Recognition is used to extract information from unstructured text. It classifies the entities present in a text into categories such as person, organization, event, and place. It gives us detailed knowledge about the text and the relationships between the different entities.

In [None]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag, ne_chunk

import nltk
# Fetch the "maxent_ne_chunker" package and the "words" corpus before
# ne_chunk is used below.
nltk.download('maxent_ne_chunker')
nltk.download('words')



[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [None]:
def named_entity_recognition(text):
    """Print the named-entity chunk tree for *text*.

    The text is word-tokenized, POS-tagged, and handed to ``ne_chunk``;
    the resulting tree is printed and nothing is returned.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    entity_tree = ne_chunk(tagged_tokens)
    print(entity_tree)

# Sample passage; the call below prints its named-entity tree.
text = 'The Sannin, otherwise known as the Legendary Three Ninja, were the students of the Third Hokage, Hiruzen Sarutobi. They are Jiraiya, Tsunade, and Orochimaru. All three are incredibly talented shinobi'
named_entity_recognition(text)


(S
  The/DT
  (GPE Sannin/NNP)
  ,/,
  otherwise/RB
  known/VBN
  as/IN
  the/DT
  Legendary/NNP
  Three/NNP
  Ninja/NNP
  ,/,
  were/VBD
  the/DT
  students/NNS
  of/IN
  the/DT
  Third/NNP
  Hokage/NNP
  ,/,
  (PERSON Hiruzen/NNP Sarutobi/NNP)
  ./.
  They/PRP
  are/VBP
  (PERSON Jiraiya/NNP)
  ,/,
  (PERSON Tsunade/NNP)
  ,/,
  and/CC
  (ORGANIZATION Orochimaru/NNP)
  ./.
  All/DT
  three/CD
  are/VBP
  incredibly/RB
  talented/VBN
  shinobi/NN)


#**Relation Extraction**
