## Semantic Analysis

A.k.a meaning generation. The process of determining the meaning of character sequences or word sequences. 

In [1]:
import nltk
nltk.boolean_ops()

negation       	-
conjunction    	&
disjunction    	|
implication    	->
equivalence    	<->


## NER 
Named entity recognition is the process in which proper nouns or named entities are located in a document. Then, these Named Entities are classified into different categories, such as Name of Person, Location, Organization and so on.

- NEP: Name of Person
- NED: Name of Designation
- NEO: Name of Organization 
- NEA: Name of Abbreviation
- NEB: Name of Brand
- NETP: Title of Person
- NETO: Title of Object
- NEL: Name of Location
- NETI: Time
- NEN: Number
- NEM: Measure
- NETE: Terms

In [17]:
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

# Download it from:
# https://nlp.stanford.edu/software/CRF-NER.shtml#Download
sentence = StanfordNERTagger('./stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',
                            './stanford-ner-2018-10-16/stanford-ner.jar')

In [18]:
# Performing NER using Stanford Tagger
sentence.tag(word_tokenize('John goes to NY'))

[('John', 'PERSON'), ('goes', 'O'), ('to', 'O'), ('NY', 'O')]

In [22]:
import nltk
nltk.download('maxent_ne_chunker')
nltk.download('words')
sentence = nltk.corpus.treebank.tagged_sents()[17]

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/alextanhongpin/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/alextanhongpin/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [23]:
nltk.ne_chunk(sentence, binary=True)

FileNotFoundError: [Errno 2] No such file or directory: '/var/folders/pt/v3mw_j891dv8yl5k2b5p54wm0000gn/T/tmpxlf_09_n.png'

Tree('S', [('The', 'DT'), ('total', 'NN'), ('of', 'IN'), ('18', 'CD'), ('deaths', 'NNS'), ('from', 'IN'), ('malignant', 'JJ'), ('mesothelioma', 'NN'), (',', ','), ('lung', 'NN'), ('cancer', 'NN'), ('and', 'CC'), ('asbestosis', 'NN'), ('was', 'VBD'), ('far', 'RB'), ('higher', 'JJR'), ('than', 'IN'), ('*', '-NONE-'), ('expected', 'VBN'), ('*?*', '-NONE-'), (',', ','), ('the', 'DT'), ('researchers', 'NNS'), ('said', 'VBD'), ('0', '-NONE-'), ('*T*-1', '-NONE-'), ('.', '.')])

In [24]:
nltk.ne_chunk(sentence)

FileNotFoundError: [Errno 2] No such file or directory: '/var/folders/pt/v3mw_j891dv8yl5k2b5p54wm0000gn/T/tmp1fjw606l.png'

Tree('S', [('The', 'DT'), ('total', 'NN'), ('of', 'IN'), ('18', 'CD'), ('deaths', 'NNS'), ('from', 'IN'), ('malignant', 'JJ'), ('mesothelioma', 'NN'), (',', ','), ('lung', 'NN'), ('cancer', 'NN'), ('and', 'CC'), ('asbestosis', 'NN'), ('was', 'VBD'), ('far', 'RB'), ('higher', 'JJR'), ('than', 'IN'), ('*', '-NONE-'), ('expected', 'VBN'), ('*?*', '-NONE-'), (',', ','), ('the', 'DT'), ('researchers', 'NNS'), ('said', 'VBD'), ('0', '-NONE-'), ('*T*-1', '-NONE-'), ('.', '.')])

## NER System using Hidden Markov Model

## Training NER using Machine Learning Toolkits

- Rule-based or Handcrafted approach
    - list lookup approach
    - linguistic approach
- Machine-learning based or automated approach
    - hidden markov model
    - maximum entropy markov model
    - conditional random fields 
    - support vector machine 
    - decision trees
    
ML based approaches outperforms the rule-based approaches.

In [1]:
import nltk 
from nltk import pos_tag, word_tokenize
pos_tag(word_tokenize('John and Smith are going to NY and Germany'))

[('John', 'NNP'),
 ('and', 'CC'),
 ('Smith', 'NNP'),
 ('are', 'VBP'),
 ('going', 'VBG'),
 ('to', 'TO'),
 ('NY', 'NNP'),
 ('and', 'CC'),
 ('Germany', 'NNP')]

In [3]:
import nltk
from nltk.corpus import brown

from nltk.tag import UnigramTagger
tagger = UnigramTagger(brown.tagged_sents(categories='news'))
sentence = ['John', 'and', 'Smith', 'went', 'to', 'NY', 'and', 'Germany']
for word, tag in tagger.tag(sentence):
    print(word, '->', tag)

John -> NP
and -> CC
Smith -> NP
went -> VBD
to -> TO
NY -> None
and -> CC
Germany -> NP-TL
