In [3]:
import warnings
warnings.filterwarnings("ignore")

# Importing the usual utilities
import numpy as np
import re, random, os, string

import nltk
from nltk.tokenize import word_tokenize
from pprint import pprint #pretty print

# POS Tagging

Check out the POS tagging chapter from the NLTK book

http://www.nltk.org/book/ch05.html
    

In [2]:
?nltk.pos_tag

In [2]:
tok = "A very long story".split()
tok

['A', 'very', 'long', 'story']

In [4]:
nltk.pos_tag(tok)

[('A', 'DT'), ('very', 'RB'), ('long', 'JJ'), ('story', 'NN')]

In [5]:
nltk.help.upenn_tagset("RB")

RB: adverb
    occasionally unabatingly maddeningly adventurously professedly
    stirringly prominently technologically magisterially predominately
    swiftly fiscally pitilessly ...


In [6]:
tok = "I like machine learning".split()

In [7]:
nltk.pos_tag(tok)

[('I', 'PRP'), ('like', 'VBP'), ('machine', 'NN'), ('learning', 'NN')]

In [8]:
nltk.help.upenn_tagset("PRP$")

PRP$: pronoun, possessive
    her his mine my our ours their thy your


In [9]:
nltk.help.upenn_tagset("TO")

TO: "to" as preposition or infinitive marker
    to


In [5]:
nltk.pos_tag("Our coders code hard to create this code".split())

[('Our', 'PRP$'),
 ('coders', 'NNS'),
 ('code', 'VBP'),
 ('hard', 'JJ'),
 ('to', 'TO'),
 ('create', 'VB'),
 ('this', 'DT'),
 ('code', 'NN')]

In [10]:
nltk.pos_tag("They refuse to give us the refuse permit".split())

[('They', 'PRP'),
 ('refuse', 'VBP'),
 ('to', 'TO'),
 ('give', 'VB'),
 ('us', 'PRP'),
 ('the', 'DT'),
 ('refuse', 'NN'),
 ('permit', 'NN')]

#### Understanding any specific tag

In [7]:
#nltk.download('tagsets')
nltk.help.upenn_tagset("RB")

RB: adverb
    occasionally unabatingly maddeningly adventurously professedly
    stirringly prominently technologically magisterially predominately
    swiftly fiscally pitilessly ...


In [8]:
nltk.help.upenn_tagset("JJ")

JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...


In [9]:
nltk.help.upenn_tagset("")

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

#### Using the Stanford POS Tagger

In [12]:
# Some house keeping
##stanford-postagger-2018-10-16
import os
java_path = "C:/Program Files/jdk-11.0.2/bin/java.exe"
os.environ['JAVAHOME'] = java_path

path_tagger = os.getcwd()+"\\"+"stanford-postagger-2018-10-16"

path_to_model = path_tagger+"\\models\\english-left3words-distsim.tagger"
path_to_jar = path_tagger+"\\stanford-postagger.jar"

In [13]:
from nltk.tag.stanford import StanfordPOSTagger

In [12]:
?StanfordPOSTagger

In [14]:
st = StanfordPOSTagger(path_to_model, path_to_jar)

In [15]:
st.tag("I like machine learning".split())

[('I', 'PRP'), ('like', 'VBP'), ('machine', 'NN'), ('learning', 'NN')]

In [16]:
st.tag("They refuse to give us the refuse permit".split())

[('They', 'PRP'),
 ('refuse', 'VBP'),
 ('to', 'TO'),
 ('give', 'VB'),
 ('us', 'PRP'),
 ('the', 'DT'),
 ('refuse', 'NN'),
 ('permit', 'NN')]

#### Using spaCy

https://spacy.io/usage/models

In [17]:
#!python -m spacy download en_core_web_sm
#nlp = spacy.load('en', disable=['parser', 'ner'])

import spacy
import en_core_web_sm
parser = en_core_web_sm.load()

In [18]:
res = parser("Our coders code hard to make the code work")

In [18]:
dir(res[0])

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_extension',
 'has_vector',
 'head',
 'i',
 'idx',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'left_edge',
 'lefts',
 'lemma',
 'lemma_',
 'lex_id',
 'like_email',
 'like_num',
 'l

In [19]:
for token in res:
    print(token.text, token.pos_)

Our ADJ
coders NOUN
code VERB
hard ADV
to PART
make VERB
the DET
code NOUN
work NOUN


#### Creating your own rule based tagger

Example from the NLTK book

http://www.nltk.org/book/ch05.html

In [19]:
patterns = [
    (r'.*ing$', 'VBG'),              # gerund
    (r'.*ed$', 'VBD'),               # past tense
    (r'.*es$', 'VBZ'),               # 3rd singular present
    (r'.*ould$', 'MD'),              # modals
    (r'.*\'s$', 'NN$'),              # possessive nouns
    (r'.*s$', 'NNS'),                # plural nouns
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
    (r'.*', 'NN')                    # nouns
]

In [20]:
regexp_tagger = nltk.RegexpTagger(patterns)

In [21]:
regexp_tagger.tag("I would love to go swimming".split())

[('I', 'NN'),
 ('would', 'MD'),
 ('love', 'NN'),
 ('to', 'NN'),
 ('go', 'NN'),
 ('swimming', 'VBG')]

In [23]:
nltk.help.upenn_tagset("MD")

MD: modal auxiliary
    can cannot could couldn't dare may might must need ought shall should
    shouldn't will would


# Constituency parsing

## Using CFGs

Specfy the CFG

In [1]:
cfg_string = """
S -> NP VP
NP -> N | Det N
VP -> V | V NP
Det -> 'the'
N -> 'bird'
V -> 'sang'
"""

In [2]:
cfg_string

"\nS -> NP VP\nNP -> N | Det N\nVP -> V | V NP\nDet -> 'the'\nN -> 'bird'\nV -> 'sang'\n"

Assigning the grammar

In [4]:
from nltk import CFG

In [5]:
grammar = CFG.fromstring(cfg_string)

The sentence to analyse

In [6]:
txt = "the bird sang".split()

#### Parsing the sentence

In [7]:
from nltk.parse import RecursiveDescentParser

In [8]:
#Using a top-down parser
rdstr = RecursiveDescentParser(grammar)

#Print each of the trees 
for tree in rdstr.parse(txt):
    print(tree)

(S (NP (Det the) (N bird)) (VP (V sang)))


In [9]:
#Print each of the trees 
for tree in rdstr.parse(txt):
    tree.draw()

#### Seeing the process in action

In [11]:
import nltk
nltk.app.rdparser()

[('under',)]
[('with',)]
[('in',)]
[('under',), ('with',)]
[('ate',)]
[('saw',)]
[('dog',)]
[('telescope',)]
[('park',)]
[('dog',), ('telescope',)]
[('man',)]
[('park',), ('dog',), ('telescope',)]
[('the',)]
[('a',)]
[(V, NP)]
[(V,)]
[(V, NP, PP)]
[(V, NP), (V,)]
[(Det, N, PP)]
[(Det, N)]
S [(NP, VP)]
NP [(Det, N, PP), (Det, N)]
VP [(V, NP, PP), (V, NP), (V,)]
PP [(P, NP)]
NP [('I',)]
Det [('the',), ('a',)]
N [('man',), ('park',), ('dog',), ('telescope',)]
V [('ate',), ('saw',)]
P [('in',), ('under',), ('with',)]


### Another example - 

In [12]:
#Specification of CFG
grammar = CFG.fromstring("""
S -> NP VP
NP -> Det N | Det N PP
VP -> V | V NP | V NP PP
PP -> P NP

Det -> 'a' | 'an' | 'the'
N -> 'man' | 'park' | 'dog' | 'telescope'
V -> 'saw' | 'walked'
P -> 'in' | 'with'
""")

In [60]:
txt = "the man saw a dog in the park with a telescope"

#Using a top-down parser
rdstr = RecursiveDescentParser(grammar)

#Print each of the trees 
for tree in rdstr.parse(txt.split()):
    print(tree)

(S
  (NP (Det the) (N man))
  (VP
    (V saw)
    (NP
      (Det a)
      (N dog)
      (PP
        (P in)
        (NP
          (Det the)
          (N park)
          (PP (P with) (NP (Det a) (N telescope))))))))
(S
  (NP (Det the) (N man))
  (VP
    (V saw)
    (NP (Det a) (N dog))
    (PP
      (P in)
      (NP
        (Det the)
        (N park)
        (PP (P with) (NP (Det a) (N telescope)))))))
(S
  (NP (Det the) (N man))
  (VP
    (V saw)
    (NP (Det a) (N dog) (PP (P in) (NP (Det the) (N park))))
    (PP (P with) (NP (Det a) (N telescope)))))


In [61]:
tree.draw()

## Using PCFGs

In [13]:
from nltk import PCFG

Define the grammar

In [14]:
pcfg_grammar = PCFG.fromstring("""
    S -> NP VP [1.0] 
    PP -> P NP [1.0]
    VP -> V NP [0.7] | VP PP [0.3] 
    NP -> NP PP [0.4] 
    P -> 'with' [1.0]
    V -> 'saw' [1.0]
    NP -> 'astronomers' [0.1] | 'ears' [0.18] | 'saw' [0.04] | 'stars' [0.18] | 'telescopes' [0.1]
    """)

In [15]:
txt = "astronomers saw stars with ears"

In [16]:
from nltk.parse import pchart

parser = pchart.InsideChartParser(pcfg_grammar)

#print all possible trees, showing probability of each parse
for t in parser.parse(txt.split()):
     print(t)

(S
  (NP astronomers)
  (VP (V saw) (NP (NP stars) (PP (P with) (NP ears))))) (p=0.0009072)
(S
  (NP astronomers)
  (VP (VP (V saw) (NP stars)) (PP (P with) (NP ears)))) (p=0.0006804)


# Dependency parsing
- We'll user spaCy for this
- Stanford dependency parser is another option

In [17]:
import spacy
import en_core_web_sm
parser = en_core_web_sm.load()

Parsing a simple sentence

In [18]:
res = parser('cat ate fish')

In [19]:
print(spacy.displacy.serve(res, style='dep', page = True))


    Serving on port 5000...
    Using the 'dep' visualizer



127.0.0.1 - - [05/Feb/2019 17:58:11] "GET / HTTP/1.1" 200 2302
127.0.0.1 - - [05/Feb/2019 17:58:11] "GET /favicon.ico HTTP/1.1" 200 2302



    Shutting down server on port 5000.

None


A little more complex sentence

In [97]:
res = parser('economic news had little effect on the financial markets')

In [98]:
print(spacy.displacy.serve(res, style='dep', page = True))


    Serving on port 5000...
    Using the 'dep' visualizer



127.0.0.1 - - [04/Feb/2019 01:47:15] "GET / HTTP/1.1" 200 6834
127.0.0.1 - - [04/Feb/2019 01:47:16] "GET /favicon.ico HTTP/1.1" 200 6834



    Shutting down server on port 5000.

None


# Named Entity Recognition

In [20]:
doc = parser(u'Apple is looking at buying U.K. startup for $1 billion')

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [21]:
print(spacy.displacy.serve(doc, style='ent', page = True))


    Serving on port 5000...
    Using the 'ent' visualizer



127.0.0.1 - - [05/Feb/2019 17:59:02] "GET / HTTP/1.1" 200 1613
127.0.0.1 - - [05/Feb/2019 17:59:02] "GET /favicon.ico HTTP/1.1" 200 1613



    Shutting down server on port 5000.

None
