# Week 9 - Text Features
## Understanding Synsets - Starting on page 522

In [1]:
from nltk.corpus import wordnet as wn
import pandas as pd

# Look up every WordNet synset recorded for the word "fruit".
term = 'fruit'
synsets = wn.synsets(term)
print('Total Synsets:', len(synsets), '\n')

# Raise the column width limit so long definitions/examples are shown in full.
pd.set_option('display.max_colwidth', 200)

# One row per synset; `columns` fixes the column order when the frame is built.
# Note that the 'Part of Speech' column holds the value of lexname().
fruit_columns = ['Synset', 'Part of Speech', 'Definition', 'Lemmas', 'Examples']
fruit_rows = [
    (sense, sense.lexname(), sense.definition(), sense.lemma_names(), sense.examples())
    for sense in synsets
]
fruit_df = pd.DataFrame(fruit_rows, columns=fruit_columns)
print('Fruit DataFrame:\n', fruit_df, '\n')

# Entailments (page 524): what the first verb sense of each word entails.
print('Entailments:')
for verb in ('walk', 'eat', 'digest'):
    # Only the first verb synset returned for the word is inspected.
    first_verb_sense = wn.synsets(verb, pos='v')[0]
    entailed = first_verb_sense.entailments()
    print(first_verb_sense, '-- entails -->', entailed)
print('\n')

# Homonyms / homographs (page 524): list every synset stored for "bank".
print('Homonyms/homographs:')
bank_senses = wn.synsets('bank')
for bank_sense in bank_senses:
    print(bank_sense.name(), '-', bank_sense.definition())
print('\n')

# Synonyms and antonyms (page 525).
def show_synonym_and_antonym(lemma, *trailer):
    """Print a lemma's own synset and the synset of its first antonym.

    Parameters
    ----------
    lemma : WordNet lemma
        Lemma whose synset and first antonym are reported.
    *trailer
        Extra values appended to the final ``print`` call (used to emit the
        blank line the notebook prints after a report).

    Returns
    -------
    tuple
        ``(synonym_synset, antonym_synset)``; ``antonym_synset`` is ``None``
        when the lemma has no antonyms.
    """
    synonym = lemma.synset()
    print('Synonym:', synonym.name())
    print('Definition:', synonym.definition())

    # Guard before indexing: an empty antonym list would raise IndexError.
    antonym_lemmas = lemma.antonyms()
    if not antonym_lemmas:
        print('Antonym: None', *trailer)
        return synonym, None

    antonym = antonym_lemmas[0].synset()
    print('Antonym:', antonym.name())
    print('Definition:', antonym.definition(), *trailer)
    return synonym, antonym


print('Synonyms and antonyms:')
term = 'large'
synsets = wn.synsets(term)
# NOTE(review): index 1 is kept from the original; presumably it selects the
# adjective sense rather than the first synset returned -- confirm.
adj_large = synsets[1].lemmas()[0]
adj_large_synonym, adj_large_antonym = show_synonym_and_antonym(adj_large, '\n')

term = 'rich'
synsets = wn.synsets(term)[:3]  # only the first three synsets are reported
for synset in synsets:
    rich = synset.lemmas()[0]
    rich_synonym, rich_antonym = show_synonym_and_antonym(rich)
print('\n')

# Hyponyms and hypernyms (page 527), explored for the first synset of "tree".
print('Hyponyms and hypernyms:')
term = 'tree'
synsets = wn.synsets(term)
tree = synsets[0]
print('Name:', tree.name())
print('Definition:', tree.definition())

# Hyponyms: report the total and show at most the first ten.
hyponyms = tree.hyponyms()
print('Total Hyponyms:', len(hyponyms))
print('Sample Hyponyms')
for child in hyponyms[:10]:
    print(f'{child.name()} - {child.definition()}')

print('\n')

# Direct hypernyms of the synset.
hypernyms = tree.hypernyms()
print(hypernyms)

# Hypernym paths: report how many exist and print the first one as a chain.
hypernym_paths = tree.hypernym_paths()
print('Total Hypernym paths:', len(hypernym_paths))

print('Hypernym Hierarchy')
first_path = hypernym_paths[0]
chain_names = [node.name() for node in first_path]
print(' -> '.join(chain_names))
print('\n')

# Holonyms and meronyms (page 529) for the `tree` synset selected earlier.
member_holonyms = tree.member_holonyms()
part_meronyms = tree.part_meronyms()
substance_meronyms = tree.substance_meronyms()

# Each report: (section heading, count label, listing label, related synsets).
relation_reports = [
    ('Holonyms and meronyms:',
     'Total Member Holonyms:',
     'Member Holonyms for [tree]:-',
     member_holonyms),
    ('Part meronyms:',
     'Total Part Meronyms:',
     'Part Meronyms for [tree]:-',
     part_meronyms),
    ('Substance meronyms:',
     'Total Substance Meronyms:',
     'Substance Meronyms for [tree]:-',
     substance_meronyms),
]

for heading, count_label, listing_label, related_synsets in relation_reports:
    print(heading)
    print(count_label, len(related_synsets))
    print(listing_label)
    for related in related_synsets:
        print(related.name(), '-', related.definition())
    print('\n')

Total Synsets: 5 

Fruit DataFrame:
                  Synset Part of Speech  \
0  Synset('fruit.n.01')     noun.plant   
1  Synset('yield.n.03')  noun.artifact   
2  Synset('fruit.n.03')     noun.event   
3  Synset('fruit.v.01')  verb.creation   
4  Synset('fruit.v.02')  verb.creation   

                                      Definition          Lemmas  \
0  the ripened reproductive body of a seed plant         [fruit]   
1                         an amount of a product  [yield, fruit]   
2       the consequence of some effort or action         [fruit]   
3                            cause to bear fruit         [fruit]   
4                                     bear fruit         [fruit]   

                                                  Examples  
0                                                       []  
1                                                       []  
2  [he lived long enough to see the fruit of his policies]  
3                                                       [

## Semantic Relationships and Similarities - Starting on page 530

In [2]:
print('Semantic relationships:')

# Fetch the five synsets that are compared throughout this cell.
tree, lion, tiger, cat, dog = [
    wn.synset(synset_id)
    for synset_id in ('tree.n.01', 'lion.n.01', 'tiger.n.02', 'cat.n.01', 'dog.n.01')
]
entities = [tree, lion, tiger, cat, dog]

# Short label = the part of the synset name before the first dot.
entity_names = [candidate.name().split('.')[0] for candidate in entities]
entity_definitions = [candidate.definition() for candidate in entities]

for position, short_name in enumerate(entity_names):
    print(short_name, '-', entity_definitions[position])
print('\n')

print('Common hypernyms:')
# Pairwise matrix: the short name of the first lowest common hypernym for
# every (row entity, column entity) pair.
common_hypernyms = [
    [row_entity.lowest_common_hypernyms(col_entity)[0].name().split('.')[0]
     for col_entity in entities]
    for row_entity in entities
]
common_hypernym_frame = pd.DataFrame(common_hypernyms,
                                     index=entity_names,
                                     columns=entity_names)
print(common_hypernym_frame, '\n')

# Path similarity (page 533): pairwise scores rounded to two decimals.
similarities = [
    [round(row_entity.path_similarity(col_entity), 2) for col_entity in entities]
    for row_entity in entities
]

# Label both axes with the short entity names.
similarity_frame = pd.DataFrame(similarities,
                                index=entity_names,
                                columns=entity_names)

print('Similarity frame:\n', similarity_frame, '\n')

# Word sense disambiguation (starting on page 534).
from nltk.wsd import lesk
from nltk import word_tokenize


def disambiguate_samples(word, samples):
    """Run Lesk word sense disambiguation on each sample and print the result.

    Parameters
    ----------
    word : str
        The ambiguous word to resolve in every sentence.
    samples : iterable of (str, str)
        ``(sentence, pos_tag)`` pairs; ``pos_tag`` is the WordNet
        part-of-speech code passed to ``lesk`` (e.g. ``'n'`` or ``'v'``).

    Each sentence is lower-cased and tokenized before being handed to
    ``lesk``. When ``lesk`` returns ``None`` the missing sense is reported
    instead of failing on ``None.definition()``.
    """
    for sentence, pos_tag in samples:
        word_syn = lesk(word_tokenize(sentence.lower()), word, pos_tag)
        print('Sentence:', sentence)
        print('Word synset:', word_syn)
        if word_syn is None:
            print('Corresponding definition: (no matching synset found)')
        else:
            print('Corresponding definition:', word_syn.definition())
        print('\n')


# "fruit" used as a noun in two different senses.
samples = [('The fruits on that plant have ripened', 'n'),
           ('He finally reaped the fruit of his hard work as he won the race', 'n')]
word = 'fruit'
disambiguate_samples(word, samples)

# "lead" used as a noun (twice) and as a verb.
samples = [('Lead is a very soft, malleable metal', 'n'),
           ('John is the actor who plays the lead in that movie', 'n'),
           ('This road leads to nowhere', 'v')]
word = 'lead'
disambiguate_samples(word, samples)

Semantic relationships:
tree - a tall perennial woody plant having a main trunk and branches forming a distinct elevated crown; includes both gymnosperms and angiosperms
lion - large gregarious predatory feline of Africa and India having a tawny coat with a shaggy mane in the male
tiger - large feline of forests in most of Asia having a tawny coat with black stripes; endangered
cat - feline mammal usually having thick soft fur and no ability to roar: domestic cats; wildcats
dog - a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds


Common hypernyms:
           tree       lion      tiger        cat        dog
tree       tree   organism   organism   organism   organism
lion   organism       lion    big_cat     feline  carnivore
tiger  organism    big_cat      tiger     feline  carnivore
cat    organism     feline     feline        cat  carnivore
dog    organism  carnivore  carnivore  carni

## Named Entity Recognition - Starting on page 537

In [6]:
import spacy
import re

# Sample news paragraph that is fed to the spaCy pipeline in this cell.
# Every line of the literal except the last ends with a trailing space, so the
# words stay separated once the newlines are removed further down.
text = """Three more countries have joined an “international grand committee” of 
parliaments, adding to calls for Facebook’s boss, Mark Zuckerberg, to give evidence 
on misinformation to the coalition. Brazil, Latvia and Singapore bring the total to 
eight different parliaments across the world, with plans to send representatives to 
London on 27 November with the intention of hearing from Zuckerberg. Since the 
Cambridge Analytica scandal broke, the Facebook chief has only appeared in front of 
two legislatures: the American Senate and House of Representatives, and the European 
parliament. Facebook has consistently rebuffed attempts from others, including the 
UK and Canadian parliaments, to hear from Zuckerberg. He added that an article in 
the New York Times on Thursday, in which the paper alleged a pattern of behavior 
from Facebook to “delay, deny and deflect” negative news stories, “raises further 
questions about recent data breaches were allegedly dealt with within Facebook.”
"""

# Drop the line breaks that came with the triple-quoted literal.
text = re.sub(r'\n', '', text)

# Load the small English spaCy model and run it over the article.
nlp = spacy.load('en_core_web_sm')
text_nlp = nlp(text)

# Pair each token with its entity label ('' when the token has no label).
print('Named entities (spaCy):')
ner_tagged = []
for token in text_nlp:
    ner_tagged.append((token.text, token.ent_type_))
print(ner_tagged, '\n')

# Render the recognised entities inline with displaCy (page 539).
from spacy import displacy

displacy.render(text_nlp, style='ent', jupyter=True)

# Programmatic extraction starting on page 539
def group_named_entities(tagged_tokens):
    """Merge per-token ``(text, label)`` pairs into ``(entity, label)`` pairs.

    Parameters
    ----------
    tagged_tokens : iterable of (str, str)
        Token text paired with its entity label; an empty label marks a
        token that is not part of any entity.

    Returns
    -------
    list of (str, str)
        One entry per run of consecutive tokens sharing the same non-empty
        label, with the token texts joined by single spaces.

    Notes
    -----
    A run is closed when the label changes or becomes empty, and also at the
    end of the input, so an entity on the final token is not lost and
    neighbouring entities with different labels are not merged. Two adjacent
    entities with the *same* label cannot be told apart from the labels
    alone and are still returned as one entry.
    """
    grouped = []
    current_name = ''
    current_tag = None
    for term, tag in tagged_tokens:
        if current_tag and tag != current_tag:
            # The running entity ended (empty label) or a different one began.
            grouped.append((current_name, current_tag))
            current_name, current_tag = '', None
        if tag:
            current_name = ' '.join([current_name, term]).strip()
            current_tag = tag
    if current_tag:
        # Flush an entity that runs up to the very last token.
        grouped.append((current_name, current_tag))
    return grouped


named_entities = group_named_entities(ner_tagged)
print('Named entities:\n', named_entities, '\n')

# Count how often each entity label occurs and list them most frequent first.
from collections import Counter

c = Counter(label for _entity_text, label in named_entities)
print('Most common entities:\n', c.most_common(), '\n')

Named entities (spaCy):
[('Three', 'CARDINAL'), ('more', ''), ('countries', ''), ('have', ''), ('joined', ''), ('an', ''), ('“', ''), ('international', ''), ('grand', ''), ('committee', ''), ('”', ''), ('of', ''), ('parliaments', ''), (',', ''), ('adding', ''), ('to', ''), ('calls', ''), ('for', ''), ('Facebook', ''), ('’s', ''), ('boss', ''), (',', ''), ('Mark', 'PERSON'), ('Zuckerberg', 'PERSON'), (',', ''), ('to', ''), ('give', ''), ('evidence', ''), ('on', ''), ('misinformation', ''), ('to', ''), ('the', ''), ('coalition', ''), ('.', ''), ('Brazil', 'GPE'), (',', ''), ('Latvia', 'GPE'), ('and', ''), ('Singapore', 'GPE'), ('bring', ''), ('the', ''), ('total', ''), ('to', ''), ('eight', 'CARDINAL'), ('different', ''), ('parliaments', ''), ('across', ''), ('the', ''), ('world', ''), (',', ''), ('with', ''), ('plans', ''), ('to', ''), ('send', ''), ('representatives', ''), ('to', ''), ('London', 'GPE'), ('on', ''), ('27', 'DATE'), ('November', 'DATE'), ('with', ''), ('the', ''), ('inte

Named entities:
 [('Three', 'CARDINAL'), ('Mark Zuckerberg', 'PERSON'), ('Brazil', 'GPE'), ('Latvia', 'GPE'), ('Singapore', 'GPE'), ('eight', 'CARDINAL'), ('London', 'GPE'), ('27 November', 'DATE'), ('Zuckerberg', 'GPE'), ('Facebook', 'ORG'), ('two', 'CARDINAL'), ('the American Senate', 'ORG'), ('House of Representatives', 'ORG'), ('European', 'NORP'), ('UK', 'GPE'), ('Canadian', 'NORP'), ('Zuckerberg', 'GPE'), ('the New York Times', 'ORG'), ('Thursday', 'DATE'), ('Facebook', 'ORG'), ('Facebook', 'ORG')] 

Most common entities:
 [('GPE', 7), ('ORG', 6), ('CARDINAL', 3), ('DATE', 2), ('NORP', 2), ('PERSON', 1)] 



## NER Using NLTK - Replaces the Stanford Parser Section in the Book
Much easier, faster, and no server runtime required.

In [7]:
import nltk
import pandas as pd
import re

# Sample news paragraph that is sentence-split and chunked with NLTK in this cell.
# Every line of the literal except the last ends with a trailing space, so the
# words stay separated once the newlines are removed further down.
text = """Three more countries have joined an “international grand committee” of 
parliaments, adding to calls for Facebook’s boss, Mark Zuckerberg, to give evidence 
on misinformation to the coalition. Brazil, Latvia and Singapore bring the total to 
eight different parliaments across the world, with plans to send representatives to 
London on 27 November with the intention of hearing from Zuckerberg. Since the 
Cambridge Analytica scandal broke, the Facebook chief has only appeared in front of 
two legislatures: the American Senate and House of Representatives, and the European 
parliament. Facebook has consistently rebuffed attempts from others, including the 
UK and Canadian parliaments, to hear from Zuckerberg. He added that an article in 
the New York Times on Thursday, in which the paper alleged a pattern of behavior 
from Facebook to “delay, deny and deflect” negative news stories, “raises further 
questions about recent data breaches were allegedly dealt with within Facebook.”
"""

# Drop the line breaks, then split the article into sentences.
text = re.sub(r'\n', '', text)
tokenized_text = nltk.tokenize.sent_tokenize(text)

# For each sentence: tokenize -> POS-tag -> chunk named entities, then print
# the resulting tree. binary=False is passed through exactly as before.
for sentence in tokenized_text:
    named_ent = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence)),
                              binary=False)
    print(named_ent)

(S
  Three/CD
  more/JJR
  countries/NNS
  have/VBP
  joined/VBN
  an/DT
  “/NNP
  international/JJ
  grand/JJ
  committee/NN
  ”/NNP
  of/IN
  parliaments/NNS
  ,/,
  adding/VBG
  to/TO
  calls/VB
  for/IN
  (PERSON Facebook/NNP)
  ’/NNP
  s/NN
  boss/NN
  ,/,
  (PERSON Mark/NNP Zuckerberg/NNP)
  ,/,
  to/TO
  give/VB
  evidence/NN
  on/IN
  misinformation/NN
  to/TO
  the/DT
  coalition/NN
  ./.)
(S
  (GPE Brazil/NNP)
  ,/,
  (GPE Latvia/NNP)
  and/CC
  (GPE Singapore/NNP)
  bring/VBP
  the/DT
  total/NN
  to/TO
  eight/CD
  different/JJ
  parliaments/NNS
  across/IN
  the/DT
  world/NN
  ,/,
  with/IN
  plans/NNS
  to/TO
  send/VB
  representatives/NNS
  to/TO
  (GPE London/NNP)
  on/IN
  27/CD
  November/NNP
  with/IN
  the/DT
  intention/NN
  of/IN
  hearing/VBG
  from/IN
  (GPE Zuckerberg/NNP)
  ./.)
(S
  Since/IN
  the/DT
  (ORGANIZATION Cambridge/NNP Analytica/NNP)
  scandal/NN
  broke/VBD
  ,/,
  the/DT
  (ORGANIZATION Facebook/NNP)
  chief/NN
  has/VBZ
  only/RB
  appeared/VB

NOTE: I skipped "Building an NER Tagger from Scratch" (pages 544-557) because it introduces no new concepts. However, that code is critical for the assignment.

## Analyzing Semantic Representations - Starting on page 558

In [1]:
import nltk
import pandas as pd
import os

# Propositional symbols and the plain-language statements they stand for.
symbol_P = 'P'
symbol_Q = 'Q'
proposition_P = 'He is hungry'
propositon_Q = 'He will eat a sandwich'

# Truth assignments for P and Q, paired position by position below.
p_statuses = [False, False, True, True]
q_statuses = [False, True, False, True]

# Compound expressions to evaluate under each truth assignment.
conjunction = '(P & Q)'
disjunction = '(P | Q)'
implication = '(P -> Q)'
equivalence = '(P <-> Q)'
expressions = [conjunction, disjunction, implication, equivalence]

results = []
for status_p, status_q in zip(p_statuses, q_statuses):
    # An empty domain is used; only the symbol valuation varies per row.
    dom = set()
    val = nltk.Valuation([(symbol_P, status_p), (symbol_Q, status_q)])
    assignments = nltk.Assignment(dom)
    model = nltk.Model(dom, val)
    outcomes = [model.evaluate(expression, assignments) for expression in expressions]
    results.append([status_p, status_q, *outcomes])

# Column headers: the two symbols followed by the expression strings.
columns = [symbol_P, symbol_Q] + expressions
result_frame = pd.DataFrame(results, columns=columns)

print('P:', proposition_P)
print('Q:', propositon_Q)
print('\n')
print('Expression Outcomes:-')
print(result_frame, '\n')

# First-order logic: try to prove goals from assumptions.
read_expr = nltk.sem.Expression.fromstring

# The PROVER9 variable only matters if nltk.Prover9 is used. setdefault keeps a
# location the user has already configured rather than overwriting it with
# this machine-specific path.
os.environ.setdefault('PROVER9', r'E:/prover9/bin')

# Only the resolution prover is used below. The original also created an
# nltk.Prover9() instance here, but replaced it on the very next line without
# ever using it, so that dead assignment has been dropped.
prover = nltk.ResolutionProver()

# Rule: if x jumps over y, then y does not jump over x.
rule = read_expr('all x. all y. (jumps_over(x, y) -> -jumps_over(y, x))')
# The event that occurred.
event = read_expr('jumps_over(fox, dog)')
# The outcome we want to evaluate -- the goal.
test_outcome = read_expr('jumps_over(dog, fox)')

# verbose=True prints the proof trace.
prover.prove(goal=test_outcome,
             assumptions=[event, rule],
             verbose=True)

# Rule: anyone who studies for the exam passes the exam.
rule = read_expr('all x. (studies(x, exam) -> pass(x, exam))')
# Events and the outcomes we want to determine for each person.
event1 = read_expr('-studies(John, exam)')
test_outcome1 = read_expr('pass(John, exam)')
event2 = read_expr('studies(Pierre, exam)')
test_outcome2 = read_expr('pass(Pierre, exam)')

prover.prove(goal=test_outcome1,
             assumptions=[event1, rule],
             verbose=True)

prover.prove(goal=test_outcome2,
             assumptions=[event2, rule],
             verbose=True)

# Valuation: map each name to an individual and each predicate to the set of
# individuals (or pairs of individuals) it holds for.
rules = """
    rover => r
    felix => f
    garfield => g
    alex => a
    dog => {r, a}
    cat => {g}
    fox => {f}
    runs => {a, f}
    sleeps => {r, g}
    jumps_over => {(f, g), (a, g), (f, r), (a, r)}
    """
val = nltk.Valuation.fromstring(rules)
print(val, '\n')

# Build the model over the four individuals named in the valuation.
dom = {'r', 'f', 'g', 'a'}
m = nltk.Model(dom, val)

# Formulas without variables are evaluated with no assignment (None).
closed_formulas = [
    'jumps_over(felix, rover) & dog(rover) & runs(rover)',
    'jumps_over(felix, rover) & dog(rover) & -runs(rover)',
    'jumps_over(alex, garfield) & dog(alex) & cat(garfield) & sleeps(garfield)',
]
closed_outcomes = [m.evaluate(closed_formula, None) for closed_formula in closed_formulas]
for outcome in closed_outcomes[:-1]:
    print(outcome)
print(closed_outcomes[-1], '\n')

# Assignment binding the variables: x -> r and y -> f.
g = nltk.Assignment(dom, [('x', 'r'), ('y', 'f')])
bound_outcome = m.evaluate('runs(y) & jumps_over(y, x) & sleeps(x)', g)
print(bound_outcome)
quantified_outcome = m.evaluate('exists y. (fox(y) & runs(y))', g)
print(quantified_outcome, '\n')

# Individuals that satisfy each formula when substituted for x.
formula = read_expr('runs(x)')
runners = m.satisfiers(formula, 'x', g)
print(runners, '\n')

formula = read_expr('runs(x) & fox(x)')
running_foxes = m.satisfiers(formula, 'x', g)
print(running_foxes)

P: He is hungry
Q: He will eat a sandwich


Expression Outcomes:-
       P      Q  (P & Q)  (P | Q)  (P -> Q)  (P <-> Q)
0  False  False    False    False      True       True
1  False   True    False     True      True      False
2   True  False    False     True     False      False
3   True   True     True     True      True       True 

[1] {-jumps_over(dog,fox)}                    A 
[2] {jumps_over(fox,dog)}                     A 
[3] {-jumps_over(z3,z4), -jumps_over(z4,z3)}  A 
[4] {-jumps_over(dog,fox)}                    (2, 3) 

[1] {-pass(John,exam)}                  A 
[2] {-studies(John,exam)}               A 
[3] {-studies(z6,exam), pass(z6,exam)}  A 
[4] {-studies(John,exam)}               (1, 3) 

[1] {-pass(Pierre,exam)}                A 
[2] {studies(Pierre,exam)}              A 
[3] {-studies(z8,exam), pass(z8,exam)}  A 
[4] {-studies(Pierre,exam)}             (1, 3) 
[5] {pass(Pierre,exam)}                 (2, 3) 
[6] {}                                  (1, 5) 

{'a