# WordNet and Named Entity Recognition
## subsection of _Semantic Analysis_

* Exploring WordNet
    1. Understanding Synsets
    2. Analyzing Lexical Semantic Relationships
* Word Sense Disambiguation
* Named Entity Recognition

# Exploring WordNet 

## Understanding Synsets

In [None]:
from nltk.corpus import wordnet as wn
import pandas as pd

# look up every WordNet synset recorded for the term
term = 'fruit'
synsets = wn.synsets(term)

# report how many synsets were found
total_synsets = len(synsets)
print('Total Synsets:', total_synsets)

In [None]:
# widen the column display so long definitions are not truncated
pd.options.display.max_colwidth = 200

# Column layout of the summary table. Passing it to the constructor fixes the
# order up front and still produces an (empty) frame when `synsets` is empty,
# whereas re-indexing afterwards raises KeyError on a column-less frame.
fruit_columns = ['Synset', 'Part of Speech', 'Definition', 'Lemmas', 'Examples']

# one row per synset of the term
fruit_df = pd.DataFrame(
    [{'Synset': synset,
      'Part of Speech': synset.lexname(),
      'Definition': synset.definition(),
      'Lemmas': synset.lemma_names(),
      'Examples': synset.examples()}
     for synset in synsets],
    columns=fruit_columns,
)
fruit_df

## Analyzing Lexical Semantic Relationships

### Entailments

In [1]:
# for each verb, take its first verb synset and show what it entails
for action in ('walk', 'eat', 'digest'):
    action_syn = wn.synsets(action, pos=wn.VERB)[0]
    entailed = action_syn.entailments()
    print(action_syn, '-- entails -->', entailed)

NameError: name 'wn' is not defined

### Homonyms and Homographs

In [None]:
# list every synset of 'bank' next to its definition
bank_synsets = wn.synsets('bank')
for synset in bank_synsets:
    print(synset.name(), '-', synset.definition())

### Synonyms and Antonyms

In [None]:
term = 'large'
synsets = wn.synsets(term)

# second synset of the term, reduced to its first lemma
adj_large = synsets[1].lemmas()[0]
adj_large_synonym = adj_large.synset()
adj_large_antonym = adj_large.antonyms()[0].synset()

# print name and definition for the synonym synset, then for the antonym synset
for label, sense in (('Synonym:', adj_large_synonym),
                     ('Antonym:', adj_large_antonym)):
    print(label, sense.name())
    print('Definition:', sense.definition())

In [None]:
term = 'rich'
# only the first three synsets of the term are examined
synsets = wn.synsets(term)[:3]

for synset in synsets:
    # first lemma of this synset and the synset of its first antonym
    rich = synset.lemmas()[0]
    rich_synonym = rich.synset()
    rich_antonym = rich.antonyms()[0].synset()

    for label, sense in (('Synonym:', rich_synonym),
                         ('Antonym:', rich_antonym)):
        print(label, sense.name())
        print('Definition:', sense.definition())
    print()

### Hyponyms and Hypernyms

In [None]:
term = 'tree'
synsets = wn.synsets(term)
# work with the first synset returned for the term
tree = synsets[0]

for label, value in (('Name:', tree.name()),
                     ('Definition:', tree.definition())):
    print(label, value)

In [None]:
# hyponyms of the 'tree' synset
hyponyms = tree.hyponyms()
print('Total Hyponyms:', len(hyponyms))
print('Sample Hyponyms')

# show at most the first ten, each followed by a blank line
for hyponym in hyponyms[:10]:
    print(hyponym.name(), '-', hyponym.definition(), end='\n\n')

In [None]:
# query the hypernyms of the 'tree' synset and print the returned list
hypernyms = tree.hypernyms()
print(hypernyms)

In [None]:
# get total hierarchy pathways for 'tree'
# (hypernym_paths is indexed with [0] by the hierarchy print that follows)
hypernym_paths = tree.hypernym_paths()
print('Total Hypernym paths:', len(hypernym_paths))

In [None]:
# print the first hypernym path as a ' -> ' separated chain of synset names
print('Hypernym Hierarchy')
first_path = hypernym_paths[0]
path_names = [node.name() for node in first_path]
print(' -> '.join(path_names))

### Holonyms and Meronyms

In [None]:
# member holonyms of the 'tree' synset
member_holonyms = tree.member_holonyms()
print('Total Member Holonyms:', len(member_holonyms))
print('Member Holonyms for [tree]:-')

# one "name - definition" line per holonym, each followed by a blank line
for holonym in member_holonyms:
    print(holonym.name(), '-', holonym.definition(), end='\n\n')

In [None]:
# part meronyms of the 'tree' synset
part_meronyms = tree.part_meronyms()
print('Total Part Meronyms:', len(part_meronyms))
print('Part Meronyms for [tree]:-')

# one "name - definition" line per meronym, each followed by a blank line
for meronym in part_meronyms:
    print(meronym.name(), '-', meronym.definition(), end='\n\n')

In [None]:
# substance based meronyms for tree
substance_meronyms = tree.substance_meronyms()
print('Total Substance Meronyms:', len(substance_meronyms))
print('Substance Meronyms for [tree]:-')

# one "name - definition" line per meronym, each followed by a blank line
for meronym in substance_meronyms:
    print(meronym.name(), '-', meronym.definition(), end='\n\n')

### Semantic Relationships and Similarity

In [None]:
# fetch the specific synsets to compare, by their WordNet identifiers
tree, lion, tiger, cat, dog = (
    wn.synset(synset_id)
    for synset_id in ('tree.n.01', 'lion.n.01', 'tiger.n.02', 'cat.n.01', 'dog.n.01')
)

# create entities and extract names and definitions
entities = [tree, lion, tiger, cat, dog]
# the display name is the part of the synset name before the first '.'
entity_names = [synset.name().partition('.')[0] for synset in entities]
entity_definitions = [synset.definition() for synset in entities]

# print entities and their definitions, each followed by a blank line
for entity, definition in zip(entity_names, entity_definitions):
    print(entity, '-', definition, end='\n\n')

In [None]:
common_hypernyms = []
for entity in entities:
    # one row per entity: its first lowest common hypernym with every entity
    row = []
    for compared_entity in entities:
        lowest = entity.lowest_common_hypernyms(compared_entity)[0]
        # keep only the word part of the synset name
        row.append(lowest.name().split('.')[0])
    common_hypernyms.append(row)

# build pairwise lowest common hypernym matrix, labelled by entity name
common_hypernym_frame = pd.DataFrame(
    common_hypernyms,
    index=entity_names,
    columns=entity_names,
)
common_hypernym_frame

In [None]:
# pairwise path similarities, rounded to two decimals; one row per entity
similarities = []
for entity in entities:
    row_scores = [
        round(entity.path_similarity(compared_entity), 2)
        for compared_entity in entities
    ]
    similarities.append(row_scores)

# build pairwise similarity matrix, labelled by entity name
similarity_frame = pd.DataFrame(
    similarities,
    index=entity_names,
    columns=entity_names,
)
similarity_frame

# Word Sense Disambiguation

In [None]:
from nltk.wsd import lesk
from nltk import word_tokenize

# (sentence, part-of-speech) pairs in which the target word is disambiguated
samples = [('The fruits on that plant have ripened', 'n'),
           ('He finally reaped the fruit of his hard work as he won the race', 'n')]

# perform word sense disambiguation
word = 'fruit'
for sentence, pos_tag in samples:
    # lesk receives the lower-cased, tokenized sentence as context
    tokens = word_tokenize(sentence.lower())
    word_syn = lesk(tokens, word, pos=pos_tag)
    print('Sentence:', sentence)
    print('Word synset:', word_syn)
    print('Corresponding definition:', word_syn.definition(), end='\n\n')

In [None]:
# (sentence, part-of-speech) pairs in which the target word is disambiguated
samples = [('Lead is a very soft, malleable metal', 'n'),
           ('John is the actor who plays the lead in that movie', 'n'),
           ('This road leads to nowhere', 'v')]
word = 'lead'

# perform word sense disambiguation
for sentence, pos_tag in samples:
    # lesk receives the lower-cased, tokenized sentence as context
    tokens = word_tokenize(sentence.lower())
    word_syn = lesk(tokens, word, pos=pos_tag)
    print('Sentence:', sentence)
    print('Word synset:', word_syn)
    print('Corresponding definition:', word_syn.definition(), end='\n\n')

# Named Entity Recognition

In [None]:
# sample news passage used as input for the named entity recognition cells below
text = """Three more countries have joined an “international grand committee” of parliaments, adding to calls for 
Facebook’s boss, Mark Zuckerberg, to give evidence on misinformation to the coalition. Brazil, Latvia and Singapore 
bring the total to eight different parliaments across the world, with plans to send representatives to London on 27 
November with the intention of hearing from Zuckerberg. Since the Cambridge Analytica scandal broke, the Facebook chief 
has only appeared in front of two legislatures: the American Senate and House of Representatives, and the European parliament. 
Facebook has consistently rebuffed attempts from others, including the UK and Canadian parliaments, to hear from Zuckerberg. 
He added that an article in the New York Times on Thursday, in which the paper alleged a pattern of behaviour from Facebook 
to “delay, deny and deflect” negative news stories, “raises further questions about how recent data breaches were allegedly 
dealt with within Facebook.”
"""

In [None]:
import re

# strip the line breaks embedded in the multi-line literal
text = re.sub(r'\n', '', text)

# NOTE(review): an earlier attempt, spacy.load('web_en_core_sm'), was marked
# "not working"; that name looks transposed relative to the `en_core_web_sm`
# package imported below, which is probably why it failed -- confirm.
import en_core_web_sm

nlp = en_core_web_sm.load()
text_nlp = nlp(text)

# print named entities in article: one (token text, entity type) pair per token
ner_tagged = [(token.text, token.ent_type_) for token in text_nlp]
print(ner_tagged)

In [None]:
from spacy import displacy

# visualize named entities
# style='ent' selects the entity view; jupyter=True requests notebook rendering
displacy.render(text_nlp, style='ent', jupyter=True)

In [None]:
# extract named entities
# Consecutive tagged tokens are joined with spaces into one entity, labelled
# with the tag of its last token; an untagged token closes the entity.
# (Adjacent entities with no untagged token between them are therefore merged.)
named_entities = []
temp_entity_name = ''
temp_named_entity = None
for term, tag in ner_tagged:
    if tag:
        # extend the entity currently being built
        temp_entity_name = ' '.join([temp_entity_name, term]).strip()
        temp_named_entity = (temp_entity_name, tag)
    elif temp_named_entity:
        # untagged token: the entity is complete, store it and reset
        named_entities.append(temp_named_entity)
        temp_entity_name = ''
        temp_named_entity = None

# flush the entity still being built when the token list ends on a tagged
# token; without this the final entity would be silently dropped
if temp_named_entity:
    named_entities.append(temp_named_entity)
print(named_entities)

In [None]:
# viewing the top entity types
from collections import Counter

# tally the tag (second element) of every extracted entity
c = Counter(entity[1] for entity in named_entities)
c.most_common()

In [None]:
import os
from nltk.tag import StanfordNERTagger

# Root of the Stanford NER distribution. Set the STANFORD_NER_HOME environment
# variable to point at a local install; the default keeps the original
# machine-specific location so existing setups behave as before.
STANFORD_NER_HOME = os.environ.get(
    'STANFORD_NER_HOME',
    r'/Users/beliciarodriguez/Downloads/stanford-ner-2014-08-27',
)

# classifier model and tagger jar, both located inside the distribution
STANFORD_CLASSIFIER_PATH = os.path.join(
    STANFORD_NER_HOME, 'classifiers', 'english.all.3class.distsim.crf.ser.gz')
STANFORD_NER_JAR_PATH = os.path.join(STANFORD_NER_HOME, 'stanford-ner-3.4.1.jar')

sn = StanfordNERTagger(STANFORD_CLASSIFIER_PATH, path_to_jar=STANFORD_NER_JAR_PATH)

In [None]:
# perform NER tagging & extract relevant entities
# non-ASCII characters are dropped before the text is split on whitespace
text_enc = text.encode('ascii', errors='ignore').decode('utf-8')
ner_tagged = sn.tag(text_enc.split())

# Consecutive tokens whose tag is not 'O' are joined with spaces into one
# entity, labelled with the tag of its last token; an 'O' token closes it.
named_entities = []
temp_entity_name = ''
temp_named_entity = None
for term, tag in ner_tagged:
    if tag != 'O':
        # extend the entity currently being built
        temp_entity_name = ' '.join([temp_entity_name, term]).strip()
        temp_named_entity = (temp_entity_name, tag)
    elif temp_named_entity:
        # 'O' token: the entity is complete, store it and reset
        named_entities.append(temp_named_entity)
        temp_entity_name = ''
        temp_named_entity = None

# flush the entity still being built when the token list ends on a tagged
# token (whitespace tokens keep trailing punctuation, so the last token can
# itself be tagged); without this the final entity would be silently dropped
if temp_named_entity:
    named_entities.append(temp_named_entity)
print(named_entities)

In [None]:
# get more frequent entities
# tally the tag (second element) of every extracted entity
c = Counter(entity[1] for entity in named_entities)
c.most_common()

In [None]:
# using Stanford's Core NLP (connected to server)
from nltk.parse import CoreNLPParser
import nltk

# NER Tagging
# the tagger talks to a CoreNLP server expected at localhost:9000
ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
tags = list(ner_tagger.raw_tag_sents(nltk.sent_tokenize(text)))
# keep the first element of each per-sentence result, then flatten the
# sentences into a single list of (word, tag) pairs
tags = [sublist[0] for sublist in tags]
tags = [word_tag for sublist in tags for word_tag in sublist]

# Extract Named Entities
# Consecutive tokens whose tag is not 'O' are joined with spaces into one
# entity, labelled with the tag of its last token; an 'O' token closes it.
named_entities = []
temp_entity_name = ''
temp_named_entity = None
for term, tag in tags:
    if tag != 'O':
        # extend the entity currently being built
        temp_entity_name = ' '.join([temp_entity_name, term]).strip()
        temp_named_entity = (temp_entity_name, tag)
    elif temp_named_entity:
        # 'O' token: the entity is complete, store it and reset
        named_entities.append(temp_named_entity)
        temp_entity_name = ''
        temp_named_entity = None

# flush the entity still being built when the token list ends on a tagged
# token; without this the final entity would be silently dropped
if temp_named_entity:
    named_entities.append(temp_named_entity)
print(named_entities)

In [None]:
# find out top named entity types
# tally the tag (second element) of every extracted entity
c = Counter(entity[1] for entity in named_entities)
c.most_common()