# Civil War Entities: Extracting information using POS tagging

In [None]:
import wikipediaapi
import re
from string import punctuation
import nltk
from gensim import corpora
from gensim import models
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

## Load the civil war corpus

In [None]:
import wikipediaapi
# Titles of the Wikipedia articles that make up the corpus. Each title is
# converted to its underscored form before the page is requested.
pages = [
    "American Civil War",
    "Abraham Lincoln",
    "Slavery in the United States",
    "Slave states and free states",
    "Emancipation Proclamation",
    "Robert E. Lee",
    "Ulysses S. Grant",
    "Conclusion of the American Civil War",
    "Origins of the American Civil War",
    "Issues of the American Civil War"
]
import re

def underscorize(pagename):
    """Return `pagename` with every space replaced by an underscore."""
    return "_".join(pagename.split(" "))

# Client for the English Wikipedia, requesting the WIKI extract format.
wiki_wiki = wikipediaapi.Wikipedia(
        language='en',
        extract_format=wikipediaapi.ExtractFormat.WIKI
)
# Maps each underscored page title to the list of its paragraphs.
page_dict = {}
for page in pages:
    pagename = underscorize(page)
    print(pagename)  # progress: show which page is being fetched
    p_wiki = wiki_wiki.page(pagename)
    # One paragraph per line of the page text.
    page_text = p_wiki.text.split("\n")
    # Drop empty lines and single-character lines.
    page_paras = [para for para in page_text if len(para) > 1]
    page_dict[pagename] = page_paras

## Chunking: Finding noun phrases

### Tag all of the sentences

In [None]:
import nltk
def tag_paragraph_sentences(para):
    """Split a paragraph into sentences and POS-tag each one.

    Returns a list with one entry per sentence; each entry is the result of
    `nltk.pos_tag` applied to that sentence's word tokens.
    """
    return [
        nltk.pos_tag(nltk.word_tokenize(sentence))
        for sentence in nltk.sent_tokenize(para)
    ]
# Flat list of POS-tagged sentences across every page of the corpus.
tagged_sentences = []
# Only the paragraph lists are needed here, so iterate the values directly
# instead of unpacking a page name that is never used.
for page in page_dict.values():
    for para in page:
        tagged_sentences.extend(tag_paragraph_sentences(para))

In [None]:
# Inspect the first tagged sentence.
tagged_sentences[0]

In [None]:
# Total number of tagged sentences in the corpus.
len(tagged_sentences)

### Using NLTK's regular expression parser

Construct the parser

In [None]:
# Chunk grammar with a single rule named "NounPhrase": an optional <DT> tag,
# then any number of tags starting with "J", then one or more tags starting
# with "N" (in the Penn Treebank tagset: determiner, adjectives, nouns).
grammar = "NounPhrase: {<DT>?<J.*>*<N.*>+}"
cp = nltk.RegexpParser(grammar)

Apply the parser to one sentence.

It produces an NLTK `Tree` object.

In [None]:
# Chunk the second tagged sentence with the noun-phrase grammar.
tree = cp.parse(tagged_sentences[1])

In [None]:
# NOTE(review): per the NLTK docs, Tree.draw() opens the diagram in a separate
# window, so this presumably needs a desktop display — confirm.
tree.draw()

In [None]:
from IPython.display import display, clear_output

In [None]:
# Render the tree through IPython's display machinery.
display(tree)

In [None]:
# Draw the chunk tree. The dangling, module-less `import` statement that
# followed this call was a SyntaxError that stopped the cell from running.
tree.draw()

#### Extract the noun phrases found in the trees

In [None]:
# Lower-cased text of every "NounPhrase" chunk directly under the tree root.
# Plain (token, tag) leaves are not Tree instances, so isinstance skips them;
# unlike an exact type comparison it also accepts Tree subclasses.
current_chunk = [
    " ".join(token.lower() for token, _ in subtree.leaves())
    for subtree in tree
    if isinstance(subtree, nltk.Tree) and subtree.label() == "NounPhrase"
]

In [None]:
# Show the noun phrases extracted from the single parsed sentence.
current_chunk

In [None]:
# Frequency distribution of lower-cased noun phrases over the whole corpus.
np_dist = nltk.FreqDist()
for sent_index, sent in enumerate(tagged_sentences):
    # Refresh the progress line every 100 sentences.
    if sent_index % 100 == 0:
        clear_output(wait=True)
        print('Sentence {} of {}'.format(sent_index, len(tagged_sentences)))
    tree = cp.parse(sent)
    # The inner loop uses its own variable name so it no longer clobbers the
    # sentence index, and the phrase is not called `np` (the usual numpy alias).
    for subtree in tree:
        if isinstance(subtree, nltk.Tree) and subtree.label() == "NounPhrase":
            noun_phrase = " ".join(token.lower() for token, _ in subtree.leaves())
            np_dist[noun_phrase] += 1
clear_output(wait=True)
print("done")

In [None]:
# The 20 most frequent noun phrases with their counts.
np_dist.most_common(20)

## Chunking: Finding named entities using NLTK's named entity chunker

In [None]:
# Run the named-entity chunker on the first tagged sentence and print each
# top-level child of the resulting tree on its own line.
tree = nltk.ne_chunk(tagged_sentences[0])
for x in tree:
    print(x)

In [None]:
# Draw the named-entity tree.
# NOTE(review): Tree.draw() opens a separate window per the NLTK docs —
# presumably needs a desktop display; confirm.
tree.draw()

### Chunk them all. 
This will take a while

In [None]:
# Named-entity-chunked version of every tagged sentence, in the same order.
chunked_sentences = []
for n, sent in enumerate(tagged_sentences):
    # Refresh the progress line every 500 sentences.
    if n % 500 == 0:
        clear_output(wait=True)
        print('Sentence {} of {}'.format(n, len(tagged_sentences)))
    chunked_sentences.append(nltk.ne_chunk(sent))
# Replace the last progress line with a completion message.
clear_output(wait=True)
print("done")

### Count them up

In [None]:
def entity_to_tuple(t):
    """Return `(label, text)` for an entity subtree.

    `text` is the subtree's leaf tokens joined by single spaces; the POS tag
    of each leaf is discarded.
    """
    words = [leaf[0] for leaf in (tuple(pair) for pair in t.leaves())]
    entity_label = t.label()
    return (entity_label, " ".join(words))

def extract_entities(chunked_sentence):
    """Return a `(label, text)` tuple for each entity subtree in a sentence.

    Only the direct children of `chunked_sentence` are examined. Children that
    are not trees (plain tagged tokens) are skipped. `isinstance` is used so
    that Tree subclasses are recognised as well as `nltk.Tree` itself.
    """
    return [
        entity_to_tuple(child)
        for child in chunked_sentence
        if isinstance(child, nltk.Tree)
    ]

In [None]:
# Count how often each (label, text) entity occurs across all sentences.
entity_fdist = nltk.FreqDist()
# The sentence index was never used, so iterate the sentences directly.
for chunked_sent in chunked_sentences:
    entity_fdist.update(extract_entities(chunked_sent))
# Display the 25 most frequent entities.
entity_fdist.most_common(25)

## Find relations between named entities

In [None]:
def matching_label(it, label):
    """Return True when `it` is a tree whose label equals `label`.

    Anything that is not a tree (e.g. a plain tagged token) yields False.
    `isinstance` is used so Tree subclasses are accepted too.
    """
    return isinstance(it, nltk.Tree) and it.label() == label

def matching_entity(it, entity):
    """Return True when `it` is an entity subtree selected by `entity`.

    `entity` is either the wildcard "*", which matches any tree, or a string
    of the form "LABEL text" (e.g. "PERSON Lincoln") that is compared with the
    tree's label and space-joined tokens. Items that are not trees never
    match; `isinstance` is used so Tree subclasses are accepted too.
    """
    if not isinstance(it, nltk.Tree):
        return False
    return entity == "*" or " ".join(entity_to_tuple(it)) == entity

def get_matching_chunks(the_tree, e1, e2):
    """Collect spans of `the_tree` that run from an `e1` entity to an `e2` entity.

    The direct children of `the_tree` are scanned left to right. A span opens
    at a child matching `e1` and closes at the next child matching `e2`. Each
    returned span is a list: the opening entity as a `(label, text)` tuple,
    then the children in between exactly as they appear in the tree, then the
    closing entity as a `(label, text)` tuple. A span still open when the
    children run out is discarded.
    """
    found = []
    open_span = None  # None while no span is open
    for child in the_tree:
        if open_span is None:
            # Outside a span: only an e1 match is of interest.
            if matching_entity(child, e1):
                open_span = [entity_to_tuple(child)]
            continue
        if not matching_entity(child, e2):
            open_span.append(child)
            continue
        # Closing entity reached: finish the span and go back to searching.
        open_span.append(entity_to_tuple(child))
        found.append(open_span)
        open_span = None
    return found

In [None]:
# Try the matcher on a single tree: collect every span that opens at a
# "LOCATION North" entity and closes at the next "LOCATION South" entity.
# NOTE(review): `tree` is whatever the most recently run cell bound it to —
# presumably the ne_chunk result for the first sentence; confirm.
e1 = "LOCATION North"
e2 = "LOCATION South"
get_matching_chunks(tree, e1, e2)

In [None]:
# Entity pair to search for. The earlier "LOCATION North" / "LOCATION South"
# assignments in this cell were overwritten immediately and had no effect, so
# they are kept only as this hint for an alternative pair to try.
e1 = "PERSON Lincoln"
e2 = "PERSON Grant"

In [None]:
# Gather every Lincoln-to-Grant span found in any chunked sentence.
e1, e2 = "PERSON Lincoln", "PERSON Grant"
relations = []
for n, chunked_sentence in enumerate(chunked_sentences):
    if not n % 25:
        # Refresh the progress line every 25 sentences.
        clear_output(wait=True)
        print('Sentence {} of {}'.format(n, len(chunked_sentences)))
    relations.extend(get_matching_chunks(chunked_sentence, e1, e2))

In [None]:
class RelationList(list):
    """A list of relations that renders itself as HTML in a notebook.

    Each relation is a sequence whose first and last items are
    `(label, text)` entity tuples. The items in between are either
    `(token, tag)` tuples or entity subtrees (anything with a `leaves()`
    method).
    """

    @staticmethod
    def _token_text(item):
        """Return the plain text of one in-between item of a relation."""
        # An entity subtree contributes all of its tokens. Indexing it with
        # [0] would yield only its first (token, tag) leaf, which then
        # rendered as the tuple's repr.
        if hasattr(item, "leaves"):
            return " ".join(str(token) for token, _ in item.leaves())
        return str(item[0])

    def get_one_relation(self, rel):
        """Return one relation as an HTML fragment.

        The two end entities are wrapped in `<b>` tags and the in-between
        tokens are placed between them, separated by single spaces.
        """
        # Local import keeps this cell runnable on its own.
        from html import escape

        # Tokens come from article text, so escape them before embedding
        # them in markup.
        parts = ["<b>{}</b>".format(escape(str(rel[0][1])))]
        parts.extend(escape(self._token_text(row)) for row in rel[1:-1])
        parts.append("<b>{}</b>".format(escape(str(rel[-1][1]))))
        return " ".join(parts)

    def _repr_html_(self):
        """Return the HTML for the whole list, one `<div>` per relation."""
        return "".join(
            "<div>" + self.get_one_relation(rel) + "</div>" for rel in self
        )

In [None]:
# Wrap the collected relations so they are shown via _repr_html_.
RelationList(relations)

In [None]:
def print_relation(rel):
    """Print one relation as a single line of space-separated text.

    `rel` is a sequence whose first and last items are `(label, text)` entity
    tuples; their text is printed at the two ends. The items in between are
    either `(token, tag)` tuples or entity subtrees (anything with a
    `leaves()` method).
    """
    def as_text(item):
        # An entity subtree contributes all of its tokens. Indexing it with
        # [0] would yield only its first (token, tag) leaf, which then
        # printed as the tuple's repr.
        if hasattr(item, "leaves"):
            return " ".join(str(token) for token, _ in item.leaves())
        return str(item[0])

    middle = [as_text(item) for item in rel[1:-1]]
    print(" ".join([rel[0][1], *middle, rel[-1][1]]))

In [None]:
# Plain-text listing of every collected relation, one per line.
for relation in relations:
    print_relation(relation)