In [None]:
%matplotlib inline
import networkx as nx
import matplotlib.pyplot as plt
import nltk

# Network of relations

In [None]:
import wikipediaapi
from string import punctuation
from itertools import combinations
import nltk
import numpy as np
from IPython.display import display, clear_output

## Read in the Civil War corpus

In [None]:
import wikipediaapi
# Titles of the Wikipedia articles that make up the corpus. Each title is
# converted to an underscored page name and downloaded in the loop below.
pages = [
    "American Civil War",
    "Abraham Lincoln",
    "Slavery in the United States",
    "Slave states and free states",
    "Emancipation Proclamation",
    "Robert E. Lee",
    "Ulysses S. Grant",
    "Conclusion of the American Civil War",
    "Origins of the American Civil War",
    "Issues of the American Civil War"
]
import re

def underscorize(pagename):
    """Return ``pagename`` with every space replaced by an underscore."""
    return pagename.replace(" ", "_")

# Client for the English-language Wikipedia, using the WIKI extract format.
wiki_wiki = wikipediaapi.Wikipedia(language='en', extract_format=wikipediaapi.ExtractFormat.WIKI)

# Maps each underscored page name to the list of paragraphs in that article.
page_dict = {}
for page in pages:
    pagename = underscorize(page)
    print(pagename)
    article_lines = wiki_wiki.page(pagename).text.split("\n")
    # Keep only lines longer than one character (drops blank lines).
    page_dict[pagename] = list(filter(lambda line: len(line) > 1, article_lines))

## Extract the named entities

Tag every word with its part of speech.

In [None]:
def tag_paragraph_sentences(para):
    """Split ``para`` into sentences and part-of-speech tag each one.

    Returns a list with one entry per sentence; each entry is the result of
    ``nltk.pos_tag`` applied to that sentence's word tokens.
    """
    return [
        nltk.pos_tag(nltk.word_tokenize(sentence))
        for sentence in nltk.sent_tokenize(para)
    ]
# Pool the tagged sentences of every paragraph of every downloaded page into
# one flat list, preserving page and paragraph order.
tagged_sentences = [
    tagged
    for page in page_dict.values()
    for para in page
    for tagged in tag_paragraph_sentences(para)
]

Identify the named-entity chunks in each sentence using NLTK's named-entity chunker.

In [None]:
# Run the named-entity chunker over every tagged sentence, refreshing a
# single progress line once every 500 sentences.
total_sentences = len(tagged_sentences)
chunked_sentences = []
for index, tagged in enumerate(tagged_sentences):
    if index % 500 == 0:
        clear_output(wait=True)
        print('Sentence {} of {}'.format(index, total_sentences))
    chunked_sentences.append(nltk.ne_chunk(tagged))
clear_output(wait=True)
print("done")

In [None]:
def entity_to_tuple(t):
    """Convert an entity subtree into a ``(label, text)`` tuple.

    ``label`` is ``t.label()`` and ``text`` is the subtree's leaf tokens
    joined by single spaces (the part-of-speech tags are discarded).
    """
    label = t.label()
    words = (token for token, _pos in t.leaves())
    return (label, " ".join(words))

def extract_entities(chunked_sentence):
    """Collect the named entities found in one chunked sentence.

    Parameters
    ----------
    chunked_sentence : iterable
        The children of a chunked sentence (in this notebook, the result of
        ``nltk.ne_chunk``). Children that are ``nltk.Tree`` instances are
        treated as entities; every other child is skipped.

    Returns
    -------
    list of tuple
        One ``entity_to_tuple`` result per entity subtree, in sentence order.
    """
    # isinstance (rather than an exact type comparison) also accepts Tree
    # subclasses, so entities are not silently dropped for derived tree types.
    return [
        entity_to_tuple(child)
        for child in chunked_sentence
        if isinstance(child, nltk.Tree)
    ]

def tuple_to_string(t):
    """Build an identifier of the form ``"<text>_<label>"``.

    ``t`` is indexed as ``(label, text)``; the text comes first in the result.
    """
    label, text = t[0], t[1]
    return "_".join((text, label))

In [None]:
# Spot-check: show the entities extracted from the first chunked sentence.
extract_entities(chunked_sentences[0])

Find the most common entities.

Also build a list that holds, for each sentence, the entities found in that sentence.

In [None]:
# Entities found in each sentence, one inner list per sentence, in order.
ents_list = [extract_entities(chunked) for chunked in chunked_sentences]

# Frequency of every entity tuple across all sentences.
entity_fdist = nltk.FreqDist(
    entity for sentence_ents in ents_list for entity in sentence_ents
)

In [None]:
# Preview the 25 most frequent entities together with their counts.
entity_fdist.most_common(25)

## Build the adjacency matrix

Our "vocabulary" will consist of the most common entities.

Then we'll compute a vector for each sentence. Each element of the vector is the number of times the corresponding vocabulary entity appears in that sentence.

We'll use the vectors to build a term × document matrix (one row per entity, one column per sentence). Multiplying that matrix by its transpose gives a term × term matrix, which we'll use as our adjacency matrix.

In [None]:
def compute_vector(words, vocab):
    """Count how often each vocabulary item occurs in ``words``.

    Returns a NumPy array with one entry per item of ``vocab`` (in the same
    order), where each entry is ``words.count(item)``.
    """
    return np.array([words.count(term) for term in vocab])

In [None]:
# Vocabulary: the `vocab_size` most frequent entities, as (label, text) tuples.
vocab_size = 100
vocab = [w[0] for w in entity_fdist.most_common(vocab_size)]
N = len(ents_list)  # number of sentences, i.e. columns of the matrix below

# One count vector per sentence, measured against the vocabulary.
# (The original referenced an undefined `medium_vocab`, which raised a
# NameError on a fresh kernel; `vocab` is the list defined above.)
entity_vector_list = [compute_vector(ents, vocab) for ents in ents_list]

# Term x document matrix: row = vocabulary entity, column = sentence.
td_matrix = np.zeros([len(vocab), N])
for i, entity_vector in enumerate(entity_vector_list):
    td_matrix[:, i] = entity_vector

# Term x term matrix: entry (r, c) sums, over all sentences, the product of
# the counts of entities r and c, so it is non-zero only when the two
# entities appear together in at least one sentence.
tt_matrix = np.dot(td_matrix, td_matrix.transpose())

In [None]:
def build_weighted_graph_from_Aij(am, vocab):
    """Turn an adjacency matrix into a weighted, undirected networkx graph.

    Every entry of ``vocab`` becomes a node whose id is
    ``tuple_to_string(entry)`` and whose ``type`` and ``name`` attributes are
    ``entry[0]`` and ``entry[1]``. Each non-zero off-diagonal cell
    ``am[r][c]`` becomes an edge between the nodes for positions ``r`` and
    ``c`` with that cell's value as its ``weight``. Both ``(r, c)`` and
    ``(c, r)`` are visited, so for an asymmetric matrix the value written
    last is the one that remains on the edge.
    """
    graph = nx.Graph()
    node_ids = list(map(tuple_to_string, vocab))
    for node_id, entity in zip(node_ids, vocab):
        graph.add_node(node_id, type=entity[0], name=entity[1])

    size = len(am)
    for row in range(size):
        for col in range(size):
            if row == col:
                continue  # ignore self-loops on the diagonal
            weight = am[row][col]
            if weight != 0:
                graph.add_edge(node_ids[row], node_ids[col], weight=weight)
    return graph

In [None]:
# Build the entity graph from the term x term matrix and the vocabulary.
cw_g = build_weighted_graph_from_Aij(tt_matrix, vocab)

## Draw the network

Let's give different colors to each node depending on which type of entity it is.

In [None]:
# Label each node with the entity text stored on the node itself, instead of
# assuming that node order matches the order of `vocab`.
label_dict = {n: attrs["name"] for n, attrs in cw_g.nodes(data=True)}

# Edge widths are the edge weights scaled down by a factor of 5.
weights = [cw_g[u][v]['weight'] / 5 for u, v in cw_g.edges()]

# Node colour by entity type.
color_dict = {"PERSON": "blue",
              "GPE": "green",
              "ORGANIZATION": "red",
              "LOCATION": "orange",
              "GSP": "gray"}
# Fallback for entity types missing from color_dict, so an unlisted type
# no longer raises KeyError.
default_node_color = "purple"
node_colors = [
    color_dict.get(attrs["type"], default_node_color)
    for _, attrs in cw_g.nodes(data=True)
]

# Compute the layout once (it was previously recomputed for every node inside
# the colour loop). The fixed seed makes the figure reproducible across runs.
pos = nx.spring_layout(cw_g, k=2, seed=42)

plt.figure(figsize=(30, 30))
plt.axis('off')
nx.draw_networkx(cw_g, pos=pos, labels=label_dict, node_color=node_colors,
                 width=weights, alpha=.5, font_size=20, node_size=500)

## Export the network to a file

In [None]:
# Write the graph to "civil_war.gexf" (GEXF format); the path is relative to
# the current working directory.
nx.write_gexf(cw_g, "civil_war.gexf")