## The Culture of International Relations - Text Analysis
### <span style='color: green'>SETUP </span> Prepare and Setup Notebook <span style='float: right; color: red'>MANDATORY</span>

## <span style='color: green'>PREPARE </span> Load and Prepare Corpus <span style='float: right; color: red'>MANDATORY</span>


In [None]:
# Setup
%load_ext autoreload
%autoreload 2

import sys, os, collections, zipfile
import re, typing.re
import nltk, textacy, spacy 
import pandas as pd
import ipywidgets as widgets

sys.path = list(set(['.', '..']) - set(sys.path)) + sys.path

#import bokeh, bokeh.plotting, bokeh.models, 
import matplotlib.pyplot as plt
import common.utility as utility
import common.widgets_utility as widgets_utility
import common.widgets_config as widgets_config
import common.config as config
import common.utility as utility
import common.treaty_utility as treaty_utility
import common.treaty_state as treaty_repository
import treaty_corpus
#import types, glob
import textacy.keyterms

from beakerx.object import beakerx
from beakerx import *
from IPython.display import display, set_matplotlib_formats

# Notebook-wide logger; later cells report errors through it.
logger = utility.getLogger('corpus_text_analysis')

utility.setup_default_pd_display(pd)

# Shared constants used by the cells below.
DATA_FOLDER = '../data'
PATTERN = '*.txt'
PERIOD_GROUP = 'years_1945-1972'
# Tab-separated tagset table; missing values are replaced with empty strings.
DF_TAGSET = pd.read_csv('../data/tagset.csv', sep='\t').fillna('')
WTI_INDEX = treaty_repository.load_wti_index(data_folder=DATA_FOLDER)
TREATY_TIME_GROUPINGS = WTI_INDEX.get_treaty_time_groupings()

%matplotlib inline
# set_matplotlib_formats('svg')   
#bokeh.plotting.output_notebook()

# Accessors for the shared corpus container and its corpus.
# `textacy_utility` is imported in a later cell, so the name is only resolved
# when these lambdas are called, not when they are defined here.
current_corpus_container = lambda: textacy_utility.CorpusContainer.container()
current_corpus = lambda: textacy_utility.CorpusContainer.corpus()


## <span style='color: green'>PREPARE </span> Load and Prepare Corpus <span style='float: right; color: red'>MANDATORY</span>


In [None]:
import textacy_corpus_utility as textacy_utility
import textacy_corpus_gui
# Show the corpus loading GUI bound to the shared corpus container.
# Any exception is logged instead of being raised.
try:
    container = current_corpus_container()
    textacy_corpus_gui.display_corpus_load_gui(DATA_FOLDER, WTI_INDEX, container)
except Exception as ex:
    logger.error(ex)

## <span style='color: green;'>MODEL</span> Display Named Entities<span style='color: green; float: right'>SKIP</span>
spaCy NER. Note that the "ner" component must be enabled in the corpus pipeline.

In [4]:
# Display Named Entities
import gui_utility
import textacy_corpus_utility as textacy_utility
from spacy import displacy

def display_document_entities_gui(corpus, wti_index):
    """Show a treaty dropdown and render the entities of the selected treaty.

    The dropdown lists an 'All Treaties' entry (value None) followed by the
    options returned by `gui_utility.get_treaty_dropdown_options`. The second
    option, i.e. the first entry after 'All Treaties', is preselected.

    Args:
        corpus: Corpus passed on to `textacy_utility.get_treaty_doc`.
        wti_index: Index passed on to `gui_utility.get_treaty_dropdown_options`.
    """

    def display_document_entities(corpus, treaty_id):
        """Render displaCy entity markup for the document of `treaty_id`."""
        treaty_doc = textacy_utility.get_treaty_doc(corpus, treaty_id)
        displacy.render(treaty_doc.spacy_doc, style='ent', jupyter=True)

    document_options = [('All Treaties', None)] + gui_utility.get_treaty_dropdown_options(wti_index, corpus)
    preselected_id = document_options[1][1]

    treaty_dropdown = widgets.Dropdown(
        description='Treaty',
        options=document_options,
        value=preselected_id,
        layout=widgets.Layout(width='80%'),
    )

    # The keyword names must match the callback's parameter names.
    interaction = widgets.interactive(
        display_document_entities,
        corpus=widgets.fixed(corpus),
        treaty_id=treaty_dropdown,
    )

    # The last child of the interactive container is its output area.
    output_area = widgets.VBox(
        [interaction.children[-1]],
        layout=widgets.Layout(margin_top='20px', height='500px', width='100%'),
    )
    display(widgets.VBox([treaty_dropdown, output_area]))

    # Trigger an initial render for the preselected treaty.
    interaction.update()
    
# Open the named-entity viewer for the currently loaded corpus.
# Any exception is logged instead of being raised.
try:
    corpus = current_corpus()
    display_document_entities_gui(corpus, WTI_INDEX)
except Exception as ex:
    logger.error(ex)


VBox(children=(Dropdown(description='Treaty', index=1, layout=Layout(width='80%'), options=(('All Treaties', N…

## <span style='color: green;'>MODEL</span> TESTS - IGNORE EVERYTHING BELOW<span style='color: green; float: right'>SKIP</span>

In [None]:
# Smoke test: tag a short sentence through the CoreNLP server at localhost:9001
# to check that the server can be reached.
try:
    from nltk.parse import corenlp
    corenlp_tagger = corenlp.CoreNLPParser(url='http://localhost:9001', encoding='utf8', tagtype='ner')
    input_tokens = 'Stony Brook University in NY'.split()
    tagged_output = corenlp_tagger.tag(input_tokens)
    print('Stanford tagger is up and running!')
except Exception as ex:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt/SystemExit still propagate, and log the actual cause
    # next to the hint (the failure may be something other than a missing server).
    logger.error('Server not found! Please start Stanford CoreNLP Server!')
    logger.error(ex)
    

In [None]:
# Setup
%load_ext autoreload
%autoreload 2

import sys
import pandas as pd
import logging

sys.path = list(set(['.', '..']) - set(sys.path)) + sys.path

import textacy_corpus_utility as textacy_utility
import common.treaty_state as treaty_repository
import common.utility as utility
import common.config as config

from beakerx.object import beakerx
from beakerx import *
from IPython.display import display

logger = utility.getLogger('corpus_text_analysis')

import pickle
import topic_model
import topic_model_utility
import treaty_corpus

# `os` is needed for CORPUS_PATH below; this setup cell's own import list does
# not include it, so import it here to keep the cell runnable on its own.
import os

DATA_FOLDER = '../data'
LANGUAGE = 'en'

# Treaty index and the zipped, preprocessed text corpus inside DATA_FOLDER.
WTI_INDEX = treaty_repository.load_wti_index(data_folder=DATA_FOLDER)
CORPUS_PATH = os.path.join(DATA_FOLDER, "treaty_text_corpora_20181206_preprocessed.zip")

# Stream of corpus documents for the treaties available in LANGUAGE.
treaties = WTI_INDEX.get_treaties(language=LANGUAGE)
document_stream = treaty_corpus.get_document_stream(CORPUS_PATH, LANGUAGE, treaties)



In [None]:
import os
import io
import codecs
import time
import collections
import nltk.tag
from nltk.parse import corenlp
import nltk.tokenize.stanford as st
import re
import zipfile

def extract_entity_phrases(data, classes=('LOCATION', 'PERSON')):
    """Collect tagged words of the selected classes and join neighbours into phrases.

    Args:
        data: Iterable of (word, class) pairs in token order.
        classes: Container of class labels to keep, or None to keep every class.
            The default is an immutable tuple so that no mutable object is
            shared between calls.

    Returns:
        List of (phrase, class) tuples in order of appearance. Kept words that
        are directly adjacent in `data` and have the same class are merged
        into one space-separated phrase.
    """
    phrases = []
    previous_position = None  # position in `data` of the most recently kept word

    # Single forward pass; avoids repeated `del` on a list (quadratic worst case).
    for position, (word, wclass) in enumerate(data):
        if classes is not None and wclass not in classes:
            continue

        # A gap in positions means a filtered-out word lies in between, so the
        # phrase only continues when the previous kept word is the direct
        # predecessor and carries the same class.
        if phrases and previous_position == position - 1 and phrases[-1][1] == wclass:
            phrases[-1] = (phrases[-1][0] + " " + word, phrases[-1][1])
        else:
            phrases.append((word, wclass))
        previous_position = position

    return phrases

def create_ner_tagger(options):
    """Return a CoreNLP parser configured for NER tagging.

    Reads the server address from `options['server_url']`.
    """
    return corenlp.CoreNLPParser(
        url=options['server_url'],
        encoding='utf8',
        tagtype='ner',
    )

def create_tokenizer(options):
    """Return a CoreNLP parser without a tag type, used here for tokenizing.

    Reads the server address from `options['server_url']`.
    """
    return corenlp.CoreNLPParser(
        url=options['server_url'],
        encoding='utf8',
    )

def create_statistics(entities):
    """Count the occurrences of each item in `entities`.

    Returns:
        A `collections.Counter` mapping each item to its frequency.
    """
    return collections.Counter(entities)

def serialize_content(stats, filename, token_count):
    """Format the entity counts of one document as semicolon-separated lines.

    Args:
        stats: Mapping keyed by (word, class) whose values are counts.
        filename: Passed to `extract_document_info` to obtain the document
            name, treaty id and language.
        token_count: Value appended as the last field of every line.

    Returns:
        One line per (word, class) key with the fields document name, treaty
        id, language, word, class, count and token count joined by ';'. Lines
        are joined by newlines; the result is '' when `stats` is empty.
    """
    document_name, treaty_id, lang = extract_document_info(filename)
    rows = (
        (document_name, treaty_id, lang, word, wclass, stats[(word, wclass)], token_count)
        for (word, wclass) in stats
    )
    return '\n'.join(';'.join(str(field) for field in row) for row in rows)

def write_content(outfile, content):
    """Write `content` followed by a newline to `outfile`; skip empty content."""
    if content == '':
        return
    outfile.write(content)
    outfile.write('\n')
        
def recognize_entities(options):
    """Walk the treaty document stream selected by `options`, printing each treaty id.

    Args:
        options: Dict read for the keys 'server_url' (by the tokenizer and
            tagger factories), 'output_folder', 'source_path' and 'language'.

    NOTE(review): this looks unfinished. The tokenizer, tagger, output path
    and tag list are prepared but not yet used by the loop. They are kept
    because they appear to be scaffolding for the tagging step; confirm.
    Also relies on the module-level names `treaty_corpus` and `treaties`.
    """
    corenlp_tokenizer = create_tokenizer(options)
    corenlp_tagger = create_ner_tagger(options)

    # Timestamped CSV path inside the configured output folder.
    outfile = os.path.join(options['output_folder'], "output_" + time.strftime("%Y%m%d_%H%M%S") + ".csv")
    tags = [ 'NUMBER', 'LOCATION', 'DATE', 'MISC', 'ORGANIZATION', 'DURATION', 'SET', 'ORDINAL', 'PERSON' ]

    document_stream = treaty_corpus.get_document_stream(options['source_path'], options['language'], treaties)
    for treaty_id, language, filename, content in document_stream:
        # Print the id of the treaty being visited; the previous code printed
        # the literal text 'treaty_id' for every document.
        print(treaty_id)
        
# Settings for the entity-recognition run below. `recognize_entities` reads
# 'language', 'source_path' and 'output_folder'; the tokenizer/tagger
# factories read 'server_url'.
options = {
    "language": 'en',
    "source_path": "../data/treaty_text_corpora_20181206_preprocessed.zip",
    'server_url': 'http://localhost:9001',
    'output_folder': DATA_FOLDER,
}

recognize_entities(options)


In [None]:
# Gather the lower-cased form and the lemma of every alphabetic GPE token
# longer than one character, across all documents of the current corpus.
corpus = get_current_corpus().textacy_corpus
gpe = set()
for doc in corpus:
    candidates = [
        token for token in doc
        if len(token) > 1 and token.ent_type_ == 'GPE' and token.is_alpha
    ]
    gpe.update(token.lower_ for token in candidates)
    gpe.update(token.lemma_ for token in candidates)

# One row per distinct word; the sorted frame is the cell's displayed result.
df = pd.DataFrame({'word': list(gpe)})
df.sort_values('word')


In [None]:

# Scratch cell: tag every file of one or more zip archives with CoreNLP and
# write per-file entity statistics to an output file.
# NOTE(review): this cell does not run as written; see the notes below.
data_folder = '../data/'

# NOTE(review): `include_predicate` has a signature but no body, so the cell
# fails to parse at this point.
def include_predicate(filename, options):
    
options = {
    "language": 'en',
    "source_path": "treaty_text_corpora_20181206_preprocessed.zip",
    'server_url': 'http://localhost:9001',
    'output_folder': data_folder,
}

# NOTE(review): no `main` is defined in the visible cells; confirm where it comes from.
main(options)

# NOTE(review): `options` above has no "zip_sources" key, and `outfile`,
# `corenlp_tokenizer`, `corenlp_tagger` and `tags` are only assigned inside
# `recognize_entities` or other cells, not at module level in this cell.
for zip_source in options["zip_sources"]:
    with io.open(outfile, 'w', encoding='utf8') as o:
        with zipfile.ZipFile(zip_source) as pope_zip:
            for filename in pope_zip.namelist():
                with pope_zip.open(filename) as pope_file:
                    try:
                        # Decode, tokenize and tag the file, then merge
                        # adjacent entities, count them and append the
                        # serialized counts to the output file.
                        text = pope_file.read().decode("utf-8")
                        tokens = corenlp_tokenizer.tokenize(text)
                        data = corenlp_tagger.tag(tokens)
                        entities = extract_entity_phrases(data, tags)  # [ 'LOCATION', 'PERSON', 'ORGANIZATION' ])
                        statistics = create_statistics(entities)
                        content = serialize_content(statistics, filename, len(tokens))
                        write_content(o, content)
                    except Exception as ex:
                        # NOTE(review): the re-raise makes the print below
                        # unreachable; decide whether failures should abort
                        # the run or only be reported.
                        raise
                        print('Failed: ' + filename)

In [None]:
# Gather the lower-cased form and the lemma of every GPE token longer than
# one character, across all documents of the current corpus.
corpus = get_current_corpus().textacy_corpus
gpe = set()
for doc in corpus:
    candidates = [
        token for token in doc
        if len(token) > 1 and token.ent_type_ == 'GPE'
    ]
    gpe.update(token.lower_ for token in candidates)
    gpe.update(token.lemma_ for token in candidates)

# One row per distinct word; the sorted frame is the cell's displayed result.
df = pd.DataFrame({'word': list(gpe)})
df.sort_values('word')

In [None]:
# List the distinct entities of the first corpus document together with the
# entity types of their tokens, formatted as "<lower-cased text> / <types>".
corpus = [get_current_corpus().textacy_corpus[0]]
ents = set()
for doc in corpus:
    candidates = {
        span.lower_ + ' / ' + ' '.join(token.ent_type_ for token in span)
        for span in doc.spacy_doc.ents
        if span.text not in ('', ' ', '\n', '\t')  # skip empty/whitespace-only entities
    }
    ents |= candidates

df_ent = pd.DataFrame({'ent': list(ents)})
df_ent

In [None]:
import spacy
from spacy.tokens import Span

def _entity_tuples(spacy_doc):
    """Return (text, start_char, end_char, label) for each entity of `spacy_doc`."""
    return [(e.text, e.start_char, e.end_char, e.label_) for e in spacy_doc.ents]


# Example: add an entity by hand when the model does not find it.
nlp = spacy.load('en_core_web_sm')
doc = nlp(u"FB is hiring a new Vice President of global policy")
ents = _entity_tuples(doc)
print('Before', ents)
# "FB" was not recognised as an entity by the model.

ORG = doc.vocab.strings[u'ORG']  # hash value of the entity label
fb_ent = Span(doc, 0, 1, label=ORG)  # span covering the first token, labelled ORG
doc.ents = [*doc.ents, fb_ent]

ents = _entity_tuples(doc)
print('After', ents)
# Expected output per the original note: [(u'FB', 0, 2, 'ORG')]


In [None]:
# Settings handed to `textacy_utility.textacy_filter_terms` for each document.
args = dict(
    as_strings=True,
    named_entities=False,
    ngrams=[1, 2],
    normalize='lemma',
)
kwargs = dict(
    filter_punct=True,
    filter_stops=True,
    include_pos=('NOUN', 'PROPN'),
    min_freq=2,
)
tokenizer_args = dict(
    args=args,
    kwargs=kwargs,
    extra_stop_words={},
    mask_gpe=True,
    min_freq=2,
    max_doc_freq=0.80,
)
corpus = get_current_corpus().textacy_corpus


def fx_terms():
    """Return a fresh generator of filtered terms, one item per corpus document."""
    return (textacy_utility.textacy_filter_terms(doc, tokenizer_args) for doc in corpus)


terms = fx_terms()