In [1]:
%load_ext autoreload
%autoreload 2

from os import path, listdir
import urllib.request
import json
from collections import Counter
import copy

from tqdm import tqdm_notebook as tqdm

import spacy

import standoffconverter
import geo_helper
import iso3166

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from lxml import etree

import ELTeC_dataloader

ModuleNotFoundError: No module named 'geopandas'

In [None]:
# Materialise whatever the project data loader yields; the entries are used
# as path strings below (presumably absolute paths -- verify in ELTeC_dataloader).
file_names = list(ELTeC_dataloader.get_file_descriptors())

# Corpus root: everything in the first entry up to and including "/ELTeC-fra/".
IN_DIR = file_names[0].partition("/ELTeC-fra/")[0] + "/ELTeC-fra/"
# Keep only the part of each entry that follows the corpus root.
file_names = [fn.split(IN_DIR)[1] for fn in file_names]

# spaCy & TEI

## Standoff Converter

In [None]:
def chunks(l, n):
    """Lazily split ``l`` into consecutive slices of length ``n``.

    Slices are produced in order; the final one is shorter when ``len(l)``
    is not a multiple of ``n``. ``l`` must support ``len()`` and slicing.
    """
    yield from (l[offset:offset + n] for offset in range(0, len(l), n))
from IPython.display import HTML, display
import tabulate

# Show the file names as an HTML table with six names per row, ordered by
# the second underscore-separated field of each name.
names_sorted = sorted(file_names, key=lambda x: x.split("_")[1])
table_html = tabulate.tabulate(chunks(names_sorted, 6), tablefmt='html')
display(HTML(table_html))

In [None]:
# Files (relative to IN_DIR) that are loaded and processed by this notebook.
CHOSEN_XMLS = [
    "FRA0037_Erckmann.xml",
    "FRA0027_Erckmann.xml",
    "FRA0038_Erckmann.xml"
]

# One Standoff object per chosen file, in the same order as CHOSEN_XMLS.
standoffs = []

for fn in tqdm(CHOSEN_XMLS):
    # Read through a context manager so the file handle is closed right away
    # instead of staying open until garbage collection.
    with open(path.join(IN_DIR, fn), "rb") as xml_file:
        tree = etree.fromstring(xml_file.read())
    so = standoffconverter.Standoff()
    so.from_lxml_tree(tree)
    standoffs.append(so)


In [None]:
# Load the spaCy pipeline named "fr_core_news_sm".
nlp = spacy.load("fr_core_news_sm")
# Set the pipeline's character limit to the length of the longest plain text
# among the loaded documents.
nlp.max_length = max(len(so.plain) for so in standoffs)

## Adding extensions

In [None]:
# Register a document-level custom attribute `author` (default None).
# force=True lets the cell be re-run without an "already registered" error.
spacy.tokens.Doc.set_extension(
    'author',
    default=None,
    force=True,
)

In [None]:
# Run the spaCy pipeline on every document's plain text and attach the
# author taken from the first TEI <author> annotation, if there is one.
docs = []
for so in tqdm(standoffs):

    doc = nlp(so.plain)

    # Reset for every document: without this, a text lacking an <author>
    # element would raise NameError (first document) or silently inherit
    # the author of the previous document.
    author = None
    author_it = [
        x for x in so.standoffs
        if x["tag"] == "{http://www.tei-c.org/ns/1.0}author"
    ]
    if len(author_it) > 0:
        first_author = author_it[0]
        author = so.plain[first_author["begin"]:first_author["end"]]

    doc._.set("author", author)
    docs.append(doc)

In [None]:
# Inspect the custom `author` attribute stored on the first processed document.
docs[0]._.author

## Automated Markup: NER

In [None]:
# Collect one record per LOC entity that contains at least one proper noun.
# The query string is the space-joined lemmas of the entity's PROPN tokens.
locations = []
for idoc, doc in enumerate(docs):
    for ent in doc.ents:
        if ent.label_ != "LOC":
            continue
        tpl = tuple(t.lemma_ for t in ent if t.pos_ == "PROPN")
        if not tpl:
            # No proper noun inside the entity: nothing to look up.
            continue
        locations.append({
            "idoc": idoc,
            "start": ent.start,
            "end": ent.end,
            # Joining a single-element tuple yields that element unchanged.
            "query_str": " ".join(tpl),
        })

In [None]:
# Build the frame with an explicit column list: this fixes the column order
# and still yields the expected (empty) columns when no LOC entity was
# collected, where selecting the columns afterwards would raise KeyError.
locations = pd.DataFrame(locations, columns=["idoc", "start", "end", "query_str"])

# Number of mentions per query string.
cnt = Counter(locations["query_str"])

## Enriching entities with geo data

In [None]:
# Start with no country for any mention; only the ten most frequent query
# strings are looked up below.
locations["country"] = None
# Mentions summed per lookup result. The key may be None (a later cell
# removes that key), presumably when the helper cannot resolve the string.
cnt_iso_a3 = {}
for k, v in tqdm(cnt.most_common(10)):
    iso_k = geo_helper.get_iso_a3_of_str(k)
    cnt_iso_a3[iso_k] = cnt_iso_a3.get(iso_k, 0) + v
    same_query = locations.query_str == k
    locations.loc[same_query, "country"] = iso_k

In [None]:
# Show up to ten random mentions that received a country. Capping the sample
# size avoids the ValueError that sample(10) raises on fewer than ten rows.
resolved_locations = locations[locations.country.notnull()]
resolved_locations.sample(min(10, len(resolved_locations)))

In [None]:
# Discard the bucket keyed by None before plotting.
cnt_iso_a3.pop(None, None)

# Split the mapping into parallel arrays of keys and counts.
labels, values = (np.array(column) for column in zip(*cnt_iso_a3.items()))

# Order by descending count and keep at most the top 15.
sorter = np.argsort(values)[::-1]
labels, values = labels[sorter][:15], values[sorter][:15]

plt.figure(figsize=[10,8])
plt.title("Number of mentions of places grouped by country of destination.")
bar_positions = range(len(values))
plt.bar(bar_positions, values)
plt.ylabel("# mentions")
# Label each bar with the country name that iso3166 returns for its key.
country_names = [iso3166.countries.get(code).name for code in labels]
_ = plt.xticks(bar_positions, country_names, rotation=90)

In [None]:
# Pass the per-country mention counts to the project's plotting helper
# (presumably draws a map -- see geo_helper for what is rendered).
geo_helper.plot_countries(cnt_iso_a3)

## Span-level custom attributes

In [None]:
# Register a span-level custom attribute for the country of an entity.
# force=True allows re-running the cell.
spacy.tokens.Span.set_extension('country_of_entity', default=None, force=True)

locations = pd.DataFrame(locations)

for idoc, doc in enumerate(docs):
    in_this_doc = locations.idoc == idoc
    has_country = ~locations.country.isnull()
    # Mentions of this document that have a country.
    clocations = locations[np.logical_and(has_country, in_this_doc)]

    # Push those mentions into the document's standoff object as "location"
    # annotations carrying the country code as attribute.
    so = standoffs[idoc]
    entity_starts = clocations.start.tolist()
    entity_labels = ["location"] * len(clocations)
    entity_attrs = [{"iso_a3": a3} for a3 in clocations.country]
    so.add_spacy_annotations(doc, entity_starts, entity_labels, entity_attrs)

    # Every mention of this document (with or without a country) gets the
    # span attribute; unresolved ones are set to None.
    for _, loc in locations[in_this_doc].iterrows():
        doc[loc.start:loc.end]._.country_of_entity = loc.country
        
        

In [None]:
# NOTE: `so` is not defined in this cell. It is whatever the most recently
# executed loop over the standoffs left bound, so only that single document
# is serialized here (the last one after a top-to-bottom run).
str_ = so.to_xml()

In [None]:
# Print only the first 10,000 characters of the serialized result to keep
# the cell output short.
print(str_[:10000])