In [1]:
import spacy
# Load spaCy's "nl_core_news_lg" pipeline (the model package must be installed separately).
# NOTE(review): `nlp` is not referenced by any of the visible cells below - confirm it is still needed.
nlp = spacy.load("nl_core_news_lg")

### Importing spaCy and loading texts and annotations

In [2]:

from tqdm import tqdm

import gzip
import os

import random

import re
import matplotlib.pyplot as plt
import seaborn as sns
import pandas

# Directory holding the transcribed texts (.txt); relative to the notebook's working directory.
corpus = "./Corpus/"
# Directory holding the BRAT stand-off annotation files (.ann) that belong to those texts.
anns = "./Annotations/"

def load(file, corpus, lines=False, encoding=None):
    """Read one file from a corpus directory.

    Parameters
    ----------
    file : str
        File name, relative to ``corpus``.
    corpus : str
        Directory that contains ``file``.
    lines : bool, default False
        If True, return a list of lines (line endings kept); otherwise
        return the whole file content as a single string.
    encoding : str or None, default None
        Text encoding passed to ``open``. ``None`` keeps the previous
        behaviour (platform default); pass e.g. ``"utf-8"`` to read the
        files identically on every machine.

    Returns
    -------
    list[str] or str
    """
    with open(os.path.join(corpus, file), "r", encoding=encoding) as handle:
        return list(handle) if lines else handle.read()

### Reading BRAT annotations (tab-separated `.ann` files)

In [3]:

def read_csv(file, corpus=anns):
    """Yield the text-bound ("T...") annotations of one BRAT .ann file.

    Each line is split on tabs. For lines whose first column starts with "T",
    a 5-tuple of strings is yielded:
    ``(annotation_id, entity_type, start, end, surface_text)``.

    The second column is split on whitespace; ``start`` is its second field
    and ``end`` its last field, so a discontinuous span such as
    ``"Person 42 54;55 61"`` is reported from its first start to its last end.

    The first line of the file is printed as a quick look at the BRAT format.
    """
    ann_lines = load(file, corpus, lines=True)

    for line_no, raw_line in enumerate(ann_lines):
        if line_no == 0:
            print("first line:", raw_line) #to see brat format

        columns = raw_line.split("\t")
        ann_id = columns[0]
        if not ann_id.startswith("T"):
            # Only lines whose id starts with "T" are kept.
            continue

        surface_text = columns[-1].strip()
        type_and_offsets = columns[1].split()

        yield (
            ann_id,
            type_and_offsets[0],
            type_and_offsets[1],
            type_and_offsets[-1],
            surface_text,
        )
            
def filter_rows(rows):
    """Yield only the rows whose entity type is WOMEN, INDIGENOUS, MEN or GPE.

    The entity type is read from ``row[1]`` and compared case-insensitively.
    """
    # An earlier variant of this filter also kept PERSON and ORG.
    wanted_types = ("WOMEN", "INDIGENOUS", "MEN", "GPE")
    yield from (row for row in rows if row[1].upper() in wanted_types)
        
def change_rows(rows):
    """Convert 5-field annotation rows into ``(start, end, ENTITY_TYPE, label)``.

    Each input row is ``(annotation_id, entity_type, start, end, label)``.
    The annotation id is dropped, the offsets are converted to ``int`` and
    the entity type is upper-cased.
    """
    for _annotation_id, kind, begin, finish, surface in rows:
        yield int(begin), int(finish), kind.upper(), surface


In [7]:
# Maps each corpus text file name to its list of
# (start, end, ENTITY_TYPE, surface_text) tuples taken from the matching .ann file.
data = {}

for ann_f in os.listdir(anns):
    if not ann_f.endswith('.ann'):
        continue

    # Empty annotation files carry no information; report and skip them.
    if len(load(ann_f, anns)) == 0:
        print("annotation file empty")
        continue

    # Drop only the extension. str.strip(".ann") removes any leading/trailing
    # '.', 'a' or 'n' characters, which would mangle stems such as "anna.ann".
    cur_name = os.path.splitext(ann_f)[0]
    txt_f = cur_name + ".txt"

    # Only keep annotations whose text file exists in the corpus directory.
    if not os.path.isfile(os.path.join(corpus, txt_f)):
        print("no file", txt_f)
        continue

    brat_entity_list = list(change_rows(filter_rows(read_csv(ann_f, anns))))
    data[txt_f] = brat_entity_list

annotation file empty
annotation file empty
annotation file empty
first line: T1	Person 528 546	Abraham Salbindusz

annotation file empty
first line: T1	Person 42 54;55 61	Frans Willen waltig

annotation file empty
first line: T1	Group 59 63	volk

annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
anno

annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
annotation file empty
first line: T1	Person 134 150	P: van Lelijveld

first line: T1	Person 4 19	Matthem Welborn

first line: T1	Person 51 73	Iohannes van den Bergh

annotation file empty
first line: T1	GPE 1 8	Batavia

first line: T1	Person 1687 1699	Alida Elders

first line: T1	Person 142 154	Darid Jansz:

annotation file empty
first line: T1	Person 1145 1156	David Jansz

first line: T1	Person 297 308	p: Blomherk

annotation file empty
first line: T1	Person 68 86	Abraham van Zenden

annotation file empty
firs

In [8]:
# Show the mapping: text file name -> list of (start, end, ENTITY_TYPE, surface_text) tuples.
data

{'NL-HaNA_1.04.02_6847_0758.txt': [(1005, 1009, 'MEN', 'Heer'),
  (1118, 1125, 'GPE', 'Batavia'),
  (1207, 1217, 'WOMEN', 'huifsvrouw'),
  (1218, 1223, 'WOMEN', 'mejuf'),
  (1182, 1186, 'MEN', 'Heer')],
 'NL-HaNA_1.04.02_6847_0764.txt': [(149, 158, 'WOMEN', 'degterlje'),
  (397, 403, 'GPE', 'Europa'),
  (1142, 1151, 'WOMEN', 'dogtertje'),
  (1276, 1284, 'WOMEN', 'Juffrouw')],
 'NL-HaNA_1.04.02_6847_0016.txt': [(894, 911, 'GPE', 'nederlands Jordia'),
  (940, 946, 'GPE', 'batera'),
  (1791, 1800, 'WOMEN', 'Erfgename'),
  (1824, 1830, 'INDIGENOUS', 'slaaff'),
  (1844, 1852, 'INDIGENOUS', 'Slavinne'),
  (1844, 1852, 'WOMEN', 'Slavinne'),
  (1924, 1931, 'INDIGENOUS', 'slaeven')],
 'NL-HaNA_1.04.02_6847_0017.txt': [(122, 128, 'WOMEN', 'dogter'),
  (262, 265, 'MEN', 'hij'),
  (429, 434, 'WOMEN', 'haker'),
  (573, 579, 'WOMEN', 'Jnsf=m'),
  (606, 610, 'WOMEN', 'wed:'),
  (727, 730, 'MEN', 'zyn'),
  (1442, 1447, 'MEN', 'mons:')],
 'NL-HaNA_1.04.02_6847_0765.txt': [(312, 321, 'GPE', 'Amsterdam')

### Terms for men, women, indigenous people and places (GPE)


In [9]:
def get_terms(data, group):
    """Yield the label (``row[3]``) of every annotation whose type (``row[2]``) equals ``group``.

    ``data`` maps a file name to a list of annotation rows; labels are yielded
    in dictionary order and may repeat.
    """
    for annotations in data.values():
        yield from (entry[3] for entry in annotations if entry[2] == group)
                
# Distinct annotated surface forms per entity type, in the order MEN, WOMEN, INDIGENOUS, GPE.
terms_men, terms_women, terms_indi, terms_gpe = (
    set(get_terms(data, entity_type))
    for entity_type in ("MEN", "WOMEN", "INDIGENOUS", "GPE")
)

In [10]:
# Inspect the distinct surface forms annotated as MEN (a set, so the print order is arbitrary).
print(terms_men)

{'M:', 'den E: E: Heer', 'Jonge', 'M=l', 'Den Manh:', 'agtb: h=r', 'haer wel Edele groot agtb', 'Zoon', 'de wel Ed:e gestr Heer en Heeren Comparanten', 'mede Exeiuteuren', 'den otaris', 'heeren weesmeesteren', 'm:e', 'M=r', 'Swager', 'van heeren', 'Eerw: Heer', 'Commandeur', 'scheeps Corporaal', 'M', 'doopzoon', 'S„r', 'Clercq', 'zyn', 'klerken', 'Den Heer', 'Den wel Edele vestrenge Heer M„r', 'koetsier', 'm:', 'Executours', 'heere', 'orgamnst', 'Eerw: heeren', 'haer', 'Clercq ter secratarije van Iustitie', 'Agtb: h=r', 'adjunct geswore klerk', 'keurmeester', 'M„', 'broeders', 'Koopman', 'De wel Edele Heer', 'president in het Eerw', 'hij', 'Notaris publicq', 'Iongens', 'den wel Ed: Heer', 'Commandeur en Equi„ pagiemeeste', 'den heer', 'den wel Edele Heer', 'oud gouverneur', 'notaris', 'Secretaris', 'Schoon zoon', 'sijn', 'notaris publicq', 'dE', 'quartiermeester', 'Den E', 'Attestor', 'wager', 'Ed„e gestren Edelens', 'De Wel Edele groot Agtb: Heeren', 'Clercquen als getuijgen', 'mijne 

In [11]:
# Inspect the distinct surface forms annotated as WOMEN (a set, so the print order is arbitrary).
# Open question from the author: should terms such as "haar", "testatrice" or "zij" be skipped?
print(terms_women) #skip haar? testatrice? zij?

{'Stariynen', 'lavenne', 'dogte', 'ngt', 'voogdesje', 'meid', 'Jns=m', 'Meju„ „frouw', 'Iufrouw', 'mepnst=', 'bagter', 'zuster', 'mesust=r', 'dogtertje', 'huijsv:r', 'Testatrice', 'aogter', 'huijs„ vrouw', 'dogters', 'dogter', 'gedagdesse', 'Enlandsche Crietene', 'Excutrice', 'Mejuff=r', 'Just=', 'Jntt=e', 'Pijs=m', 'zuiders', 'me juw=m', 'doop dogter', 'juff=en', 'wed:e', 'Erfgenaame', 'slavinnen', 'huijs vrouw', 'Hluste', 'slavinne', 'Zuster', 'vrije ontristen vrouw', 'huisv', 'Mejuff:r', 'Jongste dogter', 'sla „kinnen', 'Jnst=m', 'Jop=m', 'Juff=re', 'machome', 'puysvrouw', 'Iuff=m', 'moeder', 'wijf', 'testatrice', 'me jnst=m', 'vrije Cristen vion', 'moeden', 'doop dogtertje', 'haker', 'erfgenaame', 'hogter', 'huisv:', 'wij', 'Juff„', 'Jnsf=m', 'wij/', 'Sister', 'Jnst=r', 'Memss=', 'Iuffrouw', 'wede', 'M„r', 'mejn p=m', 'huijsv:', 'Jusf=t', 'moederlijke', 'Susters', 'natuurlijke dogter', 'me jusf=r', 'vrije fristen vrouw', 'wed', 'huis vrouw', 'suester', 'Ins=m', 'mejnst=m', 'Suster'

In [None]:
# Inspect the distinct surface forms annotated as INDIGENOUS (a set, so the print order is arbitrary).
print(terms_indi)

In [None]:
# Surface forms that were annotated both as WOMEN and as INDIGENOUS (set intersection).
terms_indi_women = terms_women & terms_indi


In [None]:
# Inspect the distinct surface forms annotated as GPE (a set, so the print order is arbitrary).
print(terms_gpe)

### Creating an index of mentions of women and indigenous people (test corpus of 126 files)



In [None]:
# Directory with the test corpus; every .txt file in it is read fully into memory.
test_corpus = "./TestCorpus"
# Maps file name -> full text of that file.
docs = {
    fname: load(fname, test_corpus)
    for fname in os.listdir(test_corpus)
    if fname.endswith(".txt")
}

In [None]:
def build_group_regex(term_set):
    """Build a regex that matches any of the given terms as a whitespace-delimited token.

    Parameters
    ----------
    term_set : iterable of str
        Literal terms to search for; they are escaped, so regex
        metacharacters in a term (e.g. ``"wed:"`` or ``"wij/"``) are matched
        literally. Empty terms are ignored.

    Returns
    -------
    str
        A pattern of the form ``(?<!\\S)(?:t1|t2|...)(?!\\S)``. A term matches
        only when it is preceded by whitespace or the start of the text and
        followed by whitespace or the end of the text. If no usable terms are
        given, a pattern that never matches is returned.

    Notes
    -----
    The boundaries are zero-width lookarounds rather than consumed ``\\s``
    characters, so:

    * a term at the very start or end of the text is found;
    * two terms separated by a single space are both found (the separator is
      not swallowed by the first match);
    * a preceding literal ``+``, ``|`` or ``^`` no longer counts as a
      boundary (the previous ``[\\s+|^]`` was a character class, not an
      alternation).
    """
    # Longest terms first so that a longer term is tried before any shorter
    # term it starts with; ties are broken alphabetically so the pattern is
    # deterministic regardless of set iteration order.
    ordered_terms = sorted(
        (term for term in term_set if term),
        key=lambda term: (-len(term), term),
    )
    if not ordered_terms:
        # An empty alternation would match the empty string at every position.
        return r"(?!)"

    alternatives = "|".join(re.escape(term) for term in ordered_terms)
    return r"(?<!\S)(?:" + alternatives + r")(?!\S)"

def get_mentions(text, group_regex):
    """Yield ``(start_index, term)`` for every match of ``group_regex`` in ``text``.

    The matched string is stripped of surrounding whitespace, and the start
    index is moved forward by the amount of leading whitespace that was
    removed, so it points at the first character of the returned term.
    """
    for match in re.finditer(group_regex, text):
        matched = match.group()
        leading_ws = len(matched) - len(matched.lstrip())
        yield match.start() + leading_ws, matched.strip()


def get_group_records(docs, group_terms):
    """Yield ``(filename, start_index, term)`` for every term mention in every document.

    ``docs`` maps file names to their text. The search pattern is built once
    from ``group_terms`` and then applied to each document in turn.
    """
    pattern = build_group_regex(group_terms)

    for fname, txt in docs.items():
        yield from ((fname, position, term) for position, term in get_mentions(txt, pattern))

In [None]:
import pandas as pd

# Mention records (filename, start index, matched term) for each term group,
# in the order: women, men, indigenous, indigenous women.
women_records, men_records, indi_records, indi_women_records = (
    list(get_group_records(docs, group_terms))
    for group_terms in (terms_women, terms_men, terms_indi, terms_indi_women)
)

# One DataFrame per group, all with the same three columns.
women_df, men_df, indi_df, indi_women_df = (
    pd.DataFrame.from_records(records, columns=["filename", "start_index", "identifier_term"])
    for records in (women_records, men_records, indi_records, indi_women_records)
)

In [None]:
# Display the women mentions table (columns: filename, start_index, identifier_term).
women_df

In [None]:
# Write the women mentions to ppl.xlsx (sheet "women").
# NOTE(review): a later cell opens 'ppl.xlsx' with pd.ExcelWriter and writes a "women" sheet again,
# which appears to replace this file - confirm whether this cell is still needed.
women_df.to_excel("ppl.xlsx", sheet_name = "women")

In [None]:
# Display the indigenous mentions table (columns: filename, start_index, identifier_term).
indi_df

In [None]:
# Display mentions of terms that were annotated both as WOMEN and as INDIGENOUS.
indi_women_df

In [None]:
# Export the mention tables to one workbook, one sheet per group.
# The men table is not part of this export.
with pd.ExcelWriter('ppl.xlsx') as writer:
    for sheet, frame in (
        ("indigenous", indi_df),
        ("women", women_df),
        ("indigenous women", indi_women_df),
    ):
        frame.to_excel(writer, sheet_name=sheet)

## Statistics of (Test) Corpus

### Statistics Women

In [None]:
# Share of test-corpus documents in which at least one women term was matched.
# Formatted with the % format spec so the printed value really is a percentage
# (the raw ratio is a 0-1 fraction).
women_doc_share = women_df["filename"].nunique() / len(docs)
print(f"Percentage of docs with at least one mention of a woman: {women_doc_share:.1%}")

# Number of matched women terms per document. Only documents with at least one
# match appear here, so the mean / std. dev. are over those documents only.
women_per_doc = women_df.groupby("filename").size()
print("Mean and std. dev. number of women mentions per document: ", women_per_doc.mean(), women_per_doc.std())

In [None]:
# Share of test-corpus documents in which at least one men term was matched.
# Formatted with the % format spec so the printed value really is a percentage
# (the raw ratio is a 0-1 fraction).
men_doc_share = men_df["filename"].nunique() / len(docs)
print(f"Percentage of docs with at least one mention of a man: {men_doc_share:.1%}") #no qualifiers?

# Number of matched men terms per document. Only documents with at least one
# match appear here, so the mean / std. dev. are over those documents only.
men_per_doc = men_df.groupby("filename").size()
print("Mean and std. dev. number of men mentions per document: ", men_per_doc.mean(), men_per_doc.std())

In [None]:
import seaborn as sns

# Set the figure size BEFORE plotting: rc parameters only affect figures that
# are created afterwards, so calling sns.set() after histplot() left the first
# run of this cell at the default size. Note that sns.set() also applies the
# seaborn theme to all later plots.
sns.set(rc={'figure.figsize':(10, 8)})

# One bar per distinct identifier term, counting how often it was matched.
g = sns.histplot(women_df, x="identifier_term")
plt.xticks(rotation=90)  # terms are long; rotate so they stay readable
plt.title("Histogram of usage of identifier terms for women")


In [None]:
# Number of matched men terms per document (one entry per file that has at least one match).
men_per_doc = men_df.groupby("filename").apply(lambda df: df.shape[0])
# `women_per_doc` is NOT recomputed here (the line below is commented out); this cell relies on
# the variable already existing from the women statistics cell.
#women_per_doc = women_df.groupby("filename").apply(lambda df: df.shape[0])

# Hexbin joint plot of women vs. men mentions per document.
# NOTE(review): both Series are indexed by filename and need not cover the same documents -
# confirm how seaborn pairs the values and whether documents missing from either Series are dropped.
sns.jointplot(x=women_per_doc, y=men_per_doc, kind="hex")
# Axis labels are set through the pyplot state machine, i.e. on whichever axes is current after jointplot().
_ = plt.xlabel("Women per Doc")
_ = plt.ylabel("Men per Doc")

In [None]:
#for x in women_df.iterrows():
    #print(x[1].filename)
    #print("----")

### Statistics Indigenous

In [None]:
# Share of test-corpus documents in which at least one indigenous term was matched.
# Formatted with the % format spec so the printed value really is a percentage
# (the raw ratio is a 0-1 fraction).
indi_doc_share = indi_df["filename"].nunique() / len(docs)
print(f"Percentage of docs with at least one mention of an indigenous person: {indi_doc_share:.1%}")

# Number of matched indigenous terms per document. Only documents with at least
# one match appear here, so the mean / std. dev. are over those documents only.
indi_per_doc = indi_df.groupby("filename").size()
print("Mean and std. dev. number of indigenous peoples mentions per document: ", indi_per_doc.mean(), indi_per_doc.std())

In [None]:
import seaborn as sns

# Set the figure size BEFORE plotting: rc parameters only affect figures that
# are created afterwards, so calling sns.set() after histplot() does not
# resize the figure that was just drawn. Note that sns.set() also applies the
# seaborn theme to all later plots.
sns.set(rc={'figure.figsize':(10,8)})

# One bar per distinct identifier term, counting how often it was matched.
g = sns.histplot(indi_df, x="identifier_term")
plt.xticks(rotation=90)  # terms are long; rotate so they stay readable
plt.title("Histogram of usage of identifier terms for indigenous people")
