In [91]:
import os
import re
import lxml
import pandas as pd
from bs4 import BeautifulSoup
import spacy
from spacy import displacy
nlp = spacy.load('en')

In [64]:
data_dir = os.path.join('thesis_data', 'vangogh')
clean_text = os.path.join('thesis_data', 'vangogh_clean_text')
gold_standard = os.path.join('thesis_data', 'vangogh_gold_standard')

There are 903 Van Gogh Letters.

NER Pipeline:

    1. Parse TEI tags using BeautifulSoup.
    2. Add beginning and ending character indices for entity tags.
    3. Extract raw text and save to vangogh_clean_text directory.
    4. Tag named entities using Stanford NER.
    5. Tag named entities using SpaCy.
    6. Calculate precision, recall, F-score, and ROC.

In [83]:
letters = os.listdir(data_dir)
letters = [x for x in letters if 'let' in x]

In [7]:
letter = letters[35]
with open(os.path.join(data_dir, letter), 'r') as f:
    content = f.read()
soup = BeautifulSoup(content, 'lxml')
translations = soup.findAll('div', {'type': 'translation'})
translation = translations[0]
translation_text = translation.text.strip()
print(translation.text.strip())

doc = nlp(translation_text)
displacy.render(doc, style='ent', jupyter=True)
html = displacy.render(doc, style='ent')

[Letterhead: Goupil Paris]

Paris, 13 August 1875

My dear Theo,
I had wanted to write to you earlier. I’m glad that Pa has accepted the call to Etten; under the circumstances I also think it good that Willemien is going along with Anna. I’d also have liked to be with all of you that Sunday at Helvoirt; have I already told you that I was with Soek and his family at Ville-d’Avray that day? I was surprised to find 3 paintings by Corot in the church there.
On Sunday last and Sunday a fortnight ago I went to Mr Mercier’s church and heard him speak on ‘all things work together for good to them that love God’ (in Dutch it says ‘for those who love God all things will work together for good’) and on ‘He created man in his own image’, it was glorious and grand. You should also go to church every Sunday if you can, even if it isn’t so very beautiful; 
 do that, you won’t regret it. Have you ever been to hear the Rev. Zubli?
In the list of what I have hanging in my room I forgot:

N. Maes  The na

In [105]:
letters = os.listdir(data_dir)
letters = [x for x in letters if 'let' in x]

for letter in letters:
    with open(os.path.join(data_dir, letter), 'r') as f:
        content = f.read()
        
    soup = BeautifulSoup(content, 'lxml')
    translations = soup.findAll('div', {'type': 'translation'})
    translation = translations[0]

    tag_types = []
    for child in translation.recursiveChildGenerator():
        if child.name:
            tag_types.append(child.name)
    to_remove = list(set(tag_types)-set(['ab', 'rs']))
    for tag in to_remove:
        for x in translation(tag):
            x.decompose()

    start_offsets = []
    end_offsets = []
    keys = []
    types = []
    entities = []
    ix = 0
    for text in translation('ab'):
        for tag in text('rs'):
            begin = text.text.index(tag.text) + ix
            end = len(tag.text) + begin
            start_offsets.append(begin)
            end_offsets.append(end)
            keys.append(tag.attrs['key'])
            types.append(tag.attrs['type'])
            entities.append(tag.text)
        ix += len(text.text)

    df = pd.DataFrame({'start_offsets': start_offsets,
                      'end_offsets': end_offsets,
                      'keys': keys,
                      'types': types,
                      'entities': entities})
    df.to_csv(os.path.join(gold_standard, '{}.csv'.format(letter[0:-4])), index=False)
    with open(os.path.join(clean_text, '{}.txt'.format(letter[0:-4])), 'w') as f:
        f.write(translation.text.strip())

In [106]:
df.head()

Unnamed: 0,start_offsets,end_offsets,keys,types,entities
0,1360,1365,569,pers,Dante
1,1367,1375,721,pers,Petrarch
2,1377,1386,544,pers,Boccaccio
3,2103,2110,515,pers,Milliet
4,3122,3128,1353,pers,Seurat


In [86]:
ll = letters[0:5]
ll = [x[0:-4] for x in ll]
print(ll)

['let690', 'let848', 'let684', 'let874', 'let860']


In [123]:
which = 'let027'
gs_data = pd.read_csv(os.path.join(gold_standard, '{}.csv'.format(which)))
gs_data.sort_values(by=['start_offsets'], inplace=True)
gs_data

Unnamed: 0,start_offsets,end_offsets,keys,types,entities
0,62,70,707,pers,Michelet
1,740,744,412,pers,Anna
2,940,942,526,pers,Pa
3,983,985,524,pers,Ma
4,1273,1284,398,pers,Grandfather
5,1336,1341,499 501,pers,aunts
6,1485,1489,412,pers,Anna
7,1723,1737,442 443,pers,the Haanebeeks
8,1831,1842,1202,pers,Thijs Maris
9,1848,1859,787,pers,Mr Tersteeg


In [124]:
with open(os.path.join(clean_text, '{}.txt'.format(which)), 'r') as f:
    text = f.read()
print(text)

London, 31 July 1874

My dear Theo,
I’m glad you’ve been reading Michelet and that you really understand it. A book like that at least teaches one to see that there’s a lot more to love than people usually think.
That book was a revelation and immediately a gospel to me.

‘There is no such thing as an old woman!’

(This isn’t to say that there are no old women, but that a woman doesn’t grow old as long as she loves and is loved.)
And then a chapter like The longing for autumn, how rich it is. 

That a woman is ‘a completely different being’ from a man, and a being that we , or at least only very superficially, as you say, yes, that I certainly believe. And that a woman and a man can become , that is,  and not two halves, that I believe too.
Anna is managing well, we go on wonderful walks together. It’s so beautiful here, if only one has a good and a single eye, without many beams in it.
But if one has that, then it’s beautiful everywhere.
Pa isn’t at all better, even though he and Ma s

In [128]:
text = text.strip()
text = re.sub('\n', '', text)
text = re.sub('\t', '', text)
doc = nlp(text)
ents = doc.ents
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents if e.label_ in ['PERSON', 'GPE', 'ORG']]
for e in ents:
    print(e)

('London', 0, 6, 'GPE')
('Theo', 28, 32, 'GPE')
('I’m', 33, 36, 'GPE')
('Michelet', 62, 70, 'ORG')
('Anna', 739, 743, 'PERSON')
('Ma', 982, 984, 'PERSON')
('&c. &c.', 1196, 1203, 'ORG')
('Theo', 1236, 1240, 'GPE')
('Grandfather', 1272, 1283, 'ORG')
('Young Jochem', 1465, 1477, 'PERSON')
('Anna', 1483, 1487, 'PERSON')
('Governess', 1627, 1636, 'ORG')
('Thijs Maris', 1829, 1840, 'PERSON')
('Tersteeg', 1849, 1857, 'PERSON')
('England', 2038, 2045, 'GPE')
('Obach', 2195, 2200, 'PERSON')
('Paris', 2207, 2212, 'GPE')
('Anna', 2370, 2374, 'PERSON')
('Boughton', 2460, 2468, 'PERSON')
('Maris', 2470, 2475, 'PERSON')
('Jacquet', 2480, 2487, 'ORG')
('Holland', 2738, 2745, 'GPE')
('Anna', 2818, 2822, 'PERSON')
