In [34]:
import os
import lxml
import pandas as pd
from bs4 import BeautifulSoup
from collections import defaultdict

In [2]:
data_dir = 'mitford-letters'
clean_text_dir = '../mitford_letters_clean'
gold_standard_dir = '../mitford_letters_gs'
letters = os.listdir(data_dir)
letters = [x for x in letters if x not in ['si.xml', '__contents__.xml']]

There are 97 Mitford letters.

In [4]:
tag_dict = {
    'persname': 'person',
    'persName': 'person',
    'placename': 'place',
    'placeName': 'place',
}

In [41]:
for letter in letters:
    print(letter)
    with open(os.path.join(data_dir, letter), 'r') as f:
        content = f.read()
    try:
        soup = BeautifulSoup(content, 'lxml')
        soup = soup.findAll('div', {'type': 'letter'})
        soup = soup[0]

        tags = []
        begin_offsets = []
        end_offsets = []
        tag_text = []
        refs = []
        previous_names = defaultdict(int)
        for tag in tag_dict.keys():
            for persname in soup.findAll(tag):
                begin = soup.text.index(persname.text, previous_names[persname.text])
                end = len(persname.text) + begin
                tags.append(tag_dict[tag])
                begin_offsets.append(begin)
                end_offsets.append(end)
                tag_text.append(persname.text)
                previous_names[persname.text] = end
                try:
                    refs.append(persname['ref'])
                except:
                    refs.append(None)           
        df = pd.DataFrame({'tag': tags,
                          'begin_offset': begin_offsets,
                          'end_offset': end_offsets,
                          'text': tag_text,
                          'ref': refs})
        #df.drop_duplicates(subset=['begin_offset', 'end_offset', 'text'], inplace=True)
    except:
        pass
    df.sort_values(by=['begin_offset'], inplace=True)
    df.to_csv(os.path.join(gold_standard_dir, letter), index=False)
    print(df)

1821-05-02-BRHaydon.xml
       tag  begin_offset  end_offset           text              ref
22   place            13          26  Seymour Court       #SeymourCt
23   place            32          38         Marlow          #Marlow
0   person           115         125     John Keats           #Keats
1   person           435         445     John Keats           #Keats
2   person           569         580    Mr. Johnson      #Johnson_Mr
3   person           639         645         Sister    #Johnson_Miss
4   person           736         747    Mr. Johnson      #Johnson_Mr
5   person           747         758    Mr. Johnson      #Johnson_Mr
6   person           876         889  Mr. Northmore  #Northmore_Thos
7   person          1656        1669  Mr. Northmore  #Northmore_Thos
8   person          1675        1686    Mr. Johnson      #Johnson_Mr
24   place          1829        1836        Bristol         #Bristol
25   place          2029        2035         Marlow          #Marlow
26   place

       tag  begin_offset  end_offset              text                ref
24   place            16          32  Three Mile Cross    #ThreeMileCross
0   person           116         120              Papa       #Mitford_Geo
1   person           160         166            Daphne        #Daphne_pet
2   person           553         559            Daphne        #Daphne_pet
3   person           629         639        Mr. Haydon            #Haydon
9   person           633         639            Haydon            #Haydon
4   person          1276        1282            Daphne        #Daphne_pet
5   person          1572        1587   Duke of Norfolk        #Howard_Tho
6   person          1591        1606   Queen Elizabeth             #ElizI
25   place          1748        1757         Edinburgh         #Edinburgh
7   person          1798        1805           Mitford               #MRM
12  person          1816        1822            Haydon            #Haydon
26   place          1889        1898  

       tag  begin_offset  end_offset                  text                ref
18  person             0           0                                     #MRM
20   place           153         160               Reading      #Reading_city
0   person           350         360            Mr. Elford       #Elford_John
1   person           393         404           Thomas Ford       #Ford_Thomas
22   place           434         441               Reading      #Reading_city
21   place           732         743           Oxfordshire       #Oxfordshire
2   person           872         880              Mr. Ford        Ford_Thomas
3   person           904         916          Mr. French's      #French_Peter
4   person          1728        1739           Mrs. Elford       #Elford_MrsM
5   person          1954        1964            Mr. Ford's       #Ford_Thomas
23   place          2006        2013               Reading      #Reading_city
6   person          2057        2073      Johnathan Elford      

       tag  begin_offset  end_offset                  text                ref
0   person           245         256           Mrs. Haydon        #Haydon_Mrs
1   person           364         368                  Pope          Pope_Alex
2   person           499         511          Mr. C Kemble          #Kemble_C
3   person           836         848          Mr. Macready       #Macready_Wm
35   place          1234        1240                London       #London_city
4   person          1508        1515               Mitford               #MRM
6   person          1686        1693               Mitford               #MRM
36   place          1808        1814                London       #London_city
37   place          1870        1882          Lisson Grove      #Lisson_Grove
17  person          1873        1876                   son        #Foscari_Fr
5   person          2073        2083            Mr. Haydon             Haydon
8   person          2731        2738               Mitford      

       tag  begin_offset  end_offset                  text                ref
16   place           153         160               Reading      #Reading_city
12   place           406         412                London       #London_city
13   place           463         472             Edinburgh         #Edinburgh
14   place           656         662                London       #London_city
15   place           755         761                London       #London_city
0   person           841         857      Mr. Charles Lamb         #Lamb_Chas
1   person           916         936  Mr. Edgeworth Benson        #Scott_John
2   person          2154        2160                Daphne        #Daphne_pet
3   person          2381        2386                 Puppy          #Whim_pet
4   person          2508        2512                  Papa       #Mitford_Geo
5   person          2624        2629                 Puppy          #Whim_pet
6   person          2711        2717                Father      

       tag  begin_offset  end_offset                text  \
0   person             6          23   B.R. Haydon Esqr.   
8    place           336         349       Royal Academy   
1   person           416         422              Titian   
2   person           424         430              Rubens   
3   person           432         441           Rembrandt   
4   person           953         964         Mr. Hazlitt   
9    place          1039        1047            Fonthill   
5   person          1398        1409         Mrs. Haydon   
6   person          1440        1452        Mr. Macready   
10   place          1462        1475       Covent garden   
7   person          1615        1633  B. R. Haydon Esqre   
11   place          1634        1652  8 Paddington Green   
12   place          1636        1646          Paddington   

                       ref  
0                  #Haydon  
8           #Royal_Academy  
1                  #Titian  
2                  #Rubens  
3             

       tag  begin_offset  end_offset                       text  \
0   person             0          16           To Sir W. Elford   
19   place            32          48           Three Mile Cross   
20   place           256         272           Oakhampton House   
21   place           300         307                    Bickham   
3   person           537         543                     friend   
22   place          1506        1514                   Richmond   
23   place          2146        2153                    Reading   
24   place          2274        2281                    Reading   
1   person          2546        2558               Mr. Macready   
2   person          2667        2679               Mr. Macready   
4   person          3212        3224               Mr. Macready   
5   person          3342        3348                     Fiesco   
6   person          3379        3400      Sir Charles Grandison   
7   person          3652        3664               Mr. Macread

16      #Plymouth_city  
1820-02-11-MWebb.xml
       tag  begin_offset  end_offset                     text  \
22   place            43          56            Bertram House   
0   person            74          78                     Mary   
1   person            85         102        Mr. James Wheeler   
2   person           257         269             Mr. Lawrence   
3   person           311         323             Mr. Sherwood   
4   person           349         362            Mrs. Jolliffe   
5   person           385         397             Mr. Lawrence   
6   person           426         438             Mr. Sherwood   
7   person           651         668        Mr. James Wheeler   
8   person           712         718                   Mr. S.   
9   person           789         807       Mr. Sherwood Esqre   
10  person           789         801             Mr. Sherwood   
23   place           808         819              Church Lane   
24   place           821         828        

      tag  begin_offset  end_offset         text                 ref
0  person           204         208         Papa        #Mitford_Geo
9   place           242         251    Hampshire   #Hampshire_county
1  person           294         303    Aunt Mary    #Webb_Mary_elder
5  person           479         483         Papa        #Mitford_Geo
2  person           617         627   Miss Eliza                None
3  person           768         776     Napoleon           #Napoleon
4  person           850         859    Mary Webb  #Webb_Mary_younger
6  person           950         961  Henry Marsh                None
7  person           991        1001   Mr. Milman          #Milman_HH
8  person          2009        2015       M.R.M.                #MRM
1819-01-09-Elford.xml
       tag  begin_offset  end_offset                    text  \
29  person           295         306             Shakespeare   
0   person           933         940                 Chaucer   
1   person          1013   

       tag  begin_offset  end_offset               text                   ref
46   place             0          16   Three Mile Cross       #ThreeMileCross
0   person           132         142         Lord Byron                #Byron
1   person           375         384          Mr. Quale            #Quayle_Mr
18  person           673         680            Foscari           #Foscari_Fr
2   person           748         758         Lord Byron                #Byron
3   person           993        1007     Doge of Venice          #Doge_F_hist
4   person          1256        1261              Coles                #coles
5   person          1335        1345         Adam Smith             #Smith_Ad
6   person          2473        2479             Murray          #Murray_John
7   person          2551        2563       noble Author                #Byron
8   person          2638        2649        John Murray          #Murray_John
9   person          2709        2721       Mr. Talfourd        #

       tag  begin_offset  end_offset                text                   ref
0   person             6          22    B.R. Haydon Esqr               #Haydon
1   person           600         611         Mrs. Haydon           #Haydon_Mrs
15   place          1126        1133             England              #England
2   person          1790        1798            Napoleon             #Napoleon
3   person          2074        2084          Mr. Bewick            #Bewick_Wm
4   person          2530        2543       Mr. Chatfield         #Chatfield_Mr
5   person          3121        3132         Mrs. Haydon           #Haydon_Mrs
6   person          3135        3149      the Little one            #Haydon_FS
13   place          3442        3451           Bramshill      #Bramshill_house
14   place          3461        3470           Hampshire     #Hampshire_county
7   person          3477        3485            Mr. Budd              #Budd_EH
8   person          3777        3785            Mr. 

In [8]:
for letter in letters:
    print(letter)
    with open(os.path.join(data_dir, letter), 'r') as f:
        content = f.read()
    try:
        soup = BeautifulSoup(content, 'lxml')
        soup = soup.findAll('div', {'type': 'letter'})
        soup = soup[0]
    
        text = soup.text
        with open(os.path.join(clean_text_dir, letter), 'w') as f:
            f.write(text)
    except:
        pass


1821-05-02-BRHaydon.xml
1819-08-08-f378-BRHaydon.xml
1819-08-22_MossysDeath.xml
1823-05-13_Elford.xml
1821-11-16-Talfourd.xml
1819-12-xa_Webb.xml
1819-01-20-MaryWebb.xml
1823-04-25-WElford.xml
1819-03-02-MWebb.xml
1820-11-11-SirWilliamElford.xml
1820-11-04Unknown.xml
1819-05-30_Elford.xml
1819-05-16_MWebb.xml
1823-01-13_WElford.xml
1821Nov12_13.Talfourd.xml
1819-06-29-Elford.xml
1822-08-31_Talfourd.xml
1818-01-12_WElford.xml
1821-04-19-Talfourd.xml
1820-07-05_WElford.xml
1822-10-12-Elford.xml
1821-11-30_Talfourd.xml
1820-09-01_Haydon.xml
1819-01-10-MaryWebb.xml
1820-12-08-MWebb.xml
1823-04-09-Hamilton.xml
1820-06-29_Haydon.xml
1821-10-31-BRHaydon.xml
1823-02-28_WElford.xml
1819-05-14_Elford.xml
1821-01-29_Webb.xml
1820-03-20-WElford.xml
1822-06-13-BRHaydon.xml
1821-03-10-BRHaydon.xml
1820-09-30-Elford.xml
1821-10-22_Talfourd.xml
1820-11-27-SirWilliamElford.xml
1820-09-14_BRHaydon.xml
1821-02-09-BRHaydon.xml
1825-06-29-Talfourd.xml
1822-05-19-Talfourd.xml
1825-05-11-Talfourd.xml
1821-01

In [32]:
soup.text.index(persname.text)
soup.text[4524] = ' '

TypeError: 'str' object does not support item assignment