In [None]:
# @title Setup

import dateparser
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

from google.colab import drive
drive.mount('/content/drive')

root_folder = '/content/drive/MyDrive/BallitoreColab/'
# !ls {root_folder}

import os
metadata_folder = os.path.join(root_folder, 'Collection Metadata')
texts_folder = os.path.join(root_folder, 'TXT Files (all)')
# !ls "{metadata_folder}"

In [158]:
# @title Get metadata

def get_metadata():
    o=[]

    def fix_col(x):
        x=x.lower()
        if x=='internal id': return 'id'
        # if 'date' in x: return 'date'
        if x == 'date (mm/dd/yyyy)': return 'date'
        if x == 'date (yyyy)': return 'year'
        if 'unnamed' in x: return 'notes'
        return x

    for fn in os.listdir(metadata_folder):
        df=pd.read_excel(os.path.join(metadata_folder,fn))
        df.columns = [fix_col(c) for c in df]
        df=df[[c for c in df if c!='year']]
        o.append(df)

    df=pd.concat(o).fillna('')
    df['id'] = df['id'].apply(lambda x: x.strip().split('.txt')[0])
    df['datetime'] = df['date'].apply(str).progress_apply(dateparser.parse)
    df['dateyear'] = df['datetime'].apply(lambda x: x.year if x.year>0 else 0)
    return df

df = get_metadata()
df

100%|██████████| 2983/2983 [00:11<00:00, 253.80it/s]


Unnamed: 0,id,sender,recipient,date,location from,location to,notes,datetime,dateyear
0,mss4-b5-f1-001,Elizabeth Abell,Mary Shackleton Leadbeater,April 4 1778,Cork,Ballitore,,1778-04-04,1778
1,mss4-b5-f1-002,Elizabeth Abell,Mary Shackleton Leadbeater,Jan 12 1782,Cork,Ballitore,,1782-01-12,1782
2,mss4-b5-f1-003,Elizabeth Abell,Mary Shackleton Leadbeater,Oct 20 1783,Cork,[unclear],,1783-10-20,1783
3,mss4-b5-f1-004,Elizabeth Abell,Mary Shackleton Leadbeater,Feb 19 1788,Cork,Ballitore,,1788-02-19,1788
4,mss4-b5-f2-001,James Abell,Mary Shackleton Leadbeater,12/29/1786,Cork,Ballitore,,1786-12-29,1786
...,...,...,...,...,...,...,...,...,...
226,mss4-b7-f8-070,William Rayner,Mary Shackleton Leadbeater,3/14/1819,"Ballyfair, Kilcullen",Ballitore,,1819-03-14,1819
227,mss4-b7-f8-071,William Rayner,Mary Shackleton Leadbeater,1/18/1821,"Ballyfair, Kilcullen",Ballitore,,1821-01-18,1821
228,mss4-b7-f8-072,William Rayner,Mary Shackleton Leadbeater,10/17/1821,Dublin,Ballitore,,1821-10-17,1821
229,mss4-b7-f9-001,W. Reed,Mary Shackleton Leadbeater,1/12/1819,Dublin,Ballitore,,1819-01-12,1819


In [160]:
# @title Get texts

def get_df_txt():
    paths_txt = [
        os.path.join(root,fn)
        for root,dirs,fns in os.walk(texts_folder)
        for fn in fns
        if fn.endswith('.txt')
    ]

    o=[]
    for fnfn in tqdm(sorted(paths_txt)):
        with open(fnfn) as f:
            txt=f.read()
        o.append({'id':os.path.basename(fnfn).split('.txt')[0], 'txt':txt})
    df_txt = pd.DataFrame(o).drop_duplicates('id')
    return df_txt


df_txt = get_df_txt()
df_txt

100%|██████████| 4131/4131 [00:33<00:00, 122.06it/s]


Unnamed: 0,id,txt
0,consensus_text_90344821,Vol Ⅰ\nJuly 21st 1852\nOctober 15th 1852\n[ext...
1,consensus_text_90344822,Cork - Patrick St\n1852. July 21st.. Up at\nsi...
2,consensus_text_90344823,written about. So we all\npacked on an outside...
3,consensus_text_90344824,Margaret had to leave at the\nHarris's for Dr ...
4,consensus_text_90344825,when Margaret left the\nbook. And then home by...
...,...,...
4126,mss4-b9-f7-031,Dublin 30th March 1829\nMy dear Elizabeth\nI d...
4127,mss4-b9-f7-032,Dublin April 6th 1829\nMy dear Elizabeth\nI do...
4128,mss4-b9-f8-001,Fassarae\n16 Sept 1867\naffectionate Uncle\nEd...
4129,mss4-b9-f9-001,Dear Frank\nI believe we may send\nhanna the P...


In [162]:
[x for x in df_txt.id if '.txt' in x]

[]

In [163]:
# @title Merge metadata and text

odf=df.merge(df_txt, on='id',how='outer').fillna('').set_index('id')
odf['dateyear'] = odf['datetime'].apply(lambda x: x.year if x.year>0 else 0)

def getboxnum(x):
    if 'consensus' in x: return '14'
    o=[]
    ok=False
    for y in x:
        if ok and y.isdigit():
            o.append(y)
        if y=='b': ok=True
        elif not y.isdigit(): ok=False
    return ''.join(o)

odf['box']=[getboxnum(x) for x in odf.index]

odf

Unnamed: 0_level_0,sender,recipient,date,location from,location to,notes,datetime,dateyear,txt,box
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
mss4-b5-f1-001,Elizabeth Abell,Mary Shackleton Leadbeater,April 4 1778,Cork,Ballitore,,1778-04-04,1778,I would wish to write a few lines to my dear f...,5
mss4-b5-f1-002,Elizabeth Abell,Mary Shackleton Leadbeater,Jan 12 1782,Cork,Ballitore,,1782-01-12,1782,I recd. my Dear Molly Shackletons kind favour\...,5
mss4-b5-f1-003,Elizabeth Abell,Mary Shackleton Leadbeater,Oct 20 1783,Cork,[unclear],,1783-10-20,1783,Cork the 20th of 10th m. 1785\nMy Dear Mary\nI...,5
mss4-b5-f1-004,Elizabeth Abell,Mary Shackleton Leadbeater,Feb 19 1788,Cork,Ballitore,,1788-02-19,1788,I doubt not but my dear Friend will be\nrather...,5
mss4-b5-f2-001,James Abell,Mary Shackleton Leadbeater,12/29/1786,Cork,Ballitore,,1786-12-29,1786,My dear Molly - if thoull bear being call'd so...,5
...,...,...,...,...,...,...,...,...,...,...
mss4-b9-f24-002,,,,,,,NaT,0,which I expended some 400 more in improvements...,9
mss4-b9-f4-001,,,,,,,NaT,0,Limerick 18 of November 1823\nMy dear brother ...,9
mss4-b9-f4-002,,,,,,,NaT,0,vexation a pleasure-tourist sometimes undergoe...,9
mss4-b9-f7-019,,,,,,,NaT,0,for the [unclear][/unclear] of Earl [unclear][...,9


In [164]:
# @title Merging letters across pages
from collections import defaultdict
last_id=None
id2group=defaultdict(list)
for id,row in odf.iterrows():
    if row.notes.strip().startswith('p.') and not id.endswith('001'):
        id2group[last_id]+=[{'id':id, 'notes':row.notes, 'txt':row.txt}]
    else:
        last_id = id

id2group

defaultdict(list,
            {'mss4-b1-f4-001-5': [{'id': 'mss4-b1-f4-001-6',
               'notes': 'p. 2 of previous letter',
               'txt': 'P\n152\nThis place is very agreeable; rendered still more so by the little\nprattling babes, among whom I walk about, like a sort of a Patriarch,\n153\nwhile they call me "granfaddy," and present me with their little\nofferings. Yet still home, and the dearest of all earthly objects attracts\nme home. I hope thy mother is satisfied with my stay, and that\nyou all exert yourselves to make her comfortable, and my absence tolerable\nto her. She deserves every mark of duty, affection, respect, & attention\nfrom you. She also wants your assistance, and the defects of memory\nand recollection, and depredations of time, to be supplied and made\nup by your more youthful activity. *** The good dispositions of\nmy children are the joy of my heart, and weigh it down with humble\ngratitude. The prospect which opens for my dear Debby is\nindeed hig

In [165]:
duplicated = {d['id'] for ld in id2group.values() for d in ld}
len(duplicated)

429

In [166]:
duplicated = {d['id'] for ld in id2group.values() for d in ld}
odf2=odf[~odf.index.isin(duplicated)].copy()
id2txt=dict(zip(odf2.index, odf2.txt))
id2notes=dict(zip(odf2.index, odf2.notes))
id2ids=defaultdict(list)
# for id in odf2.index: id2ids[id]=[id]
for id,ld in id2group.items():
    for d in sorted(ld, key=lambda x: x['id']):
        id2txt[id] += d['txt']
        id2notes[id] += '; '+d['notes']
        id2ids[id]+=[d['id']]


id='mss4-b1-f4-001-5'
[id2txt[id], id2notes[id], id2ids[id]]

odf2['notes'] = odf2.index.map(id2notes)
odf2['txt'] = odf2.index.map(id2txt)
odf2['supplemental_ids'] = odf2.index.map(id2ids)
odf2['supplemental_ids'] = odf2['supplemental_ids'].apply(lambda x: '; '.join(x))
odf2[odf2.supplemental_ids.apply(len)>0]
odf2

Unnamed: 0_level_0,sender,recipient,date,location from,location to,notes,datetime,dateyear,txt,box,supplemental_ids
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
mss4-b5-f1-001,Elizabeth Abell,Mary Shackleton Leadbeater,April 4 1778,Cork,Ballitore,,1778-04-04,1778,I would wish to write a few lines to my dear f...,5,
mss4-b5-f1-002,Elizabeth Abell,Mary Shackleton Leadbeater,Jan 12 1782,Cork,Ballitore,,1782-01-12,1782,I recd. my Dear Molly Shackletons kind favour\...,5,
mss4-b5-f1-003,Elizabeth Abell,Mary Shackleton Leadbeater,Oct 20 1783,Cork,[unclear],,1783-10-20,1783,Cork the 20th of 10th m. 1785\nMy Dear Mary\nI...,5,
mss4-b5-f1-004,Elizabeth Abell,Mary Shackleton Leadbeater,Feb 19 1788,Cork,Ballitore,,1788-02-19,1788,I doubt not but my dear Friend will be\nrather...,5,
mss4-b5-f2-001,James Abell,Mary Shackleton Leadbeater,12/29/1786,Cork,Ballitore,,1786-12-29,1786,My dear Molly - if thoull bear being call'd so...,5,
...,...,...,...,...,...,...,...,...,...,...,...
mss4-b9-f24-002,,,,,,,NaT,0,which I expended some 400 more in improvements...,9,
mss4-b9-f4-001,,,,,,,NaT,0,Limerick 18 of November 1823\nMy dear brother ...,9,
mss4-b9-f4-002,,,,,,,NaT,0,vexation a pleasure-tourist sometimes undergoe...,9,
mss4-b9-f7-019,,,,,,,NaT,0,for the [unclear][/unclear] of Earl [unclear][...,9,


In [167]:
odf2[odf2.sender==''].box.value_counts()

box
14    1120
13     437
9      205
1      121
6      116
12      71
10      56
5       51
7       15
4       13
2       11
11       9
3        4
8        1
Name: count, dtype: int64

In [168]:
[x for x in odf2.index if '.txt' in x]

[]

In [169]:
ofn=os.path.join(root_folder, 'ballitore_data.xlsx')
odf2.to_excel(ofn)

ModuleNotFoundError: No module named 'tomotopy'