# Prepare data for information retrieval fine-tuning 

## Map Bible verses to body segments with the relevant citations

In [3]:
import re,json,os
import pandas as pd 
def combine_punc_with_text(segment):
    """Re-attach tokenised punctuation to the neighbouring text.

    Applies three substitutions in order and returns the resulting string:
    whitespace before ``, . ? ! ; : )`` is removed, whitespace after ``(`` is
    removed, and every remaining run of whitespace becomes a single space.
    Leading and trailing whitespace is collapsed but not stripped.
    """
    rewrites = (
        (r'\s+([,.?!;:)])', r'\1'),  # "word ," -> "word,"
        (r'([(])\s+', r'\1'),        # "( word" -> "(word"
        (r"\s+", " "),               # collapse whitespace runs
    )
    for pattern, replacement in rewrites:
        segment = re.sub(pattern, replacement, segment)
    return segment

In [5]:
def process(era,c_type="verse"): 
    """Build and save the passage -> citations mapping for one era.

    Reads ``../assets/citations/{era}_{c_type}_citation_segments.json`` (a dict
    of citation label -> list of "tcpID,sidx,..." strings), inverts it to
    (tcpID, sidx) -> citations, then scans every file under
    ``../assets/processed/{era}/sub-segments`` and collects, for each distinct
    normalised passage belonging to a cited segment, its citations, its
    original spellings and its locations.

    Parameters
    ----------
    era : str
        Directory name under ``../assets/processed``; also used in the input
        and output file names.
    c_type : str
        ``"verse"`` (default) or ``"chapter"``. Selects the citation file.
        ``"chapter"`` writes ``{era}_chapter_citations.json``; any other value
        writes ``{era}.json``.

    Returns
    -------
    None
        The result is written to ``../assets/relevant/`` as JSON of the form
        ``{passage: [citations, originals, locations]}``; two summary lines
        are printed.
    """
    with open(f"../assets/citations/{era}_{c_type}_citation_segments.json") as file:
        c_to_seg = json.load(file)

    # Invert citation -> segments into (tcpID, sidx) -> [citations].
    seg_to_c = {}
    for cited, segments in c_to_seg.items():
        if "Ibidem" in cited: continue
        cited = " ".join(cited.split("-"))
        if c_type == "verse" and re.search(r"\d+ \d+",cited):
            # Verse labels end in "<chapter> <verse>": keep the leading tokens
            # space-separated and join the last two with a period,
            # e.g. "1 John 3 16" -> "1 John 3.16".
            tokens = cited.split(" ")
            cited = " ".join(tokens[:-2]) + " " + ".".join(tokens[-2:])
        for s in segments:
            fields = s.split(",")
            seg_id = (fields[0], fields[1])
            seg_to_c.setdefault(seg_id, []).append(cited)
    print(era, len(c_to_seg),'citations',len(seg_to_c),"segments")

    # passage -> (citations, originals, locations); dicts act as ordered sets.
    relevant = {}
    for prefix in os.listdir(f"../assets/processed/{era}/sub-segments"):
        if prefix == ".DS_Store": continue
        with open(f"../assets/processed/{era}/sub-segments/{prefix}",'r') as file:
            s_ids, s_text, s_orig, _ = json.load(file)

        for idx, s_id in enumerate(s_ids):
            tcpID, sidx, nidx = s_id[0][0], str(s_id[0][1]), s_id[0][2]
            if nidx >= 0: continue # only body content
            if (tcpID,sidx) not in seg_to_c: continue
            s = combine_punc_with_text(s_text[idx])
            t = combine_punc_with_text(s_orig[idx])
            # NOTE(review): the entry is created before the length filter, so
            # passages shorter than five words are still written out with
            # three empty lists and counted as "unique passages". Kept as-is
            # to preserve the existing output; confirm whether it is intended.
            if s not in relevant:
                relevant[s] = ({},{},{})
            if len(s.split(" "))< 5: continue # at least 5 words long
            citations, originals, locations = relevant[s]
            locations[(tcpID,sidx,str(s_id[1]))] = None
            originals[t] = None
            for c in seg_to_c[(tcpID,sidx)]:
                citations[c] = None

    # Replace the ordered-set dicts with plain lists so the result is JSON-serialisable.
    relevant = {
        s: (list(citations), list(originals), list(locations))
        for s, (citations, originals, locations) in relevant.items()
    }
    print(len(relevant),'unique passages')

    if c_type == "chapter":
        out_path = f"../assets/relevant/{era}_chapter_citations.json"
    else:
        out_path = f"../assets/relevant/{era}.json"
    with open(out_path,"w") as file:
        json.dump(relevant, file)

In [9]:
# Build the verse-level relevance file for every era directory.
for era in os.listdir('../assets/processed'):
    if era != ".DS_Store":  # skip the macOS Finder metadata file
        process(era)

Elizabethan 8858 verses 8357 segments
37274 unique passages
Carolinian 18164 verses 25308 segments
139871 unique passages
WilliamAndMary 12923 verses 18364 segments
103595 unique passages
pre-Elizabethan 101 verses 77 segments
286 unique passages
Jacobean 17288 verses 26125 segments
133130 unique passages
CharlesII 24223 verses 54104 segments
318993 unique passages
CivilWar 15625 verses 16766 segments
105768 unique passages
JamesII 5410 verses 3533 segments
21208 unique passages
Interregnum 18670 verses 29877 segments
190009 unique passages


In [6]:
# Build the chapter-level relevance file for every era directory.
for era in os.listdir('../assets/processed'):
    if era != ".DS_Store":  # skip the macOS Finder metadata file
        process(era,"chapter")

Elizabethan 1344 citations 5695 segments
23698 unique passages
Carolinian 1549 citations 7054 segments
45759 unique passages
WilliamAndMary 1011 citations 2826 segments
19027 unique passages
pre-Elizabethan 441 citations 823 segments
2424 unique passages
Jacobean 1583 citations 7679 segments
45188 unique passages
CharlesII 1819 citations 10002 segments
71576 unique passages
CivilWar 1265 citations 4174 segments
31299 unique passages
JamesII 671 citations 874 segments
5985 unique passages
Interregnum 2681 citations 6366 segments
49594 unique passages


## Get non-citation segments 

In [35]:
# Collect every distinct location that appears in the CivilWar / Interregnum
# relevance files. A dict with None values serves as an ordered set.
folder = "../assets"
locations = {}
for fp in os.listdir(f"{folder}/relevant"):
    if re.search('CivilWar|Interregnum', fp) is None:
        continue
    with open(f"{folder}/relevant/{fp}","r") as file:
        passages = json.load(file)
    for entry in passages.values():
        # entry[2] is the list of locations stored for the passage;
        # JSON lists are converted to tuples so they can be dict keys.
        for location in entry[2]:
            locations[tuple(location)] = None
len(locations)

299273

In [4]:
from tqdm import tqdm 
# For the CivilWar and Interregnum eras, gather every distinct body passage
# (whether or not it is cited) and write them to ../assets/unique_body/ in
# batches of `batch_size` passages per file.
for era in os.listdir('../assets/processed'):
    if era == ".DS_Store": continue
    if not re.search("CivilWar|Interregnum",era): continue

    # passage -> (originals, locations); dicts act as ordered sets.
    parts = {}
    for prefix in tqdm(os.listdir(f"../assets/processed/{era}/sub-segments")):
        if prefix == ".DS_Store": continue

        with open(f"../assets/processed/{era}/sub-segments/{prefix}",'r') as file:
            s_ids, s_text, s_orig, _ = json.load(file)
        for idx, s_id in enumerate(s_ids):
            tcpID, sidx, nidx = s_id[0][0], str(s_id[0][1]), s_id[0][2]
            pidx = s_id[1]
            if nidx >= 0: continue # only body content
            s = combine_punc_with_text(s_text[idx])
            t = combine_punc_with_text(s_orig[idx])
            # Strip markup placeholders from the original text, then tidy whitespace.
            t = re.sub(r"\<\/i\>|\<NOTE\>|NONLATINALPHABET|\<i\>","",t)
            t = re.sub(r"\s+"," ",t)
            t = t.strip(" ")
            # NOTE(review): the entry is created before the length filter, so
            # passages shorter than five words are exported with empty
            # originals/locations. Kept to preserve the existing totals.
            if s not in parts:
                parts[s] = ({},{}) # original, locations
            if len(s.split(" "))< 5: continue # at least 5 words long
            parts[s][1][(tcpID,sidx,str(pidx))] = None
            parts[s][0][t] = None

    # NOTE(review): originals stay a dict ({text: None}) while locations become
    # a list; process() converts both to lists. Left unchanged because readers
    # of these files may depend on the current shape.
    parts = {s: [originals, list(locs)] for s, (originals, locs) in parts.items()}

    keys = list(parts)  # materialise once instead of once per batch
    batches = []
    batch_size = 150000
    batch_num = 0
    for start in range(0, len(keys), batch_size):
        batch = {k: parts[k] for k in keys[start: start + batch_size]}
        batches.append(batch)
        with open(f"../assets/unique_body/{era}_{batch_num}","w+") as file:
            # Write only this batch; dumping `parts` here would put the full
            # dictionary into every batch file.
            json.dump(batch,file)
        batch_num += 1
    print(f"{era}: {len(batches)} batches. Total {len(parts)} parts.")

'''
CivilWar: 5 batches. Total 628259 parts.
Interregnum: 8 batches. Total 1071663 parts.
'''

100%|██████████| 9/9 [00:48<00:00,  5.41s/it]


CivilWar: 5 batches. Total 628259 parts.


100%|██████████| 9/9 [01:23<00:00,  9.25s/it]


Interregnum: 8 batches. Total 1071663 parts.


'\n\n'

In [15]:
# Marginalia: gather every distinct standardized marginal note for one era and
# write them to ../assets/unique_body/ in batches of `batch_size` notes per file.
era = "CivilWar"
# NOTE(review): machine-specific absolute path; point this at the local
# SERMONS_APP data directory before re-running.
margin_dir = f"/Users/amycweng/DH/SERMONS_APP/db/data/{era}"

# standardized text -> (originals, locations); dicts act as ordered sets.
parts = {}
for fp in tqdm(os.listdir(margin_dir)):
    if "margin" not in fp: continue
    margin = pd.read_csv(f"{margin_dir}/{fp}", header=None, names=["tcpID","sidx","nidx","original","standardized"])

    # Walk the columns in parallel instead of looking each cell up by index.
    rows = zip(margin["tcpID"], margin["sidx"], margin["nidx"],
               margin["original"], margin["standardized"])
    for tcpID, sidx, nidx, t, s in rows:
        # Skip rows whose standardized text is missing (non-string) or empty.
        if not isinstance(s,str) or len(s) == 0: continue
        # Strip markup placeholders from the original text, then tidy whitespace.
        t = re.sub(r"\<\/i\>|\<NOTE\>|NONLATINALPHABET|\<i\>","",t)
        t = re.sub(r"\s+"," ",t)
        t = t.strip(" ")
        if s not in parts:
            parts[s] = ({},{}) # original, locations
        parts[s][1][(tcpID,str(sidx),str(nidx))] = None
        parts[s][0][t] = None

# NOTE(review): originals stay a dict ({text: None}) while locations become a
# list, matching the body-text export; confirm what downstream readers expect.
parts = {s: [originals, list(locs)] for s, (originals, locs) in parts.items()}

keys = list(parts)  # materialise once instead of once per batch
batches = []
batch_size = 150000
batch_num = 0
for start in range(0, len(keys), batch_size):
    batch = {k: parts[k] for k in keys[start: start + batch_size]}
    batches.append(batch)
    with open(f"../assets/unique_body/{era}_margin_{batch_num}","w+") as file:
        # Write only this batch; dumping `parts` here would put the full
        # dictionary into every batch file.
        json.dump(batch,file)
    batch_num += 1
# Recorded result of the last run:
#   CivilWar: 1 batches. Total 47610 parts.
print(f"{era}: {len(batches)} batches. Total {len(parts)} parts.")

100%|██████████| 19/19 [00:02<00:00,  8.10it/s]


CivilWar: 1 batches. Total 47610 parts.
