# Prepare data for information retrieval fine-tuning 

## Map Bible verses to body segments with the relevant citations

In [1]:
import re,json,os
import pandas as pd 
from tqdm import tqdm 
import warnings
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

def clean_text(s):
    """Strip transcription markers from a passage and normalise its whitespace.

    Removes ``<i>`` / ``</i>`` tags, ``<NOTE>`` markers, ``NONLATINALPHABET``
    placeholders and ``<digits>^PAGE^MISSING`` / ``<digits>^PAGES^MISSING``
    gap markers, collapses every run of whitespace into a single space, and
    trims spaces from both ends.

    Args:
        s: The raw passage text (must be a ``str``).

    Returns:
        The cleaned passage text.
    """
    marker_pattern = r"\<\/i\>|\<NOTE\>|NONLATINALPHABET|\<i\>|\d+\^PAGE[S]*\^MISSING"
    without_markers = re.sub(marker_pattern, "", s)
    single_spaced = re.sub(r"\s+", " ", without_markers)
    return single_spaced.strip(" ")

In [None]:
# Build `relevant`: cleaned passage text -> (citations, locations).
# Both members are dicts whose values are always None; they serve as
# insertion-ordered sets of citation strings and (tcpID, sidx) locations.
relevant = {}
c_type = "verse"
data_root = "../../SERMONS_APP/db/data"

# Explicit UTF-8 so reading the JSON does not depend on the platform locale.
with open('../assets/corpora.json', "r", encoding="utf-8") as file:
    eras = json.load(file)

for era in eras:
    with open(f"../assets/citations/{era}_citation_segments.json", encoding="utf-8") as file:
        c_to_seg = json.load(file)

    # Invert citation -> segments into (tcpID, segment index) -> [citations].
    seg_to_c = {}
    for cited, segments in c_to_seg.items():
        if "Ibidem" in cited or "Verse" in cited:
            continue
        cited = cited.replace("-", " ")
        # In "verse" mode the last two space-separated tokens are joined
        # with a dot, e.g. "Genesis 1 1" -> "Genesis 1.1"; otherwise the
        # label is used as-is.
        if c_type == "verse" and re.search(r"\d+ \d+", cited):
            tokens = cited.split(" ")
            cited = " ".join(tokens[:-2]) + " " + ".".join(tokens[-2:])
        for segment in segments:
            fields = segment.split(",")
            seg_id = (fields[0], int(fields[1]))
            seg_to_c.setdefault(seg_id, []).append(cited)
    # print(era, len(c_to_seg),'citations',len(seg_to_c),"segments")

    era_dir = f"{data_root}/{era}"
    for fp in tqdm(os.listdir(era_dir)):
        if "body" not in fp:
            continue
        text = pd.read_csv(f"{era_dir}/{fp}", header=None)
        # Column 0 is used as the text ID, column 1 as the segment index and
        # column 6 as the passage text.
        for tcpID, raw_sidx, raw_text in zip(text[0], text[1], text[6]):
            sidx = str(raw_sidx)
            seg_no = int(sidx)

            # A segment qualifies if it carries a citation itself, or failing
            # that if its previous, or failing that its next, neighbour does.
            # NOTE(review): because of the `elif` chain only ONE neighbour is
            # ever used even when both have citations -- confirm this is
            # intended; if not, the two neighbour checks should be independent.
            if (tcpID, seg_no) in seg_to_c:
                citation_sidx = [seg_no]
            elif (tcpID, seg_no - 1) in seg_to_c:
                citation_sidx = [seg_no - 1]
            elif (tcpID, seg_no + 1) in seg_to_c:
                citation_sidx = [seg_no + 1]
            else:
                continue

            passage = clean_text(raw_text)
            if len(passage.split(" ")) < 5:
                continue  # at least 5 words long

            citations, locations = relevant.setdefault(passage, ({}, {}))
            locations[(tcpID, sidx)] = None
            for entry in citation_sidx:
                for c in seg_to_c[(tcpID, entry)]:
                    citations[c] = None
    # len(relevant) already counts unique passages; no need to copy the keys
    # into a set first.
    print("Finished processing", era, len(relevant), 'unique passages')



100%|██████████| 12/12 [00:01<00:00,  6.82it/s]


Finished processing pre-Elizabeth 8354 unique passages


100%|██████████| 12/12 [00:13<00:00,  1.10s/it]


Finished processing Elizabeth 75689 unique passages


100%|██████████| 14/14 [00:22<00:00,  1.63s/it]


Finished processing JamesI 196188 unique passages


100%|██████████| 22/22 [00:18<00:00,  1.17it/s]


Finished processing CharlesI 300794 unique passages


100%|██████████| 18/18 [00:11<00:00,  1.54it/s]


Finished processing CivilWar 361234 unique passages


100%|██████████| 18/18 [00:18<00:00,  1.01s/it]


Finished processing Interregnum 443145 unique passages


100%|██████████| 20/20 [00:33<00:00,  1.70s/it]


Finished processing CharlesII 600117 unique passages


100%|██████████| 18/18 [00:03<00:00,  4.74it/s]


Finished processing JamesII 624026 unique passages


100%|██████████| 20/20 [00:14<00:00,  1.34it/s]

Finished processing WilliamAndMary 682413 unique passages





In [22]:
# Turn each passage's (citations, locations) pair of dicts into two plain
# lists of their keys so the mapping can be written out as JSON.
output = {
    passage: (list(citations.keys()), list(locations.keys()))
    for passage, (citations, locations) in relevant.items()
}
with open("../assets/all_relevant.json", "w+") as file:
    json.dump(output, file)
# Drop the temporary copy once it has been written.
del output

## Get non-citation segments 

In [17]:
# Body Texts  
from tqdm import tqdm 

# Collect, per era, every body passage that is not already a key of
# `relevant`, plus every marginal note. Inner dicts have None values and are
# used as insertion-ordered sets of locations.
# NOTE(review): `folder` is not defined in the visible cells -- it must be set
# before this cell runs.
body = {}      # cleaned body text -> {(tcpID, sidx): None}
margin_p = {}  # cleaned marginal note -> {(tcpID, sidx, nidx): None}

for era in eras:
    era_dir = f"{folder}/DH/SERMONS_APP/db/data/{era}"

    for fp in tqdm(os.listdir(era_dir)):
        if "body" not in fp:
            continue
        text = pd.read_csv(f"{era_dir}/{fp}", header=None)
        for item in text.to_dict(orient="records"):
            raw = item[6]
            # Check the type BEFORE cleaning: clean_text() needs a str, so a
            # NaN/non-text cell must be skipped here rather than afterwards.
            if not isinstance(raw, str):
                continue
            s = clean_text(raw)
            if len(s) == 0 or s in relevant:
                continue
            tcpID, sidx = item[0], item[1]
            body.setdefault(s, {})[(tcpID, str(sidx))] = None

    print(f"Finished {era}: Total {len(body)} body passages.")

    # Marginalia

    for fp in tqdm(os.listdir(era_dir)):
        if "margin" not in fp:
            continue
        margin = pd.read_csv(
            f"{era_dir}/{fp}",
            header=None,
            names=["tcpID", "sidx", "nidx", "original", "standardized"],
        )
        for m in margin.to_dict(orient="records"):
            t = m["original"]
            if isinstance(t, float):
                continue  # missing note text is read as a float NaN
            t = clean_text(t)
            tcpID, sidx, nidx = m["tcpID"], m["sidx"], m["nidx"]
            # Key on the note text `t`. The previous membership test used the
            # stale body variable `s`, which reset the location dict of an
            # already-seen note instead of extending it.
            margin_p.setdefault(t, {})[(tcpID, str(sidx), str(nidx))] = None

    print(f"Finished {era} marginalia: Total {len(margin_p)} margin passages.")

100%|██████████| 12/12 [00:02<00:00,  4.75it/s]


Finished pre-Elizabeth: Total 51029 body passages.


100%|██████████| 12/12 [00:00<00:00, 112.91it/s]


Finished pre-Elizabeth marginalia: Total 5150 margin passages.


100%|██████████| 12/12 [00:20<00:00,  1.71s/it]


Finished Elizabeth: Total 505904 body passages.


100%|██████████| 12/12 [00:00<00:00, 13.55it/s]


Finished Elizabeth marginalia: Total 56351 margin passages.


100%|██████████| 14/14 [00:28<00:00,  2.05s/it]


Finished JamesI: Total 1166971 body passages.


100%|██████████| 14/14 [00:02<00:00,  5.65it/s]


Finished JamesI marginalia: Total 164304 margin passages.


100%|██████████| 22/22 [00:29<00:00,  1.33s/it]


Finished CharlesI: Total 1809118 body passages.


100%|██████████| 22/22 [00:01<00:00, 11.31it/s]


Finished CharlesI marginalia: Total 247991 margin passages.


100%|██████████| 18/18 [00:19<00:00,  1.10s/it]


Finished CivilWar: Total 2242270 body passages.


100%|██████████| 18/18 [00:01<00:00, 17.21it/s]


Finished CivilWar marginalia: Total 287593 margin passages.


100%|██████████| 18/18 [00:33<00:00,  1.84s/it]


Finished Interregnum: Total 2977286 body passages.


100%|██████████| 18/18 [00:02<00:00,  8.67it/s]


Finished Interregnum marginalia: Total 335020 margin passages.


100%|██████████| 20/20 [01:15<00:00,  3.79s/it]


Finished CharlesII: Total 4547095 body passages.


100%|██████████| 20/20 [00:02<00:00,  9.57it/s]


Finished CharlesII marginalia: Total 393336 margin passages.


100%|██████████| 18/18 [00:06<00:00,  2.85it/s]


Finished JamesII: Total 4690403 body passages.


100%|██████████| 18/18 [00:00<00:00, 72.12it/s]


Finished JamesII marginalia: Total 401144 margin passages.


100%|██████████| 20/20 [00:33<00:00,  1.70s/it]


Finished WilliamAndMary: Total 5440835 body passages.


100%|██████████| 20/20 [00:00<00:00, 26.74it/s]

Finished WilliamAndMary marginalia: Total 417205 margin passages.





In [24]:
# For each passage mapping in turn: replace every location dict with the list
# of its keys (in place), then write the mapping to its JSON file.
for passages, out_path in (
    (body, "../assets/all_unique.json"),
    (margin_p, "../assets/all_unique_marginalia.json"),
):
    # Only existing keys are reassigned, so the dict size never changes
    # while it is being iterated.
    for passage in passages:
        passages[passage] = list(passages[passage].keys())
    with open(out_path, "w+") as file:
        json.dump(passages, file)