# Prepare data for information retrieval fine-tuning 

## Map Bible verses to the body segments that contain (or neighbor) their citations

In [8]:
import re,json,os
import pandas as pd 
from tqdm import tqdm 
import string 
import warnings
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

def clean_text(s):
    """Normalize a raw segment string so identical passages compare equal.

    Steps, in order:
      1. Delete the literal markers ``</i>``, ``<NOTE>``, ``NONLATINALPHABET``
         and ``<i>,``.
      2. Collapse every run of whitespace into a single space.
      3. Strip leading/trailing spaces and lowercase the result.

    Punctuation is left untouched.

    Args:
        s: The raw text of one segment (must be a ``str``).

    Returns:
        The cleaned, lowercased string (possibly empty).
    """
    # NOTE(review): the last alternative only matches "<i>" when it is
    # immediately followed by a comma; a bare "<i>" is left in the text.
    # Confirm whether the trailing comma in the pattern is intentional.
    # A "\d+\^PAGE[S]*\^MISSING" alternative was previously considered but is
    # not part of the active pattern.
    s = re.sub(r"\<\/i\>|\<NOTE\>|NONLATINALPHABET|\<i\>,", "", s)
    s = re.sub(r"\s+", " ", s)
    return s.strip(" ").lower()
# Load the corpus definition; iterating `eras` below yields the era names
# used to build the per-era file paths.
with open('../assets/corpora.json', "r") as corpora_file:
    eras = json.load(corpora_file)
    

In [None]:
# relevant: cleaned passage text -> (citations, locations).
# Both members are dicts whose values are always None, i.e. they act as
# insertion-ordered sets.
relevant = {}
c_type = "verse"

for era in eras:
    with open(f"../assets/citations/{era}_citation_segments.json") as file:
        c_to_seg = json.load(file)

    # Invert "citation -> segments" into "(tcpID, segment index) -> citations".
    seg_to_c = {}
    for cited, segments in c_to_seg.items():
        if "Ibidem" in cited or "Verse" in cited:
            continue
        cited = cited.replace("-", " ")
        # At verse granularity, rewrite the trailing "<chapter> <verse>" pair
        # as "<chapter>.<verse>"; for any other c_type the label is unchanged.
        if c_type == "verse" and re.search(r"\d+ \d+", cited):
            *book, chapter, verse = cited.split(" ")
            cited = " ".join(book) + " " + chapter + "." + verse
        for segment_ref in segments:
            parts = segment_ref.split(",")
            seg_id = (parts[0], int(parts[1]))
            seg_to_c.setdefault(seg_id, []).append(cited)
    # print(era, len(c_to_seg),'citations',len(seg_to_c),"segments")

    data_dir = f"../../SERMONS_APP/db/data/{era}"
    for fp in tqdm(os.listdir(data_dir)):
        if "body" not in fp:
            continue
        text = pd.read_csv(f"{data_dir}/{fp}", header=None)
        for idx, (tcpID, raw_sidx) in enumerate(zip(text[0], text[1])):
            sidx = str(raw_sidx)
            seg_no = int(sidx)

            # Citation window: the segment itself, else the previous one, else
            # the next one. Only the first match is used, so when both
            # neighbors carry citations only the previous segment's are taken.
            cited_seg = next(
                (
                    candidate
                    for candidate in (seg_no, seg_no - 1, seg_no + 1)
                    if (tcpID, candidate) in seg_to_c
                ),
                None,
            )
            if cited_seg is None:
                continue

            s = clean_text(text[6][idx])
            if len(s.split(" ")) < 5:  # at least 5 words long
                continue

            citations, locations = relevant.setdefault(s, ({}, {}))
            locations[(tcpID, sidx)] = None
            citations.update(dict.fromkeys(seg_to_c[(tcpID, cited_seg)]))
    print("Finished processing", era, len(relevant), 'unique passages')



In [3]:
# Turn each (citations, locations) pair of ordered-set dicts into plain lists
# of their keys so the mapping can be written as JSON.
output = {
    passage: (list(citations), list(locations))
    for passage, (citations, locations) in relevant.items()
}
with open("../../segments/all_relevant.json", "w+") as file:
    json.dump(output, file)
del output

## Get all unique segments (body text and marginalia)

In [9]:
# Body Texts
from tqdm import tqdm

# Both dicts map cleaned passage text -> list of locations, where a location
# is (tcpID, segment index as str, label). The label is 'In-Text' for body
# segments and "Note <n>" for marginal notes.
body = {}
margin_p = {}

for era in eras:
    era_dir = f"../../SERMONS_APP/db/data/{era}"

    for fp in tqdm(os.listdir(era_dir)):
        if "body" not in fp:
            continue
        text = pd.read_csv(f"{era_dir}/{fp}", header=None)
        text = text.to_dict(orient="records")
        for item in text:
            raw = item[6]
            # Empty CSV cells are read as NaN (a float). Check the type BEFORE
            # cleaning: clean_text() calls re.sub, which raises TypeError on a
            # non-string, so a guard placed after it could never take effect.
            if not isinstance(raw, str):
                continue
            s = clean_text(raw)
            if not s:
                continue
            # Passages that also appear in `relevant` are not excluded here.
            tcpID, sidx = item[0], item[1]
            body.setdefault(s, []).append((tcpID, str(sidx), 'In-Text'))

    print(f"Finished {era}: Total {len(body)} body passages.")

    # Marginalia

    for fp in tqdm(os.listdir(era_dir)):
        if "margin" not in fp:
            continue
        margin = pd.read_csv(f"{era_dir}/{fp}", header=None)
        margin = margin.to_dict(orient="records")
        for m in margin:
            s = m[3]
            # Skip float cells (NaN is how pandas represents an empty cell).
            if isinstance(s, float):
                continue
            s = clean_text(s)
            tcpID, sidx, nidx = m[0], m[1], m[2]
            margin_p.setdefault(s, []).append((tcpID, str(sidx), "Note " + str(nidx)))

    print(f"Finished {era} marginalia: Total {len(margin_p)} margin passages.")

100%|██████████| 13/13 [00:02<00:00,  6.01it/s]


Finished pre-Elizabeth: Total 59415 body passages.


100%|██████████| 13/13 [00:00<00:00, 105.63it/s]


Finished pre-Elizabeth marginalia: Total 5235 margin passages.


100%|██████████| 13/13 [00:16<00:00,  1.24s/it]


Finished Elizabeth: Total 582603 body passages.


100%|██████████| 13/13 [00:00<00:00, 14.87it/s]


Finished Elizabeth marginalia: Total 58360 margin passages.


100%|██████████| 15/15 [00:25<00:00,  1.68s/it]


Finished JamesI: Total 1368373 body passages.


100%|██████████| 15/15 [00:02<00:00,  6.97it/s]


Finished JamesI marginalia: Total 171017 margin passages.


100%|██████████| 23/23 [00:25<00:00,  1.12s/it]


Finished CharlesI: Total 2114885 body passages.


100%|██████████| 23/23 [00:01<00:00, 11.99it/s]


Finished CharlesI marginalia: Total 259718 margin passages.


100%|██████████| 19/19 [00:15<00:00,  1.22it/s]


Finished CivilWar: Total 2608479 body passages.


100%|██████████| 19/19 [00:00<00:00, 21.87it/s]


Finished CivilWar marginalia: Total 300726 margin passages.


100%|██████████| 19/19 [00:26<00:00,  1.38s/it]


Finished Interregnum: Total 3426736 body passages.


100%|██████████| 19/19 [00:01<00:00, 16.33it/s]


Finished Interregnum marginalia: Total 349580 margin passages.


100%|██████████| 20/20 [00:54<00:00,  2.71s/it]


Finished CharlesII: Total 5158214 body passages.


100%|██████████| 20/20 [00:01<00:00, 13.63it/s]


Finished CharlesII marginalia: Total 410640 margin passages.


100%|██████████| 19/19 [00:05<00:00,  3.57it/s]


Finished JamesII: Total 5324306 body passages.


100%|██████████| 19/19 [00:00<00:00, 78.25it/s]


Finished JamesII marginalia: Total 418776 margin passages.


100%|██████████| 20/20 [00:26<00:00,  1.33s/it]


Finished WilliamAndMary: Total 6125769 body passages.


100%|██████████| 20/20 [00:00<00:00, 42.63it/s]

Finished WilliamAndMary marginalia: Total 435906 margin passages.





In [11]:
# Write the unique body passages and the unique marginalia passages (each with
# its list of locations) to their own JSON files.
with open("../../segments/all_unique.json", "w+") as body_file, \
        open("../../segments/all_unique_marginalia.json", "w+") as margin_file:
    json.dump(body, body_file)
    json.dump(margin_p, margin_file)

In [5]:
# Drop the two passage dictionaries now that they have been written to disk.
del body
del margin_p