# Prepare data for information retrieval fine-tuning 

## Map Bible verse to body segments with the relevant citations

In [2]:
import re,json,os
import pandas as pd 
def combine_punc_with_text(segment):
    segment = re.sub(r'\s+([,.?!;:)])', r'\1', segment)
    segment = re.sub(r'([(])\s+', r'\1', segment)
    segment = re.sub(r"\s+"," ",segment)
    return segment

In [6]:
def process(era,c_type="verse"): 
    relevant = {}
    with open(f"../assets/citations/{era}_{c_type}_citation_segments.json") as file:
        c_to_seg = json.load(file)
    seg_to_c = {}
    for cited, segments in c_to_seg.items():
        if "Ibidem" in cited: continue 
        cited = " ".join(cited.split("-"))
        if re.search(r"\d+ \d+",cited):
            cited = cited.split(" ")
            if c_type == "verse": 
                cited = " ".join(cited[:-2]) +" " + ".".join(cited[-2:])
            else: cited = " ".join(cited)
        for s in segments: 
            seg_id = (s.split(",")[0],int(s.split(",")[1]))
            if seg_id not in seg_to_c: 
                seg_to_c[seg_id] = []
            seg_to_c[seg_id].append(cited)
    print(era, len(c_to_seg),'citations',len(seg_to_c),"segments")

    for prefix in os.listdir(f"../assets/processed/{era}/sub-segments"):
        with open(f"../assets/processed/{era}/sub-segments/{prefix}",'r') as file:
            s_ids, s_text, s_orig, _ = json.load(file)
        
        for idx, s_id in enumerate(s_ids):
            tcpID, sidx, nidx = s_id[0][0], str(s_id[0][1]), s_id[0][2]
            if nidx >= 0: continue # only body content
            if (tcpID,int(sidx)) in seg_to_c: 
                s = combine_punc_with_text(s_text[idx])
                t = combine_punc_with_text(s_orig[idx])
                if len(t.split(" "))< 5: continue # at least 5 words long
                if s not in relevant: 
                    relevant[s] = ({},{},{}) # citations, original, location 
                relevant[s][2][(tcpID,sidx,str(s_id[1]))] = None 
                relevant[s][1][t] = None 
                for c in seg_to_c[(tcpID,int(sidx))]:
                    relevant[s][0][c] = None 
    for s,r in relevant.items(): 
        relevant[s] = (list(r[0].keys()), list(r[1].keys()), list(r[2].keys()))
    print(len(set(relevant)),'unique passages')
    if c_type == "chapter": 
        with open(f"../assets/relevant/{era}_chapter_citations.json","w+") as file: 
            json.dump(relevant, file)
    else: 
        with open(f"../assets/relevant/{era}.json","w+") as file: 
            json.dump(relevant, file)

In [7]:
for era in os.listdir('../assets/processed'): 
    if era == ".DS_Store": continue 
    process(era)

Elizabethan 15541 citations 26648 segments
80586 unique passages
Carolinian 25188 citations 68937 segments
259178 unique passages
WilliamAndMary 20994 citations 65281 segments
231206 unique passages
pre-Elizabethan 434 citations 476 segments
970 unique passages
Jacobean 24121 citations 71260 segments
247392 unique passages
CharlesII 32891 citations 169548 segments
644862 unique passages
CivilWar 22412 citations 45459 segments
190585 unique passages
JamesII 9997 citations 11107 segments
42944 unique passages
Interregnum 25695 citations 84429 segments
350522 unique passages


In [8]:
for era in os.listdir('../assets/processed'): 
    if era == ".DS_Store": continue 
    process(era,"chapter")

Elizabethan 1988 citations 22602 segments
60760 unique passages
Carolinian 2177 citations 20759 segments
91236 unique passages
WilliamAndMary 1664 citations 9991 segments
39221 unique passages
pre-Elizabethan 910 citations 4143 segments
8389 unique passages
Jacobean 2229 citations 23093 segments
89681 unique passages
CharlesII 2625 citations 32753 segments
141959 unique passages
CivilWar 1743 citations 11986 segments
57617 unique passages
JamesII 1148 citations 3362 segments
13873 unique passages
Interregnum 3203 citations 19442 segments
96161 unique passages


## Get non-citation segments 

In [4]:
folder = "../assets"
locations = {}
for fp in os.listdir(f"{folder}/relevant"):
    if not re.search(f'CivilWar',fp): continue
    else:
      with open(f"{folder}/relevant/{fp}","r") as file:
        r = json.load(file)
        for k, v in r.items(): 
            loc = v[2]
            for l in loc: 
               locations[tuple(l)] = None 
len(locations)

218958

In [12]:
from tqdm import tqdm 
for era in os.listdir('../assets/processed'): 
    if era == ".DS_Store": continue 
    if not re.search("CivilWar",era): continue
    parts = {}
    for prefix in tqdm(os.listdir(f"../assets/processed/{era}/sub-segments")):
        if prefix == ".DS_Store": continue

        with open(f"../assets/processed/{era}/sub-segments/{prefix}",'r') as file:
            s_ids, s_text, s_orig, _ = json.load(file)
        for idx, s_id in enumerate(s_ids):
            tcpID, sidx, nidx = s_id[0][0], str(s_id[0][1]), s_id[0][2]
            pidx = s_id[1]
            if nidx >= 0: continue # only body content
            s = combine_punc_with_text(s_text[idx])
            t = combine_punc_with_text(s_orig[idx])
            # t = re.sub(r"\<\/i\>|\<NOTE\>|NONLATINALPHABET|\<i\>","",t)
            # t = re.sub(r"\s+"," ",t)
            t = t.strip(" ")
            if s not in parts: 
                parts[s] = ({},{}) # original, locations 
            if len(s.split(" "))< 5: continue # at least 5 words long  
            parts[s][1][(tcpID,sidx,str(s_id[1]))] = None 
            parts[s][0][t] = None 
    for s,r in parts.items(): 
        parts[s] = [r[0], list(r[1].keys())]
    
    batches = []
    batch_size = 40000
    batch_num = 0 
    for i in range(0, len(parts), batch_size):
        batch = {}
        for p in list(parts.keys())[i: i + batch_size]:
            batch[p] = parts[p]
        batches.append(batch)
        with open(f"../assets/unique_body/{era}_{batch_num}","w+") as file: 
            json.dump(parts,file)
        batch_num += 1 
    print(f"{era}: {len(batches)} batches. Total {len(parts)} parts.")

'''
 CivilWar: 16 batches. Total 639551 parts.
'''

100%|██████████| 9/9 [00:48<00:00,  5.37s/it]


CivilWar: 16 batches. Total 639551 parts.


'\n \n'

In [31]:
# Marginalia 
from tqdm import tqdm 

era = "CivilWar"
parts = {}
for fp in tqdm(os.listdir(f"/Users/amycweng/DH/SERMONS_APP/db/data/{era}")):
    if "margin" not in fp: continue 
    margin = pd.read_csv(f"/Users/amycweng/DH/SERMONS_APP/db/data/{era}/{fp}", header=None, names=["tcpID","sidx","nidx","original","standardized"])
    margin = margin.to_dict(orient="records")
    for m in margin:
        t = m["original"]
        tcpID, sidx, nidx = m["tcpID"], m["sidx"], m["nidx"]
        if isinstance(tcpID,int): 
            print(fp,m)
            break
        s = m["standardized"]
        if not isinstance(s,str): continue 
        if len(s) == 0: continue 
        if s not in parts: 
            parts[s] = ({},{}) # original, locations 
        parts[s][1][(tcpID,str(sidx),str(nidx))] = None 
        parts[s][0][t] = None 
for s,r in parts.items(): 
    parts[s] = [[str(k) for k in r[0]], list(r[1].keys())]

100%|██████████| 20/20 [00:03<00:00,  5.99it/s]


In [33]:
with open(f"../assets/unique_body/{era}_margin","w+") as file: 
    json.dump(parts,file)
''' 
CivilWar: Total 47941 parts.
'''
print(f"{era}: Total {len(parts)} parts.")

CivilWar: Total 47941 parts.
