# Prepare data for information retrieval fine-tuning 

## Map Bible verses to body segments with the relevant citations

In [3]:
import re,json,os
import pandas as pd 
from tqdm import tqdm 

In [7]:
def process(era,c_type="verse"):
    """Gather the body passages that carry, or sit next to, a citation in one era.

    Reads ``../assets/citations/{era}_citation_segments.json`` (citation label ->
    list of ``"tcpID,segment_index"`` strings), scans every file whose name
    contains ``"body"`` in the era's data folder, and writes
    ``../assets/relevant/{era}_citations.json``.  The output maps the value of
    CSV column 7 (presumably the standardized passage text -- confirm against
    the CSV schema) to ``(citations, originals, locations)``, each de-duplicated
    in first-seen order.

    Parameters
    ----------
    era : str
        Era name used in the input and output paths.
    c_type : str
        With ``"verse"`` (the default) the last two space-separated tokens of a
        label matching ``\\d+ \\d+`` are joined with ``"."``; any other value
        leaves the label as it is after hyphens are replaced by spaces.
    """
    with open(f"../assets/citations/{era}_citation_segments.json") as file:
        c_to_seg = json.load(file)

    # Invert the mapping: (tcpID, segment index) -> citation labels found there.
    seg_to_c = {}
    for raw_label, segments in c_to_seg.items():
        if "Ibidem" in raw_label:
            continue
        label = raw_label.replace("-", " ")
        if c_type == "verse" and re.search(r"\d+ \d+",label):
            tokens = label.split(" ")
            label = " ".join(tokens[:-2]) +" " + ".".join(tokens[-2:])
        for segment in segments:
            fields = segment.split(",")
            seg_to_c.setdefault((fields[0], int(fields[1])), []).append(label)
    print(era, len(c_to_seg),'citations',len(seg_to_c),"segments")

    relevant = {}
    for fp in tqdm(os.listdir(f"/Users/amycweng/DH/SERMONS_APP/db/data/{era}")):
        if "body" not in fp:
            continue
        text = pd.read_csv(f"/Users/amycweng/DH/SERMONS_APP/db/data/{era}/{fp}", header=None)
        for idx, tcpID in enumerate(text[0]):
            sidx = str(text[1][idx])
            here = int(sidx)

            # Prefer the segment itself, then the previous one, then the next
            # one; only the first of these that has citations is used.
            anchor = next(
                (cand for cand in (here, here - 1, here + 1)
                 if (tcpID, cand) in seg_to_c),
                None,
            )
            if anchor is None:
                continue

            s = text[7][idx]
            t = re.sub(r"\<\/i\>|\<NOTE\>|NONLATINALPHABET|\<i\>","",text[6][idx])
            t = re.sub(r"\s+"," ",t).strip(" ")
            if len(t.split(" ")) < 5:
                continue  # keep only passages of at least 5 words

            # Dicts with None values serve as insertion-ordered sets.
            citations, originals, locations = relevant.setdefault(s, ({}, {}, {}))
            locations[(tcpID,sidx)] = None
            originals[t] = None
            citations.update(dict.fromkeys(seg_to_c[(tcpID, anchor)]))

    relevant = {
        s: (list(citations), list(originals), list(locations))
        for s, (citations, originals, locations) in relevant.items()
    }
    print(len(relevant),'unique passages')
    with open(f"../assets/relevant/{era}_citations.json","w+") as file:
        json.dump(relevant, file)


In [8]:
# Load the era list from corpora.json and run process() on each entry
# (c_type is left at its default, "verse").
with open('../assets/corpora.json',"r") as file: 
    eras = json.load(file)
for era in eras: 
    process(era)

pre-Elizabethan 1255 citations 5335 segments


100%|██████████| 19/19 [00:01<00:00, 10.75it/s]


13180 unique passages
Elizabethan 18612 citations 66304 segments


100%|██████████| 19/19 [00:18<00:00,  1.05it/s]


157078 unique passages
Jacobean 28028 citations 128122 segments


100%|██████████| 22/22 [00:23<00:00,  1.05s/it]


282511 unique passages
Carolinian 29558 citations 125648 segments


100%|██████████| 33/33 [00:23<00:00,  1.41it/s]


279490 unique passages
CivilWar 24626 citations 76868 segments


100%|██████████| 29/29 [00:14<00:00,  1.97it/s]


182425 unique passages
Interregnum 29512 citations 137974 segments


  text = pd.read_csv(f"/Users/amycweng/DH/SERMONS_APP/db/data/{era}/{fp}", header=None)
100%|██████████| 28/28 [00:28<00:00,  1.02s/it]


313682 unique passages
CharlesII 36222 citations 260707 segments


100%|██████████| 30/30 [00:59<00:00,  1.98s/it]


600644 unique passages
JamesII 11290 citations 17852 segments


100%|██████████| 28/28 [00:04<00:00,  6.97it/s]


42781 unique passages
WilliamAndMary 24357 citations 160023 segments


100%|██████████| 32/32 [00:24<00:00,  1.32it/s]


333484 unique passages


## Get non-citation segments 

In [2]:
# Collect every location (third element of each entry) stored in the
# relevant-passage files whose name contains "CivilWar".
folder = "../assets"
locations = {}  # dict with None values, used as an insertion-ordered set
for name in os.listdir(f"{folder}/relevant"):
    if 'CivilWar' not in name:
        continue
    with open(f"{folder}/relevant/{name}","r") as handle:
        passages = json.load(handle)
    for entry in passages.values():
        # JSON turns the location tuples into lists; make them hashable again.
        locations.update(dict.fromkeys(tuple(place) for place in entry[2]))
len(locations)

218958

In [29]:
# Body Texts  
from tqdm import tqdm 

def _clean_original(raw):
    """Strip the <i>, </i>, <NOTE> and NONLATINALPHABET markers and collapse whitespace."""
    cleaned = re.sub(r"\<\/i\>|\<NOTE\>|NONLATINALPHABET|\<i\>", "", raw)
    return re.sub(r"\s+", " ", cleaned).strip(" ")


def _add_part(parts, standardized, original, location):
    """Record one CSV row under its standardized text; rows lacking one are ignored."""
    # The type check comes first so that skipped rows never reach the regex
    # (presumably empty CSV cells arrive as NaN floats -- confirm with the data).
    if not isinstance(standardized, str) or not standardized:
        return
    originals, locations = parts.setdefault(standardized, ({}, {}))
    locations[location] = None
    originals[_clean_original(original)] = None


def _as_lists(parts):
    """Convert the ordered-set dicts into JSON-serializable [originals, locations] lists."""
    return {
        s: [[str(k) for k in originals], list(locations)]
        for s, (originals, locations) in parts.items()
    }


def get_era(era, batch_size=40000,
            data_dir="/Users/amycweng/DH/SERMONS_APP/db/data",
            out_dir="../assets/unique_body"):
    """Write the unique body passages and marginal notes of one era to JSON.

    Body files (names containing ``"body"``) are read without a header; column
    7 is the grouping key, column 6 the original text, and columns 0 and 1 the
    location.  Margin files (names containing ``"margin"``) are read with the
    columns ``tcpID, sidx, nidx, original, standardized``.  Rows whose key is
    not a non-empty string are skipped.  Each key maps to
    ``[originals, locations]``, de-duplicated in first-seen order.

    Body passages are written in slices of ``batch_size`` keys to
    ``{out_dir}/{era}_{n}.json``; marginalia go to ``{out_dir}/{era}_margin.json``.

    Parameters
    ----------
    era : str
        Sub-folder of ``data_dir`` to read and prefix of the output files.
    batch_size : int
        Maximum number of passages per body output file.
    data_dir : str
        Folder that holds one sub-folder of CSV files per era.
    out_dir : str
        Folder that receives the JSON files.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    era_dir = f"{data_dir}/{era}"

    # Body texts
    parts = {}
    for fp in tqdm(os.listdir(era_dir)):
        if "body" not in fp:
            continue
        text = pd.read_csv(f"{era_dir}/{fp}", header=None)
        for item in text.to_dict(orient="records"):
            _add_part(parts, item[7], item[6], (item[0], str(item[1])))
    parts = _as_lists(parts)

    keys = list(parts)  # built once; slicing it gives each batch its keys
    for batch_num, start in enumerate(tqdm(range(0, len(keys), batch_size))):
        batch = {key: parts[key] for key in keys[start:start + batch_size]}
        with open(f"{out_dir}/{era}_{batch_num}.json", "w+") as file:
            # Dump only this slice, not the whole dictionary.
            json.dump(batch, file)

    print(f"{era}: Total {len(parts)} parts.")

    # Marginalia
    parts = {}
    for fp in tqdm(os.listdir(era_dir)):
        if "margin" not in fp:
            continue
        margin = pd.read_csv(
            f"{era_dir}/{fp}",
            header=None,
            names=["tcpID", "sidx", "nidx", "original", "standardized"],
        )
        for m in margin.to_dict(orient="records"):
            location = (m["tcpID"], str(m["sidx"]), str(m["nidx"]))
            _add_part(parts, m["standardized"], m["original"], location)
    parts = _as_lists(parts)
    with open(f"{out_dir}/{era}_margin.json", "w+") as file:
        json.dump(parts, file)
    print(f"{era} marginalia: Total {len(parts)} parts.")

In [27]:
# Write the unique body-text and marginalia JSON files for the pre-Elizabethan era.
get_era("pre-Elizabethan")

100%|██████████| 19/19 [00:02<00:00,  6.49it/s]
100%|██████████| 3/3 [00:05<00:00,  1.73s/it]


pre-Elizabethan: Total 109838 parts.


100%|██████████| 19/19 [00:00<00:00, 33.87it/s]

pre-Elizabethan marginalia: Total 7066 parts.





In [None]:
# Write the unique body-text and marginalia JSON files for the Elizabethan era.
get_era("Elizabethan")

In [None]:
# Write the unique body-text and marginalia JSON files for the Jacobean era.
get_era("Jacobean")

In [None]:
# Write the unique body-text and marginalia JSON files for the Carolinian era.
get_era("Carolinian")

In [None]:
# Write the unique body-text and marginalia JSON files for the Interregnum era.
get_era("Interregnum")