# Prepare data for information retrieval fine-tuning 

Map each Bible verse to the body-text segments that cite it.

In [55]:
import re,json,os
import pandas as pd 
def combine_punc_with_text(segment):
    """Re-attach punctuation that is separated from its neighbouring word.

    Applied in order:
      1. whitespace before one of ``, . ? ! ; : )`` is removed;
      2. whitespace after ``(`` is removed;
      3. every remaining whitespace run is collapsed to a single space.

    Leading/trailing whitespace is collapsed like any other run, not stripped.
    Returns the rewritten string.
    """
    rewrites = (
        (r'\s+([,.?!;:)])', r'\1'),  # "word ," -> "word,"
        (r'([(])\s+', r'\1'),        # "( word" -> "(word"
        (r"\s+", " "),               # normalise remaining whitespace runs
    )
    for pattern, replacement in rewrites:
        segment = re.sub(pattern, replacement, segment)
    return segment

# Per-source lookup tables used to rename books when verse ids are built.
#   * "KJV" / "Douay-Rheims": keyed by the hyphenated book name that read_bible
#     derives from the CSV `book` column; the value replaces that name.
#   * "LV": keyed by the three-character book code found in vuldat.txt; values
#     may contain spaces, which read_vulgate turns into hyphens.
# NOTE(review): the KJV entry "Epistle-of-Jeremiah" -> "Jeremiah" produces the
# same book name as the canonical book of Jeremiah, so verse ids from the two
# books may overwrite each other in read_bible -- confirm this is intended.
convert_book = {"KJV":
                        {"Wisdom-of-Solomon":"Wisdom","Prayer-of-Azariah":"Azariah",
                        "Song-of-Solomon":"Canticles","Epistle-of-Jeremiah":"Jeremiah",
                        "Prayer-of-Manasseh":"Manasseh","Acts-of-the-Apostles":"Acts"},
                "Douay-Rheims": 
                        {"Psalm":"Psalms","Song-of-Solomon":"Canticles"},
                # Vulgate book codes -> full book names
                "LV": {'Gen': "Genesis", 'Exo': "Exodus", 'Lev': "Leviticus", 
                        'Num': "Numbers", 'Deu': "Deuteronomy", 'Jos': "Joshua", 
                        'Jdg': "Judges", 'Rut': "Ruth", 'Sa1': "1 Samuel", 
                        'Sa2': "2 Samuel", 'Kg1': "1 Kings", 'Kg2': "2 Kings", 
                        'Ch1': "1 Chronicles", 'Ch2': "2 Chronicles", 'Ezr': "Ezra", 
                        'Neh': "Nehemiah", 'Tob': "Tobit", 'Jdt': "Judith", 
                        'Est': "Esther", 'Job': "Job", 'Psa': "Psalms", 
                        'Pro': "Proverbs", 'Ecc': "Ecclesiastes", 'Sol': "Canticles", 
                        'Wis': "Wisdom", 'Sir': "Ecclesiasticus", 'Isa': "Isaiah", 
                        'Jer': "Jeremiah", 'Lam': "Lamentations", 'Bar': "Baruch", 
                        'Eze': "Ezekiel", 'Dan': "Daniel", 'Hos': "Hosea", 
                        'Joe': "Joel", 'Amo': "Amos", 'Oba': "Obadiah", 
                        'Jon': "Jonah", 'Mic': "Micah", 'Nah': "Nahum", 
                        'Hab': "Habakkuk", 'Zep': "Zephaniah", 'Hag': "Haggai", 
                        'Zac': "Zechariah", 'Mal': "Malachi", 'Ma1': "1 Maccabees", 
                        'Ma2': "2 Maccabees", 'Mat': "Matthew", 'Mar': "Mark", 
                        'Luk': "Luke", 'Joh': "John", 'Act': "Acts", 
                        'Rom': "Romans", 'Co1': "1 Corinthians", 'Co2': "2 Corinthians", 
                        'Gal': "Galatians", 'Eph': "Ephesians", 'Phi': "Philippians", 
                        'Col': "Colossians", 'Th1': "1 Thessalonians", 'Th2': "2 Thessalonians", 
                        'Ti1': "1 Timothy", 'Ti2': "2 Timothy", 'Tit': "Titus", 
                        'Plm': "Philemon", 'Heb': "Hebrews", 'Jam': "James", 
                        'Pe1': "1 Peter", 'Pe2': "2 Peter", 'Jo1': "1 John", 
                        'Jo2': "2 John", 'Jo3': "3 John", 'Jde': "Jude", 
                        'Rev': "Revelation"}
                }

def read_bible(ver):
    """Load one Bible translation from ``../assets/bible/{ver}.csv``.

    The CSV must provide the columns ``doc_id``, ``book``, ``text`` and ``part``.

    Args:
        ver: translation name; used both as the CSV file stem and as the key
            into ``convert_book`` for book renames.

    Returns:
        A pair ``(bible_dict, bible_books)``:
          * ``bible_dict`` maps a hyphen-separated verse id to the verse text;
          * ``bible_books`` maps each (possibly renamed) book to the ``part``
            value of the last row seen for that book.
    """
    frame = pd.read_csv(f'../assets/bible/{ver}.csv')
    verse_texts, book_parts = {}, {}
    rows = zip(frame['doc_id'], frame['book'], frame['text'], frame['part'])
    for raw_id, raw_book, verse_text, part in rows:
        # Drop everything from the "(KJV)" marker onwards, then turn whitespace
        # and colons into hyphens.
        # NOTE(review): the marker is "(KJV)" for every `ver` -- confirm the
        # Douay-Rheims doc_id column needs no equivalent stripping.
        verse_key = raw_id.split("(KJV)")[0].strip(" ")
        verse_key = re.sub(r"[\s\:]", "-", verse_key)
        book_key = raw_book.replace(" ", "-")
        renamed = convert_book[ver].get(book_key)
        if renamed is not None:
            # Book name is used as a regex pattern here; the current
            # convert_book keys contain only letters and hyphens.
            verse_key = re.sub(book_key, renamed, verse_key)
            book_key = renamed
        verse_texts[verse_key] = verse_text
        book_parts[book_key] = part
    return verse_texts, book_parts


# Load the two English translations. Each call returns
# (verse id -> verse text, book name -> `part` column value).
kjv, kjv_books = read_bible("KJV")
drv, drv_books = read_bible("Douay-Rheims")


def read_vulgate():
    """Load the Latin Vulgate from ``../assets/bible/vuldat.txt``.

    Each non-blank line is expected to hold ``book|chapter|verse|text``.
    Book codes listed in ``convert_book["LV"]`` are replaced by their full
    names, with spaces turned into hyphens; other codes are kept as-is.

    Returns:
        A pair ``(bible_dict, bible_books)``:
          * ``bible_dict`` maps ``"<book>-<chapter>-<verse>"`` to the verse
            text, without the trailing line terminator;
          * ``bible_books`` maps every book name seen to ``None`` (no part
            label is assigned for the Vulgate).

    Raises:
        ValueError: if a non-blank line has fewer than four ``|`` fields.
    """
    bible_dict = {}
    bible_books = {}
    # NOTE(review): the file is opened with the platform default encoding --
    # confirm that matches how vuldat.txt is encoded.
    with open("../assets/bible/vuldat.txt") as file:
        for line in file:
            line = line.rstrip("\n")  # keep the newline out of the stored text
            if not line.strip():
                continue  # tolerate blank lines (e.g. at end of file)
            # maxsplit=3 keeps any "|" inside the verse text intact.
            book, chapter, verse, text = line.split("|", 3)
            if book in convert_book["LV"]:
                book = convert_book["LV"][book].replace(" ", "-")
            verse_id = "-".join([book, chapter, verse])
            bible_dict[verse_id] = text
            bible_books[book] = None
    return bible_dict, bible_books

# Load the Latin Vulgate: lv maps verse id -> text, lv_books holds the book names.
lv, lv_books = read_vulgate()

In [8]:
# Build `relevant`: verse citation -> de-duplicated body segments that cite it.
ERAS_TO_PROCESS = ["CivilWar"]   # eras under ../assets/processed to include
MIN_SEGMENT_WORDS = 10           # segments shorter than this are discarded

relevant = {}
# Directory listings are sorted so that the result order is reproducible.
for era in sorted(os.listdir('../assets/processed')):
    if era == ".DS_Store": continue
    if era not in ERAS_TO_PROCESS: continue
    with open(f"../assets/citations/{era}_verse_citation_segments.json") as file:
        c_to_seg = json.load(file)

    # Invert verse -> segment refs into (text id, segment index) -> verses.
    # A segment ref is a comma-separated string; only its first two fields are used.
    seg_to_c = {}
    for verse, segments in c_to_seg.items():
        if "Ibidem" in verse: continue
        for s in segments:
            fields = s.split(",")
            seg_to_c.setdefault((fields[0], fields[1]), []).append(verse)
    print(era, len(c_to_seg),'verses',len(seg_to_c),"segments")

    for prefix in sorted(os.listdir(f"../assets/processed/{era}/sub-segments")):
        if prefix == ".DS_Store": continue
        with open(f"../assets/processed/{era}/sub-segments/{prefix}",'r') as file:
            s_ids, s_text, _ = json.load(file)

        for idx, s_id in enumerate(s_ids):
            tcpID, sidx, nidx = s_id[0][0], str(s_id[0][1]), s_id[0][2]
            if nidx >= 0: continue  # only body content
            verses = seg_to_c.get((tcpID, sidx))
            if not verses: continue  # must have a verse citation
            s = combine_punc_with_text(s_text[idx])
            # The length check does not depend on the verse, so do it once.
            if len(s.split(" ")) < MIN_SEGMENT_WORDS: continue
            for c in verses:
                relevant.setdefault(c, []).append(s)

# De-duplicate per verse; sorting (instead of list(set(...))) keeps the order
# stable across kernel restarts, since string hashing is randomised per process.
relevant = {c: sorted(set(r)) for c, r in relevant.items()}
len(relevant), sum(len(v) for v in relevant.values())

# over two million across all eras 
# over one million across all pre-Restoration eras 
# over 700k across all pre-CW eras
# CW & IR: 706k  
# CW: 236465

CivilWar 15625 verses 16766 segments


(15417, 236465)

In [21]:
import sys
sys.path.append('../')
from lib.standardization import * 

In [58]:
# Sort every relevant segment by which Bible version(s) contain the verses it cites.
from tqdm import tqdm


def _citation_bucket(verse_id):
    """Return the has_citations key for verse_id, or None if no Bible has it.

    "All"      -- in KJV and in at least one of DRV / LV
    "KJV"      -- in KJV only
    "Catholic" -- not in KJV, in both DRV and LV
    "DRV"      -- in DRV only
    "LV"       -- in LV only
    """
    in_kjv, in_drv, in_lv = verse_id in kjv, verse_id in drv, verse_id in lv
    if in_kjv:
        return "All" if (in_drv or in_lv) else "KJV"
    if in_drv and in_lv:
        return "Catholic"
    if in_drv:
        return "DRV"
    if in_lv:
        return "LV"
    return None


all_parts = []
for r in relevant.values():
    all_parts.extend(r)
has_citations = {"All": [], "KJV":[], "DRV":[], "LV":[],"Catholic":[]} # Catholic is both DRV & LV

no_citations = []
progress = tqdm(all_parts)
count = 0  # number of (verse, segment) pairs recorded so far; shown on the progress bar
for r in progress:
    progress.set_description(str(count))
    cited = extract_citations(r)
    if len(cited[0]) > 0:
        # Segments whose extracted citations match no known verse are recorded
        # in neither has_citations nor no_citations.
        for clist in cited[0].values():
            for c in clist:
                if "." not in c: continue
                # Convert "." and " " separators to the hyphenated verse-id form.
                c = c.replace(".", "-").replace(" ", "-")
                bucket = _citation_bucket(c)
                if bucket is not None:
                    has_citations[bucket].append((c, r))
                    count += 1
    else:
        no_citations.append(r)
print({k:len(v) for k,v in has_citations.items()})

12633: 100%|██████████| 236465/236465 [11:52<00:00, 331.92it/s]


{'All': 12354, 'KJV': 243, 'DRV': 0, 'LV': 36, 'Catholic': 0}


In [59]:
# Persist both collections as a two-element JSON array: [has_citations, no_citations].
relevant_path = "../assets/relevant/CivilWar.json"
with open(relevant_path, "w+") as out_file:
    json.dump((has_citations, no_citations), out_file)

In [45]:
# Collect the distinct book names: a key of `relevant` minus its last two
# hyphen-separated fields.
books = set()
for verse_id in relevant:
    book_name = "-".join(verse_id.split("-")[:-2])
    books.add(book_name)