# Setup

In [1]:
import os, json,re
from tqdm import tqdm 

## Bible books 

In [2]:
# Maps the three-character book abbreviations used in the STEPBible data
# files to the full book names used as keys throughout this notebook.
# Each abbreviation appears exactly once (the original literal listed 'Mrk'
# twice with the same value).
corrected_books = {
    # Old Testament
    'Gen': 'Genesis',
    'Exo': 'Exodus',
    'Lev': 'Leviticus',
    'Num': 'Numbers',
    'Deu': 'Deuteronomy',
    'Jos': 'Joshua',
    'Jdg': 'Judges',
    'Rut': 'Ruth',
    '1Sa': '1 Samuel',
    '2Sa': '2 Samuel',
    '1Ki': '1 Kings',
    '2Ki': '2 Kings',
    '1Ch': '1 Chronicles',
    '2Ch': '2 Chronicles',
    'Ezr': 'Ezra',
    'Neh': 'Nehemiah',
    'Est': 'Esther',
    'Job': 'Job',
    'Psa': 'Psalms',
    'Pro': 'Proverbs',
    'Ecc': 'Ecclesiastes',
    'Sng': 'Canticles',
    'Isa': 'Isaiah',
    'Jer': 'Jeremiah',
    'Lam': 'Lamentations',
    'Ezk': 'Ezekiel',
    'Dan': 'Daniel',
    'Hos': 'Hosea',
    'Jol': 'Joel',
    'Amo': 'Amos',
    'Oba': 'Obadiah',
    'Jon': 'Jonah',
    'Mic': 'Micah',
    'Nam': 'Nahum',
    'Hab': 'Habakkuk',
    'Zep': 'Zephaniah',
    'Hag': 'Haggai',
    'Zec': 'Zechariah',
    'Mal': 'Malachi',
    # New Testament
    'Mat': 'Matthew',
    'Mrk': 'Mark',
    'Luk': 'Luke',
    'Jhn': 'John',
    'Act': 'Acts',
    'Rom': 'Romans',
    '1Co': '1 Corinthians',
    '2Co': '2 Corinthians',
    'Gal': 'Galatians',
    'Eph': 'Ephesians',
    'Php': 'Philippians',
    'Col': 'Colossians',
    '1Th': '1 Thessalonians',
    '2Th': '2 Thessalonians',
    '1Ti': '1 Timothy',
    '2Ti': '2 Timothy',
    'Tit': 'Titus',
    'Phm': 'Philemon',
    'Heb': 'Hebrews',
    'Jas': 'James',
    '1Pe': '1 Peter',
    '2Pe': '2 Peter',
    '1Jn': '1 John',
    '2Jn': '2 John',
    '3Jn': '3 John',
    'Jud': 'Jude',
    'Rev': 'Revelation',
    # Apocrypha / deuterocanonical books
    'Sus': "Susanna",
    'Bel': "Bel and the Dragon",
    'Bar': 'Baruch',
    'Sir': 'Ecclesiasticus',
    'Wis': 'Wisdom',
    '1Ma': '1 Maccabees',
    '2Ma': '2 Maccabees',
    '1Es': "1 Esdras",
    '2Es': '2 Esdras',
    '3Es': '3 Esdras',
    '4Es': '4 Esdras',
    'Tob': 'Tobit',
    'Jdt': 'Judith',
    '4Ma': '4 Maccabees',
    'Man': 'Manasseh',
    'S3Y': "Song of the Three Young Men",
    # NOTE(review): 'Oda' resolves to the same name as 'Oba'; confirm this is
    # intended and not a placeholder for a different book.
    'Oda': 'Obadiah',
}

# Versification

## Get verse mappings

In [6]:
def convert_test(expression):
    """Parse one versification test expression into ``(start, end, operation)``.

    References written as ``Abbr.C:V`` (optionally ``Abbr.C:V.S``) are first
    rewritten to ``Full Book C.V`` via ``corrected_books``; abbreviations that
    are not in the table are kept as written.

    Args:
        expression: the text of a single test cell.

    Returns:
        A 3-tuple:

        * ``lhs=rhs``  -> ``(lhs, None, op)`` where ``op`` is ``'LAST'`` when
          ``rhs`` contains ``Last``, ``'E'`` when ``rhs`` is exactly ``Exist``
          (ignoring surrounding whitespace), otherwise ``'NE'``.
        * ``lhs>rhs``  -> ``(lhs, rhs, 'GT')``
        * ``lhs<rhs``  -> ``(lhs, rhs, 'LT')``
        * no operator  -> ``(expression, None, 'E')``

        ``lhs``/``rhs`` are stripped of surrounding whitespace.
    """
    def replace_match(match):
        abbrev = match.group(1)
        chapter = match.group(2)
        verse = match.group(3)
        full_book = corrected_books.get(abbrev, abbrev)
        return f"{full_book} {chapter}.{verse}"

    expression = re.sub(r'([\d\w]+)\.(\d+):(\d+(\.\d+)?)', replace_match, expression)

    # '=' is checked first, so an expression containing both '=' and a
    # comparison sign is treated as an existence/last test.
    if '=' in expression:
        parts = expression.split('=')
        # Strip before comparing: an un-stripped " Exist" used to fall through
        # to 'NE', i.e. the opposite of what the cell says.
        condition = parts[1].strip()
        if 'Last' in condition:
            operation = 'LAST'
        elif condition == 'Exist':
            operation = 'E'
        else:
            operation = 'NE'
        start = parts[0].strip()
        end = None
    elif '>' in expression:
        parts = expression.split('>')
        start = parts[0].strip()
        end = parts[1].strip()
        operation = 'GT'
    elif '<' in expression:
        parts = expression.split('<')
        start = parts[0].strip()
        end = parts[1].strip()
        operation = 'LT'
    else:
        start = expression.strip()
        end = None
        operation = 'E'

    return (start, end, operation)

In [7]:
# Read the TVTMS versification table into memory, one string per line.
# The encoding is given explicitly so the result does not depend on the
# platform default.
# NOTE(review): assumes the file is UTF-8 encoded — confirm against the
# downloaded copy.
with open("../../Bibles/TVTMS - Translators Versification Traditions with Methodology for Standardisation for Eng+Heb+Lat+Grk+Others - STEPBible.org CC BY.txt", encoding="utf-8") as file:
    data = file.readlines()

# Parser state shared by the loop below.
in_section = False    # True while the current header block should be parsed
tradition_tests = {}  # "Book;start;end" -> {tradition column: [parsed tests]}
mapping = {}          # KJV verse id -> {tradition column: verse id or None}
test_key = None       # key of the header block currently being filled
def convert_name(item):
    """Turn an ``Abbr.C:V`` cell into ``"Full Book C.V"``.

    Returns ``None`` when the cell contains one of the markers ``Ps2``,
    ``Esg``, ``LJe`` or ``Absent``, or when it has no ``.`` at all.
    Only the first two dot-separated fields are used; any further fields
    are dropped. Raises ``KeyError`` if the abbreviation is not in
    ``corrected_books``.
    """
    skipped_markers = ("Ps2", 'Esg', "LJe", "Absent")
    if any(marker in item for marker in skipped_markers):
        return None
    if "." not in item:
        return None
    fields = item.split(".")
    full_book = corrected_books[fields[0]]
    chapter_verse = re.sub(":",".",fields[1])
    return f"{full_book} {chapter_verse}"

# Single pass over the TVTMS lines. Header lines ("$Book.chapter[:verse]...")
# open a new entry in tradition_tests whose keys are the header's remaining
# tab-separated columns; the "TEST: " and mapping rows that follow are stored
# under that entry (tests) or in `mapping` (verse correspondences keyed by the
# "English KJV" column).
for idx, line in enumerate(data): 
    # Hard-coded cutoff: nothing at line index 3718 or later is parsed.
    if idx >= 3718: break
    line = line.strip()
    if not line:
        continue
    # This particular header switches parsing off until the next accepted header.
    if re.search(r'\$Rom.14:22-23', line): 
        in_section =False
        continue 
    if re.search(r"\$[\d\w]+\.\d+[\:]*",line):
        # Only header rows that mention "English" are used.
        if "English" not in line: continue 
        # From here on `line` is a list of columns, not a string, for the rest
        # of this iteration (including the "TEST: " / "To" checks below, which
        # then test list membership rather than substrings).
        line = line.split("\t")
        key = line[0]
        found_new_test_key = False
        if ":" not in key: # chapter-only header, e.g. "$Dan.13"
            # NOTE(review): unlike the two branches below, this one neither
            # checks `book` against corrected_books nor sets in_section, so the
            # previous value of in_section carries over — confirm intended.
            book, start = key.strip("$#").split(".")
            book = corrected_books[book]
            test_key = ";".join([book, start, start])
            tradition_tests[test_key] = {k:[] for k in line[1:]} 
            found_new_test_key=True
        elif "-" in key: # range header (may cross a chapter division)
            if "--" in key: 
                start, end = key.strip("$#").split("--")
            else: 
                start, end = key.strip("$#").split("-")
            # print(start,end)
            book,start = start.split(".")[:2]
            # Unknown book abbreviation: ignore this block entirely.
            if book not in corrected_books: 
                in_section = False 
                continue 
            in_section = True 
            book = corrected_books[book]
            # "C:V" -> "C.V" for both ends of the range.
            start = re.sub(":",".",start)            
            end = re.sub(":",".", end)
            test_key = ";".join([book, start, end])
            found_new_test_key=True
            tradition_tests[test_key] = {k:[] for k in line[1:]} 
        else: 
            # Single-verse header: start and end of the key are the same.
            key = key.strip("$#$")
            book,start = key.split(".")[:2]
            if book not in corrected_books: 
                in_section = False 
                continue 
            in_section = True 
            book = corrected_books[book]
            start = re.sub(":",".",start)
            test_key = ";".join([book, start, start])
            found_new_test_key=True
            tradition_tests[test_key] = {k:[] for k in line[1:]} 
        
        # Every branch above either sets the flag or `continue`s, so this is
        # a safety net only.
        if not found_new_test_key: 
            in_section = False 

    if "TEST: " in line and in_section: 
        # Test row: one expression per tradition column, in header order.
        line = line.split("\t")[1:]
        # Rows whose column count does not match the header are reported and skipped.
        if len(line) != len(tradition_tests[test_key]): 
            print(test_key)
            # print(tradition_tests[test_key])
            print(line)
            continue 
        # Note: this rebinds `idx`; the outer loop re-assigns it on its next iteration.
        for idx, key in enumerate(tradition_tests[test_key]):
            test = line[idx]
            tradition_tests[test_key][key].append(convert_test(line[idx]))
    elif ("To" in line or "Verse" in line or "TextMayBeMissing" in line) and in_section: 
        # Mapping row: one verse reference per tradition column.
        line = line.split("\t")[1:]
        if len(line) != len(tradition_tests[test_key]): continue 
        # The mapping is keyed by the reference in the "English KJV" column.
        verse_key = None 
        for idx, key in enumerate(tradition_tests[test_key]): 
            if  "English KJV" in key: 
                verse_key = convert_name(line[idx])
        if verse_key is None: continue
        if "Title" in verse_key: continue  
        for idx, key in enumerate(tradition_tests[test_key]): 
            if verse_key not in mapping: 
                mapping[verse_key] = {}
            if "Absent [=" in line[idx]: 
                # "Absent [=Ref]": use the bracketed reference as the target.
                mapping[verse_key][key] = convert_name(line[idx].split("[=")[-1].strip("]"))
            elif 'NoVerse' in line[idx] or "Absent" in line[idx]: 
                mapping[verse_key][key] = None
            else: 
                mapping[verse_key][key] = convert_name(line[idx])

def expand_references(reference_list, max_verse=151):
    """Expand ``[book, start, end]`` into every ``"book chapter.verse"`` in the span.

    The references are generated arithmetically; they are not checked
    against any Bible text, so chapters that are not the last one in the
    span always contribute verses up to ``max_verse``.

    Args:
        reference_list: ``[book, "C.V", end]`` where ``end`` is either
            ``"C.V"`` or a bare verse number within the start chapter.
        max_verse: last verse generated for every chapter except the final
            one of the span. Defaults to 151, the value that was previously
            hard-coded. Note this is lower than the longest biblical chapter
            (Psalm 119, 176 verses); pass a larger value when a span runs
            through it.

    Returns:
        List of reference strings in ascending order; empty when the end
        lies before the start.
    """
    book = reference_list[0]
    start_ref = reference_list[1]
    end_ref = reference_list[2]

    start_chapter, start_verse = map(int, start_ref.split('.'))
    if "." in end_ref:
        end_chapter, end_verse = map(int, end_ref.split('.'))
    else:
        # A bare end verse stays in the start chapter.
        end_chapter, end_verse = start_chapter, int(end_ref)

    result = []
    for chapter in range(start_chapter, end_chapter + 1):
        first = start_verse if chapter == start_chapter else 1
        last = end_verse if chapter == end_chapter else max_verse
        result.extend(f'{book} {chapter}.{verse}' for verse in range(first, last + 1))
    return result

# Flatten `mapping`: keys that denote a verse range ("Book C.V-C.V") are
# expanded to one entry per verse; each tradition's target is expanded the
# same way and paired with the source verses position by position.
final_mapping = {}
for v_id, mlist in mapping.items():
    if "-" not in v_id:
        # Single verse: copy the per-tradition targets as they are.
        final_mapping.setdefault(v_id, {}).update(mlist)
        continue

    source_tokens = v_id.split()
    cv1, cv2 = source_tokens[-1].split("-")
    refs = expand_references([" ".join(source_tokens[:-1]), cv1, cv2])
    for ref in refs:
        final_mapping.setdefault(ref, {})

    for trad, v_id2 in mlist.items():
        if v_id2 is None:
            # Verse is absent in this tradition for the whole range.
            targets = [None] * len(refs)
        else:
            target_tokens = v_id2.split()
            target_book = " ".join(target_tokens[:-1])
            bounds = target_tokens[-1].split("-")
            if len(bounds) == 2:
                # Range -> range: pair by position; surplus source verses
                # keep whatever value they already had.
                targets = expand_references([target_book, bounds[0], bounds[1]])
            else:
                # Range -> single reference: every source verse points to it.
                targets = [v_id2] * len(refs)
        for ref, target in zip(refs, targets):
            final_mapping[ref][trad] = target
mapping = final_mapping
len(tradition_tests), len(mapping)

(429, 7287)

In [8]:
# Show the test blocks whose key mentions Daniel or one of its additions.
for key, tests in tradition_tests.items():
    if any(name in key for name in ("Susanna", "Daniel", "Bel and the Dragon")):
        print(key, tests)

Daniel;3.23;4.3 {'English KJV': [('Daniel 3.30', None, 'LAST')], 'Hebrew': [('Daniel 3.33', None, 'LAST')], 'Latin*': [('Daniel 3.100', None, 'LAST')], 'Greek*': [('Daniel 3.97', None, 'LAST')]}
Daniel;4.4;37 {'English KJV': [('Daniel 4.37', None, 'LAST'), ('', None, 'E'), ('Daniel 3.30', None, 'LAST')], 'Hebrew': [('Daniel 4.34', None, 'LAST'), ('', None, 'E'), ('Daniel 3.33', None, 'LAST')], 'Latin': [('Daniel 4.34', None, 'LAST'), ('', None, 'E'), ('Daniel 3.100', None, 'LAST')], 'Greek': [('Daniel 4.34', None, 'LAST'), ('Daniel 4.34.3', None, 'E'), ('Daniel 3.97', None, 'LAST')], 'Greek2': [('Daniel 4.37', None, 'LAST'), ('Daniel 4.37.3', None, 'E'), ('Daniel 3.97', None, 'LAST')], 'Greek Undivided': [('Daniel 4.34', None, 'LAST'), ('Daniel 4.34.3', None, 'NE'), ('Daniel 3.97', None, 'LAST')], 'Greek2 Undivided': [('Daniel 4.37', None, 'LAST'), ('Daniel 4.37.3', None, 'NE'), ('Daniel 3.97', None, 'LAST')]}
Daniel;5.31;6.28 {'English KJV': [('Daniel 6.28', None, 'LAST')], 'Hebrew': 

In [9]:
# Show the test blocks for 3 John.
for key, tests in tradition_tests.items():
    if "3 John;" in key:
        print(key, tests)

3 John;1.14;1.14 {'English KJV': [('3 John 1.14', None, 'LAST')], 'Greek+NRSV': [('3 John 1.15', None, 'LAST')], 'Greek2': [('3 John 1.14', None, 'LAST')]}


In [10]:
tradition_tests['Canticles;6.1;7.13'], mapping['Canticles 6.13']

({'English KJV': [('Canticles 7.13', None, 'LAST'),
   ('Canticles 6.13', None, 'LAST')],
  'Hebrew': [('Canticles 7.14', None, 'LAST'),
   ('Canticles 6.12', None, 'LAST')],
  'Latin': [('Canticles 7.13', None, 'LAST'),
   ('Canticles 6.12', None, 'LAST')],
  'Greek': [('Canticles 7.14', None, 'LAST'),
   ('Canticles 6.12', None, 'LAST')]},
 {'English KJV': 'Canticles 6.13',
  'Hebrew': 'Canticles 7.1',
  'Latin': 'Canticles 6.12',
  'Greek': 'Canticles 7.1'})

In [11]:
# Persist both structures as one two-element JSON array:
# [tradition_tests, mapping]. The (start, end, operation) tuples inside
# tradition_tests are written as JSON arrays.
with open("../../Bibles/versification.json",'w+') as file: 
    json.dump([tradition_tests, mapping],file)

## Get Parallel Verses

In [3]:
# Load every Bible version into one dict:
#   bible["<verse id> (<version>)"]        -> full verse text (verses under 200 tokens)
#   bible["<verse id> (<version>) - <n>"]  -> n-th sentence-like part of the verse
bible = {}
import pandas as pd
b_versions = ['AKJV','Geneva', 'ODRV','Douay-Rheims', 'Tyndale', 'Wycliffe','Vulgate']
for bname in b_versions:
    data = pd.read_csv(f"../../Bibles/{bname}.csv",header=None)
    data = data.to_dict(orient="records")
    for entry in tqdm(data):
        key = entry[0]   # column 0: verse id including the "(version)" suffix
        text = entry[6]  # column 6: verse text
        # ODRV is loaded before Douay-Rheims; a Douay-Rheims verse is skipped
        # when the same verse id already exists under ODRV.
        if re.search("Douay-Rheims",key):
            if re.sub("Douay-Rheims","ODRV",key) in bible: continue
        # Whole verses are stored only when shorter than 200 space-separated tokens.
        if len(text.split(" ")) < 200:
            bible[key] = text

        # Split after ". "/"? " followed by a capital, and after "!", ":" or ";".
        parts = re.split(r'(?<=[\.\?]) (?=[A-Z])|(?<=[\!\:\;])', text)
        parts = [re.sub(r'\s+', ' ', p).strip() for p in parts if len(p.strip(" ")) > 0]

        # Nothing to add for verses with fewer than two parts. This also covers
        # a blank verse, which previously raised IndexError on parts[0].
        if len(parts) < 2:
            continue
        # Skip the whole verse when its first or last part is very short or the
        # first part contains an HTML-style entity (&...;).
        if (len(parts[0].split(" ")) <= 5
                or len(parts[-1].split(" ")) <= 5
                or re.search(r"\&\w+\;",parts[0])):
            continue
        for pidx, p in enumerate(parts):
            if len(p) == 0: continue
            if re.search(r"\&\w+\;",p) or len(p.split(" ")) <= 5: continue
            bible[f"{key} - {pidx}"] = p
bible_verses = list(bible.values())
bible_ids = list(bible.keys())
id_to_idx = {v_id:idx for idx, v_id in enumerate(bible_ids)}
len(bible_verses)

100%|██████████| 36702/36702 [00:01<00:00, 25015.54it/s]
100%|██████████| 31090/31090 [00:01<00:00, 26828.48it/s]
100%|██████████| 14736/14736 [00:00<00:00, 27352.72it/s]
100%|██████████| 35811/35811 [00:00<00:00, 37786.47it/s]
100%|██████████| 7954/7954 [00:00<00:00, 30214.86it/s]
100%|██████████| 9622/9622 [00:00<00:00, 28633.60it/s]
100%|██████████| 35809/35809 [00:01<00:00, 27563.55it/s]


276624

In [13]:
def fix_name(v_id):
  """Rename a book reference from the Samuel/Kings/Chronicles scheme to the
  Kings(1-4)/Paralipomenon scheme.

  The first matching rule wins: "1 Kings" -> "3 Kings", "2 Kings" -> "4 Kings",
  "1 Samuel" -> "1 Kings", "2 Samuel" -> "2 Kings". For these, everything up
  to and including the last occurrence of the old name is replaced. Otherwise,
  ids starting with "<digits> Chronicles" have "Chronicles" replaced by
  "Paralipomenon". Anything else is returned unchanged. The function is not
  idempotent: applying it twice to a Samuel reference yields 3/4 Kings.
  """
  renames = (
    ("1 Kings", "3 Kings"),
    ("2 Kings", "4 Kings"),
    ("1 Samuel", "1 Kings"),
    ("2 Samuel", "2 Kings"),
  )
  for old_name, new_name in renames:
    if old_name in v_id:
      return new_name + v_id.split(old_name)[-1]
  if re.search(r"^\d+ Chronicles",v_id):
    return re.sub(r"Chronicles","Paralipomenon",v_id)
  return v_id

def expand_test_id(reference_list, version):
    """List the verse ids of ``version`` inside a span that exist in ``bible``.

    ``reference_list`` is ``[book, start, end]``: ``start`` is ``"C.V"`` and
    ``end`` is ``"C.V"`` or a bare verse number in the start chapter. Leading
    or trailing ``a``/``b``/``c`` characters on either are stripped. For the
    Douay-Rheims, Vulgate and ODRV versions the book name is passed through
    ``fix_name`` first. Chapters other than the last are scanned up to verse
    151. Returns ids of the form ``"book C.V (version)"``, in order.
    """
    book = reference_list[0]
    if version in ('Douay-Rheims','Vulgate','ODRV'):
        book = fix_name(book)
    start_ref, end_ref = (reference_list[pos].strip("abc") for pos in (1, 2))

    first_chapter, first_verse = (int(n) for n in start_ref.split('.'))
    if "." in end_ref:
        last_chapter, last_verse = (int(n) for n in end_ref.split('.'))
    else:
        last_chapter, last_verse = first_chapter, int(end_ref)

    found = []
    for chapter in range(first_chapter, last_chapter + 1):
        low = first_verse if chapter == first_chapter else 1
        high = last_verse if chapter == last_chapter else 151
        candidates = (f'{book} {chapter}.{verse} ({version})' for verse in range(low, high + 1))
        # Keep only ids that are actually present in the loaded texts.
        found.extend(candidate for candidate in candidates if candidate in bible)
    return found

def normalize_cv(cv):
    """Drop everything from the first ``*`` on and keep at most the first two
    dot-separated fields (``"C.V.S*2"`` -> ``"C.V"``)."""
    head, _, _ = cv.partition("*")
    return ".".join(head.split(".", 2)[:2])

def E(book, cv, version):
    """Existence test: True when ``"book C.V (version)"`` is a key of ``bible``
    (``cv`` is normalised with ``normalize_cv`` first)."""
    chapter_verse = normalize_cv(cv)
    return f"{book} {chapter_verse} ({version})" in bible

def LENGTH(v1, version):
    """Word count of one verse, or of two verses joined with ``+``.

    Each term is a verse id optionally followed by ``*...``; a term that
    contains ``*2`` has its word count doubled. Returns the (summed) count,
    or ``False`` when any referenced verse is missing from ``bible`` for
    ``version``.
    """
    if "+" in v1:
        first, second = v1.split("+")
        terms = (first, second)
    else:
        terms = (v1,)

    refs = [f"{term.split('*')[0]} ({version})" for term in terms]
    if not all(ref in bible for ref in refs):
        return False
    return sum(
        len(bible[ref].split()) * (2 if "*2" in term else 1)
        for term, ref in zip(terms, refs)
    )

def LT(v1, v2, version):
    """True when both lengths are available (truthy) and ``v1`` is shorter
    than ``v2`` according to ``LENGTH``; False otherwise."""
    left, right = LENGTH(v1, version), LENGTH(v2, version)
    return bool(left and right) and left < right

def GT(v1, v2, version):
    """True when both lengths are available (truthy) and ``v1`` is longer
    than ``v2`` according to ``LENGTH``; False otherwise."""
    left, right = LENGTH(v1, version), LENGTH(v2, version)
    return bool(left and right) and left > right
    
def LAST(book, cv,  version):
    """True when ``book C.V`` exists in ``bible`` for ``version`` and the
    verse numbered one higher in the same chapter does not."""
    chapter, verse = (int(part) for part in normalize_cv(cv).split("."))
    this_verse = f"{book} {chapter}.{verse} ({version})"
    next_verse = f"{book} {chapter}.{verse + 1} ({version})"
    return this_verse in bible and next_verse not in bible

In [15]:
def verse_comparison(bcv_list,cv1,larger=True):
    """Pick the later (``larger=True``) or earlier of two chapter.verse values.

    ``bcv_list`` is ``[book, start, end]``. With ``larger=True`` the candidate
    from the list is its last element (a bare verse number is combined with
    the chapter of the element before it); with ``larger=False`` it is the
    second-to-last element. ``cv1`` is normalised with ``normalize_cv``; when
    its verse part is a range, the range end is used for the comparison only.
    ``a``/``b``/``c`` affixes are ignored when comparing. Returns one of the
    two chapter.verse strings; ties return the value taken from ``bcv_list``.
    """
    if larger and "." not in bcv_list[-1]:
        v = bcv_list[-1]
        c = bcv_list[-2].split(".")[0]
        cv = f"{c}.{v}"
    else:
        cv = normalize_cv(bcv_list[-1] if larger else bcv_list[-2])
        c, v = cv.split(".")

    cv1 = normalize_cv(cv1)
    c1, v1 = cv1.split(".")
    if "-" in v1:
        v1 = v1.split("-")[-1]

    # Compare as (chapter, verse) pairs.
    own = (int(c), int(v.strip('abc')))
    other = (int(c1), int(v1.strip("abc")))

    if larger:
        return cv1 if other > own else cv
    return cv1 if other < own else cv
# verse_comparison(['Job','40.1','41.34'], '39.30',False)

In [16]:
# Build PV: for every AKJV verse inside a versification test block, the list
# of corresponding verse ids in each version of b_versions.
# For each version, the traditions of the block are tried in order; the first
# tradition whose tests all pass supplies the verse mapping for that version.
PV = {}        # "<verse> (AKJV)" -> list of "<verse> (<version>)"
chap_trad = {} # ("<book> <chapter>", version) -> tradition last applied there

for ref, tests in tradition_tests.items(): 
    if "Title" in ref: continue

    # ref becomes [book, start, end].
    ref = ref.split(";")
    # Push the end of the span out to the furthest verse named by any test of
    # the English traditions.
    for trad, tlist in tests.items(): 
        if "Eng" not in trad: 
            continue
        for test in tlist: 
            if test[0] == "": continue 
            if test[1] is None: 
                if 'Title' in test[0]: continue 
                ref[-1] = verse_comparison(ref,test[0].split()[-1])
            else: 
                ref[-1] =  verse_comparison(ref,test[1].split()[-1])
    ref[-1] = normalize_cv(ref[-1])
    ref[-2] = normalize_cv(ref[-2])   

    # "C.V-W" -> "C.W": keep the chapter and the end of the verse range.
    if "-" in ref[-1]: 
        c,v = re.findall(r'(\d+).\d+-(\d+)',ref[-1])[0]
        ref[-1] = f"{c}.{v}"
    all_refs = expand_test_id(ref,'AKJV')
    for r in all_refs:  
        if r not in PV: 
            PV[r] = []
    
    for ver in b_versions: 
        for tradition, tlist in tests.items(): 
            passed = True 
            for test in tlist:  
                if test[0] == "": 
                    continue 
                if "Title" in test[0]: continue 
                # Tests on sub-verse references (C.V.S) are not evaluated.
                # Raw string: "\d" in a plain string literal is an invalid
                # escape sequence.
                if re.search(r"\d+\.\d+\.\d+", test[0]): 
                    continue 
                if len(test[0].split()) == 0: 
                    print(tradition, test, all_refs)
                cv = test[0].split()[-1]
                book = " ".join(test[0].split()[:-1])
                first, second = test[0], test[1]

                # These versions use the Kings(1-4)/Paralipomenon book names.
                if ver in ['Douay-Rheims','ODRV','Vulgate']: 
                    book = fix_name(book)
                    first = fix_name(test[0])

                if test[1] is not None: 
                    cv2 = test[1].split()[-1]
                    book2 = " ".join(test[1].split()[:-1])
                    if ver in ['Douay-Rheims','ODRV','Vulgate']: 
                        book2 = fix_name(book2)
                        second = fix_name(test[1])
                
                # `result` is assigned for the five operations produced by
                # convert_test (LAST, E, NE, LT, GT).
                if test[-1] == 'LAST':
                    result = LAST(book, cv, ver)
                elif test[-1] == "E": 
                    result = E(book, cv, ver)
                elif test[-1] == "NE": 
                    exists = E(book, cv, ver)
                    if exists: 
                        result = False 
                    else: 
                        result = True 
                elif test[-1] == "LT": 
                    result = LT(first,second, ver)
                elif test[-1] == "GT": 
                    result = GT(first,second, ver)
                
                if result == False: 
                    passed = False 
            # Hand-written overrides of the test outcome:
            #  * the Vulgate always matches the 'Latin' tradition;
            #  * Geneva/AKJV Psalms accept 'EngTitleMerged' even when a test failed;
            #  * Wycliffe Numbers matches 'Latin' and nothing else.
            if 'Vulgate' == ver and tradition == 'Latin':
                passed = True 
            elif ver in ['Geneva','AKJV'] and tradition == 'EngTitleMerged' and ref[0] == 'Psalms' and passed is False: 
                passed =True 
            elif ver in ['Wycliffe'] and ref[0] == 'Numbers':
                if tradition != 'Latin': 
                    passed = False
                else: 
                    passed = True 
            
            if passed: 
                for r in all_refs:
                    new_key = r.split(" (")[0]
                    if new_key not in mapping: 
                        continue 
                    if tradition not in mapping[new_key]: 
                        continue 
                    new_id = mapping[new_key][tradition]
                    if new_id is None: 
                        continue 
                    
                    if "-" in new_id: 
                        # Range target: expand_test_id renames the book itself
                        # and returns only ids present in `bible`.
                        b = " ".join(new_id.split()[:-1])
                        cv1,cv2 = new_id.split()[-1].split("-")
                        new_ids = expand_test_id([b,cv1,cv2],ver) 
                    elif "; " in new_id: 
                        if ver in ['Douay-Rheims','ODRV','Vulgate']: 
                            new_id = fix_name(new_id)
                        b = " ".join(new_id.split()[:-1])
                        CVs = new_id.split()[-1].split("; ")
                        new_ids = []
                        for cv in CVs: 
                            new_ids.append(f"{b} {cv} ({ver})")
                    else: 
                        if ver in ['Douay-Rheims','ODRV','Vulgate']: 
                            new_id = fix_name(new_id)
                        new_ids = [new_id + f" ({ver})"]
                    
                    for new_id in new_ids: 
                        if new_id in bible:  
                            book_chap = new_id.split(".")[0]
                            chap_trad[(book_chap,ver)] = tradition 
                            PV[r].append(new_id)
                # First passing tradition wins for this version.
                break
print(len(PV))

8999


In [17]:
chap_trad[('Job 40','Geneva')]

'French NEG 1979 '

In [18]:
def chap_length(v_id,version):
    """Count the consecutive verses 1, 2, 3, ... of ``v_id``'s chapter that
    exist in ``bible`` for ``version``.

    Returns the number of the last verse before the first gap (0 when verse 1
    is missing). Returns ``None`` if verses 1-999 all exist.
    """
    chapter_prefix = v_id.split(".")[0]
    count = 0
    while count < 999:
        if f"{chapter_prefix}.{count + 1} ({version})" not in bible:
            return count
        count += 1
    return None
        
# Back-fill PV for the AKJV verses that the test-driven pass left without any
# parallel: use the tradition recorded for a neighbouring chapter when the
# verse has a mapping, otherwise fall back to same-numbered verses when the
# chapter lengths agree (or differ by one next to an already-mapped chapter end).
AKJV = [v_id for v_id in bible if "AKJV" in v_id and "-" not in v_id]
for v_id in AKJV:
    key = v_id.split(" (")[0]
    # Already has parallels from the previous cell.
    if v_id in PV:
        if len(PV[v_id]) > 0:
            continue

    if key in mapping:
        b,c = re.findall(r"([\w\d\s]+) (\d+)\.",key)[0]
        c = int(c)
        for ver in b_versions:
            # Look for a tradition recorded for the previous, same or next chapter.
            for i in range(c-1, c+2):
                trad_key = f"{b} {i}"
                if (trad_key,ver) not in chap_trad:
                    continue
                trad = chap_trad[(trad_key,ver)]
                # The verse may have no entry for this tradition, or be marked
                # absent (None); neither yields a parallel.
                new_id = mapping[key].get(trad)
                if new_id is None:
                    break
                if "-" in new_id:
                    # expand_test_id renames the book for Douay-style versions
                    # itself, so the un-renamed name is passed in. Renaming
                    # first applied fix_name twice (1 Samuel -> 1 Kings -> 3 Kings).
                    # A separate name is used so `b` keeps the AKJV book for
                    # the remaining versions.
                    target_book = " ".join(new_id.split()[:-1])
                    cv1,cv2 = new_id.split()[-1].split("-")
                    new_ids = expand_test_id([target_book,cv1,cv2],ver)
                else:
                    if ver in ['Douay-Rheims','ODRV','Vulgate']:
                        new_id = fix_name(new_id)
                    if "; " in new_id:
                        target_book = " ".join(new_id.split()[:-1])
                        CVs = new_id.split()[-1].split("; ")
                        new_ids = [f"{target_book} {cv} ({ver})" for cv in CVs]
                    else:
                        new_ids = [new_id + f" ({ver})"]
                for new_id in new_ids:
                    if new_id in bible:
                        PV.setdefault(v_id, []).append(new_id)
                # Only the first chapter with a known tradition is used.
                break
    else:
        for ver in b_versions:
            if ver in ['Douay-Rheims','ODRV','Vulgate']:
                new_key = fix_name(key)
            else:
                new_key = key
            candidate = f"{new_key} ({ver})"
            if candidate not in bible:
                continue
            AKJV_len = chap_length(key, 'AKJV')
            ver_len = chap_length(new_key, ver)
            if AKJV_len == ver_len:
                # Same chapter length: assume identical numbering.
                PV.setdefault(v_id, []).append(candidate)
            elif AKJV_len in (ver_len + 1, ver_len - 1):
                # Off by one: accept only when the last AKJV verse of the
                # chapter is already a key of PV.
                last_verse = key.split(".")[0] + "." + str(AKJV_len)
                if last_verse + " (AKJV)" in PV:
                    PV.setdefault(v_id, []).append(candidate)
print(len(PV))

36633


In [19]:
PV['1 Chronicles 11.22 (AKJV)']

['1 Chronicles 11.22 (AKJV)',
 '1 Chronicles 11.22 (Geneva)',
 '1 Paralipomenon 11.22 (Douay-Rheims)',
 '1 Paralipomenon 11.22 (Vulgate)']

In [20]:
PV['1 Samuel 20.42 (AKJV)']

['1 Samuel 20.42 (AKJV)',
 '1 Samuel 20.42 (Geneva)',
 '1 Samuel 20.43 (Geneva)',
 '1 Kings 20.42 (Douay-Rheims)',
 '1 Kings 20.43 (Douay-Rheims)',
 '1 Kings 20.42 (Vulgate)',
 '1 Kings 20.43 (Vulgate)']

In [21]:
PV['Judges 5.1 (AKJV)']

['Judges 5.1 (AKJV)',
 'Judges 5.1 (Geneva)',
 'Judges 5.1 (Douay-Rheims)',
 'Judges 5.1 (Vulgate)']

In [22]:
PV['Mark 4.41 (AKJV)']

['Mark 4.41 (AKJV)',
 'Mark 4.41 (Geneva)',
 'Mark 4.40 (ODRV)',
 'Mark 4.41 (Tyndale)',
 'Mark 4.41 (Wycliffe)',
 'Mark 4.40 (Vulgate)']

In [23]:
PV['Psalms 1.1 (AKJV)']

['Psalms 1.1 (AKJV)', 'Psalms 1.1 (Geneva)', 'Psalms 1.1 (Vulgate)']

In [24]:
PV['Judith 15.3 (AKJV)']

['Judith 15.3 (AKJV)',
 'Judith 15.3 (Douay-Rheims)',
 'Judith 15.4 (Douay-Rheims)',
 'Judith 15.3 (Vulgate)',
 'Judith 15.4 (Vulgate)']

In [25]:
PV['Numbers 12.16 (AKJV)']

['Numbers 12.16 (AKJV)',
 'Numbers 13.1 (Geneva)',
 'Numbers 13.1 (Douay-Rheims)',
 'Numbers 13.1 (Wycliffe)',
 'Numbers 13.1 (Vulgate)']

In [26]:
PV['Job 39.30 (AKJV)']

['Job 39.30 (AKJV)',
 'Job 39.33 (Geneva)',
 'Job 39.30 (Douay-Rheims)',
 'Job 39.30 (Vulgate)']

In [10]:
from collections import defaultdict

# Group the split-verse entries of `bible` ("<verse id> - <n>") under their
# verse id. Each list starts with the verse id itself, followed by its part keys.
bible_parts = defaultdict(list)
for key in bible:
    v_id, separator, _ = key.partition(" - ")
    if not separator:
        # Whole-verse entry, not a part.
        continue
    group = bible_parts[v_id]
    if not group:
        group.append(v_id)
    group.append(key)
len(bible_parts)

54498

In [4]:
# Reload PV from PARALLEL_VERSES.json. The file holds a two-element JSON
# array; the first element is bound to PV and the second is discarded.
# NOTE(review): a later cell writes [p_data, PV_dict] to this same path, so
# once that cell has run this rebinds PV to the p_data structure instead of
# the verse lists built above — confirm the intended execution order.
with open(f"../../Bibles/PARALLEL_VERSES.json",'r') as file: 
    PV, _ = json.load(file)

In [7]:
# Books left out of the parallel-verse data (matched against the book name).
apoc = set([
    "Susanna", "Bel and the Dragon", 'Baruch', 'Ecclesiasticus', 'Wisdom',
    '1 Maccabees', '2 Maccabees', '4 Maccabees',
    "1 Esdras", '2 Esdras', '3 Esdras', '4 Esdras',
    'Tobit', 'Judith', 'Manasseh', "Song of the Three Young Men",
])

# (chapter or whole book, version) pairs left out of the parallel-verse data.
exclude = set([
    ('John 18', 'Tyndale'),
    ('Mark 11', 'Tyndale'),
    ('Psalms', 'Douay-Rheims'),
    ('Psalms', 'ODRV'),
])

In [13]:
# Build the two structures saved to PARALLEL_VERSES.json:
#   p_data[verse]   = {'q_id': verse, 'queries': {id: None, ...}, 'pos': [ids]}
#   PV_dict[id]     = {verse: True, ...}   (reverse index of 'pos')
p_data = {}
for v_id, vlist in PV.items():
    key = v_id.split(" (")[0]
    bc = key.split(".")[0]                        # "<book> <chapter>"
    version = v_id.split(" (")[-1].strip(")")
    book = re.findall(r"(.*?) \d+\.\d+",key)[0]
    if book in apoc: continue
    if (bc,version) in exclude or (book,version) in exclude:
        continue

    # 'queries' is a dict used as an insertion-ordered set: for each parallel
    # verse, its parts (if any) followed by the verse id itself.
    queries = {}
    for q_id in vlist:
        for part_id in bible_parts.get(q_id, []):
            queries[part_id] = None
        queries[q_id] = None

    # 'pos' holds the same ids as before (the parallel verses plus all their
    # parts), de-duplicated in first-seen order. list(set(...)) gave a
    # different order on every run, which made the saved JSON non-reproducible.
    p_data[key] = { 'q_id': key,
                    'queries': queries,
                    'pos': list(dict.fromkeys([*vlist, *queries])),
                }

print(len(p_data))
PV_dict = defaultdict(dict)
for master_id, item in p_data.items():
    for child_id in item['pos']:
        PV_dict[child_id][master_id] = True
print(len(PV_dict), PV_dict['1 Kings 1.1 (Vulgate)'])

with open(f"../../Bibles/PARALLEL_VERSES.json",'w+') as file:
    json.dump([p_data,PV_dict], file)

31175
251257 {'1 Samuel 1.1': True}


In [66]:
bible['Acts 2.16 (Vulgate)']

'sed hoc est quod dictum est per prophetam Joël:'

In [67]:
PV['Susanna 1.1 (AKJV)']

['Susanna 1.1 (AKJV)', 'Daniel 13.1 (ODRV)', 'Daniel 13.1 (Vulgate)']

In [68]:
PV['Judges 5.31 (AKJV)']

['Judges 5.31 (AKJV)',
 'Judges 5.31 (Geneva)',
 'Judges 5.31 (Douay-Rheims)',
 'Judges 5.32 (Douay-Rheims)',
 'Judges 5.31 (Vulgate)',
 'Judges 5.32 (Vulgate)']

In [69]:
PV['1 Chronicles 11.22 (AKJV)']

['1 Chronicles 11.22 (AKJV)',
 '1 Chronicles 11.22 (Geneva)',
 '1 Paralipomenon 11.22 (Douay-Rheims)',
 '1 Paralipomenon 11.22 (Vulgate)']

In [70]:
PV['1 Esdras 8.1 (AKJV)']

['1 Esdras 8.1 (AKJV)']

In [71]:
PV['Genesis 2.24 (AKJV)']

['Genesis 2.24 (AKJV)',
 'Genesis 2.24 (Geneva)',
 'Genesis 2.24 (ODRV)',
 'Genesis 2.24 (Wycliffe)',
 'Genesis 2.24 (Vulgate)']

In [72]:
PV['2 Samuel 1.27 (AKJV)']

['2 Samuel 1.27 (AKJV)',
 '2 Kings 1.27 (Douay-Rheims)',
 '2 Kings 1.27 (Vulgate)']

In [73]:
PV['Daniel 4.12 (AKJV)']

['Daniel 4.12 (AKJV)',
 'Daniel 4.9 (Geneva)',
 'Daniel 4.9 (ODRV)',
 'Daniel 4.9 (Vulgate)']

In [37]:
PV['1 Kings 22.53 (AKJV)']

['1 Kings 22.53 (AKJV)',
 '1 Kings 22.53 (Geneva)',
 '3 Kings 22.54 (Douay-Rheims)',
 '3 Kings 22.54 (Vulgate)']

In [38]:
PV['Exodus 39.30 (AKJV)']

['Exodus 39.30 (AKJV)',
 'Exodus 39.30 (Geneva)',
 'Exodus 39.29 (ODRV)',
 'Exodus 39.29 (Wycliffe)',
 'Exodus 39.29 (Vulgate)']

# Proper Nouns 

In [None]:
# Biblical entities 
# Read the TIPNR proper-names table. The parser below compares against a
# non-ASCII dash, so the encoding is given explicitly instead of relying on
# the platform default.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open("../assets/Bibles/TIPNR - Translators Individualised Proper Names with all References - STEPBible.org CC BY.txt", encoding="utf-8") as file:
    data = file.readlines()

# e_to_v: entity name -> list of "Full Book C.V" references.
in_entities_section = False
e_to_v = {}
for idx, line in enumerate(data):
    line = line.strip()
    if not line:
        continue

    # A "$========== " marker line turns parsing on.
    if line.startswith('$========== '):
        in_entities_section = True
        continue

    # Hard-coded window: only line indices 112..14394 are parsed.
    if idx < 112: continue
    if idx > 14394: continue
    if not in_entities_section:
        continue

    # Only rows starting with an en dash and containing "@" are used.
    if line[0] != "–": continue
    if not re.search("@",line): continue

    # Names: the last tab-separated field before the first "@", split on "|",
    # plus the "KJV=" spelling when the row has one.
    name = re.findall("^(.*?)@",line)[0]
    refs = line.split("reference=")[-1].split("\t")[-1]
    names = name.split("\t")[-1].split("|")
    kjv_names = re.findall(r"KJV\s*=\s*(.*?)[);,]",line)
    if kjv_names:
        names.append(kjv_names[0])

    # References are the same for every name on the row, so build them once.
    # Only the first "Abbr.C.V" match of each "; "-separated chunk is used;
    # 'Etc' entries are dropped. Raw string: "\d" in a plain string literal is
    # an invalid escape sequence.
    matches = [re.findall(r"([\d\w]+)\.(\d+\.\d+)",r) for r in refs.split("; ")]
    ref_list = sorted({
        f"{corrected_books[m[0][0]]} {m[0][1]}"
        for m in matches
        if len(m) > 0 and m[0][0] != 'Etc'
    })

    for n in names:
        n = re.sub("_"," ",n)
        e_to_v.setdefault(n, []).extend(ref_list)

In [None]:
# Save the entity-name -> verse-reference lists as a JSON object.
with open(f"../assets/Bibles/proper_nouns.json",'w+') as file: 
    json.dump(e_to_v, file)

# Citations 

In [3]:
import pandas as pd
from tqdm import tqdm  
def clean_text(s):
    """Remove the ``<i>``/``</i>``/``<NOTE>`` tags, the ``NONLATINALPHABET``
    token and ``<digits>^PAGE(S)^MISSING`` markers from ``s``, collapse every
    whitespace run to one space and strip leading/trailing spaces."""
    without_markers = re.sub(r"\<\/i\>|\<NOTE\>|NONLATINALPHABET|\<i\>|\d+\^PAGE[S]*\^MISSING","",s)
    return re.sub(r"\s+"," ",without_markers).strip(" ")
# all_c: citation as printed (column 6) -> standardised citation (column 4),
# collected over every "<era>_<prefix>_citations.csv" file. Each citation
# record also gets a 'text' entry with the cleaned passage it came from,
# looked up in the matching body or margin file.
all_c = {}
c_repo = "../../CITATIONS"
c_files = sorted(k for k in os.listdir(c_repo) if ".csv" in k)


def _read_records(path):
    """Read a header-less CSV into a list of {column index: value} dicts."""
    return pd.read_csv(path,header=None).to_dict(orient='records')


def _attach_text(records, texts):
    """Set record['text'] for every key of ``texts`` that is also in ``records``."""
    for key, t in texts.items():
        if key in records:
            records[key]['text'] = t


for file in tqdm(c_files):
    era, prefix = file.split("_citations.csv")[0].split("_")
    # Citation records keyed by their first three columns.
    data = {(d[0],d[1],d[2]):d for d in _read_records(f"{c_repo}/{file}")}

    # Body text: third key component is the literal "In-Text".
    text = {
        (d[0],d[1],"In-Text"):clean_text(d[6])
        for d in _read_records(f"../../SERMONS_APP/db/data/{era}/{prefix}_body.csv")
    }
    _attach_text(data, text)

    # Marginal notes, when the era folder has a margin file for this prefix.
    if f"{prefix}_margin.csv" in os.listdir(f"../../SERMONS_APP/db/data/{era}"):
        text = {
            (d[0],d[1],"Note " + str(d[2])):clean_text(d[3])
            for d in _read_records(f"../../SERMONS_APP/db/data/{era}/{prefix}_margin.csv")
        }
        _attach_text(data, text)

    all_c.update((d[6], d[4]) for d in data.values())

  text = pd.read_csv(f"../../SERMONS_APP/db/data/{era}/{prefix}_body.csv",header=None).to_dict(orient='records')
  text = pd.read_csv(f"../../SERMONS_APP/db/data/{era}/{prefix}_body.csv",header=None).to_dict(orient='records')
  text = pd.read_csv(f"../../SERMONS_APP/db/data/{era}/{prefix}_body.csv",header=None).to_dict(orient='records')
100%|██████████| 76/76 [03:42<00:00,  2.93s/it]


In [4]:
list(all_c.items())[1:10]

[('Hosea 11. 11.', 'Hosea 11.11'),
 ('Zek. 16.', 'Ezekiel 16'),
 ('Cant. 5.', 'Canticles 5'),
 ('Psalms 18.', 'Psalms 18'),
 ('Psalms 91.', 'Psalms 91'),
 ('John 15. 4:', 'John 15.4'),
 ('Psalms 18. 2.', 'Psalms 18.2'),
 ('Heb. 1.', 'Hebrews 1'),
 ('Isaiah 56.', 'Isaiah 56')]

In [5]:
import sys 
sys.path.append('../')
from lib.EEPS_helper import isNumeral
from collections import Counter

In [6]:
formats = {}

def space_punctuation(text):
    """Separate punctuation from adjacent word characters with single spaces.

    A space is inserted after each of ``. , : ; ! ? ( ) - &`` when a word
    character follows and before it when a word character precedes; whitespace
    runs are then collapsed and the result stripped.
    """
    steps = (
        (r'([\.\,\:\;\!\?\(\)\-\&])(?=\w)', r'\1 '),
        (r'(?<=\w)([\.\,\:\;\!\?\(\)\-\&])', r' \1'),
        (r'\s+', ' '),
    )
    for pattern, replacement in steps:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Count citation "shapes": every token of the punctuation-spaced citation
# becomes 'N' (numeral according to isNumeral), the token itself when it
# contains punctuation, or 'W' (anything else).
# The shape is kept in `shape` / `signature` rather than `format`, which
# shadowed the built-in format() for the rest of the session.
for orig in tqdm(all_c):
    if orig == '6': continue
    tokens = space_punctuation(orig).split(" ")
    shape = []
    for t in tokens:
        if isNumeral(t):
            shape.append('N')
        elif re.search(r"[\.\,\:\;\!\?\(\)\-\&]",t):
            shape.append(t)
        else:
            shape.append('W')
    signature = " ".join(shape)
    formats[signature] = formats.get(signature, 0) + 1

100%|██████████| 304499/304499 [00:14<00:00, 21340.45it/s]


In [7]:
Counter(formats).most_common(n=10)

[('W . N . N .', 77019),
 ('W N . N .', 26893),
 ('W . N . N , N .', 15401),
 ('W . N . N', 12563),
 ('W . N . N . N .', 12353),
 ('N W . N . N .', 11008),
 ('W . N .', 8755),
 ('N . W . N . N .', 6687),
 ('W . N . N .)', 4893),
 ('W . N N .', 4877)]

In [9]:
# For every book (first alphabetic word of the standardised citation,
# lower-cased) count the alphabetic words used in the printed citation, then
# keep the most frequent one as that book's usual abbreviation.
IGNORED_WORDS = ['of','cap','chap','verse','vers','chapter','ch']

all_books = {}
for orig, std in all_c.items():
    std_words = re.findall(r"[A-Za-z]+",std)
    # A standardised citation without any letters names no book. This replaces
    # the hard-coded skip of the single ('6', '4') record, which it also
    # covers, and avoids an IndexError on any similar record.
    if not std_words:
        continue
    book = std_words[0].lower()
    counts = all_books.setdefault(book, Counter())
    for w in re.findall(r"([A-Za-z]+)",orig):
        if isNumeral(w):
            continue
        w = w.lower()
        if w in IGNORED_WORDS:
            continue
        counts[w] += 1

common_abbrev = {}
for book, abbrevs in all_books.items():
    # NOTE(review): "children" is excluded by name; presumably it comes from a
    # multi-word book title — confirm.
    if book == "children": continue
    # A book whose citations contain no countable word has no abbreviation.
    if not abbrevs:
        continue
    common_abbrev[book] = abbrevs.most_common(1)[0][0]

with open(f"../../Bibles/top_abbrev.json",'w+') as file:
    json.dump(common_abbrev, file)
common_abbrev

{'canticles': 'cant',
 'hosea': 'hos',
 'ezekiel': 'ezek',
 'psalms': 'psal',
 'john': 'john',
 'hebrews': 'heb',
 'isaiah': 'isa',
 'romans': 'rom',
 'philippians': 'phil',
 'corinthians': 'cor',
 'matthew': 'mat',
 'jeremiah': 'jer',
 'proverbs': 'prov',
 'acts': 'acts',
 'kings': 'king',
 'verse': 'ver',
 'revelation': 'rev',
 'luke': 'luke',
 'chronicles': 'chron',
 'timothy': 'tim',
 'james': 'jam',
 'peter': 'pet',
 'titus': 'tit',
 'jude': 'iud',
 'ephesians': 'eph',
 'colossians': 'col',
 'galatians': 'gal',
 'deuteronomy': 'deut',
 'samuel': 'sam',
 'job': 'job',
 'genesis': 'gen',
 'micah': 'micah',
 'lamentations': 'lam',
 'leviticus': 'levit',
 'exodus': 'exod',
 'zephaniah': 'zeph',
 'wisdom': 'wisd',
 'daniel': 'dan',
 'judges': 'judg',
 'joshua': 'josh',
 'mark': 'mark',
 'habakkuk': 'hab',
 'thessalonians': 'thes',
 'esther': 'es',
 'ecclesiastes': 'eccles',
 'zechariah': 'zach',
 'ezra': 'ezra',
 'amos': 'amos',
 'malachi': 'mal',
 'numbers': 'numb',
 'haggai': 'hag',
