# Setup

In [1]:
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [2]:
!pip install sentence_transformers
from sentence_transformers import SentenceTransformer, models
from sentence_transformers.util import paraphrase_mining,semantic_search,pytorch_cos_sim

In [None]:
model_name = "emanjavacas/MacBERTh"
word_embedding_model = models.Transformer(model_name, max_seq_length=128)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), "mean")
bi_encoder = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [5]:
from google.colab import drive
drive.mount('/content/drive')

folder = f"/content/drive/MyDrive/DH"
output_folder = f"{folder}/EEPS"
import pandas as pd
import os, json,re
from tqdm import notebook as tqdm
import torch

Mounted at /content/drive


In [6]:
try:
    assert torch.cuda.is_available()
    device = torch.device("cuda")
except:
    device = torch.device("cpu")
print("Using device:", device)

Using device: cuda


In [7]:
bible = {}

b_versions = ['AKJV','ODRV','Geneva', 'Douay-Rheims', 'Tyndale', 'Wycliffe','Vulgate']
for bname in b_versions:
    data = pd.read_csv(f"{folder}/Early-Modern-Sermons/assets/Bibles/{bname}.csv",header=None)
    data = data.to_dict(orient="records")
    for entry in tqdm.tqdm(data):
        key = entry[0]
        v_id = key.split(" (")[0]
        text = entry[6]
        if re.search("Douay-Rheims",key):
            if re.sub("Douay-Rheims","ODRV",key) in bible: continue
        if len(text.split(" ")) < 200:
            bible[key] = f"{v_id} {text}"

        parts = re.split(r'(?<=[\.\?]) (?=[A-Z])|(?<=[\!\:\;])', text)
        parts = [re.sub(r'\s+', ' ', p).strip() for p in parts if len(p.strip(" ")) > 0]
        if (len(parts[0].split(" ")) <= 5 or len(parts[-1].split(" ")) <= 5 or re.search(r"\&\w+\;",parts[0])):
            for pidx, p in enumerate(parts): continue
        elif len(parts) > 1:
            for pidx, p in enumerate(parts):
              p_id = f"{key} - {pidx}"
              if len(p) == 0: continue
              if re.search(r"\&\w+\;",p) or len(p.split(" ")) <= 5: continue
              bible[p_id] = f"Part {pidx+1} of {v_id}: {p}"

bible['NonQP 0.0'] = "No Biblical quotation or paraphrase found"

bible_verses = list(bible.values())
bible_ids = list(bible.keys())
id_to_idx = {v_id:idx for idx, v_id in enumerate(bible_ids)}
len(bible_verses)

  0%|          | 0/36702 [00:00<?, ?it/s]

  0%|          | 0/14737 [00:00<?, ?it/s]

  0%|          | 0/31090 [00:00<?, ?it/s]

  0%|          | 0/35811 [00:00<?, ?it/s]

  0%|          | 0/7954 [00:00<?, ?it/s]

  0%|          | 0/9622 [00:00<?, ?it/s]

  0%|          | 0/35809 [00:00<?, ?it/s]

276628

In [None]:
for idx,v_id in enumerate(bible_ids):
  if re.search("1 Paralipomenon 29\.2 ",v_id):
    # if " - " in v_id: continue
    # if "AKJV" not in v_id and "ODRV" not in v_id: continue
    print(v_id, bible_verses[idx])

1 Paralipomenon 29.2 (Douay-Rheims) 1 Paralipomenon 29.2 And I with all my ability have prepared the expenses for the house of my God. Gold for vessels of gold, and silver for vessels of silver, brass for things of brass, iron for things of iron, wood for things of wood: and onyx stones, and stones like alabaster, and of divers colours, and all manner of precious stones, and marble of Paros in great abundance.
1 Paralipomenon 29.2 (Douay-Rheims) - 0 Part 1 of 1 Paralipomenon 29.2: And I with all my ability have prepared the expenses for the house of my God.
1 Paralipomenon 29.2 (Douay-Rheims) - 1 Part 2 of 1 Paralipomenon 29.2: Gold for vessels of gold, and silver for vessels of silver, brass for things of brass, iron for things of iron, wood for things of wood:
1 Paralipomenon 29.2 (Douay-Rheims) - 2 Part 3 of 1 Paralipomenon 29.2: and onyx stones, and stones like alabaster, and of divers colours, and all manner of precious stones, and marble of Paros in great abundance.
1 Paralipomen

In [8]:
# '''Vectorize for the first time'''
# bible_vectors = bi_encoder.encode(bible_verses, batch_size=1024, convert_to_tensor=True, show_progress_bar=True)
# torch.save(bible_vectors, f'{output_folder}/Bibles_{model_name}.pt')

# Load saved Bible vectors
bible_vectors = torch.load(f"{folder}/EEPS/Bibles_MacBERTh.pt",map_location=device)

bible_vectors.shape, len(bible_verses), len(bible_ids)

(torch.Size([276628, 768]), 276628, 276628)

In [None]:
from google.colab import runtime
runtime.unassign()

In [9]:
def fix_name(v_id):
  if "1 Kings" in v_id:
    v_id = "3 Kings" + v_id.split("1 Kings")[-1]
  elif "2 Kings" in v_id:
    v_id = "4 Kings" + v_id.split("2 Kings")[-1]
  elif "1 Kings" in v_id:
    v_id = "1 Samuel" + v_id.split("1 Kings")[-1]
  elif "2 Kings" in v_id:
    v_id = "2 Samuel" + v_id.split("2 Kings")[-1]
  elif re.search(r"^\d+ Chronicles",v_id):
    v_id = re.sub(r"Chronicles","Paralipomenon",v_id)
  return v_id

# ESV Parallel Text - Predicted Negatives in the Sample Dataset

In [10]:
version_vectors = {b:[] for b in b_versions}
for b in b_versions:
  b_ids = {idx:None for idx, v_id in enumerate(bible_ids) if b in v_id and " - " not in v_id}
  version_vectors[b] = ([vec for idx, vec in enumerate(bible_vectors) if idx in b_ids], [text for idx, text in enumerate(bible_verses) if idx in b_ids])
  print(b, len(version_vectors[b][1]))

AKJV 36700
ODRV 14730
Geneva 31090
Douay-Rheims 21103
Tyndale 7954
Wycliffe 9622
Vulgate 35808


In [11]:
def fix_name_revert(v_id):
  if "3 Kings" in v_id:
    v_id = "1 Kings" + v_id.split("3 Kings")[-1]
  elif "4 Kings" in v_id:
    v_id = "2 Kings" + v_id.split("4 Kings")[-1]
  elif "1 Kings" in v_id:
    v_id = "1 Samuel" + v_id.split("1 Kings")[-1]
  elif "2 Kings" in v_id:
    v_id = "2 Samuel" + v_id.split("2 Kings")[-1]
  elif re.search(r"^\d+ Paralipomenon",v_id):
    v_id = re.sub(r"Paralipomenon","Chronicles",v_id)
  return v_id

In [None]:
data = pd.read_csv(f"{folder}/Early-Modern-Sermons/assets/QP_Datasets/parallel_predictions_SAMPLE - FALSE.csv").to_dict(orient='records')
output = pd.read_csv(f"{folder}/Early-Modern-Sermons/assets/QP_Datasets/parallel_predictions_SAMPLE - FALSE.csv").to_dict(orient='records')
data.extend(pd.read_csv(f"{folder}/Early-Modern-Sermons/assets/QP_Datasets/parallel_predictions_SAMPLE - TRUE.csv").to_dict(orient='records'))

all_data = {}
for item in data:
  if item['index'] not in all_data:
    v_id = re.findall(r"^[\w\s]+\d+.\d+",item['text'])[0]
    all_data[item['index']] = {'verse_id': v_id,
                                'question': item['text'],
                                 'answers':[], # verse ids with version
                                 'passages':{}, # high similarity and equivalent numberings
                                 'versions':{} # indices corresponding to passages
                                 }
  entry = all_data[item['index']]
  if item['verse_text'] in entry['passages']: continue
  entry['passages'][item['verse_text']] = None
  if item['label'] is True or item['prediction'] is True:

    entry['versions'][item['version']] = True

    v_id = re.findall(r"^[\w\s]+\d+.\d+",item['verse_text'])[0] + f" ({item['version']})"
    if item['version'] in ['Vulgate','Douay-Rheims']:
      v_id = fix_name(v_id)
    entry['answers'].append(v_id)

  if item['version'] in entry['versions']:
    if entry['versions'][item['version']] is True:
      continue
  else:
    entry['versions'][item['version']] = item['label']

In [None]:
sample_vectors = torch.load(f'{folder}/EEPS/parallel_predictions_SAMPLE_MacBERTh.pt',map_location=device)
sample_vectors.shape

torch.Size([2500, 768])

In [None]:
new_output = []
for idx, entry in tqdm.tqdm(all_data.items()):
  targets = [k for k,v in entry['versions'].items() if v is not True]
  if len(targets) == 0: continue
  b, c,v = re.findall("^(.*?) (\d+)\.(\d+)",entry['question'])[0]
  c,v = int(c),int(v)
  for version in targets:
    hits = semantic_search([sample_vectors[idx]],version_vectors[version][0],query_chunk_size=1000,top_k=300)
    for hitlist in hits:
      for hit in hitlist:
        verse_text = version_vectors[version][1][hit['corpus_id']]
        b_hit, c_hit,v_hit = re.findall("^(.*?) (\d+)\.(\d+)",verse_text)[0]
        if version in ['Vulgate','Douay-Rheims']:
          b_hit = fix_name_revert(b_hit)
        c_hit,v_hit = int(c_hit), int(v_hit)
        if b_hit != b: continue
        if c != c_hit: continue
        if v == v_hit: continue
        if abs(v_hit-v) >= 4: continue
        new_output.append({'index':idx,'version':version,'text':entry['question'],'verse_text':verse_text,'score':hit['score'],'label':''})
pd.DataFrame(new_output)

  0%|          | 0/2500 [00:00<?, ?it/s]

Unnamed: 0,index,version,text,verse_text,score,label
0,229,Vulgate,"Joshua 7.4 So about 3,000 men went up there fr...",Joshua 7.1 Filii autem Israël prævaricati sunt...,0.920012,
1,235,Douay-Rheims,Joshua 12.3 and the Arabah to the Sea of Chinn...,"Joshua 12.2 Sehon king of the Amorrhites, who ...",0.970134,
2,235,Douay-Rheims,Joshua 12.3 and the Arabah to the Sea of Chinn...,"Joshua 12.1 These are the kings, whom the chil...",0.960393,
3,235,Douay-Rheims,Joshua 12.3 and the Arabah to the Sea of Chinn...,"Joshua 12.5 Of Gessuri and Machati, and of hal...",0.955323,
4,235,Douay-Rheims,Joshua 12.3 and the Arabah to the Sea of Chinn...,Joshua 12.4 The border of Og the king of Basan...,0.954014,
...,...,...,...,...,...,...
719,2460,Tyndale,Hebrews 7.20 And it was not without an oath. F...,Hebrews 7.23 And amonge them many were made pr...,0.932754,
720,2460,Tyndale,Hebrews 7.20 And it was not without an oath. F...,Hebrews 7.21 Those prestes were made with out ...,0.929509,
721,2460,Tyndale,Hebrews 7.20 And it was not without an oath. F...,Hebrews 7.22 And for that cause was Iesus a st...,0.917938,
722,2460,Tyndale,Hebrews 7.20 And it was not without an oath. F...,Hebrews 7.18 Then the commaundment that went a...,0.916955,


In [None]:
new_df = pd.DataFrame(new_output)
new_df.to_csv(f"{folder}/Early-Modern-Sermons/assets/QP_Datasets/parallel_predictions_SAMPLE - NEW.csv",index=False)

# Get All ESV Parallel Text

In [39]:
with open(f"{folder}/Early-Modern-Sermons/assets/Bibles/ESV.json",'r') as file:
  ESV = json.load(file)

fps = [f"{folder}/Early-Modern-Sermons/assets/QP_Datasets/parallel_predictions_SAMPLE - ALL.csv"]
seen = {}
for fp in fps:
  data = pd.read_csv(fp).to_dict(orient='records')
  for qp_pair in tqdm.tqdm(data):
    v_id = re.findall("([\w\s]+ \d+\.\d+)",qp_pair['text'])
    if len(v_id) == 0:
      continue
    seen[v_id[0]] = None
ESV_ids = list(set(ESV.keys()) - set(seen.keys()))
ESV_verses = [ESV[k] for k in ESV_ids]

len(ESV_ids),len(ESV_verses)

  0%|          | 0/13831 [00:00<?, ?it/s]

(28607, 28607)

In [40]:
full_ids = [] # idx to orig idx
full_vectors = []
idx = 0
for orig_idx, v_id in enumerate(bible_ids):
  if " - " not in v_id: # a full verse
    if re.search("AKJV|Geneva",v_id): continue
    full_ids.append(v_id)
    full_vectors.append(bible_vectors[orig_idx])
    idx += 1
len(full_ids),len(full_vectors)

(89218, 89218)

In [41]:
ESV_vectors = bi_encoder.encode(ESV_verses, batch_size=1024, convert_to_tensor=True, show_progress_bar=True)

Batches:   0%|          | 0/28 [00:00<?, ?it/s]

In [42]:
v_to_idx = {v:idx for idx, v in enumerate(bible_verses)}

In [43]:
psalms_mapping = {
    "Psalms 9": "Psalms 9",
    "Psalms 10": "Psalms 9",
}

for eng, vul in zip(range(11, 114), range(10, 113)):
    psalms_mapping[f"Psalms {eng}"] = f"Psalms {vul}"

psalms_mapping["Psalms 114"] = "Psalms 113"
psalms_mapping["Psalms 115"] = "Psalms 113"

for verse in range(1, 10):
    psalms_mapping[f"Psalms 116.{verse}"] = "Psalms 114"

for verse in range(10, 20):
    psalms_mapping[f"Psalms 116.{verse}"] = "Psalms 115"

for eng, vul in zip(range(117, 147), range(116, 146)):
    psalms_mapping[f"Psalms {eng}"] = f"Psalms {vul}"

for verse in range(1, 12):
    psalms_mapping[f"Psalms 147.{verse}"] = "Psalms 146"
for verse in range(12, 21):
    psalms_mapping[f"Psalms 147.{verse}"] = "Psalms 147"


In [45]:
hits = semantic_search(ESV_vectors,full_vectors,query_chunk_size=1000,top_k=100)

In [53]:
output = []
progress = tqdm.tqdm(hits)
for idx, hitlist in enumerate(progress):
  v_id = ESV_ids[idx]
  v_book = re.sub(" \d+.\d+",'',v_id)
  v_chap = int(re.findall("(\d+).",v_id)[0])
  v_verse = int(v_id.split(".")[-1])
  seen = []

  for hit in hitlist:
    key = full_ids[hit['corpus_id']]
    version = re.findall(r"\((.*?)\)",key)
    if len(version) == 0:
      continue
    version = version[0]

    if v_id == key.split(" (")[0]:
      if version not in ['ODRV','Douay-Rheims','Vulgate']:
        seen.append(re.findall(r"\((.*?)\)",key)[0])
    # must be from the same B
    key_book = re.sub(" \d+.\d+ \(.*?\)",'',key)
    key_chap = int(re.findall("(\d+).",key)[0])
    key_verse = int(key.split(" (")[0].split(".")[-1])
    label = ''


    if f"{v_book} {v_chap}" in psalms_mapping and version in ['ODRV','Douay-Rheims','Vulgate']:
      new_key = psalms_mapping[f"{v_book} {v_chap}"].split(".")[0]
      if new_key != f"{key_book} {key_chap}":
        continue
      if v_chap not in [147,116,9,10,114,115]:
        if key_chap == v_chap and key_verse == v_verse:
          label = True
        else:
          continue
      if version != 'Vulgate':
        if f"{new_key}.{key_verse} (ODRV)" in bible:
          new_key = f"{new_key}.{key_verse} (ODRV)"
          seen.append("ODRV")
        else:
          new_key = f"{new_key}.{key_verse} (Douay-Rheims)"
          seen.append("Douay-Rheims")
      else:
        new_key += f".{key_verse} (Vulgate)"
        seen.append('Vulgate')

      if new_key in bible:
        seen.append(version)
        score = pytorch_cos_sim(ESV_vectors[idx], bible_vectors[v_to_idx[bible[new_key]]])
        score = round(float(score),3)
        output.append({'index':idx,'version':'Vulgate','text':ESV_verses[idx],'verse_text':bible[new_key],'score':score})

    else: # identical numbering
      if key_book != v_book: continue
      if key_chap != v_chap: continue
      if key_verse != v_verse: continue

    # add to outputs
    score = hit['score']
    score = round(score,3)
    verse_text = bible[key]
    output.append({'index':idx,'version':version,'text':ESV_verses[idx],'verse_text':verse_text,'score':score})

  for b_ver in ['Vulgate','Douay-Rheims','ODRV','Tyndale','Wycliffe','Geneva','AKJV']:
    if b_ver in seen: continue
    label = ''
    if f"{v_book} {v_chap}" in psalms_mapping and b_ver in ['Vulgate','Douay-Rheims','ODRV']:
        label = False
        new_key = psalms_mapping[f"{v_book} {v_chap}"] + f".{v_verse} ({b_ver})"
        if new_key in bible:
          score = pytorch_cos_sim(ESV_vectors[idx], bible_vectors[v_to_idx[bible[new_key]]])
          score = round(float(score),3)
          output.append({'index':idx,'version':b_ver,'text':ESV_verses[idx],'verse_text':bible[new_key],'score':score})
    new_key = f"{v_id} ({b_ver})"
    if new_key in bible:
      score = pytorch_cos_sim(ESV_vectors[idx], bible_vectors[v_to_idx[bible[new_key]]])
      score = round(float(score),3)
      output.append({'index':idx,'version':b_ver,'text':ESV_verses[idx],'verse_text':bible[new_key],'score':score})
output = pd.DataFrame(output)
output = output.drop_duplicates()
output

  0%|          | 0/28607 [00:00<?, ?it/s]

Unnamed: 0,index,version,text,verse_text,score
0,0,Douay-Rheims,"And they fell on their faces and said, ""O God,...","Numbers 16.22 They fell flat on their face, an...",0.960
1,0,Vulgate,"And they fell on their faces and said, ""O God,...","Numbers 16.22 Qui ceciderunt proni in faciem, ...",0.882
3,0,Wycliffe,"And they fell on their faces and said, ""O God,...","Numbers 16.22 Whiche felden lowe on the face, ...",0.878
4,0,Geneva,"And they fell on their faces and said, ""O God,...",Numbers 16.22 And they fell vpon their faces a...,0.949
5,0,AKJV,"And they fell on their faces and said, ""O God,...","Numbers 16.22 And they fell vpon their faces, ...",0.972
...,...,...,...,...,...
149143,28606,Douay-Rheims,And those to camp next to him shall be the tri...,Numbers 2.27 Beside him they of the tribe of A...,0.969
149144,28606,Vulgate,And those to camp next to him shall be the tri...,Numbers 2.27 Juxta eum fixere tentoria de trib...,0.939
149146,28606,Wycliffe,And those to camp next to him shall be the tri...,Numbers 2.27 Men of the lynage of Aser settide...,0.944
149147,28606,Geneva,And those to camp next to him shall be the tri...,Numbers 2.27 And by him shall the tribe of Ash...,0.966




# ESV Parallel Text

In [13]:
with open(f"{folder}/Early-Modern-Sermons/assets/Bibles/ESV.json",'r') as file:
  ESV = json.load(file)
len(ESV)

31105

In [None]:
ESV_ids = list(ESV.keys())
ESV_verses = [ESV_ids[idx] + " " + v for idx, v in enumerate(ESV.values())]
# ESV_vectors = bi_encoder.encode(ESV_verses, batch_size=1024, convert_to_tensor=True, show_progress_bar=True)
commonDiff_books = ['Esther', 'Daniel', 'Jeremiah','Ecclesiasticus','Judith']
Group0 = [i for i in ESV_ids if 'Psalms' in i]
Group1 = [i for i in ESV_ids if re.search("|".join(commonDiff_books),i)]
seen_dict = {i:None for i in Group1}
seen_dict.update({i:None for i in Group0})
Group2 = [i for i in ESV_ids if i not in seen_dict]
import random
sample0 = {s:None for s in random.sample(Group0, 1000)}
sample1 = {s:None for s in random.sample(Group1, 500)}
sample2 = {s:None for s in random.sample(Group2, 1000)}

In [None]:
sample_ids, sample_verses = [],[]
for idx, v_id in enumerate(ESV_ids):
  if v_id in sample1 or v_id in sample2 or v_id in sample0:
    sample_ids.append(v_id)
    sample_verses.append(ESV_verses[idx])
sample_vectors = bi_encoder.encode(sample_verses, batch_size=1024, convert_to_tensor=True, show_progress_bar=True)

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
parallel = pd.read_csv(f"{folder}/Early-Modern-Sermons/assets/QP_Datasets/parallel_predictions_SAMPLE - TRUE.csv").to_dict(orient='records')
parallel.extend(pd.read_csv(f"{folder}/Early-Modern-Sermons/assets/QP_Datasets/parallel_predictions_SAMPLE - FALSE.csv").to_dict(orient='records'))

parallel = {x['index']:x['text'] for x in parallel}
parallel = {k:parallel[k] for k in sorted(parallel)}
sample_vectors = bi_encoder.encode(list(parallel.values()), batch_size=1024, convert_to_tensor=True, show_progress_bar=True)
torch.save(sample_vectors, f'{output_folder}/parallel_predictions_SAMPLE_{model_name}.pt')

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
ESV_verses, ESV_ids = sample_verses, sample_ids
hits = semantic_search(sample_vectors,full_vectors,query_chunk_size=1000,top_k=200)

# Base Bi-Encoder Comparison

- "all-mpnet-base-v2"
- "all-MiniLM-L12-v2"
- "paraphrase-multilingual-mpnet-base-v2"
- "paraphrase-multilingual-MiniLM-L12-v2"
- "emanjavacas/MacBERTh"

Conclusion: use "emanjavacas/MacBERTh"


In [None]:
ptexts = ["The bond-woman and her sonne were cast out because he should not inherit with the free woman's son"]

# only Galatians 4 & Genesis 21
galgen_ids = []
galgen_verses = []
for idx, verse_id_list in enumerate(bible_ids): # partial bible verses
  hits = [v_id for v_id in verse_id_list if re.search("Galatians 4.|Genesis 21.",v_id)]
  if len(hits) > 0:
    galgen_ids.append(idx)
    galgen_verses.append(bible_verses[idx])

def try_model(model_name):
  if "MacBERTh" in model_name:
    word_embedding_model = models.Transformer(model_name, max_seq_length=128)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), "mean")
    encoder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
  else:
    encoder = SentenceTransformer(model_name)

  galgen_vectors = encoder.encode(galgen_verses, batch_size=1024, convert_to_tensor=True, show_progress_bar=True)
  p_embedding = encoder.encode(ptexts, batch_size=1024, convert_to_tensor=True,show_progress_bar=True)
  hits = semantic_search(p_embedding,galgen_vectors,query_chunk_size=512,top_k=5)
  print(model_name)
  for pidx, hitlist in enumerate(hits):
    for hit in hitlist:
        bid = hit['corpus_id']
        score = hit['score']
        print(round(score,3), bible_ids[galgen_ids[bid]], galgen_verses[bid])

In [None]:
try_model("paraphrase-multilingual-MiniLM-L12-v2")

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.89k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

paraphrase-multilingual-MiniLM-L12-v2
0.733 ['Galatians 4.30 (ODRV) - 1'] Cast out the bond-woman and her sonne.
0.714 ['Galatians 4.30 (ODRV) - 2'] For the sonne of the bond-woman shal not be heire with the sonne of the free-woman.
0.707 ['Galatians 4.30 (AKJV) - 2'] for the son of the bondwoman shall not bee heire with the son of the freewoman.
0.697 ['Galatians 4.30 (AKJV) - 1'] Cast out the bondwoman and her sonne:
0.658 ['Galatians 4.30 (ODRV)'] But what saith the Scripture? Cast out the bond-woman and her sonne. For the sonne of the bond-woman shal not be heire with the sonne of the free-woman.


In [None]:
try_model("all-MiniLM-L12-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

all-MiniLM-L12-v2
0.764 ['Galatians 4.30 (ODRV)'] But what saith the Scripture? Cast out the bond-woman and her sonne. For the sonne of the bond-woman shal not be heire with the sonne of the free-woman.
0.759 ['Galatians 4.30 (AKJV) - 2'] for the son of the bondwoman shall not bee heire with the son of the freewoman.
0.754 ['Galatians 4.30 (ODRV) - 1'] Cast out the bond-woman and her sonne.
0.752 ['Galatians 4.30 (ODRV) - 2'] For the sonne of the bond-woman shal not be heire with the sonne of the free-woman.
0.743 ['Galatians 4.30 (AKJV) - 1'] Cast out the bondwoman and her sonne:


In [None]:
try_model("all-mpnet-base-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

all-mpnet-base-v2
0.804 ['Galatians 4.30 (AKJV) - 1'] Cast out the bondwoman and her sonne:
0.763 ['Galatians 4.30 (AKJV) - 2'] for the son of the bondwoman shall not bee heire with the son of the freewoman.
0.753 ['Galatians 4.30 (ODRV) - 1'] Cast out the bond-woman and her sonne.
0.75 ['Galatians 4.30 (ODRV) - 2'] For the sonne of the bond-woman shal not be heire with the sonne of the free-woman.
0.744 ['Galatians 4.30 (ODRV)'] But what saith the Scripture? Cast out the bond-woman and her sonne. For the sonne of the bond-woman shal not be heire with the sonne of the free-woman.


In [None]:
try_model("paraphrase-multilingual-mpnet-base-v2")

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

paraphrase-multilingual-mpnet-base-v2
0.875 ['Galatians 4.30 (ODRV) - 1'] Cast out the bond-woman and her sonne.
0.832 ['Galatians 4.30 (AKJV) - 1'] Cast out the bondwoman and her sonne:
0.76 ['Galatians 4.30 (AKJV) - 2'] for the son of the bondwoman shall not bee heire with the son of the freewoman.
0.707 ['Galatians 4.30 (ODRV) - 2'] For the sonne of the bond-woman shal not be heire with the sonne of the free-woman.
0.7 ['Galatians 4.30 (ODRV)'] But what saith the Scripture? Cast out the bond-woman and her sonne. For the sonne of the bond-woman shal not be heire with the sonne of the free-woman.


In [None]:
try_model("emanjavacas/MacBERTh")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

emanjavacas/MacBERTh
0.96 ['Galatians 4.30 (ODRV)'] But what saith the Scripture? Cast out the bond-woman and her sonne. For the sonne of the bond-woman shal not be heire with the sonne of the free-woman.
0.953 ['Galatians 4.30 (AKJV) - 2'] for the son of the bondwoman shall not bee heire with the son of the freewoman.
0.948 ['Galatians 4.30 (ODRV) - 2'] For the sonne of the bond-woman shal not be heire with the sonne of the free-woman.
0.948 ['Galatians 4.30 (Geneva)'] But what sayth the Scripture? Put out the seruant and her sonne: for the sonne of the seruant shall not be heire with the sonne of the free woman.
0.945 ['Galatians 4.30 (Tyndale)'] Neverthelesse what sayth the scripture: put awaye the bonde woman and her sonne. For the sonne of the bonde woman shall not be heyre with the sonne of the fre woman.


# QP Set-up
Must remove missing page indicators from tokens   

In [None]:
QPdir = f"{folder}/segments"
outputdir = f"{output_folder}/qp"

def clean_text(text):
  text = re.sub(r"\d+\^PAGE[S]*\^MISSING"," ",text)
  text = re.sub(r'\s+', ' ', text).strip()
  return text

def get_batches(df_dict,batch_size=40000):
  batches = []
  idx_to_p = {}
  idx = 0
  for i in tqdm(range(0, len(df_dict), batch_size)):
    batch = []
    for item in df_dict[i: i + batch_size]:
      text = clean_text(item)
      idx_to_p[idx] = text
      batch.append((idx,text))
      idx += 1
    batches.append(batch)

  return batches, idx_to_p

In [None]:
outfname = f"CC_QP" # segments close to citations

with open(f"{folder}/segments/all_relevant.json","r") as file:
  corpus = json.load(file)

len(corpus)

683634

In [None]:
outfname = f"M_QP" # segments close to citations

with open(f"{folder}/segments/all_unique_marginalia.json","r") as file:
  corpus = json.load(file)
corpus = {k:v for k,v in corpus.items() if len(k.split(" "))>3}
len(corpus)

249056

In [None]:
batches, idx_to_passage = get_batches(list(corpus.keys()))
batches[0][0] # first item of the first batch

In [None]:
for bidx, batch in enumerate(batches):
  outputs = []

  p_embedding = bi_encoder.encode([b[1] for b in batch], batch_size=1024, convert_to_tensor=True,show_progress_bar=True)
  p_embedding = p_embedding.cuda()
  hits = semantic_search(p_embedding,bible_vectors,query_chunk_size=1000,top_k=1)
  progress = tqdm(hits)

  for pidx, hitlist in enumerate(progress):
    idx, passage = batch[pidx]
    max_score = max([h['score'] for h in hitlist])

    # print('\n',batch[pidx][1])
    for hit in hitlist:
      key = bible_ids[hit['corpus_id']]
      s = hit['score']
      verse_text = bible_verses[hit['corpus_id']]
      # print('\t',s,key,verse_text)
      outputs.append({'verse_id':"; ".join(key), 'score':s, 'text': passage, 'verse_text':verse_text})

  output_df = pd.DataFrame(outputs)
  output_df.to_csv(f"{outputdir}/{outfname}_{bidx}.csv") # save intermediary results to avoid losing work
  print(len(outputs),"entries for this batch")
output_df

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

100%|██████████| 40000/40000 [00:00<00:00, 400533.24it/s]


40000 entries for this batch


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

100%|██████████| 40000/40000 [00:00<00:00, 337152.54it/s]


40000 entries for this batch


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

100%|██████████| 40000/40000 [00:00<00:00, 421361.29it/s]


40000 entries for this batch


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

100%|██████████| 40000/40000 [00:00<00:00, 408014.16it/s]


40000 entries for this batch


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

100%|██████████| 40000/40000 [00:00<00:00, 55345.76it/s]


40000 entries for this batch


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

100%|██████████| 40000/40000 [00:00<00:00, 237397.53it/s]


40000 entries for this batch


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

100%|██████████| 9056/9056 [00:00<00:00, 402036.63it/s]

9056 entries for this batch





Unnamed: 0,verse_id,score,text,verse_text
0,1 Chronicles 24.29 (Geneva) - 0,0.900274,2 Sam. 10. 11.,Of Kish.
1,1 Chronicles 24.21 (Geneva) - 0,0.899044,2 Chro 13 8.,Of Rehabiah.
2,Joshua 12.24 (Geneva) - 0,0.905104,1 Chronicles 12. 1. v. 8.,"The King of Tirzah, one."
3,Daniel 11.35 (AKJV) - 1,0.928344,"English Version, an appointed time to Man upon...",because it is yet for a time appointed.
4,Ecclesiastes 7.3 (Vulgate),0.940999,Magnum Exemplum nisi mala fortuna non invenit....,Melius est ire ad domum luctus quam ad domum c...
...,...,...,...,...
9051,2 Samuel 12.19 (Douay-Rheims) - 4,0.909254,1 Cor 16. 2.,He is dead.
9052,1 Chronicles 24.29 (Geneva) - 0,0.910404,1 The. 5. 22.,Of Kish.
9053,1 Chronicles 24.29 (Geneva) - 0,0.923213,De• … tO. 13. 8.,Of Kish.
9054,1 Chronicles 24.29 (Geneva) - 0,0.922141,L• … v. 24. 11.,Of Kish.


# Additional positives and negatives

In [None]:
outputfname = f"preE_additional"
input = pd.read_csv(f"{folder}/QP/preE_qp.csv").to_dict(orient="records")
input.extend(pd.read_csv(f"{folder}/QP/preE_Vul.csv").to_dict(orient="records"))

# outputfname = f"GOLDEN_SET_additional"
# input = pd.read_csv(f"{folder}/QP/GOLDEN_SET.csv").to_dict(orient="records")
len(input), input[0]

(5157,
 {'Unnamed: 0': 0,
  'verse_id': '1 Chronicles 1.34 (AKJV)',
  'text': 'Abraham begatte Isaac. Isaac begat Iacob.',
  'verse_text': 'And Abraham begate Isaac. The sonnes of Isaac: Esau, and Israel.',
  'label': True})

In [None]:
corpus = {}
for idx, entry in enumerate(input):
  key = entry['text'].strip()
  if key not in corpus:
    corpus[key] = {"positives":{},"negatives":{}}
  if entry['label'] is True:
    corpus[key]['positives'][entry['verse_text']] = entry['verse_id']
  elif entry['label'] is False:
    corpus[key]['negatives'][entry['verse_text']] = entry['verse_id']
  else:
    print(idx, entry)


In [None]:
batches, idx_to_passage = get_batches(list(corpus.keys()))
batches[0][0] # first item of the first batch

100%|██████████| 1/1 [00:00<00:00, 33.69it/s]


(0, 'Abraham begatte Isaac. Isaac begat Iacob.')

In [None]:
for bidx, batch in enumerate(batches):
  outputs = []

  p_embedding = bi_encoder.encode([b[1] for b in batch], batch_size=1024, convert_to_tensor=True,show_progress_bar=True)
  p_embedding = p_embedding.cuda()
  hits = semantic_search(p_embedding,bible_vectors,query_chunk_size=1000)
  progress = tqdm(hits)

  for pidx, hitlist in enumerate(progress):
    idx, passage = batch[pidx]
    max_score = max([h['score'] for h in hitlist])

    # print('\n',batch[pidx][1])
    for hit in hitlist:
      key = bible_ids[hit['corpus_id']]
      s = hit['score']
      verse_text = bible_verses[hit['corpus_id']]
      if verse_text in corpus[passage]['positives'] or verse_text in corpus[passage]['negatives']:
        continue
      outputs.append({'verse_id':"; ".join(key), 'text':passage,'verse_text':verse_text, 'label':round(s,3)})

  output_df = pd.DataFrame(outputs)
  output_df.to_csv(f"{outputdir}/{outputfname}_{bidx}.csv") # save intermediary results to avoid losing work
  print(len(outputs),"entries for this batch")
output_df

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 4591/4591 [00:00<00:00, 45192.66it/s]


43190 entries for this batch


Unnamed: 0,verse_id,text,verse_text,label
0,Matthew 1.2 (Tyndale) - 1,Abraham begatte Isaac. Isaac begat Iacob.,Isaac begat Iacob:,0.954
1,Genesis 25.19 (AKJV) - 1; Matthew 1.2 (Geneva)...,Abraham begatte Isaac. Isaac begat Iacob.,Abraham begate Isaac.,0.952
2,Matthew 1.2 (ODRV) - 0,Abraham begatte Isaac. Isaac begat Iacob.,"Abraham begat Isaac, And Isaac begat Iacob.",0.948
3,Genesis 25.19 (ODRV) - 1; Matthew 1.2 (Tyndale...,Abraham begatte Isaac. Isaac begat Iacob.,Abraham begat Isaac:,0.942
4,1 Chronicles 1.34 (AKJV) - 0,Abraham begatte Isaac. Isaac begat Iacob.,And Abraham begate Isaac.,0.939
...,...,...,...,...
43185,Numbers 19.18 (Vulgate),Quasi pannus menstruatae omnes iustitiae nostr...,"in quibus cum homo mundus tinxerit hyssopum, a...",0.927
43186,Ecclesiasticus 35.3 (Vulgate),Quasi pannus menstruatae omnes iustitiae nostr...,Et propitiationem litare sacrificii super inju...,0.926
43187,2 Corinthians 10.10 (AKJV),Quasi pannus menstruatae omnes iustitiae nostr...,For his letters (say they) are waighty and pow...,0.926
43188,Isaiah 27.9 (Vulgate) - 2,Quasi pannus menstruatae omnes iustitiae nostr...,"ut auferatur peccatum ejus, cum posuerit omnes...",0.925


# Base Model Cross References

In [None]:
cross_refs = {}
batches, idx_to_passage = get_batches(bible_verses)
batches[0][12] # first item of the first batch

100%|██████████| 9/9 [00:02<00:00,  3.31it/s]


(12, 'and the euening and the morning were the first day.')

In [None]:
for bidx, batch in enumerate(batches):

  p_embedding = bi_encoder.encode([b[1] for b in batch], batch_size=1024, convert_to_tensor=True,show_progress_bar=True)
  p_embedding = p_embedding.cuda()
  hits = semantic_search(p_embedding,bible_vectors,query_chunk_size=1000,top_k = 6)
  progress = tqdm(hits)

  for pidx, hitlist in enumerate(progress):
    idx, passage = batch[pidx]
    max_score = max([h['score'] for h in hitlist])

    for hit in hitlist:
      key = bible_ids[hit['corpus_id']]
      s = hit['score']
      verse_text = bible_verses[hit['corpus_id']]
      if passage not in cross_refs:
        cross_refs[passage] = {}
      cross_refs[passage][verse_text] = s

  with open(f"{folder}/EEPS/qp/cross_refs_{model_name}.json",'w+') as file:
    json.dump(cross_refs,file)

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

100%|██████████| 40000/40000 [00:00<00:00, 153000.43it/s]


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

100%|██████████| 40000/40000 [00:00<00:00, 86243.37it/s]


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

100%|██████████| 40000/40000 [00:00<00:00, 136922.33it/s]


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

100%|██████████| 40000/40000 [00:00<00:00, 50791.64it/s]


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

100%|██████████| 40000/40000 [00:00<00:00, 130500.96it/s]


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

100%|██████████| 40000/40000 [00:00<00:00, 129061.96it/s]


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

100%|██████████| 40000/40000 [00:00<00:00, 131946.04it/s]


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

100%|██████████| 40000/40000 [00:00<00:00, 127443.35it/s]


Batches:   0%|          | 0/8 [00:00<?, ?it/s]

100%|██████████| 7969/7969 [00:00<00:00, 149709.35it/s]


In [None]:
import random

sample_verse = random.choice(list(cross_refs.keys()))
sampled = cross_refs[sample_verse]

print(sample_verse)
for match_verse, score in sampled.items():
  print(score, match_verse)

He hath hedged in my path round about, and I cannot pass, and in my way he hath set darkness.
0.9999997615814209 He hath hedged in my path round about, and I cannot pass, and in my way he hath set darkness.
0.9719390273094177 Hee hath hedged vp my way that I cannot passe, and he hath set darkenesse in my paths.
0.959246814250946 He hath hedged me about, that I cannot get out:
0.9572896957397461 Hee hath fenced vp my way that I cannot passe; and hee hath set darkenesse in my pathes.
0.9522112607955933 Wherefore behold I will hedge up thy way with thorns, and I will stop it up with a wall, and she shall not find her paths.
0.9483442306518555 Darkness compasseth me about, and the walls cover me, and no man seeth me:


In [None]:
missingCR = pd.read_csv(f"{folder}/QP/cross_refs_missing.csv").to_dict(orient="records")
print(len(missingCR))

missing = {}
for entry in missingCR:
  key = entry['text']
  value = entry['verse_text']
  if key not in missing:
    missing[key] = []
  if value not in missing:
    missing[value] = []

print(len(missing))
batches, idx_to_passage = get_batches(list(missing.keys()))
batches[0][0] # first item of the first batch

372250
152522


100%|██████████| 4/4 [00:01<00:00,  2.26it/s]


(0, 'In the beginning God created the Heauen, and the Earth.')

In [None]:
passage_to_idx = {v:k for k,v in idx_to_passage.items()}
seen = {}
targets = {}
for entry in missingCR:
  t1 = passage_to_idx[entry['text']]
  t2 = passage_to_idx[entry['verse_text']]
  if (t1,t2) not in seen and (t2,t1) not in seen:
    seen[(t1,t2)] = None
    if t1 not in targets:
      targets[t1] = []
    targets[t1].append(t2)
print(len(seen))

192201


In [None]:
for bidx, batch in enumerate(batches):
  p_embedding = bi_encoder.encode([b[1] for b in batch], batch_size=1024, convert_to_tensor=True,show_progress_bar=True)
  p_embedding = p_embedding.cuda()

  for pidx, vector in enumerate(p_embedding):
    idx, passage = batch[pidx]
    if idx not in targets: continue

    for v_idx in targets[idx]:
      vector2 = p_embedding[(v_idx % len(p_embedding))]
      sim = pytorch_cos_sim(vector, vector2).item()
      seen[(idx,v_idx)] = sim

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

In [None]:
output = []
for entry in missingCR:
  key = entry['text']
  value = entry['verse_text']
  k = passage_to_idx[key]
  v = passage_to_idx[value]
  if (k,v) not in seen: continue
  entry['score'] = seen[(k,v)]
  output.append(entry)
output = pd.DataFrame(output)
output.to_csv(f"{folder}/QP/cross_refs_missing.csv",index=False)

In [None]:
from google.colab import runtime
runtime.unassign()