## Setup

In [None]:
import json
import random
import re, os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import transformers
import unicodedata
import numpy as np
from transformers import AutoTokenizer, AutoModel
from tqdm import notebook as tqdm
from sklearn.model_selection import train_test_split

# Seed every RNG the notebook relies on (numpy, stdlib random, torch) so that
# sampling, splitting and shuffling are repeatable after a kernel restart.
np.random.seed(42)
random.seed(42)
torch.manual_seed(42)

# Pick the GPU when one is available, otherwise fall back to the CPU.
# An explicit check is used instead of try/assert/except: assertions are
# stripped under `python -O`, and a bare `except` would hide unrelated errors.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

from IPython.display import HTML, display

def set_css():
  """Display an HTML <style> element that sets `white-space: pre-wrap` on <pre> blocks."""
  wrap_style = '''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''
  display(HTML(wrap_style))

# Register set_css as a 'pre_run_cell' callback so it runs before each cell executes.
get_ipython().events.register('pre_run_cell', set_css)

Using device: cpu


In [10]:
from google.colab import drive
drive.mount('/content/drive')

folder = "/content/drive/MyDrive/DH"

# Load Bibles
# `bible` maps an id to the text that is later embedded:
#   "<verse id> (<version>)"       -> "<verse id> <verse text>"          (whole verse)
#   "<verse id> (<version>) - <n>" -> "Part <n+1> of <verse id>: <part>" (verse segment)
# The insertion order matters: a later cell indexes the precomputed
# `all_bible_vectors` by position in this dict.
bible = {}

b_versions = ['AKJV','ODRV','Geneva', 'Douay-Rheims', 'Tyndale', 'Wycliffe','Vulgate']
for bname in b_versions:
    # Headerless CSV: column 0 holds the id, column 6 the verse text.
    data = pd.read_csv(f"{folder}/Early-Modern-Sermons/assets/Bibles/{bname}.csv",header=None)
    data = data.to_dict(orient="records")
    for entry in tqdm.tqdm(data):
        key = entry[0]
        v_id = key.split(" (")[0]
        text = entry[6]

        # Skip a Douay-Rheims verse whose ODRV counterpart has already been loaded.
        if "Douay-Rheims" in key and key.replace("Douay-Rheims","ODRV") in bible: continue

        # Whole verses are kept only when shorter than 200 space-separated tokens.
        if len(text.split(" ")) < 200:
            bible[key] = f"{v_id} {text}"

        # Segment the verse after sentence-final punctuation followed by a capital,
        # and after every '!', ':' or ';'.
        parts = re.split(r'(?<=[\.\?]) (?=[A-Z])|(?<=[\!\:\;])', text)
        parts = [re.sub(r'\s+', ' ', p).strip() for p in parts if len(p.strip(" ")) > 0]

        # Nothing to add unless the verse splits into at least two parts
        # (this also protects the parts[0] / parts[-1] lookups below).
        if len(parts) <= 1: continue

        # Do not segment at all when the first or last part has five tokens or fewer,
        # or when the first part contains an unresolved "&name;" entity.
        if (len(parts[0].split(" ")) <= 5 or len(parts[-1].split(" ")) <= 5 or re.search(r"\&\w+\;",parts[0])): continue

        for pidx, p in enumerate(parts):
            # Individual parts are dropped on the same criteria; pidx still counts them,
            # so part numbering reflects the position within the verse.
            if re.search(r"\&\w+\;",p) or len(p.split(" ")) <= 5: continue
            bible[f"{key} - {pidx}"] = f"Part {pidx+1} of {v_id}: {p}"

# Placeholder entry for passages without any Biblical quotation or paraphrase.
bible['NonQP 0.0'] = "No Biblical quotation or paraphrase found"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  0%|          | 0/36702 [00:00<?, ?it/s]

  0%|          | 0/14737 [00:00<?, ?it/s]

  0%|          | 0/31090 [00:00<?, ?it/s]

  0%|          | 0/35811 [00:00<?, ?it/s]

  0%|          | 0/7954 [00:00<?, ?it/s]

  0%|          | 0/9622 [00:00<?, ?it/s]

  0%|          | 0/35809 [00:00<?, ?it/s]

In [9]:
# Ids listed in overly_vague.csv; only verse-part ids (those containing " - ") are kept.
# The values are unused: the dict serves as an ordered set for membership tests.
items = pd.read_csv(f"{folder}/Early-Modern-Sermons/assets/QP_Datasets/overly_vague.csv").to_dict(orient='records')
to_remove = {
  entry['verse_id']: None
  for entry in items
  if " - " in entry['verse_id']
}
len(to_remove)

457

In [59]:
!pip install sentence_transformers
from sentence_transformers import SentenceTransformer, CrossEncoder,models
from sentence_transformers.util import semantic_search

# Path of the saved SentenceTransformer checkpoint that is loaded as the bi-encoder.
model_checkpoint = f"{folder}/models/EEPS_emanjavacas-MacBERTh_2025-05-05/checkpoint-1560"
bi_encoder = SentenceTransformer(model_checkpoint)
# model_name is interpolated into the file names of the cached embedding tensors.
model_name = "EEPS_emanjavacas-MacBERTh_2025-05-05_checkpoint-1560"
# Precomputed Bible embeddings, loaded onto `device`.
# NOTE(review): a later cell indexes this tensor by position in `bible`, so it is
# presumably one row per `bible` entry in insertion order — confirm.
all_bible_vectors = torch.load(f'{folder}/EEPS/Bibles_{model_name}.pt',map_location=device)

In [60]:
# Keep every `bible` entry that is not listed in `to_remove`, remembering its position
# in `bible` so the matching row of `all_bible_vectors` can be selected.
kept = [(row, key) for row, key in enumerate(bible) if key not in to_remove]

bible_ids = [key for _, key in kept]
bible_verses = [bible[key] for key in bible_ids]
bible_vectors = [all_bible_vectors[row] for row, _ in kept]

# Position of each id within the filtered lists above.
v_id_to_idx = dict(zip(bible_ids, range(len(bible_ids))))
len(bible_verses)

276171

## Load & Split Dataset

In [122]:
# Load the ESV translation from JSON. Later cells look it up by verse id
# (e.g. ESV[v_id]) and use the value as the query text.
with open(f"{folder}/Early-Modern-Sermons/assets/Bibles/ESV.json",'r') as file:
  ESV = json.load(file)

# Replacement table: stylized / accented / ligature characters -> plain equivalents.
stylized_to_standard = {
    'ᴇ': 'E', 'ꜱ': 'S', 'ᴠ': 'V', 'ᴅ': 'D', 'ʀ': 'R', 'ᴍ': 'M', 'ɪ': 'I',
    'ᴛ': 'T', 'ʏ': 'Y', 'ʟ': 'L', 'ᴘ': 'P', 'ᴀ': 'A', 'ɴ': 'N', 'ꞵ': 'B',
    'ʙ': 'B', 'ꞽ': 'Y', 'ʜ': 'H', 'Æ': 'AE', 'ꜰ': 'F', 'æ': 'ae', 'ᴏ': 'O',
    'ᴡ': 'W', 'ᴄ': 'C', 'ᴋ': 'K', 'ɢ': 'G', 'ᴢ': 'Z', 'é': 'e', 'ô': 'o',
    'ë': 'e', 'ó': 'o', 'á': 'a', 'œ': 'oe', 'ȯ': 'o', '³': '3', 'Ꜳ': 'AE',
    'Ꜵ': 'AO', 'Ꜷ': 'AU',
}

def normalize_stylized(text):
    """Return `text` with every character found in `stylized_to_standard` replaced.

    Characters without an entry in the table are copied through unchanged.
    """
    converted = []
    for ch in text:
        converted.append(stylized_to_standard.get(ch, ch))
    return ''.join(converted)


def fix_name(v_id):
  """Rename a book in `v_id`: 1/2 Kings -> 3/4 Kings, 1/2 Samuel -> 1/2 Kings,
  and "<n> Chronicles" -> "<n> Paralipomenon".

  Only the first matching rule is applied. For the Kings/Samuel rules the result is the
  new book name followed by whatever came after the old name; ids matching no rule are
  returned unchanged.
  """
  # Order matters: "1 Kings"/"2 Kings" are tested before the Samuel rules, as in an if/elif chain.
  renames = (
    ("1 Kings", "3 Kings"),
    ("2 Kings", "4 Kings"),
    ("1 Samuel", "1 Kings"),
    ("2 Samuel", "2 Kings"),
  )
  for old_book, new_book in renames:
    if old_book in v_id:
      return new_book + v_id.split(old_book)[-1]
  if re.search(r"^\d+ Chronicles",v_id):
    return re.sub(r"Chronicles","Paralipomenon",v_id)
  return v_id

def fix_name_revert(v_id):
  """Rename a book in `v_id`: 3/4 Kings -> 1/2 Kings, 1/2 Kings -> 1/2 Samuel,
  and "<n> Paralipomenon" -> "<n> Chronicles".

  Only the first matching rule is applied. For the Kings rules the result is the new
  book name followed by whatever came after the old name; ids matching no rule are
  returned unchanged.
  """
  # Order matters: "3 Kings"/"4 Kings" are tested before "1 Kings"/"2 Kings".
  renames = (
    ("3 Kings", "1 Kings"),
    ("4 Kings", "2 Kings"),
    ("1 Kings", "1 Samuel"),
    ("2 Kings", "2 Samuel"),
  )
  for old_book, new_book in renames:
    if old_book in v_id:
      return new_book + v_id.split(old_book)[-1]
  if re.search(r"^\d+ Paralipomenon",v_id):
    return re.sub(r"Paralipomenon","Chronicles",v_id)
  return v_id

def get_training_data():
    """Build the query records used for training and development.

    Reads the two parallel-verse CSVs and groups their rows by (ESV verse id, version).
    Each record holds the ESV id ('qid'), the ESV text ('query') and two dicts used as
    ordered sets of Bible ids: 'pos' (accepted parallels) and 'neg' (rejected ones).
    Rows of the first CSV are always accepted; rows of the second are accepted when
    `toCheck` is False, or when `prediction` is True and `correction` is not False.

    Returns:
        dict mapping (ESV verse id, version) -> record, restricted to records with at
        least one positive.

    Raises:
        KeyError: if an ESV id is missing from `ESV` or a parallel id is missing from `bible`.
    """
    confident = pd.read_csv(f"{folder}/Early-Modern-Sermons/assets/QP_Datasets/parallel_verses_CONFIDENT.csv")
    confident['toCheck'] = False
    rows = confident.to_dict(orient='records')
    rows += pd.read_csv(f"{folder}/Early-Modern-Sermons/assets/QP_Datasets/parallel_verses_CONFIDENT - HNDPR.csv").to_dict(orient='records')

    grouped = {}
    for row in rows:
        esv_id = row['ESV']
        group_key = (esv_id, row['version'])
        query_text = ESV[esv_id]  # ESV text, no verse id prepended

        record = grouped.setdefault(group_key, {
            'qid': esv_id,
            'query': query_text,
            'pos': {},  # verse ids with version
            'neg': {},  # high similarity and equivalent numberings
        })

        candidate_id = row['Parallel'] + f" ({row['version']})"
        bible[candidate_id]  # lookup only: fails fast when the parallel verse is unknown

        if row['toCheck'] is False:
            accepted = True
        else:
            accepted = row['prediction'] is True and row['correction'] is not False
        record['pos' if accepted else 'neg'][candidate_id] = None

    return {group_key: record for group_key, record in grouped.items() if len(record['pos']) != 0}

In [123]:
from sklearn.model_selection import train_test_split
import random
random.seed(42)

def split_training_data(training_data, n_psalms_dev=200, dev_fraction=0.05, seed=42):
    """Split query records into train and dev sets by verse id.

    The split is made on 'qid', so every version of a verse lands on the same side.
    Psalms ids are sampled separately from all other ids.

    Args:
        training_data: dict of records as returned by get_training_data(); each value
            must have a 'qid'.
        n_psalms_dev: number of Psalms ids to put in dev. Clamped to the number of
            Psalms ids available, so small inputs no longer raise ValueError.
        dev_fraction: `test_size` passed to train_test_split for non-Psalms ids.
        seed: `random_state` passed to train_test_split. The Psalms sample uses the
            global `random` state, so seed it beforehand for a repeatable split.

    Returns:
        (train_data, dev_data): two dicts with the same key/value layout as `training_data`.
    """
    # Dicts are used as ordered sets so the id order follows `training_data`.
    psalms_keys = list({v['qid']: None for v in training_data.values() if v['qid'].startswith("Psalms")})
    # The last entry of bible_ids is excluded from the non-Psalms ids
    # (presumably the 'NonQP 0.0' placeholder appended last to `bible` — verify).
    non_psalms_keys = list({v['qid']: None for v in training_data.values()
                            if not v['qid'].startswith("Psalms") and v['qid'] != bible_ids[-1]})

    psalms_dev_keys = random.sample(psalms_keys, min(n_psalms_dev, len(psalms_keys)))
    psalms_dev_lookup = set(psalms_dev_keys)
    # List comprehension instead of a set difference keeps the remaining ids in a stable order.
    psalms_train_keys = [qid for qid in psalms_keys if qid not in psalms_dev_lookup]

    train_keys_np, dev_keys_np = train_test_split(non_psalms_keys, test_size=dev_fraction, random_state=seed)
    train_keys = {k:None for k in train_keys_np + psalms_train_keys}
    dev_keys = {k:None for k in dev_keys_np + psalms_dev_keys}
    print('Train IDs', len(train_keys), 'Dev IDs', len(dev_keys))

    train_data = {k:v for k,v in training_data.items() if v['qid'] in train_keys}
    dev_data = {k:v for k,v in training_data.items() if v['qid'] in dev_keys}

    print(len(train_data),len(dev_data))
    return train_data, dev_data

In [124]:
# Build all query records, split them into train/dev, then drop the combined dict
# since only the two splits are used afterwards.
training_data = get_training_data()
train_data, dev_data = split_training_data(training_data)
del training_data

Train IDs 27021 Dev IDs 1556
115388 6486


## Parallel Text

In [66]:
# Map every whole-verse id to the ids of its parts ("<verse id> - <n>").
# Verses without parts still get an entry, holding an empty list.
bible_parts = {}
for key in bible:
  base_id, separator, _ = key.partition(" - ")
  part_ids = bible_parts.setdefault(base_id, [])
  if separator:
    part_ids.append(key)
len(bible_parts)

157011

In [144]:
def get_parts(pos_id):
  """Return a new list with the part ids recorded for `pos_id` in `bible_parts`.

  The list is empty when the verse has no parts; an id missing from `bible_parts`
  raises KeyError.
  """
  return list(bible_parts[pos_id])

def get_pairs(data):
  """Collect (query id, positive id) pairs from the records in `data`.

  For every positive of every record, the following keys are added, each mapped to the
  record's ESV verse id:
    * (ESV verse id, positive)
    * (part of the positive, positive)
    * (positive, positive of the same verse in another version)
    * (part of the positive, positive of the same verse in another version)

  Prints the number of pairs and returns the dict.
  """
  pairs = {}
  for (v_id, version), entry in data.items():
    # Other versions that have a record for the same ESV verse, in b_versions order.
    sibling_versions = [ver for ver in b_versions if ver != version and (v_id, ver) in data]
    for pos_id in entry['pos']:
      pairs[(v_id, pos_id)] = v_id
      part_ids = get_parts(pos_id)
      for part_id in part_ids:
        pairs[(part_id, pos_id)] = v_id
      for ver in sibling_versions:
        # add parallel verses and their parts to PT list
        for pos2_id in data[(v_id, ver)]['pos']:
          pairs[(pos_id, pos2_id)] = v_id
          for part_id in part_ids:
            pairs[(part_id, pos2_id)] = v_id
  print(len(pairs))
  return pairs

# Both dicts map (query id, positive id) -> ESV verse id of the record the pair came from.
train_pairs = get_pairs(train_data)
dev_pairs = get_pairs(dev_data)

930857
51425


In [126]:
# Number of hard negatives per pair. With the 930857 training pairs printed above this
# gives 930857*25 = 23271425 (about 23 million) training triplets.
# NOTE(review): the number shown under this cell (20386725) does not equal that product
# and no code here produces it — presumably stale output; confirm.
k = 25


20386725

## Cross References

In [38]:
# Load the cross-reference table; it is indexed by ESV verse id below (CR[item['qid']]),
# and each value is iterated as a collection of verse ids.
with open(f"{folder}/EEPS/cross_references.json",'r') as file:
  CR = json.load(file)
len(CR)

29364

In [57]:
# Sanity check: print the first training record, followed by the positives of each of
# its cross-referenced verses in the same Bible version, then stop.
for key, item in train_data.items():
  print(key,item)
  version = key[-1]
  # .get: a verse with no entry in CR prints no references instead of raising KeyError.
  for cr in CR.get(item['qid'], []):
    cr = (cr, version)
    if cr in train_data:
      for cr_id in train_data[cr]['pos']:
        print(cr_id, bible[cr_id])
  break

('Judges 14.1', 'Geneva') {'qid': 'Judges 14.1', 'query': 'Samson went down to Timnah, and at Timnah he saw one of the daughters of the Philistines.', 'pos': {'Judges 14.1 (Geneva)': None}, 'neg': {}, 'CR': []}
Genesis 6.2 (Geneva) Genesis 6.2 Then the sonnes of God sawe the daughters of men that they were faire, and they tooke them wiues of all that they liked.
Joshua 19.43 (Geneva) Joshua 19.43 And Elon, and Temnathah, and Ekron,
Joshua 15.10 (Geneva) Joshua 15.10 Then this border compasseth from Baalah Westward vnto mount Seir, and goeth along vnto the side of mount Iearim, which is Chesalon on the Northside: so it commeth downe to Bethshemesh, and goeth to Timnah.
Genesis 34.2 (Geneva) Genesis 34.2 Whome when Shechem the sonne of Hamor the Hiuite lorde of that countrey sawe, hee tooke her, and lay with her, and defiled her.
Job 31.1 (Geneva) Job 31.1 I made a couenant with mine eyes: why then should I thinke on a mayde?
1 John 2.16 (Geneva) 1 John 2.16 For all that is in this world

In [None]:
# choose cross references with high similarity (>=0.8)

## Triplets and Dataloaders

In [169]:
# Position of each ESV verse id in the iteration order of `ESV`.
ESV_id_to_idx = {verse_id: position for position, verse_id in enumerate(ESV)}

In [None]:
# Encode every ESV verse with the bi-encoder and write the tensor to Drive.
ESV_vectors = bi_encoder.encode(list(ESV.values()), batch_size=1024, convert_to_tensor=True, show_progress_bar=True)
torch.save(ESV_vectors, f"{folder}/EEPS/ESV_{model_name}.pt")
# Reload the saved tensor onto the active device. torch.load's keyword is
# `map_location`; the former `map=location` was an invalid keyword and an undefined name.
ESV_vectors = torch.load(f"{folder}/EEPS/ESV_{model_name}.pt", map_location=device)

In [None]:
# Strip the id prefix that the Bible-loading cell puts in front of each text:
#   "<verse id> <text>"               (whole verse)
#   "Part <n> of <verse id>: <text>"  (verse part — note the colon after the id)
# The optional ':' is required for the second form; without it the part prefix never
# matches and the "Part n of ..." header stays in the text that gets encoded.
prefix_pattern = re.compile(r"^(?:Part \d+ of )?[\w\s]+\d+\.\d+:? ")
bible_verses_no_prepend = [prefix_pattern.sub("", v, count=1) for v in bible_verses]
bible_vectors_no_prepend = bi_encoder.encode(bible_verses_no_prepend, batch_size=1024, convert_to_tensor=True, show_progress_bar=True)
torch.save(bible_vectors_no_prepend,f"{folder}/EEPS/Bible_vectors_no_prepend.pt")
bible_vectors_no_prepend = torch.load(f"{folder}/EEPS/Bible_vectors_no_prepend.pt", map_location=device)

In [176]:
def make_batches(pairs,batch_size=40000):
  """Group the distinct first elements of the keys of `pairs` into fixed-size batches.

  Args:
    pairs: dict whose keys are tuples; only element 0 of each key is used.
    batch_size: maximum number of ids per batch.

  Returns:
    (batches, keys): `batches` is a list of lists of ids in first-seen order; `keys`
    maps each id to the value of the last pair that started with it.
  """
  first_to_value = {}
  for pair, value in pairs.items():
    first_to_value[pair[0]] = value
  print(len(first_to_value),'unique verse ids')

  ordered_ids = list(first_to_value)
  chunks = [ordered_ids[offset:offset + batch_size]
            for offset in range(0, len(ordered_ids), batch_size)]
  print(len(chunks), 'batches')
  return chunks, first_to_value

# Batch the query ids of each split; *_keys maps every query id to its ESV verse id
# (the values stored in the pair dicts).
dev_batches, dev_keys = make_batches(dev_pairs)
train_batches, train_keys = make_batches(train_pairs)

13022 unique verse ids
1 batches
231780 unique verse ids
6 batches


In [None]:
NUM_NEGATIVES = 50
# Number of nearest neighbours retrieved per query before positives are filtered out.
# NOTE(review): NUM_PASSAGES was referenced but never defined in this notebook; twice
# the number of negatives is a placeholder margin — confirm the intended value.
NUM_PASSAGES = 2 * NUM_NEGATIVES

def get_negatives(batch, keys, data):
  """Mine hard negatives for every query id in `batch`.

  Each query is embedded (Bible ids use `bible_vectors_no_prepend`, ESV ids use
  `ESV_vectors`) and searched against `bible_vectors`. Hits that are a positive of the
  same ESV verse in any version, or a part of such a positive, are discarded; up to
  NUM_NEGATIVES of the remaining hits are kept.

  Args:
    batch: list of query ids (ESV verse ids, or Bible ids containing " (").
    keys: dict mapping each query id to its ESV verse id.
    data: dict of records keyed by (ESV verse id, version), each with a 'pos' collection.

  Returns:
    dict mapping the position of each query in `batch` to a list of negative Bible ids.
    Bible-id queries that are absent from `v_id_to_idx` get an empty list.
  """
  # Remember which batch positions actually have a vector so that hits[i] can be
  # matched back to the right query even when some queries are skipped.
  positions, vectors = [], []
  for position, v_id in enumerate(batch):
    if " (" in v_id:
      if v_id not in v_id_to_idx: continue
      vectors.append(bible_vectors_no_prepend[v_id_to_idx[v_id]])
    else:
      vectors.append(ESV_vectors[ESV_id_to_idx[v_id]])
    positions.append(position)

  negatives = {position: [] for position in range(len(batch))}
  if len(vectors) == 0:
    return negatives

  hits = semantic_search(vectors,bible_vectors,query_chunk_size=1000,top_k=NUM_PASSAGES)

  for idx, hitlist in zip(positions, tqdm.tqdm(hits)):
    qid = batch[idx]
    # Positives of this verse across all versions, gathered once per query.
    positives = set()
    for ver in b_versions:
      record = data.get((keys[qid], ver))
      if record is not None:
        positives.update(record['pos'])

    for hit in hitlist:
      # corpus_id indexes bible_vectors, which is aligned with bible_ids.
      v_id = bible_ids[hit['corpus_id']]
      v_id_no_part = v_id.split(" - ")[0]
      if v_id_no_part in positives: continue
      negatives[idx].append(v_id)
      if len(negatives[idx]) == NUM_NEGATIVES: break

  return negatives

  0%|          | 0/1400 [00:00<?, ?it/s]

  0%|          | 0/8860 [00:00<?, ?it/s]

In [None]:
from torch.utils.data import DataLoader,Dataset
from sentence_transformers import InputExample, LoggingHandler, SentenceTransformer, losses, models, util

k = 25  # each usable query id is repeated k times in the dataset

class BibleDataset(Dataset):
    """Triplet dataset yielding InputExample(texts=[query, positive, negative]).

    Args:
        query_dict: either a dict whose values are records, or an iterable of records.
            A record needs 'qid', 'query', 'pos' and 'neg'; 'pos' and 'neg' may be any
            iterable of Bible ids (list or dict keys). Records sharing a 'qid'
            (e.g. one per Bible version) are merged into one query.

    Attributes:
        queries: dict qid -> record copy whose 'pos' and 'neg' are lists.
        queries_ids: ids of the queries that have at least one positive and one
            negative, repeated `k` times; this defines the dataset length.

    The input records are copied, not mutated.
    """
    def __init__(self,query_dict):
        # Iterating a dict yields its keys, so take the values when a mapping is passed.
        entries = query_dict.values() if isinstance(query_dict, dict) else query_dict

        self.queries = {}
        for q in entries:
            merged = self.queries.get(q['qid'])
            if merged is None:
                merged = dict(q)
                merged['pos'], merged['neg'] = [], []
                self.queries[q['qid']] = merged
            for field in ('pos', 'neg'):
                # Only ids contributed by earlier records are treated as duplicates.
                already = set(merged[field])
                merged[field].extend(v_id for v_id in q[field] if v_id not in already)

        # A triplet needs both a positive and a negative; queries lacking either stay in
        # self.queries but are never served, which avoids popping from an empty list.
        usable_ids = [qid for qid, q in self.queries.items() if q['pos'] and q['neg']]
        self.queries_ids = usable_ids * k
        for entry in self.queries.values():
            random.shuffle(entry["neg"])

    def __getitem__(self, item):
        query = self.queries[self.queries_ids[item]]
        query_text = query["query"]

        # Take the first positive / negative and move it to the back, so repeated
        # visits of the same query cycle through all of its positives and negatives.
        pos_id = query["pos"].pop(0)
        pos_text = bible[pos_id]
        query['pos'].append(pos_id)

        neg_id = query["neg"].pop(0)
        neg_text = bible[neg_id]
        query['neg'].append(neg_id)

        return InputExample(texts=[query_text, pos_text, neg_text])

    def __len__(self):
        return len(self.queries_ids)

# Wrap the train and dev records in triplet datasets.
train_dataset = BibleDataset(train_data)
eval_dataset = BibleDataset(dev_data)

In [None]:
from datetime import datetime
# Hyperparameters used by the cells below.
max_seq_length = 500   # passed to models.Transformer for the baseline model
train_batch_size = 64  # batch size of the training DataLoader
pooling = "mean"       # pooling mode passed to models.Pooling
warmup_steps = 1000    # passed to bi_encoder.fit
lr = 2e-5              # learning rate passed to fit via optimizer_params

In [None]:
# triplet evaluator
from sentence_transformers.evaluation import TripletEvaluator
# Build (anchor, positive, negative) text triplets for the dev queries: the anchor is
# the ESV text, and each positive is combined with at most k of the query's negatives.
anchor_texts, positive_texts, negative_texts = [], [], []

for cvkey, info in eval_dataset.queries.items():
  bible_verse = ESV[cvkey]
  for p in info["pos"]:
    for neg_count, n in enumerate(info["neg"], start=1):
      anchor_texts.append(bible_verse)
      positive_texts.append(bible[p])
      negative_texts.append(bible[n])
      if neg_count == k: break

eval_triplets = {"anchors": anchor_texts, "positives": positive_texts, "negatives": negative_texts}

print(len(eval_triplets["anchors"]))

triplet_evaluator = TripletEvaluator(
    name="EEPS-triplets-dev",
    anchors=eval_triplets["anchors"],
    positives=eval_triplets["positives"],
    negatives=eval_triplets["negatives"],
    batch_size=1024,
    show_progress_bar=True
)

40520


In [None]:
# Score the loaded bi-encoder on the dev triplets before any further training.
print(model_name)
results = triplet_evaluator(bi_encoder)
results

In [None]:
# Baseline: score the base MacBERTh model, wrapped with a pooling layer, on the same
# dev triplets, then delete it.
# NOTE(review): this rebinds `model_name`, which earlier cells interpolate into the
# cached-vector file names; re-running those cells after this one would use different
# paths — confirm this is intended.
model_name = 'emanjavacas/MacBERTh'
print(model_name)
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
results = triplet_evaluator(model)
del model
results

## Training

In [None]:
# Checkpoint directory: <folder>/models/<model_name>_<today's date as YYYY-MM-DD>.
model_name = "EEPS_QP_emanjavacas-MacBERTh"
run_date = datetime.now().strftime("%Y-%m-%d")
model_save_path = f"{folder}/models/{model_name}_{run_date}"
print(model_save_path)

/content/drive/MyDrive/DH/models/EEPS_emanjavacas-MacBERTh_2025-05-05


In [None]:
# Training a SentenceTransformer model needs a dataset, a dataloader, and a loss.
# The dataloader shuffles the triplet dataset; the loss is MultipleNegativesRankingLoss
# attached to the bi-encoder being fine-tuned.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MultipleNegativesRankingLoss(model=bi_encoder)
# NOTE(review): the output saved under this cell reports k=40 while k is set to 25 in
# this notebook — presumably from an earlier run; confirm.
print(len(train_dataloader.dataset), "triplets", f"k={k}")

99800 triplets k=40


In [None]:
!pip install datasets

In [None]:
# Train the model
# NOTE(review): this import is not used in this cell and rebinds the name `Dataset`,
# which an earlier cell imported from torch.utils.data and uses as BibleDataset's base
# class; re-running that cell afterwards would pick up this one instead — confirm.
from datasets import Dataset
# One epoch over the triplet dataloader with mixed precision (use_amp). A checkpoint is
# written to model_save_path every len(train_dataloader) steps, i.e. once per epoch, and
# the dev triplet evaluator is passed in for evaluation.
bi_encoder.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=warmup_steps,
    use_amp=True,
    checkpoint_path=model_save_path,
    checkpoint_save_steps=len(train_dataloader),
    optimizer_params={"lr": lr},
    evaluator=triplet_evaluator
)

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Eeps-triplets-dev Cosine Accuracy
500,0.6185,,
1000,0.0281,,
1500,0.0223,,
1560,0.0223,No log,0.986969


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]