# Setup

In [None]:
# Mount Google Drive into the Colab runtime; every path below is built from `folder`.
from google.colab import drive
drive.mount('/content/drive')

# Root directory on Drive that holds the corpus assets, saved vectors and model checkpoints.
folder = f"/content/drive/MyDrive/DH"
import pandas as pd
import os, json,re, torch
from tqdm import notebook as tqdm

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import json
import random
import re, os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import transformers
import unicodedata
import numpy as np
import scipy as sp
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize
from transformers import AutoTokenizer, AutoModel
from tqdm import notebook as tqdm
from sklearn.model_selection import train_test_split

In [None]:
# Build `bible`: an insertion-ordered dict mapping a verse key to the text that gets embedded.
# NOTE: the insertion order matters. `bible_vectors` is loaded from disk later and indexed
# positionally against `bible_ids`, so any change to which keys are added (or in what order)
# requires re-encoding and re-saving the vectors.
bible = {}

b_versions = ['AKJV','ODRV','Geneva', 'Douay-Rheims', 'Tyndale', 'Wycliffe','Vulgate']
for bname in b_versions:
    # The CSVs are read without a header row, so columns are addressed by integer position.
    data = pd.read_csv(f"{folder}/Early-Modern-Sermons/assets/Bibles/{bname}.csv",header=None)
    data = data.to_dict(orient="records")
    for entry in tqdm.tqdm(data):
        # Column 0 is the verse key; everything before the first " (" is kept as the verse id.
        key = entry[0]
        v_id = key.split(" (")[0]
        # Column 6 is the verse text (must be a string; `.strip()` would fail on a missing value).
        text = entry[6].strip()
        # Skip a Douay-Rheims verse entirely (whole verse and parts) when the same key with
        # "ODRV" substituted has already been stored.
        if re.search("Douay-Rheims",key):
            if re.sub("Douay-Rheims","ODRV",key) in bible: continue
        # Only verses shorter than 200 space-separated tokens are stored whole.
        if len(text.split(" ")) < 200:
            bible[key] = f"{v_id} {text}"

        # Split into parts: on a space that follows '.' or '?' and precedes a capital letter,
        # or at the position right after '!', ':' or ';'. Whitespace is then collapsed and
        # empty pieces are dropped.
        parts = re.split(r'(?<=[\.\?]) (?=[A-Z])|(?<=[\!\:\;])', text)
        parts = [re.sub(r'\s+', ' ', p).strip() for p in parts if len(p.strip(" ")) > 0]
        # NOTE(review): `parts[0]` raises IndexError if a verse text is empty after stripping.
        # When the first or last part has <= 5 tokens, or the first part contains an
        # "&name;" entity, no parts are stored for this verse: the loop below is a no-op.
        if (len(parts[0].split(" ")) <= 5 or len(parts[-1].split(" ")) <= 5 or re.search(r"\&\w+\;",parts[0])):
            for pidx, p in enumerate(parts): continue
        elif len(parts) > 1:
            for pidx, p in enumerate(parts):
              # Part keys are "<verse key> - <position>"; later cells tell parts from whole
              # verses by the " - " separator.
              p_id = f"{key} - {pidx}"
              if len(p) == 0: continue
              # Parts with an entity or <= 5 tokens are skipped, so part numbers can have gaps.
              if re.search(r"\&\w+\;",p) or len(p.split(" ")) <= 5: continue
              bible[p_id] = f"Part {pidx+1} of {v_id}: {p}"

# Sentinel entry for "no match"; it is always the last key, and `bible_ids[-1]` is used as such later.
bible['NonQP 0.0'] = "No Biblical quotation or paraphrase found"

# Parallel views of the dict; positions in these lists correspond to rows of `bible_vectors`.
bible_verses = list(bible.values())
bible_ids = list(bible.keys())
# Reverse lookup from embedded text to key (if two keys shared a text, the later key would win).
verse_to_id = {v:k for k,v in bible.items()}
len(bible_verses)

  0%|          | 0/36702 [00:00<?, ?it/s]

  0%|          | 0/14737 [00:00<?, ?it/s]

  0%|          | 0/31090 [00:00<?, ?it/s]

  0%|          | 0/35811 [00:00<?, ?it/s]

  0%|          | 0/7954 [00:00<?, ?it/s]

  0%|          | 0/9622 [00:00<?, ?it/s]

  0%|          | 0/35809 [00:00<?, ?it/s]

276628

In [None]:
# Hugging Face checkpoint used as the base encoder.
model_checkpoint = "emanjavacas/MacBERTh"
# Pick the GPU when one is available, otherwise fall back to the CPU.
# (Decided with a plain conditional rather than assert/except: asserts are removed under
# `python -O`, and a bare `except:` would also hide unrelated errors.)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

from IPython.display import HTML, display

def set_css(*_event_args):
  """Inject a CSS rule so that <pre> output wraps instead of scrolling sideways.

  Registered as a `pre_run_cell` callback. IPython documents that event's
  callback as receiving an `info` argument, so any positional arguments are
  accepted and ignored; calling `set_css()` with no arguments still works.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
# Run the CSS injection before every cell execution.
get_ipython().events.register('pre_run_cell', set_css)

Using device: cuda


In [None]:
!pip install sentence_transformers



In [None]:
from sentence_transformers import SentenceTransformer, CrossEncoder,models
from sentence_transformers.util import semantic_search

import sys
# A None entry in sys.modules makes any later `import wandb` raise ImportError.
# Presumably this keeps the training loop from enabling Weights & Biases logging — confirm.
sys.modules["wandb"] = None

In [None]:
# Assemble the bi-encoder: the MacBERTh transformer followed by mean pooling over token embeddings.
# Inputs are truncated to 128 tokens here. NOTE(review): a separate `max_seq_length = 500` is
# assigned in a later cell but is never passed to this model.
word_embedding_model = models.Transformer(model_checkpoint, max_seq_length=128)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), "mean")
bi_encoder = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [None]:
# Load saved Bible vectors
# torch.load unpickles the file, so only point it at files you produced yourself.
bible_vectors = torch.load(f"{folder}/EEPS/Bibles_MacBERTh.pt",map_location=device)
# Rows are matched to `bible_ids` purely by position in later cells, so fail fast if the
# saved tensor was produced from a different version of the `bible` dict.
assert bible_vectors.shape[0] == len(bible_ids), (
    f"saved vectors have {bible_vectors.shape[0]} rows but bible has {len(bible_ids)} entries; "
    "re-encode the Bible texts before continuing"
)
bible_vectors.shape

torch.Size([276628, 768])

# Load & Organize Dataset

In [None]:
# ESV lookup; a later cell indexes it with the verse id (`qid`) of each query.
# The encoding is given explicitly so the read does not depend on the runtime's locale default.
with open(f"{folder}/Early-Modern-Sermons/assets/Bibles/ESV.json",'r',encoding='utf-8') as file:
  ESV = json.load(file)

def fix_name(v_id):
  """Translate a verse id to the alternative book naming used by some Bible keys.

  Kings becomes 3/4 Kings, Samuel becomes 1/2 Kings, and "<n> Chronicles" becomes
  "<n> Paralipomenon". Only the first matching rule is applied, and ids that match
  no rule are returned unchanged. For the Kings/Samuel rules, everything before
  the matched book name is discarded.
  """
  # Order matters: the Kings rules must be tried before the Samuel rules so that an id
  # is never renamed twice.
  book_renames = (
    ("1 Kings", "3 Kings"),
    ("2 Kings", "4 Kings"),
    ("1 Samuel", "1 Kings"),
    ("2 Samuel", "2 Kings"),
  )
  for old_book, new_book in book_renames:
    if old_book in v_id:
      return new_book + v_id.split(old_book)[-1]
  if re.search(r"^\d+ Chronicles",v_id):
    return re.sub(r"Chronicles","Paralipomenon",v_id)
  return v_id

def get_training_data():
    """Read the labelled pair CSVs and group them into one record per query.

    Rows are read from the FINAL and NEGATIVES sample files (columns used:
    index, version, text, verse_text, label, prediction).

    Returns:
        dict keyed by (index, version). Each value has 'qid' (verse id parsed
        from the start of the row's text), 'query' (the text after that id),
        and 'pos' / 'neg' dicts whose keys are ids present in `bible`.
        Records without any positive are left out.

    Raises:
        IndexError: if a text or verse_text does not start with a verse id.
        KeyError: if a resolved candidate id is not a key of `bible`.
    """
    # reading training and development data
    rows = pd.read_csv(f"{folder}/Early-Modern-Sermons/assets/QP_Datasets/parallel_predictions_SAMPLE - FINAL.csv").to_dict(orient='records')
    rows += pd.read_csv(f"{folder}/Early-Modern-Sermons/assets/QP_Datasets/parallel_predictions_SAMPLE - NEGATIVES.csv").to_dict(orient='records')

    verse_id_pattern = r"^[\w\s]+\d+.\d+"
    renamed_book_versions = ('Vulgate', 'Douay-Rheims', 'ODRV')

    all_data = {}
    for item in rows:
        version = item['version']
        # The grouping key uses the version exactly as it appears in the row,
        # i.e. before the Psalms 9 adjustment below.
        key = (item['index'], version)

        entry = all_data.get(key)
        if entry is None:
            v_id = re.findall(verse_id_pattern, item['text'])[0]
            entry = all_data[key] = {
                'qid': v_id,
                'query': item['text'].split(v_id)[-1].strip(),
                'pos': {},  # verse ids with version
                'neg': {},  # high similarity and equivalent numberings
            }

        v2_id = re.findall(verse_id_pattern, item['verse_text'])[0]
        # Douay-Rheims candidates in Psalms 9 are looked up under ODRV instead.
        if version == 'Douay-Rheims' and re.search(r"Psalms 9\.", v2_id):
            version = 'ODRV'
        v2_id = f"{v2_id} ({version})"
        if version in renamed_book_versions and v2_id not in bible:
            # Retry with the alternative book names; one verse is special-cased.
            v2_id = fix_name(v2_id)
            if v2_id == "Psalms 77.59 (ODRV)":
                v2_id = "Psalms 77.59 (Douay-Rheims)"
        # Lookup only: raises KeyError here if the id still cannot be resolved.
        bible[v2_id]

        # Positive when the label is True, or when the prediction is True and the
        # label is anything other than an explicit False.
        confirmed = item['label'] is True
        predicted_unrejected = item['prediction'] is True and item['label'] is not False
        bucket = 'pos' if (confirmed or predicted_unrejected) else 'neg'
        entry[bucket][v2_id] = None

    return {key: entry for key, entry in all_data.items() if len(entry['pos']) > 0}

In [None]:
from sklearn.model_selection import train_test_split
import random
random.seed(42)

def split_training_data(training_data):
    """Split the query records into train and dev dictionaries.

    Psalms queries (qid starting with "Psalms") contribute a fixed sample of
    100 keys to dev; all other queries are split 80/20. Non-Psalms records
    whose qid equals the last key of `bible_ids` are dropped from both splits.

    Key lists are built with order-preserving list operations rather than
    sets. The keys are tuples that contain strings, so iterating a set of them
    depends on the interpreter's per-process hash seed, and the Psalms sample
    would then differ between kernels even with `random.seed(42)` set.

    Args:
        training_data: dict mapping a key to a record with at least a 'qid' string.

    Returns:
        (train_data, dev_data): two dicts holding the selected records.

    Raises:
        ValueError: from `random.sample` when there are fewer than 100 Psalms records.
    """
    psalms_keys = [key for key, rec in training_data.items() if rec['qid'].startswith("Psalms")]
    non_psalms_keys = [
        key for key, rec in training_data.items()
        if not rec['qid'].startswith("Psalms") and rec['qid'] != bible_ids[-1]
    ]

    # Uses the module-level RNG, so the caller's `random.seed(...)` controls the sample.
    psalms_dev_keys = random.sample(psalms_keys, 100)
    held_out = set(psalms_dev_keys)  # membership tests only; never iterated
    psalms_train_keys = [key for key in psalms_keys if key not in held_out]

    train_keys_np, dev_keys_np = train_test_split(non_psalms_keys, test_size=0.2, random_state=42)
    train_keys = train_keys_np + psalms_train_keys
    dev_keys = dev_keys_np + psalms_dev_keys

    train_data = {key: training_data[key] for key in train_keys}
    dev_data = {key: training_data[key] for key in dev_keys}

    print(len(train_data),len(dev_data))
    return train_data, dev_data

In [None]:
# Build the labelled records, split them, then drop the combined dict
# (the two splits keep references to the records they contain).
training_data = get_training_data()
train_data, dev_data = split_training_data(training_data)
del training_data

8860 1400


In [None]:
# Collect the whole-verse entries; part entries are recognised by " - " in their key.
# Each kept id is paired with the vector stored at the same position in `bible_vectors`.
full_ids = []
full_vectors = []
for orig_idx, v_id in enumerate(bible_ids):
  if " - " in v_id:
    continue
  full_ids.append(v_id)
  full_vectors.append(bible_vectors[orig_idx])
idx = len(full_ids)
print(len(full_ids),len(full_vectors))

# Per version: (list of vectors, list of ids), aligned by position.
# An entry belongs to a version when the version name occurs in its id.
version_vectors = {}
for b in b_versions:
  members = [(vec, v_id) for vec, v_id in zip(full_vectors, full_ids) if b in v_id]
  version_vectors[b] = ([vec for vec, _ in members], [v_id for _, v_id in members])
  print(b, len(version_vectors[b][1]))


157008 157008
AKJV 36700
ODRV 14730
Geneva 31090
Douay-Rheims 21103
Tyndale 7954
Wycliffe 9622
Vulgate 35808


In [None]:
# Embed every query with the current bi-encoder. Row i corresponds to the i-th record in the
# dict's iteration order; get_passages relies on that when it pairs vectors[idx] with keys.
dev_vectors = bi_encoder.encode([entry['query'] for entry in dev_data.values()], batch_size=1024, convert_to_tensor=True, show_progress_bar=True)
train_vectors = bi_encoder.encode([entry['query'] for entry in train_data.values()], batch_size=1024, convert_to_tensor=True, show_progress_bar=True)

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/9 [00:00<?, ?it/s]

In [None]:
# Number of nearest verses retrieved per query, and the cap on negatives kept per query.
NUM_PASSAGES = 50
def get_passages(vectors, data):
  """Add retrieved hard negatives to each query record.

  For every query, the NUM_PASSAGES most similar whole verses from the query's
  own Bible version are retrieved; those that are not listed as positives are
  appended after the negatives the record already has. The negative list is
  then cut to NUM_PASSAGES ids.

  Args:
      vectors: query embeddings; vectors[i] must belong to the i-th key of `data`.
      data: dict keyed by (index, version), with records holding 'pos' and 'neg' dicts.

  Returns:
      A list of new records in the iteration order of `data`, where 'pos' and
      'neg' are lists of ids. The records in `data` are not modified, so the
      function can be called again on the same input.
  """
  # semantic_search accepts a list of vectors but turns it into a single tensor on every
  # call; do that once per version instead of once per query.
  stacked_corpus = {}
  results = []
  for idx, key in enumerate(tqdm.tqdm(data)):
    _, version = key
    corpus, corpus_ids = version_vectors[version]
    if version not in stacked_corpus:
      stacked_corpus[version] = torch.stack(corpus) if isinstance(corpus, list) else corpus
    # A single query is passed, so only the first (and only) hit list is needed.
    hitlist = semantic_search(vectors[idx],stacked_corpus[version],query_chunk_size=1000,top_k=NUM_PASSAGES)[0]

    entry = data[key]
    negatives = dict(entry['neg'])  # copy: existing negatives stay first, in their order
    for hit in hitlist:
      v_id = corpus_ids[hit['corpus_id']]
      if v_id in entry['pos']:
        continue
      negatives[v_id] = True

    updated = dict(entry)
    updated['neg'] = list(negatives.keys())[:NUM_PASSAGES]
    updated['pos'] = list(entry['pos'].keys())
    results.append(updated)

  return results

# After this cell `dev_data` and `train_data` are lists of records rather than dicts keyed by
# (index, version); re-run the split cell before running this one again.
dev_data = get_passages(dev_vectors, dev_data)
train_data = get_passages(train_vectors, train_data)

  0%|          | 0/1400 [00:00<?, ?it/s]

  0%|          | 0/8860 [00:00<?, ?it/s]

In [None]:
from torch.utils.data import DataLoader,Dataset
import random
from sentence_transformers import InputExample, LoggingHandler, SentenceTransformer, losses, models, util

# Number of times each query id is repeated in a dataset (one pass yields k samples per query).
k = 40

class BibleDataset(Dataset):
    """Triplet dataset yielding (query, positive verse, negative verse) examples.

    Records are indexed by their 'qid'. Each qid appears k times in the index,
    and every access rotates the record's positive and negative lists by one,
    so repeated accesses cycle through the available ids.

    NOTE(review): records that share a qid overwrite one another (the last one
    wins). The recorded outputs show 8860 training records but 99800 = 40 x 2495
    samples, so most records appear to be dropped here. Confirm this is intended;
    the evaluator cell depends on `queries` being keyed by qid.
    """

    def __init__(self,train_queries):
        by_qid = {}
        for record in train_queries:
            by_qid[record['qid']] = record
        self.queries = by_qid
        self.queries_ids = k * list(by_qid)
        # Shuffle negatives once, in place, so the rotation starts from a random order.
        for record in by_qid.values():
            random.shuffle(record["neg"])

    def __getitem__(self, item):
        record = self.queries[self.queries_ids[item]]
        texts = [record["query"]]
        # Take the head of each list, look up its text, then move it to the tail.
        for field in ("pos", "neg"):
            ids = record[field]
            head = ids.pop(0)
            texts.append(bible[head])
            ids.append(head)
        return InputExample(texts=texts)

    def __len__(self):
        return len(self.queries_ids)

# Constructing a dataset shuffles each record's negative list in place.
train_dataset = BibleDataset(train_data)
eval_dataset = BibleDataset(dev_data)

In [None]:
from datetime import datetime
# Training hyperparameters.
# NOTE(review): `max_seq_length` and `pooling` are not referenced by any other visible cell;
# the encoder was built with max_seq_length=128 and a literal "mean" pooling mode.
max_seq_length = 500
train_batch_size = 64   # DataLoader batch size
pooling = "mean"
warmup_steps = 1000     # passed to bi_encoder.fit
lr = 2e-5               # passed to the optimizer via optimizer_params

In [None]:
# triplet evaluator
from sentence_transformers.evaluation import TripletEvaluator

# One triplet per (positive, negative) pair of every dev query, using at most the first k
# negatives per positive. The anchor is the ESV text looked up by the query's verse id,
# not the record's own 'query' text.
anchors, positives, negatives = [], [], []
for cvkey, info in eval_dataset.queries.items():
  bible_verse = ESV[cvkey]
  for p in info["pos"]:
    for num_negs, n in enumerate(info["neg"], start=1):
      anchors.append(bible_verse)
      positives.append(bible[p])
      negatives.append(bible[n])
      if num_negs == k: break
eval_triplets = {"anchors": anchors, "positives": positives, "negatives": negatives}

print(len(eval_triplets["anchors"]))

triplet_evaluator = TripletEvaluator(
    name="EEPS-triplets-dev",
    anchors=eval_triplets["anchors"],
    positives=eval_triplets["positives"],
    negatives=eval_triplets["negatives"],
    batch_size=1024,
    show_progress_bar=True,
)

40520


# Training

In [None]:
# Checkpoints go to <folder>/models/<model_name>_<today's date as YYYY-MM-DD>.
model_name = "EEPS_emanjavacas-MacBERTh"
model_save_path = f"{folder}/models/{model_name}_{datetime.now():%Y-%m-%d}"
print(model_save_path)

/content/drive/MyDrive/DH/models/EEPS_emanjavacas-MacBERTh_2025-05-05


In [None]:
# For training the SentenceTransformer model, we need a dataset, a dataloader, and a loss used for training.
# NOTE(review): every query id occurs k times in the dataset and batches are shuffled, so the
# same query can appear more than once in a batch. Check whether that is acceptable for a loss
# that uses the other examples in the batch as negatives.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MultipleNegativesRankingLoss(model=bi_encoder)
print(len(train_dataloader.dataset), "triplets", f"k={k}")

99800 triplets k=40


In [None]:
!pip install datasets



In [None]:
# Train the model
# NOTE(review): this import rebinds the name `Dataset`, which an earlier cell imported from
# torch.utils.data; the name is not used in this cell.
from datasets import Dataset
bi_encoder.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1, # 1 EPOCH DONE ALREADY
    warmup_steps=warmup_steps,
    use_amp=True,
    # One checkpoint per full pass over the dataloader, written under model_save_path.
    checkpoint_path=model_save_path,
    checkpoint_save_steps=len(train_dataloader),
    optimizer_params={"lr": lr},
    evaluator=triplet_evaluator
)

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Eeps-triplets-dev Cosine Accuracy
500,0.6185,,
1000,0.0281,,
1500,0.0223,,
1560,0.0223,No log,0.986969


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

In [None]:
# Re-embed the dev queries and every Bible text with the fine-tuned encoder.
# `dev_data` is a list of records at this point, and `bible_vectors` replaces the tensor
# loaded earlier; its rows follow the order of `bible_verses`.
dev_vectors = bi_encoder.encode([entry['query'] for entry in dev_data], batch_size=1024, convert_to_tensor=True, show_progress_bar=True)
bible_vectors = bi_encoder.encode(bible_verses, batch_size=1024, convert_to_tensor=True, show_progress_bar=True)

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/271 [00:00<?, ?it/s]

In [None]:
# Save the new Bible vectors under a name that identifies the checkpoint.
# NOTE(review): the checkpoint name is hard-coded (date and step number) and reassigns
# `model_name`; update it by hand when the training run changes.
model_name = "EEPS_emanjavacas-MacBERTh_2025-05-05_checkpoint-1559"
torch.save(bible_vectors, f'{folder}/EEPS/Bibles_{model_name}.pt')