## Prepare data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os,json,re
import pandas as pd
import torch

In [None]:
try:
    assert torch.cuda.is_available()
    device = torch.device("cuda")
except:
    device = torch.device("cpu")
print("Using device:", device)

Using device: cpu


### Vector Database

In [None]:
!pip install chromadb

In [None]:
import chromadb

In [None]:
client = chromadb.PersistentClient(path="/content/drive/MyDrive/CAPSTONE/db")

In [None]:
eras = ["preJacobean","Jacobean","Carolinian","CivilWar","Interregnum"]
collections = {}
for era in eras:
    # collections[era] = client.create_collection(name=era,metadata={"hnsw:space": "cosine"})
    collections[era] = client.get_collection(name=era)

In [12]:
for era, collection in collections.items():
    check = collection.get(where={"tcpID": 'A41135'})
    print(era, len(check['ids']))

DatabaseError: database disk image is malformed

In [16]:
!sqlite3 /content/drive/MyDrive/CAPSTONE/db/chroma.sqlite3 'PRAGMA integrity_check';

/bin/bash: line 1: sqlite3: command not found


In [None]:
file_prefix = "/content/drive/MyDrive/CAPSTONE"
with open(f'{file_prefix}/pre1660.json') as file:
    pre1660 = json.load(file)
preJ,J,C,CW,IR = pre1660
eras = {"preJacobean":preJ, "Jacobean":J, "Carolinian":C,"CivilWar":CW,"Interregnum":IR}
tcpID_era = {}
for era, era_dict in eras.items():
    for id_list in era_dict.values():
        for tcpID in id_list:
            tcpID_era[tcpID] = era

In [None]:
import re

def split_sentence(sentence):
    to_segment = [", but", ", while", ", let", ", they", ", NONLATINALPHABET",
                    ", then", ", yet", ", than", ', and yet', ', and though',
                    ', at least', ', and to', ', this be', ', for', ', therefore',
                    ', that', ', and we', ', and i ', ', when', ', and say', ', and this',
                    ', and then', ', and than', ', and they', ', i say', ', as the apostle',
                    ', otherwise', ', how', ', according', ', accordi^^', ', say',', and when',
                    ', and he', ', and she', ', he say', ', she say', ', lest', ', and where',
                    ', and how', ', and what', ', and there', ', and therefore', ', and thus',
                    ', and if', ', and because', ', and I ', ', he will', ', they will', ', she will']
    pattern = '|'.join(map(re.escape, to_segment))
    parts = re.split(pattern, sentence)

    matches = re.findall(pattern,sentence)
    if len(parts) == 1: return parts
    for idx, part in enumerate(parts):
        if idx == (len(parts) - 1): break
        conj = re.sub(", ", "",matches[idx])
        parts[idx] = part + " , "
        parts[idx + 1] = conj + parts[idx+1]
    return parts

In [None]:
class Sermons():
    def __init__(self,prefix):
      self.prefix = prefix

def get_docs(prefix):
  corpus = Sermons(prefix)
  with open(f'/content/drive/MyDrive/CAPSTONE/sermons/{prefix}.json','r') as file:
      sent_id, lemmatized, chunks, fw_subchunks = json.load(file)
  corpus.sent_id = sent_id
  corpus.lemmatized = lemmatized
  # corpus.chunks = chunks
  # corpus.fw_subchunks = fw_subchunks
  corpus.sent_id_to_idx = {(tuple(x[0]),x[1]):idx for idx, x in enumerate(sent_id)}
  passages = []
  count = 0
  for idx, id in enumerate(corpus.sent_id):
      if prefix in id[0][0]:
          passage = corpus.lemmatized[corpus.sent_id_to_idx[(tuple(id[0]),id[1])]]
          passage = re.sub(r"[^A-Za-z\^\*,]"," ",passage)
          passage = re.sub(r"\s+"," ", passage).strip(" ")
          passage = passage.strip(" ")
          if len(passage.split(" ")) < 2: continue
          parts = split_sentence(passage)
          for part in parts:
              if len(part.split(" ")) < 3: continue
              passages.append(part)
  print("Passages:", len(passages))
  return passages

In [None]:
def process(prefix,docs):
    vectors = torch.load(f"{file_prefix}/embeddings/{prefix}_corpus_embeddings_segmented.pth",map_location=device)
    with open(f'{file_prefix}/embeddings/{prefix}_ids.json') as file:
      ids = json.load(file)
    count = 0
    e, m, i,d = [],[],[],[]
    prev_tcpID = None
    for idx, label in enumerate(ids):
        tcpID, chunk_id, is_note = label[0]
        part_id = label[1]
        if tcpID == 'A69056': continue # twice as big as the max allowed size

        # check if the book is dated before 1660
        if tcpID not in tcpID_era: continue
        if prev_tcpID is None:
            prev_tcpID = tcpID

        if tcpID != prev_tcpID:
            print('Processed',prev_tcpID,tcpID_era[prev_tcpID])
            collection = collections[tcpID_era[prev_tcpID]]
            check = collection.get(where={'tcpID':prev_tcpID})
            if len(e) > 0 and len(check['ids']) == 0:
                collection.upsert(ids=i,embeddings=e,metadatas=m,documents=d)
                e, m, i, d = [],[],[],[]
            elif len(check['ids']) > 0:
                e, m, i, d = [],[],[],[]
                print(prev_tcpID,'already in database')
            prev_tcpID = tcpID

        count += 1
        e.append(vectors[idx].tolist())
        m.append({"tcpID": tcpID, 'chunk_id': chunk_id, 'is_note':is_note, 'part_id':part_id})
        i.append(f'{prefix}_{idx}')
        d.append(docs[idx])

    if len(i) > 0:
        collection = collections[tcpID_era[prev_tcpID]]
        collection.upsert(ids=i,embeddings=e,metadatas=m,documents=d)
        print('Processed',prev_tcpID,tcpID_era[prev_tcpID])
    print(count)

In [None]:
def process_db(prefix):
    data = get_docs(prefix)
    process(prefix,data)
    del data

In [None]:
process_db('B')

In [None]:
process_db('A4')

In [None]:
process_db('A9')

In [None]:
process_db('A8')

In [None]:
process_db('A7')

In [None]:
process_db('A6')

In [None]:
process_db('A5')

In [None]:
process_db('A3')

KeyboardInterrupt: 

In [None]:
process_db('A2')

In [None]:
process_db('A1')

In [None]:
process_db('A0')

## The Bible


In [None]:
with open('/content/drive/MyDrive/CAPSTONE/kjv-adorned.txt','r') as file:
    kjv_tokens = file.readlines()

bible = {}
current_ver = None
for t in kjv_tokens:
    t = t.split("\t")
    token, pos, lemma = t[0], t[2], t[4]
    if token[0].isupper() and re.search("vv",pos):
        lemma = token
    if re.search(r'VERSE-',token):
        current_ver = re.sub("VERSE-", "",token)
        if current_ver[0].islower():
            current_ver = "J" + current_ver
        elif "Acts" in current_ver:
            n = current_ver.split("-")[-2:]
            current_ver = f"Acts-{n[0]}-{n[1]}"
        bible[current_ver] = []
    # elif token == pos: # punctuation mark
    #     continue
    else:
        bible[current_ver].append(lemma)
bible = {k: " ".join(v) for k,v in bible.items()}

## Set up the transformers

In [None]:
!pip install -U sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import gzip
import os
import torch

bi_encoder = SentenceTransformer("all-MiniLM-L6-v2")
bi_encoder.max_seq_length = 256     #Truncate long sections

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [None]:
def search(query,k=100,top_k=5):
    print("\n-------------------------\n")
    print("Bible verse:", query)
    ##### Semantic Search #####
    k=100

    # Encode the query using the bi-encoder and find potentially relevant passages
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    question_embedding = question_embedding.to(device)

    # query database for most similar embeddings
    emb, docs, meta = [],[],[]
    for era, collection in collections.items():
      hits = collection.query(query_embeddings=[question_embedding.tolist()],n_results=k,include=['documents','embeddings','metadatas'])
      emb.extend(hits['embeddings'][0])
      docs.extend(hits['documents'][0])
      meta.extend(hits['metadatas'][0])

    ##### Re-Ranking #####
    # Now, score all retrieved passages with the cross_encoder
    cross_inp = [[query, d] for d in docs]
    cross_scores = cross_encoder.predict(cross_inp)
    cross_scores = {idx:score for idx, score in enumerate(cross_scores)}

    # print("\n-------------------------\n")
    print(f"Top {top_k} Cross-Encoder Re-ranker hits")
    results = sorted(cross_scores.items(), key=lambda x: x[1], reverse=True)
    for idx, score in results[0:top_k]:
        print("\t{:.3f}\t{}\t{}".format(score,meta[idx],docs[idx]))
    print()

In [None]:
def specific_search(collection,query,k=100,top_k=5):
    print("\n-------------------------\n")
    print("Bible verse:", query)
    ##### Semantic Search #####

    # Encode the query using the bi-encoder and find potentially relevant passages
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    question_embedding = question_embedding.to(device)

    # query database for most similar embeddings
    hits = collection.query(query_embeddings=[question_embedding.tolist()],n_results=k,include=['documents','embeddings','metadatas'])
    emb = hits['embeddings'][0]
    docs = hits['documents'][0]
    meta = hits['metadatas'][0]

    ##### Re-Ranking #####
    # Now, score all retrieved passages with the cross_encoder
    cross_inp = [[query, d] for d in docs]
    cross_scores = cross_encoder.predict(cross_inp)
    cross_scores = {idx:score for idx, score in enumerate(cross_scores)}

    # print("\n-------------------------\n")
    print(f"Top {top_k} Cross-Encoder Re-ranker hits")
    results = sorted(cross_scores.items(), key=lambda x: x[1], reverse=True)
    for idx, score in results[0:top_k]:
        print("\t{:.3f}\t{}\t{}".format(score,meta[idx],docs[idx]))
    print()

In [None]:
with open('/content/drive/MyDrive/CAPSTONE/pre1660_citations.json') as file:
    citations = json.load(file)

50th percentile: 3.0

75th percentile: 10.0

85th percentile: 18.0

95th percentile: 42.0

99th percentile: 86.37999999999738

99.75th percentile: 140.0

99.99th percentile: 236.54379999999946
99.9999th percentile: 278.7557679999736
100th percentile: 280.0

The value of top_k is 5 for verses within the 50th percentile, 20 within the 75th percentile, 50 within the 85th, 100 within the 95th, 150 within the 99th, 200 within the 99.75th, and 500 for the rest.  

The value of k is 500 within the 95th percentile and 1000 otherwise.


In [None]:
# percentiles and k
def get_k(verse):
    count = citations[verse]
    if count <= 3: return 5,500
    if count <= 10: return 20,500
    if count <= 18: return 50,500
    if count <= 42: return 100,500
    if count <= 87: return 150,1000
    if count <= 140: return 300,1000
    else: return 500,1000

In [None]:
import re
def search_verse(ver_name,collection="general"):
  top_k, k = get_k(ver_name)
  verse = re.split(r" : | ; | \? ",bible[ver_name])
  for phrase in verse:
      parts  = split_sentence(phrase)
      for p in parts:
        if collection=="general":
            search(query = p,k=k,top_k=top_k)
        else:
            specific_search(collections[collection],p,k,top_k)

In [None]:
search_verse("Joshua-6-21")


-------------------------

Bible verse: and they utter destroy all that be in the city , both man and woman , young and old , and ox  , 
Top 20 Cross-Encoder Re-ranker hits
	5.437	{'chunk_id': 686, 'is_note': -1, 'part_id': 6, 'tcpID': 'A73176'}	that he will without all pity destroy man , woman , young and old , high and low amongst they , yea their very city also , and all that be therein , whereby they have be so wicked , and that within forty day
	3.105	{'chunk_id': 5707, 'is_note': -1, 'part_id': 0, 'tcpID': 'B11837'}	therefore like as the day of mourning , and sudden destruction come upon old room , and utter destroy both the city and empire
	2.587	{'chunk_id': 1849, 'is_note': -1, 'part_id': 1, 'tcpID': 'A96538'}	break down the wall thereof and utter destroy the city
	2.577	{'chunk_id': 1177, 'is_note': -1, 'part_id': 3, 'tcpID': 'A86299'}	and destroy they utter
	2.527	{'chunk_id': 3989, 'is_note': -1, 'part_id': 2, 'tcpID': 'A42583'}	than i will utter destroy their city
	2.355	

In [None]:
search_verse("Joshua-6-21","CivilWar")


-------------------------

Bible verse: and they utter destroy all that be in the city , both man and woman , young and old , and ox  , 
Top 20 Cross-Encoder Re-ranker hits
	5.138	{'chunk_id': 212, 'is_note': -1, 'part_id': 0, 'tcpID': 'A71286'}	a utter destruction be denounce against all of all sort , who be band together against God and his people , man and woman , young and old
	2.759	{'chunk_id': 234, 'is_note': -1, 'part_id': 1, 'tcpID': 'A92145'}	than a man accord to god heart David will but utter destroy Nabal and all he have , the disciple , because they can have a night lodige , will do no less than burn city and town , and have Samaria destroy as Sodom , as if Christ be come in the world to raise fire and sword against man , woman and suck infant  , 
	2.075	{'chunk_id': 34, 'is_note': -1, 'part_id': 10, 'tcpID': 'A81239'}	that they may utter destroy we
	2.043	{'chunk_id': 212, 'is_note': -1, 'part_id': 2, 'tcpID': 'A85664'}	and to destroy they utter
	1.610	{'chunk_id': 627, 

### Examples with the A4 + B corpora

In [None]:
search_verse("Psalms-39-3")

In [None]:
search_verse("Psalms-39-3")


-------------------------

Bible verse: my heart be hot within i  , 
Top 50 Cross-Encoder Re-ranker hits
	9.049	{'chunk_id': 623, 'is_note': -1, 'part_id': 7, 'tcpID': 'A43844'}	my heart be hot within i  , 
	9.049	{'chunk_id': 411, 'is_note': -1, 'part_id': 5, 'tcpID': 'A43844'}	my heart be hot within i  , 
	8.727	{'chunk_id': 1594, 'is_note': -1, 'part_id': 0, 'tcpID': 'A41135'}	my heart be hot within i
	4.777	{'chunk_id': 210, 'is_note': -1, 'part_id': 0, 'tcpID': 'B00565'}	when my sorrow be stir say he my hart be hot within i , and while i be muse the fire kindle
	4.458	{'chunk_id': 212, 'is_note': -1, 'part_id': 0, 'tcpID': 'B00565'}	my hart be hot within i
	3.363	{'chunk_id': 90, 'is_note': -1, 'part_id': 3, 'tcpID': 'A41135'}	while his heart be hot  , 
	3.282	{'chunk_id': 156, 'is_note': -1, 'part_id': 0, 'tcpID': 'B01867'}	perhaps your heart have be burn hot with passion when you have be come into god presence
	2.494	{'chunk_id': 90, 'is_note': -1, 'part_id': 2, 'tcpID': 'A4113

In [None]:
B[0].chunks["B00565,212,False"]

'My hart was hot within me. There is the torch lighted.'

In [None]:
search_verse("Matthew-22-13")


-------------------------

Bible verse: then say the king to the servant , Bind he hand and foot , and take he away , and cast he into outer darkness
Top 150 Cross-Encoder Re-ranker hits
	10.437	{'chunk_id': 1357, 'is_note': -1, 'part_id': 0, 'tcpID': 'A49589'}	then the king say to the servant , bound he hand and foot , and take he away , and cast he into outer darkness , there shall be weep and gnash of tooth
	10.336	{'chunk_id': 1513, 'is_note': -1, 'part_id': 0, 'tcpID': 'A49589'}	then say the king unto his servant , bound he hand and foot , and take he away , and cast he into outer darkness
	10.278	{'chunk_id': 1242, 'is_note': -1, 'part_id': 0, 'tcpID': 'A49589'}	then say the king unto his servant , bind he hand and foot , and take he away , and cast he into outer darkness
	7.067	{'chunk_id': 1527, 'is_note': -1, 'part_id': 1, 'tcpID': 'A49589'}	bound he hand and foot , and take he away , and cast he into outer darkness
	5.894	{'chunk_id': 340, 'is_note': -1, 'part_id': 0, 'tcpID

In [None]:
with open('/content/drive/MyDrive/CAPSTONE/case_study.txt') as file:
    lines = file.readlines()
case_study = {}
curr = None
for line in lines:
    if "#" in line:
        line = line.split(" ")
        curr = line[1].strip("\n")
        case_study[curr] = []
    else:
        case_study[curr].append(line.strip("\n"))

In [None]:
for k, v in case_study.items():
    print(k, v)

Original_Sin ['Genesis-3-16', 'Romans-5-12', 'Romans-5-16', 'Romans-5-19', '1-Corinthians-15-22', 'Acts-17-26']
Biblical_Cases ['Hebrews-7-9', 'Hebrews-7-10 ', 'Numbers-16-27', 'Numbers-16-32', 'Joshua-7-24', 'Joshua-7-25', 'Joshua-6-21', '1-Samuel-2-31', '1-Samuel-2-33', '1-Samuel-2-36', '1-Samuel-21-1', '1-Samuel-15-2', '1-Samuel-15-3', '1-Samuel-15-33', '2-Kings-17-2', '2-Kings-17-4', '2-Kings-23-26', 'Jeremiah-25-3', 'Jeremiah-25-4', 'Matthew-23-34', 'Matthew-23-35', 'Genesis-9-25', 'Genesis-12-14', 'Genesis-20-7', '2-Samuel-24-17', 'Joshua-7-1', 'Joshua-7-11']
Divine_Justice ['Exodus-20-5', 'Numbers-14-33', 'Leviticus-26-39', '2-Kings-17-14', 'Ezekiel-18-4']
Iniquity ['Leviticus-26-40', 'Nehemiah-9-2']
Not_Cited ['Numbers-14-18', 'Deuteronomy-5-9', 'Exodus-34-7']
Personal_Responsibility ['Deuteronomy-24-16', 'Ezekiel-18-20', '2-Chronicles-25-4', 'Jeremiah-31-30', '2-Kings-14-6', 'Galatians-6-5', 'Galatians-6-7', '1-Corinthians-3-8']


In [None]:
search_verse("Joshua-6-21")


-------------------------

Bible verse: and they utter destroy all that be in the city , both man and woman , young and old , and ox  , 
Top 20 Cross-Encoder Re-ranker hits
	3.105	{'chunk_id': 5707, 'is_note': -1, 'part_id': 0, 'tcpID': 'B11837'}	therefore like as the day of mourning , and sudden destruction come upon old room , and utter destroy both the city and empire
	2.527	{'chunk_id': 3989, 'is_note': -1, 'part_id': 2, 'tcpID': 'A42583'}	than i will utter destroy their city
	2.260	{'chunk_id': 295, 'is_note': -1, 'part_id': 3, 'tcpID': 'A49255'}	that they destroy utter
	1.866	{'chunk_id': 7543, 'is_note': -1, 'part_id': 0, 'tcpID': 'B11837'}	the city , wherein the roman have their garnison , be utter destroy  , 
	1.247	{'chunk_id': 3059, 'is_note': -1, 'part_id': 1, 'tcpID': 'A41140'}	and to destroy they , as man in wa^res do when they slay both young and old , and make no bone of it , and be glad when they have do it
	0.729	{'chunk_id': 845, 'is_note': -1, 'part_id': 0, 'tcpID'

In [None]:
search_verse("Exodus-20-5")


-------------------------

Bible verse: thou shall not bow down thyself to they , nor serve they
Top 150 Cross-Encoder Re-ranker hits
	8.668	{'chunk_id': 7975, 'is_note': -1, 'part_id': 1, 'tcpID': 'A42583'}	and make idol , add , thou shall not bow down thy self to they nor serve they
	4.017	{'chunk_id': 78, 'is_note': -1, 'part_id': 4, 'tcpID': 'B02276'}	they shall bow down to thou , with their face towards the earth , and lick up the dust of thy foot
	3.657	{'chunk_id': 14, 'is_note': -1, 'part_id': 3, 'tcpID': 'A46822'}	thou shall not go up and down as a talebearer among thy people
	2.421	{'chunk_id': 75, 'is_note': -1, 'part_id': 2, 'tcpID': 'A42498'}	they can sanctify thou , nor thy service
	2.205	{'chunk_id': 14176, 'is_note': -1, 'part_id': 0, 'tcpID': 'B12105'}	thou shall not
	1.965	{'chunk_id': 3388, 'is_note': -1, 'part_id': 3, 'tcpID': 'B12105'}	and he only shall thou serve
	1.965	{'chunk_id': 365, 'is_note': -1, 'part_id': 12, 'tcpID': 'B13601'}	and he only shall thou serv

In [None]:
search_verse("Genesis-9-25")


-------------------------

Bible verse: and he say , Cursed be Canaan
Top 100 Cross-Encoder Re-ranker hits
	7.621	{'chunk_id': 421, 'is_note': -1, 'part_id': 2, 'tcpID': 'B12376'}	no , Canaan be curse , a servant of servant shall he be unto his brethren
	6.574	{'chunk_id': 188, 'is_note': -1, 'part_id': 0, 'tcpID': 'A43608'}	but curse be Canaan
	5.526	{'chunk_id': 8402, 'is_note': -1, 'part_id': 1, 'tcpID': 'B12105'}	and God say , curse be every one  , 
	5.237	{'chunk_id': 6234, 'is_note': -1, 'part_id': 2, 'tcpID': 'A42583'}	say , there shall be no Canaanite  , 
	4.953	{'chunk_id': 246, 'is_note': -1, 'part_id': 0, 'tcpID': 'A43562'}	Noah curse cham , and the Canaanite be curse of God
	4.721	{'chunk_id': 43, 'is_note': -1, 'part_id': 0, 'tcpID': 'B11957'}	i be ashamed , canaans curse shall now adays be think no curse  , 
	4.531	{'chunk_id': 188, 'is_note': -1, 'part_id': 0, 'tcpID': 'A43608'}	thus cham be curse in his son Canaan , mark the text , not curse be cham  , 
	4.501	{'chunk_