In [1]:
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
import os 
import torch 
# All local resources are resolved relative to the directory the kernel was started in.
folder = os.getcwd()
persist_directory = f'{folder}/static/data/VECTORDB'

# Sentence-embedding model, loaded from a directory on disk.
bi_encoder = SentenceTransformer(
    f"{folder}/static/data/EEPS_emanjavacas-MacBERTh_2.0_2024-08-20_08-13-56"
)

# On-disk Chroma client with telemetry switched off; the same directory is
# passed both as the client path and inside the settings object.
client_settings = Settings(
    anonymized_telemetry=False,
    is_persistent=True,
    persist_directory=persist_directory,
)
queryclient = chromadb.PersistentClient(
    settings=client_settings,
    path=persist_directory,
)

  from tqdm.autonotebook import tqdm, trange


In [3]:
import pandas as pd
data_folder = "/Users/amycweng/DH/SERMONS_APP/db/data"

# Collect the first column of both header-less CSVs, in this order, into one list.
# (presumably column 0 holds the TCP identifiers -- confirm against the CSV schema)
tcpIDs = []
for csv_name in ("sermons.csv", "sermons_missing.csv"):
    sermons = pd.read_csv(f"{data_folder}/{csv_name}", header=None)
    tcpIDs += list(sermons[0])
len(tcpIDs)

5862

In [4]:
import json
# Build one flat lookup, "<verse id> (<version>)" -> verse text, across every
# Bible version, and remember the versions in load order.
bible = {}
bible_ver = []
for bname in ['Geneva', 'Vulgate', 'Douay-Rheims', 'Tyndale', 'Wycliffe', 'KJV']:
    bible_ver.append(bname)
    # Decode explicitly as UTF-8 so the verse text does not depend on the
    # platform's default locale encoding.
    with open(f"{data_folder}/Bibles/{bname}.json", "r", encoding="utf-8") as file:
        b_dict = json.load(file)
    print(bname, len(b_dict))
    for vid, text in b_dict.items():
        # Suffixing the version keeps the same verse id from different
        # versions from overwriting each other.
        bible[f"{vid} ({bname})"] = text
bible_ver, len(bible.keys())

Geneva 31090
Vulgate 35809
Douay-Rheims 35811
Tyndale 7954
Wycliffe 9622
KJV 36822


(['Geneva', 'Vulgate', 'Douay-Rheims', 'Tyndale', 'Wycliffe', 'KJV'], 157108)

In [5]:
import re 
# Parallel views of the verse lookup: ids and texts in the same order.
bible_ids = list(bible.keys())
bible_verses = list(bible.values())

# Three parallel lists describing every verse segment that is kept:
#   btexts      - the segment text
#   bids        - the id of the verse it came from
#   chroma_bids - "<verse id> - <segment position>", unique per segment
btexts, bids, chroma_bids = [], [], []
for verse_id, verse_text in bible.items():
    # Cut on ; : ? . and keep only pieces of at least five space-separated tokens.
    trimmed = [piece.strip(" ") for piece in re.split(r"\;|\:|\?|\.", verse_text)]
    segments = [piece for piece in trimmed if len(piece.split(" ")) >= 5]

    # Verses that yield fewer than two qualifying segments contribute nothing.
    if len(segments) <= 1:
        continue

    btexts.extend(segments)
    bids.extend([verse_id] * len(segments))
    chroma_bids.extend(f"{verse_id} - {position}" for position in range(len(segments)))

In [6]:
# Open (or create) the cosine-space collection for verse segments.
b_parts_collection = queryclient.get_or_create_collection(
    name="BibleParts",
    metadata={"hnsw:space": "cosine"},
)
# Precomputed embeddings, mapped onto the CPU at load time.
# (assumes row i corresponds to btexts[i] -- TODO confirm against the script that wrote the file)
b_embedding = torch.load(
    f"{data_folder}/bible_parts.pt",
    map_location=torch.device('cpu'),
)
b_parts_collection.name

'BibleParts'

In [7]:
from sentence_transformers.util import semantic_search

# Encode one probe sentence and search the in-memory segment embeddings directly.
ptexts = ["the bond-woman and her son were cast out;"]
p_embedding = bi_encoder.encode(
    ptexts,
    batch_size=128,
    convert_to_tensor=True,
    show_progress_bar=True,
)
hits = semantic_search(p_embedding, b_embedding, query_chunk_size=512)

# One hit list per query; print each match's text, its source verse id and its score.
for hitlist in hits:
    for hit in hitlist:
        bid = hit['corpus_id']
        score = hit['score']
        print(btexts[bid], bids[bid], score)

Batches: 100%|██████████| 1/1 [00:02<00:00,  2.02s/it]


Cast out the bondwoman and her son Galatians 4.30 (Douay-Rheims) 0.9138004779815674
Cast out the bondwoman and her son Galatians 4.30 (KJV) 0.9138004779815674
Cast out this bondwoman, and her son Genesis 21.10 (Douay-Rheims) 0.8937473297119141
Wherefore she said unto Abraham, Cast out this bondwoman and her son Genesis 21.10 (KJV) 0.8222891092300415
Wherefore she saide vnto Abraham, Cast out this bond woman and her sonne Genesis 21.10 (Geneva) 0.8087805509567261
for the son of the bondwoman shall not be heir with my son Isaac Genesis 21.10 (Douay-Rheims) 0.7663462162017822
for the son of the bondwoman shall not be heir with the son of the free woman Galatians 4.30 (Douay-Rheims) 0.7635434865951538
for the son of the bondwoman shall not be heir with the son of the freewoman Galatians 4.30 (KJV) 0.758854866027832
But he having put them all out, taketh the father and the mother of the damsel, and them that were with him, and entereth in where the damsel was lying Mark 5.40 (Douay-Rheims) 

In [9]:
from tqdm import tqdm 
# Slice ids, vectors and texts into aligned windows of `batch_size` rows each.
batch_size = 40000
bible_batches = [
    (
        chroma_bids[start: start + batch_size],
        b_embedding[start: start + batch_size],
        btexts[start: start + batch_size],
    )
    for start in range(0, len(chroma_bids), batch_size)
]

# Upsert one window at a time into the BibleParts collection.
for batchids, bvectors, batchtexts in tqdm(bible_batches):
    b_parts_collection.upsert(
        ids=batchids,
        documents=batchtexts,
        embeddings=bvectors.tolist(),
    )

100%|██████████| 5/5 [06:39<00:00, 79.99s/it] 


In [10]:
# Ask the BibleParts collection for the 10 nearest entries to a probe sentence,
# returning distances only.
query = "the bond-woman and her son were cast out;"
q_embedding = bi_encoder.encode([query])
results = b_parts_collection.query(
    query_embeddings=q_embedding.tolist(),
    include=["distances"],
    n_results=10,
)
results

{'ids': [['Galatians 4.30 (Douay-Rheims) - 1',
   'Galatians 4.30 (KJV) - 1',
   'Genesis 21.10 (Douay-Rheims) - 0',
   'Genesis 21.10 (KJV) - 0',
   'Genesis 21.10 (Geneva) - 0',
   'Genesis 21.10 (Douay-Rheims) - 1',
   'Galatians 4.30 (Douay-Rheims) - 2',
   'Galatians 4.30 (KJV) - 2',
   'Mark 5.40 (Douay-Rheims) - 1',
   'Galatians 4.30 (Geneva) - 1']],
 'distances': [[0.08619952201843262,
   0.08619952201843262,
   0.10625314712524414,
   0.17771100997924805,
   0.19121956825256348,
   0.23365366458892822,
   0.23645663261413574,
   0.24114561080932617,
   0.24341309070587158,
   0.24531781673431396]],
 'metadatas': None,
 'embeddings': None,
 'documents': None,
 'uris': None,
 'data': None,
 'included': ['distances']}

In [15]:
# Open (or create) the cosine-space collection for whole verses.
bible_collection = queryclient.get_or_create_collection(
    name="Bible",
    metadata={"hnsw:space": "cosine"},
)
# Rebinds b_embedding to the whole-verse embeddings, mapped onto the CPU.
# (assumes row i corresponds to bible_ids[i] -- TODO confirm against the script that wrote the file)
b_embedding = torch.load(
    f"{data_folder}/bible_3.0.pt",
    map_location=torch.device('cpu'),
)
bible_collection.name

'Bible'

In [16]:
# Slice verse ids, vectors and texts into aligned windows of `batch_size` rows each.
batch_size = 40000
bible_batches = [
    (
        bible_ids[start: start + batch_size],
        b_embedding[start: start + batch_size],
        bible_verses[start: start + batch_size],
    )
    for start in range(0, len(bible_ids), batch_size)
]

# Upsert one window at a time into the Bible collection.
for batchids, bvectors, batchtexts in tqdm(bible_batches):
    bible_collection.upsert(
        ids=batchids,
        documents=batchtexts,
        embeddings=bvectors.tolist(),
    )

100%|██████████| 4/4 [05:20<00:00, 80.20s/it]


In [17]:
# Ask the Bible collection for the 10 nearest entries to a probe sentence,
# returning distances only.
query = "the bond-woman and her son were cast out;"
q_embedding = bi_encoder.encode([query])
results = bible_collection.query(
    query_embeddings=q_embedding.tolist(),
    include=["distances"],
    n_results=10,
)
results

{'ids': [['Galatians 4.30 (KJV)',
   'Galatians 4.30 (Douay-Rheims)',
   'Genesis 21.10 (Douay-Rheims)',
   'Galatians 4.30 (Geneva)',
   'Genesis 21.10 (KJV)',
   '1 Maccabees 2.11 (Douay-Rheims)',
   'Ezekiel 16.45 (Douay-Rheims)',
   'Isaiah 50.1 (Douay-Rheims)',
   'Isaiah 50.1 (KJV)',
   'Jeremiah 22.26 (KJV)']],
 'distances': [[0.14670240879058838,
   0.15016305446624756,
   0.1623440384864807,
   0.20470017194747925,
   0.20858585834503174,
   0.23436886072158813,
   0.24835169315338135,
   0.25248634815216064,
   0.2549479007720947,
   0.267315149307251]],
 'metadatas': None,
 'embeddings': None,
 'documents': None,
 'uris': None,
 'data': None,
 'included': ['distances']}

In [12]:
# Open (or create) the cosine-space collection for titles.
title_collection = queryclient.get_or_create_collection(
    name="Titles",
    metadata={"hnsw:space": "cosine"},
)
# Precomputed title embeddings, mapped onto the CPU.
title_vectors = torch.load(
    f"{data_folder}/titles.pt",
    map_location=torch.device('cpu'),
)
# Embeddings only, keyed by tcpIDs; no documents are stored.
# (assumes row i of titles.pt belongs to tcpIDs[i] -- TODO confirm)
title_collection.upsert(
    ids=tcpIDs,
    embeddings=title_vectors.tolist(),
)

In [19]:
import re, json
from tqdm import tqdm 
import math, re
import torch
def _load_margin_corpus(era, assets_folder):
  """Merge every "margin" JSON file for `era` into {key: (texts, locations, None)}."""
  corpus = {}
  # NOTE(review): os.listdir order is filesystem-dependent, and the resulting key
  # order decides which row of the saved embeddings each entry is paired with.
  # Presumably the embeddings were written using this same traversal -- confirm
  # before sorting or otherwise reordering here.
  for fp in tqdm(os.listdir(f"{assets_folder}/unique")):
      # NOTE(review): `era` is applied as an unanchored regex, so "Elizabethan"
      # would also match a file named "pre-Elizabethan..." -- confirm intended.
      if not re.search(era, fp) or not re.search('margin', fp):
        continue
      with open(f"{assets_folder}/unique/{fp}", "r", encoding="utf-8") as file:
        records = json.load(file)
      for k, v in records.items():
        if k in corpus:
          # Same key seen in an earlier file: only its locations are accumulated.
          corpus[k][1].extend(v[1])
        elif len(v[0]) > 0:
          # First sighting; entries whose first field is empty are skipped.
          corpus[k] = (v[0], v[1], None)
  return corpus


def add_to_db(era, assets_folder="/Users/amycweng/DH/Early-Modern-Sermons/assets"):
  """Upsert the precomputed "<era>_margin" embeddings into Chroma collections.

  Reads the JSON files under ``{assets_folder}/unique`` whose names match both
  `era` and "margin", merges them into one corpus, splits the corpus into
  batches of 40000 entries, and for batch ``n`` loads the tensor stored at
  ``{data_folder}/embeddings/{era}_margin_{n}`` and upserts it. Several
  consecutive batches share one collection named ``{era}_margin_{m}``.

  Relies on the notebook globals `queryclient` (Chroma client) and
  `data_folder` (root of the embeddings directory).

  Args:
    era: Regex pattern matched (unanchored) against the file names; also the
      prefix of the collection and embedding-file names.
    assets_folder: Directory that contains the ``unique`` sub-folder.

  Returns:
    None. Progress information is printed.

  Raises:
    ValueError: If an embedding file does not hold exactly one row per entry
      of its batch.
  """
  output = f"{era}_margin"
  corpus = _load_margin_corpus(era, assets_folder)

  # Batch entries in corpus order; each entry's id is its position in the corpus.
  rel_batches = []
  batch_size = 40000
  keys = list(corpus)
  for start in range(0, len(keys), batch_size):
    batch = []
    for idx, p in enumerate(keys[start: start + batch_size], start=start):
      texts, locations, extra = corpus[p]
      # Order-preserving de-duplication: a set would make the stored document
      # string depend on hash randomisation and so differ between runs.
      unique_locations = list(dict.fromkeys(tuple(c) for c in locations))
      batch.append((idx, texts[0], unique_locations, extra))
    rel_batches.append(batch)
  print(sum(len(batch) for batch in rel_batches))

  # Assign batches to collections: `group_size` consecutive batches share one.
  chroma_batches = {}
  group_size = math.ceil(len(corpus)/200000) + 1
  batch_num = 0
  for i in range(0, len(rel_batches), group_size):
    print(f"{output}_{batch_num}")
    queryclient.get_or_create_collection(name=f"{output}_{batch_num}",metadata={"hnsw:space": "cosine"})
    for j in range(i, min(i + group_size, len(rel_batches))):
      chroma_batches[j] = batch_num
    batch_num += 1
  print(chroma_batches)

  for bidx, batch in enumerate(rel_batches):
    embedding_path = f"{data_folder}/embeddings/{output}_{bidx}"
    p_embedding = torch.load(embedding_path,map_location=torch.device('cpu'))
    print(len(p_embedding))
    # Fail early, naming the file, rather than pairing vectors with the wrong ids.
    if len(p_embedding) != len(batch):
      raise ValueError(
        f"{embedding_path} holds {len(p_embedding)} embeddings but batch {bidx} has {len(batch)} entries"
      )
    cidx = chroma_batches[bidx]
    collection = queryclient.get_collection(name=f"{output}_{cidx}")
    print(collection)
    # Stored document: locations joined with ";", each location's fields joined
    # with "_" (the fields must therefore be strings).
    docs = [";".join(["_".join(key) for key in b[2]]) for b in batch]
    collection.upsert(
      embeddings=p_embedding.tolist(),
      ids=[str(b[0]) for b in batch],
      documents= docs
    )
    print(f"finished inserting to my Chroma collection")

In [20]:
# Upsert the precomputed embeddings into the "pre-Elizabethan_margin_*" Chroma collection(s).
add_to_db("pre-Elizabethan")

100%|██████████| 19/19 [00:00<00:00, 625.46it/s]


7066
pre-Elizabethan_margin_0
{0: 0}
7066
Collection(id=4c51f774-00bd-4e86-85cb-1409aecf521e, name=pre-Elizabethan_margin_0)
finished inserting to my Chroma collection


In [21]:
# Upsert the precomputed embeddings into the "Elizabethan_margin_*" Chroma collection(s).
# NOTE(review): add_to_db filters file names with an unanchored re.search, so this
# pattern would also match any "pre-Elizabethan" file -- confirm that is intended.
add_to_db("Elizabethan")

100%|██████████| 19/19 [00:01<00:00, 16.33it/s]


99283
Elizabethan_margin_0
Elizabethan_margin_1
{0: 0, 1: 0, 2: 1}
40000
Collection(id=1782ba01-2afe-45a3-9ec5-96dad81e92db, name=Elizabethan_margin_0)
finished inserting to my Chroma collection
40000
Collection(id=1782ba01-2afe-45a3-9ec5-96dad81e92db, name=Elizabethan_margin_0)
finished inserting to my Chroma collection
19283
Collection(id=59f49960-8dd8-4c2e-8e92-5e2e2711dd46, name=Elizabethan_margin_1)
finished inserting to my Chroma collection


In [22]:
# Upsert the precomputed embeddings into the "Jacobean_margin_*" Chroma collection(s).
add_to_db("Jacobean")

100%|██████████| 19/19 [00:01<00:00, 18.11it/s]


109834
Jacobean_margin_0
Jacobean_margin_1
{0: 0, 1: 0, 2: 1}
40000
Collection(id=ee298538-8394-482f-923a-7be4535800de, name=Jacobean_margin_0)
finished inserting to my Chroma collection
40000
Collection(id=ee298538-8394-482f-923a-7be4535800de, name=Jacobean_margin_0)
finished inserting to my Chroma collection
29834
Collection(id=4ce4a2de-ecfb-4458-ba1e-182cf1435a07, name=Jacobean_margin_1)
finished inserting to my Chroma collection


In [23]:
# Upsert the precomputed embeddings into the "Carolinian_margin_*" Chroma collection(s).
add_to_db("Carolinian")

100%|██████████| 19/19 [00:00<00:00, 32.18it/s]


96211
Carolinian_margin_0
Carolinian_margin_1
{0: 0, 1: 0, 2: 1}
40000
Collection(id=e04ab81a-92da-4426-8aab-4ec6dbd4559f, name=Carolinian_margin_0)
finished inserting to my Chroma collection
40000
Collection(id=e04ab81a-92da-4426-8aab-4ec6dbd4559f, name=Carolinian_margin_0)
finished inserting to my Chroma collection
16211
Collection(id=df966e79-9375-4dae-91f4-858634b5a3b7, name=Carolinian_margin_1)
finished inserting to my Chroma collection


In [24]:
# Upsert the precomputed embeddings into the "CivilWar_margin_*" Chroma collection(s).
add_to_db("CivilWar")

100%|██████████| 19/19 [00:00<00:00, 46.06it/s]


50047
CivilWar_margin_0
{0: 0, 1: 0}
40000
Collection(id=ac41f3d5-4773-46d6-a802-d396ce8b6fef, name=CivilWar_margin_0)
finished inserting to my Chroma collection
10047
Collection(id=ac41f3d5-4773-46d6-a802-d396ce8b6fef, name=CivilWar_margin_0)
finished inserting to my Chroma collection


In [25]:
# Upsert the precomputed embeddings into the "Interregnum_margin_*" Chroma collection(s).
add_to_db("Interregnum")

100%|██████████| 19/19 [00:00<00:00, 37.77it/s]


60876
Interregnum_margin_0
{0: 0, 1: 0}
40000
Collection(id=fc19d3f8-6f2d-43b8-814d-f848dd6c0ca4, name=Interregnum_margin_0)
finished inserting to my Chroma collection
20876
Collection(id=fc19d3f8-6f2d-43b8-814d-f848dd6c0ca4, name=Interregnum_margin_0)
finished inserting to my Chroma collection


In [26]:
# Upsert the precomputed embeddings into the "JamesII_margin_*" Chroma collection(s).
add_to_db("JamesII")

100%|██████████| 19/19 [00:00<00:00, 498.77it/s]

11589
JamesII_margin_0
{0: 0}
11589
Collection(id=3a5d245b-d2c8-4b6f-8c22-45df5b883a8b, name=JamesII_margin_0)





finished inserting to my Chroma collection


In [27]:
# Upsert the precomputed embeddings into the "WilliamAndMary_margin_*" Chroma collection(s).
add_to_db("WilliamAndMary")

100%|██████████| 19/19 [00:00<00:00, 68.82it/s]


25198
WilliamAndMary_margin_0
{0: 0}
25198
Collection(id=a6d44d66-c1fe-46c3-85a5-1e21dba10484, name=WilliamAndMary_margin_0)
finished inserting to my Chroma collection


In [28]:
# Upsert the precomputed embeddings into the "CharlesII_margin_*" Chroma collection(s).
add_to_db("CharlesII")

100%|██████████| 19/19 [00:01<00:00, 14.58it/s]


90319
CharlesII_margin_0
CharlesII_margin_1
{0: 0, 1: 0, 2: 1}
40000
Collection(id=4e2d9eef-1e06-4d2a-a982-0141444d228e, name=CharlesII_margin_0)
finished inserting to my Chroma collection
40000
Collection(id=4e2d9eef-1e06-4d2a-a982-0141444d228e, name=CharlesII_margin_0)
finished inserting to my Chroma collection
10319
Collection(id=0719c9d8-2d0a-4236-9786-95f3260483f7, name=CharlesII_margin_1)
finished inserting to my Chroma collection


In [29]:
# Inventory: print every collection's name with the number of records it holds.
collections = queryclient.list_collections()

for collection in collections:
    # count() reports the size without materialising every record in memory.
    print(collection.name, collection.count())

BibleParts 161935
CharlesII_margin_1 10319
Titles 5862
Elizabethan_margin_0 80000
JamesII_margin_0 11589
pre-Elizabethan_margin_0 7066
Jacobean_margin_1 29834
CharlesII_margin_0 80000
Elizabethan_margin_1 19283
WilliamAndMary_margin_0 25198
CivilWar_margin_0 50047
Carolinian_margin_1 16211
Carolinian_margin_0 80000
Bible 157108
Jacobean_margin_0 80000
Interregnum_margin_0 60876


In [13]:
# Look up the k nearest whole-verse entries for a probe phrase, distances only.
k = 10
query = "Borne again of immortall seed."
collection = queryclient.get_collection(name="Bible")
q_embedding = bi_encoder.encode([query])
results = collection.query(
    query_embeddings=q_embedding.tolist(),
    include=["distances"],
    n_results=k,
)