In [27]:
import chromadb
import json
import sys
import torch
sys.path.append('../')

In [28]:
# Open the on-disk Chroma database; every collection below is fetched through this client.
# NOTE(review): the path is absolute and machine-specific — adjust it when running elsewhere.
client = chromadb.PersistentClient(path="/Users/amycweng/DH/db")

In [29]:
# Text-only collections, keyed by collection name.
# The "...2" names are additional collections for the same eras
# (presumably overflow/second halves — TODO confirm).
extra_eras = ["Elizabethan2", "Jacobean2", "Carolinian2","Interregnum2"]
eras = ["pre-Elizabethan","Elizabethan","Jacobean","Carolinian","CivilWar","Interregnum"] + extra_eras
# First-time setup: build each collection with
#   client.create_collection(name=name, metadata={"hnsw:space": "cosine"})
# instead of get_collection, which expects the collection to exist already.
collections = {name: client.get_collection(name=name) for name in eras}
    

In [None]:
# Collection that holds only the marginalia.
# First-time setup: create it with cosine distance instead of fetching it:
# pre1660marginalia = client.create_collection(name="pre1660marginalia",metadata={"hnsw:space": "cosine"})
pre1660marginalia = client.get_collection(name="pre1660marginalia")

In [30]:
# Report how many entries each era collection holds.
# count() asks the database for the size directly; the previous
# len(collection.get()['ids']) fetched every id, document and metadata
# record into memory just to count them.
for era, collection in collections.items():
    print(era, collection.count())

pre-Elizabethan 79351
Elizabethan 663735
Jacobean 641077
Carolinian 918023
CivilWar 609219
Interregnum 648176
Elizabethan2 117547
Jacobean2 84224
Carolinian2 99419
Interregnum2 387390


In [None]:
# Build tcpID -> era from pre1660.json.
with open('../assets/pre1660.json') as file:
    pre1660 = json.load(file)
# The file must unpack into exactly six per-era dicts, in this order.
preE,E,J,C,CW,IR = pre1660
# NOTE: this rebinds `eras`, which the collections cell used as a list of names.
eras = {"pre-Elizabethan":preE,"Elizabethan":E, "Jacobean":J, "Carolinian":C,"CivilWar":CW,"Interregnum":IR}
# Each era dict's values are lists of tcpIDs; a tcpID listed under several
# eras keeps the last era encountered.
# NOTE(review): the saved process() outputs show eras such as "Elizabethan2",
# which this mapping cannot produce — confirm which mapping those runs used.
tcpID_era = {
    tcpID: era
    for era, era_dict in eras.items()
    for id_list in era_dict.values()
    for tcpID in id_list
}

In [None]:
import re

def split_sentence(sentence):
    """Break a passage into clause-sized pieces at comma + connective markers.

    At every occurrence of a marker such as ", but" or ", and then" the text is
    cut: the piece before the cut gets " , " appended, and the connective
    (the marker minus its leading ", ") is glued onto the front of the piece
    that follows. A sentence with no marker comes back as a one-element list.

    Markers are matched as plain substrings in list order, with no word
    boundary, so ", for" also fires inside ", forgive"; the word is simply
    reassembled at the start of the next piece.

    Returns:
        list[str]: the pieces, in their original order.
    """
    # Order matters: the alternation tries markers left to right.
    markers = (", but", ", while", ", let", ", they", ", NONLATINALPHABET",
               ", then", ", yet", ", than", ', and yet', ', and though',
               ', at least', ', and to', ', this be', ', for', ', therefore',
               ', that', ', and we', ', and i ', ', when', ', and say', ', and this',
               ', and then', ', and than', ', and they', ', i say', ', as the apostle',
               ', otherwise', ', how', ', according', ', accordi^^', ', say',', and when',
               ', and he', ', and she', ', he say', ', she say', ', lest', ', and where',
               ', and how', ', and what', ', and there', ', and therefore', ', and thus',
               ', and if', ', and because', ', and I ', ', he will', ', they will', ', she will')
    splitter = re.compile('|'.join(re.escape(marker) for marker in markers))

    pieces = []
    carried = ""   # connective taken from the previous marker
    cursor = 0     # index in `sentence` where the current piece starts
    for hit in splitter.finditer(sentence):
        pieces.append(carried + sentence[cursor:hit.start()] + " , ")
        carried = hit.group(0).replace(", ", "")
        cursor = hit.end()
    # The tail (or the whole sentence when nothing matched) gets no trailing comma.
    pieces.append(carried + sentence[cursor:])
    return pieces

In [None]:
class Sermons:
    """Bare container for one prefix's processed corpus.

    Only `prefix` is set here; get_docs() attaches `sent_id`, `lemmatized`
    and `sent_id_to_idx` after loading the processed JSON.
    """

    def __init__(self, prefix):
        self.prefix = prefix

def get_docs(prefix):
    """Load the processed sentences for `prefix` and return segmented passages.

    Reads ../assets/processed/{prefix}.json, which must unpack into four items
    (sentence ids, lemmatized sentences, and two further items that are not
    used here). Each sentence whose first id field contains `prefix` is
    reduced to letters, '^', '*' and ',', whitespace-normalised, split with
    split_sentence(), and every piece of at least three space-separated
    tokens is kept.

    The order of the returned list matters: process() looks passages up by
    position (docs[idx]).

    Returns:
        list[str]: the kept passages; their count is also printed.
    """
    corpus = Sermons(prefix)
    with open(f'../assets/processed/{prefix}.json','r') as file:
        sent_id, lemmatized, _chunks, _fw_subchunks = json.load(file)
    corpus.sent_id = sent_id
    corpus.lemmatized = lemmatized
    # Lists are not hashable, so the id's first field is converted to a tuple.
    # Duplicate ids resolve to the position of their last occurrence.
    corpus.sent_id_to_idx = {
        (tuple(entry[0]), entry[1]): position
        for position, entry in enumerate(sent_id)
    }

    passages = []
    for entry in corpus.sent_id:
        # entry[0][0] is presumably the tcpID — verify against the processed files.
        if prefix not in entry[0][0]:
            continue
        key = (tuple(entry[0]), entry[1])
        text = corpus.lemmatized[corpus.sent_id_to_idx[key]]
        text = re.sub(r"[^A-Za-z\^\*,]", " ", text)
        text = re.sub(r"\s+", " ", text).strip(" ")
        if len(text.split(" ")) < 2:
            continue
        passages.extend(
            segment
            for segment in split_sentence(text)
            if len(segment.split(" ")) >= 3
        )
    print("Passages:", len(passages))
    return passages

In [None]:
# Spot check: fetch every entry in the CivilWar collection whose metadata tcpID is "A67876"
# (the same number process() uses as its resume threshold).
# A single entry can be fetched by id instead, e.g.:
# collections['pre-Elizabethan'].get(ids=["A0_217167"])
collections['CivilWar'].get(where={"tcpID":"A67876"})

In [17]:
# tcpID -> True for books that process() has already upserted in this kernel session;
# process() skips the upsert for any book found here. Re-running this cell resets it.
processed = {} 

In [23]:
def process(prefix, docs, resume_after=67876, batch_size=20000):
    """Upsert the pre-computed embeddings for `prefix` into the era collections.

    Walks the id list stored next to the embeddings, groups consecutive
    entries by tcpID, and upserts each book into collections[tcpID_era[tcpID]].
    Books missing from tcpID_era are skipped. Books already recorded in the
    module-level `processed` dict are not upserted again.

    Args:
        prefix: file prefix (e.g. "A6") used to locate the embeddings and ids files.
        docs: passages from get_docs(prefix); docs[idx] must be the text whose
            embedding is vectors[idx] and whose label is ids[idx].
        resume_after: entries whose tcpID number (the digits after the first
            character) is <= this value are skipped, so an interrupted run can
            be resumed. Pass None to skip nothing. The default keeps the
            previously hard-coded threshold.
        batch_size: a positive int; a book is flushed in parts each time this
            many of its entries have been seen, which keeps every upsert
            call bounded in size.

    Side effects:
        Upserts into the Chroma collections, marks finished books in
        `processed`, and prints progress plus the final entry count.
    """
    # NOTE(review): depending on the torch version, torch.load may unpickle
    # arbitrary objects — only load embedding files you produced yourself.
    vectors = torch.load(f"/Users/amycweng/DH/embeddings/{prefix}_corpus_embeddings_segmented.pth",map_location="cpu")
    with open(f'/Users/amycweng/DH/embeddings/{prefix}_ids.json') as file:
        ids = json.load(file)

    def empty_batch():
        # Keys match the keyword arguments of Collection.upsert.
        return {"ids": [], "embeddings": [], "metadatas": [], "documents": []}

    def flush(book_id, batch, message):
        # Send one batch to the collection of the book's era and log it.
        era = tcpID_era[book_id]
        collections[era].upsert(**batch)
        print(message, book_id, era)

    count = 0
    batch = empty_batch()
    prev_tcpID = None
    s_count = 0  # entries seen for the current book since its last reset
    for idx, label in enumerate(ids):
        tcpID, chunk_id, is_note = label[0]
        part_id = label[1]

        # check if the book is dated before 1660
        if tcpID not in tcpID_era:
            continue
        # Resume support: skip (but still count) everything up to the checkpoint.
        if resume_after is not None and int(tcpID[1:]) <= resume_after:
            count += 1
            continue

        s_count += 1
        if prev_tcpID is None:
            prev_tcpID = tcpID

        if tcpID != prev_tcpID:
            # A new book starts: flush the finished one unless it was done before.
            if prev_tcpID not in processed:
                flush(prev_tcpID, batch, 'Processed')
                processed[prev_tcpID] = True
            prev_tcpID = tcpID
            batch = empty_batch()
            s_count = 0
        elif (s_count > 0) and ((s_count % batch_size) == 0):
            # Same book, but the batch is full: flush a part and keep going.
            if len(batch["embeddings"]) > 0:
                flush(prev_tcpID, batch, 'Processed part of')
                batch = empty_batch()

        count += 1
        batch["embeddings"].append(vectors[idx].tolist())
        batch["metadatas"].append({"tcpID": tcpID, 'chunk_id': chunk_id, 'is_note':is_note, 'part_id':part_id})
        batch["ids"].append(f'{prefix}_{idx}')
        batch["documents"].append(docs[idx])

    # Flush the last book. This now uses upsert like every other flush (it was
    # `add`, which treats already-present ids differently on a re-run) and
    # records the book in `processed` like every other finished book.
    if len(batch["ids"]) > 0 and prev_tcpID not in processed:
        flush(prev_tcpID, batch, 'Processed')
        processed[prev_tcpID] = True
    print(count)

In [None]:
# remember to delete all notes from the text databases 

In [22]:
# Ingest the "A6" prefix: build its passages, then upsert them with their embeddings.
prefix = "A6" # already done in earlier runs: B, A0, A1, A2, A3, A4, A5
data = get_docs(prefix)
process(prefix,data)
# Drop the module-level reference to the large passage list once ingestion has finished.
del data 

Passages: 1102579
Processed A67922 Elizabethan2
Processed A67926 Elizabethan2
Processed A67927 Elizabethan2
Processed A68088 Jacobean2
Processed A68093 Elizabethan2
Processed A68105 Carolinian2
Processed A68126 Carolinian2
Processed A68162 Carolinian2
Processed A68202 Elizabethan2
Processed A68214 pre-Elizabethan
Processed A68254 Elizabethan2
Processed A68255 Carolinian2
Processed A68257 Jacobean2
Processed A68325 pre-Elizabethan
Processed A68336 Carolinian2
Processed A68376 Elizabethan2
Processed A68415 Jacobean2
Processed part of A68508 Carolinian2
Processed A68508 Carolinian2
Processed A68607 Jacobean2
Processed A68609 Carolinian2
Processed A68657 Jacobean2
Processed A68672 Jacobean2
Processed A68715 Jacobean2
Processed A68750 Elizabethan2
Processed A68859 Elizabethan2
Processed A68877 Jacobean2
Processed A68930 Jacobean2
Processed A68970 Elizabethan2
Processed A69013 Jacobean2
Processed A69044 Carolinian2
Processed part of A69056 Elizabethan2
Processed part of A69056 Elizabethan2
P

In [24]:
# Ingest the "A7" prefix: build its passages, then upsert them with their embeddings.
prefix = "A7" 
data = get_docs(prefix)
process(prefix,data)
# Drop the module-level reference to the large passage list once ingestion has finished.
del data 

Passages: 448632
Processed A70084 CivilWar
Processed A70165 Carolinian2
Processed A70235 CivilWar
Processed A70378 Carolinian2
Processed A70654 CivilWar
Processed A70812 CivilWar
Processed A70828 CivilWar
Processed A70945 Interregnum2
Processed A71085 CivilWar
Processed A71209 CivilWar
Processed A71286 CivilWar
Processed A72056 Carolinian2
Processed A72063 Elizabethan2
Processed A72065 Jacobean2
Processed A72114 Jacobean2
Processed A72143 Carolinian2
Processed A72180 Elizabethan2
Processed A72208 Elizabethan2
Processed A72253 Elizabethan2
Processed A72300 Carolinian2
Processed A72311 Jacobean2
Processed A72347 Elizabethan2
Processed A72359 Jacobean2
Processed A72376 Carolinian2
Processed A72485 Carolinian2
Processed A72487 Jacobean2
Processed A72538 Jacobean2
Processed A72540 Jacobean2
Processed A72904 Jacobean2
Processed A72913 Elizabethan2
Processed A73009 Carolinian2
Processed A73023 Jacobean2
Processed A73031 Jacobean2
Processed A73033 Carolinian2
Processed A73175 Elizabethan2
Proc

In [25]:
# Ingest the "A8" prefix: build its passages, then upsert them with their embeddings.
prefix = "A8" 
data = get_docs(prefix)
process(prefix,data)
# Drop the module-level reference to the large passage list once ingestion has finished.
del data 

Passages: 441213
Processed A80045 CivilWar
Processed A80080 CivilWar
Processed A80157 Interregnum2
Processed A80160 Interregnum2
Processed A80426 CivilWar
Processed A80446 Interregnum2
Processed A80485 Interregnum2
Processed A80515 Interregnum2
Processed A80534 Interregnum2
Processed A80611 Interregnum2
Processed A80637 Interregnum2
Processed A80742 CivilWar
Processed A80745 CivilWar
Processed A80758 CivilWar
Processed A80766 CivilWar
Processed A80790 Interregnum2
Processed A80811 CivilWar
Processed A80829 Interregnum2
Processed A80832 Interregnum2
Processed A80841 Interregnum2
Processed A80847 Interregnum2
Processed A80854 Interregnum2
Processed A80867 Interregnum2
Processed A80952 CivilWar
Processed A80971 Interregnum2
Processed A81131 CivilWar
Processed A81140 CivilWar
Processed A81152 CivilWar
Processed A81210 CivilWar
Processed A81211 CivilWar
Processed A81214 CivilWar
Processed A81215 Interregnum2
Processed A81216 Interregnum2
Processed A81219 CivilWar
Processed A81220 Interregnu

In [26]:
# Ingest the "A9" prefix: build its passages, then upsert them with their embeddings.
prefix = "A9" 
data = get_docs(prefix)
process(prefix,data)
# Drop the module-level reference to the large passage list once ingestion has finished.
del data 

Passages: 329845
Processed A90052 Interregnum2
Processed A90059 Interregnum2
Processed A90060 CivilWar
Processed A90061 CivilWar
Processed A90062 CivilWar
Processed A90064 CivilWar
Processed A90065 CivilWar
Processed A90143 Interregnum2
Processed A90206 Interregnum2
Processed A90263 Interregnum2
Processed A90268 Interregnum2
Processed A90269 Interregnum2
Processed A90288 CivilWar
Processed A90294 CivilWar
Processed A90296 CivilWar
Processed A90348 CivilWar
Processed A90384 Interregnum2
Processed A90394 Interregnum2
Processed A90512 CivilWar
Processed A90514 Interregnum2
Processed A90550 Interregnum2
Processed A90552 CivilWar
Processed A90603 CivilWar
Processed A90672 CivilWar
Processed A90695 CivilWar
Processed A90696 CivilWar
Processed A90701 CivilWar
Processed A90706 Interregnum2
Processed A90871 Interregnum2
Processed A90894 Interregnum2
Processed A90897 CivilWar
Processed A90993 CivilWar
Processed A91012 CivilWar
Processed A91477 Interregnum2
Processed A91515 Interregnum2
Processed