In [None]:
import chromadb
from pprint import pprint

# Chroma client that holds the verse index used by the rest of the notebook.
client = chromadb.Client()

# get_or_create_collection keeps this cell re-runnable: a plain create_collection
# raises once "KJV_Bible" already exists, and the follow-up get_collection call
# only fetched the object that had just been created.
# "hnsw:space": "cosine" asks for cosine distance when the collection is created.
collection = client.get_or_create_collection(
    name="KJV_Bible",
    metadata={"hnsw:space": "cosine"},
)

In [None]:
# Candidate stopwords, stored as dict keys so membership tests are hash lookups.
stopwords = dict.fromkeys(
    [
        "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
        "if", "in", "into", "is", "it", "no", "not", "of", "on", "or",
        "such", "that", "the", "their", "then", "there", "these",
        "they", "this", "to", "was", "will", "with", "i", "said",
        "should", "from", "he", "have", "us", "our", "his", "shall",
        "him", "so", "yet", "&", "^", "etc", "&c", "*",
    ],
    '',
)
# The mapping is immediately replaced by an empty one, so the
# `lemma not in stopwords` checks in the tokenizing cells filter nothing.
# NOTE(review): looks like a deliberate on/off switch for stopword removal — confirm.
stopwords = {}

In [None]:
import os,json,sys,re 
sys.path.append('../')
from lib.standardization import * 
# Load the pre-encoded text. The JSON document unpacks into two values:
# `encodings` (consumed by the tokenizing cell) and `info`, whose length is
# reported as the sentence count.
# The encoding is stated explicitly so the read does not depend on the
# platform's default locale.
with open('../assets/encoded/A41135.json', 'r', encoding='utf-8') as file:
    data = json.load(file)
encodings, info = data
print(len(info), "sentences")

In [None]:
# Build one list of lemmas per sentence of the encoded text.
# Entries whose third field is True are marginal notes and are left out, so
# positions in `tokenized_sentences` do not line up with `encodings`.
tokenized_sentences = []
for entry in encodings:
    is_marginal_note, word_parts = entry[2], entry[3]
    if is_marginal_note == True:
        continue
    kept_lemmas = []
    for part in word_parts:
        pos, lemma = part[1], part[2]
        # Where several "|"-separated alternatives are given, keep the first.
        lemma = lemma.split("|")[0]
        # Presumably punctuation, whose lemma equals its tag — verify against the encoder.
        if lemma == pos:
            continue
        lemma = lemma.strip(".")
        if lemma in stopwords:
            continue
        kept_lemmas.append(lemma)
    tokenized_sentences.append(kept_lemmas)

In [None]:
# Parse the adorned KJV file into `bible`, which maps a verse label to three
# parallel lists: [tokens, POS tags, lemmas].
# Each line is tab-separated; columns 0, 2 and 4 are read as token, POS tag and lemma.
# The encoding is stated explicitly so the read does not depend on the platform default.
with open('../assets/kjv-adorned.txt', 'r', encoding='utf-8') as file:
    kjv_tokens = file.readlines()

bible = {}
current_ver = None
for line in kjv_tokens:
    # Drop the line terminator first so it cannot end up inside the last field.
    fields = line.rstrip("\n").split("\t")
    if len(fields) < 5:
        continue  # blank or truncated line: no token/POS/lemma triple to read
    token, pos, lemma = fields[0], fields[2], fields[4]
    # A capitalised token whose tag contains "vv" is relabelled "np" and keeps
    # its surface form as lemma. token[:1] tolerates an empty token field.
    if token[:1].isupper() and "vv" in pos:
        lemma = token
        pos = "np"
    if "VERSE-" in token:
        # Verse marker: start a fresh entry keyed by the label without the prefix.
        current_ver = token.replace("VERSE-", "")
        bible[current_ver] = [[], [], []]
    elif token == pos:  # punctuation mark
        continue
    elif current_ver is None:
        continue  # token appears before the first verse marker; nowhere to store it
    elif lemma not in stopwords:
        bible[current_ver][0].append(token)
        bible[current_ver][1].append(pos)
        bible[current_ver][2].append(lemma)

In [None]:
# Select the verses of one book: every label containing the book name is kept,
# and each verse is rendered as its lemmas joined by single spaces.
book = "Matthew"
bible_labels = []
bible_verses = []
for verse_label, verse_fields in bible.items():
    if book not in verse_label:
        continue
    bible_labels.append(verse_label)
    bible_verses.append(" ".join(verse_fields[2]))
print(len(bible_labels))

In [None]:
# Re-select the working set of verses for a different book. This overwrites
# `book`, `bible_labels` and `bible_verses` from any earlier selection.
book = "Psalms"
bible_labels = list(filter(lambda verse_label: book in verse_label, bible))
bible_verses = list(map(lambda verse_label: " ".join(bible[verse_label][2]), bible_labels))
print(len(bible_labels))

In [None]:
# Print every tokenized sentence that contains both lemmas, with its position.
# enumerate supplies the true position; list.index() would rescan the list on
# each hit and report the first equal sentence rather than the current one.
for idx, sent in enumerate(tokenized_sentences):
    if "utter" in sent and "darkness" in sent:
        print(idx, sent)

In [None]:
# Index the selected verses in the collection, using the verse label as id.
# Verses are sent in slices rather than one upsert call (and one printed line)
# per verse; an empty selection results in no call at all.
UPSERT_BATCH_SIZE = 500
for start in range(0, len(bible_labels), UPSERT_BATCH_SIZE):
    stop = start + UPSERT_BATCH_SIZE
    batch_ids = bible_labels[start:stop]
    collection.upsert(
        documents=bible_verses[start:stop],
        ids=batch_ids,
    )
    # One progress line per batch instead of one per verse.
    print(f"upserted {start + len(batch_ids)}/{len(bible_labels)} verses")

In [None]:
from nltk.util import ngrams

In [None]:
# Query phrases for one sentence: the full sentence first, followed by all of
# its 1-grams, then 2-grams, ... up to 5-grams, each joined with single spaces.
sample = tokenized_sentences[328]
all_phrases = [" ".join(sample)]
for size in range(1, 6):
    for gram in ngrams(sample, size):
        all_phrases.append(" ".join(gram))
print(all_phrases)

In [None]:
# For every phrase, keep its single nearest verse as (id, distance, document).
# A phrase that occurs more than once simply overwrites its earlier entry.
all_results = {}
for phrase in all_phrases:
    results = collection.query(query_texts=[phrase], n_results=1)
    # Index [0][0]: first (only) query text, first (only) hit.
    all_results[phrase] = tuple(
        results[field][0][0] for field in ("ids", "distances", "documents")
    )

In [None]:
# Report the smallest distance found, raw and rounded to one decimal, then list
# every phrase whose rounded distance equals that rounded minimum.
distances = [hit[1] for hit in all_results.values()]
min_dist = min(distances)
print(min_dist)
min_dist = round(min_dist, 1)
print(min_dist)
for phrase, (verse_id, distance, verse_text) in all_results.items():
    if round(distance, 1) != min_dist:
        continue
    print(phrase, verse_id, distance, verse_text)

In [None]:
# Show the ten nearest verses for a hand-written phrase: distance, verse text,
# then verse id followed by a blank line.
results = collection.query(query_texts=["the fire kindle"], n_results=10)
hits = zip(results["distances"][0], results["documents"][0], results["ids"][0])
for distance, verse_text, verse_id in hits:
    print(distance)
    pprint(verse_text)
    print(verse_id, '\n')

In [None]:
# Query phrases for another sentence: the full sentence plus all of its
# 3-grams through 9-grams, each joined with single spaces.
sample = tokenized_sentences[1565]
all_phrases = [" ".join(sample)]
for size in range(3, 10):
    for gram in ngrams(sample, size):
        all_phrases.append(" ".join(gram))

# Nearest verse per phrase, stored as (id, distance, document).
all_results = {}
for phrase in all_phrases:
    results = collection.query(query_texts=[phrase], n_results=1)
    all_results[phrase] = tuple(
        results[field][0][0] for field in ("ids", "distances", "documents")
    )

In [None]:
# Record the smallest distance (rounded to one decimal) in `min_dist`, then
# print every phrase with its best match; no filtering is applied here.
min_dist = round(min(hit[1] for hit in all_results.values()), 1)
for phrase, (verse_id, distance, verse_text) in all_results.items():
    print(phrase, verse_id, distance, verse_text)

In [None]:
# Show the ten nearest verses for a hand-written phrase: distance, verse text,
# then verse id followed by a blank line.
results = collection.query(
    query_texts=["utter darkness where be weep and gnash of"],
    n_results=10,
)
top_distances = results["distances"][0]
top_documents = results["documents"][0]
top_ids = results["ids"][0]
for rank in range(len(top_distances)):
    print(top_distances[rank])
    pprint(top_documents[rank])
    print(top_ids[rank], '\n')