# Setup

In [21]:
import json
import torch
import re, os
import pandas as pd
import tqdm 
# !pip install unidecode
from unidecode import unidecode
import unicodedata
import re
import html

def normalize_text(text):
    """Decode HTML entities, transliterate via ``unidecode``, and collapse whitespace.

    The steps run in this order: ``html.unescape`` -> Unicode NFKD
    normalization -> ``unidecode`` -> every run of whitespace replaced by a
    single space.  Leading/trailing whitespace is reduced to one space but is
    not removed.
    """
    decomposed = unicodedata.normalize('NFKD', html.unescape(text))
    return re.sub(r"\s+", " ", unidecode(decomposed))
def clean_text(s):
  """Normalize ``s``, blank out transcription markers, and lowercase the result.

  After ``normalize_text`` the markers ``<i>``, ``</i>``, ``<NOTE>`` and
  ``NONLATINALPHABET`` are each replaced by a space; whitespace runs are then
  collapsed, surrounding spaces stripped, and the text lowercased.
  """
  # A "\d+\^PAGE[S]*\^MISSING" marker pattern was noted by the author but is not applied.
  without_markers = re.sub(r"</i>|<NOTE>|NONLATINALPHABET|<i>", " ", normalize_text(s))
  return re.sub(r"\s+", " ", without_markers).strip(" ").lower()

# Project root. The Google Drive path is assigned first and then immediately
# overridden by the relative path, so only "../.." takes effect
# (presumably the two lines are swapped by hand between Colab and local runs).
folder = f"/content/drive/MyDrive/DH"
folder = "../.."
# Sub-directories derived from the project root: the segment files and the MINED_QP CSVs.
QPdir = f"{folder}/segments"
outputdir = f"{folder}/Early-Modern-Sermons/assets/MINED_QP"

def get_batches(df_dict,batch_size=100000):
  """Clean every text and group the results into consecutive batches.

  Args:
    df_dict: sequence of strings; it must support ``len()`` and slicing
      (despite the name, the notebook passes a list).
    batch_size: maximum number of texts per batch.

  Returns:
    A list of batches.  Each batch is a list of ``(idx, cleaned_text)``
    tuples, where ``idx`` is the text's position in ``df_dict`` (the
    numbering continues across batches).
  """
  batches = []
  for start in tqdm.tqdm(range(0, len(df_dict), batch_size)):
    chunk = df_dict[start: start + batch_size]
    # Chunks are consecutive slices, so start + offset is the running index.
    batches.append([(start + offset, clean_text(text)) for offset, text in enumerate(chunk)])
  return batches

In [None]:
# Mount Google Drive only when the notebook runs inside Colab. On a local
# kernel the google.colab package is not installed; the unconditional import
# used to abort this cell even though `folder` already points at a local path.
try:
  from google.colab import drive
except ImportError:
  drive = None
if drive is not None:
  drive.mount('/content/drive')

bible = {}

# Verse IDs flagged with to_remove == True in overly_vague.csv. Kept as a dict
# (values unused) so later `in to_remove` checks stay constant-time.
items = pd.read_csv(f"{folder}/EEPS/overly_vague.csv").to_dict(orient='records')
to_remove = {entry['verse_id']: None for entry in items if entry['to_remove'] is True}

print('To Remove', len(to_remove))

# Fill `bible` with one entry per verse ("<verse id>: <text>") and, where a
# verse splits cleanly, one entry per part ("Part N of <verse id>: <part>").
# Insertion order matters: bible_ids / bible_verses and the cached embedding
# file are all positional views of this dict.
b_versions = ['AKJV','ODRV','Geneva', 'Douay-Rheims', 'Tyndale', 'Wycliffe','Vulgate']
ODRV_books = pd.read_csv(f"{folder}/Bibles/ODRV.csv",header=None)
ODRV_books = set(ODRV_books[3])

# Compiled once instead of on every verse.
# Split after '.' or '?' when followed by a space and a capital letter, or directly after '!', ':' or ';'.
part_boundary = re.compile(r'(?<=[\.\?]) (?=[A-Z])|(?<=[\!\:\;])')
# Leftover HTML entity such as "&amp;".
html_entity = re.compile(r"\&\w+\;")

for bname in b_versions:
    data = pd.read_csv(f"{folder}/Bibles/{bname}.csv",header=None)
    data = data.to_dict(orient="records")
    for entry in tqdm.tqdm(data):
        key = entry[0]
        if key in to_remove: continue
        v_id = key.split(" (")[0]
        text = entry[6]
        # Douay-Rheims rows are skipped when their column-3 value (presumably the
        # book name) also occurs in the ODRV file.
        if "Douay-Rheims" in key and entry[3] in ODRV_books: continue
        # Only verses under 200 space-separated tokens are stored whole.
        if len(text.split(" ")) < 200:
            bible[key] = normalize_text(f"{v_id}: {text}")

        parts = part_boundary.split(text)
        parts = [re.sub(r'\s+', ' ', p).strip() for p in parts if len(p.strip(" ")) > 0]
        # Nothing to split. This also covers blank verse text, where the
        # previous code indexed parts[0] on an empty list and raised IndexError.
        if len(parts) <= 1: continue
        # Do not split at all when the first or last part is very short (<= 5
        # tokens) or the first part still contains an HTML entity.
        if (len(parts[0].split(" ")) <= 5 or len(parts[-1].split(" ")) <= 5 or html_entity.search(parts[0])):
            continue
        for pidx, p in enumerate(parts):
            p_id = f"{key} - {pidx}"
            if p_id in to_remove: continue
            if len(p) == 0: continue
            if html_entity.search(p) or len(p.split(" ")) <= 5: continue
            bible[p_id] = normalize_text(f"Part {pidx+1} of {v_id}: {p}")

# Sentinel entry appended last; get_bible_vectors slices the final id/vector off again.
bible['NonQP 0.0'] = 'None'

# Positional views of `bible` (same order as the dict) plus a reverse lookup.
bible_ids = list(bible)
bible_verses = list(bible.values())
verse_to_id = dict(zip(bible_verses, bible_ids))

# Map each verse id to [verse id, part id, part id, ...]; part keys carry a " - <n>" suffix.
bible_parts = {}
for key in bible:
  v_id = key.split(" - ")[0]
  members = bible_parts.setdefault(v_id, [v_id])
  if " - " in key:
    members.append(key)

def get_parts(pos_id):
  """Return a new list with the ids stored in ``bible_parts[pos_id]``.

  Raises ``KeyError`` when ``pos_id`` is not a key of ``bible_parts``.
  """
  return list(bible_parts[pos_id])

# Position of every id inside bible_ids / bible_verses.
v_id_to_idx = dict(zip(bible_ids, range(len(bible_ids))))
# Strip the "<verse id>: " / "Part N of <verse id>: " label prepended when `bible` was filled.
label_prefix = re.compile(r"^[\d\w\s]+\d+\.\d+\: |^Part \d+ of [\d\w\s]+\d+\.\d+\: ")
bible_verses_no_prepend = [label_prefix.sub('', verse) for verse in bible_verses]
# Evaluated but not displayed (not the last expression); raises KeyError if this part id is missing.
bible["Genesis 1.2 (AKJV) - 0"], bible_verses_no_prepend[v_id_to_idx["Genesis 1.2 (AKJV) - 0"]]
len(bible_verses), len(bible_parts)

Mounted at /content/drive
To Remove 667


  0%|          | 0/36702 [00:00<?, ?it/s]

  0%|          | 0/14736 [00:00<?, ?it/s]

  0%|          | 0/31090 [00:00<?, ?it/s]

  0%|          | 0/35811 [00:00<?, ?it/s]

  0%|          | 0/7954 [00:00<?, ?it/s]

  0%|          | 0/9622 [00:00<?, ?it/s]

  0%|          | 0/35809 [00:00<?, ?it/s]

(275895, 156696)

In [4]:
!pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer, CrossEncoder, util, models
from sentence_transformers.util import semantic_search, pytorch_cos_sim

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [5]:
# Bi-encoder (retrieval) and cross-encoder (re-ranking) configuration.
model_checkpoint = "emanjavacas/MacBERTh"
model_name = "MacBERTh"
model_round = "ALL"
epoch = "1"
state_dict_path = f"EEPS_{model_round}_MacBERTh_Epoch{epoch}"
retrieve_k = 25
if model_name == 'MacBERTh':
  # Assemble the sentence encoder by hand: transformer backbone + mean pooling.
  word_embedding_model = models.Transformer(model_checkpoint, max_seq_length=128)
  pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), "mean")
  bi_encoder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
else:
  bi_encoder = SentenceTransformer(model_checkpoint)

if state_dict_path:
  # From here on model_name identifies the fine-tuned weights (it is used in the
  # name of the cached Bible-embedding file).
  model_name = state_dict_path
  # map_location="cpu" lets a checkpoint saved on a GPU load on a CPU-only
  # machine; load_state_dict copies the values into the model's existing
  # parameters, so the model's own device placement is unaffected.
  bi_encoder.load_state_dict(torch.load(f'{folder}/models/{state_dict_path}.pt', map_location="cpu"))

reranker_name = f"{folder}/models/EEPS_cross-encoder_emanjavacas_MacBERTh/checkpoint-1000"
cross_encoder = CrossEncoder(reranker_name)

def get_bible_vectors():
  """Load (or compute and cache) the Bible embeddings and keep the full verses.

  The embeddings of ``bible_verses`` are cached in
  ``{folder}/EEPS/Bibles_{model_name}.pt``; they are encoded with
  ``bi_encoder`` only when that file does not exist yet.  The final entry
  (the 'NonQP 0.0' sentinel) is dropped, and every id containing " - "
  (a verse part) is filtered out.

  Returns:
    ``(full_ids, full_vectors)``: the ids of whole verses and, at the same
    positions, their embedding rows (a list of tensors).

  Raises:
    ValueError: if the cached file holds a different number of vectors than
      there are entries in ``bible_ids`` (stale cache, which would silently
      pair vectors with the wrong verses).
  """
  vec_path = f"{folder}/EEPS/Bibles_{model_name}.pt"
  if not os.path.exists(vec_path):
    bible_vectors = bi_encoder.encode(bible_verses, batch_size=1024, convert_to_tensor=True, show_progress_bar=True)
    torch.save(bible_vectors, vec_path)

  # `device` is not defined in any visible cell of this notebook, so the old
  # map_location=device raised NameError on a fresh kernel. Honour a
  # notebook-level `device` if one exists; otherwise use CUDA when available.
  target_device = globals().get("device")
  if target_device is None:
    target_device = "cuda" if torch.cuda.is_available() else "cpu"
  bible_vectors = torch.load(vec_path, map_location=target_device)

  if len(bible_vectors) != len(bible_ids):
    raise ValueError(
      f"{vec_path} holds {len(bible_vectors)} vectors but there are {len(bible_ids)} Bible entries; "
      "delete the cached file so it is re-encoded."
    )
  bible_vectors = bible_vectors[:-1]  # drop the trailing 'NonQP 0.0' sentinel

  full_ids = []
  full_vectors = []
  for idx, v_id in enumerate(bible_ids[:-1]):
    if " - " not in v_id: # a full verse
      full_ids.append(v_id)
      full_vectors.append(bible_vectors[idx])
  print(len(full_ids),len(full_vectors))
  return full_ids, full_vectors

# Notebook-level lists of whole-verse ids and their matching embeddings (verse parts excluded).
full_ids, full_vectors = get_bible_vectors()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/227k [00:00<?, ?B/s]

156692 156692


# QP Set-up

In [None]:
# outfname,inputfname = "M_QP", f"{folder}/segments/all_unique_marginalia.json"
outfname = "B_QP"
inputfname = f"{folder}/segments/all_unique.json"
with open(inputfname,"r") as file:
  batches = json.load(file)
# Keep only segments with more than three whitespace-separated tokens.
batches = [segment for segment in batches if len(segment.split()) > 3]
print(len(batches))
# Clean the kept segments and group them into (index, text) batches.
batches = get_batches(batches)

In [None]:
# Spot-check one entry: the third (index, cleaned text) pair of the third batch.
batches[2][2]

(200002,
 'isa. 1. 17. quaerite judicium, i. e. disquirite diligenter jus causae. iudices & patronos alloquitur. dirigite negotia oppressi. vatabl. in 1. is. 17.')

# Reformat and Verify QP

In [29]:
# The second assignment wins; swap the two lines to process the marginalia instead.
outfname,inputfname = "M_QP", f"{folder}/segments/all_unique_marginalia.json"
outfname,inputfname = "B_QP", f"{folder}/segments/all_unique.json"
with open(inputfname,"r") as file:
  texts = json.load(file)
print(len(texts))
# Same filter as the QP set-up cell, so position i here is text_idx i there.
kept_keys = [segment for segment in texts if len(segment.split()) > 3]
# Re-key the stored values by position; the segment strings themselves are dropped.
texts = dict(enumerate(texts[segment] for segment in kept_keys))
num_kept = len(kept_keys)
del kept_keys
print(num_kept, len(texts))
# Only the number of batches is needed, so get_batches runs on empty placeholders.
batches = len(get_batches([''] * num_kept))

6125769
6112794 6112794


100%|██████████| 62/62 [00:19<00:00,  3.26it/s]


In [9]:
'''
Corrections
'''
# neg_sim_threshold = 0.7
# pos_threshold = 0.6
# pos_sim_threshold = 0.65
# neg_threshold = 0.4
# for bidx in range(batches):
#   outputs = []
#   qp = pd.read_csv(f"{outputdir}/{outfname}_{bidx}.csv")
#   qp = qp.drop(columns = ['Unnamed: 0'])
#   qp = qp.to_dict(orient='records')
#   batch = []
#   seen = {}
#   for item in qp:
#     if 'Douay-Rheims' in item['verse_id']:
#       if re.sub('Douay-Rheims','ODRV', item['verse_id']) in to_remove:
#         continue
#     elif item['text'].strip(',.:;?') in ['a psalm of david', 'psalmus david']:
#       continue
#     else:
#       if item['text_idx'] in seen: continue
#       seen[item['text_idx']] = True
#       batch.append(item)
#   print('Batch',bidx)
#   print(len(batch))
#   p_embedding = bi_encoder.encode([b['text'] for b in batch], batch_size=1014, convert_to_tensor=True,show_progress_bar=True)
#   p_embedding = p_embedding.cuda()
#   hits = semantic_search(p_embedding,full_vectors,query_chunk_size=2000,top_k=25)
#   progress = tqdm.tqdm(hits)
#   unique = {}
#   for pidx, hitlist in enumerate(progress):
#     idx, passage = batch[pidx]['text_idx'], batch[pidx]['text']
#     hitlist = [hit for hit in hitlist if hit['score'] >= pos_sim_threshold]
#     cross_inp = [[passage, bible_verses[v_id_to_idx[full_ids[hit['corpus_id']]]]] for hit in hitlist]
#     if len(cross_inp) == 0: continue
#     cross_scores = cross_encoder.predict(cross_inp)
#     for i in range(len(cross_scores)):
#       hitlist[i]['cross-score'] = cross_scores[i]
#     hitlist = sorted(hitlist, key=lambda x: x['cross-score'], reverse=True)

#     for rank, hit in enumerate(hitlist):
#       v_id = full_ids[hit['corpus_id']]
#       cross_score = hit['cross-score']
#       sim_score = hit['score']
#       lang = 'English'
#       if 'Vulgate' in v_id:
#         lang = "Latin"
#       if (cross_score >= neg_threshold and sim_score >= neg_sim_threshold) or (cross_score >= pos_threshold):
#         outputs.append({'text_idx': idx, 'verse_id':v_id,'lang': lang, 'rank':rank, 'sim_score':sim_score, 'cross_score':cross_score,'text': passage, 'verse_text':bible[v_id]})
#   final = []
#   for o in outputs:
#     if (o['text_idx'], o['verse_id']) in unique: continue
#     unique[(o['text_idx'], o['verse_id'])] = None
#     final.append(o)

#   output_df = pd.DataFrame(final)
#   len_qp = len(set(output_df['text_idx']))
#   output_df.to_csv(f"{outputdir}/{outfname}_{bidx}.csv")
#   print(len_qp,f"segments with QP in Batch {bidx}")
# output_df

Batch 0
1073


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1073 [00:00<?, ?it/s]

1073 segments with QP in Batch 0
Batch 1
955


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/955 [00:00<?, ?it/s]

955 segments with QP in Batch 1
Batch 2
645


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/645 [00:00<?, ?it/s]

645 segments with QP in Batch 2


Unnamed: 0,text_idx,verse_id,lang,rank,sim_score,cross_score,text,verse_text
0,200028,Psalms 2.10 (Vulgate),Latin,0,0.934655,0.997154,"et nunc reges, intelligite: erudimini, qui jud...","Psalms 2.10: Et nunc, reges, intelligite; erud..."
1,200028,Psalms 2.10 (ODRV),English,1,0.889267,0.996425,"et nunc reges, intelligite: erudimini, qui jud...",Psalms 2.10: And now ye kings vnderstand: take...
2,200028,Psalms 2.10 (AKJV),English,2,0.830186,0.954899,"et nunc reges, intelligite: erudimini, qui jud...","Psalms 2.10: Bee wise now therefore, O yee Kin..."
3,200028,Wisdom 6.2 (Vulgate),Latin,3,0.834879,0.463485,"et nunc reges, intelligite: erudimini, qui jud...","Wisdom 6.2: Audite ergo, reges, et intelligite..."
4,200030,1 Corinthians 4.4 (Geneva),English,0,0.671644,0.987320,"jer. 17.9. i know nothing by my self, yet am i...",1 Corinthians 4.4: For I know nothing by my se...
...,...,...,...,...,...,...,...,...
1653,260935,Psalms 49.20 (Geneva),English,1,0.923072,0.991461,psal. man that is in honour and understandeth ...,"Psalms 49.20: Man is in honour, and vnderstand..."
1654,263300,Psalms 119.96 (AKJV),English,0,0.771703,0.980020,"psal. 119. 16. thy commandment, i* exceeding b...",Psalms 119.96: I haue seene an end of all perf...
1655,263300,Psalms 119.96 (Geneva),English,1,0.742942,0.967042,"psal. 119. 16. thy commandment, i* exceeding b...",Psalms 119.96: I haue seene an ende of all per...
1656,263300,Psalms 118.96 (ODRV),English,2,0.712739,0.957963,"psal. 119. 16. thy commandment, i* exceeding b...",Psalms 118.96: Of al consummation I haue sene ...


In [30]:
'''
Body: 190716 segments that occur in more than one place
Margin: 10637 segments that occur in more than one place
'''
# Count the segments whose entry in `texts` lists more than one occurrence.
more = sum(1 for entry in texts.values() if len(entry) > 1)
print(more,'segments that occur in more than one place')

190716 segments that occur in more than one place


In [31]:
# Export destination. The original absolute path stays the default; set the
# MINED_QP_OUTPUT_DIR environment variable to write elsewhere without editing this cell.
outputdir = os.environ.get("MINED_QP_OUTPUT_DIR", "/Users/amycweng/SERMONS_APP/db/data/MINED_QP")
inputdir = f"{folder}/Early-Modern-Sermons/assets/MINED_QP"
# DataFrame.to_csv does not create missing directories.
os.makedirs(outputdir, exist_ok=True)

# Prefix of the output file name ("B" for "B_QP", "M" for "M_QP"); constant across batches.
loc_key = outfname.split("_")[0]

num_qp = 0
for bidx in range(batches):
   qp = pd.read_csv(f"{inputdir}/{outfname}_{bidx}.csv")
   # Distinct segment indices in this batch, computed once and reused below.
   text_ids = sorted(set(qp['text_idx']))
   num_qp += len(text_ids)
   print('Batch',bidx, len(text_ids),'unique')
   qp['sim_score'] = qp['sim_score'].round(3)
   qp['cross_score'] = qp['cross_score'].round(3)
   # Drop the old index column and the two text columns from the export.
   qp = qp.drop(columns=['Unnamed: 0','text','verse_text'])
   qp = qp.sort_values(by=['text_idx'])
   # One metadata row per occurrence of each matched segment; every entry of
   # texts[idx] unpacks as (tcpID, sidx, loc).
   meta = [
      {
         'text_idx': idx,
         'loc_key': loc_key,
         'tcpID': tcpID,
         'sidx': sidx,
         'loc': loc
      }
      for idx in text_ids
      for tcpID, sidx, loc in texts[idx]
   ]
   qp['loc'] = loc_key
   qp.to_csv(f"{outputdir}/{outfname}_{bidx}.csv",header=None,index=False)
   meta = pd.DataFrame(meta)
   meta.to_csv(f"{outputdir}/{outfname}_{bidx}_indices.csv",header=None,index=False)
print('Total', num_qp)
qp, meta

Batch 0 6887 unique
Batch 1 7589 unique
Batch 2 5451 unique
Batch 3 10516 unique
Batch 4 9349 unique
Batch 5 10213 unique
Batch 6 8910 unique
Batch 7 8516 unique
Batch 8 8442 unique
Batch 9 7496 unique
Batch 10 9422 unique
Batch 11 7198 unique
Batch 12 8444 unique
Batch 13 9885 unique
Batch 14 8664 unique
Batch 15 5004 unique
Batch 16 8863 unique
Batch 17 8274 unique
Batch 18 7405 unique
Batch 19 11677 unique
Batch 20 6342 unique
Batch 21 7773 unique
Batch 22 6661 unique
Batch 23 8513 unique
Batch 24 8344 unique
Batch 25 7686 unique
Batch 26 5841 unique
Batch 27 9553 unique
Batch 28 9356 unique
Batch 29 9401 unique
Batch 30 9485 unique
Batch 31 9857 unique
Batch 32 7719 unique
Batch 33 7757 unique
Batch 34 8282 unique
Batch 35 8449 unique
Batch 36 7578 unique
Batch 37 8896 unique
Batch 38 9124 unique
Batch 39 14328 unique
Batch 40 8435 unique
Batch 41 7952 unique
Batch 42 7587 unique
Batch 43 6276 unique
Batch 44 8156 unique
Batch 45 8304 unique
Batch 46 8829 unique
Batch 47 8541 uniqu

(      text_idx                      verse_id     lang  rank  sim_score  \
 0      6100016            Isaiah 44.4 (AKJV)  English     0      0.916   
 1      6100016          Isaiah 44.4 (Geneva)  English     1      0.882   
 2      6100017            Isaiah 44.5 (AKJV)  English     0      0.836   
 3      6100017          Isaiah 44.5 (Geneva)  English     1      0.774   
 4      6100017    Isaiah 44.5 (Douay-Rheims)  English     2      0.818   
 ...        ...                           ...      ...   ...        ...   
 3188   6112760         Ephesians 4.24 (AKJV)  English     3      0.842   
 3189   6112793     1 Corinthians 1.25 (ODRV)  English     0      0.699   
 3190   6112793     1 Corinthians 1.25 (AKJV)  English     1      0.776   
 3191   6112793  1 Corinthians 1.25 (Tyndale)  English     2      0.797   
 3192   6112793   1 Corinthians 1.25 (Geneva)  English     3      0.765   
 
       cross_score loc  
 0           0.988   B  
 1           0.586   B  
 2           0.992   B 

In [32]:
'''Body'''
# Hand-typed numerator divided by the number of kept segments in `texts`.
# NOTE(review): 510829 is presumably the 'Total' printed by the export cell — confirm, or use num_qp directly.
510829 / len(texts)

0.08356718711607164

In [28]:
'''Margin'''
# NOTE(review): this cell was last executed as In [28], before `texts` was reloaded in In [29].
# Its saved output (0.00989...) implies len(texts) was about 270,169 at that time, presumably
# the marginalia file. Re-running it after the body cells divides by the body segment count instead.
2673 / len(texts)

0.009893807209561423