Imports

In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch
import re

Data Import and Processing

In [2]:
def merge_proverbs(input_file, output_file, book_prefix="Ecc", max_length=80):
    """Merge bracket-flagged verses into single entries and write them out.

    The input is expected to contain verses of the form
    ``<book_prefix> <chapter>:<verse> <text>``. Physical line breaks that are
    not followed by a new verse reference are treated as part of the current
    verse. A verse whose text starts with ``[`` opens (or continues) a group:
    it is joined with the following verses up to and including the first
    verse that does not start with ``[``. Lines that do not match the verse
    pattern are skipped.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the text file to write (overwritten).
        book_prefix: Literal prefix that starts every verse reference.
            Defaults to ``"Ecc"``, the value that used to be hard-coded.
        max_length: Wrap width passed to ``insert_newlines``. Defaults to 80,
            the value that used to be hard-coded.

    Output entries look like ``Ecc 3:1 text`` for single verses,
    ``Ecc 3:1-3 text`` for groups within one chapter and
    ``Ecc 3:22-4:1 text`` for groups spanning chapters.
    """
    with open(input_file, 'r') as file:
        content = file.read()

    prefix = re.escape(book_prefix)

    # Normalize content to handle newlines within verses: only a newline that
    # is followed by a verse reference survives as a line break.
    content = re.sub(rf'\n(?!{prefix} \d+:\d+)', ' ', content)
    verse_pattern = re.compile(rf'{prefix} (\d+):(\d+) (\[?)(.*)')

    lines = [line.strip() for line in content.splitlines() if line.strip()]

    merged_lines = []
    pending_texts = []
    pending_refs = []

    for line in lines:
        match = verse_pattern.match(line)
        if not match:
            continue

        chapter, verse, merge_flag, text = match.groups()
        pending_texts.append(text.strip())
        pending_refs.append((chapter, verse))

        if merge_flag:
            # Group stays open; the next verse will be appended to it.
            continue

        merged_lines.append(
            _format_verse_group(book_prefix, pending_refs, pending_texts, max_length)
        )
        pending_texts = []
        pending_refs = []

    # A group that is still open at end of input used to be dropped silently;
    # emit whatever has been collected so no verse text is lost.
    if pending_refs:
        merged_lines.append(
            _format_verse_group(book_prefix, pending_refs, pending_texts, max_length)
        )

    with open(output_file, 'w') as file:
        file.write('\n'.join(merged_lines))


def _format_verse_group(book_prefix, refs, texts, max_length):
    """Render one output entry from the collected (chapter, verse) refs and texts."""
    start_chapter, start_verse = refs[0]
    end_chapter, end_verse = refs[-1]

    if len(refs) == 1:
        reference = f"{start_chapter}:{start_verse}"
    elif start_chapter == end_chapter:
        reference = f"{start_chapter}:{start_verse}-{end_verse}"
    else:
        reference = f"{start_chapter}:{start_verse}-{end_chapter}:{end_verse}"

    wrapped_text = insert_newlines(' '.join(texts).strip(), max_length)
    return f"{book_prefix} {reference} {wrapped_text}"

def insert_newlines(text, max_length):
    """Greedily wrap ``text`` so that lines are at most ``max_length`` long.

    Words are taken from ``text.split()``, so any run of whitespace in the
    input collapses to a single space or a line break. Words are never split:
    a word longer than ``max_length`` is placed on a line of its own.

    Args:
        text: The string to wrap.
        max_length: Maximum number of characters per output line.

    Returns:
        The wrapped text with lines joined by ``'\\n'`` (empty string for
        empty or whitespace-only input).
    """
    lines = []
    current_line = []
    current_length = 0  # always equals len(' '.join(current_line))

    for word in text.split():
        # Length the line would have with this word added; the separating
        # space only exists when the line already holds a word.
        if current_line:
            candidate_length = current_length + 1 + len(word)
        else:
            candidate_length = len(word)

        # Only wrap when there is something to flush, otherwise an oversized
        # first word would produce an empty leading line.
        if current_line and candidate_length > max_length:
            lines.append(' '.join(current_line))
            current_line = [word]
            current_length = len(word)
        else:
            current_line.append(word)
            current_length = candidate_length

    if current_line:
        lines.append(' '.join(current_line))

    return '\n'.join(lines)

# Build the merged Ecclesiastes file from the grouped source text on Drive.
input_file = 'drive/MyDrive/grouped_ecc.txt'
output_file = 'drive/MyDrive/merged_ecc.txt'
merge_proverbs(input_file=input_file, output_file=output_file)

In [3]:
proverbs_location = "drive/MyDrive/merged_proverbs.txt"

# Read the entire file as a single string
with open(proverbs_location, 'r') as file:
    content = file.read()

# Split the verses with 'Pro ' as the delimiter, and skip the first empty split
lines = content.split('Pro ')[1:]

# Parse each "<chapter>:<verse> <text>" chunk into a [chapter, verse, text] row
data = []
for line in lines:
    parts = line.split(' ', 1)
    # Split on the first ':' only, so a cross-chapter range such as
    # "1:18-2:1" stays intact in `verse` instead of raising ValueError.
    chapter, verse = parts[0].split(':', 1)
    text = parts[1]
    data.append([chapter, verse, text])

# Create a DataFrame
proverbs_df = pd.DataFrame(data, columns=['chapter', 'verse', 'text'])
proverbs_df['book'] = "Proverbs"

In [4]:
# NOTE: `proverbs_location` is reused from the Proverbs cell; from here on it
# holds the Ecclesiastes path.
proverbs_location = "drive/MyDrive/merged_ecc.txt"

# Read the entire file as a single string
with open(proverbs_location, 'r') as file:
    content = file.read()

# Split the verses with 'Ecc ' as the delimiter, and skip the first empty split
lines = content.split('Ecc ')[1:]

# Parse each "<chapter>:<verse> <text>" chunk into a [chapter, verse, text] row
data = []
for line in lines:
    parts = line.split(' ', 1)
    # Split on the first ':' only, so a cross-chapter range such as
    # "1:18-2:1" (which merge_proverbs can emit) stays intact in `verse`
    # instead of raising ValueError.
    chapter, verse = parts[0].split(':', 1)
    text = parts[1]
    data.append([chapter, verse, text])

# Create a DataFrame
ecc_df = pd.DataFrame(data, columns=['chapter', 'verse', 'text'])
ecc_df['book'] = "Ecclesiastes"

Proverb Embeddings

In [5]:
# Sentence-embedding model used for both the verse corpus and the user query
model = SentenceTransformer("all-MiniLM-L6-v2")

# Corpus order: every Proverbs row first, then every Ecclesiastes row.
# `proverbs` and `proverbs_references` are built in the same order so that
# index i in one list corresponds to index i in the other.
verse_frames = (proverbs_df, ecc_df)
proverbs = [text for frame in verse_frames for text in frame['text']]
proverbs_references = [
    f"{book} {chapter}:{verse}"
    for frame in verse_frames
    for book, chapter, verse in zip(frame['book'], frame['chapter'], frame['verse'])
]

# Embed the whole corpus in one model.encode() call
proverb_embeddings = model.encode(proverbs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
# Calculate embedding of query
prompt = input("Explain your quandry, and I will tell you wisdom. The more detail -> the better.\n\n")
prompt_embedding = model.encode(prompt)

# Calculate the embedding similarities
similarities = model.similarity(proverb_embeddings, prompt_embedding)

# Ask for at most 3 matches, but never more than the corpus contains
# (torch.topk raises when k exceeds the size of the selected dimension).
top_k = min(3, similarities.shape[0])
top_values, top_indices = torch.topk(similarities, top_k, dim=0)

print("\n")
# Stop printing once more than two verses' worth of text has been shown.
verse_tally=0
for j in range(top_k):
  match_index = int(top_indices[j][0])
  ref = proverbs_references[match_index]
  print(ref)
  print(proverbs[match_index])

  # Everything after the first ':' is "27", "3-4", or a cross-chapter "18-2:1".
  verse_span = ref.split(':', 1)[1]
  if '-' in verse_span:
    first_verse, last_verse = verse_span.split('-', 1)
    if ':' in last_verse:
      # Cross-chapter range: the exact verse count needs chapter lengths that
      # are not available here, so count the minimum such a range can hold
      # rather than producing a negative number.
      verse_tally += 2
    else:
      verse_tally += int(last_verse) - int(first_verse) + 1
  else:
    verse_tally += 1
  if verse_tally > 2:
    break

Explain your quandry, and I will tell you wisdom. The more detail -> the better.

When should I buy house? What if I can barely afford it?


Proverbs 24:27
Prepare your work outside; get everything ready for yourself in the field, and
after that build your house.

Proverbs 24:3-4
By wisdom a house is built, and by understanding it is established; by
knowledge the rooms are filled with all precious and pleasant riches.



Experimentation for Version 4