Imports

In [2]:
# Install necessary libraries
!pip install transformers datasets torch accelerate sentence-transformers --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/134.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import torch
import re
import importlib.util
import sys
from pathlib import Path
from transformers import pipeline
from datasets import Dataset
from tqdm import tqdm
import numpy as np

In [4]:
# Define the path to topics.py
topics_file_path = Path("drive/MyDrive/topics.py")  # Replace with the actual file path

# Dynamically import topics.py
spec = importlib.util.spec_from_file_location("topics", topics_file_path)
topics_module = importlib.util.module_from_spec(spec)
sys.modules["topics"] = topics_module
spec.loader.exec_module(topics_module)

Data Import and Processing

In [5]:
proverbs_location = "drive/MyDrive/merged_proverbs.txt"

# Read the entire file as a single string
with open(proverbs_location, 'r') as file:
    content = file.read()

# Split the verses with 'Pro ' as the delimiter, and skip the first empty split
lines = content.split('Pro ')[1:]

# Open the file and load each verse into a pandas dataframe
data = []
for line in lines:
    parts = line.split(' ', 1)
    chapter, verse = parts[0].split(':')
    text = parts[1]
    data.append([chapter, verse, text])

# Create a DataFrame
proverbs_df = pd.DataFrame(data, columns=['chapter', 'verse', 'text'])
proverbs_df['book'] = "Proverbs"

In [6]:
proverbs_location = "drive/MyDrive/merged_ecc.txt"

# Read the entire file as a single string
with open(proverbs_location, 'r') as file:
    content = file.read()

# Split the verses with 'Pro ' as the delimiter, and skip the first empty split
lines = content.split('Ecc ')[1:]

# Open the file and load each verse into a pandas dataframe
data = []
for line in lines:
    parts = line.split(' ', 1)
    chapter, verse = parts[0].split(':')
    text = parts[1]
    data.append([chapter, verse, text])

# Create a DataFrame
ecc_df = pd.DataFrame(data, columns=['chapter', 'verse', 'text'])
ecc_df['book'] = "Ecclesiastes"

Annotations

In [7]:
# Check GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Configurations for efficient computation
BATCH_SIZE = 16  # Adjust batch size for free-tier GPU; lower if you hit memory issues.
# List of proverbs
proverbs = proverbs_df['text'].to_list() + ecc_df['text'].to_list()

# List of topics
topics = topics_module.topics

print(len(proverbs))

Using device: cuda
734


In [7]:
# Step 1: Pre-filter topics using SentenceTransformer embeddings
print("Generating embeddings for topics and proverbs...")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  # Efficient and fast model

# Generate embeddings
topic_embeddings = embedding_model.encode(topics, convert_to_tensor=True, device=device)
proverb_embeddings = embedding_model.encode(proverbs, convert_to_tensor=True, device=device)

# Pre-filter top 100 proverbs per topic
print("Filtering top proverbs for each topic...")
filtered_proverbs = []  # Stores top 100 proverbs for each topic
top_n_proverbs = 100    # Reduce candidates to 100 per topic

for topic_embedding in tqdm(topic_embeddings):
    cosine_scores = util.pytorch_cos_sim(topic_embedding, proverb_embeddings)[0]
    top_proverbs_idx = torch.topk(cosine_scores, k=top_n_proverbs).indices
    filtered_proverbs.append([proverbs[i] for i in top_proverbs_idx])

# Step 2: Zero-shot classification using pre-filtered proverbs and faster model
print("Initializing zero-shot classifier...")
classifier = pipeline("zero-shot-classification", model="valhalla/distilbart-mnli-12-3", device=0 if device == "cuda" else -1)

# Batch process topics with pre-filtered proverbs
BATCH_SIZE = 16
results = []

print("Classifying topics...")
for i in tqdm(range(0, len(topics), BATCH_SIZE)):
    batch_topics = topics[i:i + BATCH_SIZE]
    batch_filtered_proverbs = filtered_proverbs[i:i + BATCH_SIZE]

    for topic, proverbs_subset in zip(batch_topics, batch_filtered_proverbs):
        result = classifier(
            topic,
            candidate_labels=proverbs_subset,
            multi_label=True
        )
        # Get the top 5 proverbs
        top_proverbs = np.array(result["labels"])[np.argsort(result["scores"])[-5:]][::-1]
        for proverb in top_proverbs:
            results.append({"topic": topic, "proverb": proverb})

# Step 3: Save results to a CSV file
df = pd.DataFrame(results)
df.to_csv("optimized_topic_proverb_matches.csv", index=False)

print("Matching complete. Results saved to 'optimized_topic_proverb_matches.csv'.")

Generating embeddings for topics and proverbs...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Filtering top proverbs for each topic...


100%|██████████| 951/951 [00:03<00:00, 261.71it/s]


Initializing zero-shot classifier...


config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Device set to use cuda:0


Classifying topics...


  0%|          | 0/60 [00:00<?, ?it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 60/60 [21:38<00:00, 21.65s/it]

Matching complete. Results saved to 'optimized_topic_proverb_matches.csv'.





Proverb Embeddings

In [9]:
# Load the matching results
matches_df = pd.read_csv("optimized_topic_proverb_matches.csv")

# Initialize the "topics" column with empty lists
proverbs_df["topics"] = [[] for _ in range(len(proverbs_df))]

# Group topics by proverb
proverb_to_topics = matches_df.groupby("proverb")["topic"].apply(list).to_dict()

# Add topics to the proverbs dataframe
proverbs_df["topics"] = proverbs_df["text"].apply(lambda proverb: proverb_to_topics.get(proverb, []))

# Display the updated dataframe
print(proverbs_df.head())



  chapter  verse                                               text      book  \
0       1    1-6  The proverbs of Solomon, son of David, king of...  Proverbs   
1       1      7  The fear of the LORD is the beginning of knowl...  Proverbs   
2       1    8-9  Hear, my son, your father's instruction, and f...  Proverbs   
3       1  10-19  My son, if sinners entice you, do not consent....  Proverbs   
4       1  20-33  Wisdom cries aloud in the street, in the marke...  Proverbs   

                                              topics  
0  [Academic burnout, Academic success, Academic ...  
1                                                 []  
2                                                 []  
3  [Birth control, Civil rights, Dating boundarie...  
4                                                 []  


In [11]:
proverbs_df.to_csv("proverbs_topics.csv", index=False)

In [None]:
# Load a pretrained Sentence Transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Calculate embeddings by calling model.encode()
proverbs_references = (proverbs_df['book'] + ' ' + proverbs_df['chapter'] + ':' + proverbs_df['verse']).to_list() + (ecc_df['book'] + ' ' + ecc_df['chapter'] + ':' + ecc_df['verse']).to_list()
proverb_embeddings = model.encode(proverbs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Calculate embedding of query
prompt = input("Explain your quandry, and I will tell you wisdom. The more detail -> the better.\n\n")
prompt_embedding = model.encode(prompt)

# Calculate the embedding similarities
similarities = model.similarity(proverb_embeddings, prompt_embedding)

top_values, top_indices = torch.topk(similarities, 3, dim=0)

print("\n")
verse_tally=0
for j in range(3):
  ref = proverbs_references[top_indices[j][0]]
  print(ref)
  print(proverbs[top_indices[j][0]])

  if '-' in ref:
    verse_tally += int(ref.split(':')[1].split('-')[1])-int(ref.split(':')[1].split('-')[0]) + 1
  else:
    verse_tally += 1
  if verse_tally > 2:
    break

Explain your quandry, and I will tell you wisdom. The more detail -> the better.

what happened on the seventh day of creation?


Ecclesiastes 12:1-8
Remember also your Creator in the days of your youth, before the evil days come
and the years draw near of which you will say, "I have no pleasure in them";
before the sun and the light and the moon and the stars are darkened and the
clouds return after the rain, in the day when the keepers of the house tremble,
and the strong men are bent, and the grinders cease because they are few, and
those who look through the windows are dimmed, and the doors on the street are
shut- when the sound of the grinding is low, and one rises up at the sound of a
bird, and all the daughters of song are brought low they are afraid also of what
is high, and terrors are in the way; the almond tree blossoms, thegrasshopper
drags itself along, and desire fails, because man is going to his eternal home,
and the mourners go about the streets before the silver cord

Experimentation for Version 4