Imports

In [1]:
# Install necessary libraries
!pip install transformers datasets torch accelerate sentence-transformers --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import torch
import re
import importlib.util
import sys
from pathlib import Path
from transformers import pipeline
from datasets import Dataset
from tqdm import tqdm
import numpy as np

In [3]:
# Define the path to topics.py
topics_file_path = Path("drive/MyDrive/topics.py")  # Replace with the actual file path

# Dynamically import topics.py
spec = importlib.util.spec_from_file_location("topics", topics_file_path)
topics_module = importlib.util.module_from_spec(spec)
sys.modules["topics"] = topics_module
spec.loader.exec_module(topics_module)

Data Import and Processing

In [4]:
proverbs_location = "drive/MyDrive/merged_proverbs.txt"

# Read the entire file as a single string
with open(proverbs_location, 'r') as file:
    content = file.read()

# Split the verses with 'Pro ' as the delimiter, and skip the first empty split
lines = content.split('Pro ')[1:]

# Open the file and load each verse into a pandas dataframe
data = []
for line in lines:
    parts = line.split(' ', 1)
    chapter, verse = parts[0].split(':')
    text = parts[1]
    data.append([chapter, verse, text])

# Create a DataFrame
proverbs_df = pd.DataFrame(data, columns=['chapter', 'verse', 'text'])
proverbs_df['book'] = "Proverbs"

In [5]:
proverbs_location = "drive/MyDrive/merged_ecc.txt"

# Read the entire file as a single string
with open(proverbs_location, 'r') as file:
    content = file.read()

# Split the verses with 'Pro ' as the delimiter, and skip the first empty split
lines = content.split('Ecc ')[1:]

# Open the file and load each verse into a pandas dataframe
data = []
for line in lines:
    parts = line.split(' ', 1)
    chapter, verse = parts[0].split(':')
    text = parts[1]
    data.append([chapter, verse, text])

# Create a DataFrame
ecc_df = pd.DataFrame(data, columns=['chapter', 'verse', 'text'])
ecc_df['book'] = "Ecclesiastes"

Annotations

In [6]:
# Check GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Configurations for efficient computation
BATCH_SIZE = 16  # Adjust batch size for free-tier GPU; lower if you hit memory issues.
# List of proverbs
proverbs = proverbs_df['text'].to_list() + ecc_df['text'].to_list()

# List of topics
topics = topics_module.topics

print(len(proverbs))

Using device: cuda
734


In [15]:
# Step 1: Pre-filter topics using SentenceTransformer embeddings
print("Generating embeddings for topics and proverbs...")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  # Efficient and fast model

# Generate embeddings
topic_embeddings = embedding_model.encode(topics, convert_to_tensor=True, device=device)
proverb_embeddings = embedding_model.encode(proverbs, convert_to_tensor=True, device=device)

# Pre-filter top 100 proverbs per topic
print("Filtering top proverbs for each topic...")
filtered_proverbs = []  # Stores top 100 proverbs for each topic
top_n_proverbs = 100    # Reduce candidates to 100 per topic

for topic_embedding in tqdm(topic_embeddings):
    cosine_scores = util.pytorch_cos_sim(topic_embedding, proverb_embeddings)[0]
    top_proverbs_idx = torch.topk(cosine_scores, k=top_n_proverbs).indices
    filtered_proverbs.append([proverbs[i] for i in top_proverbs_idx])

# Step 2: Zero-shot classification using pre-filtered proverbs and faster model
print("Initializing zero-shot classifier...")
classifier = pipeline("zero-shot-classification", model="valhalla/distilbart-mnli-12-3", device=0 if device == "cuda" else -1)

# Batch process topics with pre-filtered proverbs
BATCH_SIZE = 16
results = []

print("Classifying topics...")
for i in tqdm(range(0, len(topics), BATCH_SIZE)):
    batch_topics = topics[i:i + BATCH_SIZE]
    batch_filtered_proverbs = filtered_proverbs[i:i + BATCH_SIZE]

    for topic, proverbs_subset in zip(batch_topics, batch_filtered_proverbs):
        result = classifier(
            topic,
            candidate_labels=proverbs_subset,
            multi_label=True
        )
        # Get the top 5 proverbs
        top_proverbs = np.array(result["labels"])[np.argsort(result["scores"])[-5:]][::-1]
        for proverb in top_proverbs:
            results.append({"topic": topic, "proverb": proverb})

# Step 3: Save results to a CSV file
df = pd.DataFrame(results)
df.to_csv("optimized_topic_proverb_matches.csv", index=False)

print("Matching complete. Results saved to 'optimized_topic_proverb_matches.csv'.")

Generating embeddings for topics and proverbs...
Filtering top proverbs for each topic...


100%|██████████| 951/951 [00:01<00:00, 748.86it/s]


Initializing zero-shot classifier...


Device set to use cuda:0


Classifying topics...


100%|██████████| 60/60 [21:56<00:00, 21.94s/it]

Matching complete. Results saved to 'optimized_topic_proverb_matches.csv'.





Proverb Embeddings

In [16]:
# Load the matching results
matches_df = pd.read_csv("optimized_topic_proverb_matches.csv")

# Initialize the "topics" column with empty lists
proverbs_df["topics"] = [[] for _ in range(len(proverbs_df))]

# Group topics by proverb
proverb_to_topics = matches_df.groupby("proverb")["topic"].apply(list).to_dict()

# Add topics to the proverbs dataframe
proverbs_df["topics"] = proverbs_df["text"].apply(lambda proverb: proverb_to_topics.get(proverb, []))

# Display the updated dataframe
print(proverbs_df.head())



  chapter  verse                                               text      book  \
0       1    1-6  The proverbs of Solomon, son of David, king of...  Proverbs   
1       1      7  The fear of the LORD is the beginning of knowl...  Proverbs   
2       1    8-9  Hear, my son, your father's instruction, and f...  Proverbs   
3       1  10-19  My son, if sinners entice you, do not consent....  Proverbs   
4       1  20-33  Wisdom cries aloud in the street, in the marke...  Proverbs   

                                              topics  
0  [Academic burnout, Academic success, Academic ...  
1                                                 []  
2                                                 []  
3  [Birth control, Civil rights, Dating boundarie...  
4                                                 []  


Model

In [10]:
proverbs_df.to_csv("proverbs_topics.csv", index=False)

In [11]:
# Load a pretrained Sentence Transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Calculate embeddings by calling model.encode()
proverbs_references = (proverbs_df['book'] + ' ' + proverbs_df['chapter'] + ':' + proverbs_df['verse']).to_list() + (ecc_df['book'] + ' ' + ecc_df['chapter'] + ':' + ecc_df['verse']).to_list()
proverb_embeddings = model.encode(proverbs)

In [14]:
# Calculate embedding of query
prompt = input("Explain your quandry, and I will tell you wisdom. The more detail -> the better.\n\n")
prompt_embedding = model.encode(prompt)

# Calculate the embedding similarities
similarities = model.similarity(proverb_embeddings, prompt_embedding)

top_values, top_indices = torch.topk(similarities, 3, dim=0)

print("\n")
verse_tally=0
for j in range(3):
  ref = proverbs_references[top_indices[j][0]]
  print(ref)
  print(proverbs[top_indices[j][0]])

  if '-' in ref:
    verse_tally += int(ref.split(':')[1].split('-')[1])-int(ref.split(':')[1].split('-')[0]) + 1
  else:
    verse_tally += 1
  if verse_tally > 2:
    break

Explain your quandry, and I will tell you wisdom. The more detail -> the better.

What is the meaning of life? What should I do with my time?


Ecclesiastes 3:1-8
For everything there is a season, and a time for every matter under heaven: a
time to be born, and a time to die; a time to plant, and a time to pluck up what
is planted; a time to kill, and a time to heal; a time to break down, and a time
to build up; a time to weep, and a time to laugh; a time to mourn, and a time to
dance; a time to cast away stones, and a time to gather stones together; a time
to embrace, and a time to refrain from embracing; a time to seek, and a time to
lose; a time to keep, and a time to cast away; a time to tear, and a time to
sew; a time to keep silence, and a time to speak; a time to love, and a time to
hate; a time for war, and a time for peace.



Experimentation for Version 4