In [None]:
import typing
import pandas as pd
import numpy as np
import torch

In [None]:
BIBLE_DATA = {
    'NIV',
    'NKJV'
}

In [None]:
# http://my-bible-study.appspot.com
df = pd.read_csv(
    'data/NIV_fixed.csv', 
    sep=',', 
    escapechar='\\', 
    names=['book', 'chapter', 'verse', 'text']
)

In [None]:
df.head()

In [None]:
df.text.values

Estimate cost of using OpenAI's Embedding model `text-embedding-ada-002`.

Ada uses the `cl100k_base` encoding.

In [None]:
import tiktoken
EMBEDDING_MODEL = "text-embedding-ada-002"
ENCODING = tiktoken.encoding_for_model(EMBEDDING_MODEL)
ENCODING_NAME = "cl100k_base"


In [None]:
def num_tokens_from_string(string: str, encoding_name: str = ENCODING_NAME) -> int:
    """Count how many tokens ``string`` occupies under a tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Name of the tiktoken encoding to look up; defaults to
            the notebook-level ``ENCODING_NAME``.

    Returns:
        The length of the token sequence produced by encoding ``string``.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    token_ids = tokenizer.encode(string)
    return len(token_ids)

def num_tokens_from_row(row: typing.Dict) -> int:
    """Return the token count of the ``'text'`` entry of ``row``.

    Thin adapter around ``num_tokens_from_string`` so it can be used as a
    row-wise callback (e.g. ``DataFrame.apply(..., axis=1)``).
    """
    verse_text = row['text']
    return num_tokens_from_string(verse_text)

In [None]:
# testing the number of tokens for a verse
print(df['text'][10], num_tokens_from_string(df['text'][10]))


In [None]:
# NIV has missing verses, see https://en.wikipedia.org/wiki/List_of_New_Testament_verses_not_included_in_modern_English_translations
# clean first before tokenizing to avoid errors
clean_df = df.dropna() 
len(clean_df)

In [None]:
# Build the token-count column without mutating `clean_df` in place: it comes
# from `df.dropna()`, so direct column assignment can raise
# SettingWithCopyWarning, and `.assign` keeps this cell idempotent on re-run.
# Mapping over the `text` column also avoids the slower row-wise apply(axis=1).
clean_df = clean_df.assign(tokens=clean_df['text'].map(num_tokens_from_string))


In [None]:
np.mean(clean_df.tokens) # ~29 tokens per verse

In [None]:
# Ada embedding model pricing: https://openai.com/api/pricing/
# NOTE: despite the constant's name, this is the USD price per 1,000 tokens,
# which is why the token total is divided by 1000 below.
ADA_PRICING_PER_TOKEN = 0.0004
total_cost = clean_df.tokens.sum() / 1000 * ADA_PRICING_PER_TOKEN
total_cost  # about $0.36 to generate embeddings for the entire Bible

In [None]:
# need to save the clean df correctly with escaped double quotes
# clean_df.to_csv('data/NIV_clean.csv', index=False)

Get embeddings from Ada

In [None]:
# source .env file from project directory
%load_ext dotenv
%dotenv

In [None]:

import os
import openai

openai.api_key = os.getenv("OPENAI_API_KEY")
def get_single_embedding(text: str, model=EMBEDDING_MODEL) -> typing.List[float]:
    """Fetch the embedding vector for one piece of text from the OpenAI API.

    Newlines in ``text`` are replaced with spaces before the request is sent.

    Args:
        text: The string to embed.
        model: Embedding model name passed to ``openai.Embedding.create``.

    Returns:
        The ``'embedding'`` entry of the first item in the response's ``'data'``.
    """
    flattened = text.replace("\n", " ")
    response = openai.Embedding.create(input=[flattened], model=model)
    first_item = response['data'][0]
    return first_item['embedding']

def get_multi_embeddings(texts: typing.List[str], model=EMBEDDING_MODEL) -> typing.List[typing.List[float]]:
    """Fetch embeddings for several texts with a single OpenAI API request.

    Newlines in each text are replaced with spaces before the request is sent.

    Args:
        texts: Strings to embed.
        model: Embedding model name passed to ``openai.Embedding.create``.

    Returns:
        One embedding vector per input text, in the same order as ``texts``.
        An empty input yields an empty list without calling the API.
    """
    if len(texts) == 0:
        # Nothing to embed; avoid sending an empty request.
        return []

    cleaned = [text.replace("\n", " ") for text in texts]
    response = openai.Embedding.create(input=cleaned, model=model)
    # Each response item carries the `index` of the input it belongs to; sort
    # on it so the output is guaranteed to line up with `texts`.
    ordered = sorted(response['data'], key=lambda item: item['index'])
    return [item['embedding'] for item in ordered]


In [None]:
# Smoke-test the batched call on the first ten verses. `get_multi_embeddings`
# returns a plain list of vectors (one per input text), so inspect it with
# len()/indexing; it has no `.data` attribute like a raw API response.
embeddings = get_multi_embeddings(list(df[:10].text.values))
# Show counts and a short preview instead of dumping every full vector.
len(embeddings), len(embeddings[0]), embeddings[0][:5]

In [None]:
clean_df.iloc[0]['tokens']

In [None]:
# test embeddings
emb = get_single_embedding(clean_df.iloc[0]['text'])
emb

In [None]:
from dataclasses import dataclass

@dataclass
class Row:
    """One verse record: book, chapter and verse numbers plus the verse text."""
    book: int  # book number (parsed with int() from the CSV `book` column)
    chapter: int  # chapter number within the book
    verse: int  # verse number within the chapter
    text: str  # verse text

@dataclass
class RowEmbeddings(Row):
    """A ``Row`` extended with the embedding vector computed for its text."""
    # Embedding of `text`, as returned by the OpenAI embedding helpers in this notebook.
    oai_embeddings: typing.List[float]

In [None]:
import csv
from pathlib import Path
from typing import Iterator

def _get_row(fp: Path) -> Iterator[Row]:
    """Lazily parse a verse CSV into ``Row`` objects, one per data line.

    The file must start with a header row naming the ``book``, ``chapter``,
    ``verse`` and ``text`` columns.

    Args:
        fp: Path of the CSV file to read.

    Yields:
        A ``Row`` for each record, with the three numeric columns cast to int.

    Raises:
        KeyError: If an expected column is missing.
        ValueError: If a numeric column cannot be parsed as an integer.
    """
    # newline='' is what the csv module requires so that newlines embedded in
    # quoted fields are parsed correctly. The explicit UTF-8 encoding (pandas'
    # default for CSV I/O) keeps the read independent of the platform locale.
    with fp.open(newline='', encoding='utf-8') as f:
        for record in csv.DictReader(f):
            yield Row(
                book=int(record['book']),
                chapter=int(record['chapter']),
                verse=int(record['verse']),
                text=record['text'],
            )

def generate_embeddings(fp: Path):
    """Yield a ``RowEmbeddings`` for every row of the CSV at ``fp``.

    Each row's text is embedded individually with ``get_single_embedding``,
    so this makes one embedding request per row.
    """
    for verse in _get_row(fp):
        vector = get_single_embedding(verse.text)
        yield RowEmbeddings(
            book=verse.book,
            chapter=verse.chapter,
            verse=verse.verse,
            text=verse.text,
            oai_embeddings=vector,
        )

def _read_row_batches(fp: Path, batch_size: int = 50) -> Iterator[typing.List[Row]]:
    """Read a verse CSV and yield its rows grouped into lists of ``batch_size``.

    The file must start with a header row naming the ``book``, ``chapter``,
    ``verse`` and ``text`` columns.

    Args:
        fp: Path of the CSV file to read.
        batch_size: Maximum number of rows per yielded list; must be >= 1.

    Yields:
        Lists of ``Row`` objects. Every list has exactly ``batch_size`` items
        except possibly the last one, which holds the remainder.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (raised when iteration
            starts), or if a numeric column cannot be parsed as an integer.
        KeyError: If an expected column is missing.
    """
    if batch_size < 1:
        # A non-positive size would never hit the flush condition below and
        # would silently return the whole file as one batch.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # newline='' is what the csv module requires so that newlines embedded in
    # quoted fields are parsed correctly. The explicit UTF-8 encoding (pandas'
    # default for CSV I/O) keeps the read independent of the platform locale.
    with fp.open(newline='', encoding='utf-8') as f:
        batch: typing.List[Row] = []
        for record in csv.DictReader(f):
            batch.append(Row(
                book=int(record['book']),
                chapter=int(record['chapter']),
                verse=int(record['verse']),
                text=record['text'],
            ))
            if len(batch) == batch_size:
                yield batch
                # Start a fresh list so the one just yielded is not mutated.
                batch = []

        # Flush the final, partially filled batch.
        if batch:
            yield batch

def generate_embeddings_batch(fp: Path, batch_size: int = 50) -> Iterator[typing.List[RowEmbeddings]]:
    """Embed the CSV at ``fp`` batch by batch.

    Rows are read in groups of ``batch_size`` and each group is embedded with
    a single ``get_multi_embeddings`` call.

    Yields:
        For each batch, a list of ``RowEmbeddings`` in the same order as the
        rows that were read.

    Raises:
        Exception: Any error raised while processing a batch is re-raised
            after its traceback and the offending batch have been printed.
    """
    for batch in _read_row_batches(fp, batch_size):
        try:
            texts = [row.text for row in batch]
            embs = get_multi_embeddings(texts)

            enriched = []
            for position, row in enumerate(batch):
                enriched.append(
                    RowEmbeddings(
                        book=row.book,
                        chapter=row.chapter,
                        verse=row.verse,
                        text=row.text,
                        oai_embeddings=embs[position],
                    )
                )
            yield enriched
        except Exception as e:
            import traceback
            print(traceback.format_exc())
            print(f"Problematic batch: {batch}")
            raise e



In [None]:
def append_to_parquet(fp: Path, data: typing.Any):
    """Write ``data`` to the parquet file at ``fp``, appending if it exists.

    Args:
        fp: Destination parquet file.
        data: Records convertible by ``pd.DataFrame`` into the columns
            ``book``, ``chapter``, ``verse``, ``text`` and ``oai_embeddings``.

    Returns:
        The DataFrame that was written.
    """
    column_names = ['book', 'chapter', 'verse', 'text', 'oai_embeddings']
    frame = pd.DataFrame(data, columns=column_names)
    # Append only when the target file is already on disk; otherwise create it.
    file_exists = os.path.isfile(fp)
    frame.to_parquet(
        fp,
        compression='gzip',
        engine="fastparquet",
        index=False,
        append=file_exists,
    )
    return frame

In [None]:
def write_to_parquet(csv_fp: Path, parquet_fp: Path):
    """Embed every row of ``csv_fp`` and persist the results to ``parquet_fp``.

    Batches are written one at a time through ``append_to_parquet``, which
    appends when ``parquet_fp`` already exists. Running this against an
    existing file therefore adds rows to it rather than replacing it.
    """
    embedded_batches = generate_embeddings_batch(csv_fp)
    for embedded in embedded_batches:
        append_to_parquet(parquet_fp, embedded)

In [None]:
write_to_parquet(
    csv_fp=Path("data/NIV_clean.csv"),
    parquet_fp=Path("data/NIV_clean.parquet")
)

### Loading the parquet file

In [None]:
pdf = pd.read_parquet('data/NIV_clean.parquet', engine='fastparquet')
pdf.head()

In [None]:
embs_tensors = torch.tensor(pdf['oai_embeddings'])
embs_tensors.shape

In [None]:
embs_tensors

In [None]:
# ada embeddings are normalized already
torch.functional.norm(embs_tensors[0, :])

In [None]:
embs_tensors[0, :].shape

In [None]:
query = get_single_embedding('trinity')
query

In [None]:
search_results = torch.matmul(embs_tensors, torch.tensor(query))
search_results

In [None]:
top_results = torch.topk(search_results, 10)
top_results

In [None]:
# torch.topk returns row *positions*, so select positionally with .iloc (and a
# plain Python list) rather than treating them as index labels via .loc.
text_results = pdf.iloc[top_results.indices.tolist()][['book', 'chapter', 'verse', 'text']]

In [None]:
[tuple(np_row) for np_row in list(text_results.values)]

In [None]:
def get_ada_vector(query: str) -> torch.Tensor:
    """Embed ``query`` with ``get_single_embedding`` and return it as a tensor."""
    embedding = get_single_embedding(query)
    return torch.tensor(embedding)

def get_search_results(query: str, embeddings: torch.Tensor, source: pd.DataFrame, k: int = 10, only_text = False):
    """Return the ``k`` rows of ``source`` most similar to ``query``.

    Args:
        query: Free-text search query; embedded via ``get_ada_vector``.
        embeddings: Matrix of corpus embeddings whose i-th row corresponds to
            the i-th row (by position) of ``source``.
        source: DataFrame holding ``book``, ``chapter``, ``verse`` and ``text``.
        k: Maximum number of results; clamped to the number of embeddings.
        only_text: If true, each result contains just the ``text`` column.

    Returns:
        A list of tuples, best match first, holding the selected columns.
    """
    query_vec = get_ada_vector(query)

    # cosine similarity: Ada embeddings are L2 normalized, so only require a dot product
    # between the query and embedding vectors.
    scores = torch.matmul(embeddings, query_vec)
    # topk raises if k exceeds the number of candidates, so clamp it.
    top = torch.topk(scores, min(k, scores.numel()))

    cols = ['text'] if only_text else ['book', 'chapter', 'verse', 'text']
    # topk indices are positions into `embeddings`, not index labels: use .iloc
    # so the lookup is correct even if `source` has a non-default index.
    hits = source.iloc[top.indices.tolist()][cols]
    return [tuple(np_row) for np_row in hits.values]



In [None]:
pdf.iloc[29258].text

In [None]:
res = get_search_results('what is the meaning of life according to Jesus?', embs_tensors, pdf, only_text=True)
res

In [None]:
res = get_search_results('what is the Trinity?', embs_tensors, pdf, only_text=True)


In [None]:
res

Generating sentence embeddings using `sentence_transformers`

In [None]:
from sentence_transformers import SentenceTransformer

ST_EMBEDDING_MODEL = "all-MiniLM-L6-v2"

In [None]:
model = SentenceTransformer(ST_EMBEDDING_MODEL)

In [None]:
minilm_embs = model.encode(pdf.text)
minilm_embs.shape

In [None]:
# test encode 
model.encode(list(pdf.text.values[:10])).tolist()

In [None]:
minilm_list = minilm_embs.tolist()
len(minilm_list), len(minilm_list[0])

In [None]:
pdf['minilm_embeddings'] = minilm_list

In [None]:
pdf['minilm_embeddings']

In [None]:
minilm_embs = torch.tensor(pdf['minilm_embeddings'])

In [None]:
torch.norm(minilm_embs[0])

In [None]:
import enum

# need to load the parquet file containing the embeddings
model = SentenceTransformer(ST_EMBEDDING_MODEL)

class EmbeddingType(enum.Enum):
    """Selects which embedding model (and stored embedding column) a search uses."""
    Ada = 'ada'  # OpenAI embeddings, stored in the `oai_embeddings` column
    miniLM = 'minilm'  # sentence-transformers embeddings, stored in `minilm_embeddings`

def _get_query_vec(query: str, emb_type: EmbeddingType):
    """Embed ``query`` with the model selected by ``emb_type``.

    Args:
        query: Free-text search query.
        emb_type: Which embedding model to use.

    Returns:
        A 1-D CPU tensor holding the query embedding.

    Raises:
        ValueError: If ``emb_type`` is not a known ``EmbeddingType`` member.
    """
    if emb_type is EmbeddingType.Ada:
        return torch.tensor(get_single_embedding(query))
    if emb_type is EmbeddingType.miniLM:
        # encode(convert_to_tensor=True) leaves the result on the model's
        # device; move it to the CPU so it can be multiplied with the corpus
        # tensor, which is built on the CPU from the DataFrame column.
        return model.encode(query, convert_to_tensor=True).cpu()
    # ValueError is still an Exception, so existing handlers keep working.
    raise ValueError(f"No such embedding: {emb_type}")

def _get_embeddings(emb_type: EmbeddingType):
    """Build the corpus embedding tensor for ``emb_type`` from the global ``pdf``.

    Args:
        emb_type: Which stored embedding column to load.

    Returns:
        A tensor created from the matching column of ``pdf``
        (``oai_embeddings`` for Ada, ``minilm_embeddings`` for miniLM).

    Raises:
        ValueError: If ``emb_type`` is not a known ``EmbeddingType`` member.
    """
    if emb_type is EmbeddingType.Ada:
        column = 'oai_embeddings'
    elif emb_type is EmbeddingType.miniLM:
        column = 'minilm_embeddings'
    else:
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError(f"No such embedding: {emb_type}")
    return torch.tensor(pdf[column])

def _get_search_results(query_vec, embeddings, source: pd.DataFrame, k: int = 10, only_text=False):
    results = torch.topk(torch.matmul(embeddings, query_vec), k)
    cols = ['text'] if only_text else ['book', 'chapter', 'verse', 'text']
    results = source.loc[results.indices][cols]
    return [tuple(np_row) for np_row in list(results.values)]

def search(query: str, emb_type: EmbeddingType = EmbeddingType.Ada, only_text: bool = False):
    """Search the verses in the global ``pdf`` for the best matches to ``query``.

    Args:
        query: Free-text search query.
        emb_type: Embedding model used for both the query and the corpus.
        only_text: If true, each result contains just the verse text.

    Returns:
        The result tuples produced by ``_get_search_results`` with its
        default ``k``.
    """
    return _get_search_results(
        query_vec=_get_query_vec(query, emb_type),
        embeddings=_get_embeddings(emb_type),
        source=pdf,
        only_text=only_text,
    )


In [None]:
query_str = "what is the meaning of life?"
ada_result = search(query_str, EmbeddingType.Ada, only_text=True)
ada_result

In [None]:
minilm_result = search(query_str, EmbeddingType.miniLM, only_text=True)
minilm_result