In [1]:
import typing
import pandas as pd
import numpy as np

In [4]:
BIBLE_DATA = {
    'NIV',
    'NKJV'
}

In [18]:
# http://my-bible-study.appspot.com
df = pd.read_csv(
    'data/NIV_fixed.csv', 
    sep=',', 
    escapechar='\\', 
    names=['book', 'chapter', 'verse', 'text']
)

In [4]:
df.head()

Unnamed: 0,book,chapter,verse,text
0,1,1,1,In the beginning God created the heavens and t...
1,1,1,2,"Now the earth was formless and empty, darkness..."
2,1,1,3,"And God said, ""Let there be light,"" and there ..."
3,1,1,4,"God saw that the light was good, and He separa..."
4,1,1,5,"God called the light ""day,"" and the darkness h..."


In [5]:
df.text.values

array(['In the beginning God created the heavens and the earth.',
       'Now the earth was formless and empty, darkness was over the surface of the deep, and the Spirit of God was hovering over the waters.',
       'And God said, "Let there be light," and there was light.', ...,
       'And if anyone takes words away from this book of prophecy, God will take away from him his share in the tree of life and in the holy city, which are described in this book.',
       'He who testifies to these things says, "Yes, I am coming soon." Amen. Come, Lord Jesus.',
       "The grace of the Lord Jesus be with God's people. Amen."],
      dtype=object)

Estimate cost of using OpenAI's Embedding model `text-embedding-ada-002`.

Ada uses the `cl100k_base` encoding.

In [9]:
import tiktoken
EMBEDDING_MODEL = "text-embedding-ada-002"
ENCODING = tiktoken.encoding_for_model(EMBEDDING_MODEL)
ENCODING_NAME = "cl100k_base"


In [6]:
def num_tokens_from_string(string: str, encoding_name: str = ENCODING_NAME) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

def num_tokens_from_row(row: typing.Dict):
    return num_tokens_from_string(row['text'])

In [8]:
# testing the number of tokens for a verse
print(df['text'][10], num_tokens_from_string(df['text'][10]))


Then God said, "Let the land produce vegetation: seed-bearing plants and trees on the land that bear fruit with seed in it, according to their various kinds." And it was so. 38


In [9]:
# NIV has missing verses, see https://en.wikipedia.org/wiki/List_of_New_Testament_verses_not_included_in_modern_English_translations
# clean first before tokenizing to avoid errors
clean_df = df.dropna() 
len(clean_df)

31084

In [10]:
clean_df['tokens'] = clean_df.apply(num_tokens_from_row, axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df['tokens'] = clean_df.apply(num_tokens_from_row, axis=1)


In [11]:
np.mean(clean_df.tokens) # ~29 tokens per verse

28.779211169733625

In [33]:
# Ada embedding model pricing: https://openai.com/api/pricing/
ADA_PRICING_PER_TOKEN = 0.0004 # for every 1k token
total_cost = sum(clean_df.tokens) / 1000 * ADA_PRICING_PER_TOKEN
total_cost # $0.36 cents to generate embeddings for the entire Bible?!

0.3578292

In [61]:
# need to save the clean df correctly with escaped double quotes
# clean_df.to_csv('data/NIV_clean.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df['text'] = clean_df.text.replace('"', '\\"')


Get embeddings from Ada

In [7]:
# source .env file from project directory
%load_ext dotenv
%dotenv

In [10]:

import os
import openai

openai.api_key = os.getenv("OPENAI_API_KEY")
def get_single_embedding(text: str, model=EMBEDDING_MODEL) -> typing.List[float]:
   text = text.replace("\n", " ")
   return openai.Embedding.create(input = [text], model=model)['data'][0]['embedding']

def get_multi_embeddings(texts: typing.List[str], model=EMBEDDING_MODEL) -> typing.List[typing.List[float]]:
   texts = [text.replace("\n", " ") for text in texts]
   return [data['embedding'] for data in openai.Embedding.create(input=texts, model=model)['data']]


In [22]:
embeddings = get_multi_embeddings(list(df[:10].text.values))
embeddings, len(embeddings.data), embeddings.data[0]['embedding']

<OpenAIObject list at 0x12e05ac00> JSON: {
  "data": [
    {
      "embedding": [
        0.029202710837125778,
        -0.012828509323298931,
        -0.03484134376049042,
        -0.002525381976738572,
        -0.01586943306028843,
        0.021397264674305916,
        -0.030015265569090843,
        -0.007688487879931927,
        0.0033117744605988264,
        -0.010027659125626087,
        0.019969139248132706,
        0.03838703781366348,
        -0.003542613936588168,
        -0.009098146110773087,
        -0.007700799033045769,
        0.021902034059166908,
        0.014219701290130615,
        0.0037919203750789165,
        0.00779313500970602,
        0.009178170934319496,
        -0.0008079375838860869,
        0.014650600962340832,
        0.018097801133990288,
        0.006321919150650501,
        -0.007214497774839401,
        0.003970435820519924,
        0.018097801133990288,
        -0.020658578723669052,
        0.024807531386613846,
        -0.018344029784202576,
     

In [55]:
clean_df.iloc[0]['tokens']

11

In [17]:
# test embeddings
emb = get_single_embedding(clean_df.iloc[0]['text'])
emb

[0.029221996665000916,
 -0.012838476337492466,
 -0.03483498468995094,
 -0.0024941477458924055,
 -0.015915771946310997,
 0.02139335684478283,
 -0.030009783804416656,
 -0.007730165962129831,
 0.0033019378315657377,
 -0.0100135188549757,
 0.019990110769867897,
 0.03830617293715477,
 -0.0035511988680809736,
 -0.009090330451726913,
 -0.007687083911150694,
 0.021922651678323746,
 0.014217104762792587,
 0.0037943050265312195,
 0.007804020773619413,
 0.009121103212237358,
 -0.0008077900274656713,
 0.01462330762296915,
 0.018143733963370323,
 0.006345382891595364,
 -0.007213180419057608,
 0.003975865431129932,
 0.018094496801495552,
 -0.02065480686724186,
 0.02477838099002838,
 -0.01834068074822426,
 -0.015091056004166603,
 -0.012241480872035027,
 -0.01149062067270279,
 -0.017355944961309433,
 -0.012887712568044662,
 -0.011361374519765377,
 -0.012727693654596806,
 -0.004400532227009535,
 -0.0011747574899345636,
 0.00017367485270369798,
 0.011219819076359272,
 0.02737561985850334,
 0.01706052571

In [38]:
from dataclasses import dataclass

@dataclass
class Row:
    book: int
    chapter: int
    verse: int
    text: str

@dataclass
class RowEmbeddings(Row):
    oai_embeddings: typing.List[float]

In [117]:
import csv
from pathlib import Path
from typing import Iterator

def _get_row(fp: Path) -> Iterator[Row]:
    with fp.open() as f:
        for row in csv.DictReader(f):
            yield Row(
                int(row['book']),
                int(row['chapter']),
                int(row['verse']),
                row['text']
            )

def generate_embeddings(fp: Path):
    for row in _get_row(fp):
        yield RowEmbeddings(
            row.book,
            row.chapter,
            row.verse,
            row.text,
            get_single_embedding(row.text)
        )

def _read_row_batches(fp: Path, batch_size: int = 50) -> Iterator[typing.List[Row]]:
    with fp.open() as f:
        batches = []
        for row in csv.DictReader(f):
            batches.append(Row(
                int(row['book']),
                int(row['chapter']),
                int(row['verse']),
                row['text']
            ))
            if len(batches) == batch_size:
                yield batches
                batches = []
        
        if batches:
            yield batches

def generate_embeddings_batch(fp: Path, batch_size: int = 50) -> Iterator[typing.List[RowEmbeddings]]:
    for batch in _read_row_batches(fp, batch_size):
        try:
            embs = get_multi_embeddings([row.text for row in batch])
            yield [RowEmbeddings(
                row.book,
                row.chapter,
                row.verse,
                row.text,
                embs[idx]
            ) for idx, row in enumerate(batch)]
        except Exception as e:
            import traceback
            print(traceback.format_exc())
            print(f"Problematic batch: {batch}")
            raise e



In [119]:
def append_to_parquet(fp: Path, data: typing.Any):
    df = pd.DataFrame(data, columns=['book', 'chapter', 'verse', 'text', 'oai_embeddings'])
    df.to_parquet(fp, compression='gzip', engine="fastparquet", index=False, append=os.path.isfile(fp))
    return df

In [120]:
def write_to_parquet(csv_fp: Path, parquet_fp: Path):
    for batch in generate_embeddings_batch(csv_fp):
        append_to_parquet(
            parquet_fp, 
            batch,
        )

In [121]:
write_to_parquet(
    csv_fp=Path("data/NIV_clean.csv"),
    parquet_fp=Path("data/NIV_clean.parquet")
)

### Loading the parquet file

In [2]:
pdf = pd.read_parquet('data/NIV_clean.parquet', engine='fastparquet')
pdf.head()

Unnamed: 0,book,chapter,verse,text,oai_embeddings
0,1,1,1,In the beginning God created the heavens and t...,"[0.029244348406791687, -0.012874405831098557, ..."
1,1,1,2,"Now the earth was formless and empty, darkness...","[0.009989847429096699, -0.030647028237581253, ..."
2,1,1,3,"And God said, ""Let there be light,"" and there ...","[0.010979833081364632, -0.023425232619047165, ..."
3,1,1,4,"God saw that the light was good, and He separa...","[0.027825910598039627, -0.011852935887873173, ..."
4,1,1,5,"God called the light ""day,"" and the darkness h...","[0.01728380285203457, -0.02279316633939743, -0..."


In [3]:
import torch

embs_tensors = torch.tensor(pdf['oai_embeddings'])
embs_tensors.shape

  from .autonotebook import tqdm as notebook_tqdm


torch.Size([31084, 1536])

In [8]:
embs_tensors

tensor([[ 0.0292, -0.0129, -0.0348,  ...,  0.0045,  0.0025, -0.0231],
        [ 0.0100, -0.0306, -0.0135,  ..., -0.0106, -0.0038, -0.0163],
        [ 0.0110, -0.0234, -0.0285,  ..., -0.0292, -0.0063, -0.0275],
        ...,
        [-0.0048, -0.0382, -0.0095,  ...,  0.0013, -0.0028, -0.0179],
        [-0.0117, -0.0412, -0.0190,  ..., -0.0197,  0.0132, -0.0150],
        [ 0.0045, -0.0130,  0.0089,  ..., -0.0228,  0.0196, -0.0110]])

In [6]:
# ada embeddings are normalized already
torch.functional.norm(embs_tensors[0, :])

tensor(1.0000)

In [5]:
embs_tensors[0, :].shape

torch.Size([1536])

In [11]:
query = get_single_embedding('trinity')
query

[-3.7161233194638044e-05,
 -0.009198221378028393,
 -0.010157083161175251,
 -0.01956077478826046,
 -0.015971893444657326,
 0.01528699230402708,
 -0.016999244689941406,
 -0.01819097250699997,
 -0.01010913960635662,
 -0.020040206611156464,
 0.006201779469847679,
 0.023533200845122337,
 0.005441538989543915,
 -0.009355749003589153,
 0.004962108563631773,
 0.00867769680917263,
 0.018930666148662567,
 -0.02619061805307865,
 0.015971893444657326,
 -0.016862263903021812,
 5.71464333916083e-05,
 0.01413635816425085,
 -0.0072941966354846954,
 0.0012850456405431032,
 -0.004503224510699511,
 -0.013307628221809864,
 0.006027129478752613,
 -0.02771109715104103,
 0.002167711965739727,
 -0.003862842218950391,
 0.020916879177093506,
 0.0035820326302200556,
 -0.03583402559161186,
 -0.012129598297178745,
 -2.6513163902563974e-05,
 -0.012396709993481636,
 -0.01270491536706686,
 0.0020889483857899904,
 -0.0006228319252841175,
 0.025231756269931793,
 0.031149301677942276,
 0.0066161444410681725,
 -0.0059415

In [12]:
search_results = torch.matmul(embs_tensors, torch.tensor(query))
search_results

tensor([0.7954, 0.7960, 0.7920,  ..., 0.7758, 0.7771, 0.8018])

In [14]:
top_results = torch.topk(search_results, 10)
top_results

torch.return_types.topk(
values=tensor([0.8485, 0.8439, 0.8431, 0.8352, 0.8282, 0.8271, 0.8258, 0.8254, 0.8245,
        0.8240]),
indices=tensor([30614, 29259, 29260, 28515, 30630, 29267, 28292, 28233, 30819, 30679]))

In [35]:
text_results = pdf.loc[top_results.indices][['book', 'chapter', 'verse', 'text']]

In [47]:
[tuple(np_row) for np_row in list(text_results.values)]

[(62,
  5,
  8,
  'the Spirit, the water and the blood; and the three are in agreement.'),
 (49, 4, 5, 'one Lord, one faith, one baptism;'),
 (49,
  4,
  6,
  'one God and Father of all, who is over all and through all and in all.'),
 (46,
  8,
  6,
  'yet for us there is but one God, the Father, from whom all things came and for whom we live; and there is but one Lord, Jesus Christ, through whom all things came and through whom we live.'),
 (63,
  1,
  3,
  "Grace, mercy and peace from God the Father and from Jesus Christ, the Father's Son, will be with us in truth and love."),
 (49,
  4,
  13,
  'until we all reach unity in the faith and in the knowledge of the Son of God and become mature, attaining to the whole measure of the fullness of Christ.'),
 (45,
  15,
  6,
  'so that with one heart and mouth you may glorify the God and Father of our Lord Jesus Christ.'),
 (45,
  12,
  5,
  'so in Christ we who are many form one body, and each member belongs to all the others.'),
 (66,
  8,

In [60]:
def get_search_results(query: str, embeddings: torch.Tensor, source: pd.DataFrame, k: int = 10, only_text = False):
    query_vec = torch.tensor(get_single_embedding(query))

    # cosine similarity: Ada embeddings are L2 normalized, so only require a dot product
    # between the query and embedding vectors.
    results = torch.topk(torch.matmul(embeddings, query_vec), k)

    cols = ['text'] if only_text else ['book', 'chapter', 'verse', 'text']
    results = source.loc[results.indices][cols]
    return [tuple(np_row) for np_row in list(results.values)]


In [31]:
pdf.iloc[29258].text

'There is one body and one Spirit--just as you were called to one hope when you were called--'

In [61]:
res = get_search_results('what is the meaning of life according to Jesus?', embs_tensors, pdf, only_text=True)
res

[('Jesus answered, "If I want him to remain alive until I return, what is that to you? You must follow me."',),
 ('Now a man came up to Jesus and asked, "Teacher, what good thing must I do to get eternal life?"',),
 ('Jesus replied: " \'Love the Lord your God with all your heart and with all your soul and with all your mind.\'',),
 ('This is how we know what love is: Jesus Christ laid down his life for us. And we ought to lay down our lives for our brothers.',),
 ('When Jesus spoke again to the people, he said, "I am the light of the world. Whoever follows me will never walk in darkness, but will have the light of life."',),
 ('For to me, to live is Christ and to die is gain.',),
 ('"You have answered correctly," Jesus replied. "Do this and you will live."',),
 ('Then Jesus said to them, "I ask you, which is lawful on the Sabbath: to do good or to do evil, to save life or to destroy it?"',),
 ('Jesus said to them, "I tell you the truth, unless you eat the flesh of the Son of Man and dr

In [62]:
res = get_search_results('what is the Trinity?', embs_tensors, pdf, only_text=True)


[(49,
  4,
  6,
  'one God and Father of all, who is over all and through all and in all.'),
 (46,
  8,
  6,
  'yet for us there is but one God, the Father, from whom all things came and for whom we live; and there is but one Lord, Jesus Christ, through whom all things came and through whom we live.'),
 (49, 4, 5, 'one Lord, one faith, one baptism;'),
 (62,
  5,
  8,
  'the Spirit, the water and the blood; and the three are in agreement.'),
 (49,
  5,
  32,
  'This is a profound mystery--but I am talking about Christ and the church.'),
 (43, 10, 30, 'I and the Father are one."'),
 (46, 3, 23, 'and you are of Christ, and Christ is of God.'),
 (62,
  5,
  20,
  'We know also that the Son of God has come and has given us understanding, so that we may know him who is true. And we are in him who is true--even in his Son Jesus Christ. He is the true God and eternal life.'),
 (66,
  1,
  8,
  '"I am the Alpha and the Omega," says the Lord God, "who is, and who was, and who is to come, the Alm