In [10]:
import typing
import pandas as pd

In [2]:
BIBLE_DATA = {
    'NIV',
    'NKJV'
}

In [3]:
# http://my-bible-study.appspot.com
df = pd.read_csv(
    'data/NIV_fixed.csv', 
    sep=',', 
    escapechar='\\', 
    names=['book', 'chapter', 'verse', 'text']
)

In [4]:
df.head()

Unnamed: 0,book,chapter,verse,text
0,1,1,1,In the beginning God created the heavens and t...
1,1,1,2,"Now the earth was formless and empty, darkness..."
2,1,1,3,"And God said, ""Let there be light,"" and there ..."
3,1,1,4,"God saw that the light was good, and He separa..."
4,1,1,5,"God called the light ""day,"" and the darkness h..."


In [5]:
df.text.values

array(['In the beginning God created the heavens and the earth.',
       'Now the earth was formless and empty, darkness was over the surface of the deep, and the Spirit of God was hovering over the waters.',
       'And God said, "Let there be light," and there was light.', ...,
       'And if anyone takes words away from this book of prophecy, God will take away from him his share in the tree of life and in the holy city, which are described in this book.',
       'He who testifies to these things says, "Yes, I am coming soon." Amen. Come, Lord Jesus.',
       "The grace of the Lord Jesus be with God's people. Amen."],
      dtype=object)

Estimate cost of using OpenAI's Embedding model `text-embedding-ada-002`.

Ada uses the `cl100k_base` encoding.

In [35]:
import tiktoken

ENCODING = tiktoken.encoding_for_model('text-embedding-ada-002')
ENCODING_NAME = "cl100k_base"


<Encoding 'cl100k_base'>

In [11]:
def num_tokens_from_string(string: str, encoding_name: str = ENCODING_NAME) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

def num_tokens_from_row(row: typing.Dict):
    return num_tokens_from_string(row['text'])

In [28]:
# testing the number of tokens for a verse
print(df['text'][10], num_tokens_from_string(df['text'][10]))


Then God said, "Let the land produce vegetation: seed-bearing plants and trees on the land that bear fruit with seed in it, according to their various kinds." And it was so. 38


In [27]:
# NIV has missing verses, see https://en.wikipedia.org/wiki/List_of_New_Testament_verses_not_included_in_modern_English_translations
# clean first before tokenizing to avoid errors
clean_df = df.dropna() 
len(clean_df)

31084

In [29]:
clean_df['tokens'] = clean_df.apply(num_tokens_from_row, axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df['tokens'] = clean_df.apply(num_tokens_from_row, axis=1)


Unnamed: 0,book,chapter,verse,text
0,1,1,1,In the beginning God created the heavens and t...
1,1,1,2,"Now the earth was formless and empty, darkness..."
2,1,1,3,"And God said, ""Let there be light,"" and there ..."
3,1,1,4,"God saw that the light was good, and He separa..."
4,1,1,5,"God called the light ""day,"" and the darkness h..."


In [33]:
# Ada embedding model pricing: https://openai.com/api/pricing/
ADA_PRICING_PER_TOKEN = 0.0004 # for every 1k token
total_cost = sum(clean_df.tokens) / 1000 * ADA_PRICING_PER_TOKEN
total_cost

0.3578292