In [1]:
import openai
import pandas as pd
import numpy as np

In [3]:
def openai_authenticate():
    """Load the OpenAI API key from the local ``.env`` file and set it on ``openai``.

    The key is read from the ``OPEN_API_KEY`` entry of ``.env`` and is only
    assigned to ``openai.api_key`` after it has passed validation, so a bad
    value is never left configured on the module.

    Raises:
        KeyError: if ``.env`` has no ``OPEN_API_KEY`` entry.
        AssertionError: if the value is missing or does not start with ``sk-``.
    """
    from dotenv import dotenv_values
    config = dotenv_values(".env")
    # NOTE(review): the entry is spelled OPEN_API_KEY (not OPENAI_API_KEY);
    # confirm this matches the name used in the .env file.
    api_key = config['OPEN_API_KEY']
    # A key declared without a value is loaded as None, so check the type
    # before calling startswith. Raised explicitly (instead of a bare assert)
    # so the check still runs when Python is started with -O.
    if not (isinstance(api_key, str) and api_key.startswith('sk-')):
        raise AssertionError('Error loading the API key. The API key should start with "sk-"')
    openai.api_key = api_key

# Set openai.api_key from the .env file; raises if the key is missing or malformed.
openai_authenticate()

### 1. Loading the dataset into Pandas DataFrame

In [5]:
# Read the books dataset from a CSV file in the current working directory.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, presumably a
# row index that was written into the CSV - confirm whether it is needed.
df = pd.read_csv('books_dataset.csv')
# Bare expression so the notebook renders the (truncated) DataFrame.
df

Unnamed: 0,isbn13,title,authors,categories,description,published_year,average_rating
0,9780002005883,Gilead,Marilynne Robinson,Fiction,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85
1,9780002261982,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83
2,9780006163831,The One Tree,Stephen R. Donaldson,American fiction,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97
3,9780006178736,Rage of angels,Sidney Sheldon,Fiction,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93
4,9780006280897,The Four Loves,Clive Staples Lewis,Christian life,Lewis' work on the nature of love divides love...,2002.0,4.15
...,...,...,...,...,...,...,...
6805,9788185300535,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,This collection of the timeless teachings of o...,1999.0,4.51
6806,9788185944609,Secrets Of The Heart,Khalil Gibran,Mysticism,,1993.0,4.08
6807,9788445074879,Fahrenheit 451,Ray Bradbury,Book burning,,2004.0,3.98
6808,9789027712059,The Berlin Phenomenology,Georg Wilhelm Friedrich Hegel,History,Since the three volume edition ofHegel's Philo...,1981.0,0.00


In [9]:
# Drop every row that has a missing value in any column, then keep the
# 2000 rows with the highest average_rating. Written as one non-mutating
# chain (no inplace=True) so the frame loaded above is not modified in place.
df = (
    df.dropna()
      .sort_values('average_rating', ascending=False)
      .head(2000)
)
df

Unnamed: 0,isbn13,title,authors,categories,description,published_year,average_rating
6738,9781932206081,Insights,Frederick Lenz,Spiritual life,"In 1983, when Rama - Dr. Frederick P. Lenz rec...",2003.0,5.00
4284,9780738539560,Lake Orion,James E. Ingram;Lori Grove,History,"Orion Township, established in 1835, became a ...",2006.0,5.00
3580,9780567044716,Colossians and Philemon,Robert McL Wilson,Religion,For over one hundred years International Criti...,2005.0,5.00
4306,9780739844328,Bill Gates,Sara Barton-Wood,Juvenile Nonfiction,"Presents the life of Bill Gates, from his chil...",2001.0,5.00
5398,9780851621814,The Complete Theory Fun Factory,Katie Elliott;Ian Martin,Juvenile Nonfiction,(Boosey & Hawkes Scores/Books). Contains the m...,1996.0,5.00
...,...,...,...,...,...,...,...
4871,9780786809943,The Final Battle,Mary Pope Osborne,Juvenile Fiction,After struggling against the gods and his fate...,2005.0,4.08
4720,9780765309969,Blade of Fortriu,Juliet Marillier,Fiction,As King Bridei prepares to expel the Gaelic in...,2006.0,4.08
1701,9780330340199,In Pharaoh's Army,Tobias Wolff,"Authors, American",Having survived the extraordinary childhood re...,1995.0,4.08
1066,9780143039853,The Outsiders,S. E. Hinton;Jodi Picoult,Fiction,The struggle of three brothers to stay togethe...,1967.0,4.08


### 2. Embedding Cost Calculation

In [16]:
# Uncomment to install tiktoken if it is missing: !pip install tiktoken -q
import tiktoken
# Tokenizer matching the embedding model, used below to count tokens.
enc = tiktoken.encoding_for_model('text-embedding-ada-002')

In [18]:
# Estimate what embedding every description will cost before calling the API.
descriptions = df['description'].tolist()

# Token count per description, summed lazily without building a temporary list.
total_tokens = sum(len(token_ids) for token_ids in map(enc.encode, descriptions))
print(f'Total tokens: {total_tokens}')

# Rate used by this estimate: 0.0004 USD per 1,000 tokens.
# NOTE(review): confirm this against the current price of the embedding model.
usd_per_token = 0.0004/1000
cost = total_tokens * usd_per_token
print(f'Estimated cost in USD: {cost:.10f}')

Total tokens: 166700
Estimated cost in USD: 0.0666800000


### 3. Calculate the embeddings and cache them locally

In [26]:
def get_embeddings_and_save(embedding_cache_file):
    """Embed every book description and write the result to a CSV file.

    Operates on the notebook-level ``df``: ``get_embedding`` is called once
    per value of ``df['description']`` with the ``text-embedding-ada-002``
    engine, the results are stored in a new ``df['embedding']`` column, and
    the whole frame is then written to ``embedding_cache_file``.

    Args:
        embedding_cache_file: path of the CSV file to write.
    """
    from openai.embeddings_utils import get_embedding

    def _embed(text):
        # One embedding request per description.
        return get_embedding(text, engine='text-embedding-ada-002')

    embeddings = df['description'].apply(_embed)
    df['embedding'] = embeddings
    df.to_csv(embedding_cache_file)

In [None]:
# CSV file that receives the DataFrame together with its 'embedding' column.
embedding_cache_file = 'book_embeddings.csv'
# Requests an embedding for every description in df, then writes the file.
get_embeddings_and_save(embedding_cache_file)