In [2]:
# Quietly (-q) install the OpenAI client, restricted to releases below 1.0.
# NOTE(review): the <1 pin is presumably needed because this notebook imports
# openai.embeddings_utils — confirm before relaxing it.
pip install -q 'openai<1'

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Quietly (-q) install numpy, used later to turn cached embeddings into arrays.
pip install numpy -q

Note: you may need to restart the kernel to use updated packages.


In [4]:
# Quietly (-q) install the plotting / scientific stack.
# NOTE(review): presumably required by openai.embeddings_utils rather than by
# the cells shown here — confirm which of these are actually needed.
pip install matplotlib scipy scikit-learn plotly -q

Note: you may need to restart the kernel to use updated packages.


In [5]:
from openai.embeddings_utils import get_embedding

In [8]:
def openai_authenticate(keyfile):
    """Read an OpenAI API key from a text file and set it on the ``openai`` module.

    Parameters
    ----------
    keyfile : str or path-like
        Path to a text file whose content is the API key.

    Raises
    ------
    AssertionError
        If the file content, once surrounding whitespace is trimmed, does not
        start with ``'sk-'``.
    OSError
        If the file cannot be opened.
    """
    import openai

    with open(keyfile, 'r') as f:
        # Trim every kind of surrounding whitespace (not only newlines) so a
        # key saved with stray spaces or tabs is neither rejected nor stored
        # with the extra characters attached.
        api_key = f.read().strip()

    assert api_key.startswith('sk-'), 'Error loading API key. API key should start with "sk-"'
    openai.api_key = api_key
# Load the API key stored in ./key.txt and set it on the openai module
openai_authenticate('./key.txt')

## Loading dataset

In [10]:
import pandas as pd

# Read the books CSV, discard every row that has a missing value, then keep
# the 2000 books with the highest average rating (best first).
df = (
    pd.read_csv('./books_dataset.csv')
    .dropna()
    .sort_values('average_rating', ascending=False)
    .head(2000)
)

# Preview the first rows of the selection
df.head()

Unnamed: 0,isbn13,title,authors,categories,description,published_year,average_rating
6738,9781932206081,Insights,Frederick Lenz,Spiritual life,"In 1983, when Rama - Dr. Frederick P. Lenz rec...",2003.0,5.0
5398,9780851621814,The Complete Theory Fun Factory,Katie Elliott;Ian Martin,Juvenile Nonfiction,(Boosey & Hawkes Scores/Books). Contains the m...,1996.0,5.0
5972,9781551052700,Ecuador Nature Guide,Christopher D. Jiggins,Botanique,The guide provides information on 76 species o...,2000.0,5.0
6671,9781890995522,The Diamond Color Meditation,John Diamond,Health & Fitness,The Diamond Color Meditation presents an inspi...,2006.0,5.0
4306,9780739844328,Bill Gates,Sara Barton-Wood,Juvenile Nonfiction,"Presents the life of Bill Gates, from his chil...",2001.0,5.0


## Embedding cost

In [12]:
import tiktoken

# Tokenizer associated with the embedding model used further down
enc = tiktoken.encoding_for_model('text-embedding-3-small')

# Count the tokens across all book descriptions
descriptions = df['description'].tolist()
total_tokens = sum(len(enc.encode(text)) for text in descriptions)
print(f'Total tokens: {total_tokens}')

# NOTE(review): 0.000020 is presumably the USD price per 1,000 tokens, hence
# the division by 1000 — confirm against current pricing.
cost = (0.000020/1000) * total_tokens
print(f'Estimated cost in USD: {cost:.10f}')

Total tokens: 165898
Estimated cost in USD: 0.0033179600


### Calculating the Embeddings and Caching Them Locally

In [13]:
def get_embedding_and_save_to_csv(embedding_cache_file, dataframe=None, engine='text-embedding-3-small'):
    """Embed every book description and write the resulting table to a CSV file.

    An ``'embedding'`` column is added in place to the frame being processed,
    then the whole frame (index included) is written to ``embedding_cache_file``.
    ``get_embedding`` is called once per row.

    Parameters
    ----------
    embedding_cache_file : str or path-like
        Destination CSV file; it is written unconditionally.
    dataframe : pandas.DataFrame, optional
        Frame with a ``'description'`` column. When omitted, the notebook-level
        ``df`` is used, which is what the original one-argument call did.
    engine : str, optional
        Embedding model name forwarded to ``get_embedding``.

    Returns
    -------
    None
    """
    from openai.embeddings_utils import get_embedding

    # Fall back to the notebook global so existing calls keep working unchanged.
    frame = df if dataframe is None else dataframe

    frame['embedding'] = frame['description'].apply(lambda text: get_embedding(text, engine=engine))

    frame.to_csv(embedding_cache_file)

In [14]:
# Embed every description in `df` and write the result to books_embedding.csv
get_embedding_and_save_to_csv('books_embedding.csv')

## Loading Embedding

In [18]:
import ast

import numpy as np

embedded_cache_file = 'books_embedding.csv'
df_embedding = pd.read_csv(embedded_cache_file)

# converting embeddings: str => numpy array
# The CSV stores each embedding as text. ast.literal_eval only parses Python
# literals, whereas eval would execute any expression found in the file, so a
# tampered cache file cannot run code here.
df_embedding['embedding'] = df_embedding['embedding'].apply(ast.literal_eval).apply(np.array)

In [19]:
# (rows, columns) of the embeddings frame loaded from the cache file
df_embedding.shape

(2000, 9)

## Get Recommendation from Title

In [20]:
from openai.embeddings_utils import distances_from_embeddings, indices_of_nearest_neighbors_from_distances

In [36]:
def recomendation_from_title(df_embedding,title,k):
    """Recommend up to ``k`` books whose embeddings are nearest to a given title's.

    Parameters
    ----------
    df_embedding : pandas.DataFrame
        Frame with ``'title'``, ``'description'`` and ``'embedding'`` columns.
    title : str
        Title to look up. Surrounding whitespace is ignored and the comparison
        is case-insensitive.
    k : int
        Maximum number of recommendations to return.

    Returns
    -------
    list of dict
        One dict per recommendation with the keys ``'title'``,
        ``'description'`` and ``'distance'``, nearest first. The list is empty
        when no row matches ``title`` (or when ``k`` is not positive).
    """
    from openai.embeddings_utils import distances_from_embeddings, indices_of_nearest_neighbors_from_distances

    wanted = title.strip().lower()

    # Positions (not index labels) of every row carrying the requested title.
    title_matches = df_embedding['title'].str.lower() == wanted
    matching_positions = [position for position, hit in enumerate(title_matches) if hit]

    # Unknown title: return an empty list so callers can report it, instead of
    # handing an empty vector to the distance computation.
    if not matching_positions:
        return []

    # When several rows share the title, use the first one as the reference
    # so the query is always a single vector.
    book_embedding = df_embedding['embedding'].iloc[matching_positions[0]]

    # getting all the embeddings into a Python list
    embeddings = list(df_embedding['embedding'])

    # computing the distance from the target embedding to all the other embeddings
    distances = distances_from_embeddings(book_embedding,embeddings)
    indices_of_nearest_neighbors = indices_of_nearest_neighbors_from_distances(distances)

    # Skip the queried title explicitly rather than assuming it is always the
    # first neighbour; distance ties or duplicate rows can break that assumption.
    excluded = set(matching_positions)
    recommendations = list()
    for index in indices_of_nearest_neighbors:
        if len(recommendations) >= k:
            break
        if index in excluded:
            continue

        row = df_embedding.iloc[index]
        book = dict()
        book['title'] = row['title']
        book['description'] = row['description']
        book['distance'] = distances[index]
        recommendations.append(book)

    return recommendations

In [39]:
# Ask for a title and look up its five nearest books
title = input('Enter Book\'s Title: ')
book_recommendations = recomendation_from_title(df_embedding, title, 5)

if not book_recommendations:
    # Nothing came back for this title
    print(f'Title {title} does not exist in the dataset.')
else:
    # One block per recommendation, separated by a row of '#'
    separator = '#' * 50
    for i, item in enumerate(book_recommendations):
        print(f'Book Recommendation #{i+1}, Distance: {item["distance"]}')
        print(f'Title: {item["title"]}')
        print(f'Description: {item["description"]}')
        print()
        print(separator)
        print()

Enter Book's Title:  animal


Book Recommendation #1, Distance: 0.5218509221951955
Title: Ecuador Nature Guide
Description: The guide provides information on 76 species of birds, plants, mammals and insects of Ecuador. Each species description is accompanied by an illustration as well as information on ecology, local names and uses. Profits from the sale of this guide will go

##################################################

Book Recommendation #2, Distance: 0.5600394732364364
Title: Under the Sea Wind
Description: Describes the sea birds and sea creatures that inhabit the Eastern coasts of North America.

##################################################

Book Recommendation #3, Distance: 0.5676764344265837
Title: Bird Songs
Description: Drawing from the collection of the world-renowned Macaulay Library at the Cornell Lab of Ornithology, Bird Songs presents the most notable North American birdsincluding the rediscovered Ivory-billed Woodpeckerin a stunning new format. Renowned bird biologist Les Beletsky provi