# Books Recommendation System

### Install Required Libraries

In [1]:
!pip install -q openai
!pip install -q pandas
!pip install -q numpy

In [142]:
!pip install --upgrade openai



### Import Libraries

In [3]:
import openai
import pandas as pd
import numpy as np

In [143]:
from openai import OpenAI
client = OpenAI()

### Load Environment Variables

In [144]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

os.environ.get('OPENAI_API_KEY')

print("API Key Loaded:", os.environ.get('OPENAI_API_KEY') is not None)

API Key Loaded: True


### Load Dataset into Pandas DataFrame

In [145]:
df = pd.read_csv('./data.csv')
df.dropna(inplace=True)
df = df.sort_values('average_rating', ascending=False).head(2000)
df
# df.iloc[1581]

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count
4306,9780739844328,0739844326,Bill Gates,Computer Legend,Sara Barton-Wood,Juvenile Nonfiction,http://books.google.com/books/content?id=Fr814...,"Presents the life of Bill Gates, from his chil...",2001.0,5.00,48.0,0.0
5972,9781551052700,1551052709,Ecuador Nature Guide,Southwest Forests : Sozoranga Forest Project,Christopher D. Jiggins,Botanique,http://books.google.com/books/content?id=1JjbG...,The guide provides information on 76 species o...,2000.0,5.00,96.0,1.0
6720,9781930901353,1930901356,The Irish Anatomist,A Study of Flann O'Brien,Keith Donohue,Biography & Autobiography,http://books.google.com/books/content?id=baEJA...,The most full length critical and biographical...,2002.0,5.00,222.0,1.0
6738,9781932206081,1932206086,Insights,Talks on the Nature of Existence,Frederick Lenz,Spiritual life,http://books.google.com/books/content?id=NOX2P...,"In 1983, when Rama - Dr. Frederick P. Lenz rec...",2003.0,5.00,304.0,1.0
6671,9781890995522,1890995525,The Diamond Color Meditation,Color Pathway to the Soul,John Diamond,Health & Fitness,http://books.google.com/books/content?id=1ChsH...,The Diamond Color Meditation presents an inspi...,2006.0,5.00,74.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...
974,9780142003299,0142003298,Octopussy,And the Living Daylights,Ian Fleming,Fiction,http://books.google.com/books/content?id=B5vGA...,"The legendary 007 deals with a rich, deranged ...",2004.0,3.55,120.0,4801.0
6197,9781578068012,1578068010,Daisy Bates,Civil Rights Crusader from Arkansas,Grif Stockley,Biography & Autobiography,http://books.google.com/books/content?id=rsd_m...,Daisy Bates (1914-1999) is renowned as the men...,2005.0,3.55,340.0,11.0
3645,9780586203194,0586203192,2061,Odyssey Three,Arthur C. Clarke,Computers,http://books.google.com/books/content?id=uSj0D...,Science fiction-roman.,1997.0,3.55,302.0,20216.0
5792,9781401200688,1401200680,On the Road to Perdition,Oasis,Max Allan Collins;Jose Luis Garcia-Lopez,Comics & Graphic Novels,http://books.google.com/books/content?id=VqrUX...,"Renegade mob hitman Michael O'Sullivan--the ""A...",2003.0,3.54,96.0,74.0


### Embedding Cost Calculation

In [150]:
import tiktoken
enc = tiktoken.encoding_for_model('text-embedding-3-small')
descriptions = list(df['description'])
total_tokens = sum([len(enc.encode(item)) for item in descriptions])
print(f'Total tokens: {total_tokens}')
cost = total_tokens * (0.0004/1000)
print(f'Estimated cost in USD: {cost:.10f}')

Total tokens: 188119
Estimated cost in USD: 0.0752476000


### Calculate the embeddings and cache them locally

In [51]:
def get_embedding(text, model="text-embedding-3-small"):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding

In [52]:
def get_embeddings_and_save_to_csv(embedding_cache_file):
    df['embedding']=df['description'].apply(lambda x: get_embedding(x, model='text-embedding-3-small'))
    df.to_csv(embedding_cache_file)

In [128]:
embedding_cache_file = 'book_embeddings.csv'
get_embeddings_and_save_to_csv(embedding_cache_file)

In [130]:
# an example of how the embeddings look without visualization
print(list(df_embeddings['embedding'].iloc[0]))

[0.05363256856799126, -0.018259329721331596, 0.01940474845468998, 0.06297559291124344, -0.009028598666191101, -0.08247017860412598, 0.04923056438565254, 0.06293068081140518, -0.0005979762063361704, 0.03085894137620926, 0.02757989801466465, 0.028141377493739128, 0.012172886170446873, -0.05053320154547691, 0.04521036893129349, 0.048107605427503586, -0.05188075080513954, 0.005471622571349144, -0.012958958745002747, 0.011358740739524364, 0.016653496772050858, 0.028298592194914818, 0.017978589981794357, 0.021257633343338966, -0.006198739167302847, -0.002664222614839673, 0.042650021612644196, 0.03270059451460838, 0.014912908896803856, 0.010134713724255562, -0.02107795886695385, -0.002502797171473503, 0.05206042528152466, 0.02032557502388954, 0.03827047720551491, -0.004463765770196915, -0.01889941655099392, -0.010241394862532616, 0.007304854691028595, 0.011179067194461823, -0.02735530585050583, -0.008573799394071102, 0.00023933085321914405, 0.029421551153063774, 0.0509374663233757, -0.0300279

### Load the Embeddings from the CSV

In [57]:
embedding_cache_file = 'book_embeddings.csv'
df_embeddings = pd.read_csv(embedding_cache_file)
df_embeddings['embedding'] = df_embeddings['embedding'].apply(eval).apply(np.array)
df_embeddings

Unnamed: 0.1,Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,embedding
0,4306,9780739844328,0739844326,Bill Gates,Computer Legend,Sara Barton-Wood,Juvenile Nonfiction,http://books.google.com/books/content?id=Fr814...,"Presents the life of Bill Gates, from his chil...",2001.0,5.00,48.0,0.0,"[0.05363256856799126, -0.018259329721331596, 0..."
1,5972,9781551052700,1551052709,Ecuador Nature Guide,Southwest Forests : Sozoranga Forest Project,Christopher D. Jiggins,Botanique,http://books.google.com/books/content?id=1JjbG...,The guide provides information on 76 species o...,2000.0,5.00,96.0,1.0,"[0.060896094888448715, -0.010619226843118668, ..."
2,6720,9781930901353,1930901356,The Irish Anatomist,A Study of Flann O'Brien,Keith Donohue,Biography & Autobiography,http://books.google.com/books/content?id=baEJA...,The most full length critical and biographical...,2002.0,5.00,222.0,1.0,"[0.03172249346971512, 0.03664880990982056, -0...."
3,6738,9781932206081,1932206086,Insights,Talks on the Nature of Existence,Frederick Lenz,Spiritual life,http://books.google.com/books/content?id=NOX2P...,"In 1983, when Rama - Dr. Frederick P. Lenz rec...",2003.0,5.00,304.0,1.0,"[-0.011502875946462154, -0.009286032989621162,..."
4,6671,9781890995522,1890995525,The Diamond Color Meditation,Color Pathway to the Soul,John Diamond,Health & Fitness,http://books.google.com/books/content?id=1ChsH...,The Diamond Color Meditation presents an inspi...,2006.0,5.00,74.0,5.0,"[0.004097896162420511, 0.004661967046558857, 0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,974,9780142003299,0142003298,Octopussy,And the Living Daylights,Ian Fleming,Fiction,http://books.google.com/books/content?id=B5vGA...,"The legendary 007 deals with a rich, deranged ...",2004.0,3.55,120.0,4801.0,"[-0.027058351784944534, 0.019854001700878143, ..."
1996,6197,9781578068012,1578068010,Daisy Bates,Civil Rights Crusader from Arkansas,Grif Stockley,Biography & Autobiography,http://books.google.com/books/content?id=rsd_m...,Daisy Bates (1914-1999) is renowned as the men...,2005.0,3.55,340.0,11.0,"[0.07347182929515839, -0.028972012922167778, 0..."
1997,3645,9780586203194,0586203192,2061,Odyssey Three,Arthur C. Clarke,Computers,http://books.google.com/books/content?id=uSj0D...,Science fiction-roman.,1997.0,3.55,302.0,20216.0,"[-0.03379650041460991, 0.0369640588760376, -0...."
1998,5792,9781401200688,1401200680,On the Road to Perdition,Oasis,Max Allan Collins;Jose Luis Garcia-Lopez,Comics & Graphic Novels,http://books.google.com/books/content?id=VqrUX...,"Renegade mob hitman Michael O'Sullivan--the ""A...",2003.0,3.54,96.0,74.0,"[0.029538720846176147, 0.05673157796263695, -0..."


### Get Recommendation from Book title

In [116]:
def get_recommendation_from_title(df_embeddings, title, k):
    from scipy.spatial.distance import cosine

    if title not in list(df_embeddings['title']):
        return False

    book_embedding = df_embeddings[df_embeddings['title'] == title]['embedding'].iloc[0]
    book_embedding = book_embedding.squeeze()

    embeddings = list(df_embeddings['embedding'])

    distances = [cosine(book_embedding, emb) for emb in embeddings]

    #print(sorted(distances))
    closest_indices = np.argsort(distances)[:k]  
    # print(closest_indices)
    recommendations = list()
    for index in closest_indices[1:k+1]:
        book = dict()
        book['title']=df_embeddings.iloc[index]['title']
        book['description'] = df_embeddings.iloc[index]['description']
        book['distance']=distances[index]
        recommendations.append(book)

    return recommendations

In [117]:
get_recommendation_from_title(df_embeddings, 'The Irish Anatomist', 10)

[{'title': 'The Gatekeeper',
  'description': 'An Oxford scholar and author of The Truth About the Irish candidly recounts his experiences as an impoverished youth, a convent gatekeeper, a Catholic in Protestant England, a working-class Irish immigrant among the Oxbridge elite, and a Marxist visiting professor in Mormon Utah. Reprint. 15,000 first printing.',
  'distance': 0.5517319540653511},
 {'title': 'The Family Idiot',
  'description': 'With this volume, the University of Chicago Press completes its translation of a work that is indispensable not only to serious readers of Flaubert but to anyone interested in the last major contribution by one of the twentieth century\'s greatest thinkers. That Sartre\'s study of Flaubert, The Family Idiot, is a towering achievement in intellectual history has never been disputed. Yet critics have argued about the precise nature of this novel or biography or "criticism-fiction" which is the summation of Sartre\'s philosophical, social, and literar

In [147]:
title = input('Enter Book\'s Title: ')
book_recommendations = get_recommendation_from_title(df_embeddings, title, 5)
# print(book_recommendations)
if book_recommendations:
    for i, item in enumerate(book_recommendations):
        print(f'Book recommendation #{i+1}, Distance: {item["distance"]}')
        print(f'Title: {item["title"]}')
        print(f'Description: {item["description"]}')
        print()
        print('#' * 50)
        print()
else:
    print(f'Title {title} does not exist in the dataset')

Enter Book's Title:  Bill Gates


Book recommendation #1, Distance: 0.2091330601137581
Title: Hard Drive
Description: Chronicles the career of "Chairman Bill" Gates, the computer whiz kid who commands the powerful Microsoft computer software empire.

##################################################

Book recommendation #2, Distance: 0.5302106646883478
Title: Think and Grow Rich
Description: An updated edition of the best-selling guide features anecdotes about such modern figures as Bill Gates, Dave Thomas, and Sir John Templeton, explaining how their examples can enable modern readers to pursue wealth and overcome personal stumbling blocks. Original. 30,000 first printing.

##################################################

Book recommendation #3, Distance: 0.5524400806751422
Title: IWoz
Description: The high-tech wizard behind Apple offers a personal account of the creation of the first personal computer by marrying computer circuitry with a video screen and a typewriter keyboard to create the affordable, easy-to-u

### Displaying Emeddings on Atlas

In [131]:
import openai
import pandas as pd
import numpy as np
from nomic import atlas

In [133]:
embedding_cache_file = './book_embeddings.csv'
df = pd.read_csv(embedding_cache_file)
df

Unnamed: 0.1,Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,embedding
0,4306,9780739844328,0739844326,Bill Gates,Computer Legend,Sara Barton-Wood,Juvenile Nonfiction,http://books.google.com/books/content?id=Fr814...,"Presents the life of Bill Gates, from his chil...",2001.0,5.00,48.0,0.0,"[0.05363256856799126, -0.018259329721331596, 0..."
1,5972,9781551052700,1551052709,Ecuador Nature Guide,Southwest Forests : Sozoranga Forest Project,Christopher D. Jiggins,Botanique,http://books.google.com/books/content?id=1JjbG...,The guide provides information on 76 species o...,2000.0,5.00,96.0,1.0,"[0.06094173714518547, -0.010597361251711845, 0..."
2,6720,9781930901353,1930901356,The Irish Anatomist,A Study of Flann O'Brien,Keith Donohue,Biography & Autobiography,http://books.google.com/books/content?id=baEJA...,The most full length critical and biographical...,2002.0,5.00,222.0,1.0,"[0.03172249346971512, 0.03664880990982056, -0...."
3,6738,9781932206081,1932206086,Insights,Talks on the Nature of Existence,Frederick Lenz,Spiritual life,http://books.google.com/books/content?id=NOX2P...,"In 1983, when Rama - Dr. Frederick P. Lenz rec...",2003.0,5.00,304.0,1.0,"[-0.011502875946462154, -0.009286032989621162,..."
4,6671,9781890995522,1890995525,The Diamond Color Meditation,Color Pathway to the Soul,John Diamond,Health & Fitness,http://books.google.com/books/content?id=1ChsH...,The Diamond Color Meditation presents an inspi...,2006.0,5.00,74.0,5.0,"[0.004097896162420511, 0.004661967046558857, 0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,974,9780142003299,0142003298,Octopussy,And the Living Daylights,Ian Fleming,Fiction,http://books.google.com/books/content?id=B5vGA...,"The legendary 007 deals with a rich, deranged ...",2004.0,3.55,120.0,4801.0,"[-0.027058351784944534, 0.019854001700878143, ..."
1996,6197,9781578068012,1578068010,Daisy Bates,Civil Rights Crusader from Arkansas,Grif Stockley,Biography & Autobiography,http://books.google.com/books/content?id=rsd_m...,Daisy Bates (1914-1999) is renowned as the men...,2005.0,3.55,340.0,11.0,"[0.07347182929515839, -0.028972012922167778, 0..."
1997,3645,9780586203194,0586203192,2061,Odyssey Three,Arthur C. Clarke,Computers,http://books.google.com/books/content?id=uSj0D...,Science fiction-roman.,1997.0,3.55,302.0,20216.0,"[-0.03379650041460991, 0.0369640588760376, -0...."
1998,5792,9781401200688,1401200680,On the Road to Perdition,Oasis,Max Allan Collins;Jose Luis Garcia-Lopez,Comics & Graphic Novels,http://books.google.com/books/content?id=VqrUX...,"Renegade mob hitman Michael O'Sullivan--the ""A...",2003.0,3.54,96.0,74.0,"[0.029538720846176147, 0.05673157796263695, -0..."


In [134]:
df['embedding']=df['embedding'].apply(eval).apply(np.array)

In [135]:
data = df[['title', 'authors', 'categories']].to_dict('records')
data

[{'title': 'Bill Gates',
  'authors': 'Sara Barton-Wood',
  'categories': 'Juvenile Nonfiction'},
 {'title': 'Ecuador Nature Guide',
  'authors': 'Christopher D. Jiggins',
  'categories': 'Botanique'},
 {'title': 'The Irish Anatomist',
  'authors': 'Keith Donohue',
  'categories': 'Biography & Autobiography'},
 {'title': 'Insights',
  'authors': 'Frederick Lenz',
  'categories': 'Spiritual life'},
 {'title': 'The Diamond Color Meditation',
  'authors': 'John Diamond',
  'categories': 'Health & Fitness'},
 {'title': 'Fanning the Flame',
  'authors': 'Christopher J. H. Wright',
  'categories': 'Religion'},
 {'title': 'Harry Potter',
  'authors': 'J. K. Rowling',
  'categories': 'Juvenile Fiction'},
 {'title': "It's a Magical World",
  'authors': 'Bill Watterson',
  'categories': 'American wit and humor, Pictorial'},
 {'title': 'The John Deere Two-Cylinder Tractor Encyclopedia',
  'authors': 'Don Macmillan;Wayne G. Broehl',
  'categories': 'Reference'},
 {'title': 'Empire 2.0',
  'authors

In [136]:
embeddings = list(df['embedding'])
embeddings

[array([ 0.05363257, -0.01825933,  0.01940475, ...,  0.00565691,
         0.02335757, -0.01013471]),
 array([ 0.06094174, -0.01059736,  0.06788503, ...,  0.01770096,
        -0.04340661,  0.01640739]),
 array([ 0.03172249,  0.03664881, -0.02836364, ..., -0.02915981,
         0.03239426, -0.02483063]),
 array([-0.01150288, -0.00928603,  0.01074105, ...,  0.00862716,
         0.00284312,  0.00987628]),
 array([ 0.0040979 ,  0.00466197,  0.00653712, ..., -0.01951379,
        -0.01369624,  0.00075235]),
 array([ 0.02268183,  0.05379694,  0.04883891, ...,  0.01714459,
        -0.04443692, -0.03391848]),
 array([-0.03922549,  0.03654709, -0.03486229, ..., -0.04354547,
         0.01220396, -0.00626398]),
 array([ 0.03677513,  0.02311316, -0.02703355, ..., -0.01193278,
         0.00632279,  0.00155265]),
 array([ 0.0147869 ,  0.02000307, -0.03327636, ..., -0.00523945,
        -0.01422803,  0.00696265]),
 array([ 0.00920521,  0.05878863,  0.01218149, ..., -0.00845405,
         0.02080562, -0.03

In [148]:
embeddings = np.array(embeddings)
project = atlas.map_data(
    embeddings = embeddings,
    data = data)

[32m2024-04-28 22:35:44.871[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36m_create_project[0m:[36m914[0m - [1mCreating dataset `oblivious-fourier`[0m
[32m2024-04-28 22:35:45.988[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_data[0m:[36m110[0m - [1mUploading data to Atlas.[0m
1it [00:03,  3.37s/it]
[32m2024-04-28 22:35:49.389[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36m_add_data[0m:[36m1593[0m - [1mUpload succeeded.[0m
[32m2024-04-28 22:35:49.393[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_data[0m:[36m125[0m - [1m`atefataya/oblivious-fourier`: Data upload succeeded to dataset`[0m
[32m2024-04-28 22:35:53.056[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36mcreate_index[0m:[36m1297[0m - [1mCreated map `oblivious-fourier` in dataset `atefataya/oblivious-fourier`: https://atlas.nomic.ai/data/atefataya/oblivious-fourier/map[0m
