# 📚 Book Recommendation System using OpenAI Embeddings
This notebook demonstrates how to build a simple book recommendation system using OpenAI's `text-embedding-ada-002` model. It includes loading and cleaning data, generating embeddings, and recommending similar books based on cosine similarity.

## 1. 📦 Import Libraries and Load Dataset

In [None]:
import ast
import os

import numpy as np
import openai
import pandas as pd

# Read the API key from the environment rather than hardcoding it in the notebook:
# a key written into a cell ends up in version control and in shared copies.
# Export OPENAI_API_KEY before starting Jupyter (or load it from a secrets manager).
openai.api_key = os.getenv('OPENAI_API_KEY')
if not openai.api_key:
    print('Warning: OPENAI_API_KEY is not set; calls to the OpenAI API will fail.')

# Load the books dataset, drop rows with any missing value, and keep the
# 3000 highest-rated books. Built as one chain so re-running the cell always
# starts from the file on disk instead of mutating an existing frame in place.
df = (
    pd.read_csv("books_dataset.csv")
    .dropna()
    .sort_values('average_rating', ascending=False)
    .head(3000)
)
df.head()

## 2. 🔢 Count Tokens and Estimate Embedding Cost

In [None]:
# Install the tiktoken library used for tokenizing text (quietly, without extra output)
!pip install tiktoken -q
import tiktoken

# Load the tokenizer specific to the 'text-embedding-ada-002' model
enc = tiktoken.encoding_for_model('text-embedding-ada-002')

# Extract the 'description' column from the DataFrame as a list of strings
description = list(df['description'])

# Calculate the total number of tokens across all book descriptions
# Each description is tokenized using the model's tokenizer, and token counts are summed
total_tokens = sum([len(enc.encode(item)) for item in description])

# Display the total token count
print(f'Total tokens: {total_tokens}')

# Estimate the cost in USD based on OpenAI's embedding pricing
# The current rate is $0.0001 per 1,000 tokens
cost = total_tokens * (0.0001 / 1000)

# Print the estimated cost, formatted to 10 decimal places
print(f'Estimated cost in USD: {cost:.10F}')


## 3. 🧠 Generate and Save Embeddings

In [None]:
def get_embedding(text):
    """Return the embedding vector for ``text``.

    Sends ``text`` to OpenAI's embeddings endpoint with the
    'text-embedding-ada-002' model and returns the ``embedding`` attribute of
    the first item in the response's ``data`` list.
    """
    request = {
        'input': text,                      # the text to embed
        'model': 'text-embedding-ada-002',  # embedding model to use
    }
    result = openai.embeddings.create(**request)
    first_item = result.data[0]
    return first_item.embedding

def get_embeddings_and_save_to_csv(embedding_cache_file):
    """Embed every book description and write the result to a CSV file.

    Works on the notebook-level ``df``: calls ``get_embedding`` on each value
    of its 'description' column, stores the results in a new 'embedding'
    column, then writes the whole frame to ``embedding_cache_file``.
    """
    descriptions = df['description']

    # One get_embedding call per description
    df['embedding'] = descriptions.apply(get_embedding)

    # Persist the frame, embeddings included
    df.to_csv(embedding_cache_file)

# File in which the embeddings are cached
embedding_cache_file = 'book_embeddings.csv'

# Embedding every description means one paid API call per book, so only do it
# when no cache file exists yet. Delete the file to force regeneration.
if not os.path.exists(embedding_cache_file):
    get_embeddings_and_save_to_csv(embedding_cache_file)


## 4. 💾 Load and Prepare Embeddings

In [None]:
# Specify the path to the CSV file containing the saved embeddings
embedding_cache_file = 'book_embeddings.csv'

# Load the CSV file into a DataFrame
df_embeddings = pd.read_csv(embedding_cache_file)

# The CSV holds each embedding as the text of a Python list. Parse it with
# ast.literal_eval, which only accepts literals, so a corrupted or tampered
# cache file cannot execute code the way eval() would. Then convert each
# parsed list to a NumPy array.
df_embeddings['embedding'] = df_embeddings['embedding'].apply(
    lambda text: np.array(ast.literal_eval(text))
)

# Display the first few rows of the DataFrame to verify the data
df_embeddings.head()


## 5. 📚 Recommend Similar Books

In [None]:
from sklearn.metrics.pairwise import cosine_similarity  # Import function to compute cosine similarity

# Define a function to get top-k similar book recommendations based on a given book title
def get_recommendation_from_title(df_embeddings, title, k):
    """Return the ``k`` books most similar to the book named ``title``.

    Similarity is the cosine similarity between embedding vectors, computed
    for all books in one vectorised NumPy pass. ``df_embeddings`` is only
    read, never modified.

    Args:
        df_embeddings: DataFrame with 'title', 'description' and 'embedding'
            columns. Each embedding must be a 1-D sequence of numbers and all
            embeddings must have the same length.
        title: Exact title of the book to find neighbours for. If several
            rows share this title, the first one is used as the query.
        k: Maximum number of recommendations to return. Values <= 0 give an
            empty list.

    Returns:
        ``False`` if ``title`` is not present in the 'title' column (kept for
        backward compatibility). Otherwise a list of at most ``k`` dicts with
        keys 'title', 'description' and 'similarity', ordered from most to
        least similar. The query row itself is never included.
    """
    title_mask = (df_embeddings['title'] == title).to_numpy()
    if not title_mask.any():
        return False

    # Positional index of the query row (first match wins)
    target_pos = int(np.flatnonzero(title_mask)[0])

    # Stack all embeddings into one (n_books, dim) float matrix
    matrix = np.asarray(df_embeddings['embedding'].tolist(), dtype=float)
    target = matrix[target_pos]

    # Cosine similarity = dot product / product of norms.
    # A zero-length vector gets similarity 0 instead of a division by zero.
    dots = matrix @ target
    denom = np.linalg.norm(matrix, axis=1) * np.linalg.norm(target)
    similarities = np.divide(dots, denom, out=np.zeros_like(dots), where=denom > 0)

    # Rank by descending similarity (stable, so ties keep their row order),
    # drop the query row itself, then keep the top k.
    ranked = np.argsort(-similarities, kind='stable')
    ranked = ranked[ranked != target_pos][:max(k, 0)]

    titles = df_embeddings['title'].to_numpy()
    descriptions = df_embeddings['description'].to_numpy()
    return [
        {
            'title': titles[pos],
            'description': descriptions[pos],
            'similarity': float(similarities[pos]),
        }
        for pos in ranked
    ]


In [None]:
# Demo: request recommendations (k=10) for one title from the dataset
get_recommendation_from_title(df_embeddings, title='Colossians and Philemon', k=10)