# Talk to PDF
This is a simple chatbot that can look up a given PDF and answer your questions about it.

### Download and install necessary packages

In [None]:
!pip install PyPDF2 langchain openai
!pip install typing-extensions --upgrade

### Import necessary libraries

In [None]:
from PyPDF2 import PdfReader # used to extract text from pdf
from langchain.text_splitter import CharacterTextSplitter # split text in smaller snippets
import os # read API key from environment variables. Not required if you are specifying the key in notebook.
from openai import OpenAI # used to access openai api
import json # used to create a json to store snippets and embeddings
from numpy import dot # used to match user questions with snippets.

### Parameters specifying different variables used in the code

In [None]:
# Output file written by extract_text_from_pdf: the plain text of the PDF.
EXTRACTED_TEXT_FILE_PATH = "pdf_text.txt" # text extracted from pdf
# Written by create_embeddings and read back by get_embeddings.
EXTRACTED_JSON_PATH = "extracted.json" # snippets and embeddings
# Read from the environment; raises KeyError if OPENAI_API_KEY is not set.
OPENAI_API_KEY = os.environ['OPENAI_API_KEY'] # OpenAI API key
EMBEDDING_MODEL = "text-embedding-ada-002" # embedding model used for snippets and questions
GPT_MODEL = "gpt-3.5-turbo" # chat model used; alternatively you can use gpt-4 or other models
# Both sizes are measured in characters (the splitter uses len as its length function).
CHUNK_SIZE = 1000 # chunk size used to create snippets
CHUNK_OVERLAP = 200 # overlap between consecutive snippets
# Snippets whose similarity score is not strictly greater than this are discarded.
CONFIDENCE_SCORE = 0.75 # confidence score to filter search results. [0,1] preferred: 0.75

### Helper Function to extract text from pdf
The extracted text is saved in a text file - `pdf_text.txt`

In [None]:
def extract_text_from_pdf(file_path: str) -> None:
    """Extract the text of every page of a PDF and save it to a text file.

    The text of each page is followed by a newline, and the combined text
    is written (UTF-8, overwriting any existing file) to
    ``EXTRACTED_TEXT_FILE_PATH``.

    Args:
        file_path: Path of the PDF file to read.
    """
    # Open the PDF file using the specified file_path
    reader = PdfReader(file_path)

    # Collect the text in a single pass with join() rather than repeated
    # string concatenation; a newline after each page keeps pages apart.
    pdf_text = "".join(page.extract_text() + "\n" for page in reader.pages)

    # Write the content to the text file. The output path comes from the
    # module-level constant, so the input parameter is left untouched.
    with open(EXTRACTED_TEXT_FILE_PATH, "w", encoding="utf-8") as file:
        file.write(pdf_text)

### Helper function to turn text into embeddings
This representation allows the computer to work with textual data in a way that is amenable to mathematical operations and analysis.

In [None]:
def create_embeddings(file_path: str) -> None:
    """Split a text file into snippets, embed them, and save both as JSON.

    The text is split on newlines into chunks of about ``CHUNK_SIZE``
    characters with ``CHUNK_OVERLAP`` characters of overlap. One embedding
    per snippet is requested from the OpenAI embeddings API (a billed call)
    and the result is written to ``EXTRACTED_JSON_PATH`` as
    ``{"embeddings": [...], "snippets": [...]}``.

    Args:
        file_path: Path of the UTF-8 text file to embed.

    Raises:
        ValueError: If the file yields no snippets (for example, it is empty).
    """
    # Initialize a CharacterTextSplitter with specified settings
    text_splitter = CharacterTextSplitter(separator="\n",
                                          chunk_size=CHUNK_SIZE,
                                          chunk_overlap=CHUNK_OVERLAP,
                                          length_function=len)

    # Read the content of the file specified by file_path
    with open(file_path, "r", encoding="utf-8") as file:
        file_text = file.read()

    # Split the text into snippets using the specified settings
    snippets = text_splitter.split_text(file_text)

    # Fail early with a clear message instead of sending an empty request
    # to the billed API.
    if not snippets:
        raise ValueError(
            f"No text snippets could be produced from {file_path!r}; nothing to embed."
        )

    # Pass the key to the client directly. Only the OpenAI class is imported
    # in this file, so assigning to `openai.api_key` would raise NameError.
    client = OpenAI(api_key=OPENAI_API_KEY)

    # Request embeddings for the snippets using the specified model
    response = client.embeddings.create(input=snippets, model=EMBEDDING_MODEL)

    # Extract embeddings from the API response
    embedding_list = [item.embedding for item in response.data]

    # Pair the embeddings with the snippets they were computed from
    embedding_json = {
        'embeddings': embedding_list,
        'snippets': snippets
    }

    # Write the formatted JSON to the file specified by EXTRACTED_JSON_PATH
    with open(EXTRACTED_JSON_PATH, 'w', encoding="utf-8") as file:
        json.dump(embedding_json, file, indent=4)

### Helper function to read Embedding JSON file
Reads `extracted.json` and prepares the embeddings for the chatbot.

In [None]:
def get_embeddings():
    """Load the stored embeddings and snippets from ``EXTRACTED_JSON_PATH``.

    Returns:
        A tuple ``(embeddings, snippets)`` taken from the ``'embeddings'``
        and ``'snippets'`` keys of the JSON file.
    """
    # Read with the same encoding the file is written with, so the result
    # does not depend on the platform's default encoding.
    with open(EXTRACTED_JSON_PATH, 'r', encoding="utf-8") as file:
        # Load the JSON data into a Python dictionary
        embedding_json = json.load(file)

    # Return the embeddings and snippets from the loaded JSON
    return embedding_json['embeddings'], embedding_json['snippets']

### Helper function to create Embedding from User's Question:
The output of this function is compared against the snippet embeddings to find the snippets most relevant to the question.

In [None]:
def user_question_embedding_creator(question):
    """Return the embedding vector for a user's question.

    Args:
        question: The question text to embed.

    Returns:
        The embedding of ``question`` produced by ``EMBEDDING_MODEL``.
    """
    # Use the configured key explicitly so a key set in OPENAI_API_KEY is
    # honoured instead of relying on the client's implicit lookup.
    client = OpenAI(api_key=OPENAI_API_KEY)

    # Request embedding for the provided question using the specified model
    response = client.embeddings.create(input=question, model=EMBEDDING_MODEL)

    # A single input yields a single result; return its embedding
    return response.data[0].embedding

### Helper function to answer user's question. 

In [None]:
def answer_users_question(user_question):
    """Answer a question using the most relevant snippets of the PDF.

    The question is embedded and scored against every stored snippet
    embedding. Up to five snippets scoring above ``CONFIDENCE_SCORE`` are
    passed to the chat model together with the question.

    Reads the module-level ``embeddings``, ``snippets`` and
    ``pdf_description`` variables, which must be defined before calling.

    Args:
        user_question: The question text entered by the user.

    Returns:
        The chat model's answer, or an error-message string if either API
        call fails (errors are reported as text rather than raised).
    """
    try:
        # Create an embedding for the user's question
        user_question_embedding = user_question_embedding_creator(user_question)
    except Exception as e:
        # Handle any exception that occurred while using Embedding API.
        return f"An error occurred while creating embedding: {str(e)}"

    # Score each snippet by the dot product of its embedding with the
    # question embedding.
    # NOTE(review): a dot product equals cosine similarity only for
    # unit-length vectors; presumably the embedding model returns
    # normalized vectors - confirm against the API documentation.
    scored_snippets = [
        (snippet, dot(user_question_embedding, embedding))
        for snippet, embedding in zip(snippets, embeddings)
    ]

    # Sort by score, best first
    ranked_snippets = sorted(scored_snippets, key=lambda pair: pair[1], reverse=True)

    # Keep only confident matches and at most the top 5 of them
    formatted_top_results = [
        snippet for snippet, score in ranked_snippets if score > CONFIDENCE_SCORE
    ][:5]

    # Create the chatbot system using pdf_description provided by the user.
    chatbot_system = f"""You are provided with SEARCH RESULTS from a pdf. This pdf is a {pdf_description}. You need to generate answer to the user's question based on the given SEARCH RESULTS. SEARCH RESULTS as a python list. SEARCH RESULTS and USER's QUESTION are delimited by ``` \n If there is no information available, or question is irrelevent respond with - "Sorry! I can't help you." """

    # Create the prompt using results and user's question.
    prompt = f"""\
    SEARCH RESULTS:
    ```
    {formatted_top_results}
    ```
    USER'S QUESTION:
    ```
    {user_question}
    ```
    
    """

    # Prepare the chat conversation and use GPT model for generating a response
    messages = [{'role':'system', 'content':chatbot_system},
                {'role':'user', 'content':prompt}]

    try:
        # Use the configured key explicitly rather than the client's
        # implicit lookup.
        client = OpenAI(api_key=OPENAI_API_KEY)
        completion = client.chat.completions.create(model=GPT_MODEL,
                                                    messages=messages,
                                                    temperature=0,
                                                    stream=False)
    except Exception as e:
        # Handle exception while communicating with ChatCompletion API
        return f"An error occurred with chatbot: {str(e)}"

    # Return the chatbot response.
    return completion.choices[0].message.content

---

## Start Here
Specify the path to pdf below and run all the cells.

**Specify the path to pdf to be extracted.**

In [None]:
# Path of the PDF to chat with; passed to extract_text_from_pdf.
PDF_FILE_PATH = "chatgpt_unsesco.pdf"

**Converting pdf to text file.**
> ⚠️ **Needed only once.**
> You need to extract text only once.

This is a helper function to extract text from pdf specified above.

After the text has been extracted, comment out the call below by adding `#` at the beginning of the line.

In [None]:
# Writes the PDF's text to EXTRACTED_TEXT_FILE_PATH; needed only once per PDF.
extract_text_from_pdf(PDF_FILE_PATH)

**Creating embeddings from text file.**

> ⚠️ **Use it only when needed**
> Billed function. Required only once per PDF file.

The `create_embeddings` function is billed because it uses OpenAI's APIs to create embeddings, so run it only when required. Comment out the call after creating the embeddings for the PDF.

In [None]:
# Billed API call: embeds the extracted text and writes EXTRACTED_JSON_PATH.
create_embeddings(EXTRACTED_TEXT_FILE_PATH)

**Prepare Embeddings**

This reads the embeddings from the json file and stores them for chatbot.

In [None]:
# Module-level data read by answer_users_question.
embeddings, snippets = get_embeddings()

## Describe your pdf.
Please provide a detailed explanation of the content and purpose of your PDF.

In [None]:
# Inserted into the system prompt built by answer_users_question.
pdf_description = """UNSESCO Guidelines on ChatGPT and Generative AI"""

## Chatbot
This is the logic for chatbot.

**To exit leave user input blank and hit enter.**

In [None]:
# Start an infinite loop, allowing the user to ask questions
# Keep answering questions until the user submits a blank line
while True:

    # Prompt the user to input a question
    print("👤USER:")

    # Read the user's question from the console
    user_question = input("")

    # Print a separator for readability
    print("----------------------")

    # A blank question ends the session. Whitespace-only input is treated
    # as blank too, so it does not trigger billed API calls.
    if not user_question.strip():
        break

    # Generate an answer for the user's question and print the bot's response
    print("🤖 BOT:")
    print(answer_users_question(user_question=user_question))

    # Print a separator for readability
    print("----------------------")