In [10]:
import os
from dotenv import load_dotenv
from pypdf import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI


In [2]:
# Load the variables defined in the .env file into the process environment
load_dotenv()

# Retrieve the OpenAI API key from the environment variables
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Reject a blank value as well as a missing one: an entry such as
# `OPENAI_API_KEY=` yields an empty string, which would slip past an
# `is None` check and only surface later as an authentication error.
if OPENAI_API_KEY is None or not OPENAI_API_KEY.strip():
    raise ValueError("Please set your OPENAI_API_KEY in the .env file")


In [3]:
# Read the PDF file and extract the text page by page
PDF_PATH = "data/bitcoin.pdf"
reader = PdfReader(PDF_PATH)
number_of_pages = len(reader.pages)

page_texts = []
# Pages are numbered from 1 so that "Processed N pages" is printed after
# exactly N pages have been handled and messages use human page numbers.
for page_number, page in enumerate(reader.pages, start=1):
    text = page.extract_text()
    # Treat None, "" and whitespace-only results alike as "no text on this page"
    if text and text.strip():
        page_texts.append(text)
    else:
        print(f"Page {page_number} is empty")
    # Provide progress updates every 10 pages, whether or not the page had text
    if page_number % 10 == 0:
        print(f"Processed {page_number} pages of {number_of_pages}")

# Join pages with a newline so the last word of one page is not fused with the
# first word of the next, and so the "\n" separator below sees a page boundary.
raw_text = "\n".join(page_texts)

# Split the extracted text into overlapping chunks of at most ~1000 characters
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

texts = text_splitter.split_text(raw_text)

# Fail here with a clear message instead of letting the vector-store cell fail
# on an empty list of chunks (e.g. for a scanned, image-only PDF).
if not texts:
    raise ValueError(f"No text could be extracted from {PDF_PATH}")


In [None]:
# Configure the OpenAI embeddings client used to vectorize the text chunks.
# The model must be an embedding model: "gpt-4" is a chat-completion model and
# cannot serve embedding requests, so "text-embedding-ada-002" is used instead.
embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
    model="text-embedding-ada-002",
    openai_api_type="open_ai",
)

In [5]:
# Build a FAISS vector store from the text chunks, vectorizing them with
# `embeddings`, so the chunks can be queried by similarity
docsearch = FAISS.from_texts(texts=texts, embedding=embeddings)

In [7]:
# Create the question-answering chain: an OpenAI LLM with the "stuff" chain type
chain = load_qa_chain(llm=OpenAI(), chain_type="stuff")

In [9]:
# The question to ask about the document
query = "What is Bitcoin?"
# Retrieve the chunks from the vector store that best match the question
docs = docsearch.similarity_search(query=query)
# Run the QA chain on the retrieved chunks; as the last expression in the cell,
# the returned answer is displayed as the cell output
chain.run(question=query, input_documents=docs)

' Bitcoin is a peer-to-peer electronic cash system which allows online payments to be sent directly from one party to another without going through a financial institution. It is secured by a peer-to-peer network using proof-of-work to record a public history of transactions, making it computationally impractical for an attacker to change if honest nodes control a majority of computing power.'