In [None]:
!pip install langchain
!pip install openai
!pip install PyPDF2
!pip install faiss-cpu
!pip install tiktoken

In [None]:
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate, FAISS

In [None]:
# Set the OpenAI API key without writing the secret into the notebook
import os
from getpass import getpass

# Reuse a key that is already present in the environment; otherwise prompt for
# it. getpass does not echo the input, so the key is not stored in cell output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
# Mount Google Drive so files stored there are reachable from this runtime
from google.colab import drive

drive.mount(
    '/content/gdrive',
    force_remount=True,  # presumably remounts even if Drive is already attached
)
root_dir = '/content/gdrive/My Drive/'  # base folder for files kept in Drive

In [None]:
# locate pdf file(s)
# Build the path from `root_dir` (defined when Google Drive was mounted) so the
# Drive location is configured in one place only.
pdf_path = os.path.join(root_dir, 'GPT-4_vs_GPT-35_A_Concise_Showdown.pdf')
reader = PdfReader(pdf_path)

In [None]:
# Read data from PDF pages
# Extract the text of every page; extract_text() results that are None or empty
# are skipped when the pages are joined below.
page_texts = [page.extract_text() for page in reader.pages]
# Join with a newline so the last line of one page is not fused with the first
# line of the next: the text splitter used later in this notebook breaks on "\n".
raw_text = "\n".join(text for text in page_texts if text)

In [None]:
# Preview the first 100 characters of the extracted text as a quick sanity check
raw_text[:100]

In [None]:
# Break the extracted text into smaller pieces so that individual passages can
# be embedded and retrieved on their own.
text_splitter = CharacterTextSplitter(
    length_function=len,  # sizes are measured with len(), i.e. in characters
    separator="\n",       # split points are line breaks
    chunk_size=1000,      # presumably the target maximum size of one chunk
    chunk_overlap=200,    # presumably the amount shared by neighbouring chunks
)
texts = text_splitter.split_text(raw_text)

In [None]:
# Initialize the OpenAI embeddings client with its default settings.
# NOTE(review): presumably reads OPENAI_API_KEY from the environment set earlier — confirm.
embeddings = OpenAIEmbeddings()

In [None]:
# Create a FAISS vector index from the split text chunks
# Fail early with a clear message when no text was extracted (for example a
# scanned, image-only PDF): there is nothing to embed or index in that case.
if not texts:
    raise ValueError("No text chunks to index: the PDF produced no extractable text.")
docsearch = FAISS.from_texts(texts, embeddings)

In [None]:
# Build the question-answering chain on top of OpenAI's LLM.
# NOTE(review): chain_type="stuff" presumably places all retrieved documents
# into a single prompt — confirm against the LangChain docs.
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

llm = OpenAI()
chain = load_qa_chain(llm, chain_type="stuff")

# Sample Questions

In [None]:
query = "Who are the authors of the article?"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

In [None]:
query = "What is the article about?"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)