In [2]:
# RUN THIS CELL FIRST!
!pip install -q langchain==0.0.150 pypdf pandas matplotlib tiktoken textract transformers openai faiss-cpu

In [50]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from transformers import GPT2TokenizerFast
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
import markdown
from IPython.display import HTML


In [51]:
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_KEY")

In [52]:

#NAIVE
chain = load_qa_chain(OpenAIChat(temperature=0), chain_type="stuff")

query = "How many percent of shoppers consider shipping speed a top priority?"
docs = db.similarity_search(query)

chain.run(input_documents=docs, question=query)


'More than 85% of online shoppers consider shipping speed a top priority.'

In [53]:
# Advanced method - Split by chunk

# Step 1: Convert PDF to text
import textract
doc = textract.process("./scalable_timing_aware_network_design_via_lagrangian_decomposition.pdf")

# Step 2: Save to .txt and reopen (helps prevent issues)
with open('scalable_timing_aware_network_design_via_lagrangian_decomposition.txt', 'w') as f:
    f.write(doc.decode('utf-8'))

with open('scalable_timing_aware_network_design_via_lagrangian_decomposition.txt', 'r') as f:
    text = f.read()

# Step 3: Create function to count tokens
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")


def count_tokens(text: str) -> int:
    return len(tokenizer.encode(text))


# Step 4: Split text into chunks
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=512,
    chunk_overlap=24,
    length_function=count_tokens,
)

chunks = text_splitter.create_documents([text])


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [54]:
embeddings = OpenAIEmbeddings()
db = FAISS.from_documents(chunks, embeddings)


In [55]:

chain = load_qa_chain(OpenAIChat(temperature=0), chain_type="stuff")


def query_model(query: str) -> str:
    docs = db.similarity_search(query)
    response = chain.run(input_documents=docs, question=query)
    html = markdown.markdown(response)
    return HTML(html)




In [56]:
query_model("How many \% of shoppers consider shipping speed a top priority?")


In [57]:

#Advanced method
query_model("Explain the tfmCF problem?")



In [58]:
query_model("What are the 4 contributions of this paper?")


In [48]:
query_model("State the mixed-integer programming formulation")


In [49]:
query_model("State the mixed-integer programming formulation in equation 1a, step by step explain everything")
