<a href="https://colab.research.google.com/github/aswanthkrishna/paperQA/blob/paper-qa/notebooks/Pdfs_QA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing and importing the necessary packages

In [None]:
!pip install -q langchain==0.0.150 pypdf pandas matplotlib tiktoken textract transformers openai faiss-cpu fastapi kaleido uvicorn cohere six python-multipart beautifulsoup4

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from transformers import GPT2TokenizerFast
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain

In [None]:
# SECURITY: never hardcode an API key in a notebook. Any key that has been
# committed to a shared notebook must be treated as leaked and revoked.
# The key is taken from the environment if already set; otherwise it is
# requested interactively without being echoed or stored in the notebook.
import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

# Loading the PDFs and breaking them into chunks

In [None]:
#Converting PDF to text
import textract
# textract.process returns the extracted text of each PDF as bytes.
doc = textract.process("./HumanActivityRecognitionusingSmartphone.pdf")
doc1 = textract.process("./Embedded system.pdf")

# Save both documents to a single .txt file and reopen it.
# The encoding is given explicitly: the bytes are decoded as UTF-8, and
# relying on the platform default encoding for the file can raise
# UnicodeEncodeError (or garble characters) on non-UTF-8 locales.
with open('HumanActivityRecognitionusingSmartphone.txt', 'w', encoding='utf-8') as f:
    f.write(doc.decode('utf-8'))
    f.write(doc1.decode('utf-8'))

with open('HumanActivityRecognitionusingSmartphone.txt', 'r', encoding='utf-8') as f:
    text = f.read()


# GPT-2 tokenizer, used to measure text length in tokens
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")


def count_tokens(text: str) -> int:
    """Return how many tokens `text` encodes to with the GPT-2 tokenizer."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)

# Split the text into overlapping chunks. Chunk length is measured with
# count_tokens, so the size and overlap below are in tokens, not characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=24,
    length_function=count_tokens,
)

# One Document per chunk
chunks = text_splitter.create_documents([text])

# Embeddings

In [None]:
# Embedding model used to vectorise each chunk
embeddings = OpenAIEmbeddings()

# Build the FAISS vector store from the chunk documents
db = FAISS.from_documents(documents=chunks, embedding=embeddings)

# Set up the retrieval function

In [None]:
# Check similarity search is working
query = "How does HAR work?"
docs = db.similarity_search(query)

# Only the last expression of a cell is displayed; a bare `docs[0]` in the
# middle of the cell is evaluated and silently discarded. Print the top hit
# explicitly so it is actually visible, then show the number of hits.
print(docs[0])
len(docs)

4

In [None]:
# Build a QA chain that answers a user query from the documents returned
# by similarity search over the knowledge base.
chain = load_qa_chain(
    llm=OpenAI(temperature=0),
    chain_type="stuff",
)

# Retrieve the chunks most similar to the question
query = "How does HAR work?"
docs = db.similarity_search(query)

# Answer the question from the retrieved chunks
chain.run(question=query, input_documents=docs)

' The Harvard architecture offers separate storage and signal buses for instructions and data. This architecture has data storage entirely contained within the CPU, and there is no access to the instruction storage as data. Computers have separate memory areas for program instructions and data using internal data buses, allowing simultaneous access to both instructions and data. Programs needed to be loaded by an operator; the processor could not boot itself. In a Harvard architecture, there is no need to make the two memories share properties.'

# Creating a chatbot using the Conversational Retrieval Chain

In [None]:
from IPython.display import display
import ipywidgets as widgets

# Conversation chain that uses the vector DB as its retriever; it accepts the
# chat history alongside each question.
qa = ConversationalRetrievalChain.from_llm(
    llm=OpenAI(temperature=0.1),
    retriever=db.as_retriever(),
)

In [None]:
import html

# (question, answer) pairs of the conversation so far; passed to the chain
# on every turn.
chat_history = []

def on_submit(_):
    """Handle a submission from the chat input box.

    Reads the text from `input_box`, clears the box, and:
      * ignores blank submissions (no request is sent to the chain),
      * prints a farewell and returns when the user typed 'quit',
      * otherwise asks `qa` with the current `chat_history`, records the
        (question, answer) pair, and displays both as HTML.

    The argument is the widget passed by the submit callback; it is unused.
    """
    query = input_box.value.strip()
    input_box.value = ""

    # Nothing typed: do not spend an LLM call on an empty question.
    if not query:
        return

    if query.lower() == 'quit':
        print("Thank you for using this chatbot!")
        return

    result = qa({"question": query, "chat_history": chat_history})
    answer = result['answer']
    # Keep the raw text in the history; escaping is for display only.
    chat_history.append((query, answer))

    # Escape before interpolating into HTML so characters such as '<' or '&'
    # in the question/answer are shown literally instead of breaking or
    # injecting markup.
    display(widgets.HTML(f'<b><font color="Lavender">User </font>&#x1F469;</b> {html.escape(query)}'))
    display(widgets.HTML(f'<b><font color="green">Bot </font>&#x1f916;</b> {html.escape(answer)}'))

# Text box for the user's questions; on_submit is called each time the user
# submits the box.
input_box = widgets.Text(placeholder='Enter your question:')
input_box.on_submit(on_submit)

# Greet the user, then show the input box below the greeting.
print("Hello, I'm a chatbot! Type 'quit' to stop.")

display(input_box)

Hello, I'm a chatbot! Type 'quit' to stop.


Text(value='', placeholder='Enter your question:')

HTML(value='<b><font color="Lavender">User </font>&#x1F469;</b> Which is better, Von neumann or Harvard')

HTML(value='<b><font color="green">Bot </font>&#x1f916;</b>  Harvard architecture is better than Von Neumann a…

HTML(value='<b><font color="Lavender">User </font>&#x1F469;</b> What is Human activity recognition? ')

HTML(value='<b><font color="green">Bot </font>&#x1f916;</b>  Human Activity Recognition is the process of reco…