# Project: Question-Answering on Private Documents
This is part of my **"Learn LangChain, Pinecone & OpenAI: Build Next-Gen LLM Apps"** course.

https://www.udemy.com/course/master-langchain-pinecone-openai-build-llm-applications/?referralCode=4B17E3BD4CBBEA3B8321

In [None]:
pip install -q "openai<1"

In [None]:
pip install -q "langchain<1"

In [1]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [12]:
# pip install pypdf -q

In [4]:
# pip install docx2txt -q

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install wikipedia -q

Note: you may need to restart the kernel to use updated packages.


### Loading Documents

In [6]:
# loading PDF, DOCX and TXT files as LangChain Documents
def load_document(file):
    """Load a PDF, DOCX or TXT file as LangChain Documents.

    The loader is selected from the file extension. The extension is
    compared case-insensitively, so ``REPORT.PDF`` is handled like
    ``report.pdf``.

    Args:
        file: Path of the file to load.

    Returns:
        The result of the selected loader's ``load()`` call, or ``None``
        (after printing a message) when the extension is not ``.pdf``,
        ``.docx`` or ``.txt``.
    """
    import os

    # Only the extension matters; lower-case it so '.PDF' / '.Docx' match too.
    extension = os.path.splitext(file)[1].lower()

    # Imports stay local to each branch so that only the loader actually
    # needed (and its optional dependency) is imported.
    if extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader as loader_cls
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader as loader_cls
    elif extension == '.txt':
        from langchain.document_loaders import TextLoader as loader_cls
    else:
        print('Document format is not supported!')
        return None

    # Announce the load for every supported format (the .txt branch used to
    # be silent, unlike the other two).
    print(f'Loading {file}')
    loader = loader_cls(file)
    return loader.load()
  

In [7]:
# wikipedia
def load_from_wikipedia(query, lang='en', load_max_docs=2):
    """Fetch Wikipedia content for *query* as LangChain Documents.

    Args:
        query: Search text handed to ``WikipediaLoader``.
        lang: Language code forwarded to the loader (default ``'en'``).
        load_max_docs: Forwarded as the loader's ``load_max_docs``.

    Returns:
        Whatever ``WikipediaLoader.load()`` returns.
    """
    from langchain.document_loaders import WikipediaLoader

    wiki_loader = WikipediaLoader(
        query=query,
        lang=lang,
        load_max_docs=load_max_docs,
    )
    return wiki_loader.load()


### Chunking Data

In [18]:
def chunk_data(data, chunk_size=256, chunk_overlap=0):
    """Split LangChain Documents into smaller chunks.

    Args:
        data: Documents to split; passed to ``split_documents``.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
            Defaults to 0, the value that used to be hard-coded.

    Returns:
        The chunks produced by ``split_documents``.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(data)
    

### Calculating Cost

In [20]:
def print_embedding_cost(texts, model='text-embedding-ada-002', usd_per_1k_tokens=0.0004):
    """Print the token count and estimated embedding cost of *texts*.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
        model: Model name used to pick the tiktoken encoding.
        usd_per_1k_tokens: Price in USD per 1000 tokens. The default is the
            rate that used to be hard-coded here.
            NOTE(review): confirm against current OpenAI pricing.

    Returns:
        None. The totals are printed.
    """
    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    # A generator avoids building a temporary list just to sum it.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * usd_per_1k_tokens:.6f}')

### Embedding and Uploading to a Vector Database (Pinecone)

In [31]:
def insert_or_fetch_embeddings(index_name, documents=None):
    """Return a Pinecone vector store for *index_name*, creating it if needed.

    If the index already exists it is loaded as-is. Otherwise a new index is
    created (dimension 1536, cosine metric) and the documents are embedded
    and uploaded into it.

    Args:
        index_name: Name of the Pinecone index to load or create.
        documents: Documents to embed when the index has to be created.
            When omitted, the module-level ``chunks`` variable is used, as
            before this parameter existed. Ignored if the index exists.

    Returns:
        The LangChain ``Pinecone`` vector store bound to the index.

    Raises:
        NameError: If the index must be created, ``documents`` is omitted
            and no module-level ``chunks`` exists. This is raised before
            any index is created.
    """
    import os

    import pinecone
    from langchain.vectorstores import Pinecone
    from langchain.embeddings.openai import OpenAIEmbeddings

    embeddings = OpenAIEmbeddings()

    pinecone.init(
        api_key=os.environ.get('PINECONE_API_KEY'),
        environment=os.environ.get('PINECONE_ENV'),
    )

    if index_name in pinecone.list_indexes():
        print(f'Index {index_name} already exists. Loading embeddings ... ', end='')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('Ok')
    else:
        # Resolve the documents BEFORE creating the index: otherwise a missing
        # `chunks` global would leave an empty index behind, and the next call
        # would take the "already exists" branch and load it.
        docs = chunks if documents is None else documents
        print(f'Creating index {index_name} and embeddings ...', end='')
        # 1536 is presumably the vector size of the default OpenAIEmbeddings
        # model -- confirm before switching embedding models.
        pinecone.create_index(index_name, dimension=1536, metric='cosine')
        vector_store = Pinecone.from_documents(docs, embeddings, index_name=index_name)
        print('Ok')

    return vector_store
    

In [27]:
def delete_pinecone_index(index_name='all'):
    """Delete one Pinecone index, or every index when *index_name* is 'all'.

    Args:
        index_name: Name of the index to delete. The default ``'all'``
            deletes every index returned by ``pinecone.list_indexes()``.

    Returns:
        None. Progress is printed.
    """
    import pinecone

    api_key = os.environ.get('PINECONE_API_KEY')
    environment = os.environ.get('PINECONE_ENV')
    pinecone.init(api_key=api_key, environment=environment)

    # Single-index case first, so the bulk deletion below reads flat.
    if index_name != 'all':
        print(f'Deleting index {index_name} ...', end='')
        pinecone.delete_index(index_name)
        print('Ok')
        return

    existing = pinecone.list_indexes()
    print('Deleting all indexes ... ')
    for name in existing:
        pinecone.delete_index(name)
    print('Ok')
    

### Asking and Getting Answers

In [52]:
def ask_and_get_answer(vector_store, q, k=3):
    """Answer *q* using chunks retrieved from *vector_store*.

    Args:
        vector_store: Vector store exposing ``as_retriever``.
        q: The question text.
        k: Passed to the retriever as ``search_kwargs={'k': k}``. Defaults
            to 3, the value that used to be hard-coded.

    Returns:
        The result of ``chain.run(q)`` on a "stuff" ``RetrievalQA`` chain.
    """
    from langchain.chains import RetrievalQA
    from langchain.chat_models import ChatOpenAI

    llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)

    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': k})

    chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

    return chain.run(q)
    
    
def ask_with_memory(vector_store, question, chat_history=None):
    """Ask *question* through a conversational retrieval chain with history.

    Args:
        vector_store: Vector store exposing ``as_retriever``.
        question: The question text.
        chat_history: List of ``(question, answer)`` tuples from earlier
            turns. A list passed in is extended in place with the new turn.
            When omitted, a fresh empty history is used for this call.

    Returns:
        A ``(result, chat_history)`` tuple: the chain's result (the answer is
        under ``result['answer']``) and the updated history list.
    """
    from langchain.chains import ConversationalRetrievalChain
    from langchain.chat_models import ChatOpenAI

    # A mutable default ([]) would be shared by every call that omits the
    # argument, silently leaking history between unrelated conversations.
    if chat_history is None:
        chat_history = []

    llm = ChatOpenAI(temperature=1)
    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})

    crc = ConversationalRetrievalChain.from_llm(llm, retriever)
    result = crc({'question': question, 'chat_history': chat_history})
    chat_history.append((question, result['answer']))

    return result, chat_history
    

### Running Code

In [None]:
import warnings
# Suppress every warning from this point on to keep the cell outputs clean.
warnings.filterwarnings('ignore')

In [1]:
# Load the sample PDF (load_document returns None for unsupported extensions).
data = load_document('files/us_constitution.pdf')
# print(data[1].page_content)
# print(data[10].metadata)

print(f'You have {len(data)} pages in your data')
# Index 20 is hard-coded: this line needs at least 21 entries in `data`.
print(f'There are {len(data[20].page_content)} characters in the page')

NameError: name 'load_document' is not defined

In [3]:
# data = load_document('files/the_great_gatsby.docx')
# print(data[0].page_content)

Loading files/the_great_gatsby.docx
The Project Gutenberg eBook of The Great Gatsby, by F. Scott Fitzgerald



This eBook is for the use of anyone anywhere in the United States and

most other parts of the world at no cost and with almost no restrictions

whatsoever. You may copy it, give it away or re-use it under the terms

of the Project Gutenberg License included with this eBook or online at

www.gutenberg.org. If you are not located in the United States, you

will have to check the laws of the country where you are located before

using this eBook.



Title: The Great Gatsby



Author: F. Scott Fitzgerald



Release Date: January 17, 2021 [eBook #64317]

[Most recently updated: January 24 2021]



Language: English





Produced by: Alex Cabal for the Standard Ebooks project, based on a

             transcription produced for Project Gutenberg Australia.



*** START OF THE PROJECT GUTENBERG EBOOK THE GREAT GATSBY ***





			   The Great Gatsby

				  by

			 F. Scott Fitzgerald

In [8]:
# data = load_from_wikipedia('GPT-4', 'de')
# print(data[0].page_content)

OpenAI LP ist ein US-amerikanisches Unternehmen, das sich mit der Erforschung von künstlicher Intelligenz (KI, englisch Artificial Intelligence, AI) beschäftigt. Die gewinnorientierte Tochtergesellschaft OpenAI LP wird dabei durch das Non-Profit-Mutterunternehmen OpenAI Inc. kontrolliert.
Zentrale Geldgeber der Organisation sind der Unternehmer Elon Musk sowie das Unternehmen Microsoft. Stand 2015 war das Ziel von OpenAI, künstliche Intelligenz auf Open-Source-Basis auf eine Art und Weise zu entwickeln und zu vermarkten, dass sie der Gesellschaft Vorteile bringt und nicht schadet. Die Organisation wollte eine „freie Zusammenarbeit“ mit anderen Institutionen und Forschern, indem sie ihre Patente und Forschungsergebnisse für die Öffentlichkeit zugänglich mache. Die Firma ist mit über 1 Milliarde US-Dollar von Spenden finanziert.OpenAI beschäftigt sich nach eigenen Angaben mit der Frage der „existenziellen Bedrohung durch künstliche Intelligenz“ – also dem möglichen Übertreffen und Ersetz



  lis = BeautifulSoup(html).find_all('li')


In [45]:
# Split the loaded documents using chunk_data's default chunk_size (256).
# NOTE: insert_or_fetch_embeddings reads this module-level `chunks` when it
# has to create a new index.
chunks = chunk_data(data)
print(len(chunks))
# print(chunks[10].page_content)

190


In [21]:
# Print the total token count and the estimated embedding cost of the chunks.
print_embedding_cost(chunks)

Total Tokens: 16711
Embedding Cost in USD: 0.006684


In [32]:
# Called with the default index_name='all': deletes EVERY index that
# pinecone.list_indexes() returns.
delete_pinecone_index()

Deleting all indexes ... 
Ok


In [46]:
# Load the index if it already exists; otherwise create it and upload the
# embeddings of the current `chunks`.
index_name = 'askadocument'
vector_store = insert_or_fetch_embeddings(index_name)

Creating index askadocument and embeddings ...Ok


In [35]:
# Ask one question against the current vector store and print the answer.
q = 'What is the whole document about?'
answer = ask_and_get_answer(vector_store, q)
print(answer)

The provided context appears to be excerpts from the United States Constitution. The document outlines the principles, structure, and powers of the US government, and establishes the rights and freedoms of its citizens.


In [51]:
import time

# Interactive loop: keep answering questions until the user types
# "quit" or "exit" (in any letter case).
i = 1
print('Write Quit or Exit to quit.')
while True:
    q = input(f'Question #{i}: ')
    i += 1

    if q.lower() not in ('quit', 'exit'):
        answer = ask_and_get_answer(vector_store, q)
        print(f'\nAnswer: {answer}')
        print(f'\n {"-" * 50} \n')
        continue

    # Quit requested: say goodbye, pause briefly, then leave the loop.
    print('Quitting ... bye bye!')
    time.sleep(2)
    break

    

Write Quit or Exit to quit.
Question #1: quit
Quitting ... bye bye!


In [43]:
# Called with the default index_name='all': deletes EVERY index that
# pinecone.list_indexes() returns.
delete_pinecone_index()

Deleting all indexes ... 
Ok


In [39]:
# Rebuild the pipeline from Wikipedia: query 'ChatGPT' with lang='ro',
# chunk the result, then create (or load) the 'chatgpt' index.
data = load_from_wikipedia('ChatGPT', 'ro')
chunks = chunk_data(data)
index_name = 'chatgpt'
vector_store = insert_or_fetch_embeddings(index_name)

Creating index chatgpt and embeddings ...Ok


In [42]:
# Ask a question against the current vector store and print the answer.
# q = "Ce este ChatGPT?"
q = 'Ce este InstructGPT?'
answer = ask_and_get_answer(vector_store, q)
print(answer)

InstructGPT este menționat ca predecesorul lui ChatGPT, însă nu se oferă o descriere detaliată a programului și a capacităților sale.


In [81]:
# asking with memory
chat_history = []
question = 'How many amendments are in the U.S. Constitution?'
result, chat_history = ask_with_memory(vector_store, question, chat_history)
print(result['answer'])
print(chat_history)

As of September 2021, there are 27 amendments in the U.S. Constitution.
[('How many amendments are in the U.S. Constitution?', 'As of September 2021, there are 27 amendments in the U.S. Constitution.')]


In [82]:
# Follow-up question: reuses `chat_history` from the previous cell so the
# chain receives the earlier turn along with "that number".
question = 'Multiply that number by 2'
result, chat_history = ask_with_memory(vector_store, question, chat_history)
print(result['answer'])
print(chat_history)


The result of multiplying the number of amendments in the U.S. Constitution by 2 is not provided in the given context. The original U.S. Constitution had seven articles and twenty-seven amendments have been made since its ratification. Thus, if you multiply twenty-seven amendments by 2, then the result would be 54.
[('How many amendments are in the U.S. Constitution?', 'As of September 2021, there are 27 amendments in the U.S. Constitution.'), ('Multiply that number by 2', 'The result of multiplying the number of amendments in the U.S. Constitution by 2 is not provided in the given context. The original U.S. Constitution had seven articles and twenty-seven amendments have been made since its ratification. Thus, if you multiply twenty-seven amendments by 2, then the result would be 54.')]


### Ask with Memory Loop

In [None]:
import time

# Interactive loop with conversational memory: every turn is answered with
# the accumulated chat history, until the user types "quit" or "exit".
i = 1

# Shared across iterations; ask_with_memory appends each turn to this list
# in place, so the returned history can be ignored below.
chat_history = []

print("Write Quit or Exit to quit")
while True:
    # The ": " separator keeps the typed text apart from the prompt.
    q = input(f"Question #{i}: ")
    i = i + 1
    # Strip stray whitespace so "quit " still ends the session.
    if q.strip().lower() in ["quit", "exit"]:
        print("Quitting")
        time.sleep(2)
        break
    result, _ = ask_with_memory(vector_store, q, chat_history)
    print(result['answer'])
    print("----------------------------------------------------------------------")