In [4]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)
from typing import List


### 3 -  Loading Different Document Formats



In [7]:
pip install -r requirements.txt -q

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
pip install chromadb -q

In [5]:
# Load PDF, DOCX and TXT files as LangChain Documents
def load_document(file) -> list:
    """Load a PDF, DOCX or TXT file with the matching LangChain loader.

    Args:
        file: Path of the document to load. The loader is picked from the
            file extension, compared case-insensitively.

    Returns:
        The value returned by the selected loader's ``load()`` (the notebook
        uses it as a list of objects exposing ``page_content``).

    Raises:
        ValueError: If the extension is not ``.pdf``, ``.docx`` or ``.txt``.
            Errors raised by the loaders themselves propagate unchanged.
    """
    import os

    _, extension = os.path.splitext(file)
    extension = extension.lower()

    # Loaders are imported lazily so that only the dependency of the format
    # actually being loaded has to be installed.
    if extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader
        loader_cls = PyPDFLoader
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        loader_cls = Docx2txtLoader
    elif extension == '.txt':
        from langchain.document_loaders import TextLoader
        loader_cls = TextLoader
    else:
        # Fail loudly here: previously the error was printed and swallowed,
        # and the function then crashed on an unbound `loader` variable.
        raise ValueError(f'Document format is not supported: {extension!r}')

    print(f'Loading {file}')
    loader = loader_cls(file)
    return loader.load()

In [8]:
# Load the PDF and print the text of the first element the loader returned.
data = load_document('files/attention_is_all_you_need.pdf')
print(data[0].page_content)

Loading files/attention_is_all_you_need.pdf
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.comNoam Shazeer∗
Google Brain
noam@google.comNiki Parmar∗
Google Research
nikip@google.comJakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.comAidan N. Gomez∗†
University of Toronto
aidan@cs.toronto.eduŁukasz Kaiser∗
Google Brain
lukaszkaiser@google.com
Illia Polosukhin∗‡
illia.polosukhin@gmail.com
Abstract
The dominant sequence transduction models are based on complex recurrent or
convolutional neural networks that include an encoder and a decoder. The best
performing models also connect the encoder and decoder through an attention
mechanism. We propose a new simple network architecture, the Transformer,
based solely on attention mechanisms, dispensing with recurrence and convolutions
entirely. Experiments on two machine translation tasks show these models to
be superior in quality while being more parallelizable and requiring signi

In [None]:
def load_from_wikipedia(query: str, lang: str = 'en', load_max_docs: int = 2):
    """Load Wikipedia content for a search query via LangChain's WikipediaLoader.

    Args:
        query: Search text forwarded to ``WikipediaLoader``.
        lang: Language code forwarded to ``WikipediaLoader`` (default ``'en'``).
        load_max_docs: Forwarded to ``WikipediaLoader`` as ``load_max_docs``.
            Defaults to 2, the value that used to be hard-coded.

    Returns:
        The value returned by the loader's ``load()``.
    """
    from langchain.document_loaders import WikipediaLoader

    loader = WikipediaLoader(query=query, lang=lang, load_max_docs=load_max_docs)
    return loader.load()

In [57]:
def chunk_data(data, chunk_size=256, chunk_overlap=0):
    """Split loaded documents into smaller chunks.

    Args:
        data: Documents to split, as returned by ``load_document``.
        chunk_size: Passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size`` (default 256).
        chunk_overlap: Passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 0, the value that used to be
            hard-coded.

    Returns:
        The chunks produced by the splitter's ``split_documents``.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(data)

In [58]:
def print_embedding_cost(texts, model='text-embedding-ada-002', usd_per_1k_tokens=0.0004):
    """Print the token count of ``texts`` and the estimated embedding cost.

    Args:
        texts: Iterable of objects exposing ``page_content`` (e.g. the chunks
            returned by ``chunk_data``).
        model: Model name used to select the tiktoken encoding.
        usd_per_1k_tokens: Price in USD per 1000 tokens. The default keeps the
            previously hard-coded 0.0004.
            NOTE(review): confirm this against the provider's current pricing.

    Returns:
        None. The totals are printed.
    """
    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * usd_per_1k_tokens:.6f}')
    
    
# print_embedding_cost(chunks)

# Next step: create the embeddings, print them, and then upload them to a vector database.

In [59]:
# Embed the chunks with OpenAIEmbeddings() and store them in a Chroma vector store
def create_embeddings(chunks):
    """Build a Chroma vector store from ``chunks``.

    Args:
        chunks: Documents to embed, e.g. the output of ``chunk_data``.

    Returns:
        The object returned by ``Chroma.from_documents`` when given the chunks
        and a default-constructed ``OpenAIEmbeddings`` instance.
    """
    from langchain.vectorstores import Chroma
    from langchain.embeddings.openai import OpenAIEmbeddings

    return Chroma.from_documents(chunks, OpenAIEmbeddings())

In [None]:
def insert_or_fetch_embeddings(index_name, chunks=None):
    """Return a Pinecone vector store for ``index_name``, creating it if needed.

    If the index already exists its embeddings are loaded; otherwise the index
    is created and populated from ``chunks``.

    Args:
        index_name: Name of the Pinecone index.
        chunks: Documents to embed when the index has to be created. Required
            only in that case. It is now an explicit argument instead of being
            read from a global variable.

    Returns:
        The vector store returned by ``Pinecone.from_existing_index`` or
        ``Pinecone.from_documents``.

    Raises:
        ValueError: If the index does not exist and ``chunks`` was not given.
    """
    import os
    import pinecone
    from langchain.vectorstores import Pinecone
    from langchain.embeddings.openai import OpenAIEmbeddings

    embeddings = OpenAIEmbeddings()
    # Credentials come from the environment (PINECONE_API_KEY / PINECONE_ENV).
    pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))

    if index_name in pinecone.list_indexes():
        print(f'Index {index_name} already exists. Loading embeddings ...', end='')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('ok')
    else:
        # Check before creating the index so a missing argument does not leave
        # an empty index behind.
        if chunks is None:
            raise ValueError(f'Index {index_name} does not exist; pass `chunks` to create it.')
        print(f'Creating index {index_name} and embeddings ...', end='')
        # NOTE(review): dimension=1536 must match the embedding size produced
        # by OpenAIEmbeddings() - confirm for the configured model.
        pinecone.create_index(index_name, dimension=1536, metric='cosine')
        vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
        print('ok')
    return vector_store

In [None]:
def delete_pinecone_index(index_name='all'):
    """Delete one Pinecone index, or every index when ``index_name`` is ``'all'``.

    Args:
        index_name: Name of the index to delete. The default ``'all'`` deletes
            every index returned by ``pinecone.list_indexes()``.
    """
    import os
    import pinecone

    # Credentials come from the environment (PINECONE_API_KEY / PINECONE_ENV).
    pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))
    if index_name == 'all':
        indexes = pinecone.list_indexes()
        print('deleteing all indexes')
        for index in indexes:
            pinecone.delete_index(index)
        print('ok')
    else:
        pinecone.delete_index(index_name)

Use chains to combine an LLM with other components (here, a retriever)

In [60]:
def ask_and_get_answer(vector_store, q, k=3):
    """Answer question ``q`` with a RetrievalQA chain over ``vector_store``.

    Args:
        vector_store: Store whose ``as_retriever`` supplies the retriever.
        q: The question to run through the chain.
        k: Value placed in the retriever's ``search_kwargs`` under ``'k'``.

    Returns:
        Whatever the chain's ``run(q)`` returns.
    """
    from langchain.chains import RetrievalQA
    from langchain.chat_models import ChatOpenAI

    # Keyword arguments are evaluated in order: LLM first, then the retriever.
    qa_chain = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(model='gpt-3.5-turbo', temperature=0.2),
        chain_type="stuff",
        retriever=vector_store.as_retriever(
            search_type='similarity',
            search_kwargs={'k': k},
        ),
    )
    return qa_chain.run(q)


### RUNNING CODE

In [61]:
# Rebind `data` to the text document that the following cells chunk, embed and query.
data = load_document('files/state_of_the_union.txt')
print (f'You have {len(data)} pages in your data')

You have 1 pages in your data


In [62]:
# Split the loaded document with chunk_data's default settings and report the chunk count.
chunks = chunk_data(data)
print(len(chunks))

200


In [63]:
# Embed the chunks; `vector_store` is reused by every question-answering cell below.
vector_store = create_embeddings(chunks)

In [64]:
# Active question; the commented-out lines are alternatives to try.
q = 'what is the whole document about?'
# q = 'what is reasoning and acting in LLMs?'
# q = 'Summarize the entire document in a few paragraphs.'

# k is forwarded to the retriever's search_kwargs by ask_and_get_answer.
k = 3
answer = ask_and_get_answer(vector_store, q, k)
print(answer)

Based on the given context, it appears that the document is about the formation of character, finding purpose, and forging the future of a nation. It emphasizes the commitment to protecting freedom, expanding fairness and opportunity, and saving democracy.


In [28]:
import time

# Interactive question loop: keeps asking until the user types quit/exit.
i = 1
print('Write Quit or Exit to quit.')
while True:
    # Strip surrounding whitespace so inputs such as ' quit ' still match, and
    # skip blank lines instead of sending an empty question to the model.
    q = input(f'Question #{i}: ').strip()
    if not q:
        continue
    i = i + 1
    if q.lower() in ['quit', 'exit']:
        print('Quitting ... Bye Bye!')
        time.sleep(2)
        break

    # Each question is answered independently, using 5 retrieved chunks.
    answer = ask_and_get_answer(vector_store, q, 5)
    print(answer)
    print(f'\n {"-"*50} \n')
    

Write Quit of Exit to quit.
Question #1: What did the president say about Ketanji Brown Jackson?
The president said that Ketanji Brown Jackson is one of our nation's top legal minds and that she will continue Justice Breyer's legacy of excellence.

 -------------------------------------------------- 

Question #2: quit
Quitting ... Bye Bye!


In [68]:
def ask_with_memory(vector_store, question, chat_history=None):
    """Ask ``question`` through a ConversationalRetrievalChain, tracking history.

    Args:
        vector_store: Store whose ``as_retriever`` supplies the retriever
            (similarity search, ``k`` = 5).
        question: The question to ask.
        chat_history: List of ``(question, answer)`` tuples from earlier turns.
            When given, it is extended in place with the new turn. When
            omitted, a fresh list is created for this call. The previous
            ``[]`` default was shared between calls and leaked earlier
            conversations into later ones.

    Returns:
        A ``(result, chat_history)`` tuple: the chain's result (its
        ``"answer"`` entry holds the reply) and the updated history list.
    """
    from langchain.chains import ConversationalRetrievalChain
    from langchain.chat_models import ChatOpenAI

    if chat_history is None:
        chat_history = []

    llm = ChatOpenAI(temperature=0.1)
    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k":5})
    crc = ConversationalRetrievalChain.from_llm(llm, retriever)
    result = crc({"question": question, "chat_history": chat_history})
    chat_history.append((question, result["answer"]))

    return result, chat_history

In [69]:
# Start a new conversation with an empty history.
chat_history = list()

q = 'What did the president say about Ketanji Brown Jackson?'
# ask_with_memory returns the chain result together with the updated history.
result, chat_history = ask_with_memory(vector_store, q, chat_history)

print(result['answer'])
print('-' * 100)
print(chat_history)  # for debugging

The president said that Ketanji Brown Jackson is one of our nation's top legal minds and will continue Justice Breyer's legacy of excellence.
----------------------------------------------------------------------------------------------------
[('What did the president say about Ketanji Brown Jackson?', "The president said that Ketanji Brown Jackson is one of our nation's top legal minds and will continue Justice Breyer's legacy of excellence.")]


In [70]:
# Follow-up question; passing the existing chat_history along supplies the earlier turn.
q = 'Did he mention who she succeeded?'
result, chat_history = ask_with_memory(vector_store, q, chat_history)

print(result['answer'])
print('-' * 100)
print(chat_history)

He mentioned that Circuit Court of Appeals Judge Ketanji Brown Jackson will succeed Justice Breyer.
----------------------------------------------------------------------------------------------------
[('What did the president say about Ketanji Brown Jackson?', "The president said that Ketanji Brown Jackson is one of our nation's top legal minds and will continue Justice Breyer's legacy of excellence."), ('Did he mention who she succeeded?', 'He mentioned that Circuit Court of Appeals Judge Ketanji Brown Jackson will succeed Justice Breyer.')]


### Ask with Memory Loop

In [None]:
import time
i = 1

# One history list for the whole session; ask_with_memory appends to it in place.
chat_history = []

print("Write Quit or Exit to quit")
while True:
    # Strip surrounding whitespace so inputs such as ' quit ' still match, and
    # skip blank lines instead of sending an empty question to the model.
    q = input(f"Question #{i}").strip()
    if not q:
        continue
    i = i + 1
    if q.lower() in ["quit","exit"]:
        print("Qutting")
        time.sleep(2)
        break
    # The returned history is the same list object as chat_history, so it is ignored.
    result, _ = ask_with_memory(vector_store, q, chat_history)
    print (result['answer'])
    print("----------------------------------------------------------------------")