In [1]:
import os
from dotenv import load_dotenv, find_dotenv
# Find a .env file and load its variables into the process environment.
# override=True lets values from the file replace variables that are already set.
# NOTE(review): presumably this provides the API keys for the OpenAI / Pinecone clients
# created further down with no explicit credentials — confirm against the .env file.
load_dotenv(find_dotenv(), override=True)

True

In [2]:
# Explicit %pip magic: does not rely on IPython automagic being enabled and
# installs into the environment of the running kernel.
%pip install -q pypdf

Note: you may need to restart the kernel to use updated packages.


In [4]:
# Loading PDF, DOCX and TXT files as LangChain Documents
def load_document(file):
    """Load a PDF, DOCX or TXT file through the matching LangChain document loader.

    The loader is chosen from the file extension, which is compared
    case-insensitively so that ``report.PDF`` is handled like ``report.pdf``.

    Args:
        file: Path of the document to load.

    Returns:
        Whatever the selected loader's ``load()`` returns, or ``None`` when the
        extension is not one of ``.pdf``, ``.docx`` or ``.txt`` (a message is
        printed in that case).
    """
    import os

    extension = os.path.splitext(file)[1].lower()

    # Pick the loader class for the detected format. The imports use
    # langchain_community, the package this notebook already relies on for its
    # vector stores; ``langchain.document_loaders`` is a deprecated alias.
    if extension == '.pdf':
        from langchain_community.document_loaders import PyPDFLoader as loader_cls
    elif extension == '.docx':
        from langchain_community.document_loaders import Docx2txtLoader as loader_cls
    elif extension == '.txt':
        from langchain_community.document_loaders import TextLoader as loader_cls
    else:
        print('Document format is not supported!')
        return None

    # Announce every supported format (the .txt branch used to stay silent).
    print(f'Loading {file}')
    return loader_cls(file).load()

In [8]:
# Load the PDF and print the page_content of the second loaded document (index 1).
data = load_document("files/us_constitution.pdf")
print(data[1].page_content)

Loading files/us_constitution.pdf
The
House
of
Representatives
shall
be
composed
of
Members
chosen
every
second
Y ear
by
the
People
of
the
several
States,
and
the
Electors
in
each
State
shall
have
the
Qualifications
requisite
for
Electors
of
the
most
numerous
Branch
of
the
State
Legislature.
No
Person
shall
be
a
Representative
who
shall
not
have
attained
to
the
Age
of
twenty
five
Y ears,
and
been
seven
Y ears
a
Citizen
of
the
United
States,
and
who
shall
not,
when
elected,
be
an
Inhabitant
of
that
State
in
which
he
shall
be
chosen.
Representatives
and
direct
T axes
shall
be
apportioned
among
the
several
States
which
may
be
included
within
this
Union,
according
to
their
respective
Numbers,
which
shall
be
determined
by
adding
to
the
whole
Number
of
free
Persons,
including
those
bound
to
Service
for
a
T erm
of
Y ears,
and
excluding
Indians
not
taxed,
three
fifths
of
all
other
Persons.
The
actual
Enumeration
shall
be
made
within
three
Y ears
after
the
first
Meeting
of
the
Congress
of
the
U

In [9]:
# Wikipedia
def load_from_wikipedia(query, lang='en', load_max_docs=2):
    """Fetch Wikipedia content for a query as LangChain documents.

    Args:
        query: Search text handed to ``WikipediaLoader``.
        lang: Language code handed to the loader (default ``'en'``).
        load_max_docs: Document limit handed to the loader (default 2).

    Returns:
        The result of ``WikipediaLoader.load()``.
    """
    # langchain_community is the package this notebook already uses for its
    # Pinecone vector store; ``langchain.document_loaders`` is a deprecated alias.
    from langchain_community.document_loaders import WikipediaLoader

    loader = WikipediaLoader(query=query, lang=lang, load_max_docs=load_max_docs)
    return loader.load()

In [11]:
data = load_from_wikipedia("GPT-4")
print(data[0].page_content)

Generative Pre-trained Transformer 4 (GPT-4) is a multimodal large language model created by OpenAI, and the fourth in its series of GPT foundation models. It was launched on March 14, 2023, and made publicly available via the paid chatbot product ChatGPT Plus, via OpenAI's API, and via the free chatbot Microsoft Copilot.  As a transformer-based model, GPT-4 uses a paradigm where pre-training using both public data and "data licensed from third-party providers" is used to predict the next token. After this step, the model was then fine-tuned with reinforcement learning feedback from humans and AI for human alignment and policy compliance.: 2 
Observers reported that the iteration of ChatGPT using GPT-4 was an improvement on the previous iteration based on GPT-3.5, with the caveat that GPT-4 retains some of the problems with earlier revisions. GPT-4, equipped with vision capabilities (GPT-4V), is capable of taking images as input on ChatGPT. OpenAI has declined to reveal various technic

In [20]:
# Reload the PDF and report how many documents were loaded and how long one of them is.
data = load_document('files/us_constitution.pdf')
# print(data[1].page_content)
# print(data[10].metadata)

# data[20] is an arbitrary sample; this raises IndexError when fewer than 21 documents were loaded.
print(f'You have {len(data)} pages in your data')
print(f'There are {len(data[20].page_content)} characters in the page')

Loading files/us_constitution.pdf
You have 41 pages in your data
There are 1137 characters in the page


In [12]:
def chunk_data(data, chunk_size=256, chunk_overlap=0):
    """Split documents into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        data: Documents to split (passed to ``split_documents``).
        chunk_size: ``chunk_size`` given to the splitter (default 256).
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to 0,
            the value that used to be hard-coded, so existing calls behave
            exactly as before.

    Returns:
        The chunks returned by ``split_documents``.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(data)

In [13]:
def print_embedding_cost(texts, model='text-embedding-3-small', price_per_1k_tokens=0.00002):
    """Print the token count of ``texts`` and the estimated embedding cost in USD.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
        model: Model name used to select the tiktoken encoding.
        price_per_1k_tokens: USD price per 1,000 tokens. The default is the
            rate that used to be hard-coded; check https://openai.com/pricing
            for current prices.

    Returns:
        None. The two figures are printed.
    """
    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * price_per_1k_tokens:.6f}')

In [14]:
def insert_or_fetch_embeddings(index_name, chunks, dimensions=1536):
    """Return a Pinecone vector store for ``index_name``, creating the index if needed.

    If an index with that name is already listed by the Pinecone client, it is
    wrapped with ``Pinecone.from_existing_index`` and ``chunks`` is not used.
    Otherwise a serverless index (aws / us-east-1, cosine metric) is created
    and ``chunks`` are embedded into it with ``Pinecone.from_documents``.

    Args:
        index_name: Name of the Pinecone index to load or create.
        chunks: Documents to embed when the index has to be created.
        dimensions: Size of the embedding vectors. The same value is used for
            the ``OpenAIEmbeddings`` model and for the new index, so the two
            cannot get out of sync. Defaults to 1536, the previous fixed value.

    Returns:
        The vector store object produced by the LangChain ``Pinecone`` class.
    """
    import pinecone
    from langchain_community.vectorstores import Pinecone
    from langchain_openai import OpenAIEmbeddings
    from pinecone import ServerlessSpec

    # No arguments are passed to the client.
    # NOTE(review): presumably credentials come from the environment — confirm.
    pc = pinecone.Pinecone()

    embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=dimensions)

    if index_name in pc.list_indexes().names():
        # Existing index: only attach to it.
        print(f'Index {index_name} already exists. Loading embeddings ... ', end='')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('Ok')
    else:
        print(f'Creating index {index_name} and embeddings ...', end='')

        # The index dimension must equal the embedding size chosen above.
        pc.create_index(
            name=index_name,
            dimension=dimensions,
            metric='cosine',
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )

        # Embed the chunks with `embeddings` and insert them into the new index.
        vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
        print('Ok')

    return vector_store

In [24]:
def delete_pinecone_index(index_name='all'):
    """Delete one Pinecone index, or every index when ``index_name`` is ``'all'``.

    Args:
        index_name: Name of the index to delete. The default ``'all'`` deletes
            each index reported by the client's ``list_indexes()``.
    """
    import pinecone
    client = pinecone.Pinecone()

    # Single-index case first, so the bulk path below reads straight through.
    if index_name != 'all':
        print(f'Deleting index {index_name} ...', end='')
        client.delete_index(index_name)
        print('Ok')
        return

    targets = client.list_indexes().names()
    print('Deleting all indexes ... ')
    for target in targets:
        client.delete_index(target)
    print('Ok')

In [21]:
chunks = chunk_data(data)
print(len(chunks))
# print(chunks[10].page_content)

190


In [22]:
print_embedding_cost(chunks)

Total Tokens: 16711
Embedding Cost in USD: 0.000334


In [26]:
# Explicit %pip magic: does not rely on IPython automagic being enabled and
# installs into the environment of the running kernel.
%pip install -q IProgress

Note: you may need to restart the kernel to use updated packages.


In [27]:
# WARNING: called without arguments this uses index_name='all' and deletes
# every index returned by the Pinecone client's list_indexes().
delete_pinecone_index()

Deleting all indexes ... 
Ok


In [28]:
index_name = 'askadocument'
vector_store = insert_or_fetch_embeddings(index_name=index_name, chunks=chunks)

Creating index askadocument and embeddings ...Ok


## Asking and Getting Answers

In [29]:
def ask_and_get_answer(vector_store, q, k=3, model='gpt-4o-mini', temperature=1):
    """Answer a question with a ``RetrievalQA`` chain over ``vector_store``.

    Args:
        vector_store: Object exposing ``as_retriever``; used for a similarity
            search.
        q: The question, passed to the chain's ``invoke``.
        k: ``k`` value placed in the retriever's ``search_kwargs`` (default 3).
        model: Chat model name given to ``ChatOpenAI``. Defaults to the value
            that used to be hard-coded.
        temperature: Temperature given to ``ChatOpenAI``. Defaults to 1, the
            value that used to be hard-coded.

    Returns:
        Whatever ``chain.invoke(q)`` returns. The callers in this notebook read
        the ``'result'`` key from it.
    """
    from langchain.chains import RetrievalQA
    from langchain_openai import ChatOpenAI

    llm = ChatOpenAI(model=model, temperature=temperature)
    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': k})
    chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

    return chain.invoke(q)

In [33]:
q = "What is the whole document about?"
answer = ask_and_get_answer(vector_store, q)
print(answer['result'])

The text provided is excerpts from the United States Constitution. The Constitution outlines the framework for the government of the United States, establishing the branches of government, their powers, and the rights of the citizens. It serves as the supreme law of the United States.


## While Loop For Asking Questions

In [35]:
import time

# Interactive loop: keep answering questions until the user types quit or exit.
i = 1
print('Write Quit or Exit to quit.')
while True:
    # Surrounding whitespace is dropped so that e.g. 'quit ' still ends the loop.
    q = input(f'Question #{i}: ').strip()
    if not q:
        # Nothing was typed: ask again instead of sending an empty question to the chain.
        continue
    i = i + 1
    if q.lower() in ['quit', 'exit']:
        print('Quitting ... bye bye!')
        time.sleep(2)
        break

    answer = ask_and_get_answer(vector_store, q)
    # Double quotes inside the single-quoted f-string: reusing the outer quote
    # character is a SyntaxError on Python versions before 3.12.
    print(f'\nAnswer: {answer["result"]}')
    print(f'\n {"-" * 50} \n')

Write Quit or Exit to quit.

Answer: The First Amendment of the United States Constitution guarantees the freedom of religion, speech, press, assembly, and the right to petition the government.

 -------------------------------------------------- 


Answer: The concept of federalism as mentioned in the Constitution refers to the division of power between the central government and the individual states. It establishes a system where both levels of government have their own distinct powers and responsibilities. This division of power is intended to create a balance between a strong central authority and individual state autonomy. The Constitution outlines specific powers granted to the federal government, powers reserved for the states, and powers shared between the two levels of government. This framework of federalism ensures that no single entity has unchecked power and helps maintain a system of checks and balances in the United States government.

 ---------------------------------

In [36]:
# WARNING: called without arguments this uses index_name='all' and deletes
# every index returned by the Pinecone client's list_indexes().
delete_pinecone_index()

Deleting all indexes ... 
Ok


In [37]:
data = load_from_wikipedia('Google Gemini', 'de')
chunks = chunk_data(data)
chunks

[Document(metadata={'title': 'Google Gemini', 'summary': 'Google Gemini (ehemals Google Bard) ist ein von Google entwickelter KI-basierter, multimodaler Chatbot. Er wurde als direkte Reaktion auf den Erfolg von ChatGPT entwickelt und im März 2023 in eingeschränkter Kapazität veröffentlicht, bevor er im Laufe des Sommers in weiteren Ländern verfügbar wurde. Google Gemini ist in über 40 Sprachen verfügbar.', 'source': 'https://de.wikipedia.org/wiki/Google_Gemini'}, page_content='Google Gemini (ehemals Google Bard) ist ein von Google entwickelter KI-basierter, multimodaler Chatbot. Er wurde als direkte Reaktion auf den Erfolg von ChatGPT entwickelt und im März 2023 in eingeschränkter Kapazität veröffentlicht, bevor er im Laufe des'),
 Document(metadata={'title': 'Google Gemini', 'summary': 'Google Gemini (ehemals Google Bard) ist ein von Google entwickelter KI-basierter, multimodaler Chatbot. Er wurde als direkte Reaktion auf den Erfolg von ChatGPT entwickelt und im März 2023 in eingeschr

In [38]:
index_name = 'gemini'
vector_store = insert_or_fetch_embeddings(index_name=index_name, chunks=chunks)

Creating index gemini and embeddings ...Ok


In [39]:
q = 'Was ist Google Gemini?'
answer = ask_and_get_answer(vector_store, q)
print(answer["result"])

Google Gemini ist ein KI-basierter, multimodaler Chatbot, der von Google entwickelt wurde. Er war zuvor unter dem Namen Google Bard bekannt. Gemini bietet eine kostenlose Basisversion an, während eine leistungsfähigere Version namens Gemini Advanced kostenpflichtig ist und nur über Google One AI Premium zu buchen ist.


## Using Chroma as a Vector DB

In [45]:
# Explicit %pip magic: does not rely on IPython automagic being enabled and
# installs into the environment of the running kernel.
%pip install -q chromadb

Note: you may need to restart the kernel to use updated packages.


In [48]:
def create_embeddings_chroma(chunks, persist_directory='./chroma_db'):
    """Embed ``chunks`` into a Chroma vector store saved under ``persist_directory``.

    Args:
        chunks: Documents handed to ``Chroma.from_documents``.
        persist_directory: Directory passed to Chroma for persistence
            (default ``'./chroma_db'``).

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    # Imported from langchain_community, like the Pinecone store elsewhere in
    # this notebook; ``langchain.vectorstores`` is a deprecated alias.
    from langchain_community.vectorstores import Chroma
    from langchain_openai import OpenAIEmbeddings

    # Must match the model and dimensions used when the store is loaded again.
    embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)

    # NOTE(review): running this again against an existing directory presumably
    # adds the chunks a second time — confirm before re-running.
    return Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)

In [49]:
def load_embeddings_chroma(persist_directory='./chroma_db'):
    """Open the Chroma vector store stored under ``persist_directory``.

    Args:
        persist_directory: Directory passed to ``Chroma``
            (default ``'./chroma_db'``).

    Returns:
        A ``Chroma`` instance configured with the same embedding model and
        dimensions that ``create_embeddings_chroma`` uses.
    """
    # Imported from langchain_community, like the Pinecone store elsewhere in
    # this notebook; ``langchain.vectorstores`` is a deprecated alias.
    from langchain_community.vectorstores import Chroma
    from langchain_openai import OpenAIEmbeddings

    # Same embedding configuration as at creation time.
    embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)

    return Chroma(persist_directory=persist_directory, embedding_function=embeddings)


In [50]:
data = load_document("files/rag_powered_by_google_search.pdf")
chunks = chunk_data(data, chunk_size = 256)
vector_store = create_embeddings_chroma(chunks)

Loading files/rag_powered_by_google_search.pdf


In [52]:
# Asking questions
q = 'What is Vertex AI Search?'
answer = ask_and_get_answer(vector_store, q)
print(answer["result"])

Vertex AI Search is a feature that offers customizable answers, search tuning, vector search, grounding, and compliance updates for enterprises. It provides generative AI capabilities and enterprise-ready features for improved search experiences.


In [58]:
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain  # chain that combines retrieval with chat history
from langchain.memory import ConversationBufferMemory  # memory object handed to the chain below

# Chat model for the conversational chain (temperature=0 here, whereas ask_and_get_answer defaults to 1)
llm = ChatOpenAI(model_name='gpt-4o-mini', temperature=0)

# Retriever over the current `vector_store`: similarity search with k=5
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 5})


# Conversation memory kept under the 'chat_history' key; return_messages=True keeps the turns as message objects
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# `crc` is the module-level chain used by the question cells that follow.
crc = ConversationalRetrievalChain.from_llm(
    llm=llm,  # chat model defined above
    retriever=retriever,  # vector-store retriever defined above
    memory=memory,  # conversation memory defined above
    chain_type='stuff',  # same chain type that ask_and_get_answer uses
    verbose=False  # Set to True to enable verbose logging for debugging
)

In [67]:
# Helper for sending a single question to a chain
def ask_question(q, chain):
    """Invoke ``chain`` with ``q`` under the ``'question'`` key and return its result."""
    payload = {'question': q}
    return chain.invoke(payload)

In [60]:
q = 'How many pairs of questions and answers had the StackOverflow dataset?'
result = ask_question(q, crc)
print(result["answer"])

The Stack Overflow dataset had 8 million pairs of questions and answers.


In [68]:
# Follow-up question that only makes sense with the chain's conversation memory ("that number").
# NOTE(review): the recorded chat_history contains this question twice, so the cell appears to have
# been run two times. Each run adds another turn to `memory`, so re-running changes the answer.
q = 'Multiply that number by 10.'
result = ask_question(q, crc)
print(result)

{'question': 'Multiply that number by 10.', 'chat_history': [HumanMessage(content='How many pairs of questions and answers had the StackOverflow dataset?'), AIMessage(content='The Stack Overflow dataset had 8 million pairs of questions and answers.'), HumanMessage(content='Multiply that number by 10.'), AIMessage(content='8 million multiplied by 10 is 80 million.'), HumanMessage(content='Multiply that number by 10.'), AIMessage(content='80 million multiplied by 10 is 800 million.')], 'answer': '80 million multiplied by 10 is 800 million.'}


In [69]:
print(result["answer"])

80 million multiplied by 10 is 800 million.


In [70]:
# Print each entry of the chat history returned with the last result, if there is one.
if 'chat_history' in result:
    # NOTE: `i` rebinds the module-level name that the interactive question loop uses as its counter.
    for i in result['chat_history']:
        print(i)
else:
    print("No chat history found.")

content='How many pairs of questions and answers had the StackOverflow dataset?'
content='The Stack Overflow dataset had 8 million pairs of questions and answers.'
content='Multiply that number by 10.'
content='8 million multiplied by 10 is 80 million.'
content='Multiply that number by 10.'
content='80 million multiplied by 10 is 800 million.'


## Using Custom Prompts

In [None]:
# Rebuild the conversational chain with a custom prompt.
# This rebinds `llm`, `retriever`, `memory` and `crc`; a new ConversationBufferMemory is created for the new chain.
# NOTE(review): this cell has no execution count, and the chat_history printed by the cells after it
# still contains the earlier conversation. It looks like those cells ran against the previous `crc`.
# Re-run the notebook top to bottom to confirm.
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate

llm = ChatOpenAI(model_name='gpt-4o-mini', temperature=0)
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 5})
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)


# System message: answer from the supplied {context}, otherwise reply "I don't know."
system_template = r'''
Use the following pieces of context to answer the user's question.
If you don't find the answer in the provided context, just respond "I don't know."
---------------
Context: ```{context}```
'''

# Human message carrying the {question}.
user_template = '''
Question: ```{question}```
'''

# Order matters: system message first, then the human message.
messages= [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template(user_template)
]

qa_prompt = ChatPromptTemplate.from_messages(messages)

crc = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    chain_type='stuff',
    # Hand the custom prompt to the chain through combine_docs_chain_kwargs.
    combine_docs_chain_kwargs={'prompt': qa_prompt },
    verbose=True
)

In [71]:
# Load the persisted Chroma store, then ask the chain a question.
# NOTE(review): `db` is not passed to anything in this cell. `crc` was built from a retriever over
# `vector_store`, so the reloaded store takes no part in this query. Confirm whether the retriever
# was meant to be rebuilt from `db`.
db = load_embeddings_chroma()
q = 'How many pairs of questions and answers had the StackOverflow dataset?'
result = ask_question(q, crc)
print(result)

  vector_store = Chroma(persist_directory=persist_directory, embedding_function=embeddings)


{'question': 'How many pairs of questions and answers had the StackOverflow dataset?', 'chat_history': [HumanMessage(content='How many pairs of questions and answers had the StackOverflow dataset?'), AIMessage(content='The Stack Overflow dataset had 8 million pairs of questions and answers.'), HumanMessage(content='Multiply that number by 10.'), AIMessage(content='8 million multiplied by 10 is 80 million.'), HumanMessage(content='Multiply that number by 10.'), AIMessage(content='80 million multiplied by 10 is 800 million.'), HumanMessage(content='How many pairs of questions and answers had the StackOverflow dataset?'), AIMessage(content="I don't know.")], 'answer': "I don't know."}


In [73]:
result["answer"]

"I don't know."

In [74]:
q = 'When was Elon Musk born?'
result = ask_question(q, crc)
print(result)

{'question': 'When was Elon Musk born?', 'chat_history': [HumanMessage(content='How many pairs of questions and answers had the StackOverflow dataset?'), AIMessage(content='The Stack Overflow dataset had 8 million pairs of questions and answers.'), HumanMessage(content='Multiply that number by 10.'), AIMessage(content='8 million multiplied by 10 is 80 million.'), HumanMessage(content='Multiply that number by 10.'), AIMessage(content='80 million multiplied by 10 is 800 million.'), HumanMessage(content='How many pairs of questions and answers had the StackOverflow dataset?'), AIMessage(content="I don't know."), HumanMessage(content='When was Elon Musk born?'), AIMessage(content="I don't know.")], 'answer': "I don't know."}


In [75]:
result["answer"]

"I don't know."