<a href="https://colab.research.google.com/github/abmishra1234/LLM-Apps/blob/Development/Q%26AWithPrivateData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project - Question Answering on Private Documents Owned by Your Company or by Individuals

Installation Required For Project

In [None]:
!pip install -r requirement.txt

# Imports required for Project

In [None]:
import os
from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load its variables into the process environment.
# override=True lets values from the file replace variables that are already set.
# NOTE(review): presumably this supplies the OpenAI / Pinecone API keys the
# clients below read from the environment - confirm against your .env file.
load_dotenv(find_dotenv(), override=True)

# Loading Documents

In [3]:
# loading PDF, DOCX and TXT files as LangChain Documents
def load_document(file):
    """Load a PDF, DOCX or TXT file with the matching LangChain loader.

    The loader is selected from the file extension, compared
    case-insensitively so names such as ``report.PDF`` are accepted.
    A ``Loading <file>`` line is printed for every supported format.

    Args:
        file: Path of the document to load.

    Returns:
        The value returned by the selected loader's ``load()``, or ``None``
        (after printing a message) when the extension is not supported.
    """
    import os

    _, extension = os.path.splitext(file)
    extension = extension.lower()

    # Each loader class is imported on demand inside its own branch.
    if extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader as loader_cls
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader as loader_cls
    elif extension == '.txt':
        from langchain.document_loaders import TextLoader as loader_cls
    else:
        print('Document format is not supported!')
        return None

    print(f'Loading {file}')
    return loader_cls(file).load()


# wikipedia
def load_from_wikipedia(query, lang='en', load_max_docs=2):
    """Load Wikipedia content for ``query`` through LangChain's WikipediaLoader.

    Args:
        query: Search text handed to the loader.
        lang: Wikipedia language code handed to the loader.
        load_max_docs: Document limit handed to the loader.

    Returns:
        The value returned by the loader's ``load()``.
    """
    from langchain.document_loaders import WikipediaLoader

    return WikipediaLoader(
        query=query,
        lang=lang,
        load_max_docs=load_max_docs,
    ).load()


# Chunking Data

In [4]:
def chunk_data(data, chunk_size=256, chunk_overlap=0):
    """Split documents into chunks with RecursiveCharacterTextSplitter.

    Args:
        data: Documents to split (passed to ``split_documents``).
        chunk_size: ``chunk_size`` given to the splitter.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to 0,
            the value that was previously hard-coded.

    Returns:
        The chunks returned by ``split_documents``.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(data)

# Calculating the Cost

In [6]:
def print_embedding_cost(texts, model='text-embedding-3-small', usd_per_1k_tokens=0.00002):
    """Print the token count of ``texts`` and the resulting embedding cost.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
        model: Model name used to pick the tiktoken encoding.
        usd_per_1k_tokens: Price in USD per 1000 tokens. The default is the
            rate that was previously hard-coded in this function.

    Returns:
        None. The two summary lines are printed.
    """
    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * usd_per_1k_tokens:.6f}')

Embedding and Uploading to a Vector Database (Pinecone)

In [7]:
def insert_or_fetch_embeddings(index_name, chunks):
    """Return a Pinecone vector store for ``index_name``, creating it if needed.

    If an index called ``index_name`` already exists, a vector store is built
    on top of it and ``chunks`` is not used. Otherwise the index is created
    and ``chunks`` are embedded and inserted into it.

    Args:
        index_name: Name of the Pinecone index to load or create.
        chunks: Documents to embed when the index has to be created.

    Returns:
        The LangChain Pinecone vector store bound to ``index_name``.
    """
    # importing the necessary libraries and initializing the Pinecone client
    import pinecone
    from langchain_community.vectorstores import Pinecone
    from langchain_openai import OpenAIEmbeddings
    from pinecone import PodSpec

    # One value for both the embedding model and the index, so the two cannot
    # drift apart when the size is changed.
    dimension = 1536

    pc = pinecone.Pinecone()

    embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=dimension)

    # Compare against the index *names*: list_indexes() itself yields index
    # descriptions, so a plain `in` test on it never matches a name string.
    if index_name in pc.list_indexes().names():
        # loading from existing index
        print(f'Index {index_name} already exists. Loading embeddings ... ', end='')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('Ok')
    else:
        # creating the index and embedding the chunks into the index
        print(f'Creating index {index_name} and embeddings ...', end='')

        # creating a new index
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric='cosine',
            spec=PodSpec(
                environment='gcp-starter'
            )
        )

        # embed the documents with `embeddings`, insert them into the new index
        # and get back a vector store object bound to it
        vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
        print('Ok')

    return vector_store


In [8]:
def delete_pinecone_index(index_name='all'):
    """Delete one Pinecone index, or every index when ``index_name`` is 'all'.

    Args:
        index_name: Name of the index to delete. The default value 'all'
            deletes every index reported by the client.

    Returns:
        None. Progress messages are printed.
    """
    import pinecone

    client = pinecone.Pinecone()

    # Single named index: delete it and stop.
    if index_name != 'all':
        print(f'Deleting index {index_name} ...', end='')
        client.delete_index(index_name)
        print('Ok')
        return

    # 'all': fetch the names first, then delete them one by one.
    targets = client.list_indexes().names()
    print('Deleting all indexes ... ')
    for target in targets:
        client.delete_index(target)
    print('Ok')


# Asking and Getting Answers

In [9]:
def ask_and_get_answer(vector_store, q, k=3):
    """Answer ``q`` with a RetrievalQA chain built on ``vector_store``.

    A new gpt-3.5-turbo chat model, similarity retriever and "stuff" chain
    are created on every call.

    Args:
        vector_store: Object exposing ``as_retriever``.
        q: The question passed to the chain.
        k: Value placed under 'k' in the retriever's ``search_kwargs``.

    Returns:
        Whatever the chain's ``invoke`` returns for ``q``.
    """
    from langchain.chains import RetrievalQA
    from langchain_openai import ChatOpenAI

    qa_chain = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(model='gpt-3.5-turbo', temperature=1),
        chain_type="stuff",
        retriever=vector_store.as_retriever(
            search_type='similarity',
            search_kwargs={'k': k},
        ),
    )
    return qa_chain.invoke(q)


In [None]:
#pip install -q chromadb

In [11]:
def create_embeddings_chroma(chunks, persist_directory='./chroma_db'):
    """Embed ``chunks`` and build a Chroma vector store from them.

    Args:
        chunks: Documents to embed and store.
        persist_directory: Directory passed to Chroma as ``persist_directory``.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    from langchain.vectorstores import Chroma
    from langchain_openai import OpenAIEmbeddings

    # OpenAI embedding model: text-embedding-3-small with 1536 dimensions
    return Chroma.from_documents(
        chunks,
        OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536),
        persist_directory=persist_directory,
    )


In [12]:
def load_embeddings_chroma(persist_directory='./chroma_db'):
    """Open the Chroma vector store kept in ``persist_directory``.

    Args:
        persist_directory: Directory passed to Chroma as ``persist_directory``.

    Returns:
        A Chroma vector store using text-embedding-3-small (1536 dimensions)
        as its embedding function.
    """
    from langchain.vectorstores import Chroma
    from langchain_openai import OpenAIEmbeddings

    # Same model and dimensions as create_embeddings_chroma uses when writing
    embedding_fn = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)
    return Chroma(embedding_function=embedding_fn, persist_directory=persist_directory)


# Running Code

In [None]:
#!pip install langchain_openai

In [15]:
# Load the PDF as LangChain documents (load_document picks the loader from the file extension)
data = load_document('files/keph102.pdf')

# Split the loaded documents into chunks (chunk_size=256)
chunks = chunk_data(data, chunk_size=256)

# Embed the chunks and store them in a Chroma vector store.
# create_embeddings_chroma always embeds with text-embedding-3-small and
# persists to ./chroma_db unless another directory is passed.
vector_store = create_embeddings_chroma(chunks)

Loading files/keph102.pdf


In [19]:
# Ask a question against the vector store; the answer text is read from the 'result' key
q = 'What is the size of nucleus?'
answer = ask_and_get_answer(vector_store, q)
print(answer['result'])

The size of a nucleus is in the range of 10^(-15) m to 10^(-14) m.


In [21]:
# Load a Chroma vector store from the specified directory (default ./chroma_db)
db = load_embeddings_chroma()
q = 'What is atomic clock?'
# Query the store that was just reloaded from disk. Passing the in-memory
# `vector_store` here would leave `db` unused and never exercise the reload.
answer = ask_and_get_answer(db, q)
print(answer['result'])

An atomic clock is a highly accurate timekeeping device that uses the vibrations of atoms, typically cesium atoms, to regulate and measure time. They are considered the most accurate timepieces available and are used as standards for timekeeping around the world.


In [24]:
# Follow-up questions do not work here: ask_and_get_answer builds a new chain on
# every call and keeps no chat history, so this question has nothing to refer to.
q = 'What is the number?'
answer = ask_and_get_answer(vector_store, q)
print(answer['result'])

I don't have enough context to determine the specific number being asked about.


# Adding Memory ( Chat History )

In [37]:
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain  # chain that combines retrieval with chat history
from langchain.memory import ConversationBufferMemory  # memory object that stores the conversation

# Chat model used by the chain (temperature=0)
llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)

# Similarity retriever over the vector store, with k=5 in its search arguments.
# NOTE: the retriever is created from the object `vector_store` refers to when
# this cell runs; rebinding `vector_store` later does not change `retriever` or `crc`.
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 5})


# Conversation memory; the history is exposed under the 'chat_history' key
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

crc = ConversationalRetrievalChain.from_llm(
    llm=llm,  # chat model defined above
    retriever=retriever,  # retriever built from the vector store
    memory=memory,  # conversation memory
    chain_type='stuff',  # chain type passed to from_llm
    verbose=False  # set to True for verbose logging while debugging
)

In [38]:
# create a function to ask questions
def ask_question(q, chain):
    """Send ``q`` to ``chain`` under the 'question' key and return its response.

    Args:
        q: The question text.
        chain: Object exposing ``invoke`` that accepts a dict input.

    Returns:
        Whatever ``chain.invoke`` returns.
    """
    payload = {'question': q}
    return chain.invoke(payload)

In [39]:
# Load, chunk and embed a second PDF. This rebinds `data`, `chunks` and `vector_store`.
# The `crc` chain keeps the retriever it was created with, which was built from
# the previous `vector_store` object.
# NOTE(review): both stores use the default ./chroma_db directory - confirm
# whether the earlier retriever also sees the chunks added here.
data = load_document('files/Chem_11_Chapter02.pdf')
chunks = chunk_data(data, chunk_size=256)
vector_store = create_embeddings_chroma(chunks)

Loading files/Chem_11_Chapter02.pdf


In [40]:
# Ask through the conversational chain; the reply text is read from the 'answer' key
q = 'What is atomic clock?'
result = ask_question(q, crc)
print(result['answer'])

An atomic clock is a highly accurate timekeeping device that uses the vibrations of atoms to regulate the time. Specifically, cesium atomic clocks are commonly used as they are very accurate and provide a standard for measuring time intervals. These clocks are used in national standards laboratories to maintain precise time measurements.


In [41]:
# Print every entry of the chat history returned with the last result
for item in result['chat_history']:
    print(item)

content='What is atomic clock?'
content='An atomic clock is a highly accurate timekeeping device that uses the vibrations of atoms to regulate the time. Specifically, cesium atomic clocks are commonly used as they are very accurate and provide a standard for measuring time intervals. These clocks are used in national standards laboratories to maintain precise time measurements.'


# Testing code

In [44]:
# Load the PDF as LangChain documents (rebinds `data`)
data = load_document('files/The Mahabharata of Krishna-Dwaipayana Vyasa (Complete 18 Volumes) - Kisari Mohan Ganguli.pdf')

# Split the loaded documents into chunks (chunk_size=256; rebinds `chunks`)
chunks = chunk_data(data, chunk_size=256)



Loading files/The Mahabharata of Krishna-Dwaipayana Vyasa (Complete 18 Volumes) - Kisari Mohan Ganguli.pdf


In [45]:
# Print the total token count and estimated embedding cost for the chunks
print_embedding_cost(chunks)

Total Tokens: 4079197
Embedding Cost in USD: 0.081584


In [46]:
# Embed the chunks into a Chroma vector store. create_embeddings_chroma always
# uses text-embedding-3-small and persists to ./chroma_db by default.
vector_store = create_embeddings_chroma(chunks)

Loop for Asking Questions

In [47]:
# Interactive loop: keep answering questions until the user types exit, quit or bye.
while True:
    q = input('Your question: ')
    command = q.strip().lower()
    if not command:
        # Nothing was typed: prompt again instead of sending an empty question to the chain.
        continue
    # Match whole commands only. A substring test against 'exit quit bye' also
    # ends the loop for inputs such as '', 'e' or 'it'.
    if command in ('exit', 'quit', 'bye'):
        print('Bye bye!')
        break
    result = ask_question(q, crc)
    print(result['answer'])
    print('-' * 100)

Your question: Who is samvaran?
Samvarana is a king mentioned in Hindu mythology. He was the husband of Tapatī, the daughter of Surya, and the father of Kuru.
----------------------------------------------------------------------------------------------------
Your question: Create the genology of Krishna
Krishna is mentioned as being born of the Sattwata race and is referred to as the root of the Pandavas. He is also described as being related to Rishis Narada and Parvata, with Narada being his maternal uncle and Parvata being his sister's son. Krishna is said to be the substance of the twenty-four objects of knowledge and is named Krishna because he unites what is implied by the two words Krishi.
----------------------------------------------------------------------------------------------------
Your question: How is Sanjaya introduced in mahabharat
Sanjaya is introduced in the Mahabharata as a charioteer.
-------------------------------------------------------------------------------