# Question-Answering Application Using LangChain and Chromadb

### Installing required libraries

In [1]:
!pip install openai -q
!pip install langchain -q
!pip install -q chromadb
!pip install python-dotenv -q
!pip install tiktoken -q
!pip install pypdf -q
!pip install docx2txt -q

### Load Environment Variable

In [3]:
import os
from dotenv import load_dotenv, find_dotenv
# Load variables from the .env file located by find_dotenv(); override=True
# is passed so that values from the file replace ones that are already set.
load_dotenv(find_dotenv(), override=True)

# Report only whether the key is present. The secret itself is never
# evaluated as a bare expression or printed, so it cannot end up in the
# notebook's saved output.
print("API Key Loaded:", os.environ.get('OPENAI_API_KEY') is not None)

API Key Loaded: True


### Load documents with different formats

In [4]:
def extract_text_from_document(file):
    """Load a PDF or DOCX file with the matching LangChain loader.

    The loader is selected from the file extension, which is compared
    case-insensitively so that names such as ``Report.PDF`` or ``Notes.Docx``
    are handled the same way as their lowercase forms.

    Args:
        file: Path of the document to load.

    Returns:
        The value returned by the selected loader's ``load()`` call, or
        ``None`` (after printing a message) when the extension is neither
        ``.pdf`` nor ``.docx``.
    """
    import os

    # Only the extension is needed; lowercase it so '.PDF' matches '.pdf'.
    extension = os.path.splitext(file)[1].lower()

    if extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader
        print(f'Loading {file}')
        loader = PyPDFLoader(file)
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        print(f'Loading {file}')
        loader = Docx2txtLoader(file)
    else:
        print('Document format is not supported by our application!')
        return None

    data = loader.load()
    return data

### Chunking Strategies and splitting the documents

In [5]:
def split_text_into_chunks(data, chunk_size=256, chunk_overlap=0):
    """Split loaded documents into smaller chunks.

    Args:
        data: Documents to split; passed unchanged to the splitter's
            ``split_documents``.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 0, the value that was previously
            hard-coded, so existing callers behave exactly as before.

    Returns:
        The chunks produced by ``split_documents``.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = text_splitter.split_documents(data)

    return chunks

### Create the function for answering questions from the vector store

In [6]:
def generate_answer_from_vector_store(vector_store, question, k=3, model='gpt-4', temperature=1):
    """Answer a question with a RetrievalQA chain built over a vector store.

    Args:
        vector_store: Object exposing ``as_retriever``; used as the retrieval
            source for the chain.
        question: The question passed to the chain's ``invoke``.
        k: Value placed in the retriever's ``search_kwargs`` under ``'k'``.
            Defaults to 3, the value that was previously hard-coded.
        model: Model name passed to ``ChatOpenAI``. Defaults to ``'gpt-4'``.
        temperature: Temperature passed to ``ChatOpenAI``. Defaults to 1,
            the value that was previously hard-coded.

    Returns:
        Whatever ``chain.invoke(question)`` returns. The notebook reads the
        answer text from its ``'result'`` key.
    """
    from langchain.chains import RetrievalQA
    from langchain_openai import ChatOpenAI

    llm = ChatOpenAI(model=model, temperature=temperature)

    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': k})

    # "stuff" is the chain_type handed to RetrievalQA.from_chain_type.
    chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

    answer = chain.invoke(question)

    return answer

### Define the function that creates the embeddings

In [7]:
def create_embeddings_chroma(chunks, persist_directory='./chroma_db'):
    """Build a Chroma vector store from document chunks.

    Args:
        chunks: Documents handed to ``Chroma.from_documents``.
        persist_directory: Directory passed to Chroma as
            ``persist_directory``.

    Returns:
        The store returned by ``Chroma.from_documents``.
    """
    from langchain_openai import OpenAIEmbeddings
    from langchain.vectorstores import Chroma

    # Embedding model configuration used when building the store.
    embedder = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)

    return Chroma.from_documents(chunks, embedder, persist_directory=persist_directory)

### Define the function that loads the embeddings

In [8]:
def load_embeddings_chroma(persist_directory='./chroma_db'):
    """Open a Chroma vector store from a persistence directory.

    Args:
        persist_directory: Directory passed to ``Chroma`` as
            ``persist_directory``.

    Returns:
        The ``Chroma`` instance, constructed with an ``OpenAIEmbeddings``
        object as its ``embedding_function``.
    """
    from langchain_openai import OpenAIEmbeddings
    from langchain.vectorstores import Chroma

    # Same embedding settings as used by create_embeddings_chroma.
    embedder = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)

    return Chroma(persist_directory=persist_directory, embedding_function=embedder)

### Testing

In [9]:
# Load the sample PDF; extract_text_from_document returns None for unsupported
# extensions, so a .pdf path is required here.
data = extract_text_from_document('Files/GoogleGeminiFamily.pdf')
# Split the loaded documents into chunks (chunk_size=256).
chunks = split_text_into_chunks(data, chunk_size=256)
# Embed the chunks into a Chroma store using the function's default
# persist_directory ('./chroma_db').
vector_store = create_embeddings_chroma(chunks)

Loading Files/GoogleGeminiFamily.pdf


In [12]:
# Ask a question against the vector store built in the previous cell.
question = 'What is Google Gemini Family?'
answer = generate_answer_from_vector_store(vector_store, question)
# Print only the 'result' entry of the returned value.
print(answer['result'])

The "Gemini family" likely refers to the members involved in the Gemini project at Google. This includes Google DeepMind (GDM), Google Research (GR), Knowledge and Information (K&I), Core ML, Cloud, Labs, and more.
