In [1]:
from langchain.document_loaders import TextLoader
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings  # or HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter

# Load the data
loader1 = TextLoader("data.txt")
loader2 = TextLoader("data2.txt")
docs1 = loader1.load()
docs2 = loader2.load()

# Combine documents
all_docs = docs1 + docs2

# Split into smaller chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
split_docs = text_splitter.split_documents(all_docs)


Created a chunk of size 2415, which is longer than the specified 1000


In [3]:
from langchain_huggingface import HuggingFaceEmbeddings

embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

vectorstore = Chroma.from_documents(split_docs, embedding, persist_directory="./chroma_db")
vectorstore.persist()


In [None]:
import os
os.environ["OPENAI_API_KEY"] = ""


In [None]:
import os
os.environ["GOOGLE_API_KEY"] = ""



from langchain.chains import RetrievalQA
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI  # ✅ Gemini model

# Embedding setup (using HuggingFace)
embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Load vector store
vectorstore = Chroma(persist_directory="./chroma_db", embedding_function=embedding)

# Setup retriever
retriever = vectorstore.as_retriever()

# Optional: Test retriever
results = retriever.invoke("GA5 Question 8 Clarification")

for i, doc in enumerate(results, start=1):
    print(f"\n--- Document {i} ---")
    print(doc.page_content)

# ✅ Load Gemini model (gemini-pro)
llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0, convert_system_message_to_human=True)

# RAG pipeline
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)

# Query
query = "What is this text about?"
response = qa_chain.invoke({"query": query})

# Output
print("Answer:", response["result"])
print("\nSources:")
for i, doc in enumerate(response["source_documents"], start=1):
    print(f"\n--- Source {i} ---")
    print(doc.page_content)


ModuleNotFoundError: No module named 'langchain_google_genai'

In [2]:


# chat gpt

from langchain.chains import RetrievalQA
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI  # ✅ OpenAI model

# Embedding setup (you can keep using free local embeddings)
embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Load vector store
vectorstore = Chroma(persist_directory="./chroma_db", embedding_function=embedding)

# Setup retriever
retriever = vectorstore.as_retriever()

results = retriever.invoke("GA5 Question 8 Clarification")

# for i, doc in enumerate(results, start=1):
#     print(f"\n--- Document {i} ---")
#     print(doc.page_content)

# # Load OpenAI model (gpt-3.5-turbo or gpt-4)

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# RAG pipeline
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)

# Query
query = "What is this text about?"
response = qa_chain.invoke({"query": query})

print("Answer:", response["result"])


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [None]:


import os
os.environ["HUGGINGFACEHUB_API_TOKEN"] = ""
# Place this line at the very beginning of your script, before any HuggingFaceEndpoint initialization.

from langchain.chains import RetrievalQA
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEndpoint

# Load vector store
vectorstore = Chroma(persist_directory="./chroma_db", embedding_function=embedding)

# Setup retriever
retriever = vectorstore.as_retriever(search_kwargs={"k": 2})  # Ensure at least 2 results

llm = HuggingFaceEndpoint(
    repo_id="HuggingFaceH4/zephyr-7b-beta",
    max_new_tokens=512,
    temperature=0.1,
)

# RAG pipeline
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)

# Query and show response with sources
query = "GA5 Question 8 Clarification"
response = qa_chain.invoke({"query": query})

print("Answer:", response["result"])
print("\nSource Documents:")
for i, doc in enumerate(response["source_documents"][:2], 1):
    print(f"\nSource {i}:")
    print(doc.page_content[:200] + "...")  # Show first 200 characters of each source

Answer:  The question asks to calculate the average price of all products sold in a given month. The solution involves selecting the required month from the database, filtering the products sold in that month, calculating the total price of all products, and then dividing it by the number of products sold in that month to get the average price. The code provided in the solution uses the Pandas library in Python to perform these operations. The solution also includes error handling for cases where the selected month does not have any products sold.

Source Documents:

Source 1:
GA4 - Data Sourcing - Discussion Thread [TDS May 2025] => https://discourse.onlinedegree.iitm.ac.in/t/ga4-data-sourcing-discussion-thread-tds-may-2025/178881
Project1 - Virtual TA - Discussion Thread ...

Source 2:
GA4 - Data Sourcing - Discussion Thread [TDS May 2025] => https://discourse.onlinedegree.iitm.ac.in/t/ga4-data-sourcing-discussion-thread-tds-may-2025/178881
Project1 - Virtual TA - Discussion Thread .