In [1]:
import os
from dotenv import load_dotenv

In [2]:
load_dotenv()

True

In [3]:
## Test the LLM model
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model = 'gpt-4o-mini')

In [4]:
## Test the LLM model

llm = ChatOpenAI(model = 'gpt-4o-mini')
llm_response = llm.invoke("Tell me a joke")

llm_response

AIMessage(content='Why did the scarecrow win an award? \n\nBecause he was outstanding in his field!', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 18, 'prompt_tokens': 11, 'total_tokens': 29, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_8bda4d3a2c', 'id': 'chatcmpl-CBzDf43tGTJQFrPUa5c4HfSAuYeLx', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None}, id='run--590f3e82-e549-4bc1-a589-480b439df141-0', usage_metadata={'input_tokens': 11, 'output_tokens': 18, 'total_tokens': 29, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [5]:
## Test the Document splitter
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing import List
from langchain_core.documents import Document
import os
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_chroma import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [6]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 200,
    length_function=len
)
docx_loader = Docx2txtLoader("documents\GreenGrow Innovations_ Company History.docx")
documents = docx_loader.load()
print(len(documents))

splits = text_splitter.split_documents(documents)
print(f"Splits the documents into {len(splits)} chunks.")

1
Splits the documents into 2 chunks.


In [7]:
splits[0].metadata

{'source': 'documents\\GreenGrow Innovations_ Company History.docx'}

In [8]:
splits[0].page_content

'GreenGrow Innovations was founded in 2010 by Sarah Chen and Michael Rodriguez, two agricultural engineers with a passion for sustainable farming. The company started in a small garage in Portland, Oregon, with a simple mission: to make farming more environmentally friendly and efficient.\n\n\n\nIn its early days, GreenGrow focused on developing smart irrigation systems that could significantly reduce water usage in agriculture. Their first product, the WaterWise Sensor, was launched in 2012 and quickly gained popularity among local farmers. This success allowed the company to expand its research and development efforts.\n\n\n\nBy 2015, GreenGrow had outgrown its garage origins and moved into a proper office and research facility in the outskirts of Portland. This move coincided with the development of their second major product, the SoilHealth Monitor, which used advanced sensors to analyze soil composition and provide real-time recommendations for optimal crop growth.'

In [9]:
## Create a useful function to split the document

def load_documents(folder_path: str) -> List[Document]:
    documents = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if filename.endswith('.pdf'):
            loader = PyPDFLoader(file_path)
        elif filename.endswith('.docx'):
            loader = Docx2txtLoader(file_path)
        else:
            print(f"Unsupportd file type: {file_path}")
        documents.extend(loader.load())

    return documents

# Load documents from a folder
folder_path = "documents"
documents = load_documents(folder_path)

In [10]:
print(f"Loaded {len(documents)} documents from the folder.")
splits = text_splitter.split_documents(documents)
print(f"Split the documents into {len(splits)} chunks.")

Loaded 5 documents from the folder.
Split the documents into 8 chunks.


In [11]:
embed_model = SentenceTransformerEmbeddings(model_name = "all-MiniLM-L6-v2")
document_embeddings = embed_model.embed_documents([split.page_content for split in splits])
document_embeddings[0]

  embed_model = SentenceTransformerEmbeddings(model_name = "all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


[0.029335757717490196,
 -0.027797536924481392,
 -0.018463410437107086,
 -0.05892423912882805,
 0.08951884508132935,
 -0.02608221210539341,
 -0.10806204378604889,
 0.03759301081299782,
 -0.00500865513458848,
 -0.04168721288442612,
 -0.01638769544661045,
 -0.02612929977476597,
 -0.031359948217868805,
 0.010636319406330585,
 -0.07282694429159164,
 -0.0030147756915539503,
 -0.01876572147011757,
 -0.01873072236776352,
 0.0665455311536789,
 -0.07033522427082062,
 0.0032461038790643215,
 -0.015937551856040955,
 0.06552287191152573,
 0.014457416720688343,
 -0.011876823380589485,
 0.10337899625301361,
 -0.005314050707966089,
 0.0017421699594706297,
 0.003655016189441085,
 -0.036102183163166046,
 -0.012576725333929062,
 0.04590796306729317,
 0.030862493440508842,
 -0.054005078971385956,
 0.02634744718670845,
 0.08197517693042755,
 -0.03140774741768837,
 -0.04810932278633118,
 0.04922594502568245,
 -0.002444606041535735,
 0.005876438692212105,
 -0.014024660922586918,
 -0.003966038580983877,
 -0.0

In [12]:
len(document_embeddings[0])

384

In [13]:
collection_name = "my_collection"

## Create vectore store with documents
vector_store = Chroma.from_documents(
    documents = splits, 
    embedding = embed_model, 
    collection_name="adaptive_rag", 
    persist_directory="./chroma_db"
    )

In [14]:
print("Vector store created and persisted to './chroma_db'")

Vector store created and persisted to './chroma_db'


In [15]:
# Perform similarity search

query = "When was GreenGrow Innovations founded?"
search_results = vector_store.similarity_search(query, k=2)

In [16]:
print(f"\nTop 2 most related chuinks for the quey are: '{query}'\n")
for i, result in enumerate(search_results,1):
    print(f"Result {i}:")
    print(f"Source: {result.metadata.get('source', 'Unknown')}")
    print(f"Content: {result.page_content}")
    print()


Top 2 most related chuinks for the quey are: 'When was GreenGrow Innovations founded?'

Result 1:
Source: documents\GreenGrow Innovations_ Company History.docx
Content: GreenGrow Innovations was founded in 2010 by Sarah Chen and Michael Rodriguez, two agricultural engineers with a passion for sustainable farming. The company started in a small garage in Portland, Oregon, with a simple mission: to make farming more environmentally friendly and efficient.



In its early days, GreenGrow focused on developing smart irrigation systems that could significantly reduce water usage in agriculture. Their first product, the WaterWise Sensor, was launched in 2012 and quickly gained popularity among local farmers. This success allowed the company to expand its research and development efforts.



By 2015, GreenGrow had outgrown its garage origins and moved into a proper office and research facility in the outskirts of Portland. This move coincided with the development of their second major produc

In [17]:
template = ''' Answer the question based only on the following context:
{context}

Question: {question}
Answer:
'''

prompt = ChatPromptTemplate.from_template(
    template= template
)

In [18]:
dict([("constant","What is your name? Today is a sunny day with no rain possibility")])

{'constant': 'What is your name? Today is a sunny day with no rain possibility'}

In [19]:
## Example to create a runnable using prompt template
def constant_context(_):
    return "What is your name? Today is a sunny day with no rain possibility"

# rag_chain = (
#     {"context":dict([("constant","What is your name? Today is a sunny day with no rain possibility")]), "question": RunnablePassthrough()} | prompt
# )

rag_chain = (
    {"context":constant_context, "question": RunnablePassthrough()} | prompt
)

In [20]:
rag_chain.invoke("What is your name?")

ChatPromptValue(messages=[HumanMessage(content=' Answer the question based only on the following context:\nWhat is your name? Today is a sunny day with no rain possibility\n\nQuestion: What is your name?\nAnswer:\n', additional_kwargs={}, response_metadata={})])

In [21]:
retriever = vector_store.as_retriever(search_kwargs={"k": 2})

In [22]:
def doc2str(docs_):
    return "\n\n".join(docs.page_content for docs in docs_)

In [23]:
rag_chain = (
    {"context": retriever | doc2str, "question": RunnablePassthrough()} 
    |prompt
    |llm
    |StrOutputParser()
)

In [24]:
question = "When was GreenGrow Innovations founded?"
response = rag_chain.invoke(question)
print(response)

GreenGrow Innovations was founded in 2010.


In [25]:
## Handling Followup prompts
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.prompts import MessagesPlaceholder

chat_history = []
chat_history.extend([
    HumanMessage(content = question),
    AIMessage(content = response)
    ])

In [26]:
## Rewrite and contexualized the Prompt

rewrite_prompt_context = """
You are given:
- A conversation history between a user and an assistant.
- The most recent user question, which may depend on or reference earlier context.

Your task:
- Reformulate the latest user question into a clear, standalone query that can be understood without seeing the prior chat.
- Preserve the user’s original intent and meaning.
- If the question is already self-contained, return it unchanged.
- Do not attempt to answer the question.
"""

contextualized_q_prompt = ChatPromptTemplate.from_messages([
    ('system',rewrite_prompt_context),
    MessagesPlaceholder("chat_history"),
    ('human',"{input}")
])

In [27]:
contextualized_chain = contextualized_q_prompt | llm | StrOutputParser()

In [28]:
contextualized_chain.invoke({"input": "Where it is headquartered?", "chat_history" : chat_history})

'Where is the headquarters of GreenGrow Innovations located?'

In [29]:
## Now, contextualized prompt writing is not the end product that we wanted. However, we wanted retrieved document from the newly written contextualised query by LLM

from langchain.chains import create_history_aware_retriever

history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualized_q_prompt
)

history_aware_retriever.invoke({"input":"Where it is headquartered?","chat_history": chat_history })

[Document(id='ac671b61-cfe9-48e6-ab58-6bf761bcfec2', metadata={'source': 'documents\\GreenGrow Innovations_ Company History.docx'}, page_content="The company's breakthrough came in 2018 with the introduction of the EcoHarvest System, an integrated solution that combined smart irrigation, soil monitoring, and automated harvesting techniques. This system caught the attention of large-scale farmers across the United States, propelling GreenGrow to national prominence.\n\n\n\nToday, GreenGrow Innovations employs over 200 people and has expanded its operations to include offices in California and Iowa. The company continues to focus on developing sustainable agricultural technologies, with ongoing projects in vertical farming, drought-resistant crop development, and AI-powered farm management systems.\n\n\n\nDespite its growth, GreenGrow remains committed to its original mission of promoting sustainable farming practices. The company regularly partners with universities and research institu

In [30]:
retriever.invoke("Where it is headquartered?")

## See these retriver are very much away from finding headquarters of "GreenGrow Innovations". The output contains headquarters of other companies. Hence, contenxt awareness is important.

[Document(id='f44bc684-d3dc-4f08-a772-4b6a2c424f50', metadata={'source': 'documents\\Company_ TechWave Innovations.docx'}, page_content='Company: TechWave Innovations\n\nHeadquarters: TechWave Innovations is headquartered in San Francisco, California, USA. As a leader in cutting-edge AI and machine learning solutions, the company thrives in the heart of Silicon Valley, benefiting from its proximity to tech giants and a dynamic startup ecosystem. With its headquarters in this global technology hub, TechWave Innovations has access to top talent and a vast network of innovation-driven enterprises.'),
 Document(id='16f1e59d-bdb0-4c4e-a4fe-f0db52e40672', metadata={'source': 'documents\\Company_ TechWave Innovations.docx'}, page_content='Company: TechWave Innovations\n\nHeadquarters: TechWave Innovations is headquartered in San Francisco, California, USA. As a leader in cutting-edge AI and machine learning solutions, the company thrives in the heart of Silicon Valley, benefiting from its pro

In [31]:
## Now using the above history_aware_retriever we now need to create an LLM chain. for that we can used stuffed document chain of LLM to stuff the whole content retrived to LLM.
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

qa_prompts = ChatPromptTemplate.from_messages([
    ("system", "You are helpful AI Assistant. Use the following context to answer the user's question."),
    ("system", "Context: {context}"),
     MessagesPlaceholder(variable_name="chat_history"),
     ("human", "{input}")
])

question_answer_chain = create_stuff_documents_chain(llm,qa_prompts)  ## This will create a stuff document chain. That means stuff all the documents inside the context and then call LLM to genetrate the answer.
rag_chain = create_retrieval_chain(history_aware_retriever,question_answer_chain) ## This will retrieved the documents after contextualized from the user query and then passed to stuff document chain and generate the answer.

In [32]:
rag_chain.invoke({"input": "Where it is headquartered?", "chat_history":chat_history})

{'input': 'Where it is headquartered?',
 'chat_history': [HumanMessage(content='When was GreenGrow Innovations founded?', additional_kwargs={}, response_metadata={}),
  AIMessage(content='GreenGrow Innovations was founded in 2010.', additional_kwargs={}, response_metadata={})],
 'context': [Document(id='ac671b61-cfe9-48e6-ab58-6bf761bcfec2', metadata={'source': 'documents\\GreenGrow Innovations_ Company History.docx'}, page_content="The company's breakthrough came in 2018 with the introduction of the EcoHarvest System, an integrated solution that combined smart irrigation, soil monitoring, and automated harvesting techniques. This system caught the attention of large-scale farmers across the United States, propelling GreenGrow to national prominence.\n\n\n\nToday, GreenGrow Innovations employs over 200 people and has expanded its operations to include offices in California and Iowa. The company continues to focus on developing sustainable agricultural technologies, with ongoing project

#### Handle multiple users. Introduce a data base where we will store the user id, along with its query and AI answers to contextualized the next query.

In [43]:
import uuid
import sqlite3

In [44]:
DB_NAME = "rag_app.db"

def get_db_connection():
    conn = sqlite3.connect(DB_NAME)
    conn.row_factory = sqlite3.Row
    return conn

def create_application_logs():
    conn = get_db_connection()
    conn.execute('''CREATE TABLE IF NOT EXISTS application_logs
    (id INTEGER PRIMARY KEY AUTOINCREMENT,
    session_id TEXT,
    user_query TEXT,
    gpt_response TEXT,
    model TEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)''')
    conn.close()

def insert_application_logs(session_id, user_query, gpt_response, model):
    conn = get_db_connection()
    conn.execute('INSERT INTO application_logs (session_id, user_query, gpt_response, model) VALUES (?, ?, ?, ?)',
                 (session_id, user_query, gpt_response, model))
    conn.commit()
    conn.close()

def get_chat_history(session_id):
    conn = get_db_connection()
    cursor = conn.cursor()
    cursor.execute('SELECT user_query, gpt_response FROM application_logs WHERE session_id = ? ORDER BY created_at', (session_id,))
    messages = []
    for row in cursor.fetchall():
        messages.extend([
            {"role": "human", "content": row['user_query']},
            {"role": "ai", "content": row['gpt_response']}
        ])
    conn.close()
    return messages

# Initialize the database
create_application_logs()

In [48]:
conn = sqlite3.connect(DB_NAME)
cursor = conn.cursor()

cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
table_names = cursor.fetchall()
conn.close()

In [49]:
table_names

[('application_logs',), ('sqlite_sequence',)]

In [52]:
conn = sqlite3.connect(DB_NAME)
cursor = conn.cursor()

cursor.execute("SELECT * FROM application_logs") # Replace 'your_table' with your table name
rows = cursor.fetchall()
conn.close()

In [53]:
session_id = str(uuid.uuid4())
chat_history = get_chat_history(session_id)
user_question1 = "What is GreenGrow Innovations?"
response = rag_chain.invoke({"input":user_question1, "chat_history": chat_history})['answer']
insert_application_logs(session_id, user_question1, response, 'gpt-4-o')
print(f"Human: {user_question1}")
print(f"AI: {response}\n")

Human: What is GreenGrow Innovations?
AI: GreenGrow Innovations is a company that focuses on developing sustainable agricultural technologies. Founded on the mission of promoting sustainable farming practices, the company gained national prominence after the introduction of its EcoHarvest System in 2018. This system integrates smart irrigation, soil monitoring, and automated harvesting techniques, making it appealing to large-scale farmers across the United States.

As of now, GreenGrow employs over 200 people and has expanded its operations with offices in California and Iowa. The company is actively working on projects related to vertical farming, drought-resistant crop development, and AI-powered farm management systems. Additionally, GreenGrow partners with universities and research institutions to advance agricultural technology and organizes annual conferences to share knowledge within the farming community.



In [54]:
conn = sqlite3.connect(DB_NAME)
cursor = conn.cursor()

cursor.execute("SELECT * FROM application_logs") # Replace 'your_table' with your table name
rows = cursor.fetchall()
conn.close()

In [64]:
session_id = '3b7725f8-d873-42e2-8632-229a76447233'
chat_history = get_chat_history(session_id)
user_question1 = "What is it?"
response = rag_chain.invoke({"input":user_question1, "chat_history": chat_history})['answer']
insert_application_logs(session_id, user_question1, response, 'gpt-4-o')
print(f"Human: {user_question1}")
print(f"AI: {response}\n")

Human: What is it?
AI: GreenGrow Innovations is a company that specializes in developing and providing sustainable agricultural technologies. It aims to enhance farming practices through innovations such as smart irrigation, soil monitoring, automated harvesting, and other solutions designed to promote more efficient and environmentally friendly agriculture. The company's flagship product, the EcoHarvest System, has been a significant advancement in the agricultural sector, capturing the interest of large-scale farmers in the U.S. GreenGrow is committed to sustainability and regularly collaborates with academic and research institutions while hosting conferences to share knowledge within the industry.



In [65]:
conn = sqlite3.connect(DB_NAME)
cursor = conn.cursor()

cursor.execute("SELECT * FROM application_logs") # Replace 'your_table' with your table name
rows = cursor.fetchall()
conn.close()

In [66]:
rows

[(1,
  '3b7725f8-d873-42e2-8632-229a76447233',
  'What is GreenGrow Innovations?',
  'GreenGrow Innovations is a company that focuses on developing sustainable agricultural technologies. Founded on the mission of promoting sustainable farming practices, the company gained national prominence after the introduction of its EcoHarvest System in 2018. This system integrates smart irrigation, soil monitoring, and automated harvesting techniques, making it appealing to large-scale farmers across the United States.\n\nAs of now, GreenGrow employs over 200 people and has expanded its operations with offices in California and Iowa. The company is actively working on projects related to vertical farming, drought-resistant crop development, and AI-powered farm management systems. Additionally, GreenGrow partners with universities and research institutions to advance agricultural technology and organizes annual conferences to share knowledge within the farming community.',
  'gpt-4-o',
  '2025-09-

In [70]:
from enum import Enum

class Test(Enum):
    TEST1 = "hi"
    TEST2 = "test2"
    TEST3 = "test3"

In [72]:
Test.TEST1.value

'hi'

In [74]:
dict({'a': 1, 'b': 2})['a']

1

In [None]:
a = 5