In [1]:
import pandas as pd
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_groq import ChatGroq
from langchain_community.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from langchain_core.prompts import ChatPromptTemplate,PromptTemplate
from langchain_community.vectorstores import chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEndpoint
from langchain import HuggingFacePipeline
from transformers import pipeline
# RAG Chain
from langchain_core.output_parsers import StrOutputParser
from langchain.chains import RetrievalQA
from transformers import AutoModelForCausalLM,AutoTokenizer
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM
from langchain import HuggingFacePipeline
import transformers
import torch  
import os
from langchain_huggingface import HuggingFaceEmbeddings

  from .autonotebook import tqdm as notebook_tqdm


#### Defining the model

In [2]:
model = 'microsoft/phi-2'

tokenizer = AutoTokenizer.from_pretrained(model)

model = AutoModelForCausalLM.from_pretrained(model,torch_dtype="auto",device_map='auto')

pipe = pipeline("text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    max_new_tokens=256,
    temperature=0.5)

Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.57s/it]
Some parameters are on the meta device because they were offloaded to the cpu.
Device set to use cuda:0


#### Embbeding and using vector database

In [None]:
embedding = HuggingFaceEmbeddings(model_name="multi-qa-MiniLM-L6-cos-v1",model_kwargs={"device": "cuda"})
chroma_db = Chroma(
    persist_directory="./chroma_capstone_db_new_reduced_hugging_face",
    embedding_function=embedding,
    collection_name="Tomato"  # Specify which collection to load czx
)

### Chromadb as Retriever using mmr

In [4]:
question = "give me the cure for tomato plant in Kashmir in the summer months"
chroma_retriever = chroma_db.as_retriever(search_type="mmr", search_kwargs={"k": 6, "fetch_k":12})

chroma_retriever.get_relevant_documents(question)

  chroma_retriever.get_relevant_documents(question)


[Document(id='7a4e23f8-cc6d-4858-bafa-104203d9dde8', metadata={'disease_name': 'Late blight', 'StateName': 'ASSAM'}, page_content='DistrictName:DHUBRI\nStateName:ASSAM\nSeason_English:Monsoon Crop\nMonth:November\nDisease:Late blight\nQueryText:ASKING FOR ANY PLANT PROTECTION MEASURES TO BE ADOPTED AGAINST LATE BLIGHT DISEASE IN TOMATO\nKccAns:INDOFIL M-45 SOLUTION  25 GRAMS PER 1 LITRE OF WATER HAS BEEN SUGGESTED TO SPRAY ON THE CROP AT THE FOLLOWING STAGES: 1 FIRST AT 30 DAYS AFTER TRANSPLANTING 2 SECOND AT 60 DAYS AFTER TRANSPLANTING AND 3 THIRD AT 90 DAYS AFTER TRANSPLANTING AS THE PLANT PROTECTION MEASURES TO BE ADOPTED AGAINST LATE BLIGHT DISEASE IN TOMATO'),
 Document(id='2740b954-c961-4c6b-96bc-a5e30e496354', metadata={'StateName': 'JAMMU AND KASHMIR', 'disease_name': 'Aphids'}, page_content='DistrictName:BADGAM\nStateName:JAMMU AND KASHMIR\nSeason_English:Monsoon Crop\nMonth:April\nDisease:Aphids\nQueryText:HOW TO CONTROL APHIDS IN THGE SEEDLINGS OF TOMATO \nKccAns:SPRAY MELAT

#### Prompting

In [5]:
prompt_template = """
You are an agricultural assistant specialized in answering questions about plant diseases.  
Your task is to provide answers strictly based on the provided context when possible.  

Each document contains the following fields:  
- DistrictName  
- StateName  
- Season_English  
- Month  
- Disease  
- QueryText  
- KccAns (this is the official response section from source documents)

Guidelines for answering:
1. If a relevant answer is available in KccAns, use that with minimal changes.  
2. Use DistrictName, StateName, Season_English, Month, and Disease only to help interpret the question and select the correct KccAns, but **do not include these details in the final answer unless the question explicitly asks for them**.  
3. If the answer is not available in the context, then rely on your own agricultural knowledge to provide the best possible answer.  
4. Do not invent or assume information when KccAns is present; only fall back to your own knowledge when the context has no suitable answer.  

CONTEXT:
{context}

QUESTION:
{question}

OUTPUT:
"""
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)
chain_type_kwargs = {"prompt": PROMPT}

#### Retriever

In [6]:
h_retrieval_QA1 = RetrievalQA.from_chain_type(
    llm=HuggingFacePipeline(pipeline=pipe),
    chain_type="stuff",
    retriever=chroma_retriever,
    input_key="query",
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs
)

  llm=HuggingFacePipeline(pipeline=pipe),


In [8]:
print(h_retrieval_QA1.invoke({"query": question})["result"])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



You are an agricultural assistant specialized in answering questions about plant diseases.  
Your task is to provide answers strictly based on the provided context when possible.  

Each document contains the following fields:  
- DistrictName  
- StateName  
- Season_English  
- Month  
- Disease  
- QueryText  
- KccAns (this is the official response section from source documents)

Guidelines for answering:
1. If a relevant answer is available in KccAns, use that with minimal changes.  
2. Use DistrictName, StateName, Season_English, Month, and Disease only to help interpret the question and select the correct KccAns, but **do not include these details in the final answer unless the question explicitly asks for them**.  
3. If the answer is not available in the context, then rely on your own agricultural knowledge to provide the best possible answer.  
4. Do not invent or assume information when KccAns is present; only fall back to your own knowledge when the context has no suitable