## web scrapping 

In [1]:
import os
os.environ['user_agent']='dd'

In [2]:
from langchain_community.document_loaders import WebBaseLoader
from bs4.filter import SoupStrainer
from langchain.schema import Document

bs4_strainer = SoupStrainer(class_=("content-column-content"))

loader= WebBaseLoader(
    web_paths=("https://spb.kerala.gov.in/economic-review/ER2016/chapter02_03.php",),
    bs_kwargs={"parse_only": bs4_strainer}
)

docs=loader.load()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def format(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [4]:
document=format(docs)
document

"\nAgriculture and Allied Sectors\nCrop wise Analysis\nRice\nRice is the most important food crop grown in Kerala. It occupies 7.46 percent of the total cropped area of the state. However, the area under rice has been falling at an alarming rate ever since the 1980s. From 8.82 lakh hectare in 1974-75, the paddy area has come down to 1.96 lakh hectare in 2015-16. The production has also concomitantly declined from 13.76 lakh MT in 1972-73 (peak of production) to 5.49 lakh MT in 2015-16 (Appendix 2.4). Moreover, the productivity of the crop is very low in the State (2790 kg/ha), though it is higher than the national average (2424 kg/ha). There has only been a marginal increase in the productivity of rice in the past four decades. China, which is the major producer of rice in the world, reports a productivity (6744 kg/ha) more than three times the productivity of rice in Kerala. The productivity of rice in Egypt is the highest in the world (9088 kg/ha), which is nearly four-fold of our pr

## text splitting for manageable chunks

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter=RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1200, chunk_overlap=100, add_start_index=True
)

all_splits=text_splitter.split_documents(docs)

In [15]:
import json

with open("C:\krishisahayi\lang-chain-bot\data\scraped_chunks.json", "w", encoding="utf-8") as f:
    json.dump([doc.model_dump() for doc in all_splits], f, ensure_ascii=False, indent=2)

In [7]:
all_splits

[Document(metadata={'source': 'https://spb.kerala.gov.in/economic-review/ER2016/chapter02_03.php', 'start_index': 1}, page_content='Agriculture and Allied Sectors\nCrop wise Analysis\nRice\nRice is the most important food crop grown in Kerala. It occupies 7.46 percent of the total cropped area of the state. However, the area under rice has been falling at an alarming rate ever since the 1980s. From 8.82 lakh hectare in 1974-75, the paddy area has come down to 1.96 lakh hectare in 2015-16. The production has also concomitantly declined from 13.76 lakh MT in 1972-73 (peak of production) to 5.49 lakh MT in 2015-16 (Appendix 2.4). Moreover, the productivity of the crop is very low in the State (2790 kg/ha), though it is higher than the national average (2424 kg/ha). There has only been a marginal increase in the productivity of rice in the past four decades. China, which is the major producer of rice in the world, reports a productivity (6744 kg/ha) more than three times the productivity o

In [8]:
from langchain_huggingface import HuggingFaceEmbeddings

hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [9]:
from langchain_chroma import Chroma

vectorstore= Chroma.from_documents(documents=all_splits,embedding=hf_embeddings)

In [None]:
questions=" tell me all the type of crops grown in kerela? "
retriever= vectorstore.as_retriever(search_type="similarity",search_kwargs={"k":5})
retrieved_docs=retriever.invoke(questions)

retrieved_docs

[Document(id='d38db001-4dc1-4555-b24b-7bc3f8809b0c', metadata={'start_index': 24170, 'source': 'https://spb.kerala.gov.in/economic-review/ER2016/chapter02_03.php'}, page_content='For rice development, assistance was given to padasekhara samithies  for sustaining rice cultivation in 1.79 lakh ha through group farming  and for speciality rice cultivation in 890 ha.\n For coconut development, the Department established 10 coconut  nurseries and 474 organic manure production units, supported 26   Keragramams covering 12.300ha, distributed 2600 climbing devices \r\n              and installed irrigation units in 656ha. A total of 4.64 lakh nos.   of coconut seedlings and 4.43 lakh coconut plants, covering 2.50 lakh WCT, 1.34 lakh of Dwarf, 34,833 Number of TxD and 23,882 DxT \r\n              coconut plants were produced. \nFor vegetable development, the student community was mobilized and vegetable cultivation was carried out in 5592 educational institutions,            project based culti

In [11]:
context= ''.join([doc.page_content for doc in retrieved_docs])
context

'For rice development, assistance was given to padasekhara samithies  for sustaining rice cultivation in 1.79 lakh ha through group farming  and for speciality rice cultivation in 890 ha.\n For coconut development, the Department established 10 coconut  nurseries and 474 organic manure production units, supported 26   Keragramams covering 12.300ha, distributed 2600 climbing devices \r\n              and installed irrigation units in 656ha. A total of 4.64 lakh nos.   of coconut seedlings and 4.43 lakh coconut plants, covering 2.50 lakh WCT, 1.34 lakh of Dwarf, 34,833 Number of TxD and 23,882 DxT \r\n              coconut plants were produced. \nFor vegetable development, the student community was mobilized and vegetable cultivation was carried out in 5592 educational institutions,            project based cultivation in 334 institutions and 62 lakhs seed \r\n              kits and 31,654 grow bags were distributed. The department supported           800 clusters including 50 new cluste

In [12]:
from langchain_ollama.llms import OllamaLLM

llm=OllamaLLM(model="llama3.2:3b")

response= llm.invoke(f"""answer the question according to the context given very breifly:
                    Question:{questions}.
                    context:{context}
                    """)

In [13]:
print(response)

Here are the types of crops grown in Kerala:

1. Rice
2. Coconut
3. Pepper
4. Vegetables (various types)
5. Cashew


In [14]:
import graphviz
dot_code="""
digraph G {
    graph [
        rankdir=TB;
        labelloc="t";
        fontname="Helvetica,Arial,sans-serif";
        bgcolor="white";
        splines=spline;
    ];
    node [
        shape=rect;
        style="filled,rounded";
        fontname="Helvetica,Arial,sans-serif";
        fontsize=10;
        penwidth=1.5;
        color="#2E4A62";
        fillcolor="#F0F8FF";
    ];
    edge [
        fontname="Helvetica,Arial,sans-serif";
        fontsize=9;
        arrowsize=0.7;
        color="#696969";
    ];

    // User Interaction & Input
    subgraph cluster_0 {
        label="User Input";
        rank=same;
        style=filled;
        color="#E0E0E0";
        node [fillcolor="#DDEEFF"];
        Farmer [label="Farmer\n(Malayalam Voice/Text)", shape=oval];
        Voice_to_Text [label="Voice-to-Text\n(Malayalam)"];
        PWA_App [label="PWA/Mobile App\n(Text)"];

        Farmer -> Voice_to_Text;
        Farmer -> PWA_App;
    }

    // Core Processing
    subgraph cluster_1 {
        label="Core AI Pipeline";
        rank=same;
        style=filled;
        color="#E0E0E0";
        node [fillcolor="#E0FFE0"];
        NLP_Engine [label="NLP Engine\n(Intent & NER)"];
        RAG_Orchestrator [label="RAG System\n(LangChain)"];
        LLM [label="LLM\n(Llama 3/GPT-4)"];

        Voice_to_Text -> NLP_Engine;
        PWA_App -> NLP_Engine;
        NLP_Engine -> RAG_Orchestrator;
        RAG_Orchestrator -> LLM;
    }

    // Knowledge & Data
    subgraph cluster_2 {
        label="Knowledge & Data Sources";
        rank=same;
        style=filled;
        color="#E0E0E0";
        node [fillcolor="#FFFAEE"];
        Internal_KB [label="Internal Knowledge Base\n(Schemes, Pesticides)"];
        Weather_API [label="Weather API"];
        Farmer_DB [label="Farmer Profile DB"];
        
        RAG_Orchestrator -> Internal_KB [label="Retrieve"];
        RAG_Orchestrator -> Weather_API [label="Fetch"];
        RAG_Orchestrator -> Farmer_DB [label="Query"];
        
        Internal_KB -> Notifications_Trigger [style=dashed, label="New Scheme Alert"];
        Weather_API -> Notifications_Trigger [style=dashed, label="Cyclone/Weather Alert"];
    }

    // Output & Services
    subgraph cluster_3 {
        label="Output & Services";
        rank=same;
        style=filled;
        color="#E0E0E0";
        node [fillcolor="#DDEEFF"];
        Response_Engine [label="Response Engine\n(Formatting)"];
        Text_to_Speech [label="Text-to-Speech\n(Malayalam)"];
        Notifications_Trigger [label="Notifications Scheduler"];

        LLM -> Response_Engine;
        Response_Engine -> Text_to_Speech;
        Response_Engine -> PWA_App [label="Text Output"];
        Text_to_Speech -> Farmer [label="Voice Output"];
        Notifications_Trigger -> Farmer [label="Alerts"];
    }
}
"""

# Create a Source object from the dot code
src = graphviz.Source(dot_code)

# Define the filename for the output file
filename = 'farming_assistant_architecture'

# Use the render() method to save the file
# The `view=False` argument prevents the image from opening automatically after saving.
src.render(filename, view=False, cleanup=True)

# You can now see the saved file in the same directory as your Jupyter Notebook
print(f"Diagram saved as '{filename}.png'")

Diagram saved as 'farming_assistant_architecture.png'


In [2]:
response = {"messages":[{"role":"assistant","content":"reply.content"}]}

In [14]:
print(response["messages"][-1].get("content"))

reply.content
