<a href="https://colab.research.google.com/github/arunpiyush25/Build-For-Bharat/blob/main/Digital_Bharat_(advanced).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
# Colab cell 0 ‚Äî install libs
!pip install --quiet pandas requests pycountry python-dotenv

In [19]:
import os
import requests
import pandas as pd
import time
import threading
from tqdm import tqdm

In [21]:
# Colab Cell 1 ‚Äî Setup Environment & API Keys
# (Please add your API keys)
import os

# Set your API keys directly (you can later load them from .env)
os.environ["RAIN_API_KEY"] = "579b46****23bdd000001****91074f284*****69f5f0a1"
os.environ["CROP_API_KEY"] = "579b464db66*****3bdd0*****009981******4489cf8469f5f0a1"

# Define dataset resource URLs
RAIN_RESOURCE_URL = "https://api.data.gov.in/resource/8e0bd482-4aba-4d99-9cb9-ff124f6f1c2f"
CROP_RESOURCE_URL = "https://api.data.gov.in/resource/35be999b-0208-4354-b557-f6ca9a5355de"

In [22]:
# Colab Cell 2- [For Live Data Fetching] Fetching the data live from data.gov.in APIs

def fetch_full_dataset(resource_url, api_key, limit=10000, retry_delay=5, max_retries=3):
    """
    Fetches the complete dataset from data.gov.in API efficiently.
    - No record limit (fetches until no more data)
    - Handles retries on timeouts/errors
    - Large batch size for speed
    """
    all_records = []
    offset = 0
    total = 0

    print(f"Fetching all data from: {resource_url}")

    while True:
        params = {
            "api-key": api_key,
            "format": "json",
            "limit": limit,
            "offset": offset
        }

        # retry mechanism
        for attempt in range(max_retries):
            try:
                resp = requests.get(resource_url, params=params, timeout=60)
                resp.raise_for_status()
                js = resp.json()
                break
            except Exception as e:
                print(f"Error at offset {offset}: {e}")
                if attempt < max_retries - 1:
                    print(f"Retrying in {retry_delay}s...")
                    time.sleep(retry_delay)
                else:
                    print("Max retries reached. Stopping fetch.")
                    return pd.DataFrame(all_records)

        records = js.get("records", [])
        if not records:
            break

        all_records.extend(records)
        total += len(records)

        tqdm.write(f"Fetched {total} records...")

        if len(records) < limit:
            # No more data available
            break

        offset += limit

    print(f"\nFetch completed successfully with {len(all_records)} records.")
    return pd.DataFrame(all_records)


In [26]:
# Colab Cell 3 - [For Live Data Fetching] Automatically Fetches the data at 1 AM daily & updates it.

def fetch_dataset(resource_url, api_key, limit=10000):
    """Fetches dataset from data.gov.in resource."""
    params = {
        "api-key": api_key,
        "format": "json",
        "limit": limit
    }
    response = requests.get(resource_url, params=params)
    if response.status_code == 200:
        data = response.json()
        return data.get("records", [])
    else:
        print(f"Failed to fetch data from {resource_url}")
        return []

def save_datasets():
    """Fetch and save both rainfall and crop datasets locally."""
    print(f"\nFetching datasets at {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ...")

    rain_data = fetch_dataset(RAIN_RESOURCE_URL, os.environ["RAIN_API_KEY"])
    crop_data = fetch_dataset(CROP_RESOURCE_URL, os.environ["CROP_API_KEY"])

    # Save to JSON files
    with open("rainfall_data.json", "w") as f:
        import json
        json.dump(rain_data, f, indent=2)
    with open("crop_data.json", "w") as f:
        json.dump(crop_data, f, indent=2)

    print(f"Datasets updated successfully at {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}!")

def schedule_daily_update():
    """Keeps the notebook running and fetches new data every day at 1 AM."""
    while True:
        now = datetime.datetime.now()
        target_time = now.replace(hour=1, minute=0, second=0, microsecond=0)

        # If 1 AM has passed today, schedule for tomorrow
        if now >= target_time:
            target_time += datetime.timedelta(days=1)

        # Wait until target time
        wait_seconds = (target_time - now).total_seconds()
        print(f"Next update scheduled for {target_time.strftime('%Y-%m-%d %H:%M:%S')} ({wait_seconds/3600:.2f} hours from now)")
        time.sleep(wait_seconds)

        # Fetch new datasets
        save_datasets()

# --- To start scheduled refresh ---
# Uncomment the line below to start continuous refresh
# schedule_daily_update()

In [6]:
# Colab Cell 4 - APIs setup of the two datasets - "District-wise, season-wise crop production statistics from 1997" & "Sub Divisional Monthly Rainfall from 1901 to 2017"

rain_api = os.environ["RAIN_API_KEY"]
crop_api = os.environ["CROP_API_KEY"]

rain_df = fetch_full_dataset(RAIN_RESOURCE_URL, rain_api)
crop_df = fetch_full_dataset(CROP_RESOURCE_URL, crop_api)

# Save locally for future use in session
rain_df.to_csv("rainfall_full.csv", index=False)
crop_df.to_csv("crop_full.csv", index=False)

print("Files saved: rainfall_full.csv & crop_full.csv")


Fetching all data from: https://api.data.gov.in/resource/8e0bd482-4aba-4d99-9cb9-ff124f6f1c2f
Fetched 4188 records...

Fetch completed successfully with 4188 records.
Fetching all data from: https://api.data.gov.in/resource/35be999b-0208-4354-b557-f6ca9a5355de
Fetched 10000 records...
Fetched 20000 records...
Fetched 30000 records...
Fetched 40000 records...
Fetched 50000 records...
Fetched 60000 records...
Fetched 70000 records...
Fetched 80000 records...
Fetched 90000 records...
Fetched 100000 records...
Fetched 110000 records...
Fetched 120000 records...
Fetched 130000 records...
Fetched 140000 records...
Fetched 150000 records...
Fetched 160000 records...
Fetched 170000 records...
Fetched 180000 records...
Fetched 190000 records...
Fetched 200000 records...
Fetched 210000 records...
Fetched 220000 records...
Fetched 230000 records...
Fetched 240000 records...
Fetched 246091 records...

Fetch completed successfully with 246091 records.
Files saved: rainfall_full.csv & crop_full.csv


In [7]:
# Colab Cell 5 ‚Äî Preprocess & Combine

def preprocess_datasets(rain_df, crop_df):
    """Clean, map, and merge rainfall + crop datasets by state and year."""

    # ---Rainfall Cleaning ---
    rain_df["annual"] = pd.to_numeric(rain_df.get("annual"), errors="coerce")
    rain_df["year"] = pd.to_numeric(rain_df.get("year"), errors="coerce")

    subdivision_to_state = {
        "East Rajasthan": "Rajasthan", "West Rajasthan": "Rajasthan",
        "Gujarat Region": "Gujarat", "Saurashtra & Kutch": "Gujarat",
        "East Madhya Pradesh": "Madhya Pradesh", "West Madhya Pradesh": "Madhya Pradesh",
        "Coastal Karnataka": "Karnataka", "North Interior Karnataka": "Karnataka",
        "South Interior Karnataka": "Karnataka",
        "East Uttar Pradesh": "Uttar Pradesh", "West Uttar Pradesh": "Uttar Pradesh",
        "Bihar Plateau": "Bihar", "Sub-Himalayan West Bengal & Sikkim": "West Bengal",
        "Gangetic West Bengal": "West Bengal", "Vidarbha": "Maharashtra",
        "Madhya Maharashtra": "Maharashtra", "Marathwada": "Maharashtra",
        "Coastal Maharashtra": "Maharashtra", "Telangana": "Telangana",
        "Coastal Andhra Pradesh": "Andhra Pradesh", "Rayalaseema": "Andhra Pradesh",
        "Tamil Nadu": "Tamil Nadu", "Kerala": "Kerala", "Odisha": "Odisha",
        "Jharkhand": "Jharkhand", "Chhattisgarh": "Chhattisgarh",
        "Haryana Delhi Chandigarh": "Haryana", "Punjab": "Punjab",
        "Himachal Pradesh": "Himachal Pradesh", "Jammu & Kashmir": "Jammu & Kashmir",
        "Assam & Meghalaya": "Assam", "Nagaland Manipur Mizoram Tripura": "North East",
        "Arunachal Pradesh": "Arunachal Pradesh"
    }

    rain_df["state_name"] = rain_df["subdivision"].map(subdivision_to_state)
    rain_df = rain_df.dropna(subset=["state_name", "annual", "year"])
    rain_state_df = (
        rain_df.groupby(["state_name", "year"], as_index=False)["annual"]
        .mean()
        .rename(columns={"annual": "avg_annual_rainfall"})
    )

    # --- Crop Cleaning ---
    crop_df["area_"] = pd.to_numeric(crop_df.get("area_"), errors="coerce")
    crop_df["production_"] = pd.to_numeric(crop_df.get("production_"), errors="coerce")
    crop_df["crop_year"] = pd.to_numeric(crop_df.get("crop_year"), errors="coerce")
    crop_df = crop_df.dropna(subset=["state_name", "production_", "crop_year"])
    crop_state_df = (
        crop_df.groupby(["state_name", "crop_year"], as_index=False)["production_"]
        .sum()
        .rename(columns={"crop_year": "year", "production_": "total_production"})
    )

    # --- Merge Datasets on (state, year) ---
    merged_df = pd.merge(
        rain_state_df, crop_state_df,
        on=["state_name", "year"], how="inner"
    )

    print(f"Merged dataset: {merged_df.shape[0]} rows")
    return merged_df

# Example usage:
merged_df = preprocess_datasets(rain_df, crop_df)
merged_df.head()


Merged dataset: 277 rows


Unnamed: 0,state_name,year,avg_annual_rainfall,total_production
0,Andhra Pradesh,1997,972.1,21093500.0
1,Andhra Pradesh,1998,1310.8,24997600.0
2,Andhra Pradesh,1999,825.1,25276770.0
3,Andhra Pradesh,2000,992.3,1109120000.0
4,Andhra Pradesh,2001,1009.3,1139997000.0


In [8]:
!pip install --quiet google-generativeai

In [9]:
os.environ["GEMINI_API_KEY"] = "AIzaSyAMfLtNcN9****0vN6zIVmVPk21LOo"
# (Please add your API keys)

In [10]:
# Colab Cell 6 - Parsing Intent from the entered question for computaional calculations by Pandas

import signal
import google.generativeai as genai
import json, re

class TimeoutException(Exception):
    pass

def timeout_handler(signum, frame):
    raise TimeoutException()

signal.signal(signal.SIGALRM, timeout_handler)


def query_intent_gemini(question: str):
    model = genai.GenerativeModel("gemini-2.5-flash-lite")

    prompt = f"""
You are an expert intent parser for agricultural analytics.
Extract structured info from the question below.

Question: "{question}"

Rules:
- Output ONLY valid JSON.
- Keys: intent, states, crops, years, metrics.
- States must be explicit Indian states.
- If missing, set [] for crops/metrics/states and "last_5" for years.
Example:
{{
  "intent": "compare_rainfall_and_crop",
  "states": ["Punjab", "Haryana"],
  "crops": [],
  "years": "last_5",
  "metrics": ["average_annual_rainfall", "top_3_crops"]
}}
"""

    retries = 2
    for attempt in range(retries):
        try:
            signal.alarm(40)  # 40 sec timeout
            response = model.generate_content(prompt)
            signal.alarm(0)
            raw = response.text.strip()

            cleaned = (
                raw.replace("‚Äú", '"')
                .replace("‚Äù", '"')
                .replace("'", '"')
                .replace("```json", "")
                .replace("```", "")
            )
            match = re.search(r"\{.*\}", cleaned, re.DOTALL)
            text = match.group(0) if match else cleaned
            data = json.loads(text)
            for k in ["intent", "states", "crops", "years", "metrics"]:
                if k not in data:
                    data[k] = [] if k in ["states", "crops", "metrics"] else "last_5" if k == "years" else "unknown"
            return data
        except TimeoutException:
            print(f"Gemini timeout (attempt {attempt+1}/{retries})")
        except Exception as e:
            print(f"Gemini parse error: {e}")
        finally:
            signal.alarm(0)

    return {"intent": "unknown", "states": [], "crops": [], "years": "last_5", "metrics": []}


In [11]:
q1 = "Compare the average annual rainfall in uttar pradesh and Haryana for the last 5 years. Also list top 3 crops in each state."
q2 = "Which district in Maharashtra has the highest production of Rice in 2022 and compare it with the lowest in Karnataka?"

print(query_intent_gemini(q1))
print(query_intent_gemini(q2))


{'intent': 'compare_rainfall_and_crop', 'states': ['Uttar Pradesh', 'Haryana'], 'crops': [], 'years': 'last_5', 'metrics': ['average_annual_rainfall', 'top_3_crops']}
{'intent': 'compare_production', 'states': ['Maharashtra', 'Karnataka'], 'crops': ['Rice'], 'years': ['2022'], 'metrics': ['highest_production', 'lowest_production']}


In [12]:
# Colab Cell 7 - Computation Calculations done by Pandas

from datetime import datetime

def execute_query(intent_data, merged_df, rain_df, crop_df):
    """
    Executes structured query parsed by LLM using Pandas.
    Supports rainfall & crop analysis for multiple states and time windows.
    """
    states = intent_data.get("states", [])
    crops = intent_data.get("crops", [])
    metrics = intent_data.get("metrics", [])
    years = intent_data.get("years", "last_5")

    max_year = merged_df["year"].max()
    if isinstance(years, str) and "last_" in years:
        n = int(years.split("_")[1])
        year_range = list(range(max_year - n + 1, max_year + 1))
    else:
        year_range = [int(years)] if str(years).isdigit() else merged_df["year"].unique()

    results = {}

    if "average_annual_rainfall" in metrics:
        rain_subset = merged_df[
            merged_df["state_name"].isin(states) & merged_df["year"].isin(year_range)
        ]
        rain_summary = (
            rain_subset.groupby("state_name")["avg_annual_rainfall"]
            .mean()
            .reset_index()
            .rename(columns={"avg_annual_rainfall": "avg_rainfall_last_period"})
        )
        results["rainfall_summary"] = rain_summary.to_dict(orient="records")

    if "top_3_crops" in metrics:
        crop_subset = crop_df[
            crop_df["state_name"].isin(states) & crop_df["crop_year"].isin(year_range)
        ]
        top_crops = (
            crop_subset.groupby(["state_name", "crop"])["production_"]
            .sum()
            .reset_index()
            .sort_values(["state_name", "production_"], ascending=[True, False])
        )
        top_crops = top_crops.groupby("state_name").head(3)
        results["top_crops"] = top_crops.to_dict(orient="records")
        results["years_used"] = year_range
    return results


In [24]:
# TESTING

intent_data = {
    "intent": "compare_rainfall_and_crop",
    "states": ["Punjab", "Haryana"],
    "crops": [],
    "years": "last_5",
    "metrics": ["average_annual_rainfall", "top_3_crops"]
}

query_result = execute_query(intent_data, merged_df, rain_df, crop_df)
query_result


{'rainfall_summary': [{'state_name': 'Punjab',
   'avg_rainfall_last_period': 523.92}],
 'top_crops': [{'state_name': 'Haryana',
   'crop': 'Wheat',
   'production_': 35814000.0},
  {'state_name': 'Haryana', 'crop': 'Rice', 'production_': 10980300.0},
  {'state_name': 'Haryana', 'crop': 'Sugarcane', 'production_': 8799000.0},
  {'state_name': 'Punjab', 'crop': 'Wheat', 'production_': 83738000.0},
  {'state_name': 'Punjab', 'crop': 'Rice', 'production_': 55143000.0},
  {'state_name': 'Punjab', 'crop': 'Sugarcane', 'production_': 25199000.0}],
 'years_used': [2010, 2011, 2012, 2013, 2014]}

In [25]:
# Colab Cell 8 - Calling the LLM for getting final natural human readable answer

def generate_human_readable_answer(question: str, intent_data: dict, query_result: dict):
    model = genai.GenerativeModel("gemini-2.5-flash-lite")

    # Extract years used for calculations (if available)
    years_used = query_result.get("years_used", [])
    years_text = ""
    if years_used:
        if len(years_used) == 1:
            years_text = f"The data used is from the year {years_used[0]}."
        else:
            years_text = f"The analysis considers data from {years_used[0]} to {years_used[-1]}."

    # Construct improved prompt
    prompt = f"""
You are an AI analyst that converts structured data into a natural, human-like summary.

Original user question:
"{question}"

Parsed intent (from LLM):
{json.dumps(intent_data, indent=2)}

Structured data result:
{json.dumps(query_result, indent=2)}

Additional context:
{years_text}

Now write a clear, factual, and concise answer to the question.
Guidelines:
- Use natural, analytical tone.
- Summarize rainfall and crop data insightfully if present.
- Mention the years considered explicitly if relevant.
- Be honest if any part of the data is unavailable or incomplete.
- Use bullet points or short sentences for clarity.
- End with:
  Sources: IMD Rainfall Dataset, Government Crop Statistics (2010-2014)
"""

    retries = 2
    for attempt in range(retries):
        try:
            signal.alarm(35)
            response = model.generate_content(prompt)
            signal.alarm(0)
            if response and response.text:
                return response.text.strip()
        except TimeoutException:
            print(f"‚è±Ô∏è Gemini call timed out (attempt {attempt+1}/{retries})")
        except Exception as e:
            print(f"Gemini error: {e}")
        finally:
            signal.alarm(0)
        time.sleep(2 * (attempt + 1))

    return (
        "Unable to generate a detailed summary right now. Please try again later.\n\n"
        "Sources: IMD Rainfall Dataset, Government Crop Statistics"
    )


In [15]:
# TESTING

question = "Compare the average annual rainfall in Punjab and Haryana for the last 5 years. Also list top 3 crops in each state."

intent_data = query_intent_gemini(question)
query_result = execute_query(intent_data, merged_df, rain_df, crop_df)
final_answer = generate_human_readable_answer(question, intent_data, query_result)

print("Query Intent:", intent_data)
print("Query Result:", query_result)
print("\nFinal Human-Readable Answer:\n")
print(final_answer)


Query Intent: {'intent': 'compare_rainfall_and_crop', 'states': ['Punjab', 'Haryana'], 'crops': [], 'years': 'last_5', 'metrics': ['average_annual_rainfall', 'top_3_crops']}
Query Result: {'rainfall_summary': [{'state_name': 'Punjab', 'avg_rainfall_last_period': 523.92}], 'top_crops': [{'state_name': 'Haryana', 'crop': 'Wheat', 'production_': 35814000.0}, {'state_name': 'Haryana', 'crop': 'Rice', 'production_': 10980300.0}, {'state_name': 'Haryana', 'crop': 'Sugarcane', 'production_': 8799000.0}, {'state_name': 'Punjab', 'crop': 'Wheat', 'production_': 83738000.0}, {'state_name': 'Punjab', 'crop': 'Rice', 'production_': 55143000.0}, {'state_name': 'Punjab', 'crop': 'Sugarcane', 'production_': 25199000.0}], 'years_used': [2010, 2011, 2012, 2013, 2014]}

Final Human-Readable Answer:

Here's a comparison of average annual rainfall and top crops in Punjab and Haryana for the period of 2010 to 2014:

**Average Annual Rainfall:**

*   Punjab recorded an average annual rainfall of 523.92 mm d

In [16]:
# Find common years and states between rainfall and crop datasets
common_years = sorted(set(rain_df["year"]).intersection(set(crop_df["crop_year"])))
common_states = sorted(set(rain_df["state_name"]).intersection(set(crop_df["state_name"])))

print("Common Years:", common_years)
print("Common States:", common_states)


Common Years: [1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015]
Common States: ['Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Chhattisgarh', 'Gujarat', 'Himachal Pradesh', 'Jharkhand', 'Karnataka', 'Kerala', 'Madhya Pradesh', 'Maharashtra', 'Punjab', 'Rajasthan', 'Tamil Nadu', 'Telangana', 'Uttar Pradesh', 'West Bengal']


In [17]:
# Colab Cell 9 - Setting Frontend with Gradio

!pip install gradio --quiet
import gradio as gr

In [18]:
# Colab Cell 10 - Setting the UI according to Digital Bharat project

def full_pipeline(question):
    # Step 1: intent extraction
    intent_data = query_intent_gemini(question)
    # Step 2: query execution
    query_result = execute_query(intent_data, merged_df, rain_df, crop_df)
    # Step 3: natural answer with sources
    final_answer = generate_human_readable_answer(question, intent_data, query_result)

    return (
        json.dumps(intent_data, indent=2),
        json.dumps(query_result, indent=2),
        final_answer
    )

# ------------------------- UI DESIGN -------------------------
theme = gr.themes.Soft(
    primary_hue="green",
    secondary_hue="teal",
    neutral_hue="gray",
    radius_size="lg",
    text_size="md"
)

with gr.Blocks(theme=theme, title="üåæ Bharat Agriculture cum Climate Assistant") as demo:
    gr.Markdown(
        """
        <div style="text-align:center; padding:10px 0">
            <h1 style="color:#1a5e1a;">üåæ Bharat Agriculture cum Climate Assistant</h1>
            <p style="font-size:17px;">Ask smart questions about rainfall and crop statistics across Indian states.<br>
            Powered by <b>Google's Gemini LLM</b> and government datasets by "Ministry of Agriculture & Farmers Welfare" and "India Meteorological Department (IMD)"</p>
        </div>
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            question = gr.Textbox(
                label="üí¨ Ask your question",
                placeholder="e.g. Compare rainfall in Punjab and Rajasthan over the last 5 years",
                lines=2
            )
            submit_btn = gr.Button("üîç Analyze Question", variant="primary")

    with gr.Row():
        with gr.Column(scale=1):
            intent_box = gr.Code(
                label="üß† Parsed Intent Of The question(via LLM)",
                language="json"
            )
        with gr.Column(scale=1):
            result_box = gr.Code(
                label="üìä Computation Results of the Query (via Pandas Library)",
                language="json"
            )

    gr.Markdown("### ü™¥ Natural language Answer Citing sourced datset from data.gov.in")

    answer_box = gr.Markdown(
        value="> Your AI-generated answer will appear here...",
        elem_id="answer_box"
    )

    submit_btn.click(
        full_pipeline,
        inputs=[question],
        outputs=[intent_box, result_box, answer_box]
    )

    gr.Markdown(
        """
        <hr>
        <div style="text-align:center; color:gray; font-size:14px;">
        Data sources: Data.gov.in [IMD Rainfall Dataset ¬∑ Government Crop Statistics]
         |  ¬© 2025 Bharat Digital Project
        </div>
        """
    )

demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ff69a397f906c1470b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


