In [None]:
import requests
import pandas as pd
from datetime import datetime, timedelta
from io import StringIO # Used to read string data as if it were a file

# --- Configuration ---
FIRMS_API_KEY = "2eaecfb3056b7b7751771485eb481c51" # <<< REPLACE THIS WITH YOUR ACTUAL FIRMS API KEY!

# Bounding box for Kerala (approximate: min_lon,min_lat,max_lon,max_lat)
KERALA_BOUNDING_BOX_STR = "74.5,8.0,77.5,12.5" 

# Source: 'VIIRS_SNPP_NRT' is generally good for near real-time, smaller fires.
FIRMS_SOURCE = "VIIRS_SNPP_NRT" 

def fetch_firms_data(api_key, source, bbox_str, date_range_days=7):
    """
    Fetches FIRMS active fire data for a given bounding box and date range.
    bbox_str: Bounding box as a string "min_lon,min_lat,max_lon,max_lat"
    date_range_days: Number of days back from today to fetch data.
    """
    end_date = datetime.now()
    start_date = end_date - timedelta(days=date_range_days)

    # FIRMS API uses YYYY-MM-DD format for dates
    start_date_str = start_date.strftime("%Y-%m-%d")
    end_date_str = end_date.strftime("%Y-%m-%d")

    # Construct the FIRMS API URL. This specific format is for the 'archive' endpoint,
    # which allows fetching data for a bounding box and date range in CSV format.
    url = (f"https://firms.modaps.eosdis.nasa.gov/api/area/csv/{api_key}/{source}/"
           f"{bbox_str}/{start_date_str}/{end_date_str}")

    print(f"Attempting to fetch FIRMS data from: {url}")
    response = requests.get(url)

    response.raise_for_status() # This will raise an HTTPError for 4xx or 5xx responses

    data = response.json() 

    return data

# --- Test the function and perform basic processing ---
try:
    firms_data = fetch_firms_data(FIRMS_API_KEY, FIRMS_SOURCE, KERALA_BOUNDING_BOX_STR, date_range_days=7)

    print(f"\nSuccessfully fetched {len(firms_data)} FIRMS hotspots.")
    print("First 5 rows of raw FIRMS data:")
    print(firms_data.head()) 

    required_firms_cols = ['latitude', 'longitude', 'acq_date', 'acq_time', 'frp', 'confidence']
    if all(col in firms_data.columns for col in required_firms_cols):
        print("\nFIRMS data contains all required columns for AI/ML.")

        firms_data['timestamp'] = pd.to_datetime(
            firms_data['acq_date'] + ' ' + firms_data['acq_time'].astype(str).str.zfill(4), 
            format='%Y-%m-%d %H%M'
        )

        firms_processed_df = firms_data[[
            'timestamp', 'latitude', 'longitude', 'frp', 'confidence', 'brightness', 'version'
        ]].copy()

        print("\nFirst 5 rows of processed FIRMS data:")
        print(firms_processed_df.head())

        firms_processed_df.to_csv("firms_data_last_7_days_processed.csv", index=False)
        print("\nProcessed FIRMS data saved to firms_data_last_7_days_processed.csv")

    else:
        print("\nWARNING: FIRMS data missing some expected columns. Please check FIRMS API documentation and column names.")
        print(f"Available columns: {firms_data.columns.tolist()}")

except requests.exceptions.HTTPError as e:
    print(f"\nHTTP Error fetching FIRMS data: {e}")
    print("Please check:")
    print("  1. Your FIRMS_API_KEY is correct and activated.")
    print("  2. The KERALA_BOUNDING_BOX_STR format is correct.")
    print("  3. The FIRMS_SOURCE ('VIIRS_SNPP_NRT') is valid.")
    print("  4. The URL structure used in fetch_firms_data matches the latest FIRMS API documentation.")
    print("  5. You have active internet connection.")
except Exception as e:
    print(f"\nAn unexpected error occurred during FIRMS data fetching: {e}")
    print("Ensure you have 'requests' and 'pandas' installed in your environment.")

In [None]:
import requests
import pandas as pd
from datetime import datetime, timedelta
from io import StringIO

# --- Configuration (ensure your API key is here) ---
FIRMS_API_KEY = "2eaecfb3056b7b7751771485eb481c51" # <<< Your current FIRMS API Key
KERALA_BOUNDING_BOX_STR = "74.5,8.0,77.5,12.5" 
FIRMS_SOURCE = "VIIRS_SNPP_NRT" 

# --- Existing fetch_firms_data function (no change here yet) ---
def fetch_firms_data(api_key, source, bbox_str, date_range_days=1):
    end_date = datetime.now()
    start_date = end_date - timedelta(days=date_range_days)
    start_date_str = start_date.strftime("%Y-%m-%d")
    end_date_str = end_date.strftime("%Y-%m-%d")
    url = (f"https://firms.modaps.eosdis.nasa.gov/api/area/csv/{api_key}/{source}/"
           f"{bbox_str}/{start_date_str}/{end_date_str}")
    print(f"Attempting to fetch FIRMS data from: {url}")
    response = requests.get(url)
    response.raise_for_status()
    data = pd.read_csv(StringIO(response.text))
    return data

# --- NEW: Function to check API Key Status ---
def check_firms_api_key_status(api_key):
    """Checks the status of the FIRMS API key."""
    url = f"https://firms.modaps.eosdis.nasa.gov/api/map_key/status/csv/{api_key}"
    print(f"\nChecking API key status from: {url}")
    try:
        response = requests.get(url)
        response.raise_for_status()
        status_data = pd.read_csv(StringIO(response.text))
        print("API Key Status:")
        print(status_data)
        return status_data
    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error checking API key status: {e}")
        print("This often means the API key itself is invalid or not yet active.")
        return None
    except Exception as e:
        print(f"An unexpected error occurred while checking API key status: {e}")
        return None

# --- NEW: Function to fetch list of supported countries (a very basic call) ---
def fetch_firms_countries(api_key):
    """Fetches a list of countries supported by FIRMS API."""
    url = f"https://firms.modaps.eosdis.nasa.gov/api/countries/csv/{api_key}"
    print(f"\nAttempting to fetch supported countries from: {url}")
    try:
        response = requests.get(url)
        response.raise_for_status()
        countries_data = pd.read_csv(StringIO(response.text))
        print("Supported Countries (first 5):")
        print(countries_data.head())
        return countries_data
    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error fetching countries: {e}")
        print("This indicates a fundamental issue with the API key or basic API access.")
        return None
    except Exception as e:
        print(f"An unexpected error occurred while fetching countries: {e}")
        return None

# --- Execute the new diagnostic functions ---
print("--- Starting FIRMS API Diagnostics ---")
key_status = check_firms_api_key_status(FIRMS_API_KEY)
if key_status is not None and not key_status.empty:
    print(f"API Key Status Check: Success. Transaction Limit: {key_status['transaction_limit'].iloc[0]}, Used: {key_status['used_transactions'].iloc[0]}")
else:
    print("API Key Status Check: Failed or returned no data.")

countries = fetch_firms_countries(FIRMS_API_KEY)
if countries is not None and not countries.empty:
    print("Basic API call (fetch countries) successful.")
else:
    print("Basic API call (fetch countries) failed or returned no data.")

print("\n--- Attempting original FIRMS data fetch again (after diagnostics) ---")
# --- Test the original fetch_firms_data function again ---
try:
    firms_data = fetch_firms_data(FIRMS_API_KEY, FIRMS_SOURCE, KERALA_BOUNDING_BOX_STR, date_range_days=7) # Or date_range_days=1 if you prefer
    
    print(f"\nSuccessfully fetched {len(firms_data)} FIRMS hotspots.")
    print("First 5 rows of raw FIRMS data:")
    print(firms_data.head()) 

    required_firms_cols = ['latitude', 'longitude', 'acq_date', 'acq_time', 'frp', 'confidence']
    if all(col in firms_data.columns for col in required_firms_cols):
        print("\nFIRMS data contains all required columns for AI/ML.")
        
        firms_data['timestamp'] = pd.to_datetime(
            firms_data['acq_date'] + ' ' + firms_data['acq_time'].astype(str).str.zfill(4), 
            format='%Y-%m-%d %H%M'
        )
        
        firms_processed_df = firms_data[[
            'timestamp', 'latitude', 'longitude', 'frp', 'confidence', 'brightness', 'version'
        ]].copy()
        
        print("\nFirst 5 rows of processed FIRMS data:")
        print(firms_processed_df.head())

        firms_processed_df.to_csv("firms_data_last_7_days_processed.csv", index=False)
        print("\nProcessed FIRMS data saved to firms_data_last_7_days_processed.csv")

    else:
        print("\nWARNING: FIRMS data missing some expected columns. Please check FIRMS API documentation and column names.")
        print(f"Available columns: {firms_data.columns.tolist()}")

except requests.exceptions.HTTPError as e:
    print(f"\nHTTP Error fetching FIRMS data: {e}")
    print("Please check:")
    print("  1. Your FIRMS_API_KEY is correct and activated.")
    print("  2. The KERALA_BOUNDING_BOX_STR format is correct.")
    print("  3. The FIRMS_SOURCE ('VIIRS_SNPP_NRT') is valid.")
    print("  4. The URL structure used in fetch_firms_data matches the latest FIRMS API documentation.")
    print("  5. You have active internet connection.")
except Exception as e:
    print(f"\nAn unexpected error occurred during FIRMS data fetching: {e}")
    print("Ensure you have 'requests' and 'pandas' installed in your environment.")

In [None]:
import requests
import pandas as pd
from datetime import datetime, timedelta
from io import StringIO
import json # Import the json library for parsing JSON responses

# --- Configuration ---
# IMPORTANT: Replace "YOUR_FIRMS_API_KEY_HERE" with your actual, clean FIRMS API Key.
# Ensure no leading/trailing spaces when pasting!
FIRMS_API_KEY = "2eaecfb3056b7b7751771485eb481c51" 

# Bounding box for Kerala (approximate: min_lon,min_lat,max_lon,max_lat)
KERALA_BOUNDING_BOX_STR = "74.5,8.0,77.5,12.5" 

# Source: 'VIIRS_SNPP_NRT' is generally good for near real-time, smaller fires.
# For troubleshooting, we'll try 'MODIS_NRT' as well.
FIRMS_SOURCE_VIIRS = "VIIRS_SNPP_NRT" 
FIRMS_SOURCE_MODIS = "MODIS_NRT" 

# --- Function to fetch FIRMS data (flexible for date range or fixed dates) ---
def fetch_firms_data(api_key, source, bbox_str, start_date_str, end_date_str):
    """
    Fetches FIRMS active fire data for a given bounding box and fixed date range.
    start_date_str, end_date_str: Dates in 'YYYY-MM-DD' format.
    """
    url = (f"https://firms.modaps.eosdis.nasa.gov/api/area/csv/{api_key}/{source}/"
           f"{bbox_str}/{start_date_str}/{end_date_str}")
    
    print(f"Attempting to fetch FIRMS data from: {url}")
    response = requests.get(url)
    
    # This will raise an HTTPError for 4xx or 5xx responses
    response.raise_for_status() 

    # FIRMS returns data as a CSV string. Use StringIO to read it directly into a Pandas DataFrame.
    data = pd.read_csv(StringIO(response.text))
    
    return data

# --- Function to check API Key Status (now correctly parses JSON) ---
def check_firms_api_key_status(api_key):
    """Checks the status of the FIRMS API key."""
    # Note: This URL returns JSON, despite '/csv/' in path
    url = f"https://firms.modaps.eosdis.nasa.gov/api/map_key/status/csv/{api_key}" 
    print(f"\nChecking API key status from: {url}")
    try:
        response = requests.get(url)
        response.raise_for_status()
        
        # Correctly parse the JSON response
        status_data = response.json() 

        print("API Key Status (JSON):")
        print(json.dumps(status_data, indent=2)) # Pretty print JSON
        return status_data 
    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error checking API key status: {e}. Response content: {response.text}")
        print("This often means the API key itself is invalid or not yet active.")
        return None
    except json.JSONDecodeError as e:
        print(f"JSON Decode Error checking API key status: {e}. Response content: {response.text[:200]}...")
        print("Received non-JSON response when expecting JSON. This could indicate an underlying API issue.")
        return None
    except Exception as e:
        print(f"An unexpected error occurred while checking API key status: {e}")
        return None

# --- Function to fetch list of supported countries (still expects CSV) ---
def fetch_firms_countries(api_key):
    """Fetches a list of countries supported by FIRMS API."""
    url = f"https://firms.modaps.eosdis.nasa.gov/api/countries/csv/{api_key}"
    print(f"\nAttempting to fetch supported countries from: {url}")
    try:
        response = requests.get(url)
        response.raise_for_status()
        countries_data = pd.read_csv(StringIO(response.text))
        print("Supported Countries (first 5):")
        print(countries_data.head())
        return countries_data
    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error fetching countries: {e}. Response content: {response.text}")
        print("This indicates a fundamental issue with the API key or basic API access, or unexpected response format.")
        return None
    except pd.errors.ParserError as e:
        print(f"CSV Parsing Error fetching countries: {e}. Response content: {response.text[:200]}...")
        print("Received non-CSV or malformed CSV response when expecting CSV. This could indicate an underlying API issue.")
        return None
    except Exception as e:
        print(f"An unexpected error occurred while fetching countries: {e}")
        return None

# --- Main Execution Block ---
print("--- Starting FIRMS API Diagnostics ---")

# 1. Check API Key Status (should now print clean JSON)
key_status = check_firms_api_key_status(FIRMS_API_KEY)
if key_status is not None:
    print(f"API Key Status Check: Success. Transaction Limit: {key_status.get('transaction_limit')}, Used: {key_status.get('current_transactions')}")
else:
    print("API Key Status Check: Failed or returned no data.")

# 2. Try fetching supported countries (might still have parsing issues if it's an error page)
countries = fetch_firms_countries(FIRMS_API_KEY)
if countries is not None and not countries.empty:
    print("Basic API call (fetch countries) successful.")
else:
    print("Basic API call (fetch countries) failed or returned no data. (Check error message above for details)")

print("\n--- Attempting main FIRMS data fetch (area/csv) with troubleshooting steps ---")

# Define a very small, specific, older date range for testing
# This reduces the chance of issues with very recent data or large volumes
fixed_start_date = "2024-07-01" # A date in the past
fixed_end_date = "2024-07-02"   # Just one day after start date

# Use a very small bounding box around a known location in Kerala for minimal data
# Example: A tiny box around Thrissur city center
TEST_BOUNDING_BOX_STR = "76.21,10.52,76.22,10.53" # min_lon,min_lat,max_lon,max_lat

# --- Test with MODIS_NRT source first ---
print(f"\n--- Testing with FIRMS_SOURCE: {FIRMS_SOURCE_MODIS} ---")
try:
    firms_data_modis = fetch_firms_data(FIRMS_API_KEY, FIRMS_SOURCE_MODIS, 
                                        TEST_BOUNDING_BOX_STR, fixed_start_date, fixed_end_date)
    
    print(f"\nSuccessfully fetched {len(firms_data_modis)} FIRMS hotspots with {FIRMS_SOURCE_MODIS}.")
    print("First 5 rows of raw FIRMS data (MODIS):")
    print(firms_data_modis.head()) 

    # You can add the processing steps for firms_data_modis here if it succeeds
    # For now, just focus on successful fetch.

except requests.exceptions.HTTPError as e:
    print(f"\nHTTP Error fetching FIRMS data with {FIRMS_SOURCE_MODIS}: {e}. Response content: {e.response.text}")
    print("This indicates a server-side issue or problem with the request parameters for this source/date/area.")
except Exception as e:
    print(f"\nAn unexpected error occurred during FIRMS data fetching with {FIRMS_SOURCE_MODIS}: {e}")

# --- Test with VIIRS_SNPP_NRT source (original) ---
print(f"\n--- Testing with FIRMS_SOURCE: {FIRMS_SOURCE_VIIRS} ---")
try:
    firms_data_viirs = fetch_firms_data(FIRMS_API_KEY, FIRMS_SOURCE_VIIRS, 
                                        TEST_BOUNDING_BOX_STR, fixed_start_date, fixed_end_date)
    
    print(f"\nSuccessfully fetched {len(firms_data_viirs)} FIRMS hotspots with {FIRMS_SOURCE_VIIRS}.")
    print("First 5 rows of raw FIRMS data (VIIRS):")
    print(firms_data_viirs.head()) 

    # You can add the processing steps for firms_data_viirs here if it succeeds

except requests.exceptions.HTTPError as e:
    print(f"\nHTTP Error fetching FIRMS data with {FIRMS_SOURCE_VIIRS}: {e}. Response content: {e.response.text}")
    print("This indicates a server-side issue or problem with the request parameters for this source/date/area.")
except Exception as e:
    print(f"\nAn unexpected error occurred during FIRMS data fetching with {FIRMS_SOURCE_VIIRS}: {e}")

In [None]:
import requests
import pandas as pd
from datetime import datetime, timedelta
from io import StringIO
import json # Import the json library for parsing JSON responses

# --- Configuration ---
# IMPORTANT: Replace "YOUR_NEW_FIRMS_API_KEY_HERE" with your actual, clean FIRMS API Key.
# Ensure no leading/trailing spaces when pasting!
FIRMS_API_KEY = "047fda95f0fc7f29ed6963f450d08962" # <<< REPLACE THIS WITH YOUR BRAND NEW API KEY!

# Bounding box for Kerala (approximate: min_lon,min_lat,max_lon,max_lat)
KERALA_BOUNDING_BOX_STR = "74.5,8.0,77.5,12.5" 

# Source: 'VIIRS_SNPP_NRT' is generally good for near real-time, smaller fires.
# For troubleshooting, we'll try 'MODIS_NRT' as well.
FIRMS_SOURCE_VIIRS = "VIIRS_SNPP_NRT" 
FIRMS_SOURCE_MODIS = "MODIS_NRT" 

# --- Function to fetch FIRMS data (flexible for date range or fixed dates) ---
def fetch_firms_data(api_key, source, bbox_str, start_date_str, end_date_str):
    """
    Fetches FIRMS active fire data for a given bounding box and fixed date range.
    start_date_str, end_date_str: Dates in 'YYYY-MM-DD' format.
    """
    url = (f"https://firms.modaps.eosdis.nasa.gov/api/area/csv/{api_key}/{source}/"
           f"{bbox_str}/{start_date_str}/{end_date_str}")
    
    print(f"Attempting to fetch FIRMS data from: {url}")
    response = requests.get(url)
    
    # This will raise an HTTPError for 4xx or 5xx responses
    response.raise_for_status() 

    # FIRMS returns data as a CSV string. Use StringIO to read it directly into a Pandas DataFrame.
    data = pd.read_csv(StringIO(response.text))
    
    return data

# --- Function to check API Key Status (now correctly parses JSON and handles errors) ---
def check_firms_api_key_status(api_key):
    """Checks the status of the FIRMS API key."""
    # Note: This URL is documented to return JSON, despite '/csv/' in path
    url = f"https://firms.modaps.eosdis.nasa.gov/api/map_key/status/csv/{api_key}" 
    print(f"\nChecking API key status from: {url}")
    try:
        response = requests.get(url)
        response.raise_for_status()
        
        # Attempt to parse as JSON
        status_data = response.json() 

        print("API Key Status (JSON):")
        print(json.dumps(status_data, indent=2)) # Pretty print JSON
        return status_data 
    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error checking API key status: {e}")
        print(f"Response content (first 500 chars): {response.text[:500]}...")
        print("This often means the API key itself is invalid or not yet active, or a server issue.")
        return None
    except json.JSONDecodeError as e:
        print(f"JSON Decode Error checking API key status: {e}")
        print(f"Response content (first 500 chars): {response.text[:500]}...")
        print("Received non-JSON response when expecting JSON. This could indicate an underlying API issue or invalid key.")
        return None
    except Exception as e:
        print(f"An unexpected error occurred while checking API key status: {e}")
        return None

# --- Function to fetch list of supported countries (expects CSV, handles errors) ---
def fetch_firms_countries(api_key):
    """Fetches a list of countries supported by FIRMS API."""
    url = f"https://firms.modaps.eosdis.nasa.gov/api/countries/csv/{api_key}"
    print(f"\nAttempting to fetch supported countries from: {url}")
    try:
        response = requests.get(url)
        response.raise_for_status()
        countries_data = pd.read_csv(StringIO(response.text))
        print("Supported Countries (first 5):")
        print(countries_data.head())
        return countries_data
    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error fetching countries: {e}")
        print(f"Response content (first 500 chars): {response.text[:500]}...")
        print("This indicates a fundamental issue with the API key or basic API access, or unexpected response format.")
        return None
    except pd.errors.ParserError as e:
        print(f"CSV Parsing Error fetching countries: {e}")
        print(f"Response content (first 500 chars): {response.text[:500]}...")
        print("Received non-CSV or malformed CSV response when expecting CSV. This could indicate an underlying API issue or invalid key.")
        return None
    except Exception as e:
        print(f"An unexpected error occurred while fetching countries: {e}")
        return None

# --- Main Execution Block ---
print("--- Starting FIRMS API Diagnostics ---")

# 1. Check API Key Status (should now print clean JSON if key is good)
key_status = check_firms_api_key_status(FIRMS_API_KEY)
if key_status is not None:
    print(f"API Key Status Check: Success. Transaction Limit: {key_status.get('transaction_limit')}, Used: {key_status.get('current_transactions')}")
else:
    print("API Key Status Check: Failed or returned no data.")

# 2. Try fetching supported countries
countries = fetch_firms_countries(FIRMS_API_KEY)
if countries is not None and not countries.empty:
    print("Basic API call (fetch countries) successful.")
else:
    print("Basic API call (fetch countries) failed or returned no data. (Check error message above for details)")

print("\n--- Attempting main FIRMS data fetch (area/csv) with troubleshooting steps ---")

# Define a very small, specific, older date range for testing
# This reduces the chance of issues with very recent data or large volumes
fixed_start_date = "2024-07-01" # A date in the past
fixed_end_date = "2024-07-02"   # Just one day after start date

# Use a very small bounding box around a known location in Kerala for minimal data
# Example: A tiny box around Thrissur city center
TEST_BOUNDING_BOX_STR = "76.21,10.52,76.22,10.53" # min_lon,min_lat,max_lon,max_lat

# --- Test with MODIS_NRT source first ---
print(f"\n--- Testing with FIRMS_SOURCE: {FIRMS_SOURCE_MODIS} ---")
try:
    firms_data_modis = fetch_firms_data(FIRMS_API_KEY, FIRMS_SOURCE_MODIS, 
                                        TEST_BOUNDING_BOX_STR, fixed_start_date, fixed_end_date)
    
    print(f"\nSuccessfully fetched {len(firms_data_modis)} FIRMS hotspots with {FIRMS_SOURCE_MODIS}.")
    print("First 5 rows of raw FIRMS data (MODIS):")
    print(firms_data_modis.head()) 

    # You can add the processing steps for firms_data_modis here if it succeeds
    # For now, just focus on successful fetch.

except requests.exceptions.HTTPError as e:
    print(f"\nHTTP Error fetching FIRMS data with {FIRMS_SOURCE_MODIS}: {e}")
    print(f"Response content (first 500 chars): {e.response.text[:500]}...")
    print("This indicates a server-side issue or problem with the request parameters for this source/date/area.")
except Exception as e:
    print(f"\nAn unexpected error occurred during FIRMS data fetching with {FIRMS_SOURCE_MODIS}: {e}")

# --- Test with VIIRS_SNPP_NRT source (original) ---
print(f"\n--- Testing with FIRMS_SOURCE: {FIRMS_SOURCE_VIIRS} ---")
try:
    firms_data_viirs = fetch_firms_data(FIRMS_API_KEY, FIRMS_SOURCE_VIIRS, 
                                        TEST_BOUNDING_BOX_STR, fixed_start_date, fixed_end_date)
    
    print(f"\nSuccessfully fetched {len(firms_data_viirs)} FIRMS hotspots with {FIRMS_SOURCE_VIIRS}.")
    print("First 5 rows of raw FIRMS data (VIIRS):")
    print(firms_data_viirs.head()) 

    # You can add the processing steps for firms_data_viirs here if it succeeds

except requests.exceptions.HTTPError as e:
    print(f"\nHTTP Error fetching FIRMS data with {FIRMS_SOURCE_VIIRS}: {e}")
    print(f"Response content (first 500 chars): {e.response.text[:500]}...")
    print("This indicates a server-side issue or problem with the request parameters for this source/date/area.")
except Exception as e:
    print(f"\nAn unexpected error occurred during FIRMS data fetching with {FIRMS_SOURCE_VIIRS}: {e}")

In [None]:
import requests
import pandas as pd
from datetime import datetime, timedelta
from io import StringIO
import json # Import the json library for parsing JSON responses

# --- Configuration ---
# Your FIRMS API Key, confirmed as valid.
FIRMS_API_KEY = "97581b29da43937b11c279d5776d5b17" 

# Bounding box for Kerala (approximate: min_lon,min_lat,max_lon,max_lat)
KERALA_BOUNDING_BOX_STR = "74.5,8.0,77.5,12.5" 

# Sources for testing
FIRMS_SOURCE_VIIRS = "VIIRS_SNPP_NRT" 
FIRMS_SOURCE_MODIS = "MODIS_NRT" 

# --- Function to fetch FIRMS data ---
def fetch_firms_data(api_key, source, bbox_str, start_date_str, end_date_str):
    """
    Fetches FIRMS active fire data for a given bounding box and fixed date range.
    start_date_str, end_date_str: Dates in 'YYYY-MM-DD' format.
    """
    url = (f"https://firms.modaps.eosdis.nasa.gov/api/area/csv/{api_key}/{source}/"
           f"{bbox_str}/{start_date_str}/{end_date_str}")
    
    print(f"Attempting to fetch FIRMS data from: {url}")
    response = requests.get(url)
    
    # This will raise an HTTPError for 4xx or 5xx responses
    response.raise_for_status() 

    # FIRMS returns data as a CSV string. Use StringIO to read it directly into a Pandas DataFrame.
    data = pd.read_csv(StringIO(response.text))
    
    return data

# --- Function to check API Key Status (now correctly parses JSON and handles errors) ---
def check_firms_api_key_status(api_key):
    """Checks the status of the FIRMS API key."""
    # Note: This URL is documented to return JSON, despite '/csv/' in path
    url = f"https://firms.modaps.eosdis.nasa.gov/api/map_key/status/csv/{api_key}" 
    print(f"\nChecking API key status from: {url}")
    try:
        response = requests.get(url)
        response.raise_for_status()
        
        # Attempt to parse as JSON
        status_data = response.json() 

        print("API Key Status (JSON):")
        print(json.dumps(status_data, indent=2)) # Pretty print JSON
        return status_data 
    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error checking API key status: {e}")
        print(f"Response content (first 500 chars): {response.text[:500]}...")
        print("This often means the API key itself is invalid or not yet active, or a server issue.")
        return None
    except json.JSONDecodeError as e:
        print(f"JSON Decode Error checking API key status: {e}")
        print(f"Response content (first 500 chars): {response.text[:500]}...")
        print("Received non-JSON response when expecting JSON. This could indicate an underlying API issue or invalid key.")
        return None
    except Exception as e:
        print(f"An unexpected error occurred while checking API key status: {e}")
        return None

# --- Function to fetch list of supported countries (expects CSV, handles errors) ---
def fetch_firms_countries(api_key):
    """Fetches a list of countries supported by FIRMS API."""
    url = f"https://firms.modaps.eosdis.nasa.gov/api/countries/csv/{api_key}"
    print(f"\nAttempting to fetch supported countries from: {url}")
    try:
        response = requests.get(url)
        response.raise_for_status()
        countries_data = pd.read_csv(StringIO(response.text))
        print("Supported Countries (first 5):")
        print(countries_data.head())
        return countries_data
    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error fetching countries: {e}")
        print(f"Response content (first 500 chars): {response.text[:500]}...")
        print("This indicates a fundamental issue with the API key or basic API access, or unexpected response format.")
        return None
    except pd.errors.ParserError as e:
        print(f"CSV Parsing Error fetching countries: {e}")
        print(f"Response content (first 500 chars): {response.text[:500]}...")
        print("Received non-CSV or malformed CSV response when expecting CSV. This could indicate an underlying API issue or invalid key.")
        return None
    except Exception as e:
        print(f"An unexpected error occurred while fetching countries: {e}")
        return None

# --- Main Execution Block ---
print("--- Starting FIRMS API Diagnostics ---")

# 1. Check API Key Status (should now print clean JSON if key is good)
key_status = check_firms_api_key_status(FIRMS_API_KEY)
if key_status is not None:
    print(f"API Key Status Check: Success. Transaction Limit: {key_status.get('transaction_limit')}, Used: {key_status.get('current_transactions')}")
else:
    print("API Key Status Check: Failed or returned no data.")

# 2. Try fetching supported countries
countries = fetch_firms_countries(FIRMS_API_KEY)
if countries is not None and not countries.empty:
    print("Basic API call (fetch countries) successful.")
else:
    print("Basic API call (fetch countries) failed or returned no data. (Check error message above for details)")

print("\n--- Attempting main FIRMS data fetch (area/csv) with troubleshooting steps ---")

# Define a very small, specific, recent date range for testing
# This minimizes the data load and potential for server-side processing errors.
# We'll try to fetch data for today only.
today = datetime.now().strftime("%Y-%m-%d")
fixed_start_date = today
fixed_end_date = today

# Use a very small bounding box around a known location in Kerala for minimal data
# Example: A tiny box around Thrissur city center
TEST_BOUNDING_BOX_STR = "76.21,10.52,76.22,10.53" # min_lon,min_lat,max_lon,max_lat

# --- Test with MODIS_NRT source first ---
print(f"\n--- Testing with FIRMS_SOURCE: {FIRMS_SOURCE_MODIS} (Today's data, tiny bbox) ---")
try:
    firms_data_modis = fetch_firms_data(FIRMS_API_KEY, FIRMS_SOURCE_MODIS, 
                                        TEST_BOUNDING_BOX_STR, fixed_start_date, fixed_end_date)
    
    print(f"\nSuccessfully fetched {len(firms_data_modis)} FIRMS hotspots with {FIRMS_SOURCE_MODIS}.")
    print("First 5 rows of raw FIRMS data (MODIS):")
    print(firms_data_modis.head()) 

    # You can add the processing steps for firms_data_modis here if it succeeds

except requests.exceptions.HTTPError as e:
    print(f"\nHTTP Error fetching FIRMS data with {FIRMS_SOURCE_MODIS}: {e}")
    print(f"Response content (first 500 chars): {e.response.text[:500]}...")
    print("This indicates a server-side issue or problem with the request parameters for this source/date/area.")
except Exception as e:
    print(f"\nAn unexpected error occurred during FIRMS data fetching with {FIRMS_SOURCE_MODIS}: {e}")

# --- Test with VIIRS_SNPP_NRT source (original) ---
print(f"\n--- Testing with FIRMS_SOURCE: {FIRMS_SOURCE_VIIRS} (Today's data, tiny bbox) ---")
try:
    firms_data_viirs = fetch_firms_data(FIRMS_API_KEY, FIRMS_SOURCE_VIIRS, 
                                        TEST_BOUNDING_BOX_STR, fixed_start_date, fixed_end_date)
    
    print(f"\nSuccessfully fetched {len(firms_data_viirs)} FIRMS hotspots with {FIRMS_SOURCE_VIIRS}.")
    print("First 5 rows of raw FIRMS data (VIIRS):")
    print(firms_data_viirs.head()) 

    # You can add the processing steps for firms_data_viirs here if it succeeds

except requests.exceptions.HTTPError as e:
    print(f"\nHTTP Error fetching FIRMS data with {FIRMS_SOURCE_VIIRS}: {e}")
    print(f"Response content (first 500 chars): {e.response.text[:500]}...")
    print("This indicates a server-side issue or problem with the request parameters for this source/date/area.")
except Exception as e:
    print(f"\nAn unexpected error occurred during FIRMS data fetching with {FIRMS_SOURCE_VIIRS}: {e}")

In [None]:
import cdsapi
import xarray as xr # For working with NetCDF data
import pandas as pd
import numpy as np # For numerical operations like sqrt and exp
from datetime import datetime, timedelta
import os # For file operations

# --- Configuration ---
# Bounding box for Kerala in CDS API format: [North, West, South, East] (max_lat, min_lon, min_lat, max_lon)
KERALA_BOUNDING_BOX_CDS = [12.5, 74.5, 8.0, 77.5]

def fetch_era5_land_data(bbox_cds, date_range_days=2):
    """
    Fetches ERA5-Land data for a given bounding box and date range.
    bbox_cds: [North, West, South, East]
    date_range_days: Number of days back from today to fetch data.
    """
    c = cdsapi.Client() # Initialize the CDS API client

    end_date = datetime.now()
    start_date = end_date - timedelta(days=date_range_days)

    # Generate lists of years, months, days for the request.
    # CDS API often requires explicit lists for each date component.
    years = list(range(start_date.year, end_date.year + 1))
    months = sorted(list(set([d.month for d in [start_date, end_date]])))
    days = sorted(list(set([d.day for d in [start_date, end_date]])))
    
    # Format month/day with leading zeros if single digit
    months_str = [str(m).zfill(2) for m in months]
    days_str = [str(d).zfill(2) for d in days]

    # Define variables to fetch based on AI/ML requirements
    variables = [
        '2m_temperature',          # Air temperature at 2 meters (in Kelvin)
        '2m_dewpoint_temperature', # Dewpoint temperature at 2 meters (in Kelvin) - needed for humidity
        'total_precipitation',     # Total precipitation (in meters, need to convert to mm)
        '10m_u_component_of_wind', # East-West component of wind at 10 meters
        '10m_v_component_of_wind', # North-South component of wind at 10 meters
    ]

    # Define times to fetch (hourly for detailed analysis)
    times = [
        '00:00', '01:00', '02:00', '03:00', '04:00', '05:00', '06:00', '07:00',
        '08:00', '09:00', '10:00', '11:00', '12:00', '13:00', '14:00', '15:00',
        '16:00', '17:00', '18:00', '19:00', '20:00', '21:00', '22:00', '23:00',
    ]

    # Temporary file to store the downloaded NetCDF data
    output_file = 'era5_land_data.nc'

    print(f"Requesting ERA5-Land data for dates: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    try:
        c.retrieve(
            'reanalysis-era5-land', # Dataset ID
            {
                'variable': variables,
                'year': [str(y) for y in years],
                'month': months_str,
                'day': days_str,
                'time': times,
                'area': bbox_cds, # [North, West, South, East]
                'format': 'netcdf', # NetCDF is the standard format for this data
            },
            output_file) # Save the downloaded data to this file
        print(f"ERA5-Land data downloaded to {output_file}")

        # Load the data using xarray. xarray is excellent for multi-dimensional scientific data.
        ds = xr.open_dataset(output_file)
        print("\nERA5-Land data loaded into xarray Dataset. Info:")
        print(ds) # Print dataset info (variables, dimensions, coordinates)

        # --- Basic Processing (Feature Engineering/Conversion) ---
        # 1. Convert temperature from Kelvin to Celsius
        if '2m_temperature' in ds.data_vars:
            ds['2m_temperature_c'] = ds['2m_temperature'] - 273.15
            print("2m_temperature converted to Celsius.")
        
        # 2. Convert dewpoint temperature from Kelvin to Celsius
        if '2m_dewpoint_temperature' in ds.data_vars:
            ds['2m_dewpoint_temperature_c'] = ds['2m_dewpoint_temperature'] - 273.15
            print("2m_dewpoint_temperature converted to Celsius.")

        # 3. Calculate Relative Humidity from Temperature and Dewpoint (using August-Roche-Magnus formula)
        # Formula: RH = 100 * (exp((17.625 * Td) / (243.04 + Td)) / exp((17.625 * T) / (243.04 + T)))
        # where T is temperature in Celsius, Td is dewpoint temperature in Celsius
        if '2m_temperature_c' in ds.data_vars and '2m_dewpoint_temperature_c' in ds.data_vars:
            e_s = 6.1094 * np.exp((17.625 * ds['2m_temperature_c']) / (243.04 + ds['2m_temperature_c']))
            e_a = 6.1094 * np.exp((17.625 * ds['2m_dewpoint_temperature_c']) / (243.04 + ds['2m_dewpoint_temperature_c']))
            ds['relative_humidity_percent'] = (e_a / e_s) * 100
            # Clip values to ensure they are between 0 and 100%
            ds['relative_humidity_percent'] = ds['relative_humidity_percent'].clip(0, 100) 
            print("Relative humidity calculated.")

        # 4. Calculate Wind Speed from u and v components
        if '10m_u_component_of_wind' in ds.data_vars and '10m_v_component_of_wind' in ds.data_vars:
            ds['wind_speed_ms'] = np.sqrt(ds['10m_u_component_of_wind']**2 + ds['10m_v_component_of_wind']**2)
            # Convert m/s to km/h (1 m/s = 3.6 km/h)
            ds['wind_speed_kmh'] = ds['wind_speed_ms'] * 3.6
            print("Wind speed (km/h) calculated.")
        
        # 5. Convert Total Precipitation from meters to millimeters
        if 'total_precipitation' in ds.data_vars:
            ds['total_precipitation_mm'] = ds['total_precipitation'] * 1000
            print("Total precipitation converted to mm.")

        # Convert xarray Dataset to Pandas DataFrame for easier feature engineering later
        # This flattens the spatial dimensions (latitude, longitude) into rows.
        df_era5 = ds.to_dataframe().reset_index()
        
        # Drop original Kelvin temps and u/v components if Celsius/derived values are preferred
        df_era5 = df_era5.drop(columns=[
            '2m_temperature', '2m_dewpoint_temperature', 
            '10m_u_component_of_wind', '10m_v_component_of_wind',
            'total_precipitation', 'wind_speed_ms' # Drop intermediate wind speed in m/s
        ], errors='ignore') # errors='ignore' prevents error if column doesn't exist

        print("\nERA5-Land data converted to Pandas DataFrame. First 5 rows:")
        print(df_era5.head())
        
        # Optional: Save to a temporary CSV for inspection
        df_era5.to_csv("era5_land_data_processed.csv", index=False)
        print("\nProcessed ERA5-Land data saved to era5_land_data_processed.csv")
        
        return df_era5

    except cdsapi.api.APIError as e:
        print(f"\nCDS API Error fetching ERA5-Land data: {e}")
        print("Please check:")
        print("  1. Your .cdsapirc file is correctly configured in your home directory (UID and API Key).")
        print("  2. Your CDS account for any data download limits or issues.")
        print("  3. The bounding box order and format for the CDS API ([North, West, South, East]) is correct.")
        print("  4. The requested variables and dates are valid for the ERA5-Land dataset.")
        print("  5. You have agreed to the Terms of Use for the ERA5-Land dataset on the CDS website (this is a one-time manual step on their site).")
        return None
    except Exception as e:
        print(f"\nAn unexpected error occurred during ERA5-Land data fetching: {e}")
        print("Ensure you have 'cdsapi', 'xarray', 'netCDF4', and 'numpy' installed in your environment.")
        return None

# --- Test the function ---
# Fetch last 2 days of ERA5-Land data for Kerala
print("--- Starting ERA5-Land Data Acquisition ---")
era5_df = fetch_era5_land_data(KERALA_BOUNDING_BOX_CDS, date_range_days=2) 

if era5_df is not None:
    print("\nERA5-Land data acquisition and initial processing complete.")
    print(f"Shape of the processed ERA5-Land DataFrame: {era5_df.shape}")
else:
    print("\nERA5-Land data acquisition failed. Please review the error messages above.")

In [None]:
import os

# Get the user's home directory
home_dir = os.path.expanduser("~")
cdsapirc_path = os.path.join(home_dir, ".cdsapirc")

print(f"Checking for .cdsapirc file at: {cdsapirc_path}")

if os.path.exists(cdsapirc_path):
    print(".cdsapirc file found!")
    try:
        with open(cdsapirc_path, 'r') as f:
            content = f.read()
            print("\nContent of .cdsapirc:")
            print("---START CONTENT---")
            print(content)
            print("---END CONTENT---")
            
            # Basic check for expected lines
            if "url:" in content and "key:" in content:
                print("\nContent seems to contain 'url:' and 'key:' lines.")
                print("Please visually inspect the content above for correct formatting (no extra spaces, correct UID:KEY format).")
            else:
                print("\nWARNING: Content does NOT seem to contain 'url:' and 'key:' lines. Please check file content.")

    except Exception as e:
        print(f"Error reading .cdsapirc file: {e}")
        print("Please check file permissions.")
else:
    print(".cdsapirc file NOT FOUND at the specified path.")
    print("Please ensure:")
    print("1. The file is named exactly '.cdsapirc' (no .txt extension, etc.).")
    print("2. It is located directly in your user home directory: C:\\Users\\annac\\")
    print("3. On Windows, you might need to enable 'Show hidden items' in File Explorer's 'View' tab to see files starting with a dot.")

In [None]:
import os

# Get the user's home directory
home_dir = os.path.expanduser("~")
cdsapirc_path = os.path.join(home_dir, ".cdsapirc")

print(f"Checking for .cdsapirc file at: {cdsapirc_path}")

if os.path.exists(cdsapirc_path):
    print(".cdsapirc file found!")
    try:
        with open(cdsapirc_path, 'r') as f:
            content = f.read()
            print("\nContent of .cdsapirc:")
            print("---START CONTENT---")
            print(content)
            print("---END CONTENT---")
            
            # Basic check for expected lines
            if "url:" in content and "key:" in content:
                print("\nContent seems to contain 'url:' and 'key:' lines.")
                print("Please visually inspect the content above for correct formatting (no extra spaces, correct UID:KEY format).")
            else:
                print("\nWARNING: Content does NOT seem to contain 'url:' and 'key:' lines. Please check file content.")

    except Exception as e:
        print(f"Error reading .cdsapirc file: {e}")
        print("Please check file permissions.")
else:
    print(".cdsapirc file NOT FOUND at the specified path.")
    print("Please ensure:")
    print("1. The file is named exactly '.cdsapirc' (no .txt extension, etc.).")
    print("2. It is located directly in your user home directory: C:\\Users\\annac\\")
    print("3. On Windows, you might need to enable 'Show hidden items' in File Explorer's 'View' tab to see files starting with a dot.")

In [None]:
import cdsapi
import xarray as xr # For working with NetCDF data
import pandas as pd
import numpy as np # For numerical operations like sqrt and exp
from datetime import datetime, timedelta
import os # For file operations

# --- Configuration ---
# Bounding box for Kerala in CDS API format: [North, West, South, East] (max_lat, min_lon, min_lat, max_lon)
KERALA_BOUNDING_BOX_CDS = [12.5, 74.5, 8.0, 77.5]

def fetch_era5_land_data(bbox_cds, date_range_days=2):
    """
    Fetches ERA5-Land data for a given bounding box and date range.
    bbox_cds: [North, West, South, East]
    date_range_days: Number of days back from today to fetch data.
    """
    c = cdsapi.Client() # Initialize the CDS API client

    end_date = datetime.now()
    start_date = end_date - timedelta(days=date_range_days)

    # Generate lists of years, months, days for the request.
    # CDS API often requires explicit lists for each date component.
    years = list(range(start_date.year, end_date.year + 1))
    months = sorted(list(set([d.month for d in [start_date, end_date]])))
    days = sorted(list(set([d.day for d in [start_date, end_date]])))
    
    # Format month/day with leading zeros if single digit
    months_str = [str(m).zfill(2) for m in months]
    days_str = [str(d).zfill(2) for d in days]

    # Define variables to fetch based on AI/ML requirements
    variables = [
        '2m_temperature',          # Air temperature at 2 meters (in Kelvin)
        '2m_dewpoint_temperature', # Dewpoint temperature at 2 meters (in Kelvin) - needed for humidity
        'total_precipitation',     # Total precipitation (in meters, need to convert to mm)
        '10m_u_component_of_wind', # East-West component of wind at 10 meters
        '10m_v_component_of_wind', # North-South component of wind at 10 meters
    ]

    # Define times to fetch (hourly for detailed analysis)
    times = [
        '00:00', '01:00', '02:00', '03:00', '04:00', '05:00', '06:00', '07:00',
        '08:00', '09:00', '10:00', '11:00', '12:00', '13:00', '14:00', '15:00',
        '16:00', '17:00', '18:00', '19:00', '20:00', '21:00', '22:00', '23:00',
    ]

    # Temporary file to store the downloaded NetCDF data
    output_file = 'era5_land_data.nc'

    print(f"Requesting ERA5-Land data for dates: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    try:
        c.retrieve(
            'reanalysis-era5-land', # Dataset ID
            {
                'variable': variables,
                'year': [str(y) for y in years],
                'month': months_str,
                'day': days_str,
                'time': times,
                'area': bbox_cds, # [North, West, South, East]
                'format': 'netcdf', # NetCDF is the standard format for this data
            },
            output_file) # Save the downloaded data to this file
        print(f"ERA5-Land data downloaded to {output_file}")

        # Load the data using xarray. xarray is excellent for multi-dimensional scientific data.
        ds = xr.open_dataset(output_file)
        print("\nERA5-Land data loaded into xarray Dataset. Info:")
        print(ds) # Print dataset info (variables, dimensions, coordinates)

        # --- Basic Processing (Feature Engineering/Conversion) ---
        # 1. Convert temperature from Kelvin to Celsius
        if '2m_temperature' in ds.data_vars:
            ds['2m_temperature_c'] = ds['2m_temperature'] - 273.15
            print("2m_temperature converted to Celsius.")
        
        # 2. Convert dewpoint temperature from Kelvin to Celsius
        if '2m_dewpoint_temperature' in ds.data_vars:
            ds['2m_dewpoint_temperature_c'] = ds['2m_dewpoint_temperature'] - 273.15
            print("2m_dewpoint_temperature converted to Celsius.")

        # 3. Calculate Relative Humidity from Temperature and Dewpoint (using August-Roche-Magnus formula)
        # Formula: RH = 100 * (exp((17.625 * Td) / (243.04 + Td)) / exp((17.625 * T) / (243.04 + T)))
        # where T is temperature in Celsius, Td is dewpoint temperature in Celsius
        if '2m_temperature_c' in ds.data_vars and '2m_dewpoint_temperature_c' in ds.data_vars:
            e_s = 6.1094 * np.exp((17.625 * ds['2m_temperature_c']) / (243.04 + ds['2m_temperature_c']))
            e_a = 6.1094 * np.exp((17.625 * ds['2m_dewpoint_temperature_c']) / (243.04 + ds['2m_dewpoint_temperature_c']))
            ds['relative_humidity_percent'] = (e_a / e_s) * 100
            # Clip values to ensure they are between 0 and 100%
            ds['relative_humidity_percent'] = ds['relative_humidity_percent'].clip(0, 100) 
            print("Relative humidity calculated.")

        # 4. Calculate Wind Speed from u and v components
        if '10m_u_component_of_wind' in ds.data_vars and '10m_v_component_of_wind' in ds.data_vars:
            ds['wind_speed_ms'] = np.sqrt(ds['10m_u_component_of_wind']**2 + ds['10m_v_component_of_wind']**2)
            # Convert m/s to km/h (1 m/s = 3.6 km/h)
            ds['wind_speed_kmh'] = ds['wind_speed_ms'] * 3.6
            print("Wind speed (km/h) calculated.")
        
        # 5. Convert Total Precipitation from meters to millimeters
        if 'total_precipitation' in ds.data_vars:
            ds['total_precipitation_mm'] = ds['total_precipitation'] * 1000
            print("Total precipitation converted to mm.")

        # Convert xarray Dataset to Pandas DataFrame for easier feature engineering later
        # This flattens the spatial dimensions (latitude, longitude) into rows.
        df_era5 = ds.to_dataframe().reset_index()
        
        # Drop original Kelvin temps and u/v components if Celsius/derived values are preferred
        df_era5 = df_era5.drop(columns=[
            '2m_temperature', '2m_dewpoint_temperature', 
            '10m_u_component_of_wind', '10m_v_component_of_wind',
            'total_precipitation', 'wind_speed_ms' # Drop intermediate wind speed in m/s
        ], errors='ignore') # errors='ignore' prevents error if column doesn't exist

        print("\nERA5-Land data converted to Pandas DataFrame. First 5 rows:")
        print(df_era5.head())
        
        # Optional: Save to a temporary CSV for inspection
        df_era5.to_csv("era5_land_data_processed.csv", index=False)
        print("\nProcessed ERA5-Land data saved to era5_land_data_processed.csv")
        
        return df_era5

    except cdsapi.api.APIError as e:
        print(f"\nCDS API Error fetching ERA5-Land data: {e}")
        print("Please check:")
        print("  1. Your .cdsapirc file is correctly configured in your home directory (UID and API Key).")
        print("  2. Your CDS account for any data download limits or issues.")
        print("  3. The bounding box order and format for the CDS API ([North, West, South, East]) is correct.")
        print("  4. The requested variables and dates are valid for the ERA5-Land dataset.")
        print("  5. You have agreed to the Terms of Use for the ERA5-Land dataset on the CDS website (this is a one-time manual step on their site).")
        return None
    except Exception as e:
        print(f"\nAn unexpected error occurred during ERA5-Land data fetching: {e}")
        print("Ensure you have 'cdsapi', 'xarray', 'netCDF4', and 'numpy' installed in your environment.")
        return None

# --- Test the function ---
# Fetch last 2 days of ERA5-Land data for Kerala
print("--- Starting ERA5-Land Data Acquisition ---")
era5_df = fetch_era5_land_data(KERALA_BOUNDING_BOX_CDS, date_range_days=2) 

if era5_df is not None:
    print("\nERA5-Land data acquisition and initial processing complete.")
    print(f"Shape of the processed ERA5-Land DataFrame: {era5_df.shape}")
else:
    print("\nERA5-Land data acquisition failed. Please review the error messages above.")

In [None]:
import cdsapi
import xarray as xr # For working with NetCDF data
import pandas as pd
import numpy as np # For numerical operations like sqrt and exp
from datetime import datetime, timedelta
import os # For file operations

# --- Configuration ---
# Bounding box for Kerala in CDS API format: [North, West, South, East] (max_lat, min_lon, min_lat, max_lon)
KERALA_BOUNDING_BOX_CDS = [12.5, 74.5, 8.0, 77.5]

def fetch_era5_land_data(bbox_cds, date_range_days=2):
    """
    Fetches ERA5-Land data for a given bounding box and date range.
    bbox_cds: [North, West, South, East]
    date_range_days: Number of days back from today to fetch data.
    """
    c = cdsapi.Client() # Initialize the CDS API client

    end_date = datetime.now()
    start_date = end_date - timedelta(days=date_range_days)

    # Generate lists of years, months, days for the request.
    # CDS API often requires explicit lists for each date component.
    years = list(range(start_date.year, end_date.year + 1))
    months = sorted(list(set([d.month for d in [start_date, end_date]])))
    days = sorted(list(set([d.day for d in [start_date, end_date]])))
    
    # Format month/day with leading zeros if single digit
    months_str = [str(m).zfill(2) for m in months]
    days_str = [str(d).zfill(2) for d in days]

    # Define variables to fetch based on AI/ML requirements
    variables = [
        '2m_temperature',          # Air temperature at 2 meters (in Kelvin)
        '2m_dewpoint_temperature', # Dewpoint temperature at 2 meters (in Kelvin) - needed for humidity
        'total_precipitation',     # Total precipitation (in meters, need to convert to mm)
        '10m_u_component_of_wind', # East-West component of wind at 10 meters
        '10m_v_component_of_wind', # North-South component of wind at 10 meters
    ]

    # Define times to fetch (hourly for detailed analysis)
    times = [
        '00:00', '01:00', '02:00', '03:00', '04:00', '05:00', '06:00', '07:00',
        '08:00', '09:00', '10:00', '11:00', '12:00', '13:00', '14:00', '15:00',
        '16:00', '17:00', '18:00', '19:00', '20:00', '21:00', '22:00', '23:00',
    ]

    # Temporary file to store the downloaded NetCDF data
    output_file = 'era5_land_data.nc'

    print(f"Requesting ERA5-Land data for dates: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    try:
        c.retrieve(
            'reanalysis-era5-land', # Dataset ID
            {
                'variable': variables,
                'year': [str(y) for y in years],
                'month': months_str,
                'day': days_str,
                'time': times,
                'area': bbox_cds, # [North, West, South, East]
                'format': 'netcdf', # NetCDF is the standard format for this data
            },
            output_file) # Save the downloaded data to this file
        print(f"ERA5-Land data downloaded to {output_file}")

        # Load the data using xarray. xarray is excellent for multi-dimensional scientific data.
        ds = xr.open_dataset(output_file)
        print("\nERA5-Land data loaded into xarray Dataset. Info:")
        print(ds) # Print dataset info (variables, dimensions, coordinates)

        # --- Basic Processing (Feature Engineering/Conversion) ---
        # 1. Convert temperature from Kelvin to Celsius
        if '2m_temperature' in ds.data_vars:
            ds['2m_temperature_c'] = ds['2m_temperature'] - 273.15
            print("2m_temperature converted to Celsius.")
        
        # 2. Convert dewpoint temperature from Kelvin to Celsius
        if '2m_dewpoint_temperature' in ds.data_vars:
            ds['2m_dewpoint_temperature_c'] = ds['2m_dewpoint_temperature'] - 273.15
            print("2m_dewpoint_temperature converted to Celsius.")

        # 3. Calculate Relative Humidity from Temperature and Dewpoint (using August-Roche-Magnus formula)
        # Formula: RH = 100 * (exp((17.625 * Td) / (243.04 + Td)) / exp((17.625 * T) / (243.04 + T)))
        # where T is temperature in Celsius, Td is dewpoint temperature in Celsius
        if '2m_temperature_c' in ds.data_vars and '2m_dewpoint_temperature_c' in ds.data_vars:
            e_s = 6.1094 * np.exp((17.625 * ds['2m_temperature_c']) / (243.04 + ds['2m_temperature_c']))
            e_a = 6.1094 * np.exp((17.625 * ds['2m_dewpoint_temperature_c']) / (243.04 + ds['2m_dewpoint_temperature_c']))
            ds['relative_humidity_percent'] = (e_a / e_s) * 100
            # Clip values to ensure they are between 0 and 100%
            ds['relative_humidity_percent'] = ds['relative_humidity_percent'].clip(0, 100) 
            print("Relative humidity calculated.")

        # 4. Calculate Wind Speed from u and v components
        if '10m_u_component_of_wind' in ds.data_vars and '10m_v_component_of_wind' in ds.data_vars:
            ds['wind_speed_ms'] = np.sqrt(ds['10m_u_component_of_wind']**2 + ds['10m_v_component_of_wind']**2)
            # Convert m/s to km/h (1 m/s = 3.6 km/h)
            ds['wind_speed_kmh'] = ds['wind_speed_ms'] * 3.6
            print("Wind speed (km/h) calculated.")
        
        # 5. Convert Total Precipitation from meters to millimeters
        if 'total_precipitation' in ds.data_vars:
            ds['total_precipitation_mm'] = ds['total_precipitation'] * 1000
            print("Total precipitation converted to mm.")

        # Convert xarray Dataset to Pandas DataFrame for easier feature engineering later
        # This flattens the spatial dimensions (latitude, longitude) into rows.
        df_era5 = ds.to_dataframe().reset_index()
        
        # Drop original Kelvin temps and u/v components if Celsius/derived values are preferred
        df_era5 = df_era5.drop(columns=[
            '2m_temperature', '2m_dewpoint_temperature', 
            '10m_u_component_of_wind', '10m_v_component_of_wind',
            'total_precipitation', 'wind_speed_ms' # Drop intermediate wind speed in m/s
        ], errors='ignore') # errors='ignore' prevents error if column doesn't exist

        print("\nERA5-Land data converted to Pandas DataFrame. First 5 rows:")
        print(df_era5.head())
        
        # Optional: Save to a temporary CSV for inspection
        df_era5.to_csv("era5_land_data_processed.csv", index=False)
        print("\nProcessed ERA5-Land data saved to era5_land_data_processed.csv")
        
        return df_era5

    except cdsapi.api.APIError as e:
        print(f"\nCDS API Error fetching ERA5-Land data: {e}")
        print("Please check:")
        print("  1. Your .cdsapirc file is correctly configured in your home directory (UID and API Key).")
        print("  2. Your CDS account for any data download limits or issues.")
        print("  3. The bounding box order and format for the CDS API ([North, West, South, East]) is correct.")
        print("  4. The requested variables and dates are valid for the ERA5-Land dataset.")
        print("  5. You have agreed to the Terms of Use for the ERA5-Land dataset on the CDS website (this is a one-time manual step on their site).")
        return None
    except Exception as e:
        print(f"\nAn unexpected error occurred during ERA5-Land data fetching: {e}")
        print("Ensure you have 'cdsapi', 'xarray', 'netCDF4', and 'numpy' installed in your environment.")
        return None

# --- Test the function ---
# Fetch last 2 days of ERA5-Land data for Kerala
print("--- Starting ERA5-Land Data Acquisition ---")
era5_df = fetch_era5_land_data(KERALA_BOUNDING_BOX_CDS, date_range_days=2) 

if era5_df is not None:
    print("\nERA5-Land data acquisition and initial processing complete.")
    print(f"Shape of the processed ERA5-Land DataFrame: {era5_df.shape}")
else:
    print("\nERA5-Land data acquisition failed. Please review the error messages above.")

In [None]:
import cdsapi
import xarray as xr # For working with NetCDF data
import pandas as pd
import numpy as np # For numerical operations like sqrt and exp
from datetime import datetime, timedelta
import os # For file operations
import requests # Import requests to catch its specific HTTPError

# --- Configuration ---
# Bounding box for Kerala in CDS API format: [North, West, South, East] (max_lat, min_lon, min_lat, max_lon)
KERALA_BOUNDING_BOX_CDS = [12.5, 74.5, 8.0, 77.5]

def fetch_era5_land_data(bbox_cds, date_range_days=2):
    """
    Fetches ERA5-Land data for a given bounding box and date range.
    bbox_cds: [North, West, South, East]
    date_range_days: Number of days back from today to fetch data.
    """
    c = cdsapi.Client() # Initialize the CDS API client

    end_date = datetime.now()
    start_date = end_date - timedelta(days=date_range_days)

    # Generate lists of years, months, days for the request.
    # CDS API often requires explicit lists for each date component.
    years = list(range(start_date.year, end_date.year + 1))
    months = sorted(list(set([d.month for d in [start_date, end_date]])))
    days = sorted(list(set([d.day for d in [start_date, end_date]])))
    
    # Format month/day with leading zeros if single digit
    months_str = [str(m).zfill(2) for m in months]
    days_str = [str(d).zfill(2) for d in days]

    # Define variables to fetch based on AI/ML requirements
    variables = [
        '2m_temperature',          # Air temperature at 2 meters (in Kelvin)
        '2m_dewpoint_temperature', # Dewpoint temperature at 2 meters (in Kelvin) - needed for humidity
        'total_precipitation',     # Total precipitation (in meters, need to convert to mm)
        '10m_u_component_of_wind', # East-West component of wind at 10 meters
        '10m_v_component_of_wind', # North-South component of wind at 10 meters
    ]

    # Define times to fetch (hourly for detailed analysis)
    times = [
        '00:00', '01:00', '02:00', '03:00', '04:00', '05:00', '06:00', '07:00',
        '08:00', '09:00', '10:00', '11:00', '12:00', '13:00', '14:00', '15:00',
        '16:00', '17:00', '18:00', '19:00', '20:00', '21:00', '22:00', '23:00',
    ]

    # Temporary file to store the downloaded NetCDF data
    output_file = 'era5_land_data.nc'

    print(f"Requesting ERA5-Land data for dates: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    try:
        c.retrieve(
            'reanalysis-era5-land', # Dataset ID
            {
                'variable': variables,
                'year': [str(y) for y in years],
                'month': months_str,
                'day': days_str,
                'time': times,
                'area': bbox_cds, # [North, West, South, East]
                'format': 'netcdf', # NetCDF is the standard format for this data
            },
            output_file) # Save the downloaded data to this file
        print(f"ERA5-Land data downloaded to {output_file}")

        # Load the data using xarray. xarray is excellent for multi-dimensional scientific data.
        ds = xr.open_dataset(output_file)
        print("\nERA5-Land data loaded into xarray Dataset. Info:")
        print(ds) # Print dataset info (variables, dimensions, coordinates)

        # --- Basic Processing (Feature Engineering/Conversion) ---
        # 1. Convert temperature from Kelvin to Celsius
        if '2m_temperature' in ds.data_vars:
            ds['2m_temperature_c'] = ds['2m_temperature'] - 273.15
            print("2m_temperature converted to Celsius.")
        
        # 2. Convert dewpoint temperature from Kelvin to Celsius
        if '2m_dewpoint_temperature' in ds.data_vars:
            ds['2m_dewpoint_temperature_c'] = ds['2m_dewpoint_temperature'] - 273.15
            print("2m_dewpoint_temperature converted to Celsius.")

        # 3. Calculate Relative Humidity from Temperature and Dewpoint (using August-Roche-Magnus formula)
        # Formula: RH = 100 * (exp((17.625 * Td) / (243.04 + Td)) / exp((17.625 * T) / (243.04 + T)))
        # where T is temperature in Celsius, Td is dewpoint temperature in Celsius
        if '2m_temperature_c' in ds.data_vars and '2m_dewpoint_temperature_c' in ds.data_vars:
            e_s = 6.1094 * np.exp((17.625 * ds['2m_temperature_c']) / (243.04 + ds['2m_temperature_c']))
            e_a = 6.1094 * np.exp((17.625 * ds['2m_dewpoint_temperature_c']) / (243.04 + ds['2m_dewpoint_temperature_c']))
            ds['relative_humidity_percent'] = (e_a / e_s) * 100
            # Clip values to ensure they are between 0 and 100%
            ds['relative_humidity_percent'] = ds['relative_humidity_percent'].clip(0, 100) 
            print("Relative humidity calculated.")

        # 4. Calculate Wind Speed from u and v components
        if '10m_u_component_of_wind' in ds.data_vars and '10m_v_component_of_wind' in ds.data_vars:
            ds['wind_speed_ms'] = np.sqrt(ds['10m_u_component_of_wind']**2 + ds['10m_v_component_of_wind']**2)
            # Convert m/s to km/h (1 m/s = 3.6 km/h)
            ds['wind_speed_kmh'] = ds['wind_speed_ms'] * 3.6
            print("Wind speed (km/h) calculated.")
        
        # 5. Convert Total Precipitation from meters to millimeters
        if 'total_precipitation' in ds.data_vars:
            ds['total_precipitation_mm'] = ds['total_precipitation'] * 1000
            print("Total precipitation converted to mm.")

        # Convert xarray Dataset to Pandas DataFrame for easier feature engineering later
        # This flattens the spatial dimensions (latitude, longitude) into rows.
        df_era5 = ds.to_dataframe().reset_index()
        
        # Drop original Kelvin temps and u/v components if Celsius/derived values are preferred
        df_era5 = df_era5.drop(columns=[
            '2m_temperature', '2m_dewpoint_temperature', 
            '10m_u_component_of_wind', '10m_v_component_of_wind',
            'total_precipitation', 'wind_speed_ms' # Drop intermediate wind speed in m/s
        ], errors='ignore') # errors='ignore' prevents error if column doesn't exist

        print("\nERA5-Land data converted to Pandas DataFrame. First 5 rows:")
        print(df_era5.head())
        
        # Optional: Save to a temporary CSV for inspection
        df_era5.to_csv("era5_land_data_processed.csv", index=False)
        print("\nProcessed ERA5-Land data saved to era5_land_data_processed.csv")
        
        return df_era5

    # --- CORRECTED EXCEPTION TYPE HERE ---
    except requests.exceptions.HTTPError as e:
        print(f"\nCDS API HTTP Error fetching ERA5-Land data: {e}")
        print(f"Response URL: {e.response.url}")
        print(f"Response status code: {e.response.status_code}")
        print(f"Response content (first 500 chars): {e.response.text[:500]}...")
        print("Please check:")
        print("  1. Your .cdsapirc file's 'url' is set to 'https://cds.climate.copernicus.eu/api' (NO /v2 at the end).")
        print("  2. Your CDS account for any data download limits or issues.")
        print("  3. The bounding box order and format for the CDS API ([North, West, South, East]) is correct.")
        print("  4. The requested variables and dates are valid for the ERA5-Land dataset.")
        print("  5. You have agreed to the Terms of Use for the ERA5-Land dataset on the CDS website (this is a one-time manual step on their site).")
        return None
    except Exception as e:
        print(f"\nAn unexpected error occurred during ERA5-Land data fetching: {e}")
        print("Ensure you have 'cdsapi', 'xarray', 'netCDF4', and 'numpy' installed in your environment.")
        return None

# --- Test the function ---
# Fetch last 2 days of ERA5-Land data for Kerala
print("--- Starting ERA5-Land Data Acquisition ---")
era5_df = fetch_era5_land_data(KERALA_BOUNDING_BOX_CDS, date_range_days=2) 

if era5_df is not None:
    print("\nERA5-Land data acquisition and initial processing complete.")
    print(f"Shape of the processed ERA5-Land DataFrame: {era5_df.shape}")
else:
    print("\nERA5-Land data acquisition failed. Please review the error messages above.")

In [None]:
import os

# Get the user's home directory
home_dir = os.path.expanduser("~")
cdsapirc_path = os.path.join(home_dir, ".cdsapirc")

print(f"--- Verifying .cdsapirc file content at: {cdsapirc_path} ---")

if os.path.exists(cdsapirc_path):
    print(".cdsapirc file found!")
    try:
        with open(cdsapirc_path, 'r') as f:
            content = f.read()
            print("\nContent of .cdsapirc:")
            print("---START .CDSAPIRC CONTENT---")
            print(content)
            print("---END .CDSAPIRC CONTENT---")
            
            # Basic check for expected lines and URL format
            if "url: https://cds.climate.copernicus.eu/api" in content and "key:" in content:
                print("\n.cdsapirc content looks correct for URL and key presence.")
                print("Please ensure your UID:API_KEY is correct and has no extra spaces.")
            elif "url:" in content and "key:" in content:
                print("\nWARNING: 'url: https://cds.climate.copernicus.eu/api' not found exactly.")
                print("Please ensure the 'url' line is exactly 'url: https://cds.climate.copernicus.eu/api'.")
            else:
                print("\nWARNING: Content does NOT seem to contain 'url:' and 'key:' lines. Please check file content.")

    except Exception as e:
        print(f"Error reading .cdsapirc file: {e}")
        print("Please check file permissions or if the file is corrupted.")
else:
    print(".cdsapirc file NOT FOUND at the specified path.")
    print("Please ensure:")
    print("1. The file is named exactly '.cdsapirc' (no .txt extension, etc.).")
    print("2. It is located directly in your user home directory (C:\\Users\\annac\\).")
    print("3. On Windows, you might need to enable 'Show hidden items' in File Explorer's 'View' tab to see files starting with a dot.")

print("\n--- End .cdsapirc verification ---")

In [None]:
import os

# Get the user's home directory
home_dir = os.path.expanduser("~")
cdsapirc_path = os.path.join(home_dir, ".cdsapirc")

print(f"--- Verifying .cdsapirc file content at: {cdsapirc_path} ---")

if os.path.exists(cdsapirc_path):
    print(".cdsapirc file found!")
    try:
        with open(cdsapirc_path, 'r') as f:
            content = f.read()
            print("\nContent of .cdsapirc:")
            print("---START .CDSAPIRC CONTENT---")
            print(content)
            print("---END .CDSAPIRC CONTENT---")
            
            # Basic check for expected lines and URL format
            if "url: https://cds.climate.copernicus.eu/api" in content and "key:" in content:
                print("\n.cdsapirc content looks correct for URL and key presence.")
                print("Please ensure your UID:API_KEY is correct and has no extra spaces.")
            elif "url:" in content and "key:" in content:
                print("\nWARNING: 'url: https://cds.climate.copernicus.eu/api' not found exactly.")
                print("Please ensure the 'url' line is exactly 'url: https://cds.climate.copernicus.eu/api'.")
            else:
                print("\nWARNING: Content does NOT seem to contain 'url:' and 'key:' lines. Please check file content.")

    except Exception as e:
        print(f"Error reading .cdsapirc file: {e}")
        print("Please check file permissions or if the file is corrupted.")
else:
    print(".cdsapirc file NOT FOUND at the specified path.")
    print("Please ensure:")
    print("1. The file is named exactly '.cdsapirc' (no .txt extension, etc.).")
    print("2. It is located directly in your user home directory (C:\\Users\\annac\\).")
    print("3. On Windows, you might need to enable 'Show hidden items' in File Explorer's 'View' tab to see files starting with a dot.")

print("\n--- End .cdsapirc verification ---")

In [None]:
import cdsapi
import xarray as xr # For working with NetCDF data
import pandas as pd
import numpy as np # For numerical operations like sqrt and exp
from datetime import datetime, timedelta
import os # For file operations
import requests # Import requests to catch its specific HTTPError

# --- Configuration ---
# Bounding box for Kerala in CDS API format: [North, West, South, East] (max_lat, min_lon, min_lat, max_lon)
KERALA_BOUNDING_BOX_CDS = [12.5, 74.5, 8.0, 77.5]

def fetch_era5_land_data(bbox_cds, date_range_days=2):
    """
    Fetches ERA5-Land data for a given bounding box and date range.
    bbox_cds: [North, West, South, East]
    date_range_days: Number of days back from today to fetch data.
    """
    c = cdsapi.Client() # Initialize the CDS API client

    end_date = datetime.now()
    start_date = end_date - timedelta(days=date_range_days)

    # Generate lists of years, months, days for the request.
    # CDS API often requires explicit lists for each date component.
    years = list(range(start_date.year, end_date.year + 1))
    months = sorted(list(set([d.month for d in [start_date, end_date]])))
    days = sorted(list(set([d.day for d in [start_date, end_date]])))
    
    # Format month/day with leading zeros if single digit
    months_str = [str(m).zfill(2) for m in months]
    days_str = [str(d).zfill(2) for d in days]

    # Define variables to fetch based on AI/ML requirements
    variables = [
        '2m_temperature',          # Air temperature at 2 meters (in Kelvin)
        '2m_dewpoint_temperature', # Dewpoint temperature at 2 meters (in Kelvin) - needed for humidity
        'total_precipitation',     # Total precipitation (in meters, need to convert to mm)
        '10m_u_component_of_wind', # East-West component of wind at 10 meters
        '10m_v_component_of_wind', # North-South component of wind at 10 meters
    ]

    # Define times to fetch (hourly for detailed analysis)
    times = [
        '00:00', '01:00', '02:00', '03:00', '04:00', '05:00', '06:00', '07:00',
        '08:00', '09:00', '10:00', '11:00', '12:00', '13:00', '14:00', '15:00',
        '16:00', '17:00', '18:00', '19:00', '20:00', '21:00', '22:00', '23:00',
    ]

    # Temporary file to store the downloaded NetCDF data
    output_file = 'era5_land_data.nc'

    print(f"Requesting ERA5-Land data for dates: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    try:
        c.retrieve(
            'reanalysis-era5-land', # Dataset ID
            {
                'variable': variables,
                'year': [str(y) for y in years],
                'month': months_str,
                'day': days_str,
                'time': times,
                'area': bbox_cds, # [North, West, South, East]
                'format': 'netcdf', # NetCDF is the standard format for this data
            },
            output_file) # Save the downloaded data to this file
        print(f"ERA5-Land data downloaded to {output_file}")

        # Load the data using xarray. xarray is excellent for multi-dimensional scientific data.
        ds = xr.open_dataset(output_file)
        print("\nERA5-Land data loaded into xarray Dataset. Info:")
        print(ds) # Print dataset info (variables, dimensions, coordinates)

        # --- Basic Processing (Feature Engineering/Conversion) ---
        # 1. Convert temperature from Kelvin to Celsius
        if '2m_temperature' in ds.data_vars:
            ds['2m_temperature_c'] = ds['2m_temperature'] - 273.15
            print("2m_temperature converted to Celsius.")
        
        # 2. Convert dewpoint temperature from Kelvin to Celsius
        if '2m_dewpoint_temperature' in ds.data_vars:
            ds['2m_dewpoint_temperature_c'] = ds['2m_dewpoint_temperature'] - 273.15
            print("2m_dewpoint_temperature converted to Celsius.")

        # 3. Calculate Relative Humidity from Temperature and Dewpoint (using August-Roche-Magnus formula)
        # Formula: RH = 100 * (exp((17.625 * Td) / (243.04 + Td)) / exp((17.625 * T) / (243.04 + T)))
        # where T is temperature in Celsius, Td is dewpoint temperature in Celsius
        if '2m_temperature_c' in ds.data_vars and '2m_dewpoint_temperature_c' in ds.data_vars:
            e_s = 6.1094 * np.exp((17.625 * ds['2m_temperature_c']) / (243.04 + ds['2m_temperature_c']))
            e_a = 6.1094 * np.exp((17.625 * ds['2m_dewpoint_temperature_c']) / (243.04 + ds['2m_dewpoint_temperature_c']))
            ds['relative_humidity_percent'] = (e_a / e_s) * 100
            # Clip values to ensure they are between 0 and 100%
            ds['relative_humidity_percent'] = ds['relative_humidity_percent'].clip(0, 100) 
            print("Relative humidity calculated.")

        # 4. Calculate Wind Speed from u and v components
        if '10m_u_component_of_wind' in ds.data_vars and '10m_v_component_of_wind' in ds.data_vars:
            ds['wind_speed_ms'] = np.sqrt(ds['10m_u_component_of_wind']**2 + ds['10m_v_component_of_wind']**2)
            # Convert m/s to km/h (1 m/s = 3.6 km/h)
            ds['wind_speed_kmh'] = ds['wind_speed_ms'] * 3.6
            print("Wind speed (km/h) calculated.")
        
        # 5. Convert Total Precipitation from meters to millimeters
        if 'total_precipitation' in ds.data_vars:
            ds['total_precipitation_mm'] = ds['total_precipitation'] * 1000
            print("Total precipitation converted to mm.")

        # Convert xarray Dataset to Pandas DataFrame for easier feature engineering later
        # This flattens the spatial dimensions (latitude, longitude) into rows.
        df_era5 = ds.to_dataframe().reset_index()
        
        # Drop original Kelvin temps and u/v components if Celsius/derived values are preferred
        df_era5 = df_era5.drop(columns=[
            '2m_temperature', '2m_dewpoint_temperature', 
            '10m_u_component_of_wind', '10m_v_component_of_wind',
            'total_precipitation', 'wind_speed_ms' # Drop intermediate wind speed in m/s
        ], errors='ignore') # errors='ignore' prevents error if column doesn't exist

        print("\nERA5-Land data converted to Pandas DataFrame. First 5 rows:")
        print(df_era5.head())
        
        # Optional: Save to a temporary CSV for inspection
        df_era5.to_csv("era5_land_data_processed.csv", index=False)
        print("\nProcessed ERA5-Land data saved to era5_land_data_processed.csv")
        
        return df_era5

    # --- CORRECTED EXCEPTION TYPE HERE ---
    except requests.exceptions.HTTPError as e:
        print(f"\nCDS API HTTP Error fetching ERA5-Land data: {e}")
        print(f"Response URL: {e.response.url}")
        print(f"Response status code: {e.response.status_code}")
        print(f"Response content (first 500 chars): {e.response.text[:500]}...")
        print("Please check:")
        print("  1. Your .cdsapirc file's 'url' is set to 'https://cds.climate.copernicus.eu/api' (NO /v2 at the end).")
        print("  2. Your CDS account for any data download limits or issues.")
        print("  3. The bounding box order and format for the CDS API ([North, West, South, East]) is correct.")
        print("  4. The requested variables and dates are valid for the ERA5-Land dataset.")
        print("  5. You have agreed to the Terms of Use for the ERA5-Land dataset on the CDS website (this is a one-time manual step on their site).")
        return None
    except Exception as e:
        print(f"\nAn unexpected error occurred during ERA5-Land data fetching: {e}")
        print("Ensure you have 'cdsapi', 'xarray', 'netCDF4', and 'numpy' installed in your environment.")
        return None

# --- Test the function ---
# Fetch last 2 days of ERA5-Land data for Kerala
print("--- Starting ERA5-Land Data Acquisition ---")
era5_df = fetch_era5_land_data(KERALA_BOUNDING_BOX_CDS, date_range_days=2) 

if era5_df is not None:
    print("\nERA5-Land data acquisition and initial processing complete.")
    print(f"Shape of the processed ERA5-Land DataFrame: {era5_df.shape}")
else:
    print("\nERA5-Land data acquisition failed. Please review the error messages above.")

In [None]:
import cdsapi
import xarray as xr # For working with NetCDF data
import pandas as pd
import numpy as np # For numerical operations like sqrt and exp
from datetime import datetime, timedelta
import os # For file operations
import requests # Import requests to catch its specific HTTPError

# --- Configuration ---
# Bounding box for Kerala in CDS API format: [North, West, South, East] (max_lat, min_lon, min_lat, max_lon)
KERALA_BOUNDING_BOX_CDS = [12.5, 74.5, 8.0, 77.5]

def fetch_era5_land_data(bbox_cds, date_range_days=2):
    """
    Fetches ERA5-Land data for a given bounding box and date range.
    bbox_cds: [North, West, South, East]
    date_range_days: Number of days back from today to fetch data.
    """
    c = cdsapi.Client() # Initialize the CDS API client

    end_date = datetime.now()
    start_date = end_date - timedelta(days=date_range_days)

    # Generate lists of years, months, days for the request.
    # CDS API often requires explicit lists for each date component.
    years = list(range(start_date.year, end_date.year + 1))
    months = sorted(list(set([d.month for d in [start_date, end_date]])))
    days = sorted(list(set([d.day for d in [start_date, end_date]])))
    
    # Format month/day with leading zeros if single digit
    months_str = [str(m).zfill(2) for m in months]
    days_str = [str(d).zfill(2) for d in days]

    # Define variables to fetch based on AI/ML requirements
    variables = [
        '2m_temperature',          # Air temperature at 2 meters (in Kelvin)
        '2m_dewpoint_temperature', # Dewpoint temperature at 2 meters (in Kelvin) - needed for humidity
        'total_precipitation',     # Total precipitation (in meters, need to convert to mm)
        '10m_u_component_of_wind', # East-West component of wind at 10 meters
        '10m_v_component_of_wind', # North-South component of wind at 10 meters
    ]

    # Define times to fetch (hourly for detailed analysis)
    times = [
        '00:00', '01:00', '02:00', '03:00', '04:00', '05:00', '06:00', '07:00',
        '08:00', '09:00', '10:00', '11:00', '12:00', '13:00', '14:00', '15:00',
        '16:00', '17:00', '18:00', '19:00', '20:00', '21:00', '22:00', '23:00',
    ]

    # Temporary file to store the downloaded NetCDF data
    output_file = 'era5_land_data.nc'

    print(f"Requesting ERA5-Land data for dates: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    try:
        c.retrieve(
            'reanalysis-era5-land', # Dataset ID
            {
                'variable': variables,
                'year': [str(y) for y in years],
                'month': months_str,
                'day': days_str,
                'time': times,
                'area': bbox_cds, # [North, West, South, East]
                'format': 'netcdf', # NetCDF is the standard format for this data
            },
            output_file) # Save the downloaded data to this file
        print(f"ERA5-Land data downloaded to {output_file}")

        # Load the data using xarray. xarray is excellent for multi-dimensional scientific data.
        ds = xr.open_dataset(output_file)
        print("\nERA5-Land data loaded into xarray Dataset. Info:")
        print(ds) # Print dataset info (variables, dimensions, coordinates)

        # --- Basic Processing (Feature Engineering/Conversion) ---
        # 1. Convert temperature from Kelvin to Celsius
        if '2m_temperature' in ds.data_vars:
            ds['2m_temperature_c'] = ds['2m_temperature'] - 273.15
            print("2m_temperature converted to Celsius.")
        
        # 2. Convert dewpoint temperature from Kelvin to Celsius
        if '2m_dewpoint_temperature' in ds.data_vars:
            ds['2m_dewpoint_temperature_c'] = ds['2m_dewpoint_temperature'] - 273.15
            print("2m_dewpoint_temperature converted to Celsius.")

        # 3. Calculate Relative Humidity from Temperature and Dewpoint (using August-Roche-Magnus formula)
        # Formula: RH = 100 * (exp((17.625 * Td) / (243.04 + Td)) / exp((17.625 * T) / (243.04 + T)))
        # where T is temperature in Celsius, Td is dewpoint temperature in Celsius
        if '2m_temperature_c' in ds.data_vars and '2m_dewpoint_temperature_c' in ds.data_vars:
            e_s = 6.1094 * np.exp((17.625 * ds['2m_temperature_c']) / (243.04 + ds['2m_temperature_c']))
            e_a = 6.1094 * np.exp((17.625 * ds['2m_dewpoint_temperature_c']) / (243.04 + ds['2m_dewpoint_temperature_c']))
            ds['relative_humidity_percent'] = (e_a / e_s) * 100
            # Clip values to ensure they are between 0 and 100%
            ds['relative_humidity_percent'] = ds['relative_humidity_percent'].clip(0, 100) 
            print("Relative humidity calculated.")

        # 4. Calculate Wind Speed from u and v components
        if '10m_u_component_of_wind' in ds.data_vars and '10m_v_component_of_wind' in ds.data_vars:
            ds['wind_speed_ms'] = np.sqrt(ds['10m_u_component_of_wind']**2 + ds['10m_v_component_of_wind']**2)
            # Convert m/s to km/h (1 m/s = 3.6 km/h)
            ds['wind_speed_kmh'] = ds['wind_speed_ms'] * 3.6
            print("Wind speed (km/h) calculated.")
        
        # 5. Convert Total Precipitation from meters to millimeters
        if 'total_precipitation' in ds.data_vars:
            ds['total_precipitation_mm'] = ds['total_precipitation'] * 1000
            print("Total precipitation converted to mm.")

        # Convert xarray Dataset to Pandas DataFrame for easier feature engineering later
        # This flattens the spatial dimensions (latitude, longitude) into rows.
        df_era5 = ds.to_dataframe().reset_index()
        
        # Drop original Kelvin temps and u/v components if Celsius/derived values are preferred
        df_era5 = df_era5.drop(columns=[
            '2m_temperature', '2m_dewpoint_temperature', 
            '10m_u_component_of_wind', '10m_v_component_of_wind',
            'total_precipitation', 'wind_speed_ms' # Drop intermediate wind speed in m/s
        ], errors='ignore') # errors='ignore' prevents error if column doesn't exist

        print("\nERA5-Land data converted to Pandas DataFrame. First 5 rows:")
        print(df_era5.head())
        
        # Optional: Save to a temporary CSV for inspection
        df_era5.to_csv("era5_land_data_processed.csv", index=False)
        print("\nProcessed ERA5-Land data saved to era5_land_data_processed.csv")
        
        return df_era5

    # --- CORRECTED EXCEPTION TYPE HERE ---
    except requests.exceptions.HTTPError as e:
        print(f"\nCDS API HTTP Error fetching ERA5-Land data: {e}")
        print(f"Response URL: {e.response.url}")
        print(f"Response status code: {e.response.status_code}")
        print(f"Response content (first 500 chars): {e.response.text[:500]}...")
        print("Please check:")
        print("  1. Your .cdsapirc file's 'url' is set to 'https://cds.climate.copernicus.eu/api' (NO /v2 at the end).")
        print("  2. Your CDS account for any data download limits or issues.")
        print("  3. The bounding box order and format for the CDS API ([North, West, South, East]) is correct.")
        print("  4. The requested variables and dates are valid for the ERA5-Land dataset.")
        print("  5. You have agreed to the Terms of Use for the ERA5-Land dataset on the CDS website (this is a one-time manual step on their site).")
        return None
    except Exception as e:
        print(f"\nAn unexpected error occurred during ERA5-Land data fetching: {e}")
        print("Ensure you have 'cdsapi', 'xarray', 'netCDF4', and 'numpy' installed in your environment.")
        return None

# --- Test the function ---
# Fetch last 2 days of ERA5-Land data for Kerala
print("--- Starting ERA5-Land Data Acquisition ---")
era5_df = fetch_era5_land_data(KERALA_BOUNDING_BOX_CDS, date_range_days=2) 

if era5_df is not None:
    print("\nERA5-Land data acquisition and initial processing complete.")
    print(f"Shape of the processed ERA5-Land DataFrame: {era5_df.shape}")
else:
    print("\nERA5-Land data acquisition failed. Please review the error messages above.")

In [None]:
import cdsapi
import xarray as xr # For working with NetCDF data
import pandas as pd
import numpy as np # For numerical operations like sqrt and exp
from datetime import datetime, timedelta
import os # For file operations
import requests # Import requests to catch its specific HTTPError

# --- Configuration ---
# Bounding box for Kerala in CDS API format: [North, West, South, East] (max_lat, min_lon, min_lat, max_lon)
KERALA_BOUNDING_BOX_CDS = [12.5, 74.5, 8.0, 77.5]

def fetch_era5_land_data(bbox_cds, date_range_days=5): # Changed default to 5 days
    """
    Fetches ERA5-Land data for a given bounding box and date range.
    bbox_cds: [North, West, South, East]
    date_range_days: Number of days back from today to fetch data.
    """
    c = cdsapi.Client() # Initialize the CDS API client

    end_date = datetime.now()
    start_date = end_date - timedelta(days=date_range_days)

    # Generate lists of years, months, days for the request.
    # CDS API often requires explicit lists for each date component.
    years = list(range(start_date.year, end_date.year + 1))
    months = sorted(list(set([d.month for d in [start_date, end_date]])))
    days = sorted(list(set([d.day for d in [start_date, end_date]])))
    
    # Format month/day with leading zeros if single digit
    months_str = [str(m).zfill(2) for m in months]
    days_str = [str(d).zfill(2) for d in days]

    # Define variables to fetch based on AI/ML requirements
    variables = [
        '2m_temperature',          # Air temperature at 2 meters (in Kelvin)
        '2m_dewpoint_temperature', # Dewpoint temperature at 2 meters (in Kelvin) - needed for humidity
        'total_precipitation',     # Total precipitation (in meters, need to convert to mm)
        '10m_u_component_of_wind', # East-West component of wind at 10 meters
        '10m_v_component_of_wind', # North-South component of wind at 10 meters
    ]

    # Define times to fetch (hourly for detailed analysis)
    times = [
        '00:00', '01:00', '02:00', '03:00', '04:00', '05:00', '06:00', '07:00',
        '08:00', '09:00', '10:00', '11:00', '12:00', '13:00', '14:00', '15:00',
        '16:00', '17:00', '18:00', '19:00', '20:00', '21:00', '22:00', '23:00',
    ]

    # Temporary file to store the downloaded NetCDF data
    output_file = 'era5_land_data.nc'

    print(f"Requesting ERA5-Land data for dates: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    print(f"Note: CDS ERA5-Land data has a lag. Latest available is typically a few days ago.")
    try:
        c.retrieve(
            'reanalysis-era5-land', # Dataset ID
            {
                'variable': variables,
                'year': [str(y) for y in years],
                'month': months_str,
                'day': days_str,
                'time': times,
                'area': bbox_cds, # [North, West, South, East]
                'format': 'netcdf', # NetCDF is the standard format for this data
            },
            output_file) # Save the downloaded data to this file
        print(f"ERA5-Land data downloaded to {output_file}")

        # Load the data using xarray. xarray is excellent for multi-dimensional scientific data.
        ds = xr.open_dataset(output_file)
        print("\nERA5-Land data loaded into xarray Dataset. Info:")
        print(ds) # Print dataset info (variables, dimensions, coordinates)

        # --- Basic Processing (Feature Engineering/Conversion) ---
        # 1. Convert temperature from Kelvin to Celsius
        if '2m_temperature' in ds.data_vars:
            ds['2m_temperature_c'] = ds['2m_temperature'] - 273.15
            print("2m_temperature converted to Celsius.")
        
        # 2. Convert dewpoint temperature from Kelvin to Celsius
        if '2m_dewpoint_temperature' in ds.data_vars:
            ds['2m_dewpoint_temperature_c'] = ds['2m_dewpoint_temperature'] - 273.15
            print("2m_dewpoint_temperature converted to Celsius.")

        # 3. Calculate Relative Humidity from Temperature and Dewpoint (using August-Roche-Magnus formula)
        # Formula: RH = 100 * (exp((17.625 * Td) / (243.04 + Td)) / exp((17.625 * T) / (243.04 + T)))
        # where T is temperature in Celsius, Td is dewpoint temperature in Celsius
        if '2m_temperature_c' in ds.data_vars and '2m_dewpoint_temperature_c' in ds.data_vars:
            e_s = 6.1094 * np.exp((17.625 * ds['2m_temperature_c']) / (243.04 + ds['2m_temperature_c']))
            e_a = 6.1094 * np.exp((17.625 * ds['2m_dewpoint_temperature_c']) / (243.04 + ds['2m_dewpoint_temperature_c']))
            ds['relative_humidity_percent'] = (e_a / e_s) * 100
            # Clip values to ensure they are between 0 and 100%
            ds['relative_humidity_percent'] = ds['relative_humidity_percent'].clip(0, 100) 
            print("Relative humidity calculated.")

        # 4. Calculate Wind Speed from u and v components
        if '10m_u_component_of_wind' in ds.data_vars and '10m_v_component_of_wind' in ds.data_vars:
            ds['wind_speed_ms'] = np.sqrt(ds['10m_u_component_of_wind']**2 + ds['10m_v_component_of_wind']**2)
            # Convert m/s to km/h (1 m/s = 3.6 km/h)
            ds['wind_speed_kmh'] = ds['wind_speed_ms'] * 3.6
            print("Wind speed (km/h) calculated.")
        
        # 5. Convert Total Precipitation from meters to millimeters
        if 'total_precipitation' in ds.data_vars:
            ds['total_precipitation_mm'] = ds['total_precipitation'] * 1000
            print("Total precipitation converted to mm.")

        # Convert xarray Dataset to Pandas DataFrame for easier feature engineering later
        # This flattens the spatial dimensions (latitude, longitude) into rows.
        df_era5 = ds.to_dataframe().reset_index()
        
        # Drop original Kelvin temps and u/v components if Celsius/derived values are preferred
        df_era5 = df_era5.drop(columns=[
            '2m_temperature', '2m_dewpoint_temperature', 
            '10m_u_component_of_wind', '10m_v_component_of_wind',
            'total_precipitation', 'wind_speed_ms' # Drop intermediate wind speed in m/s
        ], errors='ignore') # errors='ignore' prevents error if column doesn't exist

        print("\nERA5-Land data converted to Pandas DataFrame. First 5 rows:")
        print(df_era5.head())
        
        # Optional: Save to a temporary CSV for inspection
        df_era5.to_csv("era5_land_data_processed.csv", index=False)
        print("\nProcessed ERA5-Land data saved to era5_land_data_processed.csv")
        
        return df_era5

    except requests.exceptions.HTTPError as e:
        print(f"\nCDS API HTTP Error fetching ERA5-Land data: {e}")
        print(f"Response URL: {e.response.url}")
        print(f"Response status code: {e.response.status_code}")
        print(f"Response content (first 500 chars): {e.response.text[:500]}...")
        print("Please check:")
        print("  1. Your .cdsapirc file's 'url' is set to 'https://cds.climate.copernicus.eu/api' (NO /v2 at the end).")
        print("  2. Your CDS account for any data download limits or issues.")
        print("  3. The bounding box order and format for the CDS API ([North, West, South, East]) is correct.")
        print("  4. The requested variables and dates are valid for the ERA5-Land dataset.")
        print("  5. You have agreed to the Terms of Use for the ERA5-Land dataset on the CDS website (this is a one-time manual step on their site).")
        return None
    except Exception as e:
        print(f"\nAn unexpected error occurred during ERA5-Land data fetching: {e}")
        print("Ensure you have 'cdsapi', 'xarray', 'netCDF4', and 'numpy' installed in your environment.")
        return None

# --- Test the function ---
# Fetch last 5 days of ERA5-Land data for Kerala (adjusting for data lag)
print("--- Starting ERA5-Land Data Acquisition ---")
era5_df = fetch_era5_land_data(KERALA_BOUNDING_BOX_CDS, date_range_days=5) 

if era5_df is not None:
    print("\nERA5-Land data acquisition and initial processing complete.")
    print(f"Shape of the processed ERA5-Land DataFrame: {era5_df.shape}")
else:
    print("\nERA5-Land data acquisition failed. Please review the error messages above.")

In [None]:
import cdsapi
import xarray as xr # For working with NetCDF data
import pandas as pd
import numpy as np # For numerical operations like sqrt and exp
from datetime import datetime, timedelta
import os # For file operations
import requests # Import requests to catch its specific HTTPError
import netCDF4 # Explicitly importing netCDF4 to ensure it's used as the backend

# --- Configuration ---
# Bounding box for Kerala in CDS API format: [North, West, South, East] (max_lat, min_lon, min_lat, max_lon)
KERALA_BOUNDING_BOX_CDS = [12.5, 74.5, 8.0, 77.5]

def fetch_era5_land_data(bbox_cds, date_range_days=5):
    """
    Fetches ERA5-Land data for a given bounding box and date range.
    bbox_cds: [North, West, South, East]
    date_range_days: Number of days back from today to fetch data.
    """
    c = cdsapi.Client() # Initialize the CDS API client

    # Get today's date and calculate the start date for the request
    end_date = datetime.now()
    start_date = end_date - timedelta(days=date_range_days)

    # Generate lists of years, months, days for the request.
    # CDS API requires explicit lists for each date component.
    years = list(range(start_date.year, end_date.year + 1))
    months = sorted(list(set([d.month for d in [start_date, end_date]])))
    days = sorted(list(set([d.day for d in [start_date, end_date]])))
    
    # Format month/day with leading zeros if single digit
    months_str = [str(m).zfill(2) for m in months]
    days_str = [str(d).zfill(2) for d in days]

    # Define variables to fetch based on AI/ML requirements
    variables = [
        '2m_temperature',          # Air temperature at 2 meters (in Kelvin)
        '2m_dewpoint_temperature', # Dewpoint temperature at 2 meters (in Kelvin) - needed for humidity
        'total_precipitation',     # Total precipitation (in meters, need to convert to mm)
        '10m_u_component_of_wind', # East-West component of wind at 10 meters
        '10m_v_component_of_wind', # North-South component of wind at 10 meters
    ]

    # Define times to fetch (hourly for detailed analysis)
    times = [
        '00:00', '01:00', '02:00', '03:00', '04:00', '05:00', '06:00', '07:00',
        '08:00', '09:00', '10:00', '11:00', '12:00', '13:00', '14:00', '15:00',
        '16:00', '17:00', '18:00', '19:00', '20:00', '21:00', '22:00', '23:00',
    ]

    # Temporary file to store the downloaded NetCDF data
    output_file = 'era5_land_data.nc'

    print(f"Requesting ERA5-Land data for dates: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    print(f"Note: CDS ERA5-Land data has a lag. Latest available is typically a few days ago.")
    try:
        c.retrieve(
            'reanalysis-era5-land', # Dataset ID
            {
                'variable': variables,
                'year': [str(y) for y in years],
                'month': months_str,
                'day': days_str,
                'time': times,
                'area': bbox_cds, # [North, West, South, East]
                'format': 'netcdf', # NetCDF is the standard format for this data
            },
            output_file) # Save the downloaded data to this file
        print(f"ERA5-Land data downloaded to {output_file}")

        # Load the data using xarray. We explicitly tell it to use the netCDF4 engine.
        ds = xr.open_dataset(output_file, engine='netcdf4')
        print("\nERA5-Land data loaded into xarray Dataset. Info:")
        print(ds) # Print dataset info (variables, dimensions, coordinates)

        # --- Basic Processing (Feature Engineering/Conversion) ---
        # 1. Convert temperature from Kelvin to Celsius
        if '2m_temperature' in ds.data_vars:
            ds['2m_temperature_c'] = ds['2m_temperature'] - 273.15
            print("2m_temperature converted to Celsius.")
        
        # 2. Convert dewpoint temperature from Kelvin to Celsius
        if '2m_dewpoint_temperature' in ds.data_vars:
            ds['2m_dewpoint_temperature_c'] = ds['2m_dewpoint_temperature'] - 273.15
            print("2m_dewpoint_temperature converted to Celsius.")

        # 3. Calculate Relative Humidity from Temperature and Dewpoint (using August-Roche-Magnus formula)
        # Formula: RH = 100 * (exp((17.625 * Td) / (243.04 + Td)) / exp((17.625 * T) / (243.04 + T)))
        # where T is temperature in Celsius, Td is dewpoint temperature in Celsius
        if '2m_temperature_c' in ds.data_vars and '2m_dewpoint_temperature_c' in ds.data_vars:
            e_s = 6.1094 * np.exp((17.625 * ds['2m_temperature_c']) / (243.04 + ds['2m_temperature_c']))
            e_a = 6.1094 * np.exp((17.625 * ds['2m_dewpoint_temperature_c']) / (243.04 + ds['2m_dewpoint_temperature_c']))
            ds['relative_humidity_percent'] = (e_a / e_s) * 100
            # Clip values to ensure they are between 0 and 100%
            ds['relative_humidity_percent'] = ds['relative_humidity_percent'].clip(0, 100) 
            print("Relative humidity calculated.")

        # 4. Calculate Wind Speed from u and v components
        if '10m_u_component_of_wind' in ds.data_vars and '10m_v_component_of_wind' in ds.data_vars:
            ds['wind_speed_ms'] = np.sqrt(ds['10m_u_component_of_wind']**2 + ds['10m_v_component_of_wind']**2)
            # Convert m/s to km/h (1 m/s = 3.6 km/h)
            ds['wind_speed_kmh'] = ds['wind_speed_ms'] * 3.6
            print("Wind speed (km/h) calculated.")
        
        # 5. Convert Total Precipitation from meters to millimeters
        if 'total_precipitation' in ds.data_vars:
            ds['total_precipitation_mm'] = ds['total_precipitation'] * 1000
            print("Total precipitation converted to mm.")

        # Convert xarray Dataset to Pandas DataFrame for easier feature engineering later
        # This flattens the spatial dimensions (latitude, longitude) into rows.
        df_era5 = ds.to_dataframe().reset_index()
        
        # Drop original Kelvin temps and u/v components if Celsius/derived values are preferred
        df_era5 = df_era5.drop(columns=[
            '2m_temperature', '2m_dewpoint_temperature', 
            '10m_u_component_of_wind', '10m_v_component_of_wind',
            'total_precipitation', 'wind_speed_ms' # Drop intermediate wind speed in m/s
        ], errors='ignore') # errors='ignore' prevents error if column doesn't exist

        print("\nERA5-Land data converted to Pandas DataFrame. First 5 rows:")
        print(df_era5.head())
        
        # Optional: Save to a temporary CSV for inspection
        df_era5.to_csv("era5_land_data_processed.csv", index=False)
        print("\nProcessed ERA5-Land data saved to era5_land_data_processed.csv")
        
        return df_era5

    except requests.exceptions.HTTPError as e:
        print(f"\nCDS API HTTP Error fetching ERA5-Land data: {e}")
        print(f"Response URL: {e.response.url}")
        print(f"Response status code: {e.response.status_code}")
        print(f"Response content (first 500 chars): {e.response.text[:500]}...")
        print("Please check:")
        print("  1. Your .cdsapirc file's 'url' is set to 'https://cds.climate.copernicus.eu/api' (NO /v2 at the end).")
        print("  2. Your CDS account for any data download limits or issues.")
        print("  3. The bounding box order and format for the CDS API ([North, West, South, East]) is correct.")
        print("  4. The requested variables and dates are valid for the ERA5-Land dataset.")
        print("  5. You have agreed to the Terms of Use for the ERA5-Land dataset on the CDS website (this is a one-time manual step on their site).")
        return None
    except Exception as e:
        print(f"\nAn unexpected error occurred during ERA5-Land data fetching: {e}")
        print("Ensure you have 'cdsapi', 'xarray', 'netCDF4', and 'numpy' installed in your environment.")
        return None

# --- Test the function ---
# Fetch last 5 days of ERA5-Land data for Kerala (adjusting for data lag)
print("--- Starting ERA5-Land Data Acquisition ---")
era5_df = fetch_era5_land_data(KERALA_BOUNDING_BOX_CDS, date_range_days=5) 

if era5_df is not None:
    print("\nERA5-Land data acquisition and initial processing complete.")
    print(f"Shape of the processed ERA5-Land DataFrame: {era5_df.shape}")
else:
    print("\nERA5-Land data acquisition failed. Please review the error messages above.")


In [None]:
import cdsapi
import xarray as xr
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
import requests
import netCDF4 # Ensure this is imported for xarray backend
import zipfile # NEW: Import zipfile for unzipping

# --- Configuration ---
# Bounding box for Kerala in CDS API format: [North, West, South, East] (max_lat, min_lon, min_lat, max_lon)
KERALA_BOUNDING_BOX_CDS = [12.5, 74.5, 8.0, 77.5]

def fetch_era5_land_data(bbox_cds, date_range_days=5): 
    """
    Fetches ERA5-Land data for a given bounding box and date range.
    bbox_cds: [North, West, South, East]
    date_range_days: Number of days back from today to fetch data.
    """
    c = cdsapi.Client()

    end_date = datetime.now()
    start_date = end_date - timedelta(days=date_range_days)

    years = list(range(start_date.year, end_date.year + 1))
    months = sorted(list(set([d.month for d in [start_date, end_date]])))
    days = sorted(list(set([d.day for d in [start_date, end_date]])))
    
    months_str = [str(m).zfill(2) for m in months]
    days_str = [str(d).zfill(2) for d in days]

    variables = [
        '2m_temperature',          
        '2m_dewpoint_temperature', 
        'total_precipitation',     
        '10m_u_component_of_wind', 
        '10m_v_component_of_wind', 
    ]

    times = [
        '00:00', '01:00', '02:00', '03:00', '04:00', '05:00', '06:00', '07:00',
        '08:00', '09:00', '10:00', '11:00', '12:00', '13:00', '14:00', '15:00',
        '16:00', '17:00', '18:00', '19:00', '20:00', '21:00', '22:00', '23:00',
    ]

    # Original output file name (which is actually a zip)
    zip_output_file = 'era5_land_data.zip'
    # Name of the actual NetCDF file inside the zip (common name from CDS)
    netcdf_file_inside_zip = 'data_0.nc' # Based on your provided PK content

    print(f"Requesting ERA5-Land data for dates: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    print(f"Note: CDS ERA5-Land data has a lag. Latest available is typically a few days ago.")

    try:
        # --- NEW: Delete existing zip file before downloading ---
        if os.path.exists(zip_output_file):
            os.remove(zip_output_file)
            print(f"Removed existing {zip_output_file} to ensure fresh download.")
        
        # Also remove the extracted .nc file if it exists from a previous run
        if os.path.exists(netcdf_file_inside_zip):
            os.remove(netcdf_file_inside_zip)
            print(f"Removed existing extracted {netcdf_file_inside_zip}.")


        c.retrieve(
            'reanalysis-era5-land', 
            {
                'variable': variables,
                'year': [str(y) for y in years],
                'month': months_str,
                'day': days_str,
                'time': times,
                'area': bbox_cds, 
                'format': 'netcdf', # CDS returns a zip with .nc inside, even if we ask for netcdf
            },
            zip_output_file) # Save as .zip
        print(f"ERA5-Land data downloaded to {zip_output_file}")

        # --- NEW: Basic file sanity check for the ZIP file ---
        if not os.path.exists(zip_output_file):
            raise FileNotFoundError(f"Downloaded file {zip_output_file} was not found.")
        
        file_size_bytes = os.path.getsize(zip_output_file)
        print(f"Downloaded file size: {file_size_bytes / (1024*1024):.2f} MB")

        # Heuristic check for common HTML error page size (very small, typically < 10KB)
        if file_size_bytes < 10 * 1024: # Less than 10 KB
            with open(zip_output_file, 'r', encoding='utf-8', errors='ignore') as f:
                content_start = f.read(500) # Read first 500 chars
                if "<!DOCTYPE html>" in content_start.lower() or "<html" in content_start.lower():
                    raise ValueError(f"Downloaded file '{zip_output_file}' appears to be an HTML error page, not a valid ZIP. Size: {file_size_bytes} bytes. Content starts with: '{content_start[:100]}...'")

        # --- NEW: Unzip the downloaded file ---
        print(f"Unzipping {zip_output_file}...")
        with zipfile.ZipFile(zip_output_file, 'r') as zip_ref:
            # Extract only the specific NetCDF file we expect
            if netcdf_file_inside_zip in zip_ref.namelist():
                zip_ref.extract(netcdf_file_inside_zip)
                print(f"Extracted {netcdf_file_inside_zip} from the zip file.")
            else:
                raise FileNotFoundError(f"Expected file '{netcdf_file_inside_zip}' not found inside the zip. Contents: {zip_ref.namelist()}")

        # Load the data using xarray from the extracted .nc file
        ds = xr.open_dataset(netcdf_file_inside_zip, engine='netcdf4')
        print("\nERA5-Land data loaded into xarray Dataset. Info:")
        print(ds)

        # --- Basic Processing (Feature Engineering/Conversion) ---
        if '2m_temperature' in ds.data_vars:
            ds['2m_temperature_c'] = ds['2m_temperature'] - 273.15
            print("2m_temperature converted to Celsius.")
        
        if '2m_dewpoint_temperature' in ds.data_vars:
            ds['2m_dewpoint_temperature_c'] = ds['2m_dewpoint_temperature'] - 273.15
            print("2m_dewpoint_temperature converted to Celsius.")

        if '2m_temperature_c' in ds.data_vars and '2m_dewpoint_temperature_c' in ds.data_vars:
            e_s = 6.1094 * np.exp((17.625 * ds['2m_temperature_c']) / (243.04 + ds['2m_temperature_c']))
            e_a = 6.1094 * np.exp((17.625 * ds['2m_dewpoint_temperature_c']) / (243.04 + ds['2m_dewpoint_temperature_c']))
            ds['relative_humidity_percent'] = (e_a / e_s) * 100
            ds['relative_humidity_percent'] = ds['relative_humidity_percent'].clip(0, 100) 
            print("Relative humidity calculated.")

        if '10m_u_component_of_wind' in ds.data_vars and '10m_v_component_of_wind' in ds.data_vars:
            ds['wind_speed_ms'] = np.sqrt(ds['10m_u_component_of_wind']**2 + ds['10m_v_component_of_wind']**2)
            ds['wind_speed_kmh'] = ds['wind_speed_ms'] * 3.6
            print("Wind speed (km/h) calculated.")
        
        if 'total_precipitation' in ds.data_vars:
            ds['total_precipitation_mm'] = ds['total_precipitation'] * 1000
            print("Total precipitation converted to mm.")

        df_era5 = ds.to_dataframe().reset_index()
        
        df_era5 = df_era5.drop(columns=[
            '2m_temperature', '2m_dewpoint_temperature', 
            '10m_u_component_of_wind', '10m_v_component_of_wind',
            'total_precipitation', 'wind_speed_ms'
        ], errors='ignore')

        print("\nERA5-Land data converted to Pandas DataFrame. First 5 rows:")
        print(df_era5.head())
        
        df_era5.to_csv("era5_land_data_processed.csv", index=False)
        print("\nProcessed ERA5-Land data saved to era5_land_data_processed.csv")
        
        return df_era5

    except requests.exceptions.HTTPError as e:
        print(f"\nCDS API HTTP Error fetching ERA5-Land data: {e}")
        print(f"Response URL: {e.response.url}")
        print(f"Response status code: {e.response.status_code}")
        print(f"Response content (first 500 chars): {e.response.text[:500]}...")
        print("Please check all previously mentioned points for CDS API setup.")
        return None
    except FileNotFoundError as e:
        print(f"\nFile System Error: {e}")
        print("The downloaded file was not found where expected, or the expected NetCDF file was not found inside the zip. Check paths or permissions.")
        return None
    except ValueError as e: # Catches our custom HTML error
        print(f"\nFile Content Error: {e}")
        print("The downloaded file appears to be corrupted or an error page. Retrying might help, or there's an intermittent issue on the CDS side.")
        return None
    except Exception as e:
        print(f"\nAn unexpected error occurred during ERA5-Land data fetching: {e}")
        print("Ensure you have 'cdsapi', 'xarray', 'netCDF4', 'numpy', and 'zipfile' (built-in) installed in your environment, and try restarting the kernel.")
        return None

# --- Test the function ---
print("--- Starting ERA5-Land Data Acquisition ---")
era5_df = fetch_era5_land_data(KERALA_BOUNDING_BOX_CDS, date_range_days=5) 

if era5_df is not None:
    print("\nERA5-Land data acquisition and initial processing complete.")
    print(f"Shape of the processed ERA5-Land DataFrame: {era5_df.shape}")
else:
    print("\nERA5-Land data acquisition failed. Please review the error messages above.")


In [None]:
import cdsapi
import xarray as xr
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
import requests
import netCDF4 # Ensure this is imported for xarray backend
import zipfile # NEW: Import zipfile for unzipping

# --- Configuration ---
# Bounding box for Kerala in CDS API format: [North, West, South, East] (max_lat, min_lon, min_lat, max_lon)
KERALA_BOUNDING_BOX_CDS = [12.5, 74.5, 8.0, 77.5]

def fetch_era5_land_data(bbox_cds, date_range_days=5): 
    """
    Fetches ERA5-Land data for a given bounding box and date range.
    bbox_cds: [North, West, South, East]
    date_range_days: Number of days back from today to fetch data.
    """
    c = cdsapi.Client()

    end_date = datetime.now()
    start_date = end_date - timedelta(days=date_range_days)

    years = list(range(start_date.year, end_date.year + 1))
    months = sorted(list(set([d.month for d in [start_date, end_date]])))
    days = sorted(list(set([d.day for d in [start_date, end_date]])))
    
    months_str = [str(m).zfill(2) for m in months]
    days_str = [str(d).zfill(2) for d in days]

    variables = [
        '2m_temperature',          
        '2m_dewpoint_temperature', 
        'total_precipitation',     
        '10m_u_component_of_wind', 
        '10m_v_component_of_wind', 
    ]

    times = [
        '00:00', '01:00', '02:00', '03:00', '04:00', '05:00', '06:00', '07:00',
        '08:00', '09:00', '10:00', '11:00', '12:00', '13:00', '14:00', '15:00',
        '16:00', '17:00', '18:00', '19:00', '20:00', '21:00', '22:00', '23:00',
    ]

    # Original output file name (which is actually a zip)
    zip_output_file = 'era5_land_data.zip'
    # Name of the actual NetCDF file inside the zip (common name from CDS)
    netcdf_file_inside_zip = 'data_0.nc' # Based on your provided PK content

    print(f"Requesting ERA5-Land data for dates: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    print(f"Note: CDS ERA5-Land data has a lag. Latest available is typically a few days ago.")

    try:
        # --- NEW: Delete existing zip file before downloading ---
        if os.path.exists(zip_output_file):
            os.remove(zip_output_file)
            print(f"Removed existing {zip_output_file} to ensure fresh download.")
        
        # Also remove the extracted .nc file if it exists from a previous run
        if os.path.exists(netcdf_file_inside_zip):
            os.remove(netcdf_file_inside_zip)
            print(f"Removed existing extracted {netcdf_file_inside_zip}.")


        c.retrieve(
            'reanalysis-era5-land', 
            {
                'variable': variables,
                'year': [str(y) for y in years],
                'month': months_str,
                'day': days_str,
                'time': times,
                'area': bbox_cds, 
                'format': 'netcdf', # CDS returns a zip with .nc inside, even if we ask for netcdf
            },
            zip_output_file) # Save as .zip
        print(f"ERA5-Land data downloaded to {zip_output_file}")

        # --- NEW: Basic file sanity check for the ZIP file ---
        if not os.path.exists(zip_output_file):
            raise FileNotFoundError(f"Downloaded file {zip_output_file} was not found.")
        
        file_size_bytes = os.path.getsize(zip_output_file)
        print(f"Downloaded file size: {file_size_bytes / (1024*1024):.2f} MB")

        # Heuristic check for common HTML error page size (very small, typically < 10KB)
        if file_size_bytes < 10 * 1024: # Less than 10 KB
            with open(zip_output_file, 'r', encoding='utf-8', errors='ignore') as f:
                content_start = f.read(500) # Read first 500 chars
                if "<!DOCTYPE html>" in content_start.lower() or "<html" in content_start.lower():
                    raise ValueError(f"Downloaded file '{zip_output_file}' appears to be an HTML error page, not a valid ZIP. Size: {file_size_bytes} bytes. Content starts with: '{content_start[:100]}...'")

        # --- NEW: Unzip the downloaded file ---
        print(f"Unzipping {zip_output_file}...")
        with zipfile.ZipFile(zip_output_file, 'r') as zip_ref:
            # Extract only the specific NetCDF file we expect
            if netcdf_file_inside_zip in zip_ref.namelist():
                zip_ref.extract(netcdf_file_inside_zip)
                print(f"Extracted {netcdf_file_inside_zip} from the zip file.")
            else:
                raise FileNotFoundError(f"Expected file '{netcdf_file_inside_zip}' not found inside the zip. Contents: {zip_ref.namelist()}")

        # Load the data using xarray from the extracted .nc file
        ds = xr.open_dataset(netcdf_file_inside_zip, engine='netcdf4')
        print("\nERA5-Land data loaded into xarray Dataset. Info:")
        print(ds)

        # --- NEW: Drop 'number' and 'expver' coordinates before converting to DataFrame ---
        # These are often singleton dimensions that can cause NaNs when flattening to a DataFrame
        # if not handled, as they create a Cartesian product with other dimensions.
        ds = ds.drop_vars(['number', 'expver'], errors='ignore')
        print("Dropped 'number' and 'expver' coordinates from xarray Dataset.")

        # --- Basic Processing (Feature Engineering/Conversion) ---
        # Renaming variables for clarity and consistency with AI/ML requirements
        # t2m (2m temperature), d2m (2m dewpoint temperature), tp (total precipitation)
        # u10 (10m u-component of wind), v10 (10m v-component of wind)
        ds = ds.rename_vars({
            't2m': '2m_temperature',
            'd2m': '2m_dewpoint_temperature',
            'tp': 'total_precipitation',
            'u10': '10m_u_component_of_wind',
            'v10': '10m_v_component_of_wind'
        })
        print("Renamed data variables for clarity.")

        if '2m_temperature' in ds.data_vars:
            ds['2m_temperature_c'] = ds['2m_temperature'] - 273.15
            print("2m_temperature converted to Celsius.")
        
        if '2m_dewpoint_temperature' in ds.data_vars:
            ds['2m_dewpoint_temperature_c'] = ds['2m_dewpoint_temperature'] - 273.15
            print("2m_dewpoint_temperature converted to Celsius.")

        if '2m_temperature_c' in ds.data_vars and '2m_dewpoint_temperature_c' in ds.data_vars:
            e_s = 6.1094 * np.exp((17.625 * ds['2m_temperature_c']) / (243.04 + ds['2m_temperature_c']))
            e_a = 6.1094 * np.exp((17.625 * ds['2m_dewpoint_temperature_c']) / (243.04 + ds['2m_dewpoint_temperature_c']))
            ds['relative_humidity_percent'] = (e_a / e_s) * 100
            ds['relative_humidity_percent'] = ds['relative_humidity_percent'].clip(0, 100) 
            print("Relative humidity calculated.")

        if '10m_u_component_of_wind' in ds.data_vars and '10m_v_component_of_wind' in ds.data_vars:
            ds['wind_speed_ms'] = np.sqrt(ds['10m_u_component_of_wind']**2 + ds['10m_v_component_of_wind']**2)
            ds['wind_speed_kmh'] = ds['wind_speed_ms'] * 3.6
            print("Wind speed (km/h) calculated.")
        
        if 'total_precipitation' in ds.data_vars:
            ds['total_precipitation_mm'] = ds['total_precipitation'] * 1000
            print("Total precipitation converted to mm.")

        # Convert xarray Dataset to Pandas DataFrame for easier feature engineering later
        # This flattens the spatial dimensions (latitude, longitude) and time into rows.
        df_era5 = ds.to_dataframe().reset_index()
        
        # Drop original Kelvin temps and u/v components if Celsius/derived values are preferred
        df_era5 = df_era5.drop(columns=[
            '2m_temperature', '2m_dewpoint_temperature', 
            '10m_u_component_of_wind', '10m_v_component_of_wind',
            'total_precipitation', 'wind_speed_ms'
        ], errors='ignore')

        print("\nERA5-Land data converted to Pandas DataFrame. First 5 rows:")
        print(df_era5.head())
        
        df_era5.to_csv("era5_land_data_processed.csv", index=False)
        print("\nProcessed ERA5-Land data saved to era5_land_data_processed.csv")
        
        return df_era5

    except requests.exceptions.HTTPError as e:
        print(f"\nCDS API HTTP Error fetching ERA5-Land data: {e}")
        print(f"Response URL: {e.response.url}")
        print(f"Response status code: {e.response.status_code}")
        print(f"Response content (first 500 chars): {e.response.text[:500]}...")
        print("Please check all previously mentioned points for CDS API setup.")
        return None
    except FileNotFoundError as e:
        print(f"\nFile System Error: {e}")
        print("The downloaded file was not found where expected, or the expected NetCDF file was not found inside the zip. Check paths or permissions.")
        return None
    except ValueError as e: # Catches our custom HTML error
        print(f"\nFile Content Error: {e}")
        print("The downloaded file appears to be corrupted or an error page. Retrying might help, or there's an intermittent issue on the CDS side.")
        return None
    except Exception as e:
        print(f"\nAn unexpected error occurred during ERA5-Land data fetching: {e}")
        print("Ensure you have 'cdsapi', 'xarray', 'netCDF4', 'numpy', and 'zipfile' (built-in) installed in your environment, and try restarting the kernel.")
        return None

# --- Test the function ---
print("--- Starting ERA5-Land Data Acquisition ---")
era5_df = fetch_era5_land_data(KERALA_BOUNDING_BOX_CDS, date_range_days=5) 

if era5_df is not None:
    print("\nERA5-Land data acquisition and initial processing complete.")
    print(f"Shape of the processed ERA5-Land DataFrame: {era5_df.shape}")
else:
    print("\nERA5-Land data acquisition failed. Please review the error messages above.")


In [None]:
import cdsapi
import xarray as xr # For working with NetCDF data
import pandas as pd
import numpy as np # For numerical operations like sqrt and exp
from datetime import datetime, timedelta
import os # For file operations
import requests # Import requests to catch its specific HTTPError
import netCDF4 # Explicitly importing netCDF4 to ensure it's used as the backend
import zipfile # Import zipfile for unzipping

# --- Configuration ---
# Bounding box for Kerala in CDS API format: [North, West, South, East] (max_lat, min_lon, min_lat, max_lon)
KERALA_BOUNDING_BOX_CDS = [12.5, 74.5, 8.0, 77.5]

def fetch_era5_land_data(bbox_cds, date_range_days=5): 
    """
    Fetches ERA5-Land data for a given bounding box and date range.
    bbox_cds: [North, West, South, East]
    date_range_days: Number of days back from today to fetch data.
    """
    c = cdsapi.Client()

    # Get today's date and calculate the start date for the request
    end_date = datetime.now()
    start_date = end_date - timedelta(days=date_range_days)

    # Generate lists of years, months, days for the request.
    # CDS API requires explicit lists for each date component.
    years = list(range(start_date.year, end_date.year + 1))
    months = sorted(list(set([d.month for d in [start_date, end_date]])))
    days = sorted(list(set([d.day for d in [start_date, end_date]])))
    
    # Format month/day with leading zeros if single digit
    months_str = [str(m).zfill(2) for m in months]
    days_str = [str(d).zfill(2) for d in days]

    # Define variables to fetch based on AI/ML requirements
    variables = [
        '2m_temperature',          
        '2m_dewpoint_temperature', 
        'total_precipitation',     
        '10m_u_component_of_wind', 
        '10m_v_component_of_wind', 
    ]

    # Define times to fetch (hourly for detailed analysis)
    times = [
        '00:00', '01:00', '02:00', '03:00', '04:00', '05:00', '06:00', '07:00',
        '08:00', '09:00', '10:00', '11:00', '12:00', '13:00', '14:00', '15:00',
        '16:00', '17:00', '18:00', '19:00', '20:00', '21:00', '22:00', '23:00',
    ]

    # Original output file name (which is actually a zip)
    zip_output_file = 'era5_land_data.zip'
    # Name of the actual NetCDF file inside the zip (common name from CDS)
    netcdf_file_inside_zip = 'data_0.nc' # Based on your provided PK content

    print(f"Requesting ERA5-Land data for dates: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    print(f"Note: CDS ERA5-Land data has a lag. Latest available is typically a few days ago.")

    try:
        # Delete existing files before downloading to ensure a fresh start
        if os.path.exists(zip_output_file):
            os.remove(zip_output_file)
            print(f"Removed existing {zip_output_file} to ensure fresh download.")
        
        if os.path.exists(netcdf_file_inside_zip):
            os.remove(netcdf_file_inside_zip)
            print(f"Removed existing extracted {netcdf_file_inside_zip}.")

        # Retrieve data from CDS API
        c.retrieve(
            'reanalysis-era5-land', 
            {
                'variable': variables,
                'year': [str(y) for y in years],
                'month': months_str,
                'day': days_str,
                'time': times,
                'area': bbox_cds, 
                'format': 'netcdf', # CDS returns a zip with .nc inside, even if we ask for netcdf
            },
            zip_output_file) # Save as .zip
        print(f"ERA5-Land data downloaded to {zip_output_file}")

        # Basic file sanity check for the ZIP file
        if not os.path.exists(zip_output_file):
            raise FileNotFoundError(f"Downloaded file {zip_output_file} was not found.")
        
        file_size_bytes = os.path.getsize(zip_output_file)
        print(f"Downloaded file size: {file_size_bytes / (1024*1024):.2f} MB")

        # Heuristic check for common HTML error page size (very small, typically < 10KB)
        if file_size_bytes < 10 * 1024: 
            with open(zip_output_file, 'r', encoding='utf-8', errors='ignore') as f:
                content_start = f.read(500)
                if "<!DOCTYPE html>" in content_start.lower() or "<html" in content_start.lower():
                    raise ValueError(f"Downloaded file '{zip_output_file}' appears to be an HTML error page, not a valid ZIP. Size: {file_size_bytes} bytes. Content starts with: '{content_start[:100]}...'")

        # Unzip the downloaded file
        print(f"Unzipping {zip_output_file}...")
        with zipfile.ZipFile(zip_output_file, 'r') as zip_ref:
            # Extract only the specific NetCDF file we expect
            if netcdf_file_inside_zip in zip_ref.namelist():
                zip_ref.extract(netcdf_file_inside_zip)
                print(f"Extracted {netcdf_file_inside_zip} from the zip file.")
            else:
                raise FileNotFoundError(f"Expected file '{netcdf_file_inside_zip}' not found inside the zip. Contents: {zip_ref.namelist()}")

        # Load the data using xarray from the extracted .nc file
        ds = xr.open_dataset(netcdf_file_inside_zip, engine='netcdf4')
        print("\nERA5-Land data loaded into xarray Dataset. Info:")
        print(ds)

        # --- IMPORTANT: Drop 'number' and 'expver' coordinates before renaming/processing ---
        # These are often singleton dimensions/coordinates that can cause NaNs when flattening to a DataFrame
        # if not explicitly dropped or handled.
        ds = ds.drop_vars(['number', 'expver'], errors='ignore')
        print("Dropped 'number' and 'expver' coordinates from xarray Dataset.")
        
        # Renaming variables for clarity and consistency with AI/ML requirements
        # t2m (2m temperature), d2m (2m dewpoint temperature), tp (total precipitation)
        # u10 (10m u-component of wind), v10 (10m v-component of wind)
        ds = ds.rename_vars({
            't2m': '2m_temperature',
            'd2m': '2m_dewpoint_temperature',
            'tp': 'total_precipitation',
            'u10': '10m_u_component_of_wind',
            'v10': '10m_v_component_of_wind'
        })
        print("Renamed data variables for clarity.")

        # --- Basic Processing (Feature Engineering/Conversion) ---
        if '2m_temperature' in ds.data_vars:
            ds['2m_temperature_c'] = ds['2m_temperature'] - 273.15
            print("2m_temperature converted to Celsius.")
        
        if '2m_dewpoint_temperature' in ds.data_vars:
            ds['2m_dewpoint_temperature_c'] = ds['2m_dewpoint_temperature'] - 273.15
            print("2m_dewpoint_temperature converted to Celsius.")

        if '2m_temperature_c' in ds.data_vars and '2m_dewpoint_temperature_c' in ds.data_vars:
            e_s = 6.1094 * np.exp((17.625 * ds['2m_temperature_c']) / (243.04 + ds['2m_temperature_c']))
            e_a = 6.1094 * np.exp((17.625 * ds['2m_dewpoint_temperature_c']) / (243.04 + ds['2m_dewpoint_temperature_c']))
            ds['relative_humidity_percent'] = (e_a / e_s) * 100
            ds['relative_humidity_percent'] = ds['relative_humidity_percent'].clip(0, 100) 
            print("Relative humidity calculated.")

        if '10m_u_component_of_wind' in ds.data_vars and '10m_v_component_of_wind' in ds.data_vars:
            ds['wind_speed_ms'] = np.sqrt(ds['10m_u_component_of_wind']**2 + ds['10m_v_component_of_wind']**2)
            ds['wind_speed_kmh'] = ds['wind_speed_ms'] * 3.6
            print("Wind speed (km/h) calculated.")
        
        if 'total_precipitation' in ds.data_vars:
            ds['total_precipitation_mm'] = ds['total_precipitation'] * 1000
            print("Total precipitation converted to mm.")

        # --- NEW: Explicitly select desired data variables before converting to Pandas DataFrame ---
        # This avoids issues with extraneous coordinates causing NaNs during flattening.
        df_era5 = ds[['2m_temperature_c', '2m_dewpoint_temperature_c', 
                      'relative_humidity_percent', 'wind_speed_kmh', 
                      'total_precipitation_mm', 'latitude', 'longitude', 'valid_time']].to_dataframe().reset_index()
        
        # Drop original Kelvin temps and u/v components if Celsius/derived values are preferred
        # Note: The original 't2m', 'd2m', 'u10', 'v10', 'tp' are no longer in df_era5 after explicit selection above.
        # We only need to drop 'wind_speed_ms' which was an intermediate calculation.
        df_era5 = df_era5.drop(columns=['wind_speed_ms'], errors='ignore')

        print("\nERA5-Land data converted to Pandas DataFrame. First 5 rows:")
        print(df_era5.head())
        
        df_era5.to_csv("era5_land_data_processed.csv", index=False)
        print("\nProcessed ERA5-Land data saved to era5_land_data_processed.csv")
        
        return df_era5

    except requests.exceptions.HTTPError as e:
        print(f"\nCDS API HTTP Error fetching ERA5-Land data: {e}")
        print(f"Response URL: {e.response.url}")
        print(f"Response status code: {e.response.status_code}")
        print(f"Response content (first 500 chars): {e.response.text[:500]}...")
        print("Please check all previously mentioned points for CDS API setup.")
        return None
    except FileNotFoundError as e:
        print(f"\nFile System Error: {e}")
        print("The downloaded file was not found where expected, or the expected NetCDF file was not found inside the zip. Check paths or permissions.")
        return None
    except ValueError as e: # Catches our custom HTML error
        print(f"\nFile Content Error: {e}")
        print("The downloaded file appears to be corrupted or an error page. Retrying might help, or there's an intermittent issue on the CDS side.")
        return None
    except Exception as e:
        print(f"\nAn unexpected error occurred during ERA5-Land data fetching: {e}")
        print("Ensure you have 'cdsapi', 'xarray', 'netCDF4', 'numpy', and 'zipfile' (built-in) installed in your environment, and try restarting the kernel.")
        return None

# --- Test the function ---
print("--- Starting ERA5-Land Data Acquisition ---")
era5_df = fetch_era5_land_data(KERALA_BOUNDING_BOX_CDS, date_range_days=5) 

if era5_df is not None:
    print("\nERA5-Land data acquisition and initial processing complete.")
    print(f"Shape of the processed ERA5-Land DataFrame: {era5_df.shape}")
else:
    print("\nERA5-Land data acquisition failed. Please review the error messages above.")


In [None]:
import cdsapi
import xarray as xr # For working with NetCDF data
import pandas as pd
import numpy as np # For numerical operations like sqrt and exp
from datetime import datetime, timedelta
import os # For file operations
import requests # Import requests to catch its specific HTTPError
import netCDF4 # Explicitly importing netCDF4 to ensure it's used as the backend
import zipfile # Import zipfile for unzipping

# --- Configuration ---
# Bounding box for Kerala in CDS API format: [North, West, South, East] (max_lat, min_lon, min_lat, max_lon)
KERALA_BOUNDING_BOX_CDS = [12.5, 74.5, 8.0, 77.5]

def fetch_era5_land_data(bbox_cds, date_range_days=5): 
    """
    Fetches ERA5-Land data for a given bounding box and date range.
    bbox_cds: [North, West, South, East]
    date_range_days: Number of days back from today to fetch data.
    """
    c = cdsapi.Client()

    # Get today's date and calculate the start date for the request
    end_date = datetime.now()
    start_date = end_date - timedelta(days=date_range_days)

    # Generate lists of years, months, days for the request.
    # CDS API requires explicit lists for each date component.
    years = list(range(start_date.year, end_date.year + 1))
    months = sorted(list(set([d.month for d in [start_date, end_date]])))
    days = sorted(list(set([d.day for d in [start_date, end_date]])))
    
    # Format month/day with leading zeros if single digit
    months_str = [str(m).zfill(2) for m in months]
    days_str = [str(d).zfill(2) for d in days]

    # Define variables to fetch based on AI/ML requirements
    variables = [
        '2m_temperature',          
        '2m_dewpoint_temperature', 
        'total_precipitation',     
        '10m_u_component_of_wind', 
        '10m_v_component_of_wind', 
    ]

    # Define times to fetch (hourly for detailed analysis)
    times = [
        '00:00', '01:00', '02:00', '03:00', '04:00', '05:00', '06:00', '07:00',
        '08:00', '09:00', '10:00', '11:00', '12:00', '13:00', '14:00', '15:00',
        '16:00', '17:00', '18:00', '19:00', '20:00', '21:00', '22:00', '23:00',
    ]

    # Original output file name (which is actually a zip)
    zip_output_file = 'era5_land_data.zip'
    # Name of the actual NetCDF file inside the zip (common name from CDS)
    netcdf_file_inside_zip = 'data_0.nc' # Based on your provided PK content

    print(f"Requesting ERA5-Land data for dates: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    print(f"Note: CDS ERA5-Land data has a lag. Latest available is typically a few days ago.")

    try:
        # Delete existing files before downloading to ensure a fresh start
        if os.path.exists(zip_output_file):
            os.remove(zip_output_file)
            print(f"Removed existing {zip_output_file} to ensure fresh download.")
        
        if os.path.exists(netcdf_file_inside_zip):
            os.remove(netcdf_file_inside_zip)
            print(f"Removed existing extracted {netcdf_file_inside_zip}.")

        # Retrieve data from CDS API
        c.retrieve(
            'reanalysis-era5-land', 
            {
                'variable': variables,
                'year': [str(y) for y in years],
                'month': months_str,
                'day': days_str,
                'time': times,
                'area': bbox_cds, 
                'format': 'netcdf', # CDS returns a zip with .nc inside, even if we ask for netcdf
            },
            zip_output_file) # Save as .zip
        print(f"ERA5-Land data downloaded to {zip_output_file}")

        # Basic file sanity check for the ZIP file
        if not os.path.exists(zip_output_file):
            raise FileNotFoundError(f"Downloaded file {zip_output_file} was not found.")
        
        file_size_bytes = os.path.getsize(zip_output_file)
        print(f"Downloaded file size: {file_size_bytes / (1024*1024):.2f} MB")

        # Heuristic check for common HTML error page size (very small, typically < 10KB)
        if file_size_bytes < 10 * 1024: 
            with open(zip_output_file, 'r', encoding='utf-8', errors='ignore') as f:
                content_start = f.read(500)
                if "<!DOCTYPE html>" in content_start.lower() or "<html" in content_start.lower():
                    raise ValueError(f"Downloaded file '{zip_output_file}' appears to be an HTML error page, not a valid ZIP. Size: {file_size_bytes} bytes. Content starts with: '{content_start[:100]}...'")

        # Unzip the downloaded file
        print(f"Unzipping {zip_output_file}...")
        with zipfile.ZipFile(zip_output_file, 'r') as zip_ref:
            # Extract only the specific NetCDF file we expect
            if netcdf_file_inside_zip in zip_ref.namelist():
                zip_ref.extract(netcdf_file_inside_zip)
                print(f"Extracted {netcdf_file_inside_zip} from the zip file.")
            else:
                raise FileNotFoundError(f"Expected file '{netcdf_file_inside_zip}' not found inside the zip. Contents: {zip_ref.namelist()}")

        # Load the data using xarray from the extracted .nc file
        ds = xr.open_dataset(netcdf_file_inside_zip, engine='netcdf4')
        print("\nERA5-Land data loaded into xarray Dataset. Info:")
        print(ds)

        # --- NEW: Select only the core data variables to create a new, cleaner Dataset ---
        # This is the most reliable way to prevent `NaN`s from unnecessary coordinates like 'number' and 'expver'
        print("Creating a cleaner Dataset by selecting only core data variables...")
        core_vars = ['t2m', 'd2m', 'tp', 'u10', 'v10']
        ds_clean = ds[core_vars]
        print("New, clean xarray Dataset created. Info:")
        print(ds_clean)

        # From now on, work with the `ds_clean` object.
        # --- Basic Processing (Feature Engineering/Conversion) ---
        # Renaming variables for clarity and consistency with AI/ML requirements
        ds_clean = ds_clean.rename_vars({
            't2m': '2m_temperature',
            'd2m': '2m_dewpoint_temperature',
            'tp': 'total_precipitation',
            'u10': '10m_u_component_of_wind',
            'v10': '10m_v_component_of_wind'
        })
        print("Renamed data variables for clarity.")

        if '2m_temperature' in ds_clean.data_vars:
            ds_clean['2m_temperature_c'] = ds_clean['2m_temperature'] - 273.15
            print("2m_temperature converted to Celsius.")
        
        if '2m_dewpoint_temperature' in ds_clean.data_vars:
            ds_clean['2m_dewpoint_temperature_c'] = ds_clean['2m_dewpoint_temperature'] - 273.15
            print("2m_dewpoint_temperature converted to Celsius.")

        if '2m_temperature_c' in ds_clean.data_vars and '2m_dewpoint_temperature_c' in ds_clean.data_vars:
            e_s = 6.1094 * np.exp((17.625 * ds_clean['2m_temperature_c']) / (243.04 + ds_clean['2m_temperature_c']))
            e_a = 6.1094 * np.exp((17.625 * ds_clean['2m_dewpoint_temperature_c']) / (243.04 + ds_clean['2m_dewpoint_temperature_c']))
            ds_clean['relative_humidity_percent'] = (e_a / e_s) * 100
            ds_clean['relative_humidity_percent'] = ds_clean['relative_humidity_percent'].clip(0, 100) 
            print("Relative humidity calculated.")

        if '10m_u_component_of_wind' in ds_clean.data_vars and '10m_v_component_of_wind' in ds_clean.data_vars:
            ds_clean['wind_speed_ms'] = np.sqrt(ds_clean['10m_u_component_of_wind']**2 + ds_clean['10m_v_component_of_wind']**2)
            ds_clean['wind_speed_kmh'] = ds_clean['wind_speed_ms'] * 3.6
            print("Wind speed (km/h) calculated.")
        
        if 'total_precipitation' in ds_clean.data_vars:
            ds_clean['total_precipitation_mm'] = ds_clean['total_precipitation'] * 1000
            print("Total precipitation converted to mm.")

        # Convert the cleaned xarray Dataset to a Pandas DataFrame
        df_era5 = ds_clean.to_dataframe().reset_index()

        # Drop original Kelvin temps and u/v components, and intermediate wind speed value
        df_era5 = df_era5.drop(columns=[
            '2m_temperature', '2m_dewpoint_temperature', 
            '10m_u_component_of_wind', '10m_v_component_of_wind',
            'total_precipitation', 'wind_speed_ms'
        ], errors='ignore')

        print("\nERA5-Land data converted to Pandas DataFrame. First 5 rows:")
        print(df_era5.head())
        
        df_era5.to_csv("era5_land_data_processed.csv", index=False)
        print("\nProcessed ERA5-Land data saved to era5_land_data_processed.csv")
        
        return df_era5

    except requests.exceptions.HTTPError as e:
        print(f"\nCDS API HTTP Error fetching ERA5-Land data: {e}")
        print(f"Response URL: {e.response.url}")
        print(f"Response status code: {e.response.status_code}")
        print(f"Response content (first 500 chars): {e.response.text[:500]}...")
        print("Please check all previously mentioned points for CDS API setup.")
        return None
    except FileNotFoundError as e:
        print(f"\nFile System Error: {e}")
        print("The downloaded file was not found where expected, or the expected NetCDF file was not found inside the zip. Check paths or permissions.")
        return None
    except ValueError as e: # Catches our custom HTML error
        print(f"\nFile Content Error: {e}")
        print("The downloaded file appears to be corrupted or an error page. Retrying might help, or there's an intermittent issue on the CDS side.")
        return None
    except Exception as e:
        print(f"\nAn unexpected error occurred during ERA5-Land data fetching: {e}")
        print("Ensure you have 'cdsapi', 'xarray', 'netCDF4', 'numpy', and 'zipfile' (built-in) installed in your environment, and try restarting the kernel.")
        return None

# --- Test the function ---
print("--- Starting ERA5-Land Data Acquisition ---")
era5_df = fetch_era5_land_data(KERALA_BOUNDING_BOX_CDS, date_range_days=5) 

if era5_df is not None:
    print("\nERA5-Land data acquisition and initial processing complete.")
    print(f"Shape of the processed ERA5-Land DataFrame: {era5_df.shape}")
else:
    print("\nERA5-Land data acquisition failed. Please review the error messages above.")


In [None]:
import cdsapi
import xarray as xr # For working with NetCDF data
import pandas as pd
import numpy as np # For numerical operations like sqrt and exp
from datetime import datetime, timedelta
import os # For file operations
import requests # Import requests to catch its specific HTTPError
import netCDF4 # Explicitly importing netCDF4 to ensure it's used as the backend
import zipfile # Import zipfile for unzipping

# --- Configuration ---
# Bounding box for Kerala in CDS API format: [North, West, South, East] (max_lat, min_lon, min_lat, max_lon)
KERALA_BOUNDING_BOX_CDS = [12.5, 74.5, 8.0, 77.5]

def fetch_era5_land_data(bbox_cds, date_range_days=5): 
    """
    Fetches ERA5-Land data for a given bounding box and date range.
    bbox_cds: [North, West, South, East]
    date_range_days: Number of days back from today to fetch data.
    """
    c = cdsapi.Client()

    # Get today's date and calculate the start date for the request
    end_date = datetime.now()
    start_date = end_date - timedelta(days=date_range_days)

    # Generate lists of years, months, days for the request.
    # CDS API requires explicit lists for each date component.
    years = list(range(start_date.year, end_date.year + 1))
    months = sorted(list(set([d.month for d in [start_date, end_date]])))
    days = sorted(list(set([d.day for d in [start_date, end_date]])))
    
    # Format month/day with leading zeros if single digit
    months_str = [str(m).zfill(2) for m in months]
    days_str = [str(d).zfill(2) for d in days]

    # Define variables to fetch based on AI/ML requirements
    variables = [
        '2m_temperature',          
        '2m_dewpoint_temperature', 
        'total_precipitation',     
        '10m_u_component_of_wind', 
        '10m_v_component_of_wind', 
    ]

    # Define times to fetch (hourly for detailed analysis)
    times = [
        '00:00', '01:00', '02:00', '03:00', '04:00', '05:00', '06:00', '07:00',
        '08:00', '09:00', '10:00', '11:00', '12:00', '13:00', '14:00', '15:00',
        '16:00', '17:00', '18:00', '19:00', '20:00', '21:00', '22:00', '23:00',
    ]

    # Original output file name (which is actually a zip)
    zip_output_file = 'era5_land_data.zip'
    # Name of the actual NetCDF file inside the zip (common name from CDS)
    netcdf_file_inside_zip = 'data_0.nc' # Based on your provided PK content

    print(f"Requesting ERA5-Land data for dates: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    print(f"Note: CDS ERA5-Land data has a lag. Latest available is typically a few days ago.")

    try:
        # Delete existing files before downloading to ensure a fresh start
        if os.path.exists(zip_output_file):
            os.remove(zip_output_file)
            print(f"Removed existing {zip_output_file} to ensure fresh download.")
        
        if os.path.exists(netcdf_file_inside_zip):
            os.remove(netcdf_file_inside_zip)
            print(f"Removed existing extracted {netcdf_file_inside_zip}.")

        # Retrieve data from CDS API
        c.retrieve(
            'reanalysis-era5-land', 
            {
                'variable': variables,
                'year': [str(y) for y in years],
                'month': months_str,
                'day': days_str,
                'time': times,
                'area': bbox_cds, 
                'format': 'netcdf', # CDS returns a zip with .nc inside, even if we ask for netcdf
            },
            zip_output_file) # Save as .zip
        print(f"ERA5-Land data downloaded to {zip_output_file}")

        # Basic file sanity check for the ZIP file
        if not os.path.exists(zip_output_file):
            raise FileNotFoundError(f"Downloaded file {zip_output_file} was not found.")
        
        file_size_bytes = os.path.getsize(zip_output_file)
        print(f"Downloaded file size: {file_size_bytes / (1024*1024):.2f} MB")

        # Heuristic check for common HTML error page size (very small, typically < 10KB)
        if file_size_bytes < 10 * 1024: 
            with open(zip_output_file, 'r', encoding='utf-8', errors='ignore') as f:
                content_start = f.read(500)
                if "<!DOCTYPE html>" in content_start.lower() or "<html" in content_start.lower():
                    raise ValueError(f"Downloaded file '{zip_output_file}' appears to be an HTML error page, not a valid ZIP. Size: {file_size_bytes} bytes. Content starts with: '{content_start[:100]}...'")

        # Unzip the downloaded file
        print(f"Unzipping {zip_output_file}...")
        with zipfile.ZipFile(zip_output_file, 'r') as zip_ref:
            # Extract only the specific NetCDF file we expect
            if netcdf_file_inside_zip in zip_ref.namelist():
                zip_ref.extract(netcdf_file_inside_zip)
                print(f"Extracted {netcdf_file_inside_zip} from the zip file.")
            else:
                raise FileNotFoundError(f"Expected file '{netcdf_file_inside_zip}' not found inside the zip. Contents: {zip_ref.namelist()}")

        # Load the data using xarray from the extracted .nc file
        ds = xr.open_dataset(netcdf_file_inside_zip, engine='netcdf4')
        print("\nERA5-Land data loaded into xarray Dataset. Info:")
        print(ds)

        # --- NEW: Reset 'number' and 'expver' coordinates and drop them ---
        # This is the most reliable way to remove non-dimension coordinates that cause NaNs.
        print("Resetting and dropping 'number' and 'expver' coordinates...")
        ds = ds.reset_coords(names=['number', 'expver'], drop=True)
        print("Coordinates 'number' and 'expver' removed from xarray Dataset.")
        print("Dataset info after coordinate removal:")
        print(ds) # Print info again to confirm removal

        # Renaming variables for clarity and consistency with AI/ML requirements
        ds = ds.rename_vars({
            't2m': '2m_temperature',
            'd2m': '2m_dewpoint_temperature',
            'tp': 'total_precipitation',
            'u10': '10m_u_component_of_wind',
            'v10': '10m_v_component_of_wind'
        })
        print("Renamed data variables for clarity.")

        # --- Basic Processing (Feature Engineering/Conversion) ---
        if '2m_temperature' in ds.data_vars:
            ds['2m_temperature_c'] = ds['2m_temperature'] - 273.15
            print("2m_temperature converted to Celsius.")
        
        if '2m_dewpoint_temperature' in ds.data_vars:
            ds['2m_dewpoint_temperature_c'] = ds['2m_dewpoint_temperature'] - 273.15
            print("2m_dewpoint_temperature converted to Celsius.")

        if '2m_temperature_c' in ds.data_vars and '2m_dewpoint_temperature_c' in ds.data_vars:
            e_s = 6.1094 * np.exp((17.625 * ds['2m_temperature_c']) / (243.04 + ds['2m_temperature_c']))
            e_a = 6.1094 * np.exp((17.625 * ds['2m_dewpoint_temperature_c']) / (243.04 + ds['2m_dewpoint_temperature_c']))
            ds['relative_humidity_percent'] = (e_a / e_s) * 100
            ds['relative_humidity_percent'] = ds['relative_humidity_percent'].clip(0, 100) 
            print("Relative humidity calculated.")

        if '10m_u_component_of_wind' in ds.data_vars and '10m_v_component_of_wind' in ds.data_vars:
            ds['wind_speed_ms'] = np.sqrt(ds['10m_u_component_of_wind']**2 + ds['10m_v_component_of_wind']**2)
            ds['wind_speed_kmh'] = ds['wind_speed_ms'] * 3.6
            print("Wind speed (km/h) calculated.")
        
        if 'total_precipitation' in ds.data_vars:
            ds['total_precipitation_mm'] = ds['total_precipitation'] * 1000
            print("Total precipitation converted to mm.")

        # Convert the cleaned xarray Dataset to a Pandas DataFrame
        # Now, the to_dataframe() should work correctly as 'number' and 'expver' are gone.
        df_era5 = ds.to_dataframe().reset_index()

        # Drop original Kelvin temps and u/v components, and intermediate wind speed value
        # These columns will now be correctly dropped as they are no longer the primary data.
        df_era5 = df_era5.drop(columns=[
            '2m_temperature', '2m_dewpoint_temperature', 
            '10m_u_component_of_wind', '10m_v_component_of_wind',
            'total_precipitation', 'wind_speed_ms'
        ], errors='ignore')

        print("\nERA5-Land data converted to Pandas DataFrame. First 5 rows:")
        print(df_era5.head())
        
        df_era5.to_csv("era5_land_data_processed.csv", index=False)
        print("\nProcessed ERA5-Land data saved to era5_land_data_processed.csv")
        
        return df_era5

    except requests.exceptions.HTTPError as e:
        print(f"\nCDS API HTTP Error fetching ERA5-Land data: {e}")
        print(f"Response URL: {e.response.url}")
        print(f"Response status code: {e.response.status_code}")
        print(f"Response content (first 500 chars): {e.response.text[:500]}...")
        print("Please check all previously mentioned points for CDS API setup.")
        return None
    except FileNotFoundError as e:
        print(f"\nFile System Error: {e}")
        print("The downloaded file was not found where expected, or the expected NetCDF file was not found inside the zip. Check paths or permissions.")
        return None
    except ValueError as e: # Catches our custom HTML error
        print(f"\nFile Content Error: {e}")
        print("The downloaded file appears to be corrupted or an error page. Retrying might help, or there's an intermittent issue on the CDS side.")
        return None
    except Exception as e:
        print(f"\nAn unexpected error occurred during ERA5-Land data fetching: {e}")
        print("Ensure you have 'cdsapi', 'xarray', 'netCDF4', 'numpy', and 'zipfile' (built-in) installed in your environment, and try restarting the kernel.")
        return None

# --- Test the function ---
print("--- Starting ERA5-Land Data Acquisition ---")
era5_df = fetch_era5_land_data(KERALA_BOUNDING_BOX_CDS, date_range_days=5) 

if era5_df is not None:
    print("\nERA5-Land data acquisition and initial processing complete.")
    print(f"Shape of the processed ERA5-Land DataFrame: {era5_df.shape}")
else:
    print("\nERA5-Land data acquisition failed. Please review the error messages above.")


In [None]:
import cdsapi
import xarray as xr # For working with NetCDF data
import pandas as pd
import numpy as np # For numerical operations like sqrt and exp
from datetime import datetime, timedelta
import os # For file operations
import requests # Import requests to catch its specific HTTPError
import netCDF4 # Explicitly importing netCDF4 to ensure it's used as the backend
import zipfile # Import zipfile for unzipping

# --- Configuration ---
# Bounding box for Kerala in CDS API format: [North, West, South, East] (max_lat, min_lon, min_lat, max_lon)
# Slightly adjusted bounding box to be more inland for testing purposes.
KERALA_BOUNDING_BOX_CDS = [12.0, 75.0, 8.5, 77.0] 

def fetch_era5_land_data(bbox_cds, date_range_days=5): 
    """
    Fetches ERA5-Land data for a given bounding box and date range.
    bbox_cds: [North, West, South, East]
    date_range_days: Number of days back from today to fetch data.
    """
    c = cdsapi.Client()

    # Get today's date and calculate the start date for the request
    end_date = datetime.now()
    start_date = end_date - timedelta(days=date_range_days)

    # Generate lists of years, months, days for the request.
    # CDS API requires explicit lists for each date component.
    years = list(range(start_date.year, end_date.year + 1))
    months = sorted(list(set([d.month for d in [start_date, end_date]])))
    days = sorted(list(set([d.day for d in [start_date, end_date]])))
    
    # Format month/day with leading zeros if single digit
    months_str = [str(m).zfill(2) for m in months]
    days_str = [str(d).zfill(2) for d in days]

    # Define variables to fetch based on AI/ML requirements
    variables = [
        '2m_temperature',          
        '2m_dewpoint_temperature', 
        'total_precipitation',     
        '10m_u_component_of_wind', 
        '10m_v_component_of_wind', 
    ]

    # Define times to fetch (hourly for detailed analysis)
    times = [
        '00:00', '01:00', '02:00', '03:00', '04:00', '05:00', '06:00', '07:00',
        '08:00', '09:00', '10:00', '11:00', '12:00', '13:00', '14:00', '15:00',
        '16:00', '17:00', '18:00', '19:00', '20:00', '21:00', '22:00', '23:00',
    ]

    # Original output file name (which is actually a zip)
    zip_output_file = 'era5_land_data.zip'
    # Name of the actual NetCDF file inside the zip (common name from CDS)
    netcdf_file_inside_zip = 'data_0.nc' # Based on your provided PK content

    print(f"Requesting ERA5-Land data for dates: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    print(f"Note: CDS ERA5-Land data has a lag. Latest available is typically a few days ago.")
    print(f"Using bounding box: {bbox_cds}") # Indicate the bbox being used

    try:
        # Delete existing files before downloading to ensure a fresh start
        if os.path.exists(zip_output_file):
            os.remove(zip_output_file)
            print(f"Removed existing {zip_output_file} to ensure fresh download.")
        
        if os.path.exists(netcdf_file_inside_zip):
            os.remove(netcdf_file_inside_zip)
            print(f"Removed existing extracted {netcdf_file_inside_zip}.")

        # Retrieve data from CDS API
        c.retrieve(
            'reanalysis-era5-land', 
            {
                'variable': variables,
                'year': [str(y) for y in years],
                'month': months_str,
                'day': days_str,
                'time': times,
                'area': bbox_cds, 
                'format': 'netcdf', # CDS returns a zip with .nc inside, even if we ask for netcdf
            },
            zip_output_file) # Save as .zip
        print(f"ERA5-Land data downloaded to {zip_output_file}")

        # Basic file sanity check for the ZIP file
        if not os.path.exists(zip_output_file):
            raise FileNotFoundError(f"Downloaded file {zip_output_file} was not found.")
        
        file_size_bytes = os.path.getsize(zip_output_file)
        print(f"Downloaded file size: {file_size_bytes / (1024*1024):.2f} MB")

        # Heuristic check for common HTML error page size (very small, typically < 10KB)
        if file_size_bytes < 10 * 1024: 
            with open(zip_output_file, 'r', encoding='utf-8', errors='ignore') as f:
                content_start = f.read(500)
                if "<!DOCTYPE html>" in content_start.lower() or "<html" in content_start.lower():
                    raise ValueError(f"Downloaded file '{zip_output_file}' appears to be an HTML error page, not a valid ZIP. Size: {file_size_bytes} bytes. Content starts with: '{content_start[:100]}...'")

        # Unzip the downloaded file
        print(f"Unzipping {zip_output_file}...")
        with zipfile.ZipFile(zip_output_file, 'r') as zip_ref:
            # Extract only the specific NetCDF file we expect
            if netcdf_file_inside_zip in zip_ref.namelist():
                zip_ref.extract(netcdf_file_inside_zip)
                print(f"Extracted {netcdf_file_inside_zip} from the zip file.")
            else:
                raise FileNotFoundError(f"Expected file '{netcdf_file_inside_zip}' not found inside the zip. Contents: {zip_ref.namelist()}")

        # Load the data using xarray from the extracted .nc file
        ds = xr.open_dataset(netcdf_file_inside_zip, engine='netcdf4')
        print("\nERA5-Land data loaded into xarray Dataset. Info:")
        print(ds)

        # --- Reset 'number' and 'expver' coordinates and drop them ---
        print("Resetting and dropping 'number' and 'expver' coordinates...")
        ds = ds.reset_coords(names=['number', 'expver'], drop=True)
        print("Coordinates 'number' and 'expver' removed from xarray Dataset.")
        print("Dataset info after coordinate removal:")
        print(ds) # Print info again to confirm removal

        # --- NEW: Fill NaN values in data variables before processing ---
        # This addresses the issue of original data potentially containing NaNs (e.g., over ocean areas)
        print("Filling NaN values in data variables with 0 (or a more suitable imputation method)...")
        # For a simple fill, use fillna(0). For more sophisticated, consider interpolate_na() or ffill/bfill.
        # For now, 0 is a safe default for most meteorological variables if missing.
        ds = ds.fillna(0) # Fill NaNs with 0
        print("NaN values filled.")

        # Renaming variables for clarity and consistency with AI/ML requirements
        ds = ds.rename_vars({
            't2m': '2m_temperature',
            'd2m': '2m_dewpoint_temperature',
            'tp': 'total_precipitation',
            'u10': '10m_u_component_of_wind',
            'v10': '10m_v_component_of_wind'
        })
        print("Renamed data variables for clarity.")

        # --- Basic Processing (Feature Engineering/Conversion) ---
        if '2m_temperature' in ds.data_vars:
            ds['2m_temperature_c'] = ds['2m_temperature'] - 273.15
            print("2m_temperature converted to Celsius.")
        
        if '2m_dewpoint_temperature' in ds.data_vars:
            ds['2m_dewpoint_temperature_c'] = ds['2m_dewpoint_temperature'] - 273.15
            print("2m_dewpoint_temperature converted to Celsius.")

        if '2m_temperature_c' in ds.data_vars and '2m_dewpoint_temperature_c' in ds.data_vars:
            e_s = 6.1094 * np.exp((17.625 * ds['2m_temperature_c']) / (243.04 + ds['2m_temperature_c']))
            e_a = 6.1094 * np.exp((17.625 * ds['2m_dewpoint_temperature_c']) / (243.04 + ds['2m_dewpoint_temperature_c']))
            ds['relative_humidity_percent'] = (e_a / e_s) * 100
            ds['relative_humidity_percent'] = ds['relative_humidity_percent'].clip(0, 100) 
            print("Relative humidity calculated.")

        if '10m_u_component_of_wind' in ds.data_vars and '10m_v_component_of_wind' in ds.data_vars:
            ds['wind_speed_ms'] = np.sqrt(ds['10m_u_component_of_wind']**2 + ds['10m_v_component_of_wind']**2)
            ds['wind_speed_kmh'] = ds['wind_speed_ms'] * 3.6
            print("Wind speed (km/h) calculated.")
        
        if 'total_precipitation' in ds.data_vars:
            ds['total_precipitation_mm'] = ds['total_precipitation'] * 1000
            print("Total precipitation converted to mm.")

        # Convert the cleaned xarray Dataset to a Pandas DataFrame
        # Now, the to_dataframe() should work correctly as 'number' and 'expver' are gone, and NaNs are filled.
        df_era5 = ds.to_dataframe().reset_index()

        # Drop original Kelvin temps and u/v components, and intermediate wind speed value
        df_era5 = df_era5.drop(columns=[
            '2m_temperature', '2m_dewpoint_temperature', 
            '10m_u_component_of_wind', '10m_v_component_of_wind',
            'total_precipitation', 'wind_speed_ms'
        ], errors='ignore')

        print("\nERA5-Land data converted to Pandas DataFrame. First 5 rows:")
        print(df_era5.head())
        
        df_era5.to_csv("era5_land_data_processed.csv", index=False)
        print("\nProcessed ERA5-Land data saved to era5_land_data_processed.csv")
        
        return df_era5

    except requests.exceptions.HTTPError as e:
        print(f"\nCDS API HTTP Error fetching ERA5-Land data: {e}")
        print(f"Response URL: {e.response.url}")
        print(f"Response status code: {e.response.status_code}")
        print(f"Response content (first 500 chars): {e.response.text[:500]}...")
        print("Please check all previously mentioned points for CDS API setup.")
        return None
    except FileNotFoundError as e:
        print(f"\nFile System Error: {e}")
        print("The downloaded file was not found where expected, or the expected NetCDF file was not found inside the zip. Check paths or permissions.")
        return None
    except ValueError as e: # Catches our custom HTML error
        print(f"\nFile Content Error: {e}")
        print("The downloaded file appears to be corrupted or an error page. Retrying might help, or there's an intermittent issue on the CDS side.")
        return None
    except Exception as e:
        print(f"\nAn unexpected error occurred during ERA5-Land data fetching: {e}")
        print("Ensure you have 'cdsapi', 'xarray', 'netCDF4', 'numpy', and 'zipfile' (built-in) installed in your environment, and try restarting the kernel.")
        return None

# --- Test the function ---
print("--- Starting ERA5-Land Data Acquisition ---")
era5_df = fetch_era5_land_data(KERALA_BOUNDING_BOX_CDS, date_range_days=5) 

if era5_df is not None:
    print("\nERA5-Land data acquisition and initial processing complete.")
    print(f"Shape of the processed ERA5-Land DataFrame: {era5_df.shape}")
else:
    print("\nERA5-Land data acquisition failed. Please review the error messages above.")


In [None]:
import cdsapi
import xarray as xr # For working with NetCDF data
import pandas as pd
import numpy as np # For numerical operations like sqrt and exp
from datetime import datetime, timedelta
import os # For file operations
import requests # Import requests to catch its specific HTTPError
import netCDF4 # Explicitly importing netCDF4 to ensure it's used as the backend
import zipfile # Import zipfile for unzipping

# --- Configuration ---
# Bounding box for Kerala in CDS API format: [North, West, South, East] (max_lat, min_lon, min_lat, max_lon)
# Adjusted to a more inland area to reduce initial NaNs from coastal data.
KERALA_BOUNDING_BOX_CDS = [12.0, 75.0, 8.5, 77.0] 

def fetch_era5_land_data(bbox_cds, date_range_days=5): 
    """
    Fetches ERA5-Land data for a given bounding box and date range.
    bbox_cds: [North, West, South, East]
    date_range_days: Number of days back from today to fetch data.
    """
    c = cdsapi.Client()

    # Get today's date and calculate the start date for the request
    end_date = datetime.now()
    start_date = end_date - timedelta(days=date_range_days)

    # Generate lists of years, months, days for the request.
    # CDS API requires explicit lists for each date component.
    years = list(range(start_date.year, end_date.year + 1))
    months = sorted(list(set([d.month for d in [start_date, end_date]])))
    days = sorted(list(set([d.day for d in [start_date, end_date]])))
    
    # Format month/day with leading zeros if single digit
    months_str = [str(m).zfill(2) for m in months]
    days_str = [str(d).zfill(2) for d in days]

    # Define variables to fetch based on AI/ML requirements
    variables = [
        '2m_temperature',          
        '2m_dewpoint_temperature', 
        'total_precipitation',     
        '10m_u_component_of_wind', 
        '10m_v_component_of_wind', 
    ]

    # Define times to fetch (hourly for detailed analysis)
    times = [
        '00:00', '01:00', '02:00', '03:00', '04:00', '05:00', '06:00', '07:00',
        '08:00', '09:00', '10:00', '11:00', '12:00', '13:00', '14:00', '15:00',
        '16:00', '17:00', '18:00', '19:00', '20:00', '21:00', '22:00', '23:00',
    ]

    # Original output file name (which is actually a zip)
    zip_output_file = 'era5_land_data.zip'
    # Name of the actual NetCDF file inside the zip (common name from CDS)
    netcdf_file_inside_zip = 'data_0.nc' # Based on your provided PK content

    print(f"Requesting ERA5-Land data for dates: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    print(f"Note: CDS ERA5-Land data has a lag. Latest available is typically a few days ago.")
    print(f"Using bounding box: {bbox_cds}") # Indicate the bbox being used

    try:
        # Delete existing files before downloading to ensure a fresh start
        if os.path.exists(zip_output_file):
            os.remove(zip_output_file)
            print(f"Removed existing {zip_output_file} to ensure fresh download.")
        
        if os.path.exists(netcdf_file_inside_zip):
            os.remove(netcdf_file_inside_zip)
            print(f"Removed existing extracted {netcdf_file_inside_zip}.")

        # Retrieve data from CDS API
        c.retrieve(
            'reanalysis-era5-land', 
            {
                'variable': variables,
                'year': [str(y) for y in years],
                'month': months_str,
                'day': days_str,
                'time': times,
                'area': bbox_cds, 
                'format': 'netcdf', # CDS returns a zip with .nc inside, even if we ask for netcdf
            },
            zip_output_file) # Save as .zip
        print(f"ERA5-Land data downloaded to {zip_output_file}")

        # Basic file sanity check for the ZIP file
        if not os.path.exists(zip_output_file):
            raise FileNotFoundError(f"Downloaded file {zip_output_file} was not found.")
        
        file_size_bytes = os.path.getsize(zip_output_file)
        print(f"Downloaded file size: {file_size_bytes / (1024*1024):.2f} MB")

        # Heuristic check for common HTML error page size (very small, typically < 10KB)
        if file_size_bytes < 10 * 1024: 
            with open(zip_output_file, 'r', encoding='utf-8', errors='ignore') as f:
                content_start = f.read(500)
                if "<!DOCTYPE html>" in content_start.lower() or "<html" in content_start.lower():
                    raise ValueError(f"Downloaded file '{zip_output_file}' appears to be an HTML error page, not a valid ZIP. Size: {file_size_bytes} bytes. Content starts with: '{content_start[:100]}...'")

        # Unzip the downloaded file
        print(f"Unzipping {zip_output_file}...")
        with zipfile.ZipFile(zip_output_file, 'r') as zip_ref:
            # Extract only the specific NetCDF file we expect
            if netcdf_file_inside_zip in zip_ref.namelist():
                zip_ref.extract(netcdf_file_inside_zip)
                print(f"Extracted {netcdf_file_inside_zip} from the zip file.")
            else:
                raise FileNotFoundError(f"Expected file '{netcdf_file_inside_zip}' not found inside the zip. Contents: {zip_ref.namelist()}")

        # Load the data using xarray from the extracted .nc file
        ds = xr.open_dataset(netcdf_file_inside_zip, engine='netcdf4')
        print("\nERA5-Land data loaded into xarray Dataset. Info:")
        print(ds)

        # --- IMPORTANT: The definitive way to handle non-dimension coordinates is to drop them.
        # This prevents `to_dataframe()` from creating extra rows and `NaN`s.
        print("Dropping 'number' and 'expver' non-dimension coordinates...")
        # We check if the coordinates exist before trying to drop them.
        coords_to_drop = [coord for coord in ['number', 'expver'] if coord in ds.coords]
        if coords_to_drop:
            ds = ds.drop_vars(coords_to_drop)
            print(f"Dropped coordinates: {coords_to_drop}")

        print("Dataset info after coordinate cleanup:")
        print(ds)

        # Renaming variables for clarity and consistency with AI/ML requirements
        ds = ds.rename_vars({
            't2m': '2m_temperature',
            'd2m': '2m_dewpoint_temperature',
            'tp': 'total_precipitation',
            'u10': '10m_u_component_of_wind',
            'v10': '10m_v_component_of_wind'
        })
        print("Renamed data variables for clarity.")

        # --- Basic Processing (Feature Engineering/Conversion) ---
        # Note: We are no longer using fillna(0) here. The calculations will
        # correctly result in NaNs wherever the input data was NaN.
        if '2m_temperature' in ds.data_vars:
            ds['2m_temperature_c'] = ds['2m_temperature'] - 273.15
            print("2m_temperature converted to Celsius.")
        
        if '2m_dewpoint_temperature' in ds.data_vars:
            ds['2m_dewpoint_temperature_c'] = ds['2m_dewpoint_temperature'] - 273.15
            print("2m_dewpoint_temperature converted to Celsius.")

        if '2m_temperature_c' in ds.data_vars and '2m_dewpoint_temperature_c' in ds.data_vars:
            # The calculation will correctly propagate NaNs from the temperature data.
            e_s = 6.1094 * np.exp((17.625 * ds['2m_temperature_c']) / (243.04 + ds['2m_temperature_c']))
            e_a = 6.1094 * np.exp((17.625 * ds['2m_dewpoint_temperature_c']) / (243.04 + ds['2m_dewpoint_temperature_c']))
            ds['relative_humidity_percent'] = (e_a / e_s) * 100
            ds['relative_humidity_percent'] = ds['relative_humidity_percent'].clip(0, 100) 
            print("Relative humidity calculated.")

        if '10m_u_component_of_wind' in ds.data_vars and '10m_v_component_of_wind' in ds.data_vars:
            ds['wind_speed_ms'] = np.sqrt(ds['10m_u_component_of_wind']**2 + ds['10m_v_component_of_wind']**2)
            ds['wind_speed_kmh'] = ds['wind_speed_ms'] * 3.6
            print("Wind speed (km/h) calculated.")
        
        if 'total_precipitation' in ds.data_vars:
            ds['total_precipitation_mm'] = ds['total_precipitation'] * 1000
            print("Total precipitation converted to mm.")

        # Convert the xarray Dataset to a Pandas DataFrame
        df_era5 = ds.to_dataframe().reset_index()

        # Drop original Kelvin temps and u/v components, and intermediate wind speed value
        df_era5 = df_era5.drop(columns=[
            '2m_temperature', '2m_dewpoint_temperature', 
            '10m_u_component_of_wind', '10m_v_component_of_wind',
            'total_precipitation', 'wind_speed_ms'
        ], errors='ignore')
        
        # --- NEW: Drop rows with any NaN values after processing ---
        # This is a robust way to clean the data for AI/ML purposes by removing incomplete data points.
        df_era5_clean = df_era5.dropna().reset_index(drop=True)
        print("Dropped rows with NaN values. Cleaned DataFrame info:")
        print(df_era5_clean.head())
        print(f"Original shape: {df_era5.shape}, Cleaned shape: {df_era5_clean.shape}")
        
        df_era5_clean.to_csv("era5_land_data_processed.csv", index=False)
        print("\nProcessed ERA5-Land data saved to era5_land_data_processed.csv")
        
        return df_era5_clean

    except requests.exceptions.HTTPError as e:
        print(f"\nCDS API HTTP Error fetching ERA5-Land data: {e}")
        print(f"Response URL: {e.response.url}")
        print(f"Response status code: {e.response.status_code}")
        print(f"Response content (first 500 chars): {e.response.text[:500]}...")
        print("Please check all previously mentioned points for CDS API setup.")
        return None
    except FileNotFoundError as e:
        print(f"\nFile System Error: {e}")
        print("The downloaded file was not found where expected, or the expected NetCDF file was not found inside the zip. Check paths or permissions.")
        return None
    except ValueError as e: # Catches our custom HTML error
        print(f"\nFile Content Error: {e}")
        print("The downloaded file appears to be corrupted or an error page. Retrying might help, or there's an intermittent issue on the CDS side.")
        return None
    except Exception as e:
        print(f"\nAn unexpected error occurred during ERA5-Land data fetching: {e}")
        print("Ensure you have 'cdsapi', 'xarray', 'netCDF4', 'numpy', and 'zipfile' (built-in) installed in your environment, and try restarting the kernel.")
        return None

# --- Test the function ---
print("--- Starting ERA5-Land Data Acquisition ---")
era5_df = fetch_era5_land_data(KERALA_BOUNDING_BOX_CDS, date_range_days=5) 

if era5_df is not None:
    print("\nERA5-Land data acquisition and initial processing complete.")
    print(f"Shape of the processed ERA5-Land DataFrame: {era5_df.shape}")
else:
    print("\nERA5-Land data acquisition failed. Please review the error messages above.")


In [None]:
import cdsapi
import xarray as xr # For working with NetCDF data
import pandas as pd
import numpy as np # For numerical operations like sqrt and exp
from datetime import datetime, timedelta
import os # For file operations
import requests # Import requests to catch its specific HTTPError
import netCDF4 # Explicitly importing netCDF4 to ensure it's used as the backend
import zipfile # Import zipfile for unzipping

# --- Configuration ---
# Bounding box for Kerala in CDS API format: [North, West, South, East] (max_lat, min_lon, min_lat, max_lon)
# Adjusted to a more inland area to reduce initial NaNs from coastal data.
KERALA_BOUNDING_BOX_CDS = [12.0, 75.0, 8.5, 77.0] 

def fetch_era5_land_data(bbox_cds, date_range_days=5): 
    """
    Fetches ERA5-Land data for a given bounding box and date range.
    bbox_cds: [North, West, South, East]
    date_range_days: Number of days back from today to fetch data.
    """
    c = cdsapi.Client()

    # Get today's date and calculate the start date for the request
    end_date = datetime.now()
    start_date = end_date - timedelta(days=date_range_days)

    # Generate lists of years, months, days for the request.
    # CDS API requires explicit lists for each date component.
    years = list(range(start_date.year, end_date.year + 1))
    months = sorted(list(set([d.month for d in [start_date, end_date]])))
    days = sorted(list(set([d.day for d in [start_date, end_date]])))
    
    # Format month/day with leading zeros if single digit
    months_str = [str(m).zfill(2) for m in months]
    days_str = [str(d).zfill(2) for d in days]

    # Define variables to fetch based on AI/ML requirements
    variables = [
        '2m_temperature',          
        '2m_dewpoint_temperature', 
        'total_precipitation',     
        '10m_u_component_of_wind', 
        '10m_v_component_of_wind', 
    ]

    # Define times to fetch (hourly for detailed analysis)
    times = [
        '00:00', '01:00', '02:00', '03:00', '04:00', '05:00', '06:00', '07:00',
        '08:00', '09:00', '10:00', '11:00', '12:00', '13:00', '14:00', '15:00',
        '16:00', '17:00', '18:00', '19:00', '20:00', '21:00', '22:00', '23:00',
    ]

    # Original output file name (which is actually a zip)
    zip_output_file = 'era5_land_data.zip'
    # Name of the actual NetCDF file inside the zip (common name from CDS)
    netcdf_file_inside_zip = 'data_0.nc' # Based on your provided PK content

    print(f"Requesting ERA5-Land data for dates: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    print(f"Note: CDS ERA5-Land data has a lag. Latest available is typically a few days ago.")
    print(f"Using bounding box: {bbox_cds}") # Indicate the bbox being used

    try:
        # Delete existing files before downloading to ensure a fresh start
        if os.path.exists(zip_output_file):
            os.remove(zip_output_file)
            print(f"Removed existing {zip_output_file} to ensure fresh download.")
        
        if os.path.exists(netcdf_file_inside_zip):
            os.remove(netcdf_file_inside_zip)
            print(f"Removed existing extracted {netcdf_file_inside_zip}.")

        # Retrieve data from CDS API
        c.retrieve(
            'reanalysis-era5-land', 
            {
                'variable': variables,
                'year': [str(y) for y in years],
                'month': months_str,
                'day': days_str,
                'time': times,
                'area': bbox_cds, 
                'format': 'netcdf', # CDS returns a zip with .nc inside, even if we ask for netcdf
            },
            zip_output_file) # Save as .zip
        print(f"ERA5-Land data downloaded to {zip_output_file}")

        # Basic file sanity check for the ZIP file
        if not os.path.exists(zip_output_file):
            raise FileNotFoundError(f"Downloaded file {zip_output_file} was not found.")
        
        file_size_bytes = os.path.getsize(zip_output_file)
        print(f"Downloaded file size: {file_size_bytes / (1024*1024):.2f} MB")

        # Heuristic check for common HTML error page size (very small, typically < 10KB)
        if file_size_bytes < 10 * 1024: 
            with open(zip_output_file, 'r', encoding='utf-8', errors='ignore') as f:
                content_start = f.read(500)
                if "<!DOCTYPE html>" in content_start.lower() or "<html" in content_start.lower():
                    raise ValueError(f"Downloaded file '{zip_output_file}' appears to be an HTML error page, not a valid ZIP. Size: {file_size_bytes} bytes. Content starts with: '{content_start[:100]}...'")

        # Unzip the downloaded file
        print(f"Unzipping {zip_output_file}...")
        with zipfile.ZipFile(zip_output_file, 'r') as zip_ref:
            # Extract only the specific NetCDF file we expect
            if netcdf_file_inside_zip in zip_ref.namelist():
                zip_ref.extract(netcdf_file_inside_zip)
                print(f"Extracted {netcdf_file_inside_zip} from the zip file.")
            else:
                raise FileNotFoundError(f"Expected file '{netcdf_file_inside_zip}' not found inside the zip. Contents: {zip_ref.namelist()}")

        # Load the data using xarray from the extracted .nc file
        ds = xr.open_dataset(netcdf_file_inside_zip, engine='netCDF4')
        print("\nERA5-Land data loaded into xarray Dataset. Info:")
        print(ds)

        # --- IMPORTANT: The definitive way to handle non-dimension coordinates is to drop them.
        # This prevents `to_dataframe()` from creating extra rows and `NaN`s.
        print("Dropping 'number' and 'expver' non-dimension coordinates...")
        # We check if the coordinates exist before trying to drop them.
        coords_to_drop = [coord for coord in ['number', 'expver'] if coord in ds.coords]
        if coords_to_drop:
            ds = ds.drop_vars(coords_to_drop)
            print(f"Dropped coordinates: {coords_to_drop}")

        print("Dataset info after coordinate cleanup:")
        print(ds)

        # Renaming variables for clarity and consistency with AI/ML requirements
        ds = ds.rename_vars({
            't2m': '2m_temperature',
            'd2m': '2m_dewpoint_temperature',
            'tp': 'total_precipitation',
            'u10': '10m_u_component_of_wind',
            'v10': '10m_v_component_of_wind'
        })
        print("Renamed data variables for clarity.")

        # --- Basic Processing (Feature Engineering/Conversion) ---
        # The calculations will correctly result in NaNs wherever the input data was NaN.
        if '2m_temperature' in ds.data_vars:
            ds['2m_temperature_c'] = ds['2m_temperature'] - 273.15
            print("2m_temperature converted to Celsius.")
        
        if '2m_dewpoint_temperature' in ds.data_vars:
            ds['2m_dewpoint_temperature_c'] = ds['2m_dewpoint_temperature'] - 273.15
            print("2m_dewpoint_temperature converted to Celsius.")

        if '2m_temperature_c' in ds.data_vars and '2m_dewpoint_temperature_c' in ds.data_vars:
            # The calculation will correctly propagate NaNs from the temperature data.
            e_s = 6.1094 * np.exp((17.625 * ds['2m_temperature_c']) / (243.04 + ds['2m_temperature_c']))
            e_a = 6.1094 * np.exp((17.625 * ds['2m_dewpoint_temperature_c']) / (243.04 + ds['2m_dewpoint_temperature_c']))
            ds['relative_humidity_percent'] = (e_a / e_s) * 100
            ds['relative_humidity_percent'] = ds['relative_humidity_percent'].clip(0, 100) 
            print("Relative humidity calculated.")

        if '10m_u_component_of_wind' in ds.data_vars and '10m_v_component_of_wind' in ds.data_vars:
            ds['wind_speed_ms'] = np.sqrt(ds['10m_u_component_of_wind']**2 + ds['10m_v_component_of_wind']**2)
            ds['wind_speed_kmh'] = ds['wind_speed_ms'] * 3.6
            print("Wind speed (km/h) calculated.")
        
        if 'total_precipitation' in ds.data_vars:
            ds['total_precipitation_mm'] = ds['total_precipitation'] * 1000
            print("Total precipitation converted to mm.")

        # Convert the xarray Dataset to a Pandas DataFrame
        df_era5 = ds.to_dataframe().reset_index()

        # Drop original Kelvin temps and u/v components, and intermediate wind speed value
        df_era5 = df_era5.drop(columns=[
            '2m_temperature', '2m_dewpoint_temperature', 
            '10m_u_component_of_wind', '10m_v_component_of_wind',
            'total_precipitation', 'wind_speed_ms'
        ], errors='ignore')
        
        # --- NEW: Drop rows with any NaN values after processing ---
        # This is a robust way to clean the data for AI/ML purposes by removing incomplete data points.
        df_era5_clean = df_era5.dropna().reset_index(drop=True)
        print("Dropped rows with NaN values. Cleaned DataFrame info:")
        print(df_era5_clean.head())
        print(f"Original shape: {df_era5.shape}, Cleaned shape: {df_era5_clean.shape}")
        
        df_era5_clean.to_csv("era5_land_data_processed.csv", index=False)
        print("\nProcessed ERA5-Land data saved to era5_land_data_processed.csv")
        
        return df_era5_clean

    except requests.exceptions.HTTPError as e:
        print(f"\nCDS API HTTP Error fetching ERA5-Land data: {e}")
        print(f"Response URL: {e.response.url}")
        print(f"Response status code: {e.response.status_code}")
        print(f"Response content (first 500 chars): {e.response.text[:500]}...")
        print("Please check all previously mentioned points for CDS API setup.")
        return None
    except FileNotFoundError as e:
        print(f"\nFile System Error: {e}")
        print("The downloaded file was not found where expected, or the expected NetCDF file was not found inside the zip. Check paths or permissions.")
        return None
    except ValueError as e: # Catches our custom HTML error
        print(f"\nFile Content Error: {e}")
        print("The downloaded file appears to be corrupted or an error page. Retrying might help, or there's an intermittent issue on the CDS side.")
        return None
    except Exception as e:
        print(f"\nAn unexpected error occurred during ERA5-Land data fetching: {e}")
        print("Ensure you have 'cdsapi', 'xarray', 'netCDF4', 'numpy', and 'zipfile' (built-in) installed in your environment, and try restarting the kernel.")
        return None

# --- Test the function ---
print("--- Starting ERA5-Land Data Acquisition ---")
era5_df = fetch_era5_land_data(KERALA_BOUNDING_BOX_CDS, date_range_days=5) 

if era5_df is not None:
    print("\nERA5-Land data acquisition and initial processing complete.")
    print(f"Shape of the processed ERA5-Land DataFrame: {era5_df.shape}")
else:
    print("\nERA5-Land data acquisition failed. Please review the error messages above.")


In [None]:
import cdsapi
import xarray as xr # For working with NetCDF data
import pandas as pd
import numpy as np # For numerical operations like sqrt and exp
from datetime import datetime, timedelta
import os # For file operations
import requests # Import requests to catch its specific HTTPError
import netCDF4 # Explicitly importing netCDF4 to ensure it's used as the backend
import zipfile # Import zipfile for unzipping

# --- Configuration ---
# Bounding box for Kerala in CDS API format: [North, West, South, East] (max_lat, min_lon, min_lat, max_lon)
# Adjusted to a more inland area to reduce initial NaNs from coastal data.
KERALA_BOUNDING_BOX_CDS = [12.0, 75.0, 8.5, 77.0] 

def fetch_era5_land_data(bbox_cds, date_range_days=5): 
    """
    Fetches ERA5-Land data for a given bounding box and date range.
    bbox_cds: [North, West, South, East]
    date_range_days: Number of days back from today to fetch data.
    """
    c = cdsapi.Client()

    # Get today's date and calculate the start date for the request
    end_date = datetime.now()
    start_date = end_date - timedelta(days=date_range_days)

    # Generate lists of years, months, days for the request.
    # CDS API requires explicit lists for each date component.
    years = list(range(start_date.year, end_date.year + 1))
    months = sorted(list(set([d.month for d in [start_date, end_date]])))
    days = sorted(list(set([d.day for d in [start_date, end_date]])))
    
    # Format month/day with leading zeros if single digit
    months_str = [str(m).zfill(2) for m in months]
    days_str = [str(d).zfill(2) for d in days]

    # Define variables to fetch based on AI/ML requirements
    variables = [
        '2m_temperature',          
        '2m_dewpoint_temperature', 
        'total_precipitation',     
        '10m_u_component_of_wind', 
        '10m_v_component_of_wind', 
    ]

    # Define times to fetch (hourly for detailed analysis)
    times = [
        '00:00', '01:00', '02:00', '03:00', '04:00', '05:00', '06:00', '07:00',
        '08:00', '09:00', '10:00', '11:00', '12:00', '13:00', '14:00', '15:00',
        '16:00', '17:00', '18:00', '19:00', '20:00', '21:00', '22:00', '23:00',
    ]

    # Original output file name (which is actually a zip)
    zip_output_file = 'era5_land_data.zip'
    # Name of the actual NetCDF file inside the zip (common name from CDS)
    netcdf_file_inside_zip = 'data_0.nc' # Based on your provided PK content

    print(f"Requesting ERA5-Land data for dates: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    print(f"Note: CDS ERA5-Land data has a lag. Latest available is typically a few days ago.")
    print(f"Using bounding box: {bbox_cds}") # Indicate the bbox being used

    try:
        # Delete existing files before downloading to ensure a fresh start
        if os.path.exists(zip_output_file):
            os.remove(zip_output_file)
            print(f"Removed existing {zip_output_file} to ensure fresh download.")
        
        if os.path.exists(netcdf_file_inside_zip):
            os.remove(netcdf_file_inside_zip)
            print(f"Removed existing extracted {netcdf_file_inside_zip}.")

        # Retrieve data from CDS API
        c.retrieve(
            'reanalysis-era5-land', 
            {
                'variable': variables,
                'year': [str(y) for y in years],
                'month': months_str,
                'day': days_str,
                'time': times,
                'area': bbox_cds, 
                'format': 'netcdf', # CDS returns a zip with .nc inside, even if we ask for netcdf
            },
            zip_output_file) # Save as .zip
        print(f"ERA5-Land data downloaded to {zip_output_file}")

        # Basic file sanity check for the ZIP file
        if not os.path.exists(zip_output_file):
            raise FileNotFoundError(f"Downloaded file {zip_output_file} was not found.")
        
        file_size_bytes = os.path.getsize(zip_output_file)
        print(f"Downloaded file size: {file_size_bytes / (1024*1024):.2f} MB")

        # Heuristic check for common HTML error page size (very small, typically < 10KB)
        if file_size_bytes < 10 * 1024: 
            with open(zip_output_file, 'r', encoding='utf-8', errors='ignore') as f:
                content_start = f.read(500)
                if "<!DOCTYPE html>" in content_start.lower() or "<html" in content_start.lower():
                    raise ValueError(f"Downloaded file '{zip_output_file}' appears to be an HTML error page, not a valid ZIP. Size: {file_size_bytes} bytes. Content starts with: '{content_start[:100]}...'")

        # Unzip the downloaded file
        print(f"Unzipping {zip_output_file}...")
        with zipfile.ZipFile(zip_output_file, 'r') as zip_ref:
            # Extract only the specific NetCDF file we expect
            if netcdf_file_inside_zip in zip_ref.namelist():
                zip_ref.extract(netcdf_file_inside_zip)
                print(f"Extracted {netcdf_file_inside_zip} from the zip file.")
            else:
                raise FileNotFoundError(f"Expected file '{netcdf_file_inside_zip}' not found inside the zip. Contents: {zip_ref.namelist()}")

        # Load the data using xarray from the extracted .nc file
        ds = xr.open_dataset(netcdf_file_inside_zip, engine='netCDF4')
        print("\nERA5-Land data loaded into xarray Dataset. Info:")
        print(ds)

        # --- IMPORTANT: The definitive way to handle non-dimension coordinates is to drop them.
        # This prevents `to_dataframe()` from creating extra rows and `NaN`s.
        print("Dropping 'number' and 'expver' non-dimension coordinates...")
        # We check if the coordinates exist before trying to drop them.
        coords_to_drop = [coord for coord in ['number', 'expver'] if coord in ds.coords]
        if coords_to_drop:
            ds = ds.drop_vars(coords_to_drop)
            print(f"Dropped coordinates: {coords_to_drop}")

        print("Dataset info after coordinate cleanup:")
        print(ds)

        # Renaming variables for clarity and consistency with AI/ML requirements
        ds = ds.rename_vars({
            't2m': '2m_temperature',
            'd2m': '2m_dewpoint_temperature',
            'tp': 'total_precipitation',
            'u10': '10m_u_component_of_wind',
            'v10': '10m_v_component_of_wind'
        })
        print("Renamed data variables for clarity.")

        # --- Basic Processing (Feature Engineering/Conversion) ---
        # The calculations will correctly result in NaNs wherever the input data was NaN.
        if '2m_temperature' in ds.data_vars:
            ds['2m_temperature_c'] = ds['2m_temperature'] - 273.15
            print("2m_temperature converted to Celsius.")
        
        if '2m_dewpoint_temperature' in ds.data_vars:
            ds['2m_dewpoint_temperature_c'] = ds['2m_dewpoint_temperature'] - 273.15
            print("2m_dewpoint_temperature converted to Celsius.")

        if '2m_temperature_c' in ds.data_vars and '2m_dewpoint_temperature_c' in ds.data_vars:
            # The calculation will correctly propagate NaNs from the temperature data.
            e_s = 6.1094 * np.exp((17.625 * ds['2m_temperature_c']) / (243.04 + ds['2m_temperature_c']))
            e_a = 6.1094 * np.exp((17.625 * ds['2m_dewpoint_temperature_c']) / (243.04 + ds['2m_dewpoint_temperature_c']))
            ds['relative_humidity_percent'] = (e_a / e_s) * 100
            ds['relative_humidity_percent'] = ds['relative_humidity_percent'].clip(0, 100) 
            print("Relative humidity calculated.")

        if '10m_u_component_of_wind' in ds.data_vars and '10m_v_component_of_wind' in ds.data_vars:
            ds['wind_speed_ms'] = np.sqrt(ds['10m_u_component_of_wind']**2 + ds['10m_v_component_of_wind']**2)
            ds['wind_speed_kmh'] = ds['wind_speed_ms'] * 3.6
            print("Wind speed (km/h) calculated.")
        
        if 'total_precipitation' in ds.data_vars:
            ds['total_precipitation_mm'] = ds['total_precipitation'] * 1000
            print("Total precipitation converted to mm.")

        # Convert the xarray Dataset to a Pandas DataFrame
        df_era5 = ds.to_dataframe().reset_index()

        # Drop original Kelvin temps and u/v components, and intermediate wind speed value
        df_era5 = df_era5.drop(columns=[
            '2m_temperature', '2m_dewpoint_temperature', 
            '10m_u_component_of_wind', '10m_v_component_of_wind',
            'total_precipitation', 'wind_speed_ms'
        ], errors='ignore')
        
        # --- NEW: Drop rows with any NaN values after processing ---
        # This is a robust way to clean the data for AI/ML purposes by removing incomplete data points.
        df_era5_clean = df_era5.dropna().reset_index(drop=True)
        print("Dropped rows with NaN values. Cleaned DataFrame info:")
        print(df_era5_clean.head())
        print(f"Original shape: {df_era5.shape}, Cleaned shape: {df_era5_clean.shape}")
        
        df_era5_clean.to_csv("era5_land_data_processed.csv", index=False)
        print("\nProcessed ERA5-Land data saved to era5_land_data_processed.csv")
        
        return df_era5_clean

    except requests.exceptions.HTTPError as e:
        print(f"\nCDS API HTTP Error fetching ERA5-Land data: {e}")
        print(f"Response URL: {e.response.url}")
        print(f"Response status code: {e.response.status_code}")
        print(f"Response content (first 500 chars): {e.response.text[:500]}...")
        print("Please check all previously mentioned points for CDS API setup.")
        return None
    except FileNotFoundError as e:
        print(f"\nFile System Error: {e}")
        print("The downloaded file was not found where expected, or the expected NetCDF file was not found inside the zip. Check paths or permissions.")
        return None
    except ValueError as e: # Catches our custom HTML error
        print(f"\nFile Content Error: {e}")
        print("The downloaded file appears to be corrupted or an error page. Retrying might help, or there's an intermittent issue on the CDS side.")
        return None
    except Exception as e:
        print(f"\nAn unexpected error occurred during ERA5-Land data fetching: {e}")
        print("Ensure you have 'cdsapi', 'xarray', 'netCDF4', 'numpy', and 'zipfile' (built-in) installed in your environment, and try restarting the kernel.")
        return None

# --- Test the function ---
print("--- Starting ERA5-Land Data Acquisition ---")
era5_df = fetch_era5_land_data(KERALA_BOUNDING_BOX_CDS, date_range_days=5) 

if era5_df is not None:
    print("\nERA5-Land data acquisition and initial processing complete.")
    print(f"Shape of the processed ERA5-Land DataFrame: {era5_df.shape}")
else:
    print("\nERA5-Land data acquisition failed. Please review the error messages above.")


In [None]:
import cdsapi
import xarray as xr # For working with NetCDF data
import pandas as pd
import numpy as np # For numerical operations like sqrt and exp
from datetime import datetime, timedelta
import os # For file operations
import requests # Import requests to catch its specific HTTPError
import netCDF4 # Explicitly importing netCDF4 to ensure it's used as the backend
import zipfile # Import zipfile for unzipping

# --- Configuration ---
# Bounding box for Kerala in CDS API format: [North, West, South, East] (max_lat, min_lon, min_lat, max_lon)
# Adjusted to a more inland area to reduce initial NaNs from coastal data.
KERALA_BOUNDING_BOX_CDS = [12.0, 75.0, 8.5, 77.0] 

def fetch_era5_land_data(bbox_cds, date_range_days=5): 
    """
    Fetches ERA5-Land data for a given bounding box and date range.
    bbox_cds: [North, West, South, East]
    date_range_days: Number of days back from today to fetch data.
    """
    c = cdsapi.Client()

    # Get today's date and calculate the start date for the request
    end_date = datetime.now()
    start_date = end_date - timedelta(days=date_range_days)

    # Generate lists of years, months, days for the request.
    # CDS API requires explicit lists for each date component.
    years = list(range(start_date.year, end_date.year + 1))
    months = sorted(list(set([d.month for d in [start_date, end_date]])))
    days = sorted(list(set([d.day for d in [start_date, end_date]])))
    
    # Format month/day with leading zeros if single digit
    months_str = [str(m).zfill(2) for m in months]
    days_str = [str(d).zfill(2) for d in days]

    # Define variables to fetch based on AI/ML requirements
    variables = [
        '2m_temperature',          
        '2m_dewpoint_temperature', 
        'total_precipitation',     
        '10m_u_component_of_wind', 
        '10m_v_component_of_wind', 
    ]

    # Define times to fetch (hourly for detailed analysis)
    times = [
        '00:00', '01:00', '02:00', '03:00', '04:00', '05:00', '06:00', '07:00',
        '08:00', '09:00', '10:00', '11:00', '12:00', '13:00', '14:00', '15:00',
        '16:00', '17:00', '18:00', '19:00', '20:00', '21:00', '22:00', '23:00',
    ]

    # Original output file name (which is actually a zip)
    zip_output_file = 'era5_land_data.zip'
    # Name of the actual NetCDF file inside the zip (common name from CDS)
    netcdf_file_inside_zip = 'data_0.nc' # Based on your provided PK content

    print(f"Requesting ERA5-Land data for dates: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    print(f"Note: CDS ERA5-Land data has a lag. Latest available is typically a few days ago.")
    print(f"Using bounding box: {bbox_cds}") # Indicate the bbox being used

    try:
        # Delete existing files before downloading to ensure a fresh start
        if os.path.exists(zip_output_file):
            os.remove(zip_output_file)
            print(f"Removed existing {zip_output_file} to ensure fresh download.")
        
        if os.path.exists(netcdf_file_inside_zip):
            os.remove(netcdf_file_inside_zip)
            print(f"Removed existing extracted {netcdf_file_inside_zip}.")

        # Retrieve data from CDS API
        c.retrieve(
            'reanalysis-era5-land', 
            {
                'variable': variables,
                'year': [str(y) for y in years],
                'month': months_str,
                'day': days_str,
                'time': times,
                'area': bbox_cds, 
                'format': 'netcdf', # CDS returns a zip with .nc inside, even if we ask for netcdf
            },
            zip_output_file) # Save as .zip
        print(f"ERA5-Land data downloaded to {zip_output_file}")

        # Basic file sanity check for the ZIP file
        if not os.path.exists(zip_output_file):
            raise FileNotFoundError(f"Downloaded file {zip_output_file} was not found.")
        
        file_size_bytes = os.path.getsize(zip_output_file)
        print(f"Downloaded file size: {file_size_bytes / (1024*1024):.2f} MB")

        # Heuristic check for common HTML error page size (very small, typically < 10KB)
        if file_size_bytes < 10 * 1024: 
            with open(zip_output_file, 'r', encoding='utf-8', errors='ignore') as f:
                content_start = f.read(500)
                if "<!DOCTYPE html>" in content_start.lower() or "<html" in content_start.lower():
                    raise ValueError(f"Downloaded file '{zip_output_file}' appears to be an HTML error page, not a valid ZIP. Size: {file_size_bytes} bytes. Content starts with: '{content_start[:100]}...'")

        # Unzip the downloaded file
        print(f"Unzipping {zip_output_file}...")
        with zipfile.ZipFile(zip_output_file, 'r') as zip_ref:
            # Extract only the specific NetCDF file we expect
            if netcdf_file_inside_zip in zip_ref.namelist():
                zip_ref.extract(netcdf_file_inside_zip)
                print(f"Extracted {netcdf_file_inside_zip} from the zip file.")
            else:
                raise FileNotFoundError(f"Expected file '{netcdf_file_inside_zip}' not found inside the zip. Contents: {zip_ref.namelist()}")

        # Load the data using xarray from the extracted .nc file
        # --- CORRECTED: Changed engine='netCDF4' to engine='netcdf4' ---
        ds = xr.open_dataset(netcdf_file_inside_zip, engine='netcdf4')
        print("\nERA5-Land data loaded into xarray Dataset. Info:")
        print(ds)

        # --- IMPORTANT: The definitive way to handle non-dimension coordinates is to drop them.
        # This prevents `to_dataframe()` from creating extra rows and `NaN`s.
        print("Dropping 'number' and 'expver' non-dimension coordinates...")
        # We check if the coordinates exist before trying to drop them.
        coords_to_drop = [coord for coord in ['number', 'expver'] if coord in ds.coords]
        if coords_to_drop:
            ds = ds.drop_vars(coords_to_drop)
            print(f"Dropped coordinates: {coords_to_drop}")

        print("Dataset info after coordinate cleanup:")
        print(ds)

        # Renaming variables for clarity and consistency with AI/ML requirements
        ds = ds.rename_vars({
            't2m': '2m_temperature',
            'd2m': '2m_dewpoint_temperature',
            'tp': 'total_precipitation',
            'u10': '10m_u_component_of_wind',
            'v10': '10m_v_component_of_wind'
        })
        print("Renamed data variables for clarity.")

        # --- Basic Processing (Feature Engineering/Conversion) ---
        # The calculations will correctly result in NaNs wherever the input data was NaN.
        if '2m_temperature' in ds.data_vars:
            ds['2m_temperature_c'] = ds['2m_temperature'] - 273.15
            print("2m_temperature converted to Celsius.")
        
        if '2m_dewpoint_temperature' in ds.data_vars:
            ds['2m_dewpoint_temperature_c'] = ds['2m_dewpoint_temperature'] - 273.15
            print("2m_dewpoint_temperature converted to Celsius.")

        if '2m_temperature_c' in ds.data_vars and '2m_dewpoint_temperature_c' in ds.data_vars:
            # The calculation will correctly propagate NaNs from the temperature data.
            e_s = 6.1094 * np.exp((17.625 * ds['2m_temperature_c']) / (243.04 + ds['2m_temperature_c']))
            e_a = 6.1094 * np.exp((17.625 * ds['2m_dewpoint_temperature_c']) / (243.04 + ds['2m_dewpoint_temperature_c']))
            ds['relative_humidity_percent'] = (e_a / e_s) * 100
            ds['relative_humidity_percent'] = ds['relative_humidity_percent'].clip(0, 100) 
            print("Relative humidity calculated.")

        if '10m_u_component_of_wind' in ds.data_vars and '10m_v_component_of_wind' in ds.data_vars:
            ds['wind_speed_ms'] = np.sqrt(ds['10m_u_component_of_wind']**2 + ds['10m_v_component_of_wind']**2)
            ds['wind_speed_kmh'] = ds['wind_speed_ms'] * 3.6
            print("Wind speed (km/h) calculated.")
        
        if 'total_precipitation' in ds.data_vars:
            ds['total_precipitation_mm'] = ds['total_precipitation'] * 1000
            print("Total precipitation converted to mm.")

        # Convert the xarray Dataset to a Pandas DataFrame
        df_era5 = ds.to_dataframe().reset_index()

        # Drop original Kelvin temps and u/v components, and intermediate wind speed value
        df_era5 = df_era5.drop(columns=[
            '2m_temperature', '2m_dewpoint_temperature', 
            '10m_u_component_of_wind', '10m_v_component_of_wind',
            'total_precipitation', 'wind_speed_ms'
        ], errors='ignore')
        
        # --- NEW: Drop rows with any NaN values after processing ---
        # This is a robust way to clean the data for AI/ML purposes by removing incomplete data points.
        df_era5_clean = df_era5.dropna().reset_index(drop=True)
        print("Dropped rows with NaN values. Cleaned DataFrame info:")
        print(df_era5_clean.head())
        print(f"Original shape: {df_era5.shape}, Cleaned shape: {df_era5_clean.shape}")
        
        df_era5_clean.to_csv("era5_land_data_processed.csv", index=False)
        print("\nProcessed ERA5-Land data saved to era5_land_data_processed.csv")
        
        return df_era5_clean

    except requests.exceptions.HTTPError as e:
        print(f"\nCDS API HTTP Error fetching ERA5-Land data: {e}")
        print(f"Response URL: {e.response.url}")
        print(f"Response status code: {e.response.status_code}")
        print(f"Response content (first 500 chars): {e.response.text[:500]}...")
        print("Please check all previously mentioned points for CDS API setup.")
        return None
    except FileNotFoundError as e:
        print(f"\nFile System Error: {e}")
        print("The downloaded file was not found where expected, or the expected NetCDF file was not found inside the zip. Check paths or permissions.")
        return None
    except ValueError as e: # Catches our custom HTML error
        print(f"\nFile Content Error: {e}")
        print("The downloaded file appears to be corrupted or an error page. Retrying might help, or there's an intermittent issue on the CDS side.")
        return None
    except Exception as e:
        print(f"\nAn unexpected error occurred during ERA5-Land data fetching: {e}")
        print("Ensure you have 'cdsapi', 'xarray', 'netcdf4', 'numpy', and 'zipfile' (built-in) installed in your environment, and try restarting the kernel.")
        return None

# --- Test the function ---
print("--- Starting ERA5-Land Data Acquisition ---")
era5_df = fetch_era5_land_data(KERALA_BOUNDING_BOX_CDS, date_range_days=5) 

if era5_df is not None:
    print("\nERA5-Land data acquisition and initial processing complete.")
    print(f"Shape of the processed ERA5-Land DataFrame: {era5_df.shape}")
else:
    print("\nERA5-Land data acquisition failed. Please review the error messages above.")


In [None]:
import cdsapi
import xarray as xr # For working with NetCDF data
import pandas as pd
import numpy as np # For numerical operations like sqrt and exp
from datetime import datetime, timedelta
import os # For file operations
import requests # Import requests to catch its specific HTTPError
import netCDF4 # Explicitly importing netCDF4 to ensure it's used as the backend
import zipfile # Import zipfile for unzipping

# --- Configuration ---
# Bounding box for Kerala in CDS API format: [North, West, South, East] (max_lat, min_lon, min_lat, max_lon)
# Adjusted to a more inland area to reduce initial NaNs from coastal data.
KERALA_BOUNDING_BOX_CDS = [12.0, 75.0, 8.5, 77.0] 

def fetch_era5_land_data(bbox_cds, date_range_days=5): 
    """
    Fetches ERA5-Land data for a given bounding box and date range.
    bbox_cds: [North, West, South, East]
    date_range_days: Number of days back from today to fetch data.
    """
    c = cdsapi.Client()

    # Get today's date and calculate the start date for the request
    end_date = datetime.now()
    start_date = end_date - timedelta(days=date_range_days)

    # Generate lists of years, months, days for the request.
    # CDS API requires explicit lists for each date component.
    years = list(range(start_date.year, end_date.year + 1))
    months = sorted(list(set([d.month for d in [start_date, end_date]])))
    days = sorted(list(set([d.day for d in [start_date, end_date]])))
    
    # Format month/day with leading zeros if single digit
    months_str = [str(m).zfill(2) for m in months]
    days_str = [str(d).zfill(2) for d in days]

    # Define variables to fetch based on AI/ML requirements
    variables = [
        '2m_temperature',          
        '2m_dewpoint_temperature', 
        'total_precipitation',     
        '10m_u_component_of_wind', 
        '10m_v_component_of_wind', 
    ]

    # Define times to fetch (hourly for detailed analysis)
    times = [
        '00:00', '01:00', '02:00', '03:00', '04:00', '05:00', '06:00', '07:00',
        '08:00', '09:00', '10:00', '11:00', '12:00', '13:00', '14:00', '15:00',
        '16:00', '17:00', '18:00', '19:00', '20:00', '21:00', '22:00', '23:00',
    ]

    # Original output file name (which is actually a zip)
    zip_output_file = 'era5_land_data.zip'
    # Name of the actual NetCDF file inside the zip (common name from CDS)
    netcdf_file_inside_zip = 'data_0.nc' # Based on your provided PK content

    print(f"Requesting ERA5-Land data for dates: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    print(f"Note: CDS ERA5-Land data has a lag. Latest available is typically a few days ago.")
    print(f"Using bounding box: {bbox_cds}") # Indicate the bbox being used

    try:
        # Delete existing files before downloading to ensure a fresh start
        if os.path.exists(zip_output_file):
            os.remove(zip_output_file)
            print(f"Removed existing {zip_output_file} to ensure fresh download.")
        
        if os.path.exists(netcdf_file_inside_zip):
            os.remove(netcdf_file_inside_zip)
            print(f"Removed existing extracted {netcdf_file_inside_zip}.")

        # Retrieve data from CDS API
        c.retrieve(
            'reanalysis-era5-land', 
            {
                'variable': variables,
                'year': [str(y) for y in years],
                'month': months_str,
                'day': days_str,
                'time': times,
                'area': bbox_cds, 
                'format': 'netcdf', # CDS returns a zip with .nc inside, even if we ask for netcdf
            },
            zip_output_file) # Save as .zip
        print(f"ERA5-Land data downloaded to {zip_output_file}")

        # Basic file sanity check for the ZIP file
        if not os.path.exists(zip_output_file):
            raise FileNotFoundError(f"Downloaded file {zip_output_file} was not found.")
        
        file_size_bytes = os.path.getsize(zip_output_file)
        print(f"Downloaded file size: {file_size_bytes / (1024*1024):.2f} MB")

        # Heuristic check for common HTML error page size (very small, typically < 10KB)
        if file_size_bytes < 10 * 1024: 
            with open(zip_output_file, 'r', encoding='utf-8', errors='ignore') as f:
                content_start = f.read(500)
                if "<!DOCTYPE html>" in content_start.lower() or "<html" in content_start.lower():
                    raise ValueError(f"Downloaded file '{zip_output_file}' appears to be an HTML error page, not a valid ZIP. Size: {file_size_bytes} bytes. Content starts with: '{content_start[:100]}...'")

        # Unzip the downloaded file
        print(f"Unzipping {zip_output_file}...")
        with zipfile.ZipFile(zip_output_file, 'r') as zip_ref:
            # Extract only the specific NetCDF file we expect
            if netcdf_file_inside_zip in zip_ref.namelist():
                zip_ref.extract(netcdf_file_inside_zip)
                print(f"Extracted {netcdf_file_inside_zip} from the zip file.")
            else:
                raise FileNotFoundError(f"Expected file '{netcdf_file_inside_zip}' not found inside the zip. Contents: {zip_ref.namelist()}")

        # Load the data using xarray from the extracted .nc file
        ds = xr.open_dataset(netcdf_file_inside_zip, engine='netcdf4')
        print("\nERA5-Land data loaded into xarray Dataset. Info:")
        print(ds)

        # --- IMPORTANT: The definitive way to handle non-dimension coordinates is to drop them.
        # This prevents `to_dataframe()` from creating extra rows and `NaN`s.
        print("Dropping 'number' and 'expver' non-dimension coordinates...")
        # We check if the coordinates exist before trying to drop them.
        coords_to_drop = [coord for coord in ['number', 'expver'] if coord in ds.coords]
        if coords_to_drop:
            ds = ds.drop_vars(coords_to_drop)
            print(f"Dropped coordinates: {coords_to_drop}")

        print("Dataset info after coordinate cleanup:")
        print(ds)

        # Renaming variables for clarity and consistency with AI/ML requirements
        ds = ds.rename_vars({
            't2m': '2m_temperature',
            'd2m': '2m_dewpoint_temperature',
            'tp': 'total_precipitation',
            'u10': '10m_u_component_of_wind',
            'v10': '10m_v_component_of_wind'
        })
        print("Renamed data variables for clarity.")

        # --- Basic Processing (Feature Engineering/Conversion) ---
        # The calculations will correctly result in NaNs wherever the input data was NaN.
        if '2m_temperature' in ds.data_vars:
            ds['2m_temperature_c'] = ds['2m_temperature'] - 273.15
            print("2m_temperature converted to Celsius.")
        
        if '2m_dewpoint_temperature' in ds.data_vars:
            ds['2m_dewpoint_temperature_c'] = ds['2m_dewpoint_temperature'] - 273.15
            print("2m_dewpoint_temperature converted to Celsius.")

        if '2m_temperature_c' in ds.data_vars and '2m_dewpoint_temperature_c' in ds.data_vars:
            # The calculation will correctly propagate NaNs from the temperature data.
            e_s = 6.1094 * np.exp((17.625 * ds['2m_temperature_c']) / (243.04 + ds['2m_temperature_c']))
            e_a = 6.1094 * np.exp((17.625 * ds['2m_dewpoint_temperature_c']) / (243.04 + ds['2m_dewpoint_temperature_c']))
            ds['relative_humidity_percent'] = (e_a / e_s) * 100
            ds['relative_humidity_percent'] = ds['relative_humidity_percent'].clip(0, 100) 
            print("Relative humidity calculated.")

        if '10m_u_component_of_wind' in ds.data_vars and '10m_v_component_of_wind' in ds.data_vars:
            ds['wind_speed_ms'] = np.sqrt(ds['10m_u_component_of_wind']**2 + ds['10m_v_component_of_wind']**2)
            ds['wind_speed_kmh'] = ds['wind_speed_ms'] * 3.6
            print("Wind speed (km/h) calculated.")
        
        if 'total_precipitation' in ds.data_vars:
            ds['total_precipitation_mm'] = ds['total_precipitation'] * 1000
            print("Total precipitation converted to mm.")

        # Convert the xarray Dataset to a Pandas DataFrame
        df_era5 = ds.to_dataframe().reset_index()

        # Drop original Kelvin temps and u/v components, and intermediate wind speed value
        df_era5 = df_era5.drop(columns=[
            '2m_temperature', '2m_dewpoint_temperature', 
            '10m_u_component_of_wind', '10m_v_component_of_wind',
            'total_precipitation', 'wind_speed_ms'
        ], errors='ignore')
        
        # --- NEW: Drop rows with any NaN values after processing ---
        # This is a robust way to clean the data for AI/ML purposes by removing incomplete data points.
        df_era5_clean = df_era5.dropna().reset_index(drop=True)
        print("Dropped rows with NaN values. Cleaned DataFrame info:")
        print(df_era5_clean.head())
        print(f"Original shape: {df_era5.shape}, Cleaned shape: {df_era5_clean.shape}")
        
        df_era5_clean.to_csv("era5_land_data_processed.csv", index=False)
        print("\nProcessed ERA5-Land data saved to era5_land_data_processed.csv")
        
        return df_era5_clean

    except requests.exceptions.HTTPError as e:
        print(f"\nCDS API HTTP Error fetching ERA5-Land data: {e}")
        print(f"Response URL: {e.response.url}")
        print(f"Response status code: {e.response.status_code}")
        print(f"Response content (first 500 chars): {e.response.text[:500]}...")
        print("Please check all previously mentioned points for CDS API setup.")
        return None
    except FileNotFoundError as e:
        print(f"\nFile System Error: {e}")
        print("The downloaded file was not found where expected, or the expected NetCDF file was not found inside the zip. Check paths or permissions.")
        return None
    except ValueError as e: # Catches our custom HTML error
        print(f"\nFile Content Error: {e}")
        print("The downloaded file appears to be corrupted or an error page. Retrying might help, or there's an intermittent issue on the CDS side.")
        return None
    except Exception as e:
        print(f"\nAn unexpected error occurred during ERA5-Land data fetching: {e}")
        print("Ensure you have 'cdsapi', 'xarray', 'netcdf4', 'numpy', and 'zipfile' (built-in) installed in your environment, and try restarting the kernel.")
        return None

# --- Test the function ---
print("--- Starting ERA5-Land Data Acquisition ---")
era5_df = fetch_era5_land_data(KERALA_BOUNDING_BOX_CDS, date_range_days=5) 

if era5_df is not None:
    print("\nERA5-Land data acquisition and initial processing complete.")
    print(f"Shape of the processed ERA5-Land DataFrame: {era5_df.shape}")
else:
    print("\nERA5-Land data acquisition failed. Please review the error messages above.")


In [None]:
import cdsapi
import xarray as xr # For working with NetCDF data
import pandas as pd
import numpy as np # For numerical operations like sqrt and exp
from datetime import datetime, timedelta
import os # For file operations
import requests # Import requests to catch its specific HTTPError
import netCDF4 # Explicitly importing netCDF4 to ensure it's used as the backend
import zipfile # Import zipfile for unzipping

# --- Configuration ---
# Bounding box for Kerala in CDS API format: [North, West, South, East] (max_lat, min_lon, min_lat, max_lon)
# Adjusted to a more inland area to reduce initial NaNs from coastal data.
KERALA_BOUNDING_BOX_CDS = [12.0, 75.0, 8.5, 77.0] 

def fetch_era5_land_data(bbox_cds, date_range_days=5): 
    """
    Fetches ERA5-Land data for a given bounding box and date range.
    bbox_cds: [North, West, South, East]
    date_range_days: Number of days back from today to fetch data.
    """
    c = cdsapi.Client()

    # Get today's date and calculate the start date for the request
    end_date = datetime.now()
    start_date = end_date - timedelta(days=date_range_days)

    # Generate lists of years, months, days for the request.
    # CDS API requires explicit lists for each date component.
    years = list(range(start_date.year, end_date.year + 1))
    months = sorted(list(set([d.month for d in [start_date, end_date]])))
    days = sorted(list(set([d.day for d in [start_date, end_date]])))
    
    # Format month/day with leading zeros if single digit
    months_str = [str(m).zfill(2) for m in months]
    days_str = [str(d).zfill(2) for d in days]

    # Define variables to fetch based on AI/ML requirements
    variables = [
        '2m_temperature',          
        '2m_dewpoint_temperature', 
        'total_precipitation',     
        '10m_u_component_of_wind', 
        '10m_v_component_of_wind', 
    ]

    # Define times to fetch (hourly for detailed analysis)
    times = [
        '00:00', '01:00', '02:00', '03:00', '04:00', '05:00', '06:00', '07:00',
        '08:00', '09:00', '10:00', '11:00', '12:00', '13:00', '14:00', '15:00',
        '16:00', '17:00', '18:00', '19:00', '20:00', '21:00', '22:00', '23:00',
    ]

    # Original output file name (which is actually a zip)
    zip_output_file = 'era5_land_data.zip'
    # Name of the actual NetCDF file inside the zip (common name from CDS)
    netcdf_file_inside_zip = 'data_0.nc' # Based on your provided PK content

    print(f"Requesting ERA5-Land data for dates: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    print(f"Note: CDS ERA5-Land data has a lag. Latest available is typically a few days ago.")
    print(f"Using bounding box: {bbox_cds}") # Indicate the bbox being used

    try:
        # Delete existing files before downloading to ensure a fresh start
        if os.path.exists(zip_output_file):
            os.remove(zip_output_file)
            print(f"Removed existing {zip_output_file} to ensure fresh download.")
        
        if os.path.exists(netcdf_file_inside_zip):
            os.remove(netcdf_file_inside_zip)
            print(f"Removed existing extracted {netcdf_file_inside_zip}.")

        # Retrieve data from CDS API
        c.retrieve(
            'reanalysis-era5-land', 
            {
                'variable': variables,
                'year': [str(y) for y in years],
                'month': months_str,
                'day': days_str,
                'time': times,
                'area': bbox_cds, 
                'format': 'netcdf', # CDS returns a zip with .nc inside, even if we ask for netcdf
            },
            zip_output_file) # Save as .zip
        print(f"ERA5-Land data downloaded to {zip_output_file}")

        # Basic file sanity check for the ZIP file
        if not os.path.exists(zip_output_file):
            raise FileNotFoundError(f"Downloaded file {zip_output_file} was not found.")
        
        file_size_bytes = os.path.getsize(zip_output_file)
        print(f"Downloaded file size: {file_size_bytes / (1024*1024):.2f} MB")

        # Heuristic check for common HTML error page size (very small, typically < 10KB)
        if file_size_bytes < 10 * 1024: 
            with open(zip_output_file, 'r', encoding='utf-8', errors='ignore') as f:
                content_start = f.read(500)
                if "<!DOCTYPE html>" in content_start.lower() or "<html" in content_start.lower():
                    raise ValueError(f"Downloaded file '{zip_output_file}' appears to be an HTML error page, not a valid ZIP. Size: {file_size_bytes} bytes. Content starts with: '{content_start[:100]}...'")

        # Unzip the downloaded file
        print(f"Unzipping {zip_output_file}...")
        with zipfile.ZipFile(zip_output_file, 'r') as zip_ref:
            # Extract only the specific NetCDF file we expect
            if netcdf_file_inside_zip in zip_ref.namelist():
                zip_ref.extract(netcdf_file_inside_zip)
                print(f"Extracted {netcdf_file_inside_zip} from the zip file.")
            else:
                raise FileNotFoundError(f"Expected file '{netcdf_file_inside_zip}' not found inside the zip. Contents: {zip_ref.namelist()}")

        # Load the data using xarray from the extracted .nc file
        ds = xr.open_dataset(netcdf_file_inside_zip, engine='netcdf4')
        print("\nERA5-Land data loaded into xarray Dataset. Info:")
        print(ds)

        # --- IMPORTANT: The definitive way to handle non-dimension coordinates is to drop them.
        # This prevents `to_dataframe()` from creating extra rows and `NaN`s.
        print("Dropping 'number' and 'expver' non-dimension coordinates...")
        # We check if the coordinates exist before trying to drop them.
        coords_to_drop = [coord for coord in ['number', 'expver'] if coord in ds.coords]
        if coords_to_drop:
            ds = ds.drop_vars(coords_to_drop)
            print(f"Dropped coordinates: {coords_to_drop}")

        print("Dataset info after coordinate cleanup:")
        print(ds)

        # Renaming variables for clarity and consistency with AI/ML requirements
        ds = ds.rename_vars({
            't2m': '2m_temperature',
            'd2m': '2m_dewpoint_temperature',
            'tp': 'total_precipitation',
            'u10': '10m_u_component_of_wind',
            'v10': '10m_v_component_of_wind'
        })
        print("Renamed data variables for clarity.")

        # --- Basic Processing (Feature Engineering/Conversion) ---
        # The calculations will correctly result in NaNs wherever the input data was NaN.
        if '2m_temperature' in ds.data_vars:
            ds['2m_temperature_c'] = ds['2m_temperature'] - 273.15
            print("2m_temperature converted to Celsius.")
        
        if '2m_dewpoint_temperature' in ds.data_vars:
            ds['2m_dewpoint_temperature_c'] = ds['2m_dewpoint_temperature'] - 273.15
            print("2m_dewpoint_temperature converted to Celsius.")

        if '2m_temperature_c' in ds.data_vars and '2m_dewpoint_temperature_c' in ds.data_vars:
            # The calculation will correctly propagate NaNs from the temperature data.
            e_s = 6.1094 * np.exp((17.625 * ds['2m_temperature_c']) / (243.04 + ds['2m_temperature_c']))
            e_a = 6.1094 * np.exp((17.625 * ds['2m_dewpoint_temperature_c']) / (243.04 + ds['2m_dewpoint_temperature_c']))
            ds['relative_humidity_percent'] = (e_a / e_s) * 100
            ds['relative_humidity_percent'] = ds['relative_humidity_percent'].clip(0, 100) 
            print("Relative humidity calculated.")

        if '10m_u_component_of_wind' in ds.data_vars and '10m_v_component_of_wind' in ds.data_vars:
            ds['wind_speed_ms'] = np.sqrt(ds['10m_u_component_of_wind']**2 + ds['10m_v_component_of_wind']**2)
            ds['wind_speed_kmh'] = ds['wind_speed_ms'] * 3.6
            print("Wind speed (km/h) calculated.")
        
        if 'total_precipitation' in ds.data_vars:
            ds['total_precipitation_mm'] = ds['total_precipitation'] * 1000
            print("Total precipitation converted to mm.")

        # Convert the xarray Dataset to a Pandas DataFrame
        df_era5 = ds.to_dataframe().reset_index()

        # Drop original Kelvin temps and u/v components, and intermediate wind speed value
        df_era5 = df_era5.drop(columns=[
            '2m_temperature', '2m_dewpoint_temperature', 
            '10m_u_component_of_wind', '10m_v_component_of_wind',
            'total_precipitation', 'wind_speed_ms'
        ], errors='ignore')
        
        # --- NEW: Drop rows with any NaN values after processing ---
        # This is a robust way to clean the data for AI/ML purposes by removing incomplete data points.
        df_era5_clean = df_era5.dropna().reset_index(drop=True)
        print("Dropped rows with NaN values. Cleaned DataFrame info:")
        print(df_era5_clean.head())
        print(f"Original shape: {df_era5.shape}, Cleaned shape: {df_era5_clean.shape}")
        
        df_era5_clean.to_csv("era5_land_data_processed.csv", index=False)
        print("\nProcessed ERA5-Land data saved to era5_land_data_processed.csv")
        
        return df_era5_clean

    except requests.exceptions.HTTPError as e:
        print(f"\nCDS API HTTP Error fetching ERA5-Land data: {e}")
        print(f"Response URL: {e.response.url}")
        print(f"Response status code: {e.response.status_code}")
        print(f"Response content (first 500 chars): {e.response.text[:500]}...")
        print("Please check all previously mentioned points for CDS API setup.")
        return None
    except FileNotFoundError as e:
        print(f"\nFile System Error: {e}")
        print("The downloaded file was not found where expected, or the expected NetCDF file was not found inside the zip. Check paths or permissions.")
        return None
    except ValueError as e: # Catches our custom HTML error
        print(f"\nFile Content Error: {e}")
        print("The downloaded file appears to be corrupted or an error page. Retrying might help, or there's an intermittent issue on the CDS side.")
        return None
    except Exception as e:
        print(f"\nAn unexpected error occurred during ERA5-Land data fetching: {e}")
        print("Ensure you have 'cdsapi', 'xarray', 'netcdf4', 'numpy', and 'zipfile' (built-in) installed in your environment, and try restarting the kernel.")
        return None

# --- Test the function ---
print("--- Starting ERA5-Land Data Acquisition ---")
era5_df = fetch_era5_land_data(KERALA_BOUNDING_BOX_CDS, date_range_days=5) 

if era5_df is not None:
    print("\nERA5-Land data acquisition and initial processing complete.")
    print(f"Shape of the processed ERA5-Land DataFrame: {era5_df.shape}")
else:
    print("\nERA5-Land data acquisition failed. Please review the error messages above.")



In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from io import StringIO

# The ERA5-Land data is embedded as a string, ensuring the script is self-contained.
# This prevents issues with loading external files.
data_string = """
valid_time,2m_temperature_c,total_precipitation_mm,relative_humidity_percent,latitude,longitude
2025-07-26 12:00:00,28.8,1.2,78.5,10.85,76.25
2025-07-26 12:00:00,28.5,1.5,79.1,10.85,76.5
2025-07-26 12:00:00,29.1,1.1,77.9,10.6,76.25
2025-07-26 18:00:00,27.5,2.1,82.3,10.85,76.25
2025-07-26 18:00:00,27.1,2.5,83.0,10.85,76.5
2025-07-26 18:00:00,27.8,2.0,81.5,10.6,76.25
2025-07-27 00:00:00,26.5,3.5,85.1,10.85,76.25
2025-07-27 00:00:00,26.1,3.9,85.8,10.85,76.5
2025-07-27 00:00:00,26.8,3.3,84.5,10.6,76.25
2025-07-27 06:00:00,28.0,0.8,80.1,10.85,76.25
2025-07-27 06:00:00,27.7,0.9,80.7,10.85,76.5
2025-07-27 06:00:00,28.3,0.7,79.5,10.6,76.25
2025-07-27 12:00:00,29.5,0.2,76.8,10.85,76.25
2025-07-27 12:00:00,29.2,0.3,77.5,10.85,76.5
2025-07-27 12:00:00,29.8,0.1,76.1,10.6,76.25
2025-07-27 18:00:00,28.2,1.5,81.0,10.85,76.25
2025-07-27 18:00:00,27.9,1.8,81.7,10.85,76.5
2025-07-27 18:00:00,28.5,1.4,80.3,10.6,76.25
2025-07-28 00:00:00,27.0,2.8,84.1,10.85,76.25
2025-07-28 00:00:00,26.6,3.2,84.8,10.85,76.5
2025-07-28 00:00:00,27.3,2.6,83.5,10.6,76.25
2025-07-28 06:00:00,28.5,0.5,79.5,10.85,76.25
2025-07-28 06:00:00,28.2,0.6,80.1,10.85,76.5
2025-07-28 06:00:00,28.8,0.4,78.9,10.6,76.25
2025-07-28 12:00:00,30.0,0.1,75.0,10.85,76.25
2025-07-28 12:00:00,29.7,0.2,75.7,10.85,76.5
2025-07-28 12:00:00,30.3,0.0,74.5,10.6,76.25
2025-07-28 18:00:00,28.8,1.1,79.8,10.85,76.25
2025-07-28 18:00:00,28.5,1.4,80.5,10.85,76.5
2025-07-28 18:00:00,29.1,1.0,79.2,10.6,76.25
2025-07-29 00:00:00,27.5,2.0,83.2,10.85,76.25
2025-07-29 00:00:00,27.1,2.4,83.9,10.85,76.5
2025-07-29 00:00:00,27.8,1.8,82.5,10.6,76.25
2025-07-29 06:00:00,29.0,0.3,78.5,10.85,76.25
2025-07-29 06:00:00,28.7,0.4,79.1,10.85,76.5
2025-07-29 06:00:00,29.3,0.2,77.9,10.6,76.25
2025-07-29 12:00:00,30.5,0.0,73.8,10.85,76.25
2025-07-29 12:00:00,30.2,0.1,74.5,10.85,76.5
2025-07-29 12:00:00,30.8,0.0,73.2,10.6,76.25
2025-07-29 18:00:00,29.3,0.8,78.9,10.85,76.25
2025-07-29 18:00:00,29.0,1.0,79.6,10.85,76.5
2025-07-29 18:00:00,29.6,0.7,78.3,10.6,76.25
2025-07-30 00:00:00,28.0,1.5,82.5,10.85,76.25
2025-07-30 00:00:00,27.6,1.8,83.2,10.85,76.5
2025-07-30 00:00:00,28.3,1.4,81.8,10.6,76.25
2025-07-30 06:00:00,29.5,0.1,77.5,10.85,76.25
2025-07-30 06:00:00,29.2,0.2,78.1,10.85,76.5
2025-07-30 06:00:00,29.8,0.1,76.9,10.6,76.25
2025-07-30 12:00:00,31.0,0.0,72.5,10.85,76.25
2025-07-30 12:00:00,30.7,0.0,73.2,10.85,76.5
2025-07-30 12:00:00,31.3,0.0,71.8,10.6,76.25
2025-07-30 18:00:00,29.8,0.5,77.8,10.85,76.25
2025-07-30 18:00:00,29.5,0.7,78.5,10.85,76.5
2025-07-30 18:00:00,30.1,0.4,77.1,10.6,76.25
"""

# Read the data from the string into a pandas DataFrame.
df = pd.read_csv(StringIO(data_string))

# 1. Feature Engineering: Extract time-based features from 'valid_time'
df['valid_time'] = pd.to_datetime(df['valid_time'])
df['hour'] = df['valid_time'].dt.hour
df['day_of_week'] = df['valid_time'].dt.dayofweek # Monday=0, Sunday=6
df['month'] = df['valid_time'].dt.month

# 2. Data Cleaning: Drop the original timestamp column now that we have the features.
df = df.drop('valid_time', axis=1)

# 3. Define features to scale.
# We will drop the latitude and longitude columns for now as they are categorical in nature
# for this small dataset, and we'll focus on the time-series and environmental features.
features_to_scale = [
    '2m_temperature_c',
    'total_precipitation_mm',
    'relative_humidity_percent',
    'hour',
    'day_of_week',
    'month'
]

# 4. Scaling: Use StandardScaler to normalize the feature data.
# This ensures all features contribute equally to the model.
scaler = StandardScaler()
df[features_to_scale] = scaler.fit_transform(df[features_to_scale])

# Display the first few rows of the processed DataFrame.
# You can see the new 'hour', 'day_of_week', and 'month' columns,
# and all numerical columns are now scaled.
print("Processed Data (First 5 rows):")
print(df.head())


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from io import StringIO

# --- 1. Load the preprocessed data ---
# For a self-contained example, we'll embed the preprocessed data string.
# In a real project, you would load this from the 'era5_land_data_processed.csv' file.
preprocessed_data_string = """
2m_temperature_c,total_precipitation_mm,relative_humidity_percent,latitude,longitude,hour,day_of_week,month
0.053217,0.103259,-0.230504,10.85,76.25,0.349563,1.056034,0.0
-0.193100,0.396733,-0.053948,10.85,76.5,0.349563,1.056034,0.0
0.299533,0.005435,-0.407060,10.6,76.25,0.349563,1.056034,0.0
-1.014155,0.983681,0.887685,10.85,76.25,1.24844,1.056034,0.0
-1.342576,1.37498,1.093667,10.85,76.5,1.24844,1.056034,0.0
-0.767838,0.885856,0.669894,10.6,76.25,1.24844,1.056034,0.0
-1.921349,2.15758,1.529267,10.85,76.25,-1.33285,1.370597,0.0
-2.24977,2.548879,1.735249,10.85,76.5,-1.33285,1.370597,0.0
-1.675032,1.96193,1.352712,10.6,76.25,-1.33285,1.370597,0.0
-0.405051,-0.385863,0.180424,10.85,76.25,-0.441994,1.370597,0.0
-0.651368,-0.288039,0.357597,10.85,76.5,-0.441994,1.370597,0.0
-0.113994,-0.483687,-0.021105,10.6,76.25,-0.441994,1.370597,0.0
0.54585,-0.972836,-0.613041,10.85,76.25,0.349563,1.370597,0.0
0.299533,-0.875012,-0.407060,10.85,76.5,0.349563,1.370597,0.0
0.792166,-1.07066,-0.789648,10.6,76.25,0.349563,1.370597,0.0
-0.767838,0.592383,0.509088,10.85,76.25,1.24844,1.370597,0.0
-1.014155,0.885856,0.714488,10.85,76.5,1.24844,1.370597,0.0
-0.537210,0.494558,0.269601,10.6,76.25,1.24844,1.370597,0.0
-1.581515,1.668453,1.200155,10.85,76.25,-1.33285,1.68516,0.0
-1.921349,2.059752,1.405788,10.85,76.5,-1.33285,1.68516,0.0
-1.25337,1.472804,1.017382,10.6,76.25,-1.33285,1.68516,0.0
-0.1931, -0.679336, 0.082728, 10.85, 76.25, -0.441994, 1.68516, 0.0
-0.440625, -0.581512, 0.259899, 10.85, 76.5, -0.441994, 1.68516, 0.0
0.053217, -0.777161, -0.097479, 10.6, 76.25, -0.441994, 1.68516, 0.0
1.01848, -1.07066, -0.995629, 10.85, 76.25, 0.349563, 1.68516, 0.0
0.771658, -0.972836, -0.789648, 10.85, 76.5, 0.349563, 1.68516, 0.0
1.264795, -1.168485, -1.132039, 10.6, 76.25, 0.349563, 1.68516, 0.0
0.053217, -0.092415, -0.097479, 10.85, 76.25, 1.24844, 1.68516, 0.0
-0.193100,0.103259,0.113069,10.85,76.5,1.24844,1.68516,0.0
0.299533,-0.189564,-0.173877,10.6,76.25,1.24844,1.68516,0.0
-1.014155,0.983681,0.887685,10.85,76.25,-1.33285,1.999723,0.0
-1.342576,1.37498,1.093667,10.85,76.5,-1.33285,1.999723,0.0
-0.767838,0.885856,0.669894,10.6,76.25,-1.33285,1.999723,0.0
-0.405051,-0.777161,-0.230504,10.85,76.25,-0.441994,1.999723,0.0
-0.651368,-0.679336,-0.053948,10.85,76.5,-0.441994,1.999723,0.0
-0.113994,-0.875012,-0.407060,10.6,76.25,-0.441994,1.999723,0.0
0.54585,-1.168485,-1.319595,10.85,76.25,0.349563,1.999723,0.0
0.299533,-1.07066,-1.132039,10.85,76.5,0.349563,1.999723,0.0
0.792166,-1.26631, -1.496206, 10.6, 76.25, 0.349563, 1.999723, 0.0
-0.767838, -0.189564, -0.113069, 10.85, 76.25, 1.24844, 1.999723, 0.0
-1.014155, 0.005435, 0.097899, 10.85, 76.5, 1.24844, 1.999723, 0.0
-0.53721, -0.288039, -0.32049, 10.6, 76.25, 1.24844, 1.999723, 0.0
-1.342576, 0.592383, 0.828699, 10.85, 76.25, -1.33285, 2.314286, 0.0
-1.671, 0.885856, 1.03468, 10.85, 76.5, -1.33285, 2.314286, 0.0
-1.119702, 0.494558, 0.615296, 10.6, 76.25, -1.33285, 2.314286, 0.0
-0.405051, -0.972836, -0.440263, 10.85, 76.25, -0.441994, 2.314286, 0.0
-0.651368, -0.875012, -0.263706, 10.85, 76.5, -0.441994, 2.314286, 0.0
-0.113994, -0.972836, -0.630043, 10.6, 76.25, -0.441994, 2.314286, 0.0
1.614631, -1.168485, -1.636653, 10.85, 76.25, 0.349563, 2.314286, 0.0
1.242784, -1.168485, -1.458269, 10.85, 76.5, 0.349563, 2.314286, 0.0
1.890666, -1.26631, -1.813208, 10.6, 76.25, 0.349563, 2.314286, 0.0
-0.038164, -0.777161, -0.457896, 10.85, 76.25, 1.24844, 2.314286, 0.0
-0.286383, -0.581512, -0.278546, 10.85, 76.5, 1.24844, 2.314286, 0.0
0.198305, -0.679336, -0.492576, 10.6, 76.25, 1.24844, 2.314286, 0.0
"""
df = pd.read_csv(StringIO(preprocessed_data_string))

print("DataFrame loaded successfully:")
print(df.head())
print("\nDataFrame Info:")
print(df.info())

# --- 2. Split the data into features (X) and target (y) ---
# We'll use all other features to predict the 2m temperature.
X = df.drop('2m_temperature_c', axis=1) # All columns except '2m_temperature_c'
y = df['2m_temperature_c'] # The target variable we want to predict

# --- 3. Split the data into training and testing sets ---
# We use a 70/30 split for training and testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print("\nData Split Complete:")
print(f"Training features shape: {X_train.shape}")
print(f"Testing features shape: {X_test.shape}")
print(f"Training target shape: {y_train.shape}")
print(f"Testing target shape: {y_test.shape}")

# --- 4. Train the Linear Regression model ---
model = LinearRegression()
print("\nTraining Linear Regression model...")
model.fit(X_train, y_train)
print("Model training complete.")

# --- 5. Make predictions on the test set ---
y_pred = model.predict(X_test)

# --- 6. Evaluate the model ---
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nModel Evaluation:")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R-squared (R²): {r2:.4f}")

# You can also inspect the model's coefficients to see which features are most important
print("\nModel Coefficients:")
for feature, coef in zip(X.columns, model.coef_):
    print(f"{feature}: {coef:.4f}")



In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from io import StringIO

# --- 1. Load the preprocessed data ---
# We'll use the same embedded string for a self-contained example.
preprocessed_data_string = """
2m_temperature_c,total_precipitation_mm,relative_humidity_percent,latitude,longitude,hour,day_of_week,month
0.053217,0.103259,-0.230504,10.85,76.25,0.349563,1.056034,0.0
-0.193100,0.396733,-0.053948,10.85,76.5,0.349563,1.056034,0.0
0.299533,0.005435,-0.407060,10.6,76.25,0.349563,1.056034,0.0
-1.014155,0.983681,0.887685,10.85,76.25,1.24844,1.056034,0.0
-1.342576,1.37498,1.093667,10.85,76.5,1.24844,1.056034,0.0
-0.767838,0.885856,0.669894,10.6,76.25,1.24844,1.056034,0.0
-1.921349,2.15758,1.529267,10.85,76.25,-1.33285,1.370597,0.0
-2.24977,2.548879,1.735249,10.85,76.5,-1.33285,1.370597,0.0
-1.675032,1.96193,1.352712,10.6,76.25,-1.33285,1.370597,0.0
-0.405051,-0.385863,0.180424,10.85,76.25,-0.441994,1.370597,0.0
-0.651368,-0.288039,0.357597,10.85,76.5,-0.441994,1.370597,0.0
-0.113994,-0.483687,-0.021105,10.6,76.25,-0.441994,1.370597,0.0
0.54585,-0.972836,-0.613041,10.85,76.25,0.349563,1.370597,0.0
0.299533,-0.875012,-0.407060,10.85,76.5,0.349563,1.370597,0.0
0.792166,-1.07066,-0.789648,10.6,76.25,0.349563,1.370597,0.0
-0.767838,0.592383,0.509088,10.85,76.25,1.24844,1.370597,0.0
-1.014155,0.885856,0.714488,10.85,76.5,1.24844,1.370597,0.0
-0.537210,0.494558,0.269601,10.6,76.25,1.24844,1.370597,0.0
-1.581515,1.668453,1.200155,10.85,76.25,-1.33285,1.68516,0.0
-1.921349,2.059752,1.405788,10.85,76.5,-1.33285,1.68516,0.0
-1.25337,1.472804,1.017382,10.6,76.25,-1.33285,1.68516,0.0
-0.1931, -0.679336, 0.082728, 10.85, 76.25, -0.441994, 1.68516, 0.0
-0.440625, -0.581512, 0.259899, 10.85, 76.5, -0.441994, 1.68516, 0.0
0.053217, -0.777161, -0.097479, 10.6, 76.25, -0.441994, 1.68516, 0.0
1.01848, -1.07066, -0.995629, 10.85, 76.25, 0.349563, 1.68516, 0.0
0.771658, -0.972836, -0.789648, 10.85, 76.5, 0.349563, 1.68516, 0.0
1.264795, -1.168485, -1.132039, 10.6, 76.25, 0.349563, 1.68516, 0.0
0.053217, -0.092415, -0.097479, 10.85, 76.25, 1.24844, 1.68516, 0.0
-0.193100,0.103259,0.113069,10.85,76.5,1.24844,1.68516,0.0
0.299533,-0.189564,-0.173877,10.6,76.25,1.24844,1.68516,0.0
-1.014155,0.983681,0.887685,10.85,76.25,-1.33285,1.999723,0.0
-1.342576,1.37498,1.093667,10.85,76.5,-1.33285,1.999723,0.0
-0.767838,0.885856,0.669894,10.6,76.25,-1.33285,1.999723,0.0
-0.405051,-0.777161,-0.230504,10.85,76.25,-0.441994,1.999723,0.0
-0.651368,-0.679336,-0.053948,10.85,76.5,-0.441994,1.999723,0.0
-0.113994,-0.875012,-0.407060,10.6,76.25,-0.441994,1.999723,0.0
0.54585,-1.168485,-1.319595,10.85,76.25,0.349563,1.999723,0.0
0.299533,-1.07066,-1.132039,10.85,76.5,0.349563,1.999723,0.0
0.792166,-1.26631, -1.496206, 10.6, 76.25, 0.349563, 1.999723, 0.0
-0.767838, -0.189564, -0.113069, 10.85, 76.25, 1.24844, 1.999723, 0.0
-1.014155, 0.005435, 0.097899, 10.85, 76.5, 1.24844, 1.999723, 0.0
-0.53721, -0.288039, -0.32049, 10.6, 76.25, 1.24844, 1.999723, 0.0
-1.342576, 0.592383, 0.828699, 10.85, 76.25, -1.33285, 2.314286, 0.0
-1.671, 0.885856, 1.03468, 10.85, 76.5, -1.33285, 2.314286, 0.0
-1.119702, 0.494558, 0.615296, 10.6, 76.25, -1.33285, 2.314286, 0.0
-0.405051, -0.972836, -0.440263, 10.85, 76.25, -0.441994, 2.314286, 0.0
-0.651368, -0.875012, -0.263706, 10.85, 76.5, -0.441994, 2.314286, 0.0
-0.113994, -0.972836, -0.630043, 10.6, 76.25, -0.441994, 2.314286, 0.0
1.614631, -1.168485, -1.636653, 10.85, 76.25, 0.349563, 2.314286, 0.0
1.242784, -1.168485, -1.458269, 10.85, 76.5, 0.349563, 2.314286, 0.0
1.890666, -1.26631, -1.813208, 10.6, 76.25, 0.349563, 2.314286, 0.0
-0.038164, -0.777161, -0.457896, 10.85, 76.25, 1.24844, 2.314286, 0.0
-0.286383, -0.581512, -0.278546, 10.85, 76.5, 1.24844, 2.314286, 0.0
0.198305, -0.679336, -0.492576, 10.6, 76.25, 1.24844, 2.314286, 0.0
"""
df = pd.read_csv(StringIO(preprocessed_data_string))

# --- 2. Re-split the data into features (X) and target (y) ---
# We need to do this again to have the same variables in the same script.
X = df.drop('2m_temperature_c', axis=1) # All columns except '2m_temperature_c'
y = df['2m_temperature_c'] # The target variable we want to predict
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# --- 3. Train the Linear Regression model ---
model = LinearRegression()
model.fit(X_train, y_train)

# --- 4. Make predictions on the test set ---
y_pred = model.predict(X_test)

# --- 5. Create a scatter plot to visualize predictions ---
plt.figure(figsize=(10, 6))

# Plot the actual vs. predicted values
plt.scatter(y_test, y_pred, alpha=0.7, color='dodgerblue', label='Predictions')

# Plot the perfect prediction line (y=x)
plt.plot(y_test, y_test, color='red', linestyle='--', label='Perfect Prediction')

# Add titles and labels for clarity
plt.title('Actual vs. Predicted 2m Temperature', fontsize=16)
plt.xlabel('Actual 2m Temperature (Normalized)', fontsize=12)
plt.ylabel('Predicted 2m Temperature (Normalized)', fontsize=12)
plt.legend()
plt.grid(True)
plt.show()



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from io import StringIO

# --- 1. Load the preprocessed data ---
# We'll use the same embedded string for a self-contained example.
preprocessed_data_string = """
2m_temperature_c,total_precipitation_mm,relative_humidity_percent,latitude,longitude,hour,day_of_week,month
0.053217,0.103259,-0.230504,10.85,76.25,0.349563,1.056034,0.0
-0.193100,0.396733,-0.053948,10.85,76.5,0.349563,1.056034,0.0
0.299533,0.005435,-0.407060,10.6,76.25,0.349563,1.056034,0.0
-1.014155,0.983681,0.887685,10.85,76.25,1.24844,1.056034,0.0
-1.342576,1.37498,1.093667,10.85,76.5,1.24844,1.056034,0.0
-0.767838,0.885856,0.669894,10.6,76.25,1.24844,1.056034,0.0
-1.921349,2.15758,1.529267,10.85,76.25,-1.33285,1.370597,0.0
-2.24977,2.548879,1.735249,10.85,76.5,-1.33285,1.370597,0.0
-1.675032,1.96193,1.352712,10.6,76.25,-1.33285,1.370597,0.0
-0.405051,-0.385863,0.180424,10.85,76.25,-0.441994,1.370597,0.0
-0.651368,-0.288039,0.357597,10.85,76.5,-0.441994,1.370597,0.0
-0.113994,-0.483687,-0.021105,10.6,76.25,-0.441994,1.370597,0.0
0.54585,-0.972836,-0.613041,10.85,76.25,0.349563,1.370597,0.0
0.299533,-0.875012,-0.407060,10.85,76.5,0.349563,1.370597,0.0
0.792166,-1.07066,-0.789648,10.6,76.25,0.349563,1.370597,0.0
-0.767838,0.592383,0.509088,10.85,76.25,1.24844,1.370597,0.0
-1.014155,0.885856,0.714488,10.85,76.5,1.24844,1.370597,0.0
-0.537210,0.494558,0.269601,10.6,76.25,1.24844,1.370597,0.0
-1.581515,1.668453,1.200155,10.85,76.25,-1.33285,1.68516,0.0
-1.921349,2.059752,1.405788,10.85,76.5,-1.33285,1.68516,0.0
-1.25337,1.472804,1.017382,10.6,76.25,-1.33285,1.68516,0.0
-0.1931, -0.679336, 0.082728, 10.85, 76.25, -0.441994, 1.68516, 0.0
-0.440625, -0.581512, 0.259899, 10.85, 76.5, -0.441994, 1.68516, 0.0
0.053217, -0.777161, -0.097479, 10.6, 76.25, -0.441994, 1.68516, 0.0
1.01848, -1.07066, -0.995629, 10.85, 76.25, 0.349563, 1.68516, 0.0
0.771658, -0.972836, -0.789648, 10.85, 76.5, 0.349563, 1.68516, 0.0
1.264795, -1.168485, -1.132039, 10.6, 76.25, 0.349563, 1.68516, 0.0
0.053217, -0.092415, -0.097479, 10.85, 76.25, 1.24844, 1.68516, 0.0
-0.193100,0.103259,0.113069,10.85,76.5,1.24844,1.68516,0.0
0.299533,-0.189564,-0.173877,10.6,76.25,1.24844,1.68516,0.0
-1.014155,0.983681,0.887685,10.85,76.25,-1.33285,1.999723,0.0
-1.342576,1.37498,1.093667,10.85,76.5,-1.33285,1.999723,0.0
-0.767838,0.885856,0.669894,10.6,76.25,-1.33285,1.999723,0.0
-0.405051,-0.777161,-0.230504,10.85,76.25,-0.441994,1.999723,0.0
-0.651368,-0.679336,-0.053948,10.85,76.5,-0.441994,1.999723,0.0
-0.113994,-0.875012,-0.407060,10.6,76.25,-0.441994,1.999723,0.0
0.54585,-1.168485,-1.319595,10.85,76.25,0.349563,1.999723,0.0
0.299533,-1.07066,-1.132039,10.85,76.5,0.349563,1.999723,0.0
0.792166,-1.26631, -1.496206, 10.6, 76.25, 0.349563, 1.999723, 0.0
-0.767838, -0.189564, -0.113069, 10.85, 76.25, 1.24844, 1.999723, 0.0
-1.014155, 0.005435, 0.097899, 10.85, 76.5, 1.24844, 1.999723, 0.0
-0.53721, -0.288039, -0.32049, 10.6, 76.25, 1.24844, 1.999723, 0.0
-1.342576, 0.592383, 0.828699, 10.85, 76.25, -1.33285, 2.314286, 0.0
-1.671, 0.885856, 1.03468, 10.85, 76.5, -1.33285, 2.314286, 0.0
-1.119702, 0.494558, 0.615296, 10.6, 76.25, -1.33285, 2.314286, 0.0
-0.405051, -0.972836, -0.440263, 10.85, 76.25, -0.441994, 2.314286, 0.0
-0.651368, -0.875012, -0.263706, 10.85, 76.5, -0.441994, 2.314286, 0.0
-0.113994, -0.972836, -0.630043, 10.6, 76.25, -0.441994, 2.314286, 0.0
1.614631, -1.168485, -1.636653, 10.85, 76.25, 0.349563, 2.314286, 0.0
1.242784, -1.168485, -1.458269, 10.85, 76.5, 0.349563, 2.314286, 0.0
1.890666, -1.26631, -1.813208, 10.6, 76.25, 0.349563, 2.314286, 0.0
-0.038164, -0.777161, -0.457896, 10.85, 76.25, 1.24844, 2.314286, 0.0
-0.286383, -0.581512, -0.278546, 10.85, 76.5, 1.24844, 2.314286, 0.0
0.198305, -0.679336, -0.492576, 10.6, 76.25, 1.24844, 2.314286, 0.0
"""
df = pd.read_csv(StringIO(preprocessed_data_string))

print("DataFrame loaded successfully:")
print(df.head())

# --- 2. Split the data into features (X) and target (y) ---
# We'll use all other features to predict the 2m temperature.
X = df.drop('2m_temperature_c', axis=1) # All columns except '2m_temperature_c'
y = df['2m_temperature_c'] # The target variable we want to predict

# --- 3. Split the data into training and testing sets ---
# We use a 70/30 split for training and testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print("\nData Split Complete:")
print(f"Training features shape: {X_train.shape}")
print(f"Testing features shape: {X_test.shape}")
print(f"Training target shape: {y_train.shape}")
print(f"Testing target shape: {y_test.shape}")

# --- 4. Train the Random Forest Regressor model ---
# This is the key change from the previous script!
# We instantiate a Random Forest model instead of a Linear Regression model.
model = RandomForestRegressor(n_estimators=100, random_state=42)
print("\nTraining Random Forest Regressor model...")
model.fit(X_train, y_train)
print("Model training complete.")

# --- 5. Make predictions on the test set ---
y_pred = model.predict(X_test)

# --- 6. Evaluate the model ---
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nRandom Forest Model Evaluation:")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R-squared (R²): {r2:.4f}")

# Random Forest also has a feature_importances_ attribute that is very useful
print("\nFeature Importances (Top 5):")
importances = pd.Series(model.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False).head())


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt
from io import StringIO

# --- 1. Load the preprocessed data ---
# We'll use the same embedded string for a self-contained example.
preprocessed_data_string = """
2m_temperature_c,total_precipitation_mm,relative_humidity_percent,latitude,longitude,hour,day_of_week,month
0.053217,0.103259,-0.230504,10.85,76.25,0.349563,1.056034,0.0
-0.193100,0.396733,-0.053948,10.85,76.5,0.349563,1.056034,0.0
0.299533,0.005435,-0.407060,10.6,76.25,0.349563,1.056034,0.0
-1.014155,0.983681,0.887685,10.85,76.25,1.24844,1.056034,0.0
-1.342576,1.37498,1.093667,10.85,76.5,1.24844,1.056034,0.0
-0.767838,0.885856,0.669894,10.6,76.25,1.24844,1.056034,0.0
-1.921349,2.15758,1.529267,10.85,76.25,-1.33285,1.370597,0.0
-2.24977,2.548879,1.735249,10.85,76.5,-1.33285,1.370597,0.0
-1.675032,1.96193,1.352712,10.6,76.25,-1.33285,1.370597,0.0
-0.405051,-0.385863,0.180424,10.85,76.25,-0.441994,1.370597,0.0
-0.651368,-0.288039,0.357597,10.85,76.5,-0.441994,1.370597,0.0
-0.113994,-0.483687,-0.021105,10.6,76.25,-0.441994,1.370597,0.0
0.54585,-0.972836,-0.613041,10.85,76.25,0.349563,1.370597,0.0
0.299533,-0.875012,-0.407060,10.85,76.5,0.349563,1.370597,0.0
0.792166,-1.07066,-0.789648,10.6,76.25,0.349563,1.370597,0.0
-0.767838,0.592383,0.509088,10.85,76.25,1.24844,1.370597,0.0
-1.014155,0.885856,0.714488,10.85,76.5,1.24844,1.370597,0.0
-0.537210,0.494558,0.269601,10.6,76.25,1.24844,1.370597,0.0
-1.581515,1.668453,1.200155,10.85,76.25,-1.33285,1.68516,0.0
-1.921349,2.059752,1.405788,10.85,76.5,-1.33285,1.68516,0.0
-1.25337,1.472804,1.017382,10.6,76.25,-1.33285,1.68516,0.0
-0.1931, -0.679336, 0.082728, 10.85, 76.25, -0.441994, 1.68516, 0.0
-0.440625, -0.581512, 0.259899, 10.85, 76.5, -0.441994, 1.68516, 0.0
0.053217, -0.777161, -0.097479, 10.6, 76.25, -0.441994, 1.68516, 0.0
1.01848, -1.07066, -0.995629, 10.85, 76.25, 0.349563, 1.68516, 0.0
0.771658, -0.972836, -0.789648, 10.85, 76.5, 0.349563, 1.68516, 0.0
1.264795, -1.168485, -1.132039, 10.6, 76.25, 0.349563, 1.68516, 0.0
0.053217, -0.092415, -0.097479, 10.85, 76.25, 1.24844, 1.68516, 0.0
-0.193100,0.103259,0.113069,10.85,76.5,1.24844,1.68516,0.0
0.299533,-0.189564,-0.173877,10.6,76.25,1.24844,1.68516,0.0
-1.014155,0.983681,0.887685,10.85,76.25,-1.33285,1.999723,0.0
-1.342576,1.37498,1.093667,10.85,76.5,-1.33285,1.999723,0.0
-0.767838,0.885856,0.669894,10.6,76.25,-1.33285,1.999723,0.0
-0.405051,-0.777161,-0.230504,10.85,76.25,-0.441994,1.999723,0.0
-0.651368,-0.679336,-0.053948,10.85,76.5,-0.441994,1.999723,0.0
-0.113994,-0.875012,-0.407060,10.6,76.25,-0.441994,1.999723,0.0
0.54585,-1.168485,-1.319595,10.85,76.25,0.349563,1.999723,0.0
0.299533,-1.07066,-1.132039,10.85,76.5,0.349563,1.999723,0.0
0.792166,-1.26631, -1.496206, 10.6, 76.25, 0.349563, 1.999723, 0.0
-0.767838, -0.189564, -0.113069, 10.85, 76.25, 1.24844, 1.999723, 0.0
-1.014155, 0.005435, 0.097899, 10.85, 76.5, 1.24844, 1.999723, 0.0
-0.53721, -0.288039, -0.32049, 10.6, 76.25, 1.24844, 1.999723, 0.0
-1.342576, 0.592383, 0.828699, 10.85, 76.25, -1.33285, 2.314286, 0.0
-1.671, 0.885856, 1.03468, 10.85, 76.5, -1.33285, 2.314286, 0.0
-1.119702, 0.494558, 0.615296, 10.6, 76.25, -1.33285, 2.314286, 0.0
-0.405051, -0.972836, -0.440263, 10.85, 76.25, -0.441994, 2.314286, 0.0
-0.651368, -0.875012, -0.263706, 10.85, 76.5, -0.441994, 2.314286, 0.0
-0.113994, -0.972836, -0.630043, 10.6, 76.25, -0.441994, 2.314286, 0.0
1.614631, -1.168485, -1.636653, 10.85, 76.25, 0.349563, 2.314286, 0.0
1.242784, -1.168485, -1.458269, 10.85, 76.5, 0.349563, 2.314286, 0.0
1.890666, -1.26631, -1.813208, 10.6, 76.25, 0.349563, 2.314286, 0.0
-0.038164, -0.777161, -0.457896, 10.85, 76.25, 1.24844, 2.314286, 0.0
-0.286383, -0.581512, -0.278546, 10.85, 76.5, 1.24844, 2.314286, 0.0
0.198305, -0.679336, -0.492576, 10.6, 76.25, 1.24844, 2.314286, 0.0
"""
df = pd.read_csv(StringIO(preprocessed_data_string))

# --- 2. Split the data into features (X) and target (y) ---
X = df.drop('2m_temperature_c', axis=1)
y = df['2m_temperature_c']

# --- 3. Split the data into training and testing sets ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# --- 4. Train the Random Forest Regressor model ---
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# --- 5. Make predictions on the test set ---
y_pred = model.predict(X_test)

# --- 6. Evaluate the model (Metrics) ---
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Random Forest Model Evaluation:")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R-squared (R²): {r2:.4f}")

# --- 7. Visualize the results ---
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.7, color='dodgerblue', label='Predictions')

# Plot the ideal 'perfect prediction' line
max_val = max(y_test.max(), y_pred.max())
min_val = min(y_test.min(), y_pred.min())
plt.plot([min_val, max_val], [min_val, max_val], '--', color='red', label='Perfect Prediction')

plt.title('Actual vs. Predicted 2m Temperature (Random Forest)', fontsize=16)
plt.xlabel('Actual Temperature ($°C$)', fontsize=12)
plt.ylabel('Predicted Temperature ($°C$)', fontsize=12)
plt.legend()
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()

# --- 8. Print Feature Importances ---
print("\nFeature Importances:")
importances = pd.Series(model.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False))



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from io import StringIO
import pickle

# --- 1. Load the preprocessed data ---
preprocessed_data_string = """
2m_temperature_c,total_precipitation_mm,relative_humidity_percent,latitude,longitude,hour,day_of_week,month
0.053217,0.103259,-0.230504,10.85,76.25,0.349563,1.056034,0.0
-0.193100,0.396733,-0.053948,10.85,76.5,0.349563,1.056034,0.0
0.299533,0.005435,-0.407060,10.6,76.25,0.349563,1.056034,0.0
-1.014155,0.983681,0.887685,10.85,76.25,1.24844,1.056034,0.0
-1.342576,1.37498,1.093667,10.85,76.5,1.24844,1.056034,0.0
-0.767838,0.885856,0.669894,10.6,76.25,1.24844,1.056034,0.0
-1.921349,2.15758,1.529267,10.85,76.25,-1.33285,1.370597,0.0
-2.24977,2.548879,1.735249,10.85,76.5,-1.33285,1.370597,0.0
-1.675032,1.96193,1.352712,10.6,76.25,-1.33285,1.370597,0.0
-0.405051,-0.385863,0.180424,10.85,76.25,-0.441994,1.370597,0.0
-0.651368,-0.288039,0.357597,10.85,76.5,-0.441994,1.370597,0.0
-0.113994,-0.483687,-0.021105,10.6,76.25,-0.441994,1.370597,0.0
0.54585,-0.972836,-0.613041,10.85,76.25,0.349563,1.370597,0.0
0.299533,-0.875012,-0.407060,10.85,76.5,0.349563,1.370597,0.0
0.792166,-1.07066,-0.789648,10.6,76.25,0.349563,1.370597,0.0
-0.767838,0.592383,0.509088,10.85,76.25,1.24844,1.370597,0.0
-1.014155,0.885856,0.714488,10.85,76.5,1.24844,1.370597,0.0
-0.537210,0.494558,0.269601,10.6,76.25,1.24844,1.370597,0.0
-1.581515,1.668453,1.200155,10.85,76.25,-1.33285,1.68516,0.0
-1.921349,2.059752,1.405788,10.85,76.5,-1.33285,1.68516,0.0
-1.25337,1.472804,1.017382,10.6,76.25,-1.33285,1.68516,0.0
-0.1931, -0.679336, 0.082728, 10.85, 76.25, -0.441994, 1.68516, 0.0
-0.440625, -0.581512, 0.259899, 10.85, 76.5, -0.441994, 1.68516, 0.0
0.053217, -0.777161, -0.097479, 10.6, 76.25, -0.441994, 1.68516, 0.0
1.01848, -1.07066, -0.995629, 10.85, 76.25, 0.349563, 1.68516, 0.0
0.771658, -0.972836, -0.789648, 10.85, 76.5, 0.349563, 1.68516, 0.0
1.264795, -1.168485, -1.132039, 10.6, 76.25, 0.349563, 1.68516, 0.0
0.053217, -0.092415, -0.097479, 10.85, 76.25, 1.24844, 1.68516, 0.0
-0.193100,0.103259,0.113069,10.85,76.5,1.24844,1.68516,0.0
0.299533,-0.189564,-0.173877,10.6,76.25,1.24844,1.68516,0.0
-1.014155,0.983681,0.887685,10.85,76.25,-1.33285,1.999723,0.0
-1.342576,1.37498,1.093667,10.85,76.5,-1.33285,1.999723,0.0
-0.767838,0.885856,0.669894,10.6,76.25,-1.33285,1.999723,0.0
-0.405051,-0.777161,-0.230504,10.85,76.25,-0.441994,1.999723,0.0
-0.651368,-0.679336,-0.053948,10.85,76.5,-0.441994,1.999723,0.0
-0.113994,-0.875012,-0.407060,10.6,76.25,-0.441994,1.999723,0.0
0.54585,-1.168485,-1.319595,10.85,76.25,0.349563,1.999723,0.0
0.299533,-1.07066,-1.132039,10.85,76.5,0.349563,1.999723,0.0
0.792166,-1.26631, -1.496206, 10.6, 76.25, 0.349563, 1.999723, 0.0
-0.767838, -0.189564, -0.113069, 10.85, 76.25, 1.24844, 1.999723, 0.0
-1.014155, 0.005435, 0.097899, 10.85, 76.5, 1.24844, 1.999723, 0.0
-0.53721, -0.288039, -0.32049, 10.6, 76.25, 1.24844, 1.999723, 0.0
-1.342576, 0.592383, 0.828699, 10.85, 76.25, -1.33285, 2.314286, 0.0
-1.671, 0.885856, 1.03468, 10.85, 76.5, -1.33285, 2.314286, 0.0
-1.119702, 0.494558, 0.615296, 10.6, 76.25, -1.33285, 2.314286, 0.0
-0.405051, -0.972836, -0.440263, 10.85, 76.25, -0.441994, 2.314286, 0.0
-0.651368, -0.875012, -0.263706, 10.85, 76.5, -0.441994, 2.314286, 0.0
-0.113994, -0.972836, -0.630043, 10.6, 76.25, -0.441994, 2.314286, 0.0
1.614631, -1.168485, -1.636653, 10.85, 76.25, 0.349563, 2.314286, 0.0
1.242784, -1.168485, -1.458269, 10.85, 76.5, 0.349563, 2.314286, 0.0
1.890666, -1.26631, -1.813208, 10.6, 76.25, 0.349563, 2.314286, 0.0
-0.038164, -0.777161, -0.457896, 10.85, 76.25, 1.24844, 2.314286, 0.0
-0.286383, -0.581512, -0.278546, 10.85, 76.5, 1.24844, 2.314286, 0.0
0.198305, -0.679336, -0.492576, 10.6, 76.25, 1.24844, 2.314286, 0.0
"""
df = pd.read_csv(StringIO(preprocessed_data_string))

# --- 2. Split the data into features (X) and target (y) ---
X = df.drop('2m_temperature_c', axis=1)
y = df['2m_temperature_c']

# --- 3. Split the data into training and testing sets ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# --- 4. Train the Random Forest Regressor model ---
print("Training Random Forest Regressor model...")
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
print("Model training complete.")

# --- 5. Save the trained model to a file ---
model_filename = 'random_forest_model.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(model, file)

print(f"\nModel successfully saved to '{model_filename}'")
print("You can now load this file to use the model for new predictions without retraining.")


In [None]:
import pandas as pd
from io import StringIO
import pickle

# --- 1. Load the preprocessed data (for feature names) ---
# We use this to make sure the new data has the same columns and order as the training data.
preprocessed_data_string = """
2m_temperature_c,total_precipitation_mm,relative_humidity_percent,latitude,longitude,hour,day_of_week,month
0.053217,0.103259,-0.230504,10.85,76.25,0.349563,1.056034,0.0
-0.193100,0.396733,-0.053948,10.85,76.5,0.349563,1.056034,0.0
0.299533,0.005435,-0.407060,10.6,76.25,0.349563,1.056034,0.0
"""
df_features = pd.read_csv(StringIO(preprocessed_data_string))
# Drop the target variable to get the feature names
feature_columns = df_features.drop('2m_temperature_c', axis=1).columns

# --- 2. Load the trained model from the file ---
model_filename = 'random_forest_model.pkl'
print(f"Loading model from '{model_filename}'...")
try:
    with open(model_filename, 'rb') as file:
        loaded_model = pickle.load(file)
    print("Model loaded successfully.")
except FileNotFoundError:
    print(f"Error: The file '{model_filename}' was not found. Please ensure it exists in the current directory.")
    exit()

# --- 3. Prepare new data for prediction ---
# This is an example of a new data point. The values must be preprocessed
# in the same way the training data was.
new_data = {
    'total_precipitation_mm': [0.5],
    'relative_humidity_percent': [0.75],
    'latitude': [10.85],
    'longitude': [76.25],
    'hour': [12],
    'day_of_week': [3],
    'month': [7]
}
new_df = pd.DataFrame(new_data)

# Ensure the new DataFrame has the same columns and order as the training data
new_df_aligned = new_df.reindex(columns=feature_columns, fill_value=0)

# --- 4. Make a prediction using the loaded model ---
print("\nMaking prediction on new data...")
prediction = loaded_model.predict(new_df_aligned)
print(f"The predicted 2m_temperature_c is: {prediction[0]:.4f}")



In [None]:
# app.py
from flask import Flask, request, render_template_string
import pandas as pd
from io import StringIO
import pickle

# Initialize the Flask application
app = Flask(__name__)

# --- 1. Define the HTML template for the web page ---
# We're embedding the HTML directly in the Python file for simplicity.
HTML_TEMPLATE = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Temperature Predictor</title>
    <script src="https://cdn.tailwindcss.com"></script>
    <link rel="preconnect" href="https://fonts.googleapis.com">
    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;700&display=swap" rel="stylesheet">
    <style>
        body {
            font-family: 'Inter', sans-serif;
        }
    </style>
</head>
<body class="bg-gray-100 flex items-center justify-center min-h-screen p-4">
    <div class="bg-white rounded-lg shadow-xl p-8 max-w-lg w-full">
        <h1 class="text-3xl font-bold text-gray-800 text-center mb-6">Predict 2m Temperature</h1>
        <p class="text-center text-gray-600 mb-8">Enter the conditions below to get a temperature prediction from our trained Random Forest model.</p>

        <form action="/predict" method="post" class="space-y-6">
            <div class="grid grid-cols-1 md:grid-cols-2 gap-4">
                <div>
                    <label for="precipitation" class="block text-sm font-medium text-gray-700">Total Precipitation (mm)</label>
                    <input type="number" step="any" id="precipitation" name="total_precipitation_mm" required
                           class="mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-indigo-500 focus:ring-indigo-500 sm:text-sm p-2">
                </div>
                <div>
                    <label for="humidity" class="block text-sm font-medium text-gray-700">Relative Humidity (%)</label>
                    <input type="number" step="any" id="humidity" name="relative_humidity_percent" required
                           class="mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-indigo-500 focus:ring-indigo-500 sm:text-sm p-2">
                </div>
            </div>
            <div class="grid grid-cols-1 md:grid-cols-2 gap-4">
                <div>
                    <label for="latitude" class="block text-sm font-medium text-gray-700">Latitude</label>
                    <input type="number" step="any" id="latitude" name="latitude" required
                           class="mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-indigo-500 focus:ring-indigo-500 sm:text-sm p-2">
                </div>
                <div>
                    <label for="longitude" class="block text-sm font-medium text-gray-700">Longitude</label>
                    <input type="number" step="any" id="longitude" name="longitude" required
                           class="mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-indigo-500 focus:ring-indigo-500 sm:text-sm p-2">
                </div>
            </div>
            <div class="grid grid-cols-1 md:grid-cols-3 gap-4">
                <div>
                    <label for="hour" class="block text-sm font-medium text-gray-700">Hour (0-23)</label>
                    <input type="number" id="hour" name="hour" required
                           class="mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-indigo-500 focus:ring-indigo-500 sm:text-sm p-2">
                </div>
                <div>
                    <label for="day" class="block text-sm font-medium text-gray-700">Day of Week (0-6)</label>
                    <input type="number" id="day" name="day_of_week" required
                           class="mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-indigo-500 focus:ring-indigo-500 sm:text-sm p-2">
                </div>
                <div>
                    <label for="month" class="block text-sm font-medium text-gray-700">Month (1-12)</label>
                    <input type="number" id="month" name="month" required
                           class="mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-indigo-500 focus:ring-indigo-500 sm:text-sm p-2">
                </div>
            </div>
            <button type="submit"
                    class="w-full flex justify-center py-2 px-4 border border-transparent rounded-md shadow-sm text-sm font-medium text-white bg-indigo-600 hover:bg-indigo-700 focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-indigo-500">
                Get Prediction
            </button>
        </form>

        {% if prediction %}
        <div class="mt-8 p-4 bg-indigo-50 rounded-md">
            <h2 class="text-xl font-semibold text-gray-800 text-center">Prediction Result</h2>
            <p class="text-center mt-2 text-lg font-medium text-indigo-700">The predicted 2m temperature is: <span class="font-bold">{{ prediction }}°C</span></p>
        </div>
        {% endif %}

    </div>
</body>
</html>
"""

# --- 2. Load the model and get feature names once when the app starts ---
# This saves time by not loading the model on every request.
model_filename = 'random_forest_model.pkl'
try:
    with open(model_filename, 'rb') as file:
        loaded_model = pickle.load(file)
    print(f"Model '{model_filename}' loaded successfully.")

    # We need the feature names from the original training data.
    preprocessed_data_string = """
    2m_temperature_c,total_precipitation_mm,relative_humidity_percent,latitude,longitude,hour,day_of_week,month
    0.053217,0.103259,-0.230504,10.85,76.25,0.349563,1.056034,0.0
    -0.193100,0.396733,-0.053948,10.85,76.5,0.349563,1.056034,0.0
    0.299533,0.005435,-0.407060,10.6,76.25,0.349563,1.056034,0.0
    """
    df_features = pd.read_csv(StringIO(preprocessed_data_string))
    feature_columns = df_features.drop('2m_temperature_c', axis=1).columns
    print("Feature columns successfully loaded.")

except FileNotFoundError:
    print(f"Error: The file '{model_filename}' was not found. Please ensure it is in the same directory.")
    loaded_model = None
    feature_columns = None
    exit()

# --- 3. Define the main route for the home page ---
@app.route('/')
def home():
    """
    Renders the HTML form.
    """
    return render_template_string(HTML_TEMPLATE, prediction=None)

# --- 4. Define the prediction route ---
@app.route('/predict', methods=['POST'])
def predict():
    """
    Handles the form submission, makes a prediction, and renders the result.
    """
    if loaded_model is None or feature_columns is None:
        return "Error: Model or feature names not loaded.", 500

    # Get the data from the form
    try:
        new_data = {
            'total_precipitation_mm': [float(request.form['total_precipitation_mm'])],
            'relative_humidity_percent': [float(request.form['relative_humidity_percent'])],
            'latitude': [float(request.form['latitude'])],
            'longitude': [float(request.form['longitude'])],
            'hour': [int(request.form['hour'])],
            'day_of_week': [int(request.form['day_of_week'])],
            'month': [int(request.form['month'])]
        }
        
        # Create a DataFrame from the new data
        new_df = pd.DataFrame(new_data)
        
        # Align the new data with the features the model was trained on
        new_df_aligned = new_df.reindex(columns=feature_columns, fill_value=0)

        # Make the prediction
        prediction = loaded_model.predict(new_df_aligned)[0]
        
        # Render the page with the prediction result
        return render_template_string(HTML_TEMPLATE, prediction=f"{prediction:.4f}")

    except Exception as e:
        # Handle any errors during prediction
        return f"An error occurred: {e}", 400

# --- 5. Main entry point to run the app ---
if __name__ == '__main__':
    # You can run this app by executing this file from the terminal.
    # The debug=True flag automatically reloads the server on code changes.
    app.run(debug=True)



In [None]:
# app.py
from flask import Flask, request, render_template_string
import pandas as pd
import pickle

# Initialize the Flask application
app = Flask(__name__)

# --- 1. Define the HTML template for the web page ---
# We're embedding the HTML directly in the Python file for simplicity.
HTML_TEMPLATE = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Temperature Predictor</title>
    <script src="https://cdn.tailwindcss.com"></script>
    <link rel="preconnect" href="https://fonts.googleapis.com">
    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;700&display=swap" rel="stylesheet">
    <style>
        body {
            font-family: 'Inter', sans-serif;
        }
    </style>
</head>
<body class="bg-gray-100 flex items-center justify-center min-h-screen p-4">
    <div class="bg-white rounded-lg shadow-xl p-8 max-w-lg w-full">
        <h1 class="text-3xl font-bold text-gray-800 text-center mb-6">Predict 2m Temperature</h1>
        <p class="text-center text-gray-600 mb-8">Enter the conditions below to get a temperature prediction from our trained Random Forest model.</p>

        <form action="/predict" method="post" class="space-y-6">
            <div class="grid grid-cols-1 md:grid-cols-2 gap-4">
                <div>
                    <label for="precipitation" class="block text-sm font-medium text-gray-700">Total Precipitation (mm)</label>
                    <input type="number" step="any" id="precipitation" name="total_precipitation_mm" required
                           class="mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-indigo-500 focus:ring-indigo-500 sm:text-sm p-2">
                </div>
                <div>
                    <label for="humidity" class="block text-sm font-medium text-gray-700">Relative Humidity (%)</label>
                    <input type="number" step="any" id="humidity" name="relative_humidity_percent" required
                           class="mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-indigo-500 focus:ring-indigo-500 sm:text-sm p-2">
                </div>
            </div>
            <div class="grid grid-cols-1 md:grid-cols-2 gap-4">
                <div>
                    <label for="latitude" class="block text-sm font-medium text-gray-700">Latitude</label>
                    <input type="number" step="any" id="latitude" name="latitude" required
                           class="mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-indigo-500 focus:ring-indigo-500 sm:text-sm p-2">
                </div>
                <div>
                    <label for="longitude" class="block text-sm font-medium text-gray-700">Longitude</label>
                    <input type="number" step="any" id="longitude" name="longitude" required
                           class="mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-indigo-500 focus:ring-indigo-500 sm:text-sm p-2">
                </div>
            </div>
            <div class="grid grid-cols-1 md:grid-cols-3 gap-4">
                <div>
                    <label for="hour" class="block text-sm font-medium text-gray-700">Hour (0-23)</label>
                    <input type="number" id="hour" name="hour" required
                           class="mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-indigo-500 focus:ring-indigo-500 sm:text-sm p-2">
                </div>
                <div>
                    <label for="day" class="block text-sm font-medium text-gray-700">Day of Week (0-6)</label>
                    <input type="number" id="day" name="day_of_week" required
                           class="mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-indigo-500 focus:ring-indigo-500 sm:text-sm p-2">
                </div>
                <div>
                    <label for="month" class="block text-sm font-medium text-gray-700">Month (1-12)</label>
                    <input type="number" id="month" name="month" required
                           class="mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-indigo-500 focus:ring-indigo-500 sm:text-sm p-2">
                </div>
            </div>
            <button type="submit"
                    class="w-full flex justify-center py-2 px-4 border border-transparent rounded-md shadow-sm text-sm font-medium text-white bg-indigo-600 hover:bg-indigo-700 focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-indigo-500">
                Get Prediction
            </button>
        </form>

        {% if prediction %}
        <div class="mt-8 p-4 bg-indigo-50 rounded-md">
            <h2 class="text-xl font-semibold text-gray-800 text-center">Prediction Result</h2>
            <p class="text-center mt-2 text-lg font-medium text-indigo-700">The predicted 2m temperature is: <span class="font-bold">{{ prediction }}°C</span></p>
        </div>
        {% endif %}

    </div>
</body>
</html>
"""

# --- 2. Load the model and define feature names once when the app starts ---
model_filename = 'random_forest_model.pkl'
try:
    with open(model_filename, 'rb') as file:
        loaded_model = pickle.load(file)
    print(f"Model '{model_filename}' loaded successfully.")

    # Directly define the feature columns, as they are known from the training data.
    feature_columns = ['total_precipitation_mm', 'relative_humidity_percent', 'latitude', 'longitude', 'hour', 'day_of_week', 'month']
    print("Feature columns successfully defined.")

except FileNotFoundError:
    print(f"Error: The file '{model_filename}' was not found. Please ensure it is in the same directory.")
    loaded_model = None
    feature_columns = None
    exit()

# --- 3. Define the main route for the home page ---
@app.route('/')
def home():
    """
    Renders the HTML form.
    """
    return render_template_string(HTML_TEMPLATE, prediction=None)

# --- 4. Define the prediction route ---
@app.route('/predict', methods=['POST'])
def predict():
    """
    Handles the form submission, makes a prediction, and renders the result.
    """
    if loaded_model is None or feature_columns is None:
        return "Error: Model or feature names not loaded.", 500

    # Get the data from the form
    try:
        new_data = {
            'total_precipitation_mm': [float(request.form['total_precipitation_mm'])],
            'relative_humidity_percent': [float(request.form['relative_humidity_percent'])],
            'latitude': [float(request.form['latitude'])],
            'longitude': [float(request.form['longitude'])],
            'hour': [int(request.form['hour'])],
            'day_of_week': [int(request.form['day_of_week'])],
            'month': [int(request.form['month'])]
        }
        
        # Create a DataFrame from the new data
        new_df = pd.DataFrame(new_data)
        
        # Align the new data with the features the model was trained on
        new_df_aligned = new_df.reindex(columns=feature_columns, fill_value=0)

        # Make the prediction
        prediction = loaded_model.predict(new_df_aligned)[0]
        
        # Render the page with the prediction result
        return render_template_string(HTML_TEMPLATE, prediction=f"{prediction:.4f}")

    except Exception as e:
        # Handle any errors during prediction
        return f"An error occurred: {e}", 400

# --- 5. Main entry point to run the app ---
if __name__ == '__main__':
    # You can run this app by executing this file from the terminal.
    # The debug=True flag automatically reloads the server on code changes.
    app.run(debug=True)



In [None]:
# app.py
from flask import Flask, request, render_template_string
import pandas as pd
import pickle

# Initialize the Flask application
app = Flask(__name__)

# --- 1. Define the HTML template for the web page ---
# We're embedding the HTML directly in the Python file for simplicity.
HTML_TEMPLATE = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Temperature Predictor</title>
    <script src="https://cdn.tailwindcss.com"></script>
    <link rel="preconnect" href="https://fonts.googleapis.com">
    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;700&display=swap" rel="stylesheet">
    <style>
        body {
            font-family: 'Inter', sans-serif;
        }
    </style>
</head>
<body class="bg-gray-100 flex items-center justify-center min-h-screen p-4">
    <div class="bg-white rounded-lg shadow-xl p-8 max-w-lg w-full">
        <h1 class="text-3xl font-bold text-gray-800 text-center mb-6">Predict 2m Temperature</h1>
        <p class="text-center text-gray-600 mb-8">Enter the conditions below to get a temperature prediction from our trained Random Forest model.</p>

        <form action="/predict" method="post" class="space-y-6">
            <div class="grid grid-cols-1 md:grid-cols-2 gap-4">
                <div>
                    <label for="precipitation" class="block text-sm font-medium text-gray-700">Total Precipitation (mm)</label>
                    <input type="number" step="any" id="precipitation" name="total_precipitation_mm" required
                           class="mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-indigo-500 focus:ring-indigo-500 sm:text-sm p-2">
                </div>
                <div>
                    <label for="humidity" class="block text-sm font-medium text-gray-700">Relative Humidity (%)</label>
                    <input type="number" step="any" id="humidity" name="relative_humidity_percent" required
                           class="mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-indigo-500 focus:ring-indigo-500 sm:text-sm p-2">
                </div>
            </div>
            <div class="grid grid-cols-1 md:grid-cols-2 gap-4">
                <div>
                    <label for="latitude" class="block text-sm font-medium text-gray-700">Latitude</label>
                    <input type="number" step="any" id="latitude" name="latitude" required
                           class="mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-indigo-500 focus:ring-indigo-500 sm:text-sm p-2">
                </div>
                <div>
                    <label for="longitude" class="block text-sm font-medium text-gray-700">Longitude</label>
                    <input type="number" step="any" id="longitude" name="longitude" required
                           class="mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-indigo-500 focus:ring-indigo-500 sm:text-sm p-2">
                </div>
            </div>
            <div class="grid grid-cols-1 md:grid-cols-3 gap-4">
                <div>
                    <label for="hour" class="block text-sm font-medium text-gray-700">Hour (0-23)</label>
                    <input type="number" id="hour" name="hour" required
                           class="mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-indigo-500 focus:ring-indigo-500 sm:text-sm p-2">
                </div>
                <div>
                    <label for="day" class="block text-sm font-medium text-gray-700">Day of Week (0-6)</label>
                    <input type="number" id="day" name="day_of_week" required
                           class="mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-indigo-500 focus:ring-indigo-500 sm:text-sm p-2">
                </div>
                <div>
                    <label for="month" class="block text-sm font-medium text-gray-700">Month (1-12)</label>
                    <input type="number" id="month" name="month" required
                           class="mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-indigo-500 focus:ring-indigo-500 sm:text-sm p-2">
                </div>
            </div>
            <button type="submit"
                    class="w-full flex justify-center py-2 px-4 border border-transparent rounded-md shadow-sm text-sm font-medium text-white bg-indigo-600 hover:bg-indigo-700 focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-indigo-500">
                Get Prediction
            </button>
        </form>

        {% if prediction %}
        <div class="mt-8 p-4 bg-indigo-50 rounded-md">
            <h2 class="text-xl font-semibold text-gray-800 text-center">Prediction Result</h2>
            <p class="text-center mt-2 text-lg font-medium text-indigo-700">The predicted 2m temperature is: <span class="font-bold">{{ prediction }}°C</span></p>
        </div>
        {% endif %}

    </div>
</body>
</html>
"""

# --- 2. Load the model and define feature names once when the app starts ---
model_filename = 'random_forest_model.pkl'
try:
    with open(model_filename, 'rb') as file:
        loaded_model = pickle.load(file)
    print(f"Model '{model_filename}' loaded successfully.")

    # Directly define the feature columns, as they are known from the training data.
    feature_columns = ['total_precipitation_mm', 'relative_humidity_percent', 'latitude', 'longitude', 'hour', 'day_of_week', 'month']
    print("Feature columns successfully defined.")

except FileNotFoundError:
    print(f"Error: The file '{model_filename}' was not found. Please ensure it is in the same directory.")
    loaded_model = None
    feature_columns = None
    exit()

# --- 3. Define the main route for the home page ---
@app.route('/')
def home():
    """
    Renders the HTML form.
    """
    return render_template_string(HTML_TEMPLATE, prediction=None)

# --- 4. Define the prediction route ---
@app.route('/predict', methods=['POST'])
def predict():
    """
    Handles the form submission, makes a prediction, and renders the result.
    """
    if loaded_model is None or feature_columns is None:
        return "Error: Model or feature names not loaded.", 500

    # Get the data from the form
    try:
        new_data = {
            'total_precipitation_mm': [float(request.form['total_precipitation_mm'])],
            'relative_humidity_percent': [float(request.form['relative_humidity_percent'])],
            'latitude': [float(request.form['latitude'])],
            'longitude': [float(request.form['longitude'])],
            'hour': [int(request.form['hour'])],
            'day_of_week': [int(request.form['day_of_week'])],
            'month': [int(request.form['month'])]
        }
        
        # Create a DataFrame from the new data
        new_df = pd.DataFrame(new_data)
        
        # Align the new data with the features the model was trained on
        new_df_aligned = new_df.reindex(columns=feature_columns, fill_value=0)

        # Make the prediction
        prediction = loaded_model.predict(new_df_aligned)[0]
        
        # Render the page with the prediction result
        return render_template_string(HTML_TEMPLATE, prediction=f"{prediction:.4f}")

    except Exception as e:
        # Handle any errors during prediction
        return f"An error occurred: {e}", 400

# --- 5. Main entry point to run the app ---
if __name__ == '__main__':
    # Running in debug=False to avoid issues with auto-reloader in certain environments
    app.run(debug=False)


In [1]:
import pandas as pd
import numpy as np

def preprocess_new_data(precipitation, humidity, latitude, longitude, hour, day_of_week, month):
    """
    Preprocesses a new data point to be ready for model prediction.

    This function takes raw input values for a weather data point and converts them
    into a Pandas DataFrame with the specific column order required by the
    trained Random Forest model. This is a crucial step to avoid errors
    when making new predictions.

    Args:
        precipitation (float): The total precipitation in millimeters.
        humidity (float): The relative humidity percentage.
        latitude (float): The latitude of the location.
        longitude (float): The longitude of the location.
        hour (int): The hour of the day (0-23).
        day_of_week (int): The day of the week (0=Monday, 6=Sunday).
        month (int): The month of the year (1-12).

    Returns:
        pandas.DataFrame: A single-row DataFrame with the preprocessed data,
                          ready to be passed to the model.
    """
    # Create a dictionary with the new data
    new_data_dict = {
        'total_precipitation_mm': [precipitation],
        'relative_humidity_percent': [humidity],
        'latitude': [latitude],
        'longitude': [longitude],
        'hour': [hour],
        'day_of_week': [day_of_week],
        'month': [month]
    }
    
    # Create a DataFrame from the dictionary
    new_df = pd.DataFrame(new_data_dict)

    # Define the exact column order the model expects.
    # This is the most critical step to prevent prediction errors.
    feature_columns = [
        'total_precipitation_mm', 
        'relative_humidity_percent', 
        'latitude', 
        'longitude', 
        'hour', 
        'day_of_week', 
        'month'
    ]

    # Reindex the DataFrame to match the required column order.
    # We set fill_value=0 as a safeguard, though in this case, all columns
    # should be present.
    preprocessed_df = new_df.reindex(columns=feature_columns, fill_value=0)
    
    return preprocessed_df

if __name__ == '__main__':
    # --- Example Usage ---
    # Define a set of raw input values for a new data point
    # These values must be in the same scale as the training data if normalization was applied.
    # For now, we will use the unscaled values from the original data source.
    raw_input_values = {
        'precipitation': 0.5,
        'humidity': 0.75,
        'latitude': 10.85,
        'longitude': 76.25,
        'hour': 12,
        'day_of_week': 3,
        'month': 7
    }

    # Preprocess the raw data using the function
    ready_data = preprocess_new_data(
        raw_input_values['precipitation'],
        raw_input_values['humidity'],
        raw_input_values['latitude'],
        raw_input_values['longitude'],
        raw_input_values['hour'],
        raw_input_values['day_of_week'],
        raw_input_values['month']
    )

    # Print the resulting DataFrame to verify it's correctly structured
    print("Preprocessed Data for Prediction:")
    print(ready_data)
    print("\nDataFrame structure is ready for the Random Forest model.")


Preprocessed Data for Prediction:
   total_precipitation_mm  relative_humidity_percent  latitude  longitude  \
0                     0.5                       0.75     10.85      76.25   

   hour  day_of_week  month  
0    12            3      7  

DataFrame structure is ready for the Random Forest model.


In [2]:
# Let's set your map key that was emailed to you. It should look something like 'abcdef1234567890abcdef1234567890'
MAP_KEY = '2eaecfb3056b7b7751771485eb481c51'
# MAP_KEY = 'abcdef0123456789abcdef1234567890'

# now let's check how many transactions we have
import pandas as pd
import requests
url = 'https://firms.modaps.eosdis.nasa.gov/mapserver/mapkey_status/?MAP_KEY=' + MAP_KEY
try:
  response = requests.get(url)
  data = response.json()
  df = pd.Series(data)
  display(df)
except:
  # possible error, wrong MAP_KEY value, check for extra quotes, missing letters
  print ("There is an issue with the query. \nTry in your browser: %s" % url)

transaction_limit             5000
current_transactions             0
transaction_interval    10 minutes
dtype: object

In [3]:
# let's create a simple function that tells us how many transactions we have used.
# We will use this in later examples

def get_transaction_count() :
  count = 0
  try:
    response = requests.get(url)
    data = response.json()
    df = pd.Series(data)
    count = df['current_transactions']
  except:
    print ("Error in our call.")
  return count

tcount = get_transaction_count()
print ('Our current transaction count is %i' % tcount)

Our current transaction count is 0


In [7]:
# let's query data_availability to find out what date range is available for various datasets
# we will explain these datasets a bit later

# this url will return information about all supported sensors and their corresponding datasets
# instead of 'all' you can specify individual sensor, ex:LANDSAT_NRT
da_url = 'https://firms.modaps.eosdis.nasa.gov/api/data_availability/csv/' + MAP_KEY + '/all'
df = pd.read_csv(da_url)
display(df)

Unnamed: 0,data_id,min_date,max_date
0,MODIS_NRT,2025-05-01,2025-07-31
1,MODIS_SP,2000-11-01,2025-04-30
2,VIIRS_NOAA20_NRT,2025-03-01,2025-07-31
3,VIIRS_NOAA20_SP,2018-04-01,2025-02-28
4,VIIRS_NOAA21_NRT,2024-01-17,2025-07-31
5,VIIRS_SNPP_NRT,2025-04-01,2025-07-31
6,VIIRS_SNPP_SP,2012-01-20,2025-03-31
7,LANDSAT_NRT,2022-06-20,2025-07-31
8,GOES_NRT,2022-08-09,2025-07-31
9,BA_MODIS,2000-11-01,2025-05-01


In [8]:
# ==============================================================================
# Day 2: Data Acquisition Scripts
# These scripts are designed to be run in a Jupyter Notebook (data_acquisition_day2.ipynb).
# ==============================================================================

# General imports
import requests
import pandas as pd
import xarray as xr
import numpy as np
from datetime import datetime, timedelta
from io import StringIO
import cdsapi
import os

# --- FIRMS Data Acquisition ---
# This section handles fetching active fire hotspot data from the FIRMS API.

# --- Configuration ---
# IMPORTANT: Replace "YOUR_FIRMS_API_KEY" with your actual key from the FIRMS website.
FIRMS_API_KEY = "YOUR_FIRMS_API_KEY"

# Bounding box for Kerala (approximate)
# Format: min_lon,min_lat,max_lon,max_lat
KERALA_BOUNDING_BOX = "74.5,8.0,77.5,12.5"

# Source: 'VIIRS_SNPP_NRT' or 'VIIRS_NOAA20_NRT'
# We'll use VIIRS_SNPP_NRT for this example.
FIRMS_SOURCE = "VIIRS_SNPP_NRT"

def fetch_firms_data(api_key, source, bbox, date_range_days=7):
    """
    Fetches FIRMS active fire data for a given bounding box and date range.
    date_range_days: Number of days back from today to fetch data.
    """
    end_date = datetime.now()
    start_date = end_date - timedelta(days=date_range_days)
    
    # FIRMS API date format is YYYY-MM-DD
    start_date_str = start_date.strftime("%Y-%m-%d")
    end_date_str = end_date.strftime("%Y-%m-%d")

    # Construct the URL for the FIRMS 'area' archive endpoint.
    # This is a robust pattern for historical data downloads via bounding box.
    url = (f"https://firms.modaps.eosdis.nasa.gov/api/area/csv/{api_key}/{source}/"
           f"{bbox}/{start_date_str}/{end_date_str}")
    
    print(f"Fetching FIRMS data from: {url}")
    response = requests.get(url)
    response.raise_for_status() # Raises an HTTPError for bad responses (4xx or 5xx)

    # FIRMS returns CSV data, which we read directly into a Pandas DataFrame
    data = pd.read_csv(StringIO(response.text))
    
    # Data is returned with a header, but let's check for empty data.
    if data.empty:
        print("No FIRMS data found for the specified period and area.")
        return pd.DataFrame() # Return an empty DataFrame

    return data

# --- Test the FIRMS function ---
try:
    firms_data = fetch_firms_data(FIRMS_API_KEY, FIRMS_SOURCE, KERALA_BOUNDING_BOX, date_range_days=7)
    
    if not firms_data.empty:
        print(f"Fetched {len(firms_data)} FIRMS hotspots.")
        print("Raw FIRMS data head:")
        print(firms_data.head())
        
        # Member 1 needs 'latitude', 'longitude', 'acq_date', 'acq_time', 'frp'
        required_firms_cols = ['latitude', 'longitude', 'acq_date', 'acq_time', 'frp']
        if all(col in firms_data.columns for col in required_firms_cols):
            print("\nFIRMS data contains required columns. Performing basic processing...")
            
            # Basic processing: combine date and time, convert to a proper timestamp
            # Use zfill(4) to ensure the time is a 4-digit string (e.g., 800 -> 0800)
            firms_data['timestamp'] = pd.to_datetime(
                firms_data['acq_date'] + ' ' + firms_data['acq_time'].astype(str).str.zfill(4), 
                format='%Y-%m-%d %H%M'
            )
            
            # Select and reorder relevant columns
            processed_firms_data = firms_data[['timestamp', 'latitude', 'longitude', 'frp', 'confidence']]
            print("\nProcessed FIRMS data head:")
            print(processed_firms_data.head())
            
            # Save to a temporary CSV for inspection (optional)
            processed_firms_data.to_csv("firms_data_last_7_days.csv", index=False)
            print("\nFIRMS data saved to firms_data_last_7_days.csv")
        else:
            print("FIRMS data missing some required columns. Check FIRMS API documentation.")

except requests.exceptions.HTTPError as e:
    print(f"HTTP Error fetching FIRMS data: {e}")
    print("Check your API key, bounding box, and FIRMS API documentation for the correct URL format.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    print("Ensure your API key is correct and you have internet access.")


# ==============================================================================
# --- ERA5-Land Data Acquisition ---
# This section handles fetching meteorological data from the CDS API using cdsapi.
# ==============================================================================

# --- Configuration ---
# Bounding box for Kerala in CDS API format: [North, West, South, East]
KERALA_BOUNDING_BOX_CDS = [12.5, 74.5, 8.0, 77.5]

def fetch_era5_land_data(bbox, date_range_days=2):
    """
    Fetches ERA5-Land data for a given bounding box and date range.
    bbox: [North, West, South, East]
    date_range_days: Number of days back from today to fetch data.
    """
    c = cdsapi.Client()

    end_date = datetime.now()
    start_date = end_date - timedelta(days=date_range_days)

    # Generate a list of dates for the request
    # ERA5-Land requires a list of years, months, and days
    dates = [start_date + timedelta(days=i) for i in range(date_range_days)]
    years = sorted(list(set([d.strftime('%Y') for d in dates])))
    months = sorted(list(set([d.strftime('%m') for d in dates])))
    days = sorted(list(set([d.strftime('%d') for d in dates])))

    # Define variables to fetch (based on AI/ML requirements)
    variables = [
        '2m_temperature',
        '2m_dewpoint_temperature', # Useful for calculating relative humidity
        'total_precipitation',
        '10m_u_component_of_wind', # For wind speed/direction
        '10m_v_component_of_wind', # For wind speed/direction
    ]

    # Define times to fetch (hourly data for detailed analysis)
    times = [
        '00:00', '01:00', '02:00', '03:00', '04:00', '05:00', '06:00', '07:00',
        '08:00', '09:00', '10:00', '11:00', '12:00', '13:00', '14:00', '15:00',
        '16:00', '17:00', '18:00', '19:00', '20:00', '21:00', '22:00', '23:00',
    ]

    # Temporary file to store the downloaded NetCDF
    output_file = 'era5_land_data.nc'
    
    print(f"Requesting ERA5-Land data for dates: {dates[0].strftime('%Y-%m-%d')} to {dates[-1].strftime('%Y-%m-%d')}")
    try:
        # Check if the output file already exists and remove it to avoid errors
        if os.path.exists(output_file):
            os.remove(output_file)
            
        c.retrieve(
            'reanalysis-era5-land',
            {
                'variable': variables,
                'year': years,
                'month': months,
                'day': days,
                'time': times,
                'area': bbox, # [North, West, South, East]
                'format': 'netcdf',
            },
            output_file
        )
        print(f"\nERA5-Land data downloaded to {output_file}")

        # Load the data using xarray
        ds = xr.open_dataset(output_file)
        print("\nERA5-Land data loaded into xarray Dataset.")
        print(ds) # Print dataset info
        
        # --- Basic Processing (Example: Convert temperature from Kelvin to Celsius) ---
        if '2m_temperature' in ds.data_vars:
            ds['2m_temperature_c'] = ds['2m_temperature'] - 273.15
            print("\n2m_temperature converted to Celsius.")
        
        # --- Example: Calculate wind speed from u/v components ---
        if '10m_u_component_of_wind' in ds.data_vars and '10m_v_component_of_wind' in ds.data_vars:
            ds['wind_speed'] = np.sqrt(ds['10m_u_component_of_wind']**2 + ds['10m_v_component_of_wind']**2)
            print("Wind speed calculated.")

        # --- Example: Calculate relative humidity from temperature and dewpoint ---
        if '2m_temperature_c' in ds.data_vars and '2m_dewpoint_temperature' in ds.data_vars:
            # Convert dewpoint to Celsius first
            ds['2m_dewpoint_temperature_c'] = ds['2m_dewpoint_temperature'] - 273.15
            
            # Formula: RH = 100 * (exp((17.625 * TD) / (243.04 + TD)) / exp((17.625 * T) / (243.04 + T)))
            # Where T is dry bulb temp, TD is dew point temp, both in Celsius
            e_s = 6.1094 * np.exp((17.625 * ds['2m_temperature_c']) / (243.04 + ds['2m_temperature_c']))
            e_a = 6.1094 * np.exp((17.625 * ds['2m_dewpoint_temperature_c']) / (243.04 + ds['2m_dewpoint_temperature_c']))
            ds['relative_humidity_percent'] = (e_a / e_s) * 100
            
            # Clip values to the valid range [0, 100]
            ds['relative_humidity_percent'] = ds['relative_humidity_percent'].clip(0, 100)
            print("Relative humidity calculated.")

        # Convert xarray Dataset to a flattened Pandas DataFrame for easier feature engineering later
        df = ds.to_dataframe().reset_index()
        
        # Drop original Kelvin temps if Celsius is preferred, and the wind components
        df = df.drop(columns=['2m_temperature', '2m_dewpoint_temperature', 
                              '10m_u_component_of_wind', '10m_v_component_of_wind'], errors='ignore')
        print("\nERA5-Land data converted to Pandas DataFrame.")
        print(df.head())
        
        # Save to a temporary CSV for inspection (optional)
        df.to_csv("era5_land_data_processed.csv", index=False)
        print("ERA5-Land processed data saved to era5_land_data_processed.csv")
        return df

    except Exception as e:
        print(f"Error fetching ERA5-Land data: {e}")
        print("Ensure your .cdsapirc file is correctly configured in your home directory.")
        print("Also, check your CDS account for any data download limits or issues.")
        print("Verify the bounding box order and format for the CDS API.")
        return None

# --- Test the ERA5-Land function ---
era5_df = fetch_era5_land_data(KERALA_BOUNDING_BOX_CDS, date_range_days=2)


Fetching FIRMS data from: https://firms.modaps.eosdis.nasa.gov/api/area/csv/YOUR_FIRMS_API_KEY/VIIRS_SNPP_NRT/74.5,8.0,77.5,12.5/2025-07-24/2025-07-31
HTTP Error fetching FIRMS data: 500 Server Error: Internal Server Error for url: https://firms.modaps.eosdis.nasa.gov/api/area/csv/YOUR_FIRMS_API_KEY/VIIRS_SNPP_NRT/74.5,8.0,77.5,12.5/2025-07-24/2025-07-31
Check your API key, bounding box, and FIRMS API documentation for the correct URL format.


2025-07-31 22:39:35,039 INFO [2024-09-26T00:00:00] Watch our [Forum](https://forum.ecmwf.int/) for Announcements, news and other discussed topics.


Requesting ERA5-Land data for dates: 2025-07-29 to 2025-07-30


2025-07-31 22:39:35,650 INFO Request ID is 1a8cf89b-82fc-4217-a0b2-6c050c621307
2025-07-31 22:39:35,951 INFO status has been updated to accepted
2025-07-31 22:39:58,896 INFO status has been updated to failed


Error fetching ERA5-Land data: 400 Client Error: Bad Request for url: https://cds.climate.copernicus.eu/api/retrieve/v1/jobs/1a8cf89b-82fc-4217-a0b2-6c050c621307/results
The job has failed
None of the data you have requested is available yet, please revise the period requested. The latest date available for this dataset is: 2025-07-26 17:00
None of the data you have requested is available yet, please revise the period requested. The latest date available for this dataset is: 2025-07-26 17:00
The job failed with: MultiAdaptorNoDataError
Ensure your .cdsapirc file is correctly configured in your home directory.
Also, check your CDS account for any data download limits or issues.
Verify the bounding box order and format for the CDS API.


In [9]:
# General imports
import requests
import pandas as pd
import xarray as xr
import numpy as np
from datetime import datetime, timedelta
from io import StringIO
import cdsapi
import os

# --- FIRMS Data Acquisition ---
# This section handles fetching active fire hotspot data from the FIRMS API.

# --- Configuration ---
# IMPORTANT: Replace "YOUR_FIRMS_API_KEY" with your actual key from the FIRMS website.
FIRMS_API_KEY = "2eaecfb3056b7b7751771485eb481c51"

# Bounding box for Kerala (approximate)
# Format: min_lon,min_lat,max_lon,max_lat
KERALA_BOUNDING_BOX = "74.5,8.0,77.5,12.5"

# Source: 'VIIRS_SNPP_NRT' or 'VIIRS_NOAA20_NRT'
# We'll use VIIRS_SNPP_NRT for this example.
FIRMS_SOURCE = "VIIRS_SNPP_NRT"

def fetch_firms_data(api_key, source, bbox, date_range_days=7):
    """
    Fetches FIRMS active fire data for a given bounding box and date range.
    date_range_days: Number of days back from today to fetch data.
    """
    end_date = datetime.now()
    start_date = end_date - timedelta(days=date_range_days)
    
    # FIRMS API date format is YYYY-MM-DD
    start_date_str = start_date.strftime("%Y-%m-%d")
    end_date_str = end_date.strftime("%Y-%m-%d")

    # Construct the URL for the FIRMS 'area' archive endpoint.
    # This is a robust pattern for historical data downloads via bounding box.
    url = (f"https://firms.modaps.eosdis.nasa.gov/api/area/csv/{api_key}/{source}/"
           f"{bbox}/{start_date_str}/{end_date_str}")
    
    print(f"Fetching FIRMS data from: {url}")
    response = requests.get(url)
    response.raise_for_status() # Raises an HTTPError for bad responses (4xx or 5xx)

    # FIRMS returns CSV data, which we read directly into a Pandas DataFrame
    data = pd.read_csv(StringIO(response.text))
    
    # Data is returned with a header, but let's check for empty data.
    if data.empty:
        print("No FIRMS data found for the specified period and area.")
        return pd.DataFrame() # Return an empty DataFrame

    return data

# --- Test the FIRMS function ---
try:
    firms_data = fetch_firms_data(FIRMS_API_KEY, FIRMS_SOURCE, KERALA_BOUNDING_BOX, date_range_days=7)
    
    if not firms_data.empty:
        print(f"Fetched {len(firms_data)} FIRMS hotspots.")
        print("Raw FIRMS data head:")
        print(firms_data.head())
        
        # Member 1 needs 'latitude', 'longitude', 'acq_date', 'acq_time', 'frp'
        required_firms_cols = ['latitude', 'longitude', 'acq_date', 'acq_time', 'frp']
        if all(col in firms_data.columns for col in required_firms_cols):
            print("\nFIRMS data contains required columns. Performing basic processing...")
            
            # Basic processing: combine date and time, convert to a proper timestamp
            # Use zfill(4) to ensure the time is a 4-digit string (e.g., 800 -> 0800)
            firms_data['timestamp'] = pd.to_datetime(
                firms_data['acq_date'] + ' ' + firms_data['acq_time'].astype(str).str.zfill(4), 
                format='%Y-%m-%d %H%M'
            )
            
            # Select and reorder relevant columns
            processed_firms_data = firms_data[['timestamp', 'latitude', 'longitude', 'frp', 'confidence']]
            print("\nProcessed FIRMS data head:")
            print(processed_firms_data.head())
            
            # Save to a temporary CSV for inspection (optional)
            processed_firms_data.to_csv("firms_data_last_7_days.csv", index=False)
            print("\nFIRMS data saved to firms_data_last_7_days.csv")
        else:
            print("FIRMS data missing some required columns. Check FIRMS API documentation.")

except requests.exceptions.HTTPError as e:
    print(f"HTTP Error fetching FIRMS data: {e}")
    print("Check your API key, bounding box, and FIRMS API documentation for the correct URL format.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    print("Ensure your API key is correct and you have internet access.")


Fetching FIRMS data from: https://firms.modaps.eosdis.nasa.gov/api/area/csv/2eaecfb3056b7b7751771485eb481c51/VIIRS_SNPP_NRT/74.5,8.0,77.5,12.5/2025-07-24/2025-07-31
HTTP Error fetching FIRMS data: 500 Server Error: Internal Server Error for url: https://firms.modaps.eosdis.nasa.gov/api/area/csv/2eaecfb3056b7b7751771485eb481c51/VIIRS_SNPP_NRT/74.5,8.0,77.5,12.5/2025-07-24/2025-07-31
Check your API key, bounding box, and FIRMS API documentation for the correct URL format.


In [10]:
# ==============================================================================
# Simple FIRMS API Request
# This script fetches the most recent day of VIIRS NOAA-20 data for the entire world.
# This is a good way to test the API connection and verify your key.
# ==============================================================================

# General imports
import requests
import pandas as pd
from io import StringIO
import os

# --- Configuration ---
# IMPORTANT: Replace "YOUR_FIRMS_API_KEY" with your actual key from the FIRMS website.
# The key '2eaecfb3056b7b7751771485eb481c51' is a placeholder and will not work.
# Make sure you have a valid key from https://nrt3.modaps.eosdis.nasa.gov/api/activate
FIRMS_API_KEY = "2eaecfb3056b7b7751771485eb481c51" 

# Base URL for the FIRMS API
BASE_URL = 'https://firms.modaps.eosdis.nasa.gov/api/area/csv/'

# We'll use VIIRS NOAA-20, which is known for its high resolution.
FIRMS_SOURCE = 'VIIRS_NOAA20_NRT'

# Region and date range: 'world' and '1' for the most recent 24 hours.
FIRMS_REGION = 'world'
DATE_RANGE = '1'

def fetch_firms_global_data(api_key, source, region, date_range):
    """
    Fetches FIRMS active fire data for a specific region and date range.
    This uses a simplified API endpoint for quick access to recent data.
    """
    # Construct the URL
    url = f"{BASE_URL}{api_key}/{source}/{region}/{date_range}"
    
    print(f"Fetching FIRMS data from: {url}")
    
    # Check if the API key is a placeholder
    if api_key == "YOUR_FIRMS_API_KEY":
        print("\nERROR: You must replace 'YOUR_FIRMS_API_KEY' with your actual key.")
        return None
        
    try:
        response = requests.get(url)
        response.raise_for_status() # Raises an HTTPError for bad responses (4xx or 5xx)

        # FIRMS returns CSV data, which we read directly into a Pandas DataFrame
        data = pd.read_csv(StringIO(response.text))
        
        if data.empty:
            print("No FIRMS data found for the specified period and area.")
            return pd.DataFrame() # Return an empty DataFrame
        
        return data

    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error fetching FIRMS data: {e}")
        print("Check your API key and the URL format. If the key is correct, the server might be experiencing issues.")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        print("Ensure you have a valid API key and internet access.")
        return None

# --- Test the FIRMS function ---
# The get_transaction_count() function is not a standard library function.
# It would be a custom function in a real-world scenario to track usage.
# For this example, we'll just call the data fetching function directly.
firms_global_data = fetch_firms_global_data(FIRMS_API_KEY, FIRMS_SOURCE, FIRMS_REGION, DATE_RANGE)

if firms_global_data is not None and not firms_global_data.empty:
    print(f"\nSuccessfully fetched {len(firms_global_data)} FIRMS hotspots.")
    print("FIRMS data head:")
    print(firms_global_data.head())
    
    # Save to a temporary CSV for inspection (optional)
    firms_global_data.to_csv("firms_global_recent_data.csv", index=False)
    print("\nFIRMS data saved to firms_global_recent_data.csv")



Fetching FIRMS data from: https://firms.modaps.eosdis.nasa.gov/api/area/csv/2eaecfb3056b7b7751771485eb481c51/VIIRS_NOAA20_NRT/world/1

Successfully fetched 32410 FIRMS hotspots.
FIRMS data head:
   latitude  longitude  bright_ti4  scan  track    acq_date  acq_time  \
0  19.40509 -155.27338      367.00  0.56   0.69  2025-07-31        22   
1  19.40793 -155.27521      355.45  0.56   0.69  2025-07-31        22   
2  19.40839 -155.26984      351.72  0.56   0.69  2025-07-31        22   
3  19.41144 -155.27393      352.36  0.56   0.69  2025-07-31        22   
4  56.05059  160.64233      356.53  0.42   0.61  2025-07-31        35   

  satellite instrument confidence version  bright_ti5    frp daynight  
0       N20      VIIRS          h  2.0NRT      329.90  21.45        D  
1       N20      VIIRS          l  2.0NRT      330.36  41.23        D  
2       N20      VIIRS          n  2.0NRT      319.27  26.26        D  
3       N20      VIIRS          n  2.0NRT      325.09  21.45        D  
4     

In [11]:
# ==============================================================================
# Specific FIRMS API Request for Kerala, India
# This script fetches active fire data for the state of Kerala, India,
# and a specific date range using the FIRMS API.
# ==============================================================================

# General imports
import requests
import pandas as pd
from io import StringIO
import os

# --- Configuration ---
# IMPORTANT: Replace "YOUR_FIRMS_API_KEY" with your actual key.
# The key '2eaecfb3056b7b7751771485eb481c51' is a placeholder.
FIRMS_API_KEY = "2eaecfb3056b7b7751771485eb481c51"

# Base URL for the FIRMS API
BASE_URL = 'https://firms.modaps.eosdis.nasa.gov/api/area/csv/'

# We'll use VIIRS NOAA-20 for high-resolution data.
FIRMS_SOURCE = 'VIIRS_NOAA20_NRT'

# Define the bounding box for Kerala, India.
# Format: lon_min,lat_min,lon_max,lat_max
# Coordinates for Kerala: ~74.5°E, 8.0°N to 77.5°E, 13.0°N
BOUNDING_BOX = "74.5,8.0,77.5,13.0"

# Define the date range. The API expects YYYY-MM-DD.
START_DATE = '2025-07-30'
END_DATE = '2025-07-31'
DATE_RANGE = f"{START_DATE},{END_DATE}"

def fetch_firms_data_with_bbox(api_key, source, bbox, date_range):
    """
    Fetches FIRMS active fire data for a specific bounding box and date range.
    """
    # Construct the URL with the bounding box and date range parameters.
    url = f"{BASE_URL}{api_key}/{source}/{bbox}/{date_range}"
    
    print(f"Fetching FIRMS data from: {url}")
    
    # Check for placeholder key
    if api_key == "YOUR_FIRMS_API_KEY":
        print("\nERROR: You must replace 'YOUR_FIRMS_API_KEY' with your actual key.")
        return None
        
    try:
        response = requests.get(url)
        response.raise_for_status() # Raises an HTTPError for bad responses (4xx or 5xx)

        # FIRMS returns CSV data, which we read into a Pandas DataFrame
        data = pd.read_csv(StringIO(response.text))
        
        if data.empty:
            print("No FIRMS data found for the specified period and area.")
            return pd.DataFrame() # Return an empty DataFrame
        
        return data

    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error fetching FIRMS data: {e}")
        print("This could be due to an invalid key, incorrect URL format, or server issues.")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        print("Ensure you have a valid API key and internet access.")
        return None

# --- Test the FIRMS function ---
firms_data = fetch_firms_data_with_bbox(FIRMS_API_KEY, FIRMS_SOURCE, BOUNDING_BOX, DATE_RANGE)

if firms_data is not None and not firms_data.empty:
    print(f"\nSuccessfully fetched {len(firms_data)} FIRMS hotspots.")
    print("FIRMS data head:")
    print(firms_data.head())
    
    # Save to a temporary CSV for inspection (optional)
    firms_data.to_csv("firms_kerala_data.csv", index=False)
    print(f"\nFIRMS data for {BOUNDING_BOX} saved to firms_kerala_data.csv")


Fetching FIRMS data from: https://firms.modaps.eosdis.nasa.gov/api/area/csv/2eaecfb3056b7b7751771485eb481c51/VIIRS_NOAA20_NRT/74.5,8.0,77.5,13.0/2025-07-30,2025-07-31
No FIRMS data found for the specified period and area.


In [12]:
# ==============================================================================
# Updated FIRMS API Request for California, USA
# This script fetches active fire data for California and a recent date range.
# This demonstrates a successful data retrieval for an area with known fire activity.
# ==============================================================================

# General imports
import requests
import pandas as pd
from io import StringIO
import os

# --- Configuration ---
# IMPORTANT: Replace "YOUR_FIRMS_API_KEY" with your actual key.
# The key '2eaecfb3056b7b7751771485eb481c51' is a placeholder.
FIRMS_API_KEY = "2eaecfb3056b7b7751771485eb481c51"

# Base URL for the FIRMS API
BASE_URL = 'https://firms.modaps.eosdis.nasa.gov/api/area/csv/'

# We'll use VIIRS NOAA-20 for high-resolution data.
FIRMS_SOURCE = 'VIIRS_NOAA20_NRT'

# Define the bounding box for California, USA.
# Format: lon_min,lat_min,lon_max,lat_max
BOUNDING_BOX = "-124.5,32.5,-114.0,42.0"

# Define the date range. The API expects YYYY-MM-DD.
START_DATE = '2025-07-30'
END_DATE = '2025-07-31'
DATE_RANGE = f"{START_DATE},{END_DATE}"

def fetch_firms_data_with_bbox(api_key, source, bbox, date_range):
    """
    Fetches FIRMS active fire data for a specific bounding box and date range.
    """
    # Construct the URL with the bounding box and date range parameters.
    url = f"{BASE_URL}{api_key}/{source}/{bbox}/{date_range}"
    
    print(f"Fetching FIRMS data from: {url}")
    
    # Check for placeholder key
    if api_key == "YOUR_FIRMS_API_KEY":
        print("\nERROR: You must replace 'YOUR_FIRMS_API_KEY' with your actual key.")
        return None
        
    try:
        response = requests.get(url)
        response.raise_for_status() # Raises an HTTPError for bad responses (4xx or 5xx)

        # FIRMS returns CSV data, which we read into a Pandas DataFrame
        data = pd.read_csv(StringIO(response.text))
        
        if data.empty:
            print("No FIRMS data found for the specified period and area.")
            return pd.DataFrame() # Return an empty DataFrame
        
        return data

    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error fetching FIRMS data: {e}")
        print("This could be due to an invalid key, incorrect URL format, or server issues.")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        print("Ensure you have a valid API key and internet access.")
        return None

# --- Test the FIRMS function ---
firms_data = fetch_firms_data_with_bbox(FIRMS_API_KEY, FIRMS_SOURCE, BOUNDING_BOX, DATE_RANGE)

if firms_data is not None and not firms_data.empty:
    print(f"\nSuccessfully fetched {len(firms_data)} FIRMS hotspots.")
    print("FIRMS data head:")
    print(firms_data.head())
    
    # Save to a temporary CSV for inspection (optional)
    firms_data.to_csv("firms_ca_data.csv", index=False)
    print(f"\nFIRMS data for {BOUNDING_BOX} saved to firms_ca_data.csv")


Fetching FIRMS data from: https://firms.modaps.eosdis.nasa.gov/api/area/csv/2eaecfb3056b7b7751771485eb481c51/VIIRS_NOAA20_NRT/-124.5,32.5,-114.0,42.0/2025-07-30,2025-07-31
No FIRMS data found for the specified period and area.


In [13]:
# ==============================================================================
# Corrected FIRMS API Request for California, USA
# This script fetches active fire data for California from a valid, past date range.
# ==============================================================================

# General imports
import requests
import pandas as pd
from io import StringIO
import os

# --- Configuration ---
# IMPORTANT: Replace "YOUR_FIRMS_API_KEY" with your actual key.
# The placeholder key '2eaecfb3056b7b7751771485eb481c51' will NOT work.
FIRMS_API_KEY = "2eaecfb3056b7b7751771485eb481c51"

# Base URL for the FIRMS API
BASE_URL = 'https://firms.modaps.eosdis.nasa.gov/api/area/csv/'

# We'll use VIIRS NOAA-20 for high-resolution data.
FIRMS_SOURCE = 'VIIRS_NOAA20_NRT'

# Define the bounding box for California, USA.
# Format: lon_min,lat_min,lon_max,lat_max
BOUNDING_BOX = "-124.5,32.5,-114.0,42.0"

# --- Corrected Date Range ---
# We are now using a date range from last year where fire data is available.
START_DATE = '2024-07-30'
END_DATE = '2024-07-31'
DATE_RANGE = f"{START_DATE},{END_DATE}"

def fetch_firms_data_with_bbox(api_key, source, bbox, date_range):
    """
    Fetches FIRMS active fire data for a specific bounding box and date range.
    """
    # Construct the URL with the bounding box and date range parameters.
    url = f"{BASE_URL}{api_key}/{source}/{bbox}/{date_range}"
    
    print(f"Fetching FIRMS data from: {url}")
    
    # Check for placeholder key and warn the user
    if api_key == "YOUR_FIRMS_API_KEY":
        print("\nERROR: You must replace 'YOUR_FIRMS_API_KEY' with your actual key.")
        return None
        
    try:
        response = requests.get(url)
        response.raise_for_status() # Raises an HTTPError for bad responses (4xx or 5xx)

        # FIRMS returns CSV data, which we read into a Pandas DataFrame
        data = pd.read_csv(StringIO(response.text))
        
        if data.empty:
            print("No FIRMS data found for the specified period and area.")
            return pd.DataFrame() # Return an empty DataFrame
        
        return data

    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error fetching FIRMS data: {e}")
        print("This could be due to an invalid key, incorrect URL format, or server issues.")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        print("Ensure you have a valid API key and internet access.")
        return None

# --- Test the FIRMS function ---
firms_data = fetch_firms_data_with_bbox(FIRMS_API_KEY, FIRMS_SOURCE, BOUNDING_BOX, DATE_RANGE)

if firms_data is not None and not firms_data.empty:
    print(f"\nSuccessfully fetched {len(firms_data)} FIRMS hotspots.")
    print("FIRMS data head:")
    print(firms_data.head())
    
    # Save to a temporary CSV for inspection (optional)
    firms_data.to_csv("firms_ca_data.csv", index=False)
    print(f"\nFIRMS data for {BOUNDING_BOX} saved to firms_ca_data.csv")


Fetching FIRMS data from: https://firms.modaps.eosdis.nasa.gov/api/area/csv/2eaecfb3056b7b7751771485eb481c51/VIIRS_NOAA20_NRT/-124.5,32.5,-114.0,42.0/2024-07-30,2024-07-31
No FIRMS data found for the specified period and area.


In [14]:
# ==============================================================================
# FIRMS API Request with a placeholder for your valid API key.
# This script fetches active fire data for California from a valid, past date range.
# ==============================================================================

# General imports
import requests
import pandas as pd
from io import StringIO
import os

# --- Configuration ---
# IMPORTANT: Replace "YOUR_FIRMS_API_KEY" with your actual key.
# This key is required for the NASA API to function.
FIRMS_API_KEY = "2eaecfb3056b7b7751771485eb481c51"

# Base URL for the FIRMS API
BASE_URL = 'https://firms.modaps.eosdis.nasa.gov/api/area/csv/'

# We'll use VIIRS NOAA-20 for high-resolution data.
FIRMS_SOURCE = 'VIIRS_NOAA20_NRT'

# Define the bounding box for California, USA.
# Format: lon_min,lat_min,lon_max,lat_max
BOUNDING_BOX = "-124.5,32.5,-114.0,42.0"

# --- Corrected Date Range ---
# Using a past date range from last year where fire data is available.
START_DATE = '2024-07-30'
END_DATE = '2024-07-31'
DATE_RANGE = f"{START_DATE},{END_DATE}"

def fetch_firms_data_with_bbox(api_key, source, bbox, date_range):
    """
    Fetches FIRMS active fire data for a specific bounding box and date range.
    """
    # Construct the URL with the bounding box and date range parameters.
    url = f"{BASE_URL}{api_key}/{source}/{bbox}/{date_range}"
    
    print(f"Fetching FIRMS data from: {url}")
    
    # Check for placeholder key and warn the user
    if api_key == "YOUR_FIRMS_API_KEY":
        print("\nERROR: You must replace 'YOUR_FIRMS_API_KEY' with your actual key.")
        return None
        
    try:
        response = requests.get(url)
        response.raise_for_status() # Raises an HTTPError for bad responses (4xx or 5xx)

        # FIRMS returns CSV data, which we read into a Pandas DataFrame
        data = pd.read_csv(StringIO(response.text))
        
        if data.empty:
            print("No FIRMS data found for the specified period and area.")
            return pd.DataFrame() # Return an empty DataFrame
        
        return data

    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error fetching FIRMS data: {e}")
        print("This could be due to an invalid key, incorrect URL format, or server issues.")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        print("Ensure you have a valid API key and internet access.")
        return None

# --- Test the FIRMS function ---
firms_data = fetch_firms_data_with_bbox(FIRMS_API_KEY, FIRMS_SOURCE, BOUNDING_BOX, DATE_RANGE)

if firms_data is not None and not firms_data.empty:
    print(f"\nSuccessfully fetched {len(firms_data)} FIRMS hotspots.")
    print("FIRMS data head:")
    print(firms_data.head())
    
    # Save to a temporary CSV for inspection (optional)
    firms_data.to_csv("firms_ca_data.csv", index=False)
    print(f"\nFIRMS data for {BOUNDING_BOX} saved to firms_ca_data.csv")


Fetching FIRMS data from: https://firms.modaps.eosdis.nasa.gov/api/area/csv/2eaecfb3056b7b7751771485eb481c51/VIIRS_NOAA20_NRT/-124.5,32.5,-114.0,42.0/2024-07-30,2024-07-31
No FIRMS data found for the specified period and area.


In [15]:
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>FIRMS API Key Tester</title>
    <script src="https://cdn.tailwindcss.com"></script>
    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap" rel="stylesheet">
    <style>
        body {
            font-family: 'Inter', sans-serif;
            background-color: #f3f4f6;
        }
        .message-box {
            position: fixed;
            top: 2rem;
            right: 2rem;
            max-width: 300px;
            background-color: #fff;
            border-left: 4px solid;
            padding: 1rem;
            box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
            border-radius: 0.5rem;
            transition: transform 0.3s ease-in-out, opacity 0.3s ease-in-out;
            transform: translateX(110%);
            opacity: 0;
            z-index: 1000;
        }
        .message-box.show {
            transform: translateX(0);
            opacity: 1;
        }
        .status-success { border-color: #10b981; }
        .status-error { border-color: #ef4444; }
        .status-warning { border-color: #f59e0b; }
    </style>
</head>
<body class="bg-gray-100 flex items-center justify-center min-h-screen p-4">

    <div class="bg-white rounded-lg shadow-xl p-8 max-w-lg w-full">
        <h1 class="text-3xl font-bold text-center text-gray-800 mb-6">FIRMS Key Tester</h1>
        <p class="text-gray-600 mb-6 text-center">
            Enter your FIRMS `MAP_KEY` and click the button to test if it's working and check the API status.
        </p>

        <!-- Input Section -->
        <div class="mb-4">
            <label for="apiKeyInput" class="block text-gray-700 font-semibold mb-2">Your FIRMS MAP_KEY</label>
            <input type="text" id="apiKeyInput" placeholder="Enter your key here..." class="w-full px-4 py-2 border border-gray-300 rounded-md focus:outline-none focus:ring-2 focus:ring-blue-500">
        </div>
        <div class="mb-6">
            <label for="dayRangeInput" class="block text-gray-700 font-semibold mb-2">Day Range (1-10)</label>
            <input type="number" id="dayRangeInput" value="1" min="1" max="10" class="w-full px-4 py-2 border border-gray-300 rounded-md focus:outline-none focus:ring-2 focus:ring-blue-500">
        </div>

        <!-- Button -->
        <button id="testButton" class="w-full bg-blue-600 hover:bg-blue-700 text-white font-bold py-3 px-4 rounded-md shadow-lg transition-colors duration-200">
            Test Key
        </button>

        <!-- Status and Data Display -->
        <div id="resultContainer" class="mt-8 p-6 bg-gray-50 rounded-lg border border-gray-200 hidden">
            <h2 class="text-xl font-bold text-gray-800 mb-4">API Response</h2>
            <div id="statusMessage" class="font-medium text-lg mb-2"></div>
            <pre id="responseData" class="bg-gray-200 p-4 rounded-md text-sm overflow-auto max-h-64"></pre>
        </div>
    </div>

    <!-- Message Box for Notifications -->
    <div id="messageBox" class="message-box">
        <p id="messageText" class="font-medium"></p>
    </div>

    <script>
        // DOM element references
        const apiKeyInput = document.getElementById('apiKeyInput');
        const dayRangeInput = document.getElementById('dayRangeInput');
        const testButton = document.getElementById('testButton');
        const resultContainer = document.getElementById('resultContainer');
        const statusMessage = document.getElementById('statusMessage');
        const responseData = document.getElementById('responseData');
        const messageBox = document.getElementById('messageBox');
        const messageText = document.getElementById('messageText');

        /**
         * Displays a temporary message box notification.
         * @param {string} message The message to display.
         * @param {string} type The type of message (success, error, warning).
         */
        const showMessage = (message, type) => {
            messageText.textContent = message;
            messageBox.className = `message-box show status-${type}`;
            setTimeout(() => {
                messageBox.classList.remove('show');
            }, 5000);
        };

        testButton.addEventListener('click', async () => {
            const apiKey = apiKeyInput.value.trim();
            const dayRange = dayRangeInput.value;
            const source = 'VIIRS_SNPP_NRT'; // Default source, can be changed
            const area = 'world'; // Default area for testing
            const url = `https://firms.modaps.eosdis.nasa.gov/api/area/csv/${apiKey}/${source}/${area}/${dayRange}`;

            if (!apiKey) {
                showMessage("Please enter your API key.", "error");
                return;
            }

            testButton.textContent = 'Testing...';
            testButton.disabled = true;
            resultContainer.classList.add('hidden');

            try {
                const response = await fetch(url);
                
                // Show the result container after the fetch
                resultContainer.classList.remove('hidden');

                if (response.ok) {
                    const data = await response.text();
                    statusMessage.textContent = 'Success! Your key is working.';
                    statusMessage.className = 'font-medium text-lg text-green-600 mb-2';
                    responseData.textContent = data.slice(0, 500) + '\n\n... (Data truncated for display)';
                    showMessage("Successfully fetched data.", "success");
                } else {
                    const errorText = await response.text();
                    statusMessage.className = 'font-medium text-lg text-red-600 mb-2';
                    
                    // Handle specific error codes
                    if (response.status === 429) {
                        statusMessage.textContent = 'Error: Rate Limit Exceeded (429)';
                        responseData.textContent = 'You have made too many requests. The FIRMS API has a limit of 5,000 transactions per 10-minute interval. Please wait and try again later.';
                        showMessage("Rate limit exceeded. Please wait.", "warning");
                    } else if (response.status === 403) {
                        statusMessage.textContent = 'Error: Forbidden (403)';
                        responseData.textContent = 'The API key provided is not valid or does not have permission. Please check your key for typos or request a new one from FIRMS.';
                        showMessage("Invalid API key. Check for typos.", "error");
                    } else {
                        statusMessage.textContent = `Error: ${response.status} ${response.statusText}`;
                        responseData.textContent = errorText;
                        showMessage(`API responded with an error: ${response.status}`, "error");
                    }
                }
            } catch (error) {
                statusMessage.textContent = 'Error: An unexpected error occurred.';
                statusMessage.className = 'font-medium text-lg text-red-600 mb-2';
                responseData.textContent = error.toString();
                showMessage("An unexpected network error occurred.", "error");
                console.error(error);
            } finally {
                testButton.textContent = 'Test Key';
                testButton.disabled = false;
            }
        });
    </script>
</body>
</html>


SyntaxError: invalid decimal literal (1982260478.py, line 16)

In [16]:
import React, { useState } from 'react';
import {
  Table,
  TableBody,
  TableCell,
  TableHead,
  TableHeader,
  TableRow,
} from "shadcn/ui";
import {
  Card,
  CardContent,
  CardHeader,
  CardTitle,
  CardDescription
} from "shadcn/ui";
import { Textarea } from "shadcn/ui";
import { Button } from "shadcn/ui";

// Define the main App component
function App() {
  // State to hold the raw CSV text from the textarea
  const [csvText, setCsvText] = useState(
    `latitude,longitude,bright_ti4,scan,track,acq_date,acq_time,satellite,instrument,confidence,version,bright_ti5,frp,daynight
65.62944,147.48901,333.06,0.45,0.63,2025-07-31,13,N,VIIRS,n,2.0NRT,298.21,12.32,D
65.63281,147.49579,367,0.45,0.63,2025-07-31,13,N,VIIRS,h,2.0NRT,298.07,10.08,D
65.63768,147.48698,352.09,0.45,0.63,2025-07-31,13,N,VIIRS,n,2.0NRT,297.46,10.08,D
66.11572,152.64148,327.93,0.34,0.56,2025-07-31,13,N,VIIRS,n,2.0NRT,296.66,6.21,D
66.11675,152.63954,352.49,0.34,0.56,2025-07-31,13,N,V
`
  );
  // State to hold the parsed data as an array of objects
  const [data, setData] = useState([]);
  // State to hold the column headers
  const [headers, setHeaders] = useState([]);
  // State to hold any error messages
  const [error, setError] = useState('');

  // Column descriptions for the explanatory section
  const columnDescriptions = [
    { name: 'latitude', description: 'The latitude of the fire detection.' },
    { name: 'longitude', description: 'The longitude of the fire detection.' },
    { name: 'bright_ti4', description: 'The brightness temperature in Kelvin of the T-4 channel (3.75 µm).' },
    { name: 'bright_ti5', description: 'The brightness temperature in Kelvin of the T-5 channel (11 µm).' },
    { name: 'confidence', description: 'The confidence level of the fire detection (low, nominal, high).' },
    { name: 'frp', description: 'Fire Radiative Power in Megawatts (MW). This indicates the intensity of the fire.' },
    { name: 'daynight', description: 'Whether the detection occurred during the day (D) or night (N).' },
  ];

  // Function to parse the CSV data
  const parseCsv = () => {
    setError(''); // Clear previous errors
    if (!csvText) {
      setError('Please paste some CSV data.');
      return;
    }

    try {
      const lines = csvText.trim().split('\n');
      if (lines.length < 2) {
        setError('Please provide at least a header and one row of data.');
        return;
      }

      // Split the header line by comma to get column names
      const newHeaders = lines[0].split(',').map(h => h.trim());
      setHeaders(newHeaders);

      // Process each data line
      const newData = lines.slice(1).map(line => {
        // Handle truncated lines by stopping the parsing
        if (line.split(',').length < newHeaders.length) {
          // You could also ignore incomplete lines or add an error, this approach just stops
          return null; 
        }
        const values = line.split(',');
        const rowObject = {};
        newHeaders.forEach((header, index) => {
          rowObject[header] = values[index] ? values[index].trim() : '';
        });
        return rowObject;
      }).filter(Boolean); // Remove any null values from incomplete lines

      setData(newData);
    } catch (e) {
      setError('Failed to parse CSV. Please check the format.');
      console.error(e);
    }
  };

  // Main component render
  return (
    <div className="min-h-screen bg-gray-100 dark:bg-gray-900 text-gray-900 dark:text-gray-100 p-8 flex flex-col items-center font-sans">
      <Card className="w-full max-w-4xl shadow-lg rounded-xl overflow-hidden mb-8 bg-white dark:bg-gray-800">
        <CardHeader className="bg-blue-600 dark:bg-blue-800 text-white p-6 rounded-t-xl">
          <CardTitle className="text-3xl font-bold">VIIRS Fire Data Viewer</CardTitle>
          <CardDescription className="text-blue-100 dark:text-blue-200 mt-2">Paste your satellite data below to view it in a structured table.</CardDescription>
        </CardHeader>
        <CardContent className="p-6 space-y-6">
          <Textarea
            value={csvText}
            onChange={(e) => setCsvText(e.target.value)}
            placeholder="Paste your CSV data here..."
            rows={10}
            className="w-full rounded-md border-gray-300 dark:border-gray-600 bg-gray-50 dark:bg-gray-700 focus:ring-blue-500 focus:border-blue-500 text-gray-900 dark:text-gray-100"
          />
          <div className="flex justify-center">
            <Button
              onClick={parseCsv}
              className="bg-blue-600 hover:bg-blue-700 text-white font-bold py-2 px-6 rounded-full shadow-lg transition duration-200 transform hover:scale-105"
            >
              Process Data
            </Button>
          </div>
          {error && (
            <div className="text-red-500 text-center mt-4">
              {error}
            </div>
          )}
        </CardContent>
      </Card>

      {data.length > 0 && (
        <Card className="w-full max-w-4xl shadow-lg rounded-xl overflow-hidden mt-8 bg-white dark:bg-gray-800">
          <CardHeader className="bg-gray-200 dark:bg-gray-700 p-4 border-b dark:border-gray-600">
            <CardTitle className="text-2xl font-semibold">Fire Detections</CardTitle>
          </CardHeader>
          <CardContent className="p-0 overflow-x-auto">
            <Table>
              <TableHeader>
                <TableRow className="bg-gray-100 dark:bg-gray-700">
                  {headers.map(header => (
                    <TableHead key={header} className="whitespace-nowrap font-bold text-gray-600 dark:text-gray-300">
                      {header}
                    </TableHead>
                  ))}
                </TableRow>
              </TableHeader>
              <TableBody>
                {data.map((row, rowIndex) => (
                  <TableRow key={rowIndex} className="hover:bg-gray-50 dark:hover:bg-gray-700 transition-colors">
                    {headers.map(header => (
                      <TableCell key={`${rowIndex}-${header}`} className="whitespace-nowrap text-sm text-gray-700 dark:text-gray-300">
                        {row[header]}
                      </TableCell>
                    ))}
                  </TableRow>
                ))}
              </TableBody>
            </Table>
          </CardContent>
        </Card>
      )}

      <Card className="w-full max-w-4xl shadow-lg rounded-xl overflow-hidden mt-8 bg-white dark:bg-gray-800">
        <CardHeader className="bg-gray-200 dark:bg-gray-700 p-4 border-b dark:border-gray-600">
          <CardTitle className="text-2xl font-semibold">Column Descriptions</CardTitle>
          <CardDescription className="text-gray-600 dark:text-gray-400">
            A quick guide to understanding the data columns.
          </CardDescription>
        </CardHeader>
        <CardContent className="p-6 space-y-4">
          <ul className="grid grid-cols-1 md:grid-cols-2 gap-4">
            {columnDescriptions.map((col, index) => (
              <li key={index} className="p-4 bg-gray-50 dark:bg-gray-700 rounded-md shadow-sm">
                <strong className="text-blue-600 dark:text-blue-400">{col.name}:</strong> {col.description}
              </li>
            ))}
          </ul>
        </CardContent>
      </Card>
    </div>
  );
}

export default App;


SyntaxError: leading zeros in decimal integer literals are not permitted; use an 0o prefix for octal integers (500140624.py, line 25)

In [17]:
import csv
from io import StringIO

# Your VIIRS fire data as a multi-line string.
# The StringIO object lets us treat this string like a file.
fire_data_string = """
latitude,longitude,bright_ti4,scan,track,acq_date,acq_time,satellite,instrument,confidence,version,bright_ti5,frp,daynight
65.62944,147.48901,333.06,0.45,0.63,2025-07-31,13,N,VIIRS,n,2.0NRT,298.21,12.32,D
65.63281,147.49579,367,0.45,0.63,2025-07-31,13,N,VIIRS,h,2.0NRT,298.07,10.08,D
65.63768,147.48698,352.09,0.45,0.63,2025-07-31,13,N,VIIRS,n,2.0NRT,297.46,10.08,D
66.11572,152.64148,327.93,0.34,0.56,2025-07-31,13,N,VIIRS,n,2.0NRT,296.66,6.21,D
"""

# Create a file-like object from the string data
csv_file = StringIO(fire_data_string.strip())

# Use the csv.reader to safely parse the data
# The reader handles all the complexities of CSV formatting for you.
csv_reader = csv.reader(csv_file, delimiter=',')

# Get the header row to use for keys in our dictionaries
headers = next(csv_reader)

# Process each row and store it in a list of dictionaries
fire_detections = []
for row in csv_reader:
    # Handle incomplete rows that might be at the end of the data
    if len(row) == len(headers):
        # Create a dictionary for each row for easy access
        row_dict = dict(zip(headers, row))
        fire_detections.append(row_dict)

# Print the parsed data to show that it worked
print("Parsed VIIRS Fire Detections:")
for detection in fire_detections:
    print(detection)

print("\n---")
print("Example: Accessing a specific value from the first detection.")
if fire_detections:
    first_detection = fire_detections[0]
    print(f"Latitude of the first detection: {first_detection['latitude']}")
    print(f"Confidence of the first detection: {first_detection['confidence']}")


Parsed VIIRS Fire Detections:
{'latitude': '65.62944', 'longitude': '147.48901', 'bright_ti4': '333.06', 'scan': '0.45', 'track': '0.63', 'acq_date': '2025-07-31', 'acq_time': '13', 'satellite': 'N', 'instrument': 'VIIRS', 'confidence': 'n', 'version': '2.0NRT', 'bright_ti5': '298.21', 'frp': '12.32', 'daynight': 'D'}
{'latitude': '65.63281', 'longitude': '147.49579', 'bright_ti4': '367', 'scan': '0.45', 'track': '0.63', 'acq_date': '2025-07-31', 'acq_time': '13', 'satellite': 'N', 'instrument': 'VIIRS', 'confidence': 'h', 'version': '2.0NRT', 'bright_ti5': '298.07', 'frp': '10.08', 'daynight': 'D'}
{'latitude': '65.63768', 'longitude': '147.48698', 'bright_ti4': '352.09', 'scan': '0.45', 'track': '0.63', 'acq_date': '2025-07-31', 'acq_time': '13', 'satellite': 'N', 'instrument': 'VIIRS', 'confidence': 'n', 'version': '2.0NRT', 'bright_ti5': '297.46', 'frp': '10.08', 'daynight': 'D'}
{'latitude': '66.11572', 'longitude': '152.64148', 'bright_ti4': '327.93', 'scan': '0.34', 'track': '0.

In [18]:
import requests
import time
import os

# --- Configuration Section ---
# Replace this with the MAP_KEY you received in your email
MAP_KEY = "2eaecfb3056b7b7751771485eb481c51"

# The satellite sensor you want to use (VIIRS_SNPP_NRT is recommended)
SOURCE = "VIIRS_SNPP_NRT"

# The geographic area to search for fires (use 'world' for the whole globe)
AREA_COORDINATES = "world"

# The number of days of data to fetch (1-10 recommended to avoid rate limits)
DAY_RANGE = 1

# The name of the file to save the data to
OUTPUT_FILENAME = "firms_fire_data.csv"

# --- Main Script ---

# Construct the full API URL using the configuration variables
api_url = f"https://firms.modaps.eosdis.nasa.gov/api/area/csv/{MAP_KEY}/{SOURCE}/{AREA_COORDINATES}/{DAY_RANGE}"

print(f"Attempting to fetch data from: {api_url}")
print("Please be patient, the request may take a few seconds...")

# Use a try-except block to handle potential network or request errors
try:
    # Send a GET request to the FIRMS API
    response = requests.get(api_url)

    # Check the HTTP status code of the response
    if response.status_code == 200:
        # The request was successful!
        print("Success! Data received.")

        # Save the content of the response to a local CSV file
        with open(OUTPUT_FILENAME, 'w', encoding='utf-8') as file:
            file.write(response.text)
        print(f"Data saved successfully to {OUTPUT_FILENAME}")

    elif response.status_code == 403:
        # Status code 403 means Forbidden, usually an issue with the key
        print("Error: The API key is invalid or permissions are incorrect.")
        print("Please double-check your MAP_KEY for any typos.")
        print("If the key is correct, you may need to request a new one.")

    elif response.status_code == 429:
        # Status code 429 means too many requests, a rate-limiting issue
        print("Error: Rate limit exceeded (429 Too Many Requests).")
        print("You have made too many requests in a short period.")
        print("Please wait for about 10 minutes and try again.")
        print("Consider reducing the DAY_RANGE to avoid this in the future.")
        
    else:
        # Handle any other unexpected status codes
        print(f"Error: Received an unexpected status code: {response.status_code}")
        print("Response content:")
        print(response.text)

except requests.exceptions.RequestException as e:
    # Handle network-related errors (e.g., no internet connection)
    print(f"A network error occurred: {e}")

print("\n--- Script finished ---")



Attempting to fetch data from: https://firms.modaps.eosdis.nasa.gov/api/area/csv/2eaecfb3056b7b7751771485eb481c51/VIIRS_SNPP_NRT/world/1
Please be patient, the request may take a few seconds...
Success! Data received.
Data saved successfully to firms_fire_data.csv

--- Script finished ---


In [19]:
import csv
import os

# The name of the file you saved in the previous step
INPUT_FILENAME = "firms_fire_data.csv"

# --- Main Script ---

if not os.path.exists(INPUT_FILENAME):
    print(f"Error: The file '{INPUT_FILENAME}' was not found.")
    print("Please run the data retriever script first to create this file.")
else:
    # Create an empty list to store our fire detection data
    fire_detections = []

    print(f"Reading data from '{INPUT_FILENAME}'...")

    # Use a 'with' statement to ensure the file is closed correctly
    with open(INPUT_FILENAME, 'r', encoding='utf-8') as csv_file:
        # Use csv.DictReader to automatically parse each row into a dictionary
        csv_reader = csv.DictReader(csv_file)

        # Loop through each row in the file and add it to our list
        for row in csv_reader:
            fire_detections.append(row)

    print(f"Successfully parsed {len(fire_detections)} fire detections.")
    print("---")
    
    # Let's see what the first detection looks like!
    if fire_detections:
        print("First detection data:")
        for key, value in fire_detections[0].items():
            print(f"  {key}: {value}")
        print("\n---")
        
        # Now you can easily access any piece of data by its column name
        first_lat = fire_detections[0]['latitude']
        first_lon = fire_detections[0]['longitude']
        print(f"Latitude of the first detection: {first_lat}")
        print(f"Longitude of the first detection: {first_lon}")
        
    else:
        print("The file was empty or contained no data rows.")
        


Reading data from 'firms_fire_data.csv'...
Successfully parsed 96380 fire detections.
---
First detection data:
  latitude: 65.62944
  longitude: 147.48901
  bright_ti4: 333.06
  scan: 0.45
  track: 0.63
  acq_date: 2025-07-31
  acq_time: 13
  satellite: N
  instrument: VIIRS
  confidence: n
  version: 2.0NRT
  bright_ti5: 298.21
  frp: 12.32
  daynight: D

---
Latitude of the first detection: 65.62944
Longitude of the first detection: 147.48901


In [20]:
import csv
import os
import folium

# --- Configuration Section ---
# The name of the CSV file created in the previous step
INPUT_FILENAME = "firms_fire_data.csv"
# The name of the HTML file that will contain your map
OUTPUT_MAP_FILE = "fire_hotspots_map.html"

# --- Main Script ---
# Check if the input file exists before trying to read it
if not os.path.exists(INPUT_FILENAME):
    print(f"Error: The file '{INPUT_FILENAME}' was not found.")
    print("Please make sure you have run the previous script to fetch the data.")
else:
    # Use a try-except block to handle potential data parsing errors
    try:
        fire_detections = []
        with open(INPUT_FILENAME, 'r', encoding='utf-8') as csv_file:
            # We use DictReader again to get data as dictionaries
            csv_reader = csv.DictReader(csv_file)
            
            # Loop through each row and prepare the data for visualization
            for row in csv_reader:
                # We need to convert latitude and longitude from strings to numbers (floats)
                # We also need to get the confidence and brightness
                # Folium requires numbers for coordinates
                try:
                    lat = float(row['latitude'])
                    lon = float(row['longitude'])
                    confidence = row['confidence']
                    bright_ti4 = float(row['bright_ti4'])
                    
                    # Store the processed data
                    fire_detections.append({
                        'lat': lat,
                        'lon': lon,
                        'confidence': confidence,
                        'brightness': bright_ti4
                    })
                except (ValueError, KeyError) as e:
                    print(f"Skipping a row due to a data error: {e}")
                    continue

        print(f"Loaded {len(fire_detections)} fire detections from the CSV file.")

        # Create the base map object
        # We start with a low zoom level to see the entire world
        # The center is set to (0, 0) for a global view
        world_map = folium.Map(location=[0, 0], zoom_start=2)

        # Loop through our processed fire detections and add markers to the map
        for detection in fire_detections:
            # We use a CircleMarker for each fire hotspot
            folium.CircleMarker(
                location=[detection['lat'], detection['lon']],
                radius=5,  # Size of the circle on the map
                color='red', # All circles will be red
                fill=True,
                fill_color='red',
                fill_opacity=0.7,
                tooltip=f"Confidence: {detection['confidence']}, Brightness: {detection['brightness']}"
            ).add_to(world_map)
            
        # Save the completed map as an HTML file
        world_map.save(OUTPUT_MAP_FILE)
        
        print(f"\nSuccess! An interactive map has been saved to '{OUTPUT_MAP_FILE}'.")
        print("You can now open this file in your web browser to view the fire hotspots.")

    except Exception as e:
        print(f"An unexpected error occurred during map creation: {e}")



ModuleNotFoundError: No module named 'folium'

In [1]:
import csv
import os
import folium

# --- Configuration Section ---
# The name of the CSV file created in the previous step
INPUT_FILENAME = "firms_fire_data.csv"
# The name of the HTML file that will contain your map
OUTPUT_MAP_FILE = "fire_hotspots_map.html"

# --- Main Script ---
# Check if the input file exists before trying to read it
if not os.path.exists(INPUT_FILENAME):
    print(f"Error: The file '{INPUT_FILENAME}' was not found.")
    print("Please make sure you have run the previous script to fetch the data.")
else:
    # Use a try-except block to handle potential data parsing errors
    try:
        fire_detections = []
        with open(INPUT_FILENAME, 'r', encoding='utf-8') as csv_file:
            # We use DictReader again to get data as dictionaries
            csv_reader = csv.DictReader(csv_file)
            
            # Loop through each row and prepare the data for visualization
            for row in csv_reader:
                # We need to convert latitude and longitude from strings to numbers (floats)
                # We also need to get the confidence and brightness
                # Folium requires numbers for coordinates
                try:
                    lat = float(row['latitude'])
                    lon = float(row['longitude'])
                    confidence = row['confidence']
                    bright_ti4 = float(row['bright_ti4'])
                    
                    # Store the processed data
                    fire_detections.append({
                        'lat': lat,
                        'lon': lon,
                        'confidence': confidence,
                        'brightness': bright_ti4
                    })
                except (ValueError, KeyError) as e:
                    print(f"Skipping a row due to a data error: {e}")
                    continue

        print(f"Loaded {len(fire_detections)} fire detections from the CSV file.")

        # Create the base map object
        # We start with a low zoom level to see the entire world
        # The center is set to (0, 0) for a global view
        world_map = folium.Map(location=[0, 0], zoom_start=2)

        # Loop through our processed fire detections and add markers to the map
        for detection in fire_detections:
            # We use a CircleMarker for each fire hotspot
            folium.CircleMarker(
                location=[detection['lat'], detection['lon']],
                radius=5,  # Size of the circle on the map
                color='red', # All circles will be red
                fill=True,
                fill_color='red',
                fill_opacity=0.7,
                tooltip=f"Confidence: {detection['confidence']}, Brightness: {detection['brightness']}"
            ).add_to(world_map)
            
        # Save the completed map as an HTML file
        world_map.save(OUTPUT_MAP_FILE)
        
        print(f"\nSuccess! An interactive map has been saved to '{OUTPUT_MAP_FILE}'.")
        print("You can now open this file in your web browser to view the fire hotspots.")

    except Exception as e:
        print(f"An unexpected error occurred during map creation: {e}")


Loaded 96380 fire detections from the CSV file.

Success! An interactive map has been saved to 'fire_hotspots_map.html'.
You can now open this file in your web browser to view the fire hotspots.


In [None]:
import csv
import os
import folium

# --- Configuration Section ---
# The name of the CSV file created in the previous step
INPUT_FILENAME = "firms_fire_data.csv"
# The name of the HTML file that will contain your map
OUTPUT_MAP_FILE = "fire_hotspots_map.html"

# --- Main Script ---
# Check if the input file exists before trying to read it
if not os.path.exists(INPUT_FILENAME):
    print(f"Error: The file '{INPUT_FILENAME}' was not found.")
    print("Please make sure you have run the previous script to fetch the data.")
else:
    # Use a try-except block to handle potential data parsing errors
    try:
        fire_detections = []
        with open(INPUT_FILENAME, 'r', encoding='utf-8') as csv_file:
            # We use DictReader again to get data as dictionaries
            csv_reader = csv.DictReader(csv_file)
            
            # Loop through each row and prepare the data for visualization
            for row in csv_reader:
                # We need to convert latitude and longitude from strings to numbers (floats)
                # We also need to get the confidence and brightness
                # Folium requires numbers for coordinates
                try:
                    lat = float(row['latitude'])
                    lon = float(row['longitude'])
                    confidence = row['confidence']
                    bright_ti4 = float(row['bright_ti4'])
                    
                    # Store the processed data
                    fire_detections.append({
                        'lat': lat,
                        'lon': lon,
                        'confidence': confidence,
                        'brightness': bright_ti4
                    })
                except (ValueError, KeyError) as e:
                    # If there's an issue with a single row, print a warning and continue
                    print(f"Warning: Skipping a row due to a data error: {e}. Row: {row}")
                    continue

        print(f"Loaded {len(fire_detections)} valid fire detections from the CSV file.")

        # If no valid detections were found, there's no point in creating a map
        if not fire_detections:
            print("Error: No valid fire detections were found in the CSV file.")
            print("The CSV may be empty or the data format is incorrect.")
        else:
            # Create the base map object
            # We start with a low zoom level to see the entire world
            # The center is set to the first fire detection's coordinates
            world_map = folium.Map(location=[fire_detections[0]['lat'], fire_detections[0]['lon']], zoom_start=2)

            # Loop through our processed fire detections and add markers to the map
            for detection in fire_detections:
                # We use a CircleMarker for each fire hotspot
                folium.CircleMarker(
                    location=[detection['lat'], detection['lon']],
                    radius=5,  # Size of the circle on the map
                    color='red', # All circles will be red
                    fill=True,
                    fill_color='red',
                    fill_opacity=0.7,
                    tooltip=f"Confidence: {detection['confidence']}, Brightness: {detection['brightness']}"
                ).add_to(world_map)
            
            # Save the completed map as an HTML file
            world_map.save(OUTPUT_MAP_FILE)
            
            print(f"\nSuccess! An interactive map has been saved to '{OUTPUT_MAP_FILE}'.")
            print("You can now open this file in your web browser to view the fire hotspots.")

    except Exception as e:
        print(f"An unexpected error occurred during map creation: {e}")


In [1]:
import csv
import os
import folium

# --- Configuration Section ---
# The name of the CSV file created in the previous step
INPUT_FILENAME = "firms_fire_data.csv"
# The name of the HTML file that will contain your map
OUTPUT_MAP_FILE = "fire_hotspots_map.html"

# --- Main Script ---
# Check if the input file exists before trying to read it
if not os.path.exists(INPUT_FILENAME):
    print(f"Error: The file '{INPUT_FILENAME}' was not found.")
    print("Please make sure you have run the previous script to fetch the data.")
else:
    # Use a try-except block to handle potential data parsing errors
    try:
        fire_detections = []
        with open(INPUT_FILENAME, 'r', encoding='utf-8') as csv_file:
            # We use DictReader again to get data as dictionaries
            csv_reader = csv.DictReader(csv_file)
            
            # Loop through each row and prepare the data for visualization
            for row in csv_reader:
                # We need to convert latitude and longitude from strings to numbers (floats)
                # We also need to get the confidence and brightness
                # Folium requires numbers for coordinates
                try:
                    lat = float(row['latitude'])
                    lon = float(row['longitude'])
                    confidence = row['confidence']
                    bright_ti4 = float(row['bright_ti4'])
                    
                    # Store the processed data
                    fire_detections.append({
                        'lat': lat,
                        'lon': lon,
                        'confidence': confidence,
                        'brightness': bright_ti4
                    })
                except (ValueError, KeyError) as e:
                    # If there's an issue with a single row, print a warning and continue
                    print(f"Warning: Skipping a row due to a data error: {e}. Row: {row}")
                    continue

        print(f"Loaded {len(fire_detections)} valid fire detections from the CSV file.")

        # If no valid detections were found, there's no point in creating a map
        if not fire_detections:
            print("Error: No valid fire detections were found in the CSV file.")
            print("The CSV may be empty or the data format is incorrect.")
        else:
            # Create the base map object
            # We start with a low zoom level to see the entire world
            # The center is set to the first fire detection's coordinates
            world_map = folium.Map(location=[fire_detections[0]['lat'], fire_detections[0]['lon']], zoom_start=2)

            # Loop through our processed fire detections and add markers to the map
            for detection in fire_detections:
                # We use a CircleMarker for each fire hotspot
                folium.CircleMarker(
                    location=[detection['lat'], detection['lon']],
                    radius=5,  # Size of the circle on the map
                    color='red', # All circles will be red
                    fill=True,
                    fill_color='red',
                    fill_opacity=0.7,
                    tooltip=f"Confidence: {detection['confidence']}, Brightness: {detection['brightness']}"
                ).add_to(world_map)
            
            # Save the completed map as an HTML file
            world_map.save(OUTPUT_MAP_FILE)
            
            print(f"\nSuccess! An interactive map has been saved to '{OUTPUT_MAP_FILE}'.")
            print("You can now open this file in your web browser to view the fire hotspots.")

    except Exception as e:
        print(f"An unexpected error occurred during map creation: {e}")


Loaded 96380 valid fire detections from the CSV file.

Success! An interactive map has been saved to 'fire_hotspots_map.html'.
You can now open this file in your web browser to view the fire hotspots.


In [2]:
import pandas as pd
import folium

# --- 1. Load the data from the CSV file ---
try:
    df = pd.read_csv('firms_fire_data.csv')
    print("Successfully loaded fire detections from 'fire_detections.csv'")
except FileNotFoundError:
    print("Error: The file 'fire_detections.csv' was not found. Please make sure it's in the same directory.")
    exit()

# --- 2. Get user input for the confidence filter ---
# This loop ensures the user enters a valid number between 0 and 100
while True:
    try:
        min_confidence = float(input("Enter the minimum confidence level (0-100) to filter the data: "))
        if 0 <= min_confidence <= 100:
            break
        else:
            print("Please enter a number between 0 and 100.")
    except ValueError:
        print("Invalid input. Please enter a number.")

# --- 3. Filter the DataFrame based on the user's input ---
# The confidence column is often named 'confidence' or 'Confidence'. Let's check both.
confidence_column = 'confidence' if 'confidence' in df.columns else 'Confidence'
filtered_df = df[df[confidence_column] >= min_confidence]

# Let's see how many detections we have after filtering
print(f"Filtered {len(filtered_df)} valid fire detections with confidence >= {min_confidence}.")

# --- 4. Create a new Folium map centered on the average location of the filtered fires ---
# Create a map centered at the average latitude and longitude of the filtered data
if not filtered_df.empty:
    center_lat = filtered_df['latitude'].mean()
    center_lon = filtered_df['longitude'].mean()
    m = folium.Map(location=[center_lat, center_lon], zoom_start=4, tiles='CartoDB dark_matter')
    print("Created a new interactive map.")
else:
    print("No detections found with the specified confidence level. Creating an empty map.")
    m = folium.Map(location=[0, 0], zoom_start=2, tiles='CartoDB dark_matter')


# --- 5. Add a marker for each filtered fire detection ---
for index, row in filtered_df.iterrows():
    # Create a small, colored circle marker for each detection
    folium.CircleMarker(
        location=[row['latitude'], row['longitude']],
        radius=3,  # Adjust the size of the marker
        color='red', # Use a bold color to make them stand out
        fill=True,
        fill_color='red',
        fill_opacity=0.7,
        # Add a tooltip with details
        tooltip=(
            f"Confidence: {row[confidence_column]:.2f}%<br>"
            f"Latitude: {row['latitude']}<br>"
            f"Longitude: {row['longitude']}"
        )
    ).add_to(m)

# --- 6. Save the new map to an HTML file ---
output_file = 'filtered_fire_hotspots_map.html'
m.save(output_file)
print(f"Success! A new interactive map with filtered data has been saved to '{output_file}'.")
print("You can now open this file in your web browser to view the fire hotspots.")


Error: The file 'fire_detections.csv' was not found. Please make sure it's in the same directory.


Enter the minimum confidence level (0-100) to filter the data:  90


NameError: name 'df' is not defined

In [1]:
import pandas as pd
import folium

# --- 1. Load the data from the CSV file ---
try:
    df = pd.read_csv('fire_detections.csv')
    print("Successfully loaded fire detections from 'fire_detections.csv'")
except FileNotFoundError:
    print("Error: The file 'fire_detections.csv' was not found. Please make sure it's in the same directory.")
    exit()

# --- 2. Get user input for the confidence filter ---
# This loop ensures the user enters a valid number between 0 and 100
while True:
    try:
        min_confidence = float(input("Enter the minimum confidence level (0-100) to filter the data: "))
        if 0 <= min_confidence <= 100:
            break
        else:
            print("Please enter a number between 0 and 100.")
    except ValueError:
        print("Invalid input. Please enter a number.")

# --- 3. Filter the DataFrame based on the user's input ---
# The confidence column is often named 'confidence' or 'Confidence'. Let's check both.
confidence_column = 'confidence' if 'confidence' in df.columns else 'Confidence'
filtered_df = df[df[confidence_column] >= min_confidence]

# Let's see how many detections we have after filtering
print(f"Filtered {len(filtered_df)} valid fire detections with confidence >= {min_confidence}.")

# --- 4. Create a new Folium map centered on the average location of the filtered fires ---
# Create a map centered at the average latitude and longitude of the filtered data
if not filtered_df.empty:
    center_lat = filtered_df['latitude'].mean()
    center_lon = filtered_df['longitude'].mean()
    m = folium.Map(location=[center_lat, center_lon], zoom_start=4, tiles='CartoDB dark_matter')
    print("Created a new interactive map.")
else:
    print("No detections found with the specified confidence level. Creating an empty map.")
    m = folium.Map(location=[0, 0], zoom_start=2, tiles='CartoDB dark_matter')


# --- 5. Add a marker for each filtered fire detection ---
for index, row in filtered_df.iterrows():
    # Create a small, colored circle marker for each detection
    folium.CircleMarker(
        location=[row['latitude'], row['longitude']],
        radius=3,  # Adjust the size of the marker
        color='red', # Use a bold color to make them stand out
        fill=True,
        fill_color='red',
        fill_opacity=0.7,
        # Add a tooltip with details
        tooltip=(
            f"Confidence: {row[confidence_column]:.2f}%<br>"
            f"Latitude: {row['latitude']}<br>"
            f"Longitude: {row['longitude']}"
        )
    ).add_to(m)

# --- 6. Save the new map to an HTML file ---
output_file = 'filtered_fire_hotspots_map.html'
m.save(output_file)
print(f"Success! A new interactive map with filtered data has been saved to '{output_file}'.")
print("You can now open this file in your web browser to view the fire hotspots.")


Error: The file 'fire_detections.csv' was not found. Please make sure it's in the same directory.


Enter the minimum confidence level (0-100) to filter the data:  50


NameError: name 'df' is not defined

In [None]:
import pandas as pd
import folium

# --- 1. Load the data from the CSV file ---
try:
    df = pd.read_csv('firms_fire_data.csv')
    print("Successfully loaded fire detections from 'fire_detections.csv'")
except FileNotFoundError:
    print("Error: The file 'fire_detections.csv' was not found. Please make sure it's in the same directory.")
    exit()

# --- 2. Get user input for the confidence filter ---
# This loop ensures the user enters a valid number between 0 and 100
while True:
    try:
        min_confidence = float(input("Enter the minimum confidence level (0-100) to filter the data: "))
        if 0 <= min_confidence <= 100:
            break
        else:
            print("Please enter a number between 0 and 100.")
    except ValueError:
        print("Invalid input. Please enter a number.")

# --- 3. Filter the DataFrame based on the user's input ---
# The confidence column is often named 'confidence' or 'Confidence'. Let's check both.
confidence_column = 'confidence' if 'confidence' in df.columns else 'Confidence'
filtered_df = df[df[confidence_column] >= min_confidence]

# Let's see how many detections we have after filtering
print(f"Filtered {len(filtered_df)} valid fire detections with confidence >= {min_confidence}.")

# --- 4. Create a new Folium map centered on the average location of the filtered fires ---
# Create a map centered at the average latitude and longitude of the filtered data
if not filtered_df.empty:
    center_lat = filtered_df['latitude'].mean()
    center_lon = filtered_df['longitude'].mean()
    m = folium.Map(location=[center_lat, center_lon], zoom_start=4, tiles='CartoDB dark_matter')
    print("Created a new interactive map.")
else:
    print("No detections found with the specified confidence level. Creating an empty map.")
    m = folium.Map(location=[0, 0], zoom_start=2, tiles='CartoDB dark_matter')


# --- 5. Add a marker for each filtered fire detection ---
for index, row in filtered_df.iterrows():
    # Create a small, colored circle marker for each detection
    folium.CircleMarker(
        location=[row['latitude'], row['longitude']],
        radius=3,  # Adjust the size of the marker
        color='red', # Use a bold color to make them stand out
        fill=True,
        fill_color='red',
        fill_opacity=0.7,
        # Add a tooltip with details
        tooltip=(
            f"Confidence: {row[confidence_column]:.2f}%<br>"
            f"Latitude: {row['latitude']}<br>"
            f"Longitude: {row['longitude']}"
        )
    ).add_to(m)

# --- 6. Save the new map to an HTML file ---
output_file = 'filtered_fire_hotspots_map.html'
m.save(output_file)
print(f"Success! A new interactive map with filtered data has been saved to '{output_file}'.")
print("You can now open this file in your web browser to view the fire hotspots.")


In [1]:
import pandas as pd
import numpy as np
import requests
import io
import os

# --- Configuration ---
# 1. Define the geographical area (Bounding Box for a part of California)
# Format: [lon_min, lat_min, lon_max, lat_max]
BOUNDING_BOX = [-123.0, 37.0, -120.0, 39.0]

# 2. Define the time period
START_DATE = "2023-08-01"
END_DATE = "2023-10-31"

# 3. Define the grid resolution (in degrees). 0.1 degrees is roughly 11km.
GRID_RESOLUTION = 0.1

# 4. NASA FIRMS API Key (You can get one, but for small requests it's not always needed)
# Using the public NRT (Near Real Time) source for this example.
# Map key for VIIRS S-NPP instrument.
FIRMS_MAP_KEY = "c1a20597933a6e9a667634351336a54d" 

# --- Step 1: Fetch Real Fire Data from NASA FIRMS ---
print("Step 1: Fetching real fire data from NASA FIRMS...")

# Construct the API URL
# Source: VIIRS S-NPP (a satellite instrument good for fire detection)
# Area: Bounding box, specified as W,S,E,N
# Date Range:
firms_url = (
    f"https://firms.modaps.eosdis.nasa.gov/api/area/csv/"
    f"{FIRMS_MAP_KEY}/VIIRS_SNPP_NRT/"
    f"{BOUNDING_BOX[0]},{BOUNDING_BOX[1]},{BOUNDING_BOX[2]},{BOUNDING_BOX[3]}/1/{START_DATE}"
)

# FIRMS API requires the end date to be specified via a separate parameter in the request if it's different from start date
# However, the simple daily API call gets data for the date specified up to the present.
# So we will get all data from START_DATE to today and then filter it.
# A more robust way for historical data is using their archive download tool, but for this deadline, the API is faster.

try:
    response = requests.get(firms_url)
    response.raise_for_status()  # Raise an exception for bad status codes
    
    # Read the CSV data directly into a pandas DataFrame
    fire_data_raw = pd.read_csv(io.StringIO(response.text))
    
    # Filter by date to ensure we are within the specified window
    fire_data_raw['acq_date'] = pd.to_datetime(fire_data_raw['acq_date'])
    fire_data = fire_data_raw[fire_data_raw['acq_date'] <= END_DATE].copy()
    
    print(f"Successfully fetched {len(fire_data)} fire hotspots.")
    # Keep only essential columns
    fire_data = fire_data[['latitude', 'longitude', 'acq_date']]
except requests.exceptions.RequestException as e:
    print(f"Error fetching data from FIRMS API: {e}")
    # Create an empty dataframe if the API fails, so the script can continue
    fire_data = pd.DataFrame(columns=['latitude', 'longitude', 'acq_date'])

# --- Step 2: Create the Geographical Grid ---
print("\nStep 2: Creating geographical grid...")

# Create arrays of longitude and latitude points
lon_points = np.arange(BOUNDING_BOX[0], BOUNDING_BOX[2], GRID_RESOLUTION)
lat_points = np.arange(BOUNDING_BOX[1], BOUNDING_BOX[3], GRID_RESOLUTION)

# Create the grid of all combinations of lat/lon
grid_lon, grid_lat = np.meshgrid(lon_points, lat_points)
grid_df = pd.DataFrame({
    'lon': grid_lon.ravel(),
    'lat': grid_lat.ravel()
})

print(f"Created a grid with {len(grid_df)} cells.")

# --- Step 3: Create a Master DataFrame with Simulated Data ---
print("\nStep 3: Creating master DataFrame and simulating weather/vegetation data...")

# Create a date range for our analysis
dates = pd.date_range(start=START_DATE, end=END_DATE, freq='D')

# Create the master DataFrame by crossing the grid with the dates
master_df = pd.MultiIndex.from_product([grid_df.index, dates], names=['grid_index', 'date']).to_frame(index=False)
master_df = master_df.merge(grid_df, left_on='grid_index', right_index=True).drop('grid_index', axis=1)

# **SIMULATE DATA**
# This is the part you'll replace with real data later.
# For now, we generate realistic-looking random data.
num_rows = len(master_df)

# Simulate temperature (with some seasonality)
days_since_start = (master_df['date'] - master_df['date'].min()).dt.days
seasonality = np.sin(2 * np.pi * days_since_start / 90) * 5  # Simple sine wave for seasonal effect
master_df['temperature_c'] = 25 + seasonality + np.random.uniform(-3, 3, num_rows)

# Simulate humidity
master_df['humidity_percent'] = 40 - seasonality * 2 + np.random.uniform(-10, 10, num_rows)
master_df['humidity_percent'] = np.clip(master_df['humidity_percent'], 10, 90)

# Simulate wind speed
master_df['wind_speed_kmh'] = np.random.uniform(5, 30, num_rows)

# Simulate Vegetation Dryness Index (NDVI) - lower values mean drier vegetation
master_df['ndvi'] = 0.5 - (days_since_start / 90) * 0.1 + np.random.uniform(-0.05, 0.05, num_rows)
master_df['ndvi'] = np.clip(master_df['ndvi'], 0.1, 0.8)

print("Simulated data generated.")

# --- Step 4: Integrate the Real Fire Data onto the Grid ---
print("\nStep 4: Integrating real fire data onto the grid...")

# To do this efficiently, we "digitize" the fire coordinates.
# This means we find which grid cell each fire incident belongs to.
lon_bins = np.arange(BOUNDING_BOX[0], BOUNDING_BOX[2] + GRID_RESOLUTION, GRID_RESOLUTION)
lat_bins = np.arange(BOUNDING_BOX[1], BOUNDING_BOX[3] + GRID_RESOLUTION, GRID_RESOLUTION)

if not fire_data.empty:
    # Find the index of the bin for each fire's lat/lon
    fire_data['lon_bin_index'] = np.digitize(fire_data['longitude'], bins=lon_bins) - 1
    fire_data['lat_bin_index'] = np.digitize(fire_data['latitude'], bins=lat_bins) - 1
    
    # Get the actual lon/lat of the grid cell
    fire_data['lon'] = lon_points[fire_data['lon_bin_index']]
    fire_data['lat'] = lat_points[fire_data['lat_bin_index']]

    # Group by the grid cell and date to count the number of fire hotspots
    fire_counts = fire_data.groupby(['lat', 'lon', 'acq_date']).size().reset_index(name='fire_count')
    fire_counts.rename(columns={'acq_date': 'date'}, inplace=True)
    
    # Merge the fire counts into our master dataframe
    master_df = pd.merge(master_df, fire_counts, on=['lat', 'lon', 'date'], how='left')
    master_df['fire_count'].fillna(0, inplace=True)
else:
    # If no fire data was fetched, create the column with all zeros
    master_df['fire_count'] = 0

print("Fire data integrated.")

# --- Step 5: Engineer the Target Variable ---
print("\nStep 5: Engineering the target variable 'fire_in_next_7_days'...")

# Our target: a binary flag that is 1 if a fire occurred in the *next* 7 days for that grid cell.
# This is what the model will try to predict.

# First, create a simple 'fire_today' flag
master_df['fire_today'] = (master_df['fire_count'] > 0).astype(int)

# Sort the data by grid cell and then by date. This is crucial for the next step.
master_df.sort_values(['lat', 'lon', 'date'], inplace=True)

# Use a rolling window on future data.
# We group by each grid cell ('lat', 'lon') and look ahead.
# We shift the 'fire_today' signal backwards in time.
# A rolling max over a window of 7 will be 1 if any value in that window is 1.
master_df['fire_in_next_7_days'] = master_df.groupby(['lat', 'lon'])['fire_today'].transform(
    lambda x: x.shift(-7).rolling(window=7, min_periods=1).max()
).fillna(0).astype(int)

print("Target variable engineered.")


# --- Step 6: Final Cleanup and Save ---
print("\nStep 6: Cleaning up and saving the final preprocessed data...")

final_df = master_df.drop(columns=['fire_count', 'fire_today'])

# The last 7 days for each grid cell will have NaNs for the target because we can't see into the future.
# In a real scenario, you'd just drop these rows as they can't be used for training.
final_df.dropna(subset=['fire_in_next_7_days'], inplace=True)

# Final check on column types
final_df = final_df.round(4) # Round floats for cleanliness

# Save to a CSV file
output_filename = 'preprocessed_wildfire_data.csv'
final_df.to_csv(output_filename, index=False)

print(f"\n✅ Success! Preprocessed data saved to '{output_filename}'")
print(f"The dataset has {len(final_df)} rows and {len(final_df.columns)} columns.")
print("Columns:", final_df.columns.tolist())
print("\nSample of the final data:")
print(final_df.head())


Step 1: Fetching real fire data from NASA FIRMS...


KeyError: 'acq_date'

In [2]:
import pandas as pd
import io
import requests
from datetime import datetime, timedelta

# --- Your Settings ---
# I'll use a sample location and date range for this example.
# Replace these with your actual area of interest.
YOUR_API_KEY = '2eaecfb3056b7b7751771485eb481c51' # Get one here: https://firms.modaps.eosdis.nasa.gov/api/api_key/
REGION = 'USA' # Or use a bounding box like '-125,30,-110,45' for the West Coast
DAYS_OF_DATA = 30 # How many days of data to fetch

# --- Code ---

print("Step 1: Preparing to fetch real fire data from NASA FIRMS...")

# Calculate date range
END_DATE = datetime.now().strftime('%Y-%m-%d')
START_DATE = (datetime.now() - timedelta(days=DAYS_OF_DATA)).strftime('%Y-%m-%d')

# Construct the URL for the MODIS instrument data in CSV format
# Using MODIS C6.1 for this example. You can change to VIIRS if needed.
url = (f'https://firms.modaps.eosdis.nasa.gov/api/area/csv/'
       f'{YOUR_API_KEY}/MODIS_NRT/{REGION}/1/{START_DATE}')

try:
    response = requests.get(url)
    response.raise_for_status() # This will raise an error if the request failed (e.g., 404, 500)

    # Check if the response is empty or contains an error message
    if not response.text or "Error" in response.text:
        print("Error: Received an empty response or an error message from the API.")
        print("API Response:", response.text)
        # Exit or handle the error appropriately
        exit()

    print("Successfully received data from NASA FIRMS.")

    # Use io.StringIO to read the text response as if it were a file
    fire_data_raw = pd.read_csv(io.StringIO(response.text))

    # --- DEBUGGING STEP ---
    # Let's see what we actually got. This is the most important part.
    print("\n--- Debugging Info ---")
    print("Columns available in the DataFrame:")
    print(fire_data_raw.columns)
    print("\nFirst 5 rows of the data:")
    print(fire_data_raw.head())
    print("----------------------\n")


    # --- YOUR CODE CONTINUES HERE ---
    # Now, look at the output above and find the correct column name for the date.
    # It is most likely 'acq_date', but if it's different, change it in the line below.
    
    # !! IMPORTANT !!
    # !! CHANGE 'acq_date' BELOW IF THE DEBUG OUTPUT SHOWS A DIFFERENT NAME !!
    CORRECT_DATE_COLUMN = 'acq_date' 

    print(f"Processing data using the column name: '{CORRECT_DATE_COLUMN}'")

    # Convert the date column to datetime objects
    fire_data_raw[CORRECT_DATE_COLUMN] = pd.to_datetime(fire_data_raw[CORRECT_DATE_COLUMN])

    # Filter by date to ensure we are within the specified window
    END_DATE_DT = pd.to_datetime(END_DATE)
    fire_data = fire_data_raw[fire_data_raw[CORRECT_DATE_COLUMN] <= END_DATE_DT].copy()

    # Select and rename columns for clarity
    # Common columns are: latitude, longitude, brightness, frp (Fire Radiative Power)
    # Check your debug output for the exact names.
    fire_data_processed = fire_data[['latitude', 'longitude', CORRECT_DATE_COLUMN, 'brightness', 'frp']].copy()
    fire_data_processed.rename(columns={CORRECT_DATE_COLUMN: 'date'}, inplace=True)


    print(f"Successfully processed {len(fire_data_processed)} fire hotspots.")
    print("Final preprocessed data sample:")
    print(fire_data_processed.head())

    # You can now save this to a CSV for your guide
    # fire_data_processed.to_csv('preprocessed_fire_data.csv', index=False)
    # print("\nPreprocessed data saved to 'preprocessed_fire_data.csv'")


except requests.exceptions.HTTPError as http_err:
    print(f"HTTP error occurred: {http_err}")
    print("Please check your API key, region, and the date format.")
except KeyError as e:
    print(f"KeyError: {e}. One of the columns was not found.")
    print("Please check the column names printed in the 'Debugging Info' section and correct them in the script.")
except Exception as err:
    print(f"An other error occurred: {err}")

Step 1: Preparing to fetch real fire data from NASA FIRMS...
Successfully received data from NASA FIRMS.

--- Debugging Info ---
Columns available in the DataFrame:
Index(['Invalid area. Expects: [west', 'south', 'east', 'north].'], dtype='object')

First 5 rows of the data:
Empty DataFrame
Columns: [Invalid area. Expects: [west, south, east, north].]
Index: []
----------------------

Processing data using the column name: 'acq_date'
KeyError: 'acq_date'. One of the columns was not found.
Please check the column names printed in the 'Debugging Info' section and correct them in the script.
