In [None]:
# Install required packages for time series analysis
# !pip install pystac-client pandas matplotlib seaborn statsmodels pymannkendall

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pystac_client import Client
from pystac_client.stac_api_io import StacApiIO
from datetime import datetime, timedelta
from statsmodels.tsa.seasonal import seasonal_decompose
import pymannkendall as mk
import warnings
from concurrent.futures import ThreadPoolExecutor, as_completed
from types import SimpleNamespace
import time
import os
from getpass import getpass
import json

# Suppress warnings
warnings.filterwarnings('ignore')

# Set plot style
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)



In [2]:
# Connect to Montandon STAC API with Authentication
STAC_API_URL = "https://montandon-eoapi-stage.ifrc.org/"

# Get authentication token
# Option 1: From environment variable (recommended for automation)
api_token = os.getenv('MONTANDON_API_TOKEN')

# Option 2: Prompt user for token if not in environment
if api_token is None:
    print("=" * 70)
    print("AUTHENTICATION REQUIRED")
    print("=" * 70)
    print("\nThe Montandon STAC API requires a Bearer Token for authentication.")
    print("\nHow to get your token:")
    print("  1. Visit: https://goadmin-stage.ifrc.org/")
    print("  2. Log in with your IFRC credentials")
    print("  3. Generate an API token from your account settings")
    print("\nAlternatively, set the MONTANDON_API_TOKEN environment variable:")
    print("  PowerShell: $env:MONTANDON_API_TOKEN = 'your_token_here'")
    print("  Bash: export MONTANDON_API_TOKEN='your_token_here'")
    print("\n" + "=" * 70)
    
    # Prompt for token (hidden input)
    api_token = getpass("Enter your Montandon API Token: ")
    
    if not api_token or api_token.strip() == "":
        raise ValueError("API token is required to access the Montandon STAC API")

# Create authentication headers
auth_headers = {"Authorization": f"Bearer {api_token}"}

# Connect to STAC API with authentication
try:
    client = Client.open(STAC_API_URL, headers=auth_headers)
    print(f"\n[OK] Connected to: {STAC_API_URL}")
    print(f"[OK] API Title: {client.title}")
    print(f"[OK] Authentication: Bearer Token (OpenID Connect)")
    print(f"[OK] Auth Provider: https://goadmin-stage.ifrc.org/o/.well-known/openid-configuration")
except Exception as e:
    print(f"\n[ERROR] Authentication failed: {e}")
    print("\nPlease check:")
    print("  1. Your token is valid and not expired")
    print("  2. You have the correct permissions")
    print("  3. The API endpoint is accessible")
    raise

AUTHENTICATION REQUIRED

The Montandon STAC API requires a Bearer Token for authentication.

How to get your token:
  1. Visit: https://goadmin-stage.ifrc.org/
  2. Log in with your IFRC credentials
  3. Generate an API token from your account settings

Alternatively, set the MONTANDON_API_TOKEN environment variable:
  PowerShell: $env:MONTANDON_API_TOKEN = 'your_token_here'
  Bash: export MONTANDON_API_TOKEN='your_token_here'


[OK] Connected to: https://montandon-eoapi-stage.ifrc.org/
[OK] API Title: stac-fastapi
[OK] Authentication: Bearer Token (OpenID Connect)
[OK] Auth Provider: https://goadmin-stage.ifrc.org/o/.well-known/openid-configuration

[OK] Connected to: https://montandon-eoapi-stage.ifrc.org/
[OK] API Title: stac-fastapi
[OK] Authentication: Bearer Token (OpenID Connect)
[OK] Auth Provider: https://goadmin-stage.ifrc.org/o/.well-known/openid-configuration


## Data Fetching Helper with CQL2 Date Filter

Using CQL2 JSON filter for efficient date-based queries (last 10 years).

In [3]:
# Define time range for analysis (20 years may be too large - will auto-split by year)
from dateutil.relativedelta import relativedelta

END_DATE = datetime(2024, 6, 30)  # June 30, 2024 (valid date)
START_DATE = END_DATE - relativedelta(years=10)  # 10 years ago (more manageable)

# Format dates for CQL2 filter
START_DATE_STR = START_DATE.strftime('%Y-%m-%dT00:00:00Z')
END_DATE_STR = END_DATE.strftime('%Y-%m-%dT23:59:59Z')

print(f"Analysis period: {START_DATE_STR} to {END_DATE_STR}")
print(f"Note: If response is too large, data will be fetched year-by-year automatically.")

Analysis period: 2014-06-30T00:00:00Z to 2024-06-30T23:59:59Z
Note: If response is too large, data will be fetched year-by-year automatically.


In [4]:
# Bounding Boxes (approximate): [min_lon, min_lat, max_lon, max_lat]
# Europe: -10 (W) to 40 (E), 35 (S) to 71 (N)
# EUROPE_BBOX = [-10, 35, 40, 71]

# South Asia (Pakistan, India, Bangladesh, and surrounding regions)
# Covers: 60 (W) to 100 (E), 5 (S) to 37 (N)
SOUTH_ASIA_BBOX = [60, 5, 100, 37]

# Set the active bounding box for analysis
ACTIVE_BBOX = SOUTH_ASIA_BBOX  # Change to EUROPE_BBOX for Europe analysis

def build_datetime_filter(start_date=START_DATE_STR, end_date=END_DATE_STR):
    """
    Build CQL2 JSON filter for datetime range.
    """
    return {
        "op": "t_intersects",
        "args": [
            {"property": "datetime"},
            {"interval": [start_date, end_date]}
        ]
    }

def build_bbox_filter(bbox=ACTIVE_BBOX):
    """
    Build CQL2 JSON filter for bounding box (spatial filter).
    bbox format: [min_lon, min_lat, max_lon, max_lat]
    """
    return {
        "op": "s_intersects",
        "args": [
            {"property": "geometry"},
            {
                "type": "Polygon",
                "coordinates": [[
                    [bbox[0], bbox[1]],
                    [bbox[2], bbox[1]],
                    [bbox[2], bbox[3]],
                    [bbox[0], bbox[3]],
                    [bbox[0], bbox[1]]
                ]]
            }
        ]
    }

def build_combined_filter(start_date=START_DATE_STR, end_date=END_DATE_STR, bbox=ACTIVE_BBOX):
    """
    Build combined CQL2 filter for both datetime and spatial (bbox) constraints.
    """
    return {
        "op": "and",
        "args": [
            build_datetime_filter(start_date, end_date),
            build_bbox_filter(bbox)
        ]
    }



In [5]:
def search_stac_direct(collections, bbox=None, 
                       datetime_range=None, 
                       cql2_filter=None, limit=5000,
                       max_retries=3, retry_delay=2):
    """
    Search STAC API using client._stac_io for authenticated access.
    This bypasses pystac_client's internal link resolution which causes errors.
    
    Includes retry logic with exponential backoff for 
    handling intermittent errors.
    
    Parameters:
    -----------
    collections : list
        Collection IDs to search
    bbox : list
        Bounding box [min_lon, min_lat, max_lon, max_lat]
    datetime_range : str
        ISO 8601 datetime range string 
        (e.g., '2024-01-01/2024-12-31')
    cql2_filter : dict
        CQL2 filter body (optional, may not be supported)
    limit : int
        Maximum number of results
    max_retries : int
        Maximum number of retry attempts (default: 3)
    retry_delay : int
        Base delay in seconds between retries (exponential backoff applied)
        
    Returns:
    --------
    list of SimpleNamespace objects
        Items with .id, .collection_id, .properties (dict), .geometry
    """
    search_url = f"{STAC_API_URL}search"
    
    # Build search payload using standard STAC search parameters
    search_payload = {"limit": limit}
    
    if collections:
        search_payload["collections"] = collections
    if bbox:
        search_payload["bbox"] = bbox
    if datetime_range:
        search_payload["datetime"] = datetime_range
    
    # Only add CQL2 filter if explicitly provided (may cause errors)
    if cql2_filter:
        search_payload["filter"] = cql2_filter
        search_payload["filter-lang"] = "cql2-json"
    
    # Retry logic with exponential backoff
    last_error = None
    for attempt in range(max_retries):
        try:
            # Use client._stac_io.request() for authenticated POST request
            response_text = client._stac_io.request(
                search_url,
                method="POST",
                headers={"Content-Type": "application/json"},
                parameters=search_payload
            )
            
            # Parse the JSON response
            response_data = json.loads(response_text)
            
            # Convert features to SimpleNamespace objects for attribute access
            items = []
            for feature in response_data.get("features", []):
                item = SimpleNamespace(
                    id=feature.get("id"),
                    collection_id=feature.get("collection"),
                    geometry=feature.get("geometry"),
                    bbox=feature.get("bbox"),
                    properties=feature.get("properties", {}),
                    links=feature.get("links", []),
                    assets=feature.get("assets", {})
                )
                items.append(item)
            
            return items
            
        except Exception as e:
            last_error = e
            error_msg = str(e)
            
            # Don't retry for certain error types
            if "ProgramLimitExceededError" in error_msg:
                raise  # Let the caller handle this
            
            # Retry for server errors and timeouts
            if attempt < max_retries - 1:
                wait_time = retry_delay * (2 ** attempt)  # Exponential backoff
                print(f"      ‚ö†Ô∏è  Attempt {attempt + 1} failed: {error_msg[:50]}... Retrying in {wait_time}s")
                time.sleep(wait_time)
            else:
                raise  # Re-raise on final attempt
    
    raise last_error

def fetch_items_with_filter(collections, bbox=None, datetime_range=None, cql2_filter=None, limit=5000):
    """
    Fetch items from specified collections using standard STAC search parameters.
    Uses client._stac_io for authenticated API access with retry logic.
    """
    print(f"Fetching items from {collections}...")
    
    if datetime_range:
        print(f"   Datetime: {datetime_range}")
    if bbox:
        print(f"   Bbox: {bbox}")
    
    try:
        items = search_stac_direct(collections, bbox=bbox, datetime_range=datetime_range, 
                                    cql2_filter=cql2_filter, limit=limit)
        print(f"‚úÖ Found {len(items)} items.")
        return items
    except Exception as e:
        error_msg = str(e)
        
        # Handle "ProgramLimitExceededError" - response too large
        if "ProgramLimitExceededError" in error_msg or "exceeds the maximum" in error_msg:
            print("Response too large! Splitting request by year...")
            return fetch_items_by_year(collections, bbox, datetime_range)
        
        # Handle TimeoutError
        elif "TimeoutError" in error_msg or "timeout" in error_msg.lower():
            print("TIMEOUT: Retrying with smaller limit (2000)...")
            try:
                items = search_stac_direct(collections, bbox=bbox, datetime_range=datetime_range,
                                           cql2_filter=cql2_filter, limit=2000)
                print(f"‚úÖ Found {len(items)} items (with reduced limit).")
                return items
            except Exception as e2:
                print(f"Error even with reduced limit: {str(e2)}")
                return []
        else:
            print(f"Error fetching items: {error_msg}")
            return []

print("Search functions initialized (with retry logic and rate limiting)")

Search functions initialized (with retry logic and rate limiting)


In [6]:
def fetch_items_by_year_paginated(collections, bbox=None, 
                                   start_year=None, end_year=None, 
                                   page_size=100, 
                                   delay_between_requests=0.5,
                                   max_pages_per_year=100):
    """
    Fetch ALL items year-by-year with PAGINATION for maximum reliability.
    
    This function fetches ALL records for each year by making multiple
    paginated requests until no more data is available.
    
    Parameters:
    -----------
    collections : list
        Collection IDs to search
    bbox : list
        Bounding box [min_lon, min_lat, max_lon, max_lat]
    start_year : int
        Starting year (defaults to START_DATE.year)
    end_year : int
        Ending year (defaults to END_DATE.year)
    page_size : int
        Records per API request/page (default: 100 - small for reliability)
    delay_between_requests : float
        Delay in seconds between API requests (default: 0.5s)
    max_pages_per_year : int
        Safety limit to prevent infinite loops (default: 100 = 10,000 records max per year)
    
    Returns:
    --------
    list of SimpleNamespace objects
        All items fetched with pagination
    """
    all_items = []
    
    if start_year is None:
        start_year = START_DATE.year
    if end_year is None:
        end_year = END_DATE.year
    
    print(f"   Fetching year-by-year WITH PAGINATION ({start_year}-{end_year})")
    print(f"   Page size: {page_size} records per request")
    
    for current_year in range(start_year, end_year + 1):
        # Create datetime range for this year
        year_start_str = f"{current_year}-01-01T00:00:00Z"
        year_end_str = f"{current_year}-12-31T23:59:59Z"
        datetime_range = f"{year_start_str}/{year_end_str}"
        
        print(f"     {current_year}: ", end="", flush=True)
        
        year_items = []
        page_num = 0
        
        try:
            # Keep fetching pages until we get fewer items than page_size
            while page_num < max_pages_per_year:
                page_items = search_stac_direct(
                    collections, 
                    bbox=bbox, 
                    datetime_range=datetime_range, 
                    limit=page_size
                )
                
                if not page_items:
                    # No more items
                    break
                
                year_items.extend(page_items)
                page_num += 1
                
                # If we got fewer items than page_size, we've reached the end
                if len(page_items) < page_size:
                    break
                
                # Rate limiting between pages
                time.sleep(delay_between_requests)
            
            all_items.extend(year_items)
            
            if page_num > 1:
                print(f"‚úÖ {len(year_items)} ({page_num} pages)")
            else:
                print(f"‚úÖ {len(year_items)}")
            
        except Exception as e:
            # Keep partial data if we got some
            if year_items:
                all_items.extend(year_items)
                print(f"‚ö†Ô∏è {len(year_items)} partial ({str(e)[:25]}...)")
            else:
                print(f"‚ö†Ô∏è 0 ({str(e)[:30]}...)")
        
        # Rate limiting between years
        time.sleep(delay_between_requests)
    
    print(f"   Total items fetched: {len(all_items)}")
    return all_items


def items_to_dataframe(items):
    """Convert STAC items (SimpleNamespace objects) to a pandas DataFrame."""
    data = []
    for item in items:
        props = item.properties
        hazard_codes = props.get("monty:hazard_codes", [])
        country_codes = props.get("monty:country_codes", [])
        primary_country = country_codes[0] if country_codes else "Unknown"
        entry = {
            "id": item.id,
            "datetime": pd.to_datetime(props.get("datetime") or props.get("start_datetime")),
            "title": props.get("title"),
            "hazard_codes": hazard_codes,
            "primary_country": primary_country,
            "collection": item.collection_id
        }
        data.append(entry)
    df = pd.DataFrame(data)
    if not df.empty:
        df.set_index("datetime", inplace=True)
        df.sort_index(inplace=True)
    return df

print("Helper functions initialized (year-by-year with PAGINATION, page_size=100)")

Helper functions initialized (year-by-year with PAGINATION, page_size=100)


## Case Study 2.1: Seasonality Patterns by Hazard Type

We analyze seasonality of **Floods** and **Tropical Cyclones**.

**Hypothesis:**
- Tropical Cyclones show distinct seasonal peaks (hurricane season).
- Floods show seasonality linked to monsoon seasons.

In [None]:
# Fetch Data for Seasonality Analysis (last 10 years, South Asia) - WITH PAGINATION
collections_of_interest = ["gdacs-events", "emdat-events", "glide-events"]

print("=" * 70)
print("FETCHING EVENTS DATA (YEAR-BY-YEAR WITH PAGINATION)")
print("=" * 70)
print(f"Collections: {collections_of_interest}")
print(f"Region: South Asia (bbox: {ACTIVE_BBOX})")
print(f"Time Range: {START_DATE.year} to {END_DATE.year}")
print(f"Page size: 100 records per request (fetches ALL pages per year)")

start_time = time.time()

all_events = []
collection_stats = {}

# Fetch each collection year-by-year with pagination (SEQUENTIAL)
for collection in collections_of_interest:
    print(f"\nüì¶ {collection}")
    
    try:
        # Use paginated function to get ALL records
        collection_items = fetch_items_by_year_paginated(
            [collection],
            bbox=ACTIVE_BBOX,
            start_year=START_DATE.year,
            end_year=END_DATE.year,
            page_size=100,  # 100 records per page (will paginate for more)
            delay_between_requests=0.5
        )
        all_events.extend(collection_items)
        collection_stats[collection] = len(collection_items)
    except Exception as e:
        collection_stats[collection] = 0
        print(f"   ‚ö†Ô∏è Failed: {str(e)[:50]}...")
    
    time.sleep(1.0)  # Delay between collections

elapsed_time = time.time() - start_time

print(f"\n{'=' * 70}")
print(f"‚úÖ Fetch complete in {elapsed_time:.1f}s")
print(f"Total: {len(all_events)} items")
for coll, count in collection_stats.items():
    print(f"  {coll}: {count}")

# Convert to dataframe
df_events = items_to_dataframe(all_events)
print(f"\nDataFrame: {df_events.shape[0]} rows, {df_events.shape[1]} columns")

if not df_events.empty:
    display(df_events.head())
    print(f"\nDate range: {df_events.index.min()} to {df_events.index.max()}")
    print(f"Countries: {df_events['primary_country'].nunique()}")
print("=" * 70)

FETCHING EVENTS DATA (YEAR-BY-YEAR WITH PAGINATION)
Collections: ['gdacs-events', 'emdat-events', 'glide-events']
Region: South Asia (bbox: [60, 5, 100, 37])
Time Range: 2014 to 2024
Page size: 100 records per request (fetches ALL pages per year)

üì¶ gdacs-events
   Fetching year-by-year WITH PAGINATION (2014-2024)
   Page size: 100 records per request
     2014: ‚úÖ 20
‚úÖ 20
     2015:      2015: ‚úÖ 14
‚úÖ 14
     2016:      2016: ‚úÖ 13
‚úÖ 13
     2017:      2017: ‚úÖ 0
‚úÖ 0
     2018:      2018: ‚úÖ 0
‚úÖ 0
     2019:      2019: ‚úÖ 23
‚úÖ 23
     2020:      2020: ‚úÖ 46
‚úÖ 46
     2021:      2021: ‚úÖ 10000 (100 pages)
‚úÖ 10000 (100 pages)
     2022:      2022: ‚úÖ 10000 (100 pages)
‚úÖ 10000 (100 pages)
     2023:      2023: ‚úÖ 10000 (100 pages)
‚úÖ 10000 (100 pages)
     2024:      2024: ‚úÖ 60
‚úÖ 60
   Total items fetched: 30176
   Total items fetched: 30176

üì¶ emdat-events
   Fetching year-by-year WITH PAGINATION (2014-2024)
   Page size: 100 records per request
  

In [None]:
# Filter for Specific Hazards
# Including GLIDE, EM-DAT, and UNDRR-ISC format codes

def filter_hazard(df, codes_list):
    mask = df['hazard_codes'].apply(lambda x: any(code in x for code in codes_list))
    return df[mask]

# CORRECTED hazard codes per official documentation
flood_codes = ["FL", "FF", "MH0600", "MH0601", "MH0602", "MH0603", "MH0604",
               "nat-hyd-flo-flo", "nat-hyd-flo-fla", "nat-hyd-flo-riv", "nat-hyd-flo-coa"]
cyclone_codes = ["TC", "MH0306", "MH0307", "MH0308", "MH0309", "nat-met-sto-tro"]

df_floods = filter_hazard(df_events, flood_codes)
df_cyclones = filter_hazard(df_events, cyclone_codes)
print(f"Floods: {len(df_floods)}, Cyclones: {len(df_cyclones)}")

In [None]:
# Visualize Monthly Distribution
def plot_monthly_seasonality(df, title, color):
    if df.empty:
        print(f"No data for {title}")
        return
    df_copy = df.copy()
    df_copy['month'] = df_copy.index.month
    monthly_counts = df_copy.groupby('month').size().reindex(range(1,13), fill_value=0)
    plt.figure(figsize=(10, 5))
    sns.barplot(x=monthly_counts.index, y=monthly_counts.values, color=color)
    plt.title(f"Seasonal Distribution: {title}")
    plt.xlabel("Month")
    plt.ylabel("Number of Events")
    plt.xticks(range(0,12), ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
    plt.show()

plot_monthly_seasonality(df_floods, "Floods", "skyblue")
plot_monthly_seasonality(df_cyclones, "Tropical Cyclones", "teal")

### Time Series Decomposition (STL)

We decompose the data into **Trend**, **Seasonality**, and **Residuals**.

In [None]:
def decompose_time_series(df, title, freq='M'):
    if df.empty:
        return
    ts = df.resample(freq).size().fillna(0)
    if len(ts) < 24:
        print(f"Not enough data for {title} (need 24+ months, got {len(ts)})")
        plt.figure(figsize=(12, 4))
        ts.plot()
        plt.title(f"Time Series: {title}")
        plt.show()
        return
    decomposition = seasonal_decompose(ts, model='additive', period=12)
    fig = decomposition.plot()
    fig.set_size_inches(12, 10)
    fig.suptitle(f'STL Decomposition: {title}', fontsize=16)
    plt.tight_layout()
    plt.show()

decompose_time_series(df_floods, "Flood Events (Monthly)")
decompose_time_series(df_cyclones, "Cyclone Events (Monthly)")

## Case Study 2.2: Impact Severity Trends

Analyze whether **impact severity** is increasing or decreasing over time.

**Methodology:**
1. Fetch all impact collections (last 10 years)
2. Extract `monty:impact_detail` values
3. Apply **Mann-Kendall Trend Test**

In [None]:
# Fetch ALL Impact Data (last 10 years, South Asia) - WITH PAGINATION
impact_collections = ["gdacs-impacts", "emdat-impacts", "desinventar-impacts",
                      "idmc-gidd-impacts", "idmc-idu-impacts"]

print("=" * 70)
print("FETCHING IMPACT DATA (YEAR-BY-YEAR WITH PAGINATION)")
print("=" * 70)
print(f"Collections: {impact_collections}")
print(f"Region: South Asia (bbox: {ACTIVE_BBOX})")
print(f"Time Range: {START_DATE.year} to {END_DATE.year}")
print(f"Page size: 100 records per request (fetches ALL pages per year)")

start_time = time.time()

all_impacts = []
impact_stats = {}

# Fetch each collection year-by-year with pagination (SEQUENTIAL)
for collection in impact_collections:
    print(f"\nüì¶ {collection}")
    
    try:
        # Use paginated function to get ALL records
        collection_items = fetch_items_by_year_paginated(
            [collection],
            bbox=ACTIVE_BBOX,
            start_year=START_DATE.year,
            end_year=END_DATE.year,
            page_size=100,  # 100 records per page (will paginate for more)
            delay_between_requests=0.5
        )
        all_impacts.extend(collection_items)
        impact_stats[collection] = len(collection_items)
    except Exception as e:
        impact_stats[collection] = 0
        print(f"   ‚ö†Ô∏è Failed: {str(e)[:50]}...")
    
    time.sleep(1.0)  # Extra delay between collections

elapsed_time = time.time() - start_time

print(f"\n{'=' * 70}")
print(f"‚úÖ Fetch complete in {elapsed_time:.1f}s")
print(f"Total: {len(all_impacts)} impact items")
for coll, count in impact_stats.items():
    print(f"  {coll}: {count}")
print("=" * 70)

In [None]:
# Process Impact Details
def process_impacts(items):
    data = []
    for item in items:
        props = item.properties
        impact_detail = props.get("monty:impact_detail", {})
        if not impact_detail:
            continue
        entry = {
            "datetime": pd.to_datetime(props.get("datetime") or props.get("start_datetime")),
            "category": impact_detail.get("category"),
            "type": impact_detail.get("type"),
            "value": impact_detail.get("value"),
            "unit": impact_detail.get("unit"),
            "country": props.get("monty:country_codes", ["Unknown"])[0]
        }
        data.append(entry)
    df = pd.DataFrame(data)
    if not df.empty:
        df['value'] = pd.to_numeric(df['value'], errors='coerce')
        df.dropna(subset=['value'], inplace=True)
        df.set_index("datetime", inplace=True)
        df.sort_index(inplace=True)
    return df

df_impacts = process_impacts(all_impacts)
display(df_impacts.head())
display(df_impacts['type'].value_counts().head(20))

In [None]:
# Analyze Trends (using np.polyfit, not deprecated pd.np)
def analyze_trend(df, impact_type, title):
    """
    Analyze trend for a specific impact type with visualization and statistical test.
    """
    subset = df[df['type'] == impact_type].copy()
    if subset.empty:
        print(f"‚ùå No data for '{impact_type}'")
        return
    
    # Aggregate by year
    ts_yearly = subset['value'].resample('Y').sum()
    
    if len(ts_yearly) < 2:
        print(f"‚ö†Ô∏è  Not enough yearly data for '{impact_type}' (need at least 2 years)")
        return
    
    # Create visualization
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))
    
    # Plot 1: Yearly trend
    ax1.plot(ts_yearly.index.year, ts_yearly.values, marker='o', linewidth=2, markersize=8, color='steelblue')
    ax1.set_title(f"Yearly Trend: {title}", fontsize=14, fontweight='bold')
    ax1.set_ylabel("Total Value", fontsize=12)
    ax1.set_xlabel("Year", fontsize=12)
    ax1.grid(True, alpha=0.3)
    
    # Plot 2: Monthly trend (if enough data)
    ts_monthly = subset['value'].resample('M').sum()
    ax2.plot(ts_monthly.index, ts_monthly.values, linewidth=1.5, color='darkgreen', alpha=0.7)
    ax2.fill_between(ts_monthly.index, ts_monthly.values, alpha=0.3, color='darkgreen')
    ax2.set_title(f"Monthly Trend: {title}", fontsize=14, fontweight='bold')
    ax2.set_ylabel("Total Value", fontsize=12)
    ax2.set_xlabel("Date", fontsize=12)
    ax2.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    
    # Print summary statistics
    print(f"\n{'='*70}")
    print(f"üìä IMPACT ANALYSIS: {title.upper()}")
    print(f"{'='*70}")
    print(f"Total records: {len(subset)}")
    print(f"Year range: {subset.index.year.min()} - {subset.index.year.max()}")
    print(f"Total value: {subset['value'].sum():,.0f}")
    print(f"Average per event: {subset['value'].mean():,.2f}")
    print(f"Max value: {subset['value'].max():,.0f}")
    print(f"Min value: {subset['value'].min():,.0f}")
    
    # Perform Mann-Kendall trend test
    if len(ts_yearly) >= 3:
        result = mk.original_test(ts_yearly.values)
        trend_interpretation = "üìà INCREASING" if result.trend == "increasing" else "üìâ DECREASING" if result.trend == "decreasing" else "‚û°Ô∏è  NO TREND"
        print(f"\n{trend_interpretation} TREND (Mann-Kendall Test)")
        print(f"  P-value: {result.p:.4f} {'(Significant at 0.05)' if result.p < 0.05 else '(Not significant)'}")
        print(f"  Slope: {result.slope:.4f} per year")
        print(f"  S-statistic: {result.s}")
    else:
        print(f"\n‚ö†Ô∏è  Not enough yearly data for Mann-Kendall test (need at least 3 years, got {len(ts_yearly)})")
    print(f"{'='*70}\n")

# Analyze the top impact types from the data
print("üîç ANALYZING TOP IMPACT TYPES FROM SOUTH ASIA DATA\n")
impact_types_to_analyze = ['displaced_internal', 'affected_total', 'death', 'shelter_emergency', 'evacuated', 'injured']

for impact_type in impact_types_to_analyze:
    analyze_trend(df_impacts, impact_type, impact_type.replace('_', ' ').title())

In [None]:
# Calculate Event Durations from the existing df_events DataFrame
print("=" * 70)
print("EVENT DURATION ANALYSIS")
print("=" * 70)

# We need to re-fetch events with start/end datetime properties
# Let's create a more detailed dataframe extraction function
def items_to_duration_dataframe(items):
    """Convert STAC items to DataFrame with duration information."""
    data = []
    for item in items:
        props = item.properties
        
        # Extract datetime information
        datetime_val = props.get("datetime")
        start_datetime = props.get("start_datetime")
        end_datetime = props.get("end_datetime")
        
        # Parse dates
        if start_datetime and end_datetime:
            start = pd.to_datetime(start_datetime)
            end = pd.to_datetime(end_datetime)
            duration_days = (end - start).total_seconds() / (24 * 3600)
            event_date = start
        elif datetime_val:
            event_date = pd.to_datetime(datetime_val)
            duration_days = 0  # Single point event
            start = event_date
            end = event_date
        else:
            continue  # Skip if no valid datetime
        
        hazard_codes = props.get("monty:hazard_codes", [])
        country_codes = props.get("monty:country_codes", [])
        primary_country = country_codes[0] if country_codes else "Unknown"
        
        entry = {
            "id": item.id,
            "start_datetime": start,
            "end_datetime": end,
            "duration_days": duration_days,
            "title": props.get("title"),
            "hazard_codes": hazard_codes,
            "primary_country": primary_country,
            "collection": item.collection_id,
            "month": start.month,
            "season": get_season(start.month)
        }
        data.append(entry)
    
    df = pd.DataFrame(data)
    return df

def get_season(month):
    """Determine season from month (Northern Hemisphere)."""
    if month in [12, 1, 2]:
        return "Winter"
    elif month in [3, 4, 5]:
        return "Spring"
    elif month in [6, 7, 8]:
        return "Summer"
    else:
        return "Fall"

# Create duration dataframe from existing events
df_duration = items_to_duration_dataframe(all_events)

print(f"Total events with duration data: {len(df_duration)}")
print(f"\nDuration statistics (BEFORE cleaning):")
print(df_duration['duration_days'].describe())

# Data Quality Check: Identify problematic records
print(f"\n" + "=" * 70)
print("DATA QUALITY CHECKS")
print("=" * 70)

# Check for negative durations
negative_duration = df_duration[df_duration['duration_days'] < 0]
print(f"\n1. Negative Duration Events: {len(negative_duration)}")
if len(negative_duration) > 0:
    print(f"   Min: {negative_duration['duration_days'].min():.2f} days")
    print(f"   Max: {negative_duration['duration_days'].max():.2f} days")
    display(negative_duration[['start_datetime', 'end_datetime', 'duration_days', 'primary_country', 'hazard_codes']].head())

# Check for zero duration events
zero_duration = df_duration[df_duration['duration_days'] == 0]
print(f"\n2. Zero Duration Events (instantaneous): {len(zero_duration)}")

# Check for extreme outliers (>365 days = 1 year)
extreme_duration = df_duration[df_duration['duration_days'] > 365]
print(f"\n3. Extreme Duration Events (>365 days): {len(extreme_duration)}")
if len(extreme_duration) > 0:
    print(f"   Max: {extreme_duration['duration_days'].max():.2f} days ({extreme_duration['duration_days'].max()/365:.1f} years)")

# Clean the data: Keep only valid positive durations (>=0 and <=365 days)
# This removes:
# - Negative durations (data errors)
# - Extreme outliers (>1 year is suspicious for most disaster types)
df_duration_clean = df_duration[(df_duration['duration_days'] >= 0) & (df_duration['duration_days'] <= 365)].copy()

print(f"\n" + "=" * 70)
print("CLEANING RESULTS")
print("=" * 70)
print(f"Original records: {len(df_duration)}")
print(f"Records removed: {len(df_duration) - len(df_duration_clean)}")
print(f"Cleaned records: {len(df_duration_clean)}")
print(f"Data retention rate: {(len(df_duration_clean)/len(df_duration)*100):.1f}%")

print(f"\n\nDuration statistics (AFTER cleaning):")
print(df_duration_clean['duration_days'].describe())

# Display cleaned sample
print(f"\nSample of cleaned data:")
display(df_duration_clean.head(10))
print("=" * 70)

# Use cleaned dataset for all subsequent analyses
df_duration = df_duration_clean

In [None]:

# DIAGNOSTIC: Explicit Data Cleaning Verification
print("\n" + "=" * 80)
print("üîç DIAGNOSTIC: DATA CLEANING VERIFICATION")
print("=" * 80)

# Show current state of df_duration
print(f"\nüìä Current df_duration statistics:")
print(f"  Shape: {df_duration.shape}")
print(f"  Min duration: {df_duration['duration_days'].min():.2f} days")
print(f"  Max duration: {df_duration['duration_days'].max():.2f} days")
print(f"  Negative values: {(df_duration['duration_days'] < 0).sum()}")
print(f"  Values > 365 days: {(df_duration['duration_days'] > 365).sum()}")

# If negative values exist, FORCE clean immediately
if (df_duration['duration_days'] < 0).sum() > 0:
    print(f"\n‚ö†Ô∏è  ALERT: Found negative durations! Forcing clean now...")
    df_duration = df_duration[(df_duration['duration_days'] >= 0) & (df_duration['duration_days'] <= 365)].copy()
    print(f"‚úÖ After cleaning:")
    print(f"  Records kept: {len(df_duration)}")
    print(f"  Min: {df_duration['duration_days'].min():.2f} days")
    print(f"  Max: {df_duration['duration_days'].max():.2f} days")

print("\n" + "=" * 80)


In [None]:
# Verify Data Cleaning
print("\n" + "=" * 70)
print("DATA CLEANING VERIFICATION")
print("=" * 70)
print(f"\nDataFrame info:")
print(f"  Total records: {len(df_duration)}")
print(f"  Min duration: {df_duration['duration_days'].min():.2f} days")
print(f"  Max duration: {df_duration['duration_days'].max():.2f} days")
print(f"  Records with negative duration: {(df_duration['duration_days'] < 0).sum()}")
print(f"  Records with duration > 365 days: {(df_duration['duration_days'] > 365).sum()}")

# If still seeing negative values, show them
if (df_duration['duration_days'] < 0).sum() > 0:
    print(f"\n‚ö†Ô∏è  WARNING: Found {(df_duration['duration_days'] < 0).sum()} negative duration records!")
    print("Removing them now...")
    df_duration = df_duration[df_duration['duration_days'] >= 0].copy()
    print(f"After removal: {len(df_duration)} records")
    print(f"New min: {df_duration['duration_days'].min():.2f} days")

print("=" * 70)

# Classify Hazards as Sudden-Onset vs Slow-Onset
print("\n" + "=" * 70)
print("HAZARD CLASSIFICATION: SUDDEN-ONSET vs SLOW-ONSET")
print("=" * 70)

# Define hazard categories
sudden_onset_codes = [
    "TC", "EQ", "TS", "VO",  # GLIDE codes: Cyclone, Earthquake, Tsunami, Volcano
    "MH0306", "MH0307", "MH0308", "MH0309",  # UNDRR Tropical cyclones
    "MH0100", "MH0101", "MH0102",  # UNDRR Earthquakes
    "nat-met-sto-tro",  # EM-DAT Tropical storms
    "nat-geo-tec-ear",  # EM-DAT Earthquakes
]

slow_onset_codes = [
    "FL", "FF", "DR", "CW", "HT",  # GLIDE: Floods, Drought, Cold wave, Heat wave
    "MH0600", "MH0601", "MH0602", "MH0603", "MH0604",  # UNDRR Floods
    "MH0800", "MH0801", "MH0802",  # UNDRR Droughts
    "nat-hyd-flo-flo", "nat-hyd-flo-fla", "nat-hyd-flo-riv", "nat-hyd-flo-coa",  # EM-DAT Floods
    "nat-cli-dro", "nat-cli-ext-hea", "nat-cli-ext-col",  # EM-DAT Droughts, Extreme temps
]

def classify_onset_type(hazard_codes):
    """Classify event as sudden or slow onset."""
    for code in hazard_codes:
        if any(sudden in code for sudden in sudden_onset_codes):
            return "Sudden-Onset"
        if any(slow in code for slow in slow_onset_codes):
            return "Slow-Onset"
    return "Other"

df_duration['onset_type'] = df_duration['hazard_codes'].apply(classify_onset_type)

# Distribution by onset type
onset_distribution = df_duration['onset_type'].value_counts()
print(f"\nOnset Type Distribution:")
print(onset_distribution)

# Duration by onset type
print(f"\nAverage Duration by Onset Type (days):")
duration_by_onset = df_duration.groupby('onset_type')['duration_days'].agg(['mean', 'median', 'std', 'count', 'min', 'max'])
print(duration_by_onset)

# Visualization
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Plot 1: Duration distribution by onset type with better scaling
df_duration.boxplot(column='duration_days', by='onset_type', ax=ax1)
ax1.set_title('Event Duration by Onset Type', fontsize=14, fontweight='bold')
ax1.set_xlabel('Onset Type', fontsize=12)
ax1.set_ylabel('Duration (days)', fontsize=12)
# Auto-scale to show all data with padding
max_duration = df_duration['duration_days'].max()
ax1.set_ylim(-5, max_duration * 1.05)  # Add 5% padding at top
ax1.grid(True, alpha=0.3, axis='y')
plt.suptitle('')  # Remove automatic title

# Plot 2: Count by onset type
onset_distribution.plot(kind='bar', ax=ax2, color=['#FF6B6B', '#4ECDC4', '#95E1D3'])
ax2.set_title('Event Count by Onset Type', fontsize=14, fontweight='bold')
ax2.set_xlabel('Onset Type', fontsize=12)
ax2.set_ylabel('Count', fontsize=12)
ax2.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

print("=" * 70)

In [None]:
# Duration Analysis by Specific Hazard Type
print("\n" + "=" * 70)
print("DURATION ANALYSIS BY SPECIFIC HAZARD TYPE")
print("=" * 70)

# Extract primary hazard code (first code in list)
df_duration['primary_hazard'] = df_duration['hazard_codes'].apply(lambda x: x[0] if x else "Unknown")

# Get top hazard types
top_hazards = df_duration['primary_hazard'].value_counts().head(10)
print(f"\nTop 10 Hazard Types by Frequency:")
print(top_hazards)

# Filter for top hazards
df_top_hazards = df_duration[df_duration['primary_hazard'].isin(top_hazards.index)]

# Duration statistics by hazard type
print(f"\n\nDuration Statistics by Top Hazard Types:")
hazard_duration_stats = df_top_hazards.groupby('primary_hazard')['duration_days'].agg([
    'count', 'mean', 'median', 'std', 'min', 'max'
]).round(2)
hazard_duration_stats = hazard_duration_stats.sort_values('mean', ascending=False)
print(hazard_duration_stats)

# Visualization: Duration by hazard type with improved scaling
fig, ax = plt.subplots(figsize=(16, 7))

# Create boxplot with better visibility
bp = df_top_hazards.boxplot(
    column='duration_days', 
    by='primary_hazard', 
    ax=ax,
    patch_artist=True,
    return_type='dict'
)

# Customize appearance
ax.set_title('Event Duration Distribution by Hazard Type', fontsize=14, fontweight='bold', pad=20)
plt.suptitle('')  # Remove default title
ax.set_xlabel('Hazard Type', fontsize=12)
ax.set_ylabel('Duration (days)', fontsize=12)
ax.tick_params(axis='x', rotation=45, labelsize=10)
ax.grid(True, alpha=0.3, axis='y')

# Auto-scale to show full range with padding
max_duration = df_top_hazards['duration_days'].max()
ax.set_ylim(-5, max_duration * 1.05)

# Add reference line for median overall duration
overall_median = df_top_hazards['duration_days'].median()
ax.axhline(overall_median, color='red', linestyle='--', linewidth=1, 
           alpha=0.5, label=f'Overall Median: {overall_median:.1f} days')
ax.legend(loc='upper right')

plt.tight_layout()
plt.show()

print("=" * 70)

In [None]:
# Seasonal Duration Patterns
print("\n" + "=" * 70)
print("SEASONAL DURATION PATTERNS")
print("=" * 70)

# Duration by season
season_duration = df_duration.groupby('season')['duration_days'].agg(['mean', 'median', 'count']).round(2)
print(f"\nDuration by Season:")
print(season_duration)

# Duration by season and onset type
season_onset_duration = df_duration.groupby(['season', 'onset_type'])['duration_days'].agg(['mean', 'count']).round(2)
print(f"\n\nDuration by Season and Onset Type:")
print(season_onset_duration)

# Visualization
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Plot 1: Average duration by season
season_duration['mean'].plot(kind='bar', ax=ax1, color='steelblue')
ax1.set_title('Average Event Duration by Season', fontsize=14, fontweight='bold')
ax1.set_xlabel('Season', fontsize=12)
ax1.set_ylabel('Average Duration (days)', fontsize=12)
ax1.tick_params(axis='x', rotation=45)
ax1.grid(True, alpha=0.3)

# Plot 2: Heatmap of duration by season and onset type
pivot_data = df_duration.pivot_table(values='duration_days', index='season', columns='onset_type', aggfunc='mean')
sns.heatmap(pivot_data, annot=True, fmt='.1f', cmap='YlOrRd', ax=ax2, cbar_kws={'label': 'Avg Duration (days)'})
ax2.set_title('Average Duration: Season vs Onset Type', fontsize=14, fontweight='bold')
ax2.set_xlabel('Onset Type', fontsize=12)
ax2.set_ylabel('Season', fontsize=12)

plt.tight_layout()
plt.show()

print("=" * 70)

In [None]:
# Country-Level Duration Analysis
print("\n" + "=" * 70)
print("COUNTRY-LEVEL DURATION ANALYSIS")
print("=" * 70)

# Top countries by event count
top_countries = df_duration['primary_country'].value_counts().head(10)
print(f"\nTop 10 Countries by Event Count:")
print(top_countries)

# Duration by country
country_duration = df_duration.groupby('primary_country')['duration_days'].agg([
    'count', 'mean', 'median'
]).round(2)
country_duration = country_duration[country_duration['count'] >= 5]  # At least 5 events
country_duration = country_duration.sort_values('mean', ascending=False).head(15)

print(f"\n\nTop 15 Countries by Average Duration (min 5 events):")
print(country_duration)

# Visualization
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 10))

# Plot 1: Event count by country
top_countries.plot(kind='barh', ax=ax1, color='skyblue')
ax1.set_title('Top 10 Countries by Event Count', fontsize=14, fontweight='bold')
ax1.set_xlabel('Number of Events', fontsize=12)
ax1.set_ylabel('Country', fontsize=12)
ax1.grid(True, alpha=0.3, axis='x')

# Plot 2: Average duration by country
country_duration['mean'].plot(kind='barh', ax=ax2, color='coral')
ax2.set_title('Top 15 Countries by Average Event Duration', fontsize=14, fontweight='bold')
ax2.set_xlabel('Average Duration (days)', fontsize=12)
ax2.set_ylabel('Country', fontsize=12)
ax2.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

print("=" * 70)

In [None]:
# Disaster Lifecycle Profiles by Hazard Cluster
print("\n" + "=" * 70)
print("DISASTER LIFECYCLE PROFILES")
print("=" * 70)

# Create hazard clusters based on similar characteristics
hazard_clusters = {
    "Floods": ["FL", "FF", "MH0600", "MH0601", "MH0602", "MH0603", "MH0604", 
               "nat-hyd-flo-flo", "nat-hyd-flo-fla", "nat-hyd-flo-riv", "nat-hyd-flo-coa"],
    "Tropical Cyclones": ["TC", "MH0306", "MH0307", "MH0308", "MH0309", "nat-met-sto-tro"],
    "Droughts": ["DR", "MH0800", "MH0801", "MH0802", "nat-cli-dro"],
    "Earthquakes": ["EQ", "MH0100", "MH0101", "MH0102", "nat-geo-tec-ear"],
    "Temperature Extremes": ["HT", "CW", "nat-cli-ext-hea", "nat-cli-ext-col"]
}

def assign_cluster(hazard_codes):
    """Assign hazard to a cluster."""
    for cluster_name, codes in hazard_clusters.items():
        if any(any(code in hc for code in codes) for hc in hazard_codes):
            return cluster_name
    return "Other"

df_duration['hazard_cluster'] = df_duration['hazard_codes'].apply(assign_cluster)

# Lifecycle profile statistics
cluster_profiles = df_duration.groupby('hazard_cluster').agg({
    'duration_days': ['count', 'mean', 'median', 'std', 'min', 'max'],
    'onset_type': lambda x: x.mode()[0] if len(x.mode()) > 0 else "Unknown"
}).round(2)

print(f"\nDisaster Lifecycle Profiles by Hazard Cluster:")
print(cluster_profiles)

# Distribution visualization
clusters_with_data = df_duration['hazard_cluster'].value_counts()
print(f"\n\nEvent Count by Hazard Cluster:")
print(clusters_with_data)

# Create lifecycle profile visualizations
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.ravel()

for idx, (cluster, group) in enumerate(df_duration.groupby('hazard_cluster')):
    if idx >= 4:  # Limit to top 4 clusters
        break
    
    ax = axes[idx]
    
    # Duration distribution histogram - Clip negative values to 0 for display
    group_clipped = group.copy()
    group_clipped['duration_days'] = group_clipped['duration_days'].clip(lower=0)
    group_clipped['duration_days'].hist(bins=30, ax=ax, color='steelblue', alpha=0.7, edgecolor='black')
    
    mean_val = group['duration_days'].mean()
    median_val = group['duration_days'].median()
    
    ax.axvline(mean_val, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_val:.1f} days')
    ax.axvline(median_val, color='green', linestyle='--', linewidth=2, label=f'Median: {median_val:.1f} days')
    
    ax.set_title(f'{cluster} Lifecycle Profile (n={len(group)})', fontsize=12, fontweight='bold')
    ax.set_xlabel('Duration (days)', fontsize=10)
    ax.set_ylabel('Frequency', fontsize=10)
    ax.legend()
    ax.grid(True, alpha=0.3)
    ax.set_xlim(left=0)  # Set x-axis to start at 0 (no negative values displayed)

plt.tight_layout()
plt.show()

print("=" * 70)

In [None]:
# Duration Trend Over Time
print("\n" + "=" * 70)
print("DURATION TRENDS OVER TIME")
print("=" * 70)

# Add year column
df_duration['year'] = df_duration['start_datetime'].dt.year

# Average duration by year
yearly_duration = df_duration.groupby('year')['duration_days'].agg(['mean', 'median', 'count']).round(2)
print(f"\nAverage Duration by Year:")
print(yearly_duration)

# Visualization
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 10))

# Plot 1: Duration trend over years
ax1.plot(yearly_duration.index, yearly_duration['mean'], marker='o', linewidth=2, 
         markersize=8, color='steelblue', label='Mean Duration')
ax1.plot(yearly_duration.index, yearly_duration['median'], marker='s', linewidth=2, 
         markersize=8, color='coral', label='Median Duration')
ax1.set_title('Event Duration Trend Over Years', fontsize=14, fontweight='bold')
ax1.set_xlabel('Year', fontsize=12)
ax1.set_ylabel('Duration (days)', fontsize=12)
ax1.legend()
ax1.grid(True, alpha=0.3)

# Plot 2: Event count over years
yearly_duration['count'].plot(kind='bar', ax=ax2, color='lightgreen', alpha=0.7)
ax2.set_title('Event Count by Year', fontsize=14, fontweight='bold')
ax2.set_xlabel('Year', fontsize=12)
ax2.set_ylabel('Number of Events', fontsize=12)
ax2.tick_params(axis='x', rotation=45)
ax2.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

# Duration trend by hazard cluster over time
cluster_year_duration = df_duration.groupby(['year', 'hazard_cluster'])['duration_days'].mean().unstack()
print(f"\n\nAverage Duration by Year and Hazard Cluster:")
print(cluster_year_duration.round(2))

# Plot duration trends by cluster with symlog scale (linear near zero, log elsewhere)
from matplotlib.ticker import FuncFormatter

fig, ax = plt.subplots(figsize=(14, 6))

# Plot all clusters except Droughts
for cluster in cluster_year_duration.columns:
    if cluster != 'Droughts':  # Skip Droughts
        ax.plot(cluster_year_duration.index, cluster_year_duration[cluster], 
                marker='o', linewidth=2.5, label=cluster, markersize=7)

ax.set_title('Duration Trends by Hazard Cluster Over Time (Symmetrical Log Scale)', fontsize=14, fontweight='bold')
ax.set_xlabel('Year', fontsize=12)
ax.set_ylabel('Average Duration (days)', fontsize=12)

# Use symlog scale: linear for small values (0-0.5), log elsewhere
# linthresh=0.5 means values between -0.5 and +0.5 are linear
ax.set_yscale('symlog', linthresh=0.5)

# Format y-axis to show actual numbers instead of scientific notation
ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: f'{int(y) if y >= 1 else f"{y:.1f}"}'))

ax.legend(loc='best', fontsize=11)
ax.grid(True, alpha=0.3, which='both')  # Show grid for both major and minor ticks
plt.tight_layout()
plt.show()

print("=" * 70)

## Conclusion

This notebook demonstrated:
1. **Time-series extraction** from STAC collections using CQL2 date filters
2. **Seasonality visualization** for hazard types
3. **Statistical trend analysis** on impact data
4. **Event duration analysis** segmented by hazard type, season, and country
5. **Disaster lifecycle profiles** for different hazard clusters

**Key Findings from Duration Analysis**:
- Sudden-onset events (earthquakes, cyclones) typically have shorter durations
- Slow-onset events (floods, droughts) show longer duration patterns
- Seasonal variations affect event duration significantly
- Country-specific duration patterns reveal regional vulnerability characteristics

**Next Steps:**
- Integrate Correlation IDs to normalize trends
- Cross-Correlation between Hazard Magnitude and Impact Severity
- Build regression models to predict event duration
- Add economic/income level data for enhanced analysis