In [4]:
import os
import pandas as pd
import requests
from datetime import datetime, timedelta
from bs4 import BeautifulSoup

# -------------------------------
# 1. Download NASA FIRMS Fire Data (Canada region)
# -------------------------------

def download_firms_data(start_date, end_date, save_path='firms_data.csv',
                        api_key=None, timeout=30):
    """Request FIRMS fire data for Canada and write the response body to disk.

    Args:
        start_date: Start of the window, sent as the ``start`` query parameter.
        end_date: End of the window, sent as the ``end`` query parameter.
        save_path: File the response body is written to.
        api_key: FIRMS API key. When omitted, the ``FIRMS_MAP_KEY`` environment
            variable is used so the credential is never stored in the notebook.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``save_path`` when a 200 response was saved, otherwise ``None``
        (missing key, network failure, or non-200 status).
    """
    base_url = "https://firms.modaps.eosdis.nasa.gov/api/area/csv"

    api_key = api_key or os.environ.get("FIRMS_MAP_KEY")
    if not api_key:
        print("Error fetching FIRMS data: no API key "
              "(set FIRMS_MAP_KEY or pass api_key)")
        return None

    # NOTE(review): the FIRMS docs describe this endpoint as path-based
    # (/api/area/csv/[MAP_KEY]/[SOURCE]/[AREA]/[DAY_RANGE]/[DATE]). Confirm that
    # this query-string form really returns CSV and not a help/error page
    # served with status 200 before trusting the saved files.
    params = {
        "api_key": api_key,
        "area": "CAN",  # Canada
        "start": start_date,
        "end": end_date,
        "product": "MODIS_NRT"  # or "VIIRS_SNPP_NRT"
    }

    print(f"Fetching FIRMS data from {start_date} to {end_date}...")
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(base_url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error fetching FIRMS data: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error fetching FIRMS data: {response.status_code}")
        return None

    # Explicit encoding and newline='' so the body is written as received,
    # independent of the platform's default codec and newline translation.
    with open(save_path, 'w', encoding='utf-8', newline='') as f:
        f.write(response.text)
    print(f"Saved FIRMS data to {save_path}")
    return save_path

# -------------------------------
# 2. Scrape Wildfire Reports from NRCan
# -------------------------------

def scrape_nrcan_reports(date, timeout=30):
    """Find the first link on the NRCan report archive page that matches a date.

    The archive index is fetched and every ``<a href>`` is scanned; a link
    matches when its href contains both ``"report"`` and the date formatted as
    ``YYYY-MM-DD``.

    Args:
        date: Object with ``strftime`` (e.g. ``datetime`` / ``pandas.Timestamp``).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The absolute URL of the first matching link, or ``None`` when no link
        matches or the page could not be fetched/parsed.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urljoin

    base_url = "https://cwfis.cfs.nrcan.gc.ca/report/archives"
    date_str = date.strftime("%Y-%m-%d")
    print(f"Scraping NRCan report for {date_str}...")

    try:
        response = requests.get(base_url, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of parsing an error page as the index.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Resolve links against the URL that was actually served (after any
        # redirect) so relative and absolute hrefs both come out well-formed.
        page_url = response.url or base_url

        for link in soup.find_all('a', href=True):
            href = link['href']
            if "report" in href and date_str in href:
                full_url = urljoin(page_url, href)
                print(f"Found report: {full_url}")
                return full_url

        print("No report found for this date.")
        return None
    except Exception as e:
        # Deliberately best-effort: a scraping failure must not stop the pipeline.
        print(f"Error scraping NRCan: {e}")
        return None

# -------------------------------
# 3. (Optional) News Headlines via NewsAPI
# -------------------------------

def fetch_news_headlines(date, api_key, timeout=30):
    """Fetch NewsAPI headlines about Canadian wildfires published on one day.

    Args:
        date: Object with ``strftime``; used as both ``from`` and ``to``.
        api_key: NewsAPI key, sent as the ``apiKey`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        List of article titles. Empty when the request fails, the status is not
        200, the body is not valid JSON, or no article carries a title.
    """
    date_str = date.strftime("%Y-%m-%d")
    # Let requests build and escape the query string instead of an f-string;
    # the space in "wildfire canada" is encoded as "+" exactly as before.
    params = {
        "q": "wildfire canada",
        "from": date_str,
        "to": date_str,
        "language": "en",
        "sortBy": "popularity",
        "apiKey": api_key,
    }

    try:
        response = requests.get("https://newsapi.org/v2/everything",
                                params=params, timeout=timeout)
    except requests.RequestException as exc:
        print("Error fetching news:", exc)
        return []

    if response.status_code != 200:
        print("Error fetching news:", response.status_code)
        return []

    try:
        articles = response.json().get("articles") or []
    except ValueError as exc:  # body was not valid JSON
        print("Error fetching news:", exc)
        return []

    # Skip entries with a missing or empty title instead of raising KeyError.
    return [a["title"] for a in articles if a.get("title")]

# -------------------------------
# 4. Align Data by Date
# -------------------------------

def run_pipeline(start_date_str, end_date_str, news_api_key=None):
    """Collect FIRMS data, NRCan report links and news headlines for each day.

    For every calendar day in the inclusive range this downloads FIRMS data to
    ``firms_<date>.csv``, looks up an NRCan report link, and (when a NewsAPI
    key is available) fetches headlines. Afterwards the per-day report links go
    to ``nrcan_reports.csv`` and the headlines to ``wildfire_news.csv``.

    Args:
        start_date_str: First day, formatted ``YYYY-MM-DD``.
        end_date_str: Last day (inclusive), formatted ``YYYY-MM-DD``.
        news_api_key: NewsAPI key. When omitted, the ``NEWSAPI_KEY`` environment
            variable is used; if neither is set the news step is skipped and
            each day gets an empty headline list.

    Raises:
        ValueError: If either date string does not match ``YYYY-MM-DD``.
    """
    start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
    end_date = datetime.strptime(end_date_str, "%Y-%m-%d")

    # Resolve the credential once, outside the loop, and never from source code.
    news_api_key = news_api_key or os.environ.get("NEWSAPI_KEY")
    if not news_api_key:
        print("No NewsAPI key (set NEWSAPI_KEY or pass news_api_key); "
              "skipping news headlines.")

    reports = []
    news = []

    # Loop over each day
    for day in pd.date_range(start=start_date, end=end_date):
        date_str = day.strftime("%Y-%m-%d")

        # Step 1: Download FIRMS data (daily)
        download_firms_data(date_str, date_str, save_path=f"firms_{date_str}.csv")

        # Step 2: Scrape NRCan report
        report_url = scrape_nrcan_reports(day)
        reports.append({"date": date_str, "report_url": report_url})

        # Step 3: (Optional) News headlines
        headlines = fetch_news_headlines(day, news_api_key) if news_api_key else []
        news.append({"date": date_str, "headlines": headlines})

    # Save metadata. Explicit columns keep the CSV header present even when the
    # date range is empty. Each headline list is written as its string form.
    pd.DataFrame(reports, columns=["date", "report_url"]).to_csv(
        "nrcan_reports.csv", index=False)
    pd.DataFrame(news, columns=["date", "headlines"]).to_csv(
        "wildfire_news.csv", index=False)

# -------------------------------
# Run the Pipeline
# -------------------------------

if __name__ == "__main__":
    # Example time window: the first week of July 2023 (end date inclusive).
    run_pipeline(start_date_str="2023-07-01", end_date_str="2023-07-07")



Fetching FIRMS data from 2023-07-01 to 2023-07-01...
Saved FIRMS data to firms_2023-07-01.csv
Scraping NRCan report for 2023-07-01...
No report found for this date.
Error fetching news: 426
Fetching FIRMS data from 2023-07-02 to 2023-07-02...
Saved FIRMS data to firms_2023-07-02.csv
Scraping NRCan report for 2023-07-02...
No report found for this date.
Error fetching news: 426
Fetching FIRMS data from 2023-07-03 to 2023-07-03...
Saved FIRMS data to firms_2023-07-03.csv
Scraping NRCan report for 2023-07-03...
No report found for this date.
Error fetching news: 426
Fetching FIRMS data from 2023-07-04 to 2023-07-04...
Saved FIRMS data to firms_2023-07-04.csv
Scraping NRCan report for 2023-07-04...
No report found for this date.
Error fetching news: 426
Fetching FIRMS data from 2023-07-05 to 2023-07-05...
Saved FIRMS data to firms_2023-07-05.csv
Scraping NRCan report for 2023-07-05...
No report found for this date.
Error fetching news: 426
Fetching FIRMS data from 2023-07-06 to 2023-07-06.

In [5]:


if __name__ == "__main__":
    # Small test window: three days, end date inclusive.
    run_pipeline(start_date_str="2023-07-01", end_date_str="2023-07-03")

