### Lab 2. QA/QC on Datasets

### Part 1. Request Data from APIs

In [8]:
# Packages and Libraries
import requests
import zipfile
from zipfile import ZipFile

import io
import os
import sys
import json

import pandas as pd

#### DEM Data for Minnesota

In [4]:
# Download the Minnesota 30 m DEM from the MN Geospatial Commons (CKAN API)
# and unpack any ZIP archive into SAVE_DIR.

# CKAN API base URL for Minnesota GIS data
BASE_URL = "https://gisdata.mn.gov/api/3/action/"

# Dataset name (from the URL slug)
DATASET_NAME = "elev-30m-digital-elevation-model"

# Directory to save files
SAVE_DIR = "30m_mn_elev"
os.makedirs(SAVE_DIR, exist_ok=True)

# Fetch dataset details. The timeout keeps the cell from hanging forever on a
# stalled connection; a non-2xx answer is routed to the failure branch below
# instead of being parsed as if it were a successful payload.
response = requests.get(
    f"{BASE_URL}package_show", params={"id": DATASET_NAME}, timeout=60
)
data = response.json() if response.ok else {"success": False}

if data.get("success"):
    resources = data["result"]["resources"]

    # Download every resource whose URL path ends in a TIFF or ZIP extension.
    # Matching on the extension (rather than a substring anywhere in the URL)
    # avoids false hits such as "notification" containing "tif".
    tiff_url = None
    for resource in resources:
        resource_url = resource.get("url") or ""
        url_path = resource_url.split("?", 1)[0]  # drop any query string
        if not url_path.lower().endswith((".tif", ".tiff", ".zip")):
            continue

        tiff_url = resource_url
        file_name = os.path.join(SAVE_DIR, os.path.basename(url_path))
        print(f"Downloading: {tiff_url}")

        # Stream the file to disk in chunks so large rasters never have to
        # fit in memory.
        with requests.get(tiff_url, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(file_name, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
        print(f"Downloaded: {file_name}")

        # Unzip if it's a ZIP file (case-insensitive, matching the selection
        # test above so ".ZIP" archives are extracted too).
        if file_name.lower().endswith(".zip"):
            with zipfile.ZipFile(file_name, "r") as zip_ref:
                zip_ref.extractall(SAVE_DIR)
            print(f"Extracted: {SAVE_DIR}")

    if tiff_url is None:
        print("No TIFF or ZIP resource found for this dataset.")

else:
    print("Failed to fetch dataset information.")

Downloading: https://resources.gisdata.mn.gov/pub/gdrs/data/pub/us_mn_state_dnr/elev_30m_digital_elevation_model/fgdb_elev_30m_digital_elevation_model.zip
Downloaded: 30m_mn_elev\fgdb_elev_30m_digital_elevation_model.zip
Extracted: 30m_mn_elev


#### iNaturalist BMSB Data Download

In [5]:
# Fetch a single page of Brown Marmorated Stink Bug observations inside the
# Minnesota bounding box and save them as a CSV preview.

# iNaturalist API base URL
BASE_URL = "https://api.inaturalist.org/v1/observations"

# Query parameters (Use bounding box for Minnesota)
params = {
    "taxon_id": 81923,      # Brown Marmorated Stink Bug
    "verifiable": "true",   # Only verifiable observations
    "per_page": 50,         # Preview size (the API allows up to 200 per request)
    "swlat": 43.499,        # Southwest latitude (MN)
    "swlng": -97.2392,      # Southwest longitude (MN)
    "nelat": 49.3843,       # Northeast latitude (MN)
    "nelng": -89.4917       # Northeast longitude (MN)
}

# Send GET request to iNaturalist API. A non-2xx answer is treated as "no
# results" so it reaches the failure branch below.
response = requests.get(BASE_URL, params=params, timeout=60)
data = response.json() if response.ok else {}

# Check if data retrieval was successful
if "results" in data:
    observations = []

    for obs in data["results"]:
        # "geojson" and "taxon" may be missing OR present with a null value;
        # `or {}` covers both so one incomplete record cannot crash the cell.
        geojson = obs.get("geojson") or {}
        coords = geojson.get("coordinates") or [None, None]
        lon, lat = coords[0], coords[1]  # GeoJSON order is [longitude, latitude]
        taxon = obs.get("taxon") or {}

        observations.append({
            "ID": obs["id"],
            "Observed Date": obs.get("observed_on", "N/A"),
            "Latitude": lat,
            "Longitude": lon,
            "Scientific Name": taxon.get("name", "N/A"),
            "Common Name": taxon.get("preferred_common_name", "N/A"),
            "Image URL": obs["photos"][0]["url"] if obs.get("photos") else None,
            "Location Description": obs.get("place_guess", "Unknown"),
            "Exact Location?": obs.get("location_is_exact", False),
            "Obscured?": obs.get("obscured", False),
            "Accuracy (meters)": obs.get("positional_accuracy", "N/A"),
            "Count Observed": obs.get("individual_count", "N/A")
        })

    # Convert to Pandas DataFrame
    df = pd.DataFrame(observations)

    # Save to CSV
    csv_filename = "brown_marmorated_stink_bug_mn.csv"
    df.to_csv(csv_filename, index=False)

    print(f"Saved {len(df)} observations to {csv_filename}")

    # Display the first few rows
    print(df.head())

else:
    print("Failed to fetch observation data.")

Saved 50 observations to brown_marmorated_stink_bug_mn.csv
          ID Observed Date   Latitude  Longitude    Scientific Name  \
0  264596414    2025-03-08  44.943124 -93.124529  Halyomorpha halys   
1  263160449    2024-01-18  44.010008 -92.456489  Halyomorpha halys   
2  262052201    2025-02-17  44.010681 -92.505242  Halyomorpha halys   
3  261700994    2025-02-14  44.895863 -93.262520  Halyomorpha halys   
4  260707725    2025-02-05  46.573231 -90.929597  Halyomorpha halys   

                  Common Name  \
0  Brown Marmorated Stink Bug   
1  Brown Marmorated Stink Bug   
2  Brown Marmorated Stink Bug   
3  Brown Marmorated Stink Bug   
4  Brown Marmorated Stink Bug   

                                           Image URL  \
0  https://inaturalist-open-data.s3.amazonaws.com...   
1  https://inaturalist-open-data.s3.amazonaws.com...   
2  https://inaturalist-open-data.s3.amazonaws.com...   
3  https://inaturalist-open-data.s3.amazonaws.com...   
4  https://inaturalist-open-data.s3

In [28]:
import requests
import pandas as pd

# Page through every Brown Marmorated Stink Bug observation inside the
# Minnesota bounding box and save the combined result as one CSV.

# iNaturalist API base URL
BASE_URL = "https://api.inaturalist.org/v1/observations"

# Query parameters (Use bounding box for Minnesota)
params = {
    "taxon_id": 81923,      # Brown Marmorated Stink Bug
    "verifiable": "true",   # Only verifiable observations
    "per_page": 200,        # Max 200 results per request (iNaturalist's max)
    "swlat": 43.499,        # Southwest latitude (MN)
    "swlng": -97.2392,      # Southwest longitude (MN)
    "nelat": 49.3843,       # Northeast latitude (MN)
    "nelng": -89.4917       # Northeast longitude (MN)
}

# Initialize a list to store all observations
all_observations = []
page = 1  # Start with the first page

while True:
    # Update the page number in the parameters
    params["page"] = page

    # Send GET request to iNaturalist API. A non-2xx answer is treated as
    # "no results" so it reaches the failure branch and stops the loop.
    response = requests.get(BASE_URL, params=params, timeout=60)
    data = response.json() if response.ok else {}

    # Check if data retrieval was successful
    if "results" in data:
        # If no more results, stop the loop
        if not data["results"]:
            break

        # Process the observations
        for obs in data["results"]:
            # "geojson" and "taxon" may be missing OR present with a null
            # value; `or {}` covers both so one incomplete record cannot
            # abort the whole pagination run.
            geojson = obs.get("geojson") or {}
            coords = geojson.get("coordinates") or [None, None]
            lon, lat = coords[0], coords[1]  # GeoJSON order is [longitude, latitude]
            taxon = obs.get("taxon") or {}

            all_observations.append({
                "ID": obs["id"],
                "Observed Date": obs.get("observed_on", "N/A"),
                "Latitude": lat,
                "Longitude": lon,
                "Scientific Name": taxon.get("name", "N/A"),
                "Common Name": taxon.get("preferred_common_name", "N/A"),
                "Image URL": obs["photos"][0]["url"] if obs.get("photos") else None,
                "Location Description": obs.get("place_guess", "Unknown"),
                "Exact Location?": obs.get("location_is_exact", False),
                "Obscured?": obs.get("obscured", False),
                "Accuracy (meters)": obs.get("positional_accuracy", "N/A"),
                "Count Observed": obs.get("individual_count", "N/A")
            })

        # Increment the page number for the next request
        page += 1
    else:
        # Whatever was collected before the failure is still saved below.
        print("Failed to fetch observation data.")
        break

# Convert to Pandas DataFrame
df = pd.DataFrame(all_observations)

# Save to CSV
csv_filename = "brown_marmorated_stink_bug_mn_all_observations.csv"
df.to_csv(csv_filename, index=False)

print(f"Saved {len(df)} observations to {csv_filename}")

# Display the first few rows
print(df.head())

Saved 1292 observations to brown_marmorated_stink_bug_mn_all_observations.csv
          ID Observed Date   Latitude  Longitude    Scientific Name  \
0  264596414    2025-03-08  44.943124 -93.124529  Halyomorpha halys   
1  263160449    2024-01-18  44.010008 -92.456489  Halyomorpha halys   
2  262052201    2025-02-17  44.010681 -92.505242  Halyomorpha halys   
3  261700994    2025-02-14  44.895863 -93.262520  Halyomorpha halys   
4  260707725    2025-02-05  46.573231 -90.929597  Halyomorpha halys   

                  Common Name  \
0  Brown Marmorated Stink Bug   
1  Brown Marmorated Stink Bug   
2  Brown Marmorated Stink Bug   
3  Brown Marmorated Stink Bug   
4  Brown Marmorated Stink Bug   

                                           Image URL  \
0  https://inaturalist-open-data.s3.amazonaws.com...   
1  https://inaturalist-open-data.s3.amazonaws.com...   
2  https://inaturalist-open-data.s3.amazonaws.com...   
3  https://inaturalist-open-data.s3.amazonaws.com...   
4  https://inatu

#### Landcover Data for Minnesota

In [6]:
# Download the Minnesota NLCD 2019 land cover raster from the MN Geospatial
# Commons (CKAN API) and unpack any ZIP archive into SAVE_DIR.

# CKAN API base URL for Minnesota GIS data
BASE_URL = "https://gisdata.mn.gov/api/3/action/"

# Dataset name (from the URL slug)
DATASET_NAME = "biota-landcover-nlcd-mn-2019"

# Directory to save files
SAVE_DIR = "nlcd_mn_2019"
os.makedirs(SAVE_DIR, exist_ok=True)

# Fetch dataset details. The timeout keeps the cell from hanging forever on a
# stalled connection; a non-2xx answer is routed to the failure branch below
# instead of being parsed as if it were a successful payload.
response = requests.get(
    f"{BASE_URL}package_show", params={"id": DATASET_NAME}, timeout=60
)
data = response.json() if response.ok else {"success": False}

if data.get("success"):
    resources = data["result"]["resources"]

    # Download every resource whose URL path ends in a TIFF or ZIP extension.
    # Matching on the extension (rather than a substring anywhere in the URL)
    # avoids false hits such as "notification" containing "tif".
    tiff_url = None
    for resource in resources:
        resource_url = resource.get("url") or ""
        url_path = resource_url.split("?", 1)[0]  # drop any query string
        if not url_path.lower().endswith((".tif", ".tiff", ".zip")):
            continue

        tiff_url = resource_url
        file_name = os.path.join(SAVE_DIR, os.path.basename(url_path))
        print(f"Downloading: {tiff_url}")

        # Stream the file to disk in chunks so large rasters never have to
        # fit in memory.
        with requests.get(tiff_url, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(file_name, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
        print(f"Downloaded: {file_name}")

        # Unzip if it's a ZIP file (case-insensitive, matching the selection
        # test above so ".ZIP" archives are extracted too).
        if file_name.lower().endswith(".zip"):
            with zipfile.ZipFile(file_name, "r") as zip_ref:
                zip_ref.extractall(SAVE_DIR)
            print(f"Extracted: {SAVE_DIR}")

    if tiff_url is None:
        print("No TIFF or ZIP resource found for this dataset.")

else:
    print("Failed to fetch dataset information.")

Downloading: https://resources.gisdata.mn.gov/pub/gdrs/data/pub/us_mn_state_dnr/biota_landcover_nlcd_mn_2019/tif_biota_landcover_nlcd_mn_2019.zip
Downloaded: nlcd_mn_2019\tif_biota_landcover_nlcd_mn_2019.zip
Extracted: nlcd_mn_2019


#### Weather Data for Minnesota by Months

In [15]:
# Request NOAA monthly climate normals for all of Minnesota and save them
# to mn_monthly_normals.csv.

# NOAA API token. Read from the environment so the credential is never stored
# in (or shared with) the notebook.
noaa_key = os.environ.get("NOAA_CDO_TOKEN")
if not noaa_key:
    raise RuntimeError(
        "Set the NOAA_CDO_TOKEN environment variable to your NOAA CDO API token."
    )
headers = {'token': noaa_key}

# API URL and parameters
url = "https://www.ncei.noaa.gov/cdo-web/api/v2/data"
params = {
    "datasetid": "NORMAL_MLY",
    "locationid": "FIPS:27",  # Minnesota FIPS code
    "startdate": "2010-01-01",
    "enddate": "2019-12-31",
    "limit": 1000             # API default is only 25 rows; 1000 is the maximum
}

# Make the request
response = requests.get(url, headers=headers, params=params, timeout=60)

# Check response
if response.status_code == 200:
    payload = response.json()
    # The API answers 200 with no "results" key when nothing matches, so use
    # .get() rather than indexing.
    data = payload.get('results', [])
    if data:
        df = pd.json_normalize(data)
        df.to_csv("mn_monthly_normals.csv", index=False)
        print("Data saved as mn_monthly_normals.csv")

        # Warn when the server holds more rows than fit in one response, so a
        # truncated CSV is not mistaken for the full statewide dataset.
        total = payload.get("metadata", {}).get("resultset", {}).get("count")
        if total is not None and total > len(data):
            print(f"Note: saved {len(data)} of {total} available records; "
                  "page with the 'offset' parameter to fetch the rest.")
    else:
        print("No data found for Minnesota.")
else:
    print(f"Error: {response.status_code}, {response.text}")


Data saved as mn_monthly_normals.csv


In [11]:
import requests
import pandas as pd

# Request NOAA monthly climate normals for the MSP airport station and save
# them to msp_monthly_normals.csv.

# NOAA API token. Read from the environment so the credential is never stored
# in (or shared with) the notebook.
noaa_key = os.environ.get("NOAA_CDO_TOKEN")
if not noaa_key:
    raise RuntimeError(
        "Set the NOAA_CDO_TOKEN environment variable to your NOAA CDO API token."
    )

# API URL and parameters
url = "https://www.ncei.noaa.gov/cdo-web/api/v2/data"
headers = {"token": noaa_key}

# Parameters to query monthly normals for MSP (Minneapolis-St. Paul Airport)
params = {
    "datasetid": "NORMAL_MLY",
    "stationid": "GHCND:USW00014922",  # MSP station ID
    "startdate": "2010-01-01",         # Normal period start date
    "enddate": "2019-12-31",           # Normal period end date
    "limit": 1000,                     # API maximum per request; page with 'offset' if more rows exist
    "units": "metric"                  # Optional: set to 'standard' for Fahrenheit, inches, etc.
}

# Make the request (the timeout prevents the cell from hanging indefinitely)
response = requests.get(url, headers=headers, params=params, timeout=60)

# Check response
if response.status_code == 200:
    # The API answers 200 with no "results" key when nothing matches.
    data = response.json().get('results', [])
    if data:
        df = pd.json_normalize(data)
        df.to_csv("msp_monthly_normals.csv", index=False)
        print("Data saved as msp_monthly_normals.csv")
    else:
        print("No data found for MSP station.")
else:
    print(f"Error: {response.status_code}, {response.text}")

Data saved as msp_monthly_normals.csv
