# Vizzuality Challenge, ETL pipeline. 
Summary of the total ecosystem carbon of the northern lakes region in the USA, using data from the National Forest Carbon Monitoring System.

This notebook processes the files downloaded from the National Forest Carbon Monitoring System (see Readme.txt for details on download.py).

This ETL pipeline will: 

- 1 - Automatically download the necessary data (download.py)

- 2 - Select the ROI (Every county of the states of Michigan, Wisconsin and Minnesota) within the USA administrative boundaries Shapefile source

- 3 - Summarize Total Ecosystem Carbon for the selected ROI. TotalEcosystemCarbon_2020 is the chosen raster because it represents the "current" state of ecosystem carbon data in the region.

- 4 - Convert CRSs, units, etc. as needed.

- 5 - Create a .gpkg file with the Total Ecosystem Carbon values

- 6 - Upload it to a relational database for queries

- 7 - Simple visualization of the output data.

In [None]:
# Report which Python interpreter backs this kernel, so the reader can
# confirm the notebook is running in the intended environment.
import sys

interpreter_path = sys.executable
print(interpreter_path)

d:\conda_envs\vizz_challenge\python.exe


In [None]:
from pathlib import Path
import requests, zipfile
import geopandas as gpd
import rasterio
from rasterio.mask import mask
from rasterstats import zonal_stats
import pandas as pd
import numpy as np
import warnings

# Folder layout for the pipeline, rooted at the notebook's working directory.
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR.joinpath("data")
RAW, PROCESSED, OUTPUT = (
    DATA_DIR.joinpath(name) for name in ("raw", "processed", "output")
)

# Create the folders up front (a no-op when they already exist) so the
# following cells can write into them without further checks.
for folder in (RAW, PROCESSED, OUTPUT):
    folder.mkdir(parents=True, exist_ok=True)

print("Folders are ready:", list(DATA_DIR.glob('*')))

Folders are ready: [WindowsPath('d:/Vizzuality_challenge/data/output'), WindowsPath('d:/Vizzuality_challenge/data/processed'), WindowsPath('d:/Vizzuality_challenge/data/raw')]


In [6]:
# Data download, imported from download.py (the script can also be executed
# directly if preferred). The import is kept in this cell so the download
# step stays traceable from the notebook.

from download import download_and_extract  # helper defined in the local download.py script

# Source archives
COUNTIES_URL = "https://www2.census.gov/geo/tiger/TIGER2025/COUNTY/tl_2025_us_county.zip"
CARBON_URL = "https://usfs-public.box.com/shared/static/v861gwms9fq68sitl0r3vs2v3moxeu9x.zip"

# Download both archives into RAW. The return values are searched below with
# .rglob(), i.e. they are treated as the folders holding the extracted files.
counties_dir = download_and_extract(COUNTIES_URL, RAW)
carbon_dir = download_and_extract(CARBON_URL, RAW)

# Locate the extracted inputs. Both lookups take the first recursive match and
# fail with an explicit message (instead of a bare IndexError / StopIteration)
# when the archive did not contain the expected file type.
shp_path = next(counties_dir.rglob("*.shp"), None)
if shp_path is None:
    raise FileNotFoundError(f"No .shp file found under {counties_dir}")

# NOTE(review): this picks the first .tif that rglob yields. If the carbon
# archive ships more than one raster, confirm this is the intended one.
raster_path = next(carbon_dir.rglob("*.tif"), None)
if raster_path is None:
    raise FileNotFoundError(f"No .tif file found under {carbon_dir}")

print("✅ Descargas completas.")


File downloaded: tl_2025_us_county.zip
Files extracted in: tl_2025_us_county
File downloaded: v861gwms9fq68sitl0r3vs2v3moxeu9x.zip
Extracting files to v861gwms9fq68sitl0r3vs2v3moxeu9x...
Extraction completed
✅ Descargas completas.


In [None]:
# Keep only the counties that fall inside the ROI and save them to PROCESSED.

# Search recursively, the same way the download cell does, so a shapefile
# extracted into a sub-folder is still found (a plain glob would miss it).
shp_path = next(counties_dir.rglob("*.shp"), None)
if shp_path is None:
    raise FileNotFoundError(f"No .shp file found under {counties_dir}")
counties_all = gpd.read_file(shp_path)

# State codes as stored in the STATEFP column of the .shp file:
# Michigan (26), Minnesota (27), Wisconsin (55).
ROI_STATE_FIPS = ["26", "27", "55"]
filtered = counties_all[counties_all["STATEFP"].isin(ROI_STATE_FIPS)]

# Stop here rather than writing an empty layer that every later step would
# silently process.
if filtered.empty:
    raise ValueError(f"No counties matched STATEFP codes {ROI_STATE_FIPS} in {shp_path}")

filtered_path = PROCESSED / "counties_MI_WI_MN.shp"
filtered.to_file(filtered_path)

print(f" Saved shapefiled in: {filtered_path}")
print("# of counties:", len(filtered))


In [None]:
# Clip the carbon raster to the ROI counties, for easier analysis and
# visualization. The raster keeps its own CRS; it is the county polygons that
# are reprojected to the raster CRS before masking.

# NOTE(review): this picks the first .tif that rglob yields. If the carbon
# archive ships more than one raster, confirm this is the intended one.
raster_path = next(carbon_dir.rglob("*.tif"), None)
if raster_path is None:
    raise FileNotFoundError(f"No .tif file found under {carbon_dir}")
print("Original raster:", raster_path)

with rasterio.open(raster_path) as src:
    raster_crs = src.crs
    # Re-read the filtered counties from disk and bring them into the raster CRS.
    gdf = gpd.read_file(filtered_path).to_crs(raster_crs)
    geoms = list(gdf.geometry)
    # crop=True asks mask() to crop the output to the extent of the shapes.
    # NOTE(review): pixels outside the polygons receive mask()'s fill value;
    # confirm the raster declares a nodata value so they are not read as data.
    out_image, out_transform = mask(src, geoms, crop=True)
    # Start from the source metadata and override only what the crop changed;
    # shape[1] and shape[2] of the masked array are used as height and width.
    out_meta = src.meta.copy()
    out_meta.update({
        "driver": "GTiff",
        "height": out_image.shape[1],
        "width": out_image.shape[2],
        "transform": out_transform
    })

clipped_raster = PROCESSED / "carbon_MI_WI_MN.tif"
with rasterio.open(clipped_raster, "w", **out_meta) as dest:
    dest.write(out_image)

print(f"Cut raster saved in: {clipped_raster}")