In [None]:
from datetime import datetime
from pathlib import Path

import cartopy.crs as ccrs
import cartopy.feature as cfeature
import geopandas as gpd
import matplotlib.path as mpath
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
def set_circular_boundary(ax, min_lat=55):
    """Configure a polar map axis: extent, base layers, gridlines and a round outline.

    Args:
        ax: Cartopy map axis to configure; it is modified in place.
        min_lat: Lowest latitude (degrees) of the visible extent. Defaults to 55.
    """
    # All longitudes, from ``min_lat`` up to the pole.
    lon_lat_extent = [-180, 180, min_lat, 90]
    ax.set_extent(lon_lat_extent, crs=ccrs.PlateCarree())

    # Base layers: filled land first, then the two thin outline layers.
    ax.add_feature(cfeature.LAND, facecolor="lightgrey")
    for outline_feature in (cfeature.BORDERS, cfeature.COASTLINE):
        ax.add_feature(outline_feature, linewidth=0.5)
    ax.gridlines(draw_labels=True, color="gray", alpha=0.5, linestyle="--")

    # A circle of radius 0.5 centred on (0.5, 0.5), given in axes coordinates
    # (see the ``transAxes`` transform), replaces the default axes outline.
    angles = np.linspace(0, 2 * np.pi, 100)
    unit_circle = np.column_stack((np.sin(angles), np.cos(angles)))
    outline_path = mpath.Path(unit_circle * 0.5 + [0.5, 0.5])
    ax.set_boundary(outline_path, transform=ax.transAxes)


# Keyword arguments shared by every ``.plot(...)`` call that draws grid tiles
# on a polar map; they are unpacked with ``**polar_kwargs``.
polar_kwargs = dict(
    cmap="viridis",
    legend=True,
    transform=ccrs.PlateCarree(),
    linewidth=0.1,
    edgecolor="black",
    aspect="equal",
)

In [None]:
# Directory that holds the statistics file (absolute path on the project file system).
out = Path("/isipd/projects/p_aicore_pf/darts-nextgen/data")
# Scene statistics table (presumably Sentinel-2 scenes, going by the file name).
# Later cells read its id, grid_code, date, cloud_coverage and area columns.
s2stats_gdf = gpd.read_parquet(out / "s2stats.parquet")
# Preview the first rows, then print the column / dtype summary.
display(s2stats_gdf.head())
s2stats_gdf.info()

In [None]:
# Scenes per grid code: number of non-null ``id`` values in each ``grid_code`` group.
counts_per_grid = s2stats_gdf.groupby("grid_code")["id"].count()

# Attach the counts to the grid polygons by tile name (tiles without a match
# get 0) and keep only the tiles that have at least one scene.
mgrs = gpd.read_file("../data/sentinel-2_grids.geojson")
mgrs = mgrs.assign(counts_per_grid=mgrs["Name"].map(counts_per_grid).fillna(0).astype(int))
mgrs = mgrs.loc[mgrs["counts_per_grid"] != 0]

In [None]:
# Overview map: scene count per grid tile on a north-polar stereographic axis.
polar_projection = ccrs.NorthPolarStereo()
fig, ax = plt.subplots(figsize=(16, 8), subplot_kw={"projection": polar_projection})
set_circular_boundary(ax, min_lat=55)
# Colour every tile by its ``counts_per_grid`` value.
mgrs.plot(column="counts_per_grid", ax=ax, **polar_kwargs)
ax.set_title("Counts per Grid (Polar Stereographic)")

## General Statistics

In [None]:
# --- Quantities over the complete, unfiltered table --------------------------
n_scenes = s2stats_gdf.shape[0]
total_area = s2stats_gdf["area"].sum()

# A pixel is 10 x 10 units, divided by 1e6 to express it in km2
# (presumably 10 m ground resolution - confirm).
area_per_pixel = 10 * 10 / 1e6  # km2
n_pixels = total_area / area_per_pixel

n_bands = 4 + 1  # optical plus scl
precision = 16  # bits per value; the division by 8 below turns bits into bytes
download_size = n_pixels * n_bands * precision / 8

# --- Report -------------------------------------------------------------------
print(f"Estimated number of scenes: {n_scenes / 1e6:.1f} Million")
print(f"Estimated total area: {total_area / 1e9:.1f} Billion km2")
print(f"Estimated number of pixels: {n_pixels / 1e12:.1f} Trillion")
print(f"Estimated download size: {download_size / 1e12:.1f} TB")

In [None]:
# Distribution of the ``date`` column, drawn on a wide (aspect 4) figure.
sns.displot(data=s2stats_gdf, x="date", aspect=4)

### Temporal Extent

In [None]:
# One polar map per year: scenes per grid tile, restricted to the listed months.
years = [2015, 2016, 2017, 2018, 2023, 2024]
months = [5, 6, 7, 8, 9, 10]
fig, axs = plt.subplots(
    2, len(years) // 2, figsize=(10 * len(years) // 2, 20), subplot_kw={"projection": ccrs.NorthPolarStereo()}
)
axs = axs.flatten()

# Loop-invariant work, done once instead of once per panel: the month filter
# does not depend on the year, and the grid file is the same for every panel.
scene_year = s2stats_gdf["date"].dt.year
is_month = s2stats_gdf["date"].dt.month.isin(months)
mgrs_grid = gpd.read_file("../data/sentinel-2_grids.geojson")

for i, year in enumerate(years):
    is_year = scene_year == year
    counts_per_grid_year = s2stats_gdf[is_year & is_month].groupby(["grid_code"]).count().id
    # assign() returns a new frame, so the shared grid table is never modified.
    mgrs_year = mgrs_grid.assign(counts_per_grid=mgrs_grid["Name"].map(counts_per_grid_year).fillna(0).astype(int))
    # Draw only the tiles that have at least one matching scene.
    mgrs_year = mgrs_year[mgrs_year.counts_per_grid != 0]

    set_circular_boundary(axs[i])
    mgrs_year.plot(column="counts_per_grid", ax=axs[i], **polar_kwargs)
    axs[i].set_title(f"Year {year}")
fig.tight_layout()

In [None]:
# One polar map per month of a single year: scenes per grid tile.
year = 2018  # single source of truth for both the filter and the panel titles
months = [5, 6, 7, 8, 9, 10]
fig, axs = plt.subplots(
    2, len(months) // 2, figsize=(10 * len(months) // 2, 20), subplot_kw={"projection": ccrs.NorthPolarStereo()}
)
axs = axs.flatten()

# Loop-invariant work, done once instead of once per panel: the year filter
# does not depend on the month, and the grid file is the same for every panel.
is_year = s2stats_gdf["date"].dt.year == year
scene_month = s2stats_gdf["date"].dt.month
mgrs_grid = gpd.read_file("../data/sentinel-2_grids.geojson")

for i, month in enumerate(months):
    is_month = scene_month == month
    counts_per_grid_year = s2stats_gdf[is_year & is_month].groupby(["grid_code"]).count().id
    # assign() returns a new frame, so the shared grid table is never modified.
    mgrs_year = mgrs_grid.assign(counts_per_grid=mgrs_grid["Name"].map(counts_per_grid_year).fillna(0).astype(int))
    # Draw only the tiles that have at least one matching scene.
    mgrs_year = mgrs_year[mgrs_year.counts_per_grid != 0]

    set_circular_boundary(axs[i])
    mgrs_year.plot(column="counts_per_grid", ax=axs[i], **polar_kwargs)
    axs[i].set_title(f"{month} {year}")
fig.tight_layout()

In [None]:
# One polar map per month of a single year: scenes per grid tile.
year = 2019  # single source of truth for both the filter and the panel titles
months = [5, 6, 7, 8, 9, 10]
fig, axs = plt.subplots(
    2, len(months) // 2, figsize=(10 * len(months) // 2, 20), subplot_kw={"projection": ccrs.NorthPolarStereo()}
)
axs = axs.flatten()

# Loop-invariant work, done once instead of once per panel: the year filter
# does not depend on the month, and the grid file is the same for every panel.
is_year = s2stats_gdf["date"].dt.year == year
scene_month = s2stats_gdf["date"].dt.month
mgrs_grid = gpd.read_file("../data/sentinel-2_grids.geojson")

for i, month in enumerate(months):
    is_month = scene_month == month
    counts_per_grid_year = s2stats_gdf[is_year & is_month].groupby(["grid_code"]).count().id
    # assign() returns a new frame, so the shared grid table is never modified.
    mgrs_year = mgrs_grid.assign(counts_per_grid=mgrs_grid["Name"].map(counts_per_grid_year).fillna(0).astype(int))
    # Draw only the tiles that have at least one matching scene.
    mgrs_year = mgrs_year[mgrs_year.counts_per_grid != 0]

    set_circular_boundary(axs[i])
    mgrs_year.plot(column="counts_per_grid", ax=axs[i], **polar_kwargs)
    axs[i].set_title(f"{month} {year}")
fig.tight_layout()

## Relevant Statistics

In [None]:
# Build one record per (year, month, cloud-cover threshold) holding the number
# of scenes and their summed area at or below that threshold.
# The records are collected in a separate list so that ``s2stats_cum`` only
# ever names the final DataFrame.
cum_records = []

# Extract the date parts once instead of once per loop iteration.
scene_year = s2stats_gdf.date.dt.year
scene_month = s2stats_gdf.date.dt.month

for year in scene_year.unique():
    is_year = scene_year == year
    for month in [5, 6, 7, 8, 9, 10]:
        is_year_month = is_year & (scene_month == month)
        for cc in [1, 5, 10, 40]:
            # Single combined mask, reused for both the count and the area sum.
            is_selected = is_year_month & (s2stats_gdf.cloud_coverage <= cc)
            cum_records.append(
                {
                    "year": year,
                    "month": month,
                    # First day of the month, used as the sort / plot key.
                    "date": datetime(year, month, 1),
                    "cloud_coverage": int(cc),
                    "count": int(is_selected.sum()),
                    "area": s2stats_gdf.loc[is_selected, "area"].sum(),
                }
            )
s2stats_cum = pd.DataFrame(cum_records).sort_values("date")
s2stats_cum

### Cloud Cover thresholds

In [None]:
# Grouped bars: scene count per ``date`` value, one hue per cloud-cover threshold.
fig, ax = plt.subplots(figsize=(20, 5))
sns.barplot(data=s2stats_cum, x="date", y="count", hue="cloud_coverage", ax=ax)
# Vertical tick labels; the trailing semicolon hides the call's return value.
plt.xticks(rotation=90);

### Influence of 1% CC on extent

In [None]:
# One polar map per year: scenes per grid tile with cloud coverage <= 1,
# restricted to the listed months.
years = [2015, 2016, 2017, 2018, 2023, 2024]
months = [5, 6, 7, 8, 9, 10]

# The panel grid is sized from ``years`` - the list the loop iterates over.
# It was previously sized from ``months``, which only worked because both
# lists happen to contain six entries.
n_cols = len(years) // 2
fig, axs = plt.subplots(2, n_cols, figsize=(10 * n_cols, 20), subplot_kw={"projection": ccrs.NorthPolarStereo()})
axs = axs.flatten()

# Loop-invariant work, done once instead of once per panel: neither the month
# filter nor the cloud-cover filter depends on the year, and the grid file is
# the same for every panel.
scene_year = s2stats_gdf["date"].dt.year
is_month = s2stats_gdf["date"].dt.month.isin(months)
is_cct = s2stats_gdf["cloud_coverage"] <= 1
mgrs_grid = gpd.read_file("../data/sentinel-2_grids.geojson")

for i, year in enumerate(years):
    is_year = scene_year == year
    counts_per_grid_year = s2stats_gdf[is_year & is_month & is_cct].groupby(["grid_code"]).count().id
    # assign() returns a new frame, so the shared grid table is never modified.
    mgrs_year = mgrs_grid.assign(counts_per_grid=mgrs_grid["Name"].map(counts_per_grid_year).fillna(0).astype(int))
    # Draw only the tiles that have at least one matching scene.
    mgrs_year = mgrs_year[mgrs_year.counts_per_grid != 0]

    set_circular_boundary(axs[i])
    mgrs_year.plot(column="counts_per_grid", ax=axs[i], **polar_kwargs)
    axs[i].set_title(f"Year {year}")
fig.tight_layout()

### Include October?

In [None]:
# One polar map per year: October scenes per grid tile with cloud coverage <= 10.
years = [2016, 2017, 2018, 2023]
fig, axs = plt.subplots(
    1, len(years), figsize=(10 * len(years), 10), subplot_kw={"projection": ccrs.NorthPolarStereo()}
)
axs = axs.flatten()

# Loop-invariant work, done once instead of once per panel: the month and
# cloud-cover filters do not depend on the year, and the grid file is the same
# for every panel.
scene_year = s2stats_gdf["date"].dt.year
is_month = s2stats_gdf["date"].dt.month == 10
is_cct = s2stats_gdf["cloud_coverage"] <= 10
mgrs_grid = gpd.read_file("../data/sentinel-2_grids.geojson")

for i, year in enumerate(years):
    is_year = scene_year == year
    counts_per_grid_year = s2stats_gdf[is_year & is_month & is_cct].groupby(["grid_code"]).count().id
    # assign() returns a new frame, so the shared grid table is never modified.
    mgrs_year = mgrs_grid.assign(counts_per_grid=mgrs_grid["Name"].map(counts_per_grid_year).fillna(0).astype(int))
    # Draw only the tiles that have at least one matching scene.
    mgrs_year = mgrs_year[mgrs_year.counts_per_grid != 0]

    set_circular_boundary(axs[i])
    mgrs_year.plot(column="counts_per_grid", ax=axs[i], **polar_kwargs)
    axs[i].set_title(f"October {year}")
fig.tight_layout()

## Advanced Statistics

In [None]:
# Assumption: Only CC <= 10% and month in [6, 7, 8, 9]

# --- Select the scenes --------------------------------------------------------
is_cct = s2stats_gdf["cloud_coverage"].le(10)
is_month = s2stats_gdf["date"].dt.month.isin([6, 7, 8, 9])
s2stats_gdf_filtered = s2stats_gdf.loc[is_cct & is_month]

# --- Derive the totals --------------------------------------------------------
n_scenes = s2stats_gdf_filtered.shape[0]
total_area = s2stats_gdf_filtered["area"].sum()
area_per_pixel = 10 * 10 / 1e6  # km2
n_pixels = total_area / area_per_pixel
n_bands = 4 + 1  # optical plus scl
precision = 16  # bits per value; the division by 8 below turns bits into bytes
download_size = n_pixels * n_bands * precision / 8

# --- Report -------------------------------------------------------------------
print(f"Estimated number of scenes: {n_scenes / 1e3:.1f} Thousand")
print(f"Estimated total area: {total_area / 1e9:.1f} Billion km2")
print(f"Estimated number of pixels: {n_pixels / 1e12:.1f} Trillion")
print(f"Estimated download size: {download_size / 1e12:.1f} TB")

In [None]:
# Assumption: Only CC <= 1% and month in [5, 6, 7, 8, 9, 10]

# --- Select the scenes --------------------------------------------------------
is_cct = s2stats_gdf["cloud_coverage"].le(1)
is_month = s2stats_gdf["date"].dt.month.isin([5, 6, 7, 8, 9, 10])
s2stats_gdf_filtered = s2stats_gdf.loc[is_cct & is_month]

# --- Derive the totals --------------------------------------------------------
n_scenes = s2stats_gdf_filtered.shape[0]
total_area = s2stats_gdf_filtered["area"].sum()
area_per_pixel = 10 * 10 / 1e6  # km2
n_pixels = total_area / area_per_pixel
n_bands = 4 + 1  # optical plus scl
precision = 16  # bits per value; the division by 8 below turns bits into bytes
download_size = n_pixels * n_bands * precision / 8

# --- Report -------------------------------------------------------------------
print(f"Estimated number of scenes: {n_scenes / 1e3:.1f} Thousand")
print(f"Estimated total area: {total_area / 1e9:.1f} Billion km2")
print(f"Estimated number of pixels: {n_pixels / 1e12:.1f} Trillion")
print(f"Estimated download size: {download_size / 1e12:.1f} TB")

In [None]:
# Assumption: Only CC <= 10% and month in [6, 7, 8, 9] -> calculate estimate for a single month
# Remove 2015-2017 and 2023 to remove bias

selected_months = [6, 7, 8, 9]
selected_years = [2018, 2019, 2020, 2021, 2022, 2024]
# Number of (year, month) pairs the totals are averaged over. Derived from the
# two lists so the divisor cannot drift away from the filters (4 * 6 = 24 here).
n_year_months = len(selected_months) * len(selected_years)

is_cct = s2stats_gdf["cloud_coverage"] <= 10
is_month = s2stats_gdf["date"].dt.month.isin(selected_months)
is_year = s2stats_gdf["date"].dt.year.isin(selected_years)

s2stats_gdf_filtered = s2stats_gdf[is_cct & is_month & is_year]
# Average per single month: total over the selection divided by the pair count.
n_scenes = len(s2stats_gdf_filtered) / n_year_months
print(f"Estimated number of scenes: {n_scenes / 1e3:.1f} Thousand")
total_area = s2stats_gdf_filtered["area"].sum() / n_year_months
print(f"Estimated total area: {total_area / 1e6:.1f} Million km2")
area_per_pixel = 10 * 10 / 1e6  # km2
n_pixels = total_area / area_per_pixel
print(f"Estimated number of pixels: {n_pixels / 1e9:.1f} Billion")
n_bands = 4 + 1  # optical plus scl
precision = 16  # bits per value; the division by 8 below turns bits into bytes
download_size = n_pixels * n_bands * precision / 8
print(f"Estimated download size: {download_size / 1e12:.1f} TB")

In [None]:
# Assumption: Only CC <= 1% and month in [5, 6, 7, 8, 9, 10] -> calculate estimate for a single month
# Remove 2015-2017 and 2023 to remove bias

selected_months = [5, 6, 7, 8, 9, 10]
selected_years = [2018, 2019, 2020, 2021, 2022, 2024]
# Number of (year, month) pairs the totals are averaged over: 6 months * 6
# years = 36. The divisor used to be a hard-coded 4 * 6 = 24, which matched a
# four-month selection and inflated every per-month estimate here by 1.5x.
n_year_months = len(selected_months) * len(selected_years)

is_cct = s2stats_gdf["cloud_coverage"] <= 1
is_month = s2stats_gdf["date"].dt.month.isin(selected_months)
is_year = s2stats_gdf["date"].dt.year.isin(selected_years)

s2stats_gdf_filtered = s2stats_gdf[is_cct & is_month & is_year]
# Average per single month: total over the selection divided by the pair count.
n_scenes = len(s2stats_gdf_filtered) / n_year_months
print(f"Estimated number of scenes: {n_scenes / 1e3:.1f} Thousand")
total_area = s2stats_gdf_filtered["area"].sum() / n_year_months
print(f"Estimated total area: {total_area / 1e6:.1f} Million km2")
area_per_pixel = 10 * 10 / 1e6  # km2
n_pixels = total_area / area_per_pixel
print(f"Estimated number of pixels: {n_pixels / 1e9:.1f} Billion")
n_bands = 4 + 1  # optical plus scl
precision = 16  # bits per value; the division by 8 below turns bits into bytes
download_size = n_pixels * n_bands * precision / 8
print(f"Estimated download size: {download_size / 1e12:.1f} TB")