# Completeness of data series and outlier detection

## Import packages

In [None]:
import cartopy.crs as ccrs
import matplotlib.cbook
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
from c3s_eqc_automatic_quality_control import diagnostics, download, plot, utils

# Apply Matplotlib's "seaborn-v0_8-notebook" style sheet to the figures created below
plt.style.use("seaborn-v0_8-notebook")

## Set variables

In [None]:
# Time: first and last year to request (both ends are included when the
# per-year requests are built)
year_start, year_stop = 1995, 2023

# Region: longitude/latitude windows forwarded to utils.regionalise at download time
lon_slice = slice(-92.10, -84.80)
lat_slice = slice(46.30, 49.00)

# Variable: name of the dataset variable that gets spatially averaged and plotted
varname = "lake_surface_water_temperature"

## Set the data request

In [None]:
# Identifier of the collection handed to download.download_and_transform
collection_id = "satellite-lake-water-temperature"

# One request per year in [year_start, year_stop], each asking for day numbers
# 01-31 of July, August and September. Years before 2021 request version
# "4_5"; 2021 and later request "4_5_2".
requests = [
    {
        "variable": "all",
        "year": [str(year)],
        "month": ["07", "08", "09"],
        "day": [f"{day:02d}" for day in range(1, 32)],
        "version": ["4_5" if year < 2021 else "4_5_2"],
    }
    for year in range(year_start, year_stop + 1)
]

## Download data

In [None]:
# Keyword arguments for utils.regionalise: the longitude/latitude windows
regionalise_kwargs = {"lon_slice": lon_slice, "lat_slice": lat_slice}

# Retrieve every request and pass utils.regionalise as the transform function.
# NOTE(review): chunks={"year": 1} presumably makes the helper process one
# year at a time - confirm against download_and_transform's documentation.
ds = download.download_and_transform(
    collection_id,
    requests,
    chunks={"year": 1},
    transform_func=utils.regionalise,
    transform_func_kwargs=regionalise_kwargs,
)

## Plot lake IDs

In [None]:
# Lake-ID field at the first time step, drawn on a plate carree projection
# with show_stats disabled. The result is bound to `_` so the notebook does
# not echo the returned object.
first_step_lakeid = ds["lakeid_CCI"].isel(time=0)
_ = plot.projected_map(
    first_step_lakeid,
    show_stats=False,
    projection=ccrs.PlateCarree(),
)

## Compute spatial weighted mean

In [None]:
# Distinct lake IDs found in the selected region, as an integer coordinate.
# NaN entries (presumably pixels that belong to no lake) are dropped.
unique_ids = np.unique(ds["lakeid_CCI"].values)
lakeid = xr.DataArray(unique_ids, dims="lakeid_CCI")
lakeid = lakeid.dropna("lakeid_CCI").astype(int)

# Quality thresholds to compare: keep everything (>= 0) vs. only levels >= 4
min_quality_level = xr.DataArray([0, 4], dims="min_quality_level")

# Comparing against `lakeid` broadcasts along a new "lakeid_CCI" dimension, so
# each slice keeps only the pixels of a single lake. The mask is built from the
# original dataset before the "lakeid_CCI" variable is dropped.
lake_mask = ds["lakeid_CCI"] == lakeid
ds = ds.drop_vars("lakeid_CCI").where(lake_mask)

# Same trick along "min_quality_level": one masked copy per threshold
quality_mask = ds["lswt_quality_level"] >= min_quality_level
ds = ds.where(quality_mask)

# Label the two new dimensions with the values they were built from
ds = ds.assign_coords(lakeid_CCI=lakeid, min_quality_level=min_quality_level)

# Spatial weighted mean
da = diagnostics.spatial_weighted_mean(ds[varname])

## Plot spatial weighted mean

In [None]:
# One panel per lake and one line per quality threshold; time steps that
# contain a missing value are dropped before plotting
series = da.dropna("time")
facet = series.plot(
    col="lakeid_CCI",
    hue="min_quality_level",
    figsize=(10, 6),
)
for ax in facet.axs.flat:
    ax.grid()
plt.suptitle("Spatial weighted mean", va="bottom")

# Print missing values: share of time steps (in percent) without a value for
# each lake, using the min_quality_level=0 series
n_times = da.sizes["time"]
n_missing = da.sel(min_quality_level=0).isnull().sum("time")
missings = n_missing / n_times * 100
# NOTE(review): the loop variable `lakeid` rebinds the `lakeid` DataArray
# created in the spatial-mean cell; consider renaming it.
for lakeid, missing in missings.groupby("lakeid_CCI", squeeze=False):
    print(f"Missing values of lake ID {lakeid}: {float(missing.squeeze()):.2f} %")

## Boxplot

In [None]:
# Tabular view of the spatial means; the same keys drive both the boxplot
# grouping and the statistics below
df = da.to_dataframe()
group_keys = ["lakeid_CCI", "min_quality_level"]

# One box per (lake, quality threshold) combination
df.boxplot(by=group_keys)
plt.ylabel(f"{da.attrs['long_name']} [{da.attrs['units']}]")
plt.show()

# Show stats: the numbers behind each box, one column per group.
# boxplot_stats returns a list; the single-element unpacking fails loudly if it
# ever yields anything other than exactly one entry.
stats = {}
for label, grouped_df in df.groupby(group_keys):
    values = grouped_df.dropna().values.squeeze()
    (group_stats,) = matplotlib.cbook.boxplot_stats(values)
    stats[label] = group_stats
pd.DataFrame(stats)