# Completeness of data series and outlier detection

## Import packages

In [None]:
import cartopy.crs as ccrs
import matplotlib.cbook
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
from c3s_eqc_automatic_quality_control import diagnostics, download, plot, utils

# Apply matplotlib's "seaborn-v0_8-notebook" style sheet; this changes the
# global plotting defaults, so it affects every figure created afterwards.
plt.style.use("seaborn-v0_8-notebook")

## Set variables

In [None]:
# Name of the variable extracted from the downloaded dataset
varname = "lake_surface_water_temperature"

# Analysis period, given as "YYYY-MM" strings
start, stop = "1997-01", "2000-01"

# Bounding box of the region of interest, as (min, max) slices in degrees
lat_slice = slice(46.30, 49.00)
lon_slice = slice(-92.10, -84.80)

## Set the data request

In [None]:
# Base request shared by all downloads. It carries no dates: the time period
# is applied separately from `start` and `stop` before downloading.
request = {
    "version": "4.0",
    "variable": "all",
    "format": "zip",
}

# Identifier of the collection the request is sent to
collection_id = "satellite-lake-water-temperature"

## Download data

In [None]:
# Add the dates between `start` and `stop` to the base request.
# NOTE(review): presumably this returns one or more dated requests, with the
# dates encoded as strings because of stringify_dates=True -- confirm against
# download.update_request_date.
requests = download.update_request_date(
    request, start=start, stop=stop, stringify_dates=True
)
# Download the data and pass it through utils.regionalise with the lon/lat
# slices defined above, so that `ds` only holds the region of interest.
# NOTE(review): `chunks` presumably splits the request into one download per
# year and month -- confirm against download.download_and_transform.
ds = download.download_and_transform(
    collection_id,
    requests,
    chunks={"year": 1, "month": 1},
    transform_func=utils.regionalise,
    transform_func_kwargs={"lon_slice": lon_slice, "lat_slice": lat_slice},
)

## Plot lakeid

In [None]:
# Map of the "lakeid" variable at the first time step, drawn on a plate carree
# projection with `show_stats` disabled. The return value is assigned to `_`
# so that the notebook does not echo it.
_ = plot.projected_map(
    ds["lakeid"].isel(time=0),
    projection=ccrs.PlateCarree(),
    show_stats=False,
)

## Reindex

In [None]:
# Coordinates of the two dimensions added below.
# Lake IDs: the distinct, non-missing values of the "lakeid" variable.
lakeid = xr.DataArray(np.unique(ds["lakeid"].values), dims=("lakeid"))
lakeid = lakeid.dropna("lakeid").astype(int)
# Quality thresholds: 0 and 4 are the minimum quality levels to compare.
min_quality_level = xr.DataArray([0, 4], dims=("min_quality_level"))

# Keep, for each lake ID, only the points belonging to that lake; comparing
# against `lakeid` broadcasts the data along a new "lakeid" dimension.
ds = ds.drop_vars("lakeid").where(ds["lakeid"] == lakeid)
# Likewise, keep for each threshold only the points whose quality level is at
# least that threshold, then label both new dimensions with their values.
ds = ds.where(ds["quality_level"] >= min_quality_level).assign_coords(
    lakeid=lakeid, min_quality_level=min_quality_level
)

# Reduce the selected variable with the spatial weighted mean diagnostic
da = diagnostics.spatial_weighted_mean(ds[varname])

## Plot spatial weighted mean

In [None]:
# Time series of the spatial weighted mean: one panel per lake ID and one line
# per minimum quality level. Time steps containing any missing value are
# dropped before plotting.
facet = da.dropna("time").plot(
    hue="min_quality_level", col="lakeid", figsize=(10, 6)
)
# plt.grid() and plt.title() only act on the current axes, i.e. on a single
# panel, and plt.title() would replace that panel's own lake label. Draw the
# grid on every panel and put the title on the figure instead.
for ax in facet.fig.axes:
    ax.grid()
facet.fig.subplots_adjust(top=0.88)  # leave room above the panel labels
facet.fig.suptitle("Spatial weighted mean")
plt.show()

# Print the percentage of missing time steps per lake, using the lowest
# quality threshold (min_quality_level=0).
missings = da.sel(min_quality_level=0).isnull().sum("time") / da.sizes["time"] * 100
# The loop variable is deliberately not called `lakeid`, so that the `lakeid`
# coordinate array defined earlier in the notebook is not overwritten.
for lake in missings["lakeid"].values:
    missing = float(missings.sel(lakeid=lake).squeeze())
    print(f"Missing values of lake ID {lake}: {missing:.2f} %")

## Boxplot

In [None]:
# Flatten the data to a table and draw one box per (lake ID, quality) pair
groupers = ["lakeid", "min_quality_level"]
df = da.to_dataframe()
df.boxplot(by=groupers)
plt.ylabel(f"{da.attrs['long_name']} [{da.attrs['units']}]")
plt.show()

# Tabulate the statistics behind each box: one column per group.
stats = {}
for key, group in df.groupby(groupers):
    values = group.dropna().to_numpy().squeeze()
    # boxplot_stats returns one entry per dataset; the single-element
    # unpacking fails loudly if a group yields anything but exactly one.
    (stats[key],) = matplotlib.cbook.boxplot_stats(values)
pd.DataFrame(stats)