# Completeness of data series and outlier detection

## Import packages

In [None]:
import cartopy.crs as ccrs
import matplotlib.cbook
import matplotlib.pyplot as plt
import pandas as pd
import xarray as xr
from c3s_eqc_automatic_quality_control import diagnostics, download, plot, utils

# Apply the "seaborn-v0_8-notebook" matplotlib style sheet to the figures below
plt.style.use("seaborn-v0_8-notebook")

## Set variables

In [None]:
# Analysis period, as year-month strings
start, stop = "1997-01", "1997-02"

# Longitude and latitude ranges of the region of interest
lon_slice = slice(28, 41)
lat_slice = slice(-16, 4)

# Name of the data variable to analyse
varname = "lake_surface_water_temperature"

## Set the data request

In [None]:
# Collection to download from, and the base request (dates are added later)
collection_id = "satellite-lake-water-temperature"
request = dict(version="4.0", variable="all", format="zip")

## Define functions to compute the spatial weighted mean of a region and to extract its lake IDs

In [None]:
def spatial_weighted_mean_of_region(ds, lon_slice, lat_slice, varname, lakeids):
    """Compute the spatial weighted mean of ``varname`` separately for each lake.

    Parameters
    ----------
    ds
        Dataset holding ``varname`` and ``lakeid``, with ``time``, ``latitude``
        and ``longitude`` dimensions.
    lon_slice, lat_slice
        Longitude and latitude ranges forwarded to ``utils.regionalise``.
    varname
        Name of the variable to average.
    lakeids
        Lake identifiers; one spatial mean is computed for each of them.

    Returns
    -------
    Dataset containing ``varname`` with the per-lake results concatenated
    along a ``lakeid`` dimension.
    """
    subset = ds[[varname, "lakeid"]].chunk(
        {"time": 1, "latitude": 1_200, "longitude": 2_400}
    )
    region = utils.regionalise(subset, lon_slice=lon_slice, lat_slice=lat_slice)
    # For each lake, mask every cell that carries a different ID before
    # averaging, then tag the result with a length-1 "lakeid" dimension.
    per_lake = [
        diagnostics.spatial_weighted_mean(
            region[varname].where(region["lakeid"] == lakeid)
        ).expand_dims(lakeid=[lakeid])
        for lakeid in lakeids
    ]
    return xr.concat(per_lake, "lakeid").to_dataset()


def get_lakeid(ds, lon_slice, lat_slice):
    """Return the ``lakeid`` field at the first time step, cut to the region.

    ``lon_slice`` and ``lat_slice`` are forwarded to ``utils.regionalise``.
    The result is returned as a dataset holding the single ``lakeid`` variable.
    """
    first_step = (
        ds["lakeid"].isel(time=0).chunk({"latitude": 1_200, "longitude": 2_400})
    )
    region = utils.regionalise(first_step, lon_slice=lon_slice, lat_slice=lat_slice)
    return region.to_dataset()

## Download data

In [None]:
# Build the dated requests, then download and reduce the data with
# spatial_weighted_mean_of_region for the selected lakes
chunks = dict(year=1, month=1)
requests = download.update_request_date(
    request, start=start, stop=stop, stringify_dates=True
)
ds = download.download_and_transform(
    collection_id,
    requests,
    chunks=chunks,
    transform_func=spatial_weighted_mean_of_region,
    transform_func_kwargs=dict(
        lon_slice=lon_slice,
        lat_slice=lat_slice,
        varname=varname,
        lakeids=[3, 7, 10],
    ),
)
# Load the reduced variable into memory for the plots and statistics below
da = ds[varname].compute()

## Extract lake IDs to plot a map of the region

In [None]:
# Reuse one of the previously cached requests, restricted to a single month:
# get_lakeid only reads the first time step.
# Work on a copy so that ``requests`` is left untouched and this cell can be
# re-run; mutating requests[0] in place would reduce "month" to its first
# character on a second run.
single_request = dict(requests[0])
months = single_request["month"]
# Keep a plain-string month as is; otherwise take the first month of the list
single_request["month"] = months if isinstance(months, str) else months[0]
da_lakeid = download.download_and_transform(
    collection_id,
    single_request,
    chunks=chunks,
    transform_func=get_lakeid,
    transform_func_kwargs={
        "lon_slice": lon_slice,
        "lat_slice": lat_slice,
    },
)["lakeid"]

## Plot projected map of lake IDs

In [None]:
# Map the lake IDs of the region on a Plate Carrée projection (show_stats disabled)
_ = plot.projected_map(da_lakeid, projection=ccrs.PlateCarree(), show_stats=False)

## Plot spatial weighted mean

In [None]:
# Plot one time series per lake. Lakes are selected with ``sel`` rather than
# iterated with ``groupby`` so that (a) no loop variable overwrites the
# ``da_lakeid`` map extracted earlier, and (b) every series is 1-D along
# "time" whether or not groupby keeps a length-1 "lakeid" dimension
# (this differs between xarray versions).
lakeids = da["lakeid"].values
for lakeid in lakeids:
    da.sel(lakeid=lakeid).dropna("time").plot(label=lakeid)
plt.legend(title="lake ID")
plt.grid()
plt.title("Spatial weighted mean")
plt.show()

# Print the percentage of missing time steps for each lake
missings = da.isnull().sum("time") / da.sizes["time"] * 100
# Width of the longest lake ID, used to align the printed lines
id_digits = max(map(len, da["lakeid"].astype(str).values))
for lakeid in lakeids:
    # .item() yields a Python scalar, so the ":.2f" format always applies
    missing = missings.sel(lakeid=lakeid).item()
    print(f"Missing values of lake ID {lakeid:<{id_digits}}: {missing:.2f} %")

## Boxplot

In [None]:
# Box plot of the spatial weighted mean, one box per lake
df = da.to_dataframe()
df.boxplot(by="lakeid")
plt.ylabel(f"{da.attrs['standard_name']} [{da.attrs['units']}]")
plt.show()

# Collect the statistics behind each box, keyed by lake ID
stats_by_lake = {}
for lake, lake_frame in df.groupby("lakeid"):
    samples = lake_frame.dropna().values.squeeze()
    # boxplot_stats returns one dict per data series; exactly one is expected
    (stats_by_lake[lake],) = matplotlib.cbook.boxplot_stats(samples)
boxplot_stats = pd.DataFrame(stats_by_lake)
boxplot_stats