# Statistical analysis

# Completeness of data series and outlier detection

Use Case: Check the completeness of the lake water temperature time series for the Great African Lakes and detect outliers.

User Question: Is the satellite lake water temperature dataset for the Great African Lakes complete in time? Are there any outliers?

Methods:

```
    - Select the Great African Lakes area and extract the mean lake water temperature
    - Plot the time series
    - Calculate the percentage of missing values
    - Draw a boxplot of the values and detect outliers
```

## Import packages

In [None]:
import cartopy.crs as ccrs
import matplotlib.pyplot as plt
from c3s_eqc_automatic_quality_control import diagnostics, download, plot, utils

# Apply the "seaborn-v0_8-notebook" matplotlib style sheet to the figures drawn below
plt.style.use("seaborn-v0_8-notebook")

## Set variables

In [None]:
# Analysis period, given as "YYYY-MM" strings
start, stop = "1997-01", "1997-02"

# Bounding box of the region of interest
# NOTE(review): presumably degrees east / degrees north -- confirm against the dataset
lat_slice = slice(-16, 4)
lon_slice = slice(28, 41)

# Name of the dataset variable that is analysed
varname = "lake_surface_water_temperature"

## Set the data request

In [None]:
# Identifier of the collection the data is downloaded from
collection_id = "satellite-lake-water-temperature"

# Date-independent part of the request; the period is added when the
# download requests are built from it
request = dict(version="4.0", variable="all", format="zip")

## Define function to extract region and compute spatial weighted mean

In [None]:
def spatial_weighted_mean_of_region(ds, lon_slice, lat_slice, varname):
    """Return the spatial weighted mean of one variable over a region.

    Only ``varname`` is kept from ``ds``; the result is cropped with
    ``utils.regionalise`` and then reduced with
    ``diagnostics.spatial_weighted_mean``.

    Parameters
    ----------
    ds
        Input dataset containing ``varname``.
    lon_slice, lat_slice
        Longitude and latitude slices forwarded to ``utils.regionalise``.
    varname
        Name of the variable to keep.

    Returns
    -------
    The object returned by ``diagnostics.spatial_weighted_mean``.
    """
    # Double brackets keep a dataset (not a bare array) holding just this variable
    region = utils.regionalise(
        ds[[varname]], lon_slice=lon_slice, lat_slice=lat_slice
    )
    return diagnostics.spatial_weighted_mean(region)

## Download data

In [None]:
# Chunking passed to the downloader
# NOTE(review): presumably one download per year and month -- confirm
chunks = dict(year=1, month=1)

# Add the analysis period to the date-independent request
requests = download.update_request_date(
    request,
    start=start,
    stop=stop,
    stringify_dates=True,
)

# Download the data, applying the regional weighted mean as transform function
ds = download.download_and_transform(
    collection_id,
    requests,
    chunks=chunks,
    transform_func=spatial_weighted_mean_of_region,
    transform_func_kwargs=dict(
        lon_slice=lon_slice,
        lat_slice=lat_slice,
        varname=varname,
    ),
)

# Keep only the variable of interest for the analysis below
da = ds[varname]

## Extract lake id to plot a map of the region

In [None]:
# Reuse one of the requests previously cached, restricted to its first month.
# A new dict is built instead of assigning into requests[0]: assigning in place
# would alter the shared `requests` list, and re-running this cell would then
# index into the already-reduced "month" value.
single_request = {**requests[0], "month": requests[0]["month"][0]}

# Download the untransformed data for that single request
ds_raw = download.download_and_transform(
    collection_id,
    single_request,
    chunks=chunks,
)

# Lake-id field at the first time step, cropped to the region of interest
da_lakeid = utils.regionalise(
    ds_raw["lakeid"].isel(time=0), lon_slice=lon_slice, lat_slice=lat_slice
)

## Plot projected map

In [None]:
# Map the lake-id field of the region using a PlateCarree projection.
# The return value is bound to `_`, presumably to keep it out of the cell output.
_ = plot.projected_map(da_lakeid, projection=ccrs.PlateCarree())

## Plot spatial weighted mean

In [None]:
# Plot the spatial weighted mean with xarray's default plot and add a title
da.plot()
_ = plt.title("Spatial weighted mean")

## Percentage of missing values

In [None]:
# Share of null entries in the series, expressed in percent.
# The variable name is kept for compatibility although it holds a percentage.
num_missing = float(da.isnull().sum() / da.size * 100)

# The value is a percentage, so the label says so (it used to read "Number")
print(f"Percentage of missing values: {num_missing:.2f} %.")

## Boxplot

In [None]:
# Keep only the non-null values and rechunk with chunk(-1)
# NOTE(review): presumably the single chunk is needed by quantile() -- confirm
valid_da = da.where(da.notnull().compute(), drop=True).chunk(-1)

# Draw the boxplot of the valid values and label the axes
plt.boxplot(valid_da)
plt.xlabel("Array")
plt.ylabel("lake surface skin temperature")

# First quartile, median and third quartile of the valid values
da_qiles = valid_da.quantile([0.25, 0.5, 0.75])
q1, median, q3 = (da_qiles.sel(quantile=q) for q in (0.25, 0.5, 0.75))

# Interquartile range, used to place the whisker limits at 1.5 * IQR
iqr = q3 - q1

# Summary statistics, in the order in which they are printed
stats = dict(
    [
        ("median", float(median)),
        ("IQR upper bound", float(q3 + (1.5 * iqr))),
        ("IQR lower bound", float(q1 - (1.5 * iqr))),
        ("minimum", float(da.min())),
        ("maximum", float(da.max())),
    ]
)

# Report every statistic with the units attached to the data
for key, value in stats.items():
    print(f"The {key} value is {value:.2f} {valid_da.units}")

# The series has no outliers when both extremes lie within the whisker limits
no_outliers = (
    stats["IQR lower bound"] <= stats["minimum"]
    and stats["maximum"] <= stats["IQR upper bound"]
)
print(f"\nThere are {'NO' if no_outliers else 'SOME'} outliers in the series.")