diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4420837..957d888 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,7 +22,7 @@ repos: rev: v2.4.0 hooks: - id: flake8 - args: ["--max-line-length=88", "--exclude=__init__.py", "--ignore=W605,W503,F722,C901"] + args: ["--max-line-length=88", "--exclude=__init__.py", "--ignore=W605,W503,F722,C901,F401"] - repo: https://github.com/pre-commit/mirrors-isort rev: v4.3.21 diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 81ceaef..daf3a54 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,41 +7,80 @@ esmtools v1.1.1 (2019-12-##) Features -------- -- xarray implementation of ``statsmodels.stats.multitest.multipletests``. (:pr:`71`) `Aaron Spring`_ +- ``xarray`` implementation of ``statsmodels.stats.multitest.multipletests``. + (:pr:`71`) `Aaron Spring`_ +- Implements ``nan_policy=...`` keyword for ``stats.linear_slope``, + ``stats.linregress``, ``stats.polyfit``, ``stats.rm_poly``, ``stats.rm_trend``. + (:pr:`70`) `Riley X. Brady`_. + + * ``'none', 'propagate'``: Propagate nans through function. I.e., return a nan for + a given grid cell if a nan exists in it. + * ``'raise'``: Raise an error if there are any nans in the datasets. + * ``'drop', 'omit'``: Like ``skipna``, compute statistical function after removing + nans. + +- Adds support for datetime axes in ``stats.linear_slope``, ``stats.linregress``, + ``stats.polyfit``, ``stats.rm_poly``, ``stats.rm_trend``. Converts datetimes to + numeric time, computes function, and then converts back to datetime. + (:pr:`70`)`Riley X. Brady`_. +- ``stats.linear_slope``, ``stats.linregress``, ``stats.polyfit``, ``stats.rm_poly``, + and ``stats.rm_trend`` are now dask-compatible and vectorized better. + (:pr:`70`) `Riley X. Brady`_. + Bug Fixes --------- +- Does not eagerly evaluate ``dask`` arrays anymore. (:pr:`70`) `Riley X. Brady`_. Internals/Minor Fixes --------------------- -- Adds ``isort`` and ``nbstripout`` to CI for development. Blacken and isort code. (:pr:`73`) `Riley X. Brady`_ +- Adds ``isort`` and ``nbstripout`` to CI for development. Blacken and isort code. + (:pr:`73`) `Riley X. Brady`_ Documentation ------------- -- Add more robust API docs page, information on how to contribute, CHANGELOG, etc. to ``sphinx``. (:pr:`67`) `Riley X. Brady`_. +- Add more robust API docs page, information on how to contribute, CHANGELOG, etc. to + ``sphinx``. (:pr:`67`) `Riley X. Brady`_. Deprecations ------------ -- Remove ``mpas`` and ``vis`` modules. The former is better for a project-dependent package. The latter essentially poorly replicates some of ``proplot`` functionality. (:pr:`69`) `Riley X. Brady`_. +- Removes ``mpas`` and ``vis`` modules. The former is better for a project-dependent + package. The latter essentially poorly replicates some of ``proplot`` functionality. + (:pr:`69`) `Riley X. Brady`_. +- Removes ``stats.smooth_series``, since there is an easy ``xarray`` function for it. + (:pr:`70`) `Riley X. Brady`_. +- Changes ``stats.linear_regression`` to ``stats.linregress``. + (:pr:`70`) `Riley X. Brady`_. +- Changes ``stats.compute_slope`` to ``stats.linear_slope``. + (:pr:`70`) `Riley X. Brady`_. esmtools v1.1 (2019-09-04) ========================== Features -------- -- ``co2_sol`` and ``schmidt`` now can be computed on grids and do not do time-averaging (:pr:`45`) `Riley X. Brady`_. -- ``temp_decomp_takahashi`` now returns a dataset with thermal/non-thermal components (:pr:`45`) `Riley X. Brady`_. 
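A minimal editorial sketch of the ``nan_policy`` keyword and datetime handling described in the features above (not part of this changeset; array values are arbitrary and the behavior comments paraphrase the notes above)::

    import numpy as np
    import xarray as xr
    import esmtools.stats as stats

    x = xr.DataArray(np.arange(1900.0, 1960.0), dims='time')  # float "year" axis
    y = xr.DataArray(np.random.rand(60), dims='time')
    y[5] = np.nan                                              # one missing value

    slope = stats.linear_slope(x, y, dim='time', nan_policy='omit')  # NaN pair dropped first
    # nan_policy='raise' would instead raise a ValueError because of the NaN;
    # the default ('none'/'propagate') leaves the NaN in place.
    # If x were a datetime axis, the slope would be converted back to original
    # units per inferred sampling frequency (e.g., degC per year).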
-- ``temporal`` module that includes a ``to_annual()`` function for weighted temporal resampling (:pr:`50`) `Riley X. Brady`_. -- ``filtering`` module renamed to ``spatial`` and ``find_indices`` made public. (:pr:`52`) `Riley X. Brady`_. +- ``co2_sol`` and ``schmidt`` now can be computed on grids and do not do time-averaging + (:pr:`45`) `Riley X. Brady`_. +- ``temp_decomp_takahashi`` now returns a dataset with thermal/non-thermal components + (:pr:`45`) `Riley X. Brady`_. +- ``temporal`` module that includes a ``to_annual()`` function for weighted temporal + resampling (:pr:`50`) `Riley X. Brady`_. +- ``filtering`` module renamed to ``spatial`` and ``find_indices`` made public. + (:pr:`52`) `Riley X. Brady`_. - ``standardize`` function moved to stats. (:pr:`52`) `Riley X. Brady`_. - ``loadutils`` removed (:pr:`52`) `Riley X. Brady`_. -- ``calculate_compatible_emissions`` following Jones et al. 2013 (:pr:`54`) `Aaron Spring`_ -- Update ``corr`` to broadcast ``x`` and ``y`` such that a single time series can be correlated across a grid. (:pr:`58`) `Riley X. Brady`_. -- ``convert_lon_to_180to180`` and ``convert_lon_to_0to360`` now wrapped with ``convert_lon`` and now supports 2D lat/lon grids. ``convert_lon()`` is also available as an accessor. (:pr:`60`) `Riley X. Brady`_. +- ``calculate_compatible_emissions`` following Jones et al. 2013 + (:pr:`54`) `Aaron Spring`_ +- Update ``corr`` to broadcast ``x`` and ``y`` such that a single time series can be + correlated across a grid. (:pr:`58`) `Riley X. Brady`_. +- ``convert_lon_to_180to180`` and ``convert_lon_to_0to360`` now wrapped with + ``convert_lon`` and now supports 2D lat/lon grids. ``convert_lon()`` is also + available as an accessor. (:pr:`60`) `Riley X. Brady`_. Internals/Minor Fixes --------------------- -- Changed name back to ``esmtools`` now that the readthedocs domain was cleared up. Thanks Andrew Walter! (:pr:`61`) `Riley X. Brady`_. +- Changed name back to ``esmtools`` now that the readthedocs domain was cleared up. + Thanks Andrew Walter! (:pr:`61`) `Riley X. Brady`_. - ``esmtools`` documentation created with docstring updates for all functions. esm_analysis v1.0.2 (2019-07-27) @@ -49,7 +88,8 @@ esm_analysis v1.0.2 (2019-07-27) Internals/Minor Fixes --------------------- -- Changed name from ``esmtools`` to ``esm_analysis`` since the former was registered on readthedocs. +- Changed name from ``esmtools`` to ``esm_analysis`` since the former was registered + on readthedocs. esmtools v1.0.1 (2019-07-25) ============================ diff --git a/ci/run-linter.sh b/ci/run-linter.sh index 6696865..3d727a9 100755 --- a/ci/run-linter.sh +++ b/ci/run-linter.sh @@ -7,10 +7,7 @@ echo "Code Styling with (black, flake8, isort)" source activate esmtools-dev echo "[flake8]" -flake8 esmtools --max-line-length=88 --exclude=__init__.py --ignore=W605,W503 +flake8 esmtools --max-line-length=88 --exclude=__init__.py --ignore=W605,W503,F722,C901,F401 echo "[black]" black --check --line-length=88 -S esmtools - -echo "[isort]" -isort --recursive -w 88 --check-only esmtools diff --git a/docs/source/api.rst b/docs/source/api.rst index cc2b334..11daf59 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -34,7 +34,9 @@ Composite Analysis .. currentmodule:: esmtools.composite -Functions pertaining to composite analysis. Composite analysis takes the mean view of some field (e.g., sea surface temperature) when some climate index (e.g., El Nino Southern Oscillation) is in its negative or positive mode. +Functions pertaining to composite analysis. 
Composite analysis takes the mean view of +some field (e.g., sea surface temperature) when some climate index +(e.g., El Nino Southern Oscillation) is in its negative or positive mode. .. autosummary:: :toctree: api/ @@ -97,18 +99,18 @@ Statistical functions. A portion directly wrap functions from ``climpred``. .. autosummary:: :toctree: api/ - standardize - nanmean - cos_weight + ACF area_weight - smooth_series - fit_poly - linear_regression + autocorr corr + cos_weight + linear_slope + linregress + polyfit + nanmean rm_poly rm_trend - autocorr - ACF + standardize Testing ------- @@ -123,7 +125,7 @@ Statistical tests. :toctree: api/ ttest_ind_from_stats - multipletest + multipletests Temporal -------- diff --git a/docs/source/api/esmtools.stats.fit_poly.rst b/docs/source/api/esmtools.stats.fit_poly.rst deleted file mode 100644 index 48c6995..0000000 --- a/docs/source/api/esmtools.stats.fit_poly.rst +++ /dev/null @@ -1,6 +0,0 @@ -esmtools.stats.fit\_poly -======================== - -.. currentmodule:: esmtools.stats - -.. autofunction:: fit_poly diff --git a/docs/source/api/esmtools.stats.linear_regression.rst b/docs/source/api/esmtools.stats.linear_regression.rst deleted file mode 100644 index dc4ab72..0000000 --- a/docs/source/api/esmtools.stats.linear_regression.rst +++ /dev/null @@ -1,6 +0,0 @@ -esmtools.stats.linear\_regression -================================= - -.. currentmodule:: esmtools.stats - -.. autofunction:: linear_regression diff --git a/docs/source/api/esmtools.stats.linear_slope.rst b/docs/source/api/esmtools.stats.linear_slope.rst new file mode 100644 index 0000000..8696dcb --- /dev/null +++ b/docs/source/api/esmtools.stats.linear_slope.rst @@ -0,0 +1,6 @@ +esmtools.stats.linear\_slope +============================ + +.. currentmodule:: esmtools.stats + +.. autofunction:: linear_slope diff --git a/docs/source/api/esmtools.stats.linregress.rst b/docs/source/api/esmtools.stats.linregress.rst new file mode 100644 index 0000000..a5d6ed7 --- /dev/null +++ b/docs/source/api/esmtools.stats.linregress.rst @@ -0,0 +1,6 @@ +esmtools.stats.linregress +========================= + +.. currentmodule:: esmtools.stats + +.. autofunction:: linregress diff --git a/docs/source/api/esmtools.stats.polyfit.rst b/docs/source/api/esmtools.stats.polyfit.rst new file mode 100644 index 0000000..a3c4b89 --- /dev/null +++ b/docs/source/api/esmtools.stats.polyfit.rst @@ -0,0 +1,6 @@ +esmtools.stats.polyfit +====================== + +.. currentmodule:: esmtools.stats + +.. autofunction:: polyfit diff --git a/docs/source/api/esmtools.stats.smooth_series.rst b/docs/source/api/esmtools.stats.smooth_series.rst deleted file mode 100644 index 7099a52..0000000 --- a/docs/source/api/esmtools.stats.smooth_series.rst +++ /dev/null @@ -1,6 +0,0 @@ -esmtools.stats.smooth\_series -============================= - -.. currentmodule:: esmtools.stats - -.. autofunction:: smooth_series diff --git a/docs/source/api/esmtools.testing.multipletests.rst b/docs/source/api/esmtools.testing.multipletests.rst new file mode 100644 index 0000000..0bd3c19 --- /dev/null +++ b/docs/source/api/esmtools.testing.multipletests.rst @@ -0,0 +1,6 @@ +esmtools.testing.multipletests +============================== + +.. currentmodule:: esmtools.testing + +.. 
autofunction:: multipletests diff --git a/docs/source/api/esmtools.stats.ttest_ind_from_stats.rst b/docs/source/api/esmtools.testing.ttest_ind_from_stats.rst similarity index 100% rename from docs/source/api/esmtools.stats.ttest_ind_from_stats.rst rename to docs/source/api/esmtools.testing.ttest_ind_from_stats.rst diff --git a/esmtools/carbon.py b/esmtools/carbon.py index 193f834..d1659bc 100644 --- a/esmtools/carbon.py +++ b/esmtools/carbon.py @@ -7,7 +7,7 @@ from tqdm import tqdm from .checks import is_xarray -from .stats import linear_regression, nanmean, rm_poly +from .stats import linregress, nanmean, rm_poly @is_xarray([0, 1]) @@ -51,9 +51,9 @@ def sol_calc(t, s): return ff ff = xr.apply_ufunc( - sol_calc, t, s, input_core_dims=[[], []], vectorize=True, dask="allowed" + sol_calc, t, s, input_core_dims=[[], []], vectorize=True, dask='allowed' ) - ff.attrs["units"] = "mol/kg/atm" + ff.attrs['units'] = 'mol/kg/atm' return ff @@ -88,13 +88,13 @@ def calc_schmidt(t): return Sc Sc = xr.apply_ufunc( - calc_schmidt, t, input_core_dims=[[]], vectorize=True, dask="allowed" + calc_schmidt, t, input_core_dims=[[]], vectorize=True, dask='allowed' ) return Sc @is_xarray(0) -def temp_decomp_takahashi(ds, time_dim="time", temperature="tos", pco2="spco2"): +def temp_decomp_takahashi(ds, time_dim='time', temperature='tos', pco2='spco2'): """Decompose surface pCO2 into thermal and non-thermal components. .. note:: @@ -130,19 +130,19 @@ def temp_decomp_takahashi(ds, time_dim="time", temperature="tos", pco2="spco2"): >>> decomp = temp_decomp_takahashi(ds) """ if temperature not in ds.data_vars: - raise ValueError(f"{temperature} is not a variable in your dataset.") + raise ValueError(f'{temperature} is not a variable in your dataset.') if pco2 not in ds.data_vars: - raise ValueError(f"{pco2} is not a variable in your dataset.") + raise ValueError(f'{pco2} is not a variable in your dataset.') fac = 0.0432 tos_mean = ds[temperature].mean(time_dim) tos_diff = ds[temperature] - tos_mean - thermal = (ds[pco2].mean(time_dim) * (np.exp(tos_diff * fac))).rename("thermal") - non_thermal = (ds[pco2] * (np.exp(tos_diff * -fac))).rename("non_thermal") + thermal = (ds[pco2].mean(time_dim) * (np.exp(tos_diff * fac))).rename('thermal') + non_thermal = (ds[pco2] * (np.exp(tos_diff * -fac))).rename('non_thermal') decomp = xr.merge([thermal, non_thermal]) decomp.attrs[ - "description" - ] = "Takahashi decomposition of pCO2 into thermal and non-thermal components." + 'description' + ] = 'Takahashi decomposition of pCO2 into thermal and non-thermal components.' return decomp @@ -221,7 +221,7 @@ def spco2_sensitivity(ds): """ def _check_variables(ds): - requiredVars = ["spco2", "tos", "sos", "talkos", "dissicos"] + requiredVars = ['spco2', 'tos', 'sos', 'talkos', 'dissicos'] if not all(i in ds.data_vars for i in requiredVars): missingVars = [i for i in requiredVars if i not in ds.data_vars] raise ValueError( @@ -234,23 +234,23 @@ def _check_variables(ds): # sensitivities at each grid cell. # TODO: Add keyword for sliding mean, as in N year chunks of time to # account for trends. 
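The carbon functions touched in this file (``sol_calc``, ``calc_schmidt``) all follow the same wrapping pattern: a scalar formula is handed to ``xr.apply_ufunc`` with ``vectorize=True`` and ``dask='allowed'`` so it broadcasts over whole grids and passes dask arrays through. A minimal editorial sketch of that pattern (the helper below is hypothetical, not from this repository)::

    import numpy as np
    import xarray as xr

    def _celsius_to_kelvin(t):  # hypothetical scalar helper
        return t + 273.15

    sst = xr.DataArray(np.random.rand(4, 5) * 30.0, dims=['lat', 'lon'])
    sst_k = xr.apply_ufunc(
        _celsius_to_kelvin, sst, input_core_dims=[[]], vectorize=True, dask='allowed'
    )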
- DIC = ds["dissicos"] - ALK = ds["talkos"] - SALT = ds["sos"] - pCO2 = ds["spco2"] + DIC = ds['dissicos'] + ALK = ds['talkos'] + SALT = ds['sos'] + pCO2 = ds['spco2'] buffer_factor = dict() - buffer_factor["ALK"] = -(ALK ** 2) / ((2 * DIC - ALK) * (ALK - DIC)) - buffer_factor["DIC"] = (3 * ALK * DIC - 2 * DIC ** 2) / ( + buffer_factor['ALK'] = -(ALK ** 2) / ((2 * DIC - ALK) * (ALK - DIC)) + buffer_factor['DIC'] = (3 * ALK * DIC - 2 * DIC ** 2) / ( (2 * DIC - ALK) * (ALK - DIC) ) # Compute sensitivities sensitivity = dict() - sensitivity["tos"] = 0.0423 - sensitivity["sos"] = 1 / SALT - sensitivity["talkos"] = (1 / ALK) * buffer_factor["ALK"] - sensitivity["dissicos"] = (1 / DIC) * buffer_factor["DIC"] + sensitivity['tos'] = 0.0423 + sensitivity['sos'] = 1 / SALT + sensitivity['talkos'] = (1 / ALK) * buffer_factor['ALK'] + sensitivity['dissicos'] = (1 / DIC) * buffer_factor['DIC'] sensitivity = xr.Dataset(sensitivity) * pCO2 return sensitivity @@ -304,9 +304,9 @@ def spco2_decomposition_index( def regression_against_index(ds, index, psig=None): terms = dict() for term in ds.data_vars: - if term != "spco2": - reg = linear_regression(index, ds[term], psig=psig) - terms[term] = reg["slope"] + if term != 'spco2': + reg = linregress(index, ds[term], psig=psig) + terms[term] = reg['slope'] terms = xr.Dataset(terms) return terms @@ -317,16 +317,16 @@ def regression_against_index(ds, index, psig=None): your time series if you are using detrend.""" ) elif detrend: - ds_terms_anomaly = rm_poly(ds_terms, order=order, dim="time") + ds_terms_anomaly = rm_poly(ds_terms, order=order, dim='time') else: - warnings.warn("Your data are not being detrended.") + warnings.warn('Your data are not being detrended.') ds_terms_anomaly = ds_terms - nanmean(ds_terms) if deseasonalize: - clim = ds_terms_anomaly.groupby("time.month").mean("time") - ds_terms_anomaly = ds_terms_anomaly.groupby("time.month") - clim + clim = ds_terms_anomaly.groupby('time.month').mean('time') + ds_terms_anomaly = ds_terms_anomaly.groupby('time.month') - clim else: - warnings.warn("Your data are not being deseasonalized.") + warnings.warn('Your data are not being deseasonalized.') # Apply sliding window to regressions. I.e., compute in N year chunks # then average the resulting dpCO2/dX. @@ -334,8 +334,8 @@ def regression_against_index(ds, index, psig=None): terms = regression_against_index(ds_terms_anomaly, index) terms_in_pCO2_units = terms * nanmean(pco2_sensitivity) else: - years = [y for y in index.groupby("time.year").groups] - y_end = index["time.year"][-1] + years = [y for y in index.groupby('time.year').groups] + y_end = index['time.year'][-1] res = [] for y1 in tqdm(years): y2 = y1 + sliding_window @@ -345,11 +345,11 @@ def regression_against_index(ds, index, psig=None): terms = regression_against_index(ds, ind) sens = pco2_sensitivity.sel(time=slice(str(y1), str(y2))) res.append(terms * nanmean(sens)) - terms_in_pCO2_units = xr.concat(res, dim="time").mean("time") + terms_in_pCO2_units = xr.concat(res, dim='time').mean('time') if plot: terms_in_pCO2_units.to_array().plot( - col="variable", cmap="RdBu_r", robust=True, **plot_kwargs + col='variable', cmap='RdBu_r', robust=True, **plot_kwargs ) else: return terms_in_pCO2_units @@ -376,10 +376,11 @@ def spco2_decomposition(ds_terms, detrend=True, order=1, deseasonalize=False): terms_in_pCO2_units (xr.Dataset): terms of spco2 decomposition References: - * Lovenduski, Nicole S., Nicolas Gruber, Scott C. Doney, and Ivan D. Lima. 
- “Enhanced CO2 Outgassing in the Southern Ocean from a Positive Phase of - the Southern Annular Mode.” Global Biogeochemical Cycles 21, no. 2 - (2007). https://doi.org/10/fpv2wt. + * Lovenduski, Nicole S., Nicolas Gruber, Scott C. Doney, and Ivan D. Lima. + “Enhanced CO2 Outgassing in the Southern Ocean from a Positive Phase of + the Southern Annular Mode.” Global Biogeochemical Cycles 21, no. 2 + (2007). https://doi.org/10/fpv2wt. + """ pco2_sensitivity = spco2_sensitivity(ds_terms) @@ -389,18 +390,18 @@ def spco2_decomposition(ds_terms, detrend=True, order=1, deseasonalize=False): to remove from your time series.""" ) elif detrend: - ds_terms_anomaly = rm_poly(ds_terms, order=order, dim="time") + ds_terms_anomaly = rm_poly(ds_terms, order=order, dim='time') else: - warnings.warn("Your data are not being detrended.") - ds_terms_anomaly = ds_terms - ds_terms.mean("time") + warnings.warn('Your data are not being detrended.') + ds_terms_anomaly = ds_terms - ds_terms.mean('time') if deseasonalize: - clim = ds_terms_anomaly.groupby("time.month").mean("time") - ds_terms_anomaly = ds_terms_anomaly.groupby("time.month") - clim + clim = ds_terms_anomaly.groupby('time.month').mean('time') + ds_terms_anomaly = ds_terms_anomaly.groupby('time.month') - clim else: - warnings.warn("Your data are not being deseasonalized.") + warnings.warn('Your data are not being deseasonalized.') - terms_in_pCO2_units = pco2_sensitivity.mean("time") * ds_terms_anomaly + terms_in_pCO2_units = pco2_sensitivity.mean('time') * ds_terms_anomaly return terms_in_pCO2_units @@ -414,16 +415,17 @@ def calculate_compatible_emissions(global_co2_flux, co2atm_forcing): Returns: xr.object: compatible emissions in PgC/yr. - References: - * Jones, Chris, Eddy Robertson, Vivek Arora, Pierre Friedlingstein, Elena - Shevliakova, Laurent Bopp, Victor Brovkin, et al. “Twenty-First-Century - Compatible CO2 Emissions and Airborne Fraction Simulated by CMIP5 Earth - System Models under Four Representative Concentration Pathways.” - Journal of Climate 26, no. 13 (February 1, 2013): 4398–4413. - https://doi.org/10/f44bbn. + Reference: + * Jones, Chris, Eddy Robertson, Vivek Arora, Pierre Friedlingstein, Elena + Shevliakova, Laurent Bopp, Victor Brovkin, et al. “Twenty-First-Century + Compatible CO2 Emissions and Airborne Fraction Simulated by CMIP5 Earth + System Models under Four Representative Concentration Pathways.” + Journal of Climate 26, no. 13 (February 1, 2013): 4398–4413. + https://doi.org/10/f44bbn. 
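Editorial aside on the constant used just below (not part of the commit): 2.12 is the standard conversion of roughly 2.12 PgC per ppm of atmospheric CO2, so each time step of the expression that follows evaluates approximately::

    compatible_emissions ≈ 2.12 [PgC/ppm] * ΔCO2_atm [ppm] − global_co2_flux [PgC/yr]

with the sign convention of ``global_co2_flux`` left exactly as supplied by the caller.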
+ """ - compatible_emissions = co2atm_forcing.diff("time") * 2.12 - global_co2_flux - compatible_emissions.name = "compatible_emissions" + compatible_emissions = co2atm_forcing.diff('time') * 2.12 - global_co2_flux + compatible_emissions.name = 'compatible_emissions' return compatible_emissions @@ -435,21 +437,21 @@ def get_iam_emissions(): """ ds = [] - member = ["rcp26", "rcp45", "rcp85"] + member = ['rcp26', 'rcp45', 'rcp85'] for r in member: - if r == "rcp26": - r = "rcp3pd" + if r == 'rcp26': + r = 'rcp3pd' r = r.upper() - link = f"http://www.pik-potsdam.de/~mmalte/rcps/data/{r}_EMISSIONS.xls" - e = pd.read_excel(link, sheet_name=f"{r}_EMISSIONS", skiprows=35, header=2) + link = f'http://www.pik-potsdam.de/~mmalte/rcps/data/{r}_EMISSIONS.xls' + e = pd.read_excel(link, sheet_name=f'{r}_EMISSIONS', skiprows=35, header=2) e = e.set_index(e.columns[0]) - e.index.name = "Year" - ds.append(e[["FossilCO2", "OtherCO2"]].to_xarray()) - ds = xr.concat(ds, "member") - ds = ds.sel(Year=slice(1850, 2100)).rename({"Year": "time"}) - ds["member"] = member - ds["IAM_emissions"] = ds["FossilCO2"] + ds["OtherCO2"] - return ds["IAM_emissions"] + e.index.name = 'Year' + ds.append(e[['FossilCO2', 'OtherCO2']].to_xarray()) + ds = xr.concat(ds, 'member') + ds = ds.sel(Year=slice(1850, 2100)).rename({'Year': 'time'}) + ds['member'] = member + ds['IAM_emissions'] = ds['FossilCO2'] + ds['OtherCO2'] + return ds['IAM_emissions'] def plot_compatible_emissions( @@ -484,54 +486,54 @@ def plot_compatible_emissions( fig, ax = plt.subplots(figsize=(10, 6)) # hist alpha = 0.1 - c = "gray" + c = 'gray' compatible_emissions.isel(member=0).sel( time=slice(None, 2005) - ).to_dataframe().unstack()["compatible_emissions"].plot( + ).to_dataframe().unstack()['compatible_emissions'].plot( ax=ax, legend=False, color=c, alpha=alpha ) compatible_emissions.isel(member=0).sel(time=slice(None, 2005)).mean( - "initialization" - ).plot(ax=ax, color="w", lw=3) + 'initialization' + ).plot(ax=ax, color='w', lw=3) compatible_emissions.isel(member=0).sel(time=slice(None, 2005)).mean( - "initialization" + 'initialization' ).plot(ax=ax, color=c, lw=2) # rcps - colors = ["royalblue", "orange", "red"][::-1] + colors = ['royalblue', 'orange', 'red'][::-1] for i, m in enumerate(global_co2_flux.member.values[::-1]): c = colors[i] compatible_emissions.sel(member=m).sel( time=slice(2005, None) - ).to_dataframe().unstack()["compatible_emissions"].plot( + ).to_dataframe().unstack()['compatible_emissions'].plot( ax=ax, legend=False, color=c, alpha=alpha ) compatible_emissions.sel(member=m).sel(time=slice(2005, None)).mean( - "initialization" - ).plot(ax=ax, color="w", lw=3) + 'initialization' + ).plot(ax=ax, color='w', lw=3) compatible_emissions.sel(member=m).sel(time=slice(2005, None)).mean( - "initialization" + 'initialization' ).plot(ax=ax, color=c, lw=2) if iam_emissions is not None: ls = (0, (5, 5)) iam_emissions.isel(member=0).sel(time=slice(None, 2005)).plot( - ax=ax, color="white", lw=3 + ax=ax, color='white', lw=3 ) iam_emissions.isel(member=0).sel(time=slice(None, 2005)).plot( - ax=ax, color="gray", lw=2, ls=ls + ax=ax, color='gray', lw=2, ls=ls ) for i, m in enumerate(global_co2_flux.member.values[::-1]): c = colors[i] iam_emissions.sel(member=m).sel(time=slice(2005, None)).plot( - ax=ax, color="white", lw=3 + ax=ax, color='white', lw=3 ) iam_emissions.sel(member=m).sel(time=slice(2005, None)).plot( ax=ax, color=c, lw=2, ls=ls ) # fig aestetics - ax.axhline(y=0, ls=":", c="gray") - ax.set_ylabel("Compatible emissions [PgC/yr]") - 
ax.set_xlabel("Time [year]") - ax.set_title("Compatible emissions") + ax.axhline(y=0, ls=':', c='gray') + ax.set_ylabel('Compatible emissions [PgC/yr]') + ax.set_xlabel('Time [year]') + ax.set_title('Compatible emissions') return ax diff --git a/esmtools/checks.py b/esmtools/checks.py index c5df743..4d8ee7b 100644 --- a/esmtools/checks.py +++ b/esmtools/checks.py @@ -1,10 +1,20 @@ from functools import wraps +import numpy as np import xarray as xr from .exceptions import DimensionError +def has_missing(data): + """Returns ``True`` if any NaNs in ``data`` and ``False`` otherwise. + + Args: + data (ndarray or xarray object): Array to check for missing data. + """ + return np.isnan(data).any() + + # https://stackoverflow.com/questions/10610824/ # python-shortcut-for-writing-decorators-which-accept-arguments def dec_args_kwargs(wrapper): @@ -16,6 +26,11 @@ def dec_args_kwargs(wrapper): def has_dims(xobj, dims, kind): """ Checks that at the minimum, the object has provided dimensions. + + Args: + xobj (xarray object): Dataset or DataArray to check dimensions on. + dims (list or str): Dimensions being checked. + kind (str): String to precede "object" in the error message. """ if isinstance(dims, str): dims = [dims] @@ -55,6 +70,7 @@ def wrapper(*args, **kwargs): raise IOError( f"""The input data is not an xarray DataArray or Dataset. + Your input was of type: {typecheck}""" ) except IndexError: diff --git a/esmtools/composite.py b/esmtools/composite.py index 63c3421..dca2f25 100644 --- a/esmtools/composite.py +++ b/esmtools/composite.py @@ -45,7 +45,8 @@ def composite_analysis( References: * Motivated from Ryan Abernathy's notebook here: - https://rabernat.github.io/research_computing/xarray.html + https://rabernat.github.io/research_computing/xarray.html + """ def compute_ttest_for_composite(composite, index, psig): diff --git a/esmtools/constants.py b/esmtools/constants.py index 8b6aea7..a315e1e 100644 --- a/esmtools/constants.py +++ b/esmtools/constants.py @@ -10,3 +10,31 @@ "fdr_tsbh", "fdr_tsbky", ] + +# Number of days per month for each `cftime` calendar. +# Taken from xarray example: +# http://xarray.pydata.org/en/stable/examples/monthly-means.html +DAYS_PER_MONTH = { + "noleap": [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31], + "365_day": [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31], + "standard": [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31], + "gregorian": [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31], + "proleptic_gregorian": [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31], + "all_leap": [0, 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31], + "366_day": [0, 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31], + "360_day": [0, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30], + "julian": [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31], +} +# Not as simple as summing since there's nuance here in which years get a quarter year. 
+DAYS_PER_YEAR = { + "noleap": 365, + "365_day": 365.25, + "standard": 365.25, + "gregorian": 365.25, + "proleptic_gregorian": 365.25, + "all_leap": 366, + "366_day": 366, + "360_day": 360, + "julian": 365.25, +} +CALENDARS = [k for k in DAYS_PER_MONTH] diff --git a/esmtools/physics.py b/esmtools/physics.py index 1c120a5..9d58a93 100644 --- a/esmtools/physics.py +++ b/esmtools/physics.py @@ -10,8 +10,7 @@ def stress_to_speed(x, y): - """ - This converts ocean wind stress to wind speed at 10m over the ocean so + """This converts ocean wind stress to wind speed at 10m over the ocean so that one can use the native ocean grid rather than trying to interpolate between ocean and atmosphere grids. diff --git a/esmtools/stats.py b/esmtools/stats.py index 65a6ab4..a40bff8 100644 --- a/esmtools/stats.py +++ b/esmtools/stats.py @@ -1,16 +1,16 @@ +import warnings + import climpred.stats as st import numpy as np import numpy.polynomial.polynomial as poly +import scipy import xarray as xr -from scipy.stats import linregress as lreg -from .checks import has_dims, is_xarray -from .utils import get_dims +from .checks import has_dims, has_missing, is_xarray +from .timeutils import TimeUtilAccessor +from .utils import get_dims, match_nans -# ------------------ -# GENERAL STATISTICS -# ------------------ @is_xarray(0) def standardize(ds, dim="time"): """Standardize Dataset/DataArray @@ -41,17 +41,12 @@ def nanmean(ds, dim="time"): return ds -# -------------------------- -# AREA-WEIGHTING DEFINITIONS -# -------------------------- @is_xarray(0) def cos_weight(da, lat_coord="lat", lon_coord="lon", one_dimensional=True): """ Area-weights data on a regular (e.g. 360x180) grid that does not come with cell areas. Uses cosine-weighting. - NOTE: Currently explicitly writing `xr` as a prefix for xarray-specific - definitions. Since `esmtools` is supposed to be a wrapper for xarray, - this might be altered in the future. + Parameters ---------- da : DataArray with longitude and latitude @@ -61,9 +56,11 @@ def cos_weight(da, lat_coord="lat", lon_coord="lon", one_dimensional=True): Name of longitude coordinate one_dimensional : bool (optional) If true, assumes that lat and lon are 1D (i.e. not a meshgrid) + Returns ------- aw_da : Area-weighted DataArray + Examples -------- import esmtools as et @@ -135,317 +132,404 @@ def area_weight(da, area_coord="area"): return aw_da -# ----------- -# TIME SERIES -# ----------- -@is_xarray(0) -def smooth_series(da, dim, length, center=True): - """ - Returns a smoothed version of the input timeseries. - NOTE: Currently explicitly writing `xr` as a prefix for xarray-specific - definitions. Since `esmtools` is supposed to be a wrapper for xarray, - this might be altered in the future. - Parameters - ---------- - da : xarray DataArray - dim : str - dimension to smooth over (e.g. 'time') - length : int - number of steps to smooth over for the given dim - center : boolean (default to True) - whether to center the smoothing filter or start from the beginning - Returns - ------- - smoothed : smoothed DataArray object +def _check_y_not_independent_variable(y, dim): + """Checks that `y` is not the independent variable in statistics functions. + + Args: + y (xr.DataArray or xr.Dataset): Dependent variable from statistics functions. + dim (str): Dimension statistical function is being applied over. + + Raises: + ValueError: If `y` is a DataArray and equal to `dim`. This infers that something + like a time axis is being placed in the dependent variable. 
+ """ - return da.rolling({dim: length}, center=center).mean() + if isinstance(y, xr.DataArray) and (y.name == dim): + raise ValueError( + f"Dependent variable y should not be the same as the dim {dim} being " + "applied over. Change your y variable to x." + ) -@is_xarray(0) -def fit_poly(ds, order, dim="time"): - """Returns the fitted polynomial line of order N +def _convert_time_and_return_slope_factor(x, dim): + """Converts `x` to numeric time (if datetime) and returns slope factor. + + The numeric time accounts for differences in length of months, leap years, etc. + when fitting a regression and also ensures that the numpy functions don't break + with datetimes. + + Args: + x (xr.DataArray or xr.Dataset): Independent variable from statistical functions. + dim (str): Dimension statistical function is being applied over. + + Returns: + x (xr.DataArray or xr.Dataset): If `x` is a time axis, converts to numeric + time. Otherwise, return the original `x`. + slope_factor (float): Factor to multiply slope by if returning regression + results. This accounts for the fact that datetimes are converted to + "days since 1990-01-01" numeric time and thus the answer comes out + in the original units per day (e.g., degC/day). This auto-converts to + the original temporal frequency (e.g., degC/year) if the calendar + can be inferred. + """ + slope_factor = 1.0 + if isinstance(x, xr.DataArray): + if x.timeutils.is_temporal: + slope_factor = x.timeutils.slope_factor + x = x.timeutils.return_numeric_time() + return x, slope_factor - .. note:: - This automatically interpolates across NaNs to make the fit. + +def _handle_nans(x, y, nan_policy): + """Modifies `x` and `y` based on `nan_policy`. Args: - ds (xarray object): Time series to fit polynomial to. - order (int): Order of polynomial fit. - dim (optional str): Dimension over which to fit hte polynomial. + x, y (xr.DataArray or ndarrays): Two time series to which statistical function + is being applied. + nan_policy (str): One of ['none', 'propagate', 'raise', 'omit', 'drop']. If + 'none' or 'propagate', return unmodified so the nans can be propagated + through in the functions. If 'raise', raises a warning if there are any + nans in `x` or `y`. If 'omit' or 'drop', removes values that contain + a nan in either `x` or `y` and returns resulting `x` and `y`. Returns: - xarray object with polynomial fit. + x, y (xr.DataArray or ndarrays): Modified `x` and `y` datasets. - References: - This is a modification of @ahuang11's script `rm_poly` in `climpred`. + Raises: + ValueError: If `nan_policy` is 'raise' and there are nans in either `x` or `y`; + if `nan_policy` is not one of ['none', 'propagate', 'raise', 'omit', + 'drop']; or if `x` or `y` are larger than 1-dimensional. """ - has_dims(ds, dim, "dataset") + # Only support 1D, since we are doing `~np.isnan()` indexing for 'omit'/'drop'. + if (x.ndim > 1) or (y.ndim > 1): + raise ValueError( + f"x and y must be 1-dimensional. Got {x.ndim} for x and {y.ndim} for y." 
+ ) - # handle both datasets and dataarray - if isinstance(ds, xr.Dataset): - da = ds.to_array() - return_ds = True - else: - da = ds.copy() - return_ds = False - - da_dims_orig = list(da.dims) # orig -> original - if len(da_dims_orig) > 1: - # want independent axis to be the leading dimension - da_dims_swap = da_dims_orig.copy() # copy to prevent contamination - - # https://stackoverflow.com/questions/1014523/ - # simple-syntax-for-bringing-a-list-element-to-the-front-in-python - da_dims_swap.insert(0, da_dims_swap.pop(da_dims_swap.index(dim))) - da = da.transpose(*da_dims_swap) - - # hide other dims into a single dim - da = da.stack({"other_dims": da_dims_swap[1:]}) - dims_swapped = True + if nan_policy in ["none", "propagate"]: + return x, y + elif nan_policy == "raise": + if has_missing(x) or has_missing(y): + raise ValueError( + "Input data contains NaNs. Consider changing `nan_policy` to 'none' " + "or 'omit'. Or get rid of those NaNs somehow." + ) + else: + return x, y + elif nan_policy in ["omit", "drop"]: + if has_missing(x) or has_missing(y): + x_mod, y_mod = match_nans(x, y) + # The above function pairwise-matches nans. Now we remove them so that we + # can compute the statistic without the nans. + x_mod = x_mod[np.isfinite(x_mod)] + y_mod = y_mod[np.isfinite(y_mod)] + return x_mod, y_mod + else: + return x, y else: - dims_swapped = False - - # NaNs will make the polyfit fail--interpolate any NaNs in - # the provided dim to prevent poor fit, while other dims' NaNs - # will be filled with 0s; however, all NaNs will be replaced - # in the final output - nan_locs = np.isnan(da.values) - - # any(nan_locs.sum(axis=0)) fails if not 2D - if nan_locs.ndim == 1: - nan_locs = nan_locs.reshape(len(nan_locs), 1) - - # check if there's any NaNs in the provided dim because - # interpolate_na is computationally expensive to run regardless of NaNs - if any(nan_locs.sum(axis=0)) > 0: - # Could do a check to see if there's any NaNs that aren't bookended. - # [0, np.nan, 2], can interpolate. - da = da.interpolate_na(dim) - if any(nan_locs[0, :]): - # [np.nan, 1, 2], no first value to interpolate from; back fill - da = da.bfill(dim) - if any(nan_locs[-1, :]): - # [0, 1, np.nan], no last value to interpolate from; forward fill - da = da.ffill(dim) - - # this handles the other axes; doesn't matter since it won't affect the fit - da = da.fillna(0) - - # the actual operation of detrending - y = da.values - x = np.arange(0, len(y), 1) - coefs = poly.polyfit(x, y, order) - fit = poly.polyval(x, coefs) - fit = fit.transpose() - fit = xr.DataArray(fit, dims=da.dims, coords=da.coords) - - if dims_swapped: - # revert the other dimensions to its original form and ordering - fit = fit.unstack("other_dims").transpose(*da_dims_orig) - - if return_ds: - # revert back into a dataset - return xr.merge( - fit.sel(variable=var).rename(var).drop("variable") - for var in fit["variable"].values + raise ValueError( + f"{nan_policy} not one of ['none', 'propagate', 'raise', 'omit', 'drop']" ) + + +def _polyfit(x, y, order, nan_policy): + """Helper function for performing ``np.poly.polyfit`` which is used in both + ``polyfit`` and ``rm_poly``. + + Args: + x, y (ndarrays): Independent and dependent variables used in the polynomial + fit. + order (int): Order of polynomial fit to perform. + nan_policy (str, optional): Policy to use when handling nans. Defaults to + "none". + + * 'none', 'propagate': If a NaN exists anywhere on the given dimension, + return nans for that whole dimension. 
+ * 'raise': If a NaN exists at all in the datasets, raise an error. + * 'omit', 'drop': If a NaN exists in `x` or `y`, drop that index and + compute the slope without it. + + Returns: + fit (ndarray, xarray object): If ``nan_policy`` is 'none' or 'propagate' and + a nan exists in the time series, returns all nans. Otherwise, returns the + polynomial fit. + """ + x_mod, y_mod = _handle_nans(x, y, nan_policy) + # This catches cases where a given grid cell is full of nans, like in land masking. + if (nan_policy in ["omit", "drop"]) and (x_mod.size == 0): + return np.full(len(x), np.nan) + # This catches cases where there is missing values in the independent axis, which + # breaks polyfit. + elif (nan_policy in ["none", "propagate"]) and (has_missing(x_mod)): + return np.full(len(x), np.nan) else: - return fit + # fit to data without nans, return applied to original independent axis. + coefs = poly.polyfit(x_mod, y_mod, order) + return poly.polyval(x, coefs) -@is_xarray(0) -def linear_regression(da, dim="time", interpolate_na=False, compact=True, psig=None): +def _warn_if_not_converted_to_original_time_units(x): + """Administers warning if the independent variable is in datetimes and the + calendar frequency could not be inferred. + + Args: + x (xr.DataArray or xr.Dataset): Independent variable for statistical functions. """ - Computes the least-squares linear regression of an xr.DataArray x against - another xr.DataArray y. + if isinstance(x, xr.DataArray): + if x.timeutils.is_temporal: + if x.timeutils.freq is None: + warnings.warn( + "Datetime frequency not detected. Slope and std. errors will be " + "in original units per day (e.g., degC per day). Multiply by " + "e.g., 365.25 to convert to original units per year." + ) - Note: This only returns the statistics from the LSR (slope, intercept, rvalue, - pvalue, stderr). Use `fit_poly` to get the fitted line returned. - Parameters - ---------- - da : xarray DataArray - dim : str (default to 'time') - dimension over which to compute the linear regression. - interpolate_na : bool (default to False) - If True, linearly interpolate NaNs (and backfill/forwardfill). Note that if - this is False, the linear regression will return NaN if there are *any* - NaNs in the time series. - compact : boolean (default to True) - If true, return all results of linregress as a single dataset. - If false, return results as five separate DataArrays. - psig : double (default to None) - Alpha level for correlation significance. If not None, NaNs out any grid cells - that are larger than psig. +@is_xarray([0, 1]) +def linear_slope(x, y, dim="time", nan_policy="none"): + """Returns the linear slope with y regressed onto x. - Returns - ------- - ds : xarray Dataset - Dataset containing slope, intercept, rvalue, pvalue, stderr from - the linear regression. Excludes the dimension the regression was - computed over. If compact is False, these five parameters are - returned separately. + .. note:: + + This function will try to infer the time freqency of sampling if ``x`` is in + datetime units. The final slope will be returned in the original units per + that frequency (e.g. SST per year). If the frequency cannot be inferred + (e.g. because the sampling is irregular), it will return in the original + units per day (e.g. SST per day). + + Args: + x (xarray object): Independent variable (predictor) for linear regression. + y (xarray object): Dependent variable (predictand) for linear regression. + dim (str, optional): Dimension to apply linear regression over. 
+ Defaults to "time". + nan_policy (str, optional): Policy to use when handling nans. Defaults to + "none". + + * 'none', 'propagate': If a NaN exists anywhere on the given dimension, + return nans for that whole dimension. + * 'raise': If a NaN exists at all in the datasets, raise an error. + * 'omit', 'drop': If a NaN exists in `x` or `y`, drop that index and + compute the slope without it. + + Returns: + xarray object: Slopes computed through a least-squares linear regression. """ - # Check if dataset. - if isinstance(da, xr.Dataset): - raise NotImplementedError( - "Datasets are not yet supported for this function. " - + "Please retry with a DataArray." - ) + has_dims(x, dim, "predictor (x)") + has_dims(y, dim, "predictand (y)") + _check_y_not_independent_variable(y, dim) + x, slope_factor = _convert_time_and_return_slope_factor(x, dim) + + def _linear_slope(x, y, nan_policy): + x, y = _handle_nans(x, y, nan_policy) + # This catches cases where a given grid cell is full of nans, like in + # land masking. + if (nan_policy in ["omit", "drop"]) and (x.size == 0): + return np.asarray([np.nan]) + # This catches cases where there is missing values in the independent axis, + # which breaks polyfit. + elif (nan_policy in ["none", "propagate"]) and (has_missing(x)): + return np.asarray([np.nan]) + else: + return np.polyfit(x, y, 1)[0] + + slopes = xr.apply_ufunc( + _linear_slope, + x, + y, + nan_policy, + vectorize=True, + dask="parallelized", + input_core_dims=[[dim], [dim], []], + output_dtypes=["float64"], + ) + _warn_if_not_converted_to_original_time_units(x) + return slopes * slope_factor - x = da[dim] - # Linear regression doesn't like to work with actual datetime. So temporarily - # convert to integers. - # `np.object` covers cftime. - if np.issubdtype(x, np.datetime64) or isinstance(x, np.object): - x = np.arange(len(x)) - - if interpolate_na: - # borrowed from @ahuang11's implementation in `climpred` - da_dims_orig = list(da.dims) # orig -> original - if len(da_dims_orig) > 1: - # want independent axis to be the leading dimension - da_dims_swap = da_dims_orig.copy() # copy to prevent contamination - - # https://stackoverflow.com/questions/1014523/ - # simple-syntax-for-bringing-a-list-element-to-the-front-in-python - da_dims_swap.insert(0, da_dims_swap.pop(da_dims_swap.index(dim))) - da = da.transpose(*da_dims_swap) - - # hide other dims into a single dim - da = da.stack({"other_dims": da_dims_swap[1:]}) - dims_swapped = True + +@is_xarray([0, 1]) +def linregress(x, y, dim="time", nan_policy="none"): + """Vectorized applciation of ``scipy.stats.linregress``. + + .. note:: + + This function will try to infer the time freqency of sampling if ``x`` is in + datetime units. The final slope and standard error will be returned in the + original units per that frequency (e.g. SST per year). If the frequency + cannot be inferred (e.g. because the sampling is irregular), it will return in + the original units per day (e.g. SST per day). + + Args: + x (xarray object): Independent variable (predictor) for linear regression. + y (xarray object): Dependent variable (predictand) for linear regression. + dim (str, optional): Dimension to apply linear regression over. + Defaults to "time". + nan_policy (str, optional): Policy to use when handling nans. Defaults to + "none". + + * 'none', 'propagate': If a NaN exists anywhere on the given dimension, + return nans for that whole dimension. + * 'raise': If a NaN exists at all in the datasets, raise an error. 
+ * 'omit', 'drop': If a NaN exists in `x` or `y`, drop that index and + compute the slope without it. + + Returns: + xarray object: Slope, intercept, correlation, p value, and standard error for + the linear regression. These 5 parameters are added as a new dimension + "parameter". + + """ + has_dims(x, dim, "predictor (x)") + has_dims(y, dim, "predictand (y)") + _check_y_not_independent_variable(y, dim) + x, slope_factor = _convert_time_and_return_slope_factor(x, dim) + + def _linregress(x, y, slope_factor, nan_policy): + x, y = _handle_nans(x, y, nan_policy) + # This catches cases where a given grid cell is full of nans, like in + # land masking. + if (nan_policy in ["omit", "drop"]) and (x.size == 0): + return np.full(5, np.nan) else: - dims_swapped = False - - # This is borrowed from @ahuang11's implementation in `climpred` to handle - # NaNs in the time series. - nan_locs = np.isnan(da.values) - - # any(nan_locs.sum(axis=0)) fails if not 2D - if nan_locs.ndim == 1: - nan_locs = nan_locs.reshape(len(nan_locs), 1) - - # check if there's any NaNs in the provided dim because - # interpolate_na is computationally expensive to run regardless of NaNs - # if nan_locs.sum(dim=dim).any(): - if any(nan_locs.sum(axis=0)) > 0: - da = da.interpolate_na(dim) - if any(nan_locs[0, :]): - # [np.nan, 1, 2], no first value to interpolate from; back fill - da = da.bfill(dim) - if any(nan_locs[-1, :]): - # [0, 1, np.nan], no last value to interpolate from; forward fill - da = da.ffill(dim) - - # this handles the other axes; doesn't matter since it won't affect the fit - da = da.fillna(0) - if dims_swapped: - da = da.unstack("other_dims").transpose(*da_dims_orig) + m, b, r, p, e = scipy.stats.linregress(x, y) + # Multiply slope and standard error by factor. If time indices were + # converted to numeric units, this gets them back to the original units. + m *= slope_factor + e *= slope_factor + return np.array([m, b, r, p, e]) results = xr.apply_ufunc( - lreg, + _linregress, x, - da, - input_core_dims=[[dim], [dim]], - output_core_dims=[[], [], [], [], []], + y, + slope_factor, + nan_policy, vectorize=True, - dask="allowed", + dask="parallelized", + input_core_dims=[[dim], [dim], [], []], + output_core_dims=[["parameter"]], + output_dtypes=["float64"], + output_sizes={"parameter": 5}, ) - - # Force into a cleaner dataset. The above function returns a dataset - # with no clear labeling. - ds = xr.Dataset() - labels = ["slope", "intercept", "rvalue", "pvalue", "stderr"] - for i, l in enumerate(labels): - results[i].name = l - ds = xr.merge([ds, results[i]]) - if psig is not None: - ds = ds.where(ds["pvalue"] < psig) - if compact: - return ds - else: - return ds["slope"], ds["intercept"], ds["rvalue"], ds["pvalue"], ds["stderr"] + results = results.assign_coords( + parameter=["slope", "intercept", "rvalue", "pvalue", "stderr"] + ) + _warn_if_not_converted_to_original_time_units(x) + return results @is_xarray(0) -def corr(x, y, dim="time", lead=0, return_p=False): - """ - Computes the Pearson product-momment coefficient of linear correlation. +def polyfit(x, y, order, dim="time", nan_policy="none"): + """Returns the fitted polynomial line of ``y`` regressed onto ``x``. - This version calculates the effective degrees of freedom, accounting - for autocorrelation within each time series that could fluff the - significance of the correlation. + .. 
note:: - Parameters - ---------- - x, y : xarray DataArray - time series being correlated (can be multi-dimensional) - dim : str (default 'time') - Correlation dimension - lead : int (default 0) - If lead > 0, x leads y by that many time steps. - If lead < 0, x lags y by that many time steps. - return_p : boolean (default False) - If true, return both r and p + This will be released as a standard ``xarray`` func in 0.15.2. - Returns - ------- - r : correlation coefficient - p : p-value accounting for autocorrelation (if return_p True) + Args: + x, y (xr.DataArray or xr.Dataset): Independent and dependent variables used in + the polynomial fit. + order (int): Order of polynomial fit to perform. + dim (str, optional): Dimension to apply polynomial fit over. + Defaults to "time". + nan_policy (str, optional): Policy to use when handling nans. Defaults to + "none". + + * 'none', 'propagate': If a NaN exists anywhere on the given dimension, + return nans for that whole dimension. + * 'raise': If a NaN exists at all in the datasets, raise an error. + * 'omit', 'drop': If a NaN exists in `x` or `y`, drop that index and + compute the slope without it. - References (for dealing with autocorrelation): - ---------- - 1. Wilks, Daniel S. Statistical methods in the atmospheric sciences. - Vol. 100. Academic press, 2011. - 2. Lovenduski, Nicole S., and Nicolas Gruber. "Impact of the Southern - Annular Mode on Southern Ocean circulation and biology." Geophysical - Research Letters 32.11 (2005). - 3. Brady, R. X., Lovenduski, N. S., Alexander, M. A., Jacox, M., and - Gruber, N.: On the role of climate modes in modulating the air-sea CO2 - fluxes in Eastern Boundary Upwelling Systems, Biogeosciences Discuss., - https://doi.org/10.5194/bg-2018-415, in review, 2018. + Returns: + xarray object: The polynomial fit for ``y`` regressed onto ``x``. Has the same + dimensions as ``y``. """ - # Broadcasts a time series to the same coordinates/size as the grid. If they - # are both grids, this function does nothing and isn't expensive. - x, y = xr.broadcast(x, y) + has_dims(x, dim, "predictor (x)") + has_dims(y, dim, "predictand (y)") + _check_y_not_independent_variable(y, dim) + x, _ = _convert_time_and_return_slope_factor(x, dim) - # Negative lead should have y lead x. - if lead < 0: - lead = np.abs(lead) - return st.corr(y, x, dim=dim, lag=lead, return_p=return_p) - else: - return st.corr(x, y, dim=dim, lag=lead, return_p=return_p) + return xr.apply_ufunc( + _polyfit, + x, + y, + order, + nan_policy, + vectorize=True, + dask="parallelized", + input_core_dims=[[dim], [dim], [], []], + output_core_dims=[[dim]], + output_dtypes=["float"], + ) @is_xarray(0) -def rm_poly(da, order, dim="time"): - """ - Returns xarray object with nth-order fit removed from every time series. - Input - ----- - da : xarray DataArray - Single time series or many gridded time series of object to be - detrended - order : int - Order of polynomial fit to be removed. If 1, this is functionally - the same as calling `rm_trend` - dim : str (default 'time') - Dimension over which to remove the polynomial fit. - Returns - ------- - detrended_ts : xarray DataArray - DataArray with detrended time series. +def rm_poly(x, y, order, dim="time", nan_policy="none"): + """Removes a polynomial fit from ``y`` regressed onto ``x``. + + Args: + x, y (xr.DataArray or xr.Dataset): Independent and dependent variables used in + the polynomial fit. + order (int): Order of polynomial fit to perform. + dim (str, optional): Dimension to apply polynomial fit over. 
+ Defaults to "time". + nan_policy (str, optional): Policy to use when handling nans. Defaults to + "none". + + * 'none', 'propagate': If a NaN exists anywhere on the given dimension, + return nans for that whole dimension. + * 'raise': If a NaN exists at all in the datasets, raise an error. + * 'omit', 'drop': If a NaN exists in `x` or `y`, drop that index and + compute the slope without it. + + Returns: + xarray object: ``y`` with polynomial fit of order ``order`` removed. """ - return st.rm_poly(da, order, dim=dim) + has_dims(x, dim, "predictor (x)") + has_dims(y, dim, "predictand (y)") + _check_y_not_independent_variable(y, dim) + x, _ = _convert_time_and_return_slope_factor(x, dim) + + def _rm_poly(x, y, order, nan_policy): + fit = _polyfit(x, y, order, nan_policy) + return y - fit + + return xr.apply_ufunc( + _rm_poly, + x, + y, + order, + nan_policy, + vectorize=True, + dask="parallelized", + input_core_dims=[[dim], [dim], [], []], + output_core_dims=[[dim]], + output_dtypes=["float64"], + ) @is_xarray(0) -def rm_trend(da, dim="time"): - """ - Calls rm_poly with an order 1 argument. +def rm_trend(x, y, dim="time", nan_policy="none"): + """Removes a linear fit from ``y`` regressed onto ``x``. + + Args: + x, y (xr.DataArray or xr.Dataset): Independent and dependent variables used in + the linear fit. + dim (str, optional): Dimension to apply linear fit over. + Defaults to "time". + nan_policy (str, optional): Policy to use when handling nans. Defaults to + "none". + + * 'none', 'propagate': If a NaN exists anywhere on the given dimension, + return nans for that whole dimension. + * 'raise': If a NaN exists at all in the datasets, raise an error. + * 'omit', 'drop': If a NaN exists in `x` or `y`, drop that index and + compute the slope without it. + + Returns: + xarray object: ``y`` with linear fit removed. """ - return st.rm_trend(da, dim=dim) + return rm_poly(x, y, 1, dim=dim, nan_policy=nan_policy) @is_xarray(0) @@ -506,3 +590,53 @@ def ACF(ds, dim="time", nlags=None): acf.append(res) acf = xr.concat(acf, dim=dim) return acf + + +@is_xarray(0) +def corr(x, y, dim="time", lead=0, return_p=False): + """ + Computes the Pearson product-momment coefficient of linear correlation. + + This version calculates the effective degrees of freedom, accounting + for autocorrelation within each time series that could fluff the + significance of the correlation. + + Parameters + ---------- + x, y : xarray DataArray + time series being correlated (can be multi-dimensional) + dim : str (default 'time') + Correlation dimension + lead : int (default 0) + If lead > 0, x leads y by that many time steps. + If lead < 0, x lags y by that many time steps. + return_p : boolean (default False) + If true, return both r and p + + Returns + ------- + r : correlation coefficient + p : p-value accounting for autocorrelation (if return_p True) + + References (for dealing with autocorrelation): + ---------- + 1. Wilks, Daniel S. Statistical methods in the atmospheric sciences. + Vol. 100. Academic press, 2011. + 2. Lovenduski, Nicole S., and Nicolas Gruber. "Impact of the Southern + Annular Mode on Southern Ocean circulation and biology." Geophysical + Research Letters 32.11 (2005). + 3. Brady, R. X., Lovenduski, N. S., Alexander, M. A., Jacox, M., and + Gruber, N.: On the role of climate modes in modulating the air-sea CO2 + fluxes in Eastern Boundary Upwelling Systems, Biogeosciences Discuss., + https://doi.org/10.5194/bg-2018-415, in review, 2018. 
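As an editorial illustration of how the rewritten ``stats`` functions above are meant to be called (a sketch only, not part of the commit; shapes and values are arbitrary)::

    import numpy as np
    import xarray as xr
    import esmtools.stats as stats

    x = xr.DataArray(np.arange(120.0), dims='time')
    y = xr.DataArray(np.random.rand(120, 3, 3), dims=['time', 'lat', 'lon'])

    fit = stats.linregress(x, y, dim='time')        # adds a "parameter" dimension
    slope = fit.sel(parameter='slope')
    pval = fit.sel(parameter='pvalue')

    detrended = stats.rm_poly(x, y, 2, dim='time')  # remove a quadratic fit
    residual = stats.rm_trend(x, y, dim='time')     # same as rm_poly with order=1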
+ """ + # Broadcasts a time series to the same coordinates/size as the grid. If they + # are both grids, this function does nothing and isn't expensive. + x, y = xr.broadcast(x, y) + + # Negative lead should have y lead x. + if lead < 0: + lead = np.abs(lead) + return st.corr(y, x, dim=dim, lag=lead, return_p=return_p) + else: + return st.corr(x, y, dim=dim, lag=lead, return_p=return_p) diff --git a/esmtools/temporal.py b/esmtools/temporal.py index 4f5a79e..37d1b38 100644 --- a/esmtools/temporal.py +++ b/esmtools/temporal.py @@ -3,87 +3,11 @@ import numpy as np import xarray as xr -# This supports all calendars used in netCDF. -dpm = { - "noleap": [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31], - "365_day": [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31], - "standard": [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31], - "gregorian": [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31], - "proleptic_gregorian": [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31], - "all_leap": [0, 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31], - "366_day": [0, 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31], - "360_day": [0, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30], - "julian": [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31], -} -# Converts from `cftime` class name to netCDF convention for calendar -cftime_to_netcdf = { - "DatetimeJulian": "julian", - "DatetimeProlepticGregorian": "proleptic_gregorian", - "DatetimeNoLeap": "noleap", - "DatetimeAllLeap": "all_leap", - "DatetimeGregorian": "gregorian", -} -CALENDARS = [k for k in dpm] -resolutions = {"annual": "time.year"} -RESOLUTIONS = [k for k in resolutions] - - -def _leap_year(year, calendar="standard"): - """Determine if year is a leap year""" - leap = False - if (calendar in ["standard", "gregorian", "proleptic_gregorian", "julian"]) and ( - year % 4 == 0 - ): - leap = True - if ( - (calendar == "proleptic_gregorian") - and (year % 100 == 0) - and (year % 400 != 0) - ): - leap = False - elif ( - (calendar in ["standard", "gregorian"]) - and (year % 100 == 0) - and (year % 400 != 0) - and (year < 1583) - ): - leap = False - return leap - - -def _get_dpm(time, calendar="standard"): - """ - return a array of days per month corresponding to the months provided in `months` - """ - month_length = np.zeros(len(time), dtype=np.int) - - cal_days = dpm[calendar] - - for i, (month, year) in enumerate(zip(time.month, time.year)): - month_length[i] = cal_days[month] - if _leap_year(year, calendar=calendar): - month_length[i] += 1 - return month_length - +from .constants import CALENDARS +from .timeutils import get_calendar, get_days_per_month -def _retrieve_calendar(ds, dim="time"): - """Attempt to pull calendar type automatically using cftime objects. - - .. note:: - This relies upon ``xarray``'s automatic conversion of time to ``cftime``. - - Args: - ds (xarray object): Dataset being resampled. - dim (optional str): Time dimension. - """ - example_time = ds[dim].values[0] - # Type of variable being used for time. - var_type = type(example_time).__name__ - if var_type in cftime_to_netcdf: - # If this is a `cftime` object, infer what type of calendar it is. 
- return cftime_to_netcdf[var_type] - else: - raise ValueError(f"Please submit a calendar from {CALENDARS}") +GROUPBY_TIMES = {"annual": "time.year"} +TIME_RESOLUTIONS = [k for k in GROUPBY_TIMES] def _weighted_resample(ds, calendar=None, dim="time", resample_resolution=None): @@ -101,18 +25,18 @@ def _weighted_resample(ds, calendar=None, dim="time", resample_resolution=None): resolution with weighting. """ if (calendar is None) or (calendar not in CALENDARS): - calendar = _retrieve_calendar(ds, dim=dim) + calendar = get_calendar(ds[dim]) - if resample_resolution not in RESOLUTIONS: - raise ValueError(f"Please submit a temporal resolution from {RESOLUTIONS}") + if resample_resolution not in TIME_RESOLUTIONS: + raise ValueError(f"Please submit a temporal resolution from {TIME_RESOLUTIONS}") time_length = xr.DataArray( - _get_dpm(ds.time.to_index(), calendar=calendar), + get_days_per_month(ds.time.to_index(), calendar=calendar), coords=[ds.time], name="time_length", ) - time_res = resolutions[resample_resolution] + time_res = GROUPBY_TIMES[resample_resolution] # Get weights of each time unit (e.g., daily, monthly) weights = time_length.groupby(time_res) / time_length.groupby(time_res).sum() diff --git a/esmtools/tests/conftest.py b/esmtools/tests/conftest.py new file mode 100644 index 0000000..08d23cb --- /dev/null +++ b/esmtools/tests/conftest.py @@ -0,0 +1,164 @@ +import numpy as np +import pytest +import xarray as xr + + +@pytest.fixture() +def gridded_da_float(): + """Mock data of gridded time series in float time.""" + # Wrapper so fixture can be called multiple times. + # https://alysivji.github.io/pytest-fixures-with-function-arguments.html + def _gen_data(): + data = np.random.rand(60, 3, 3) + da = xr.DataArray(data, dims=["time", "lat", "lon"]) + # Annual resolution time axis for 60 years. + da["time"] = np.arange(1900, 1960) + return da + + return _gen_data + + +@pytest.fixture() +def gridded_da_landmask(gridded_da_float): + """Mock data for gridded time series in float time with land mask (nans over + all time in some grid cells).""" + data = gridded_da_float() + # Mask arbitrary chunk through all time to simulate land mask. + data = data.where((data.lat < 2) & (data.lon > 0)) + return data + + +@pytest.fixture() +def gridded_da_missing_data(gridded_da_float): + """Mock data for gridded time series in float time with missing data (nans over + some time points in different grid cells).""" + data = gridded_da_float() + # Add nan to arbitrary time steps. + data[5, 0, 0] = np.nan + data[3, 1, 0] = np.nan + return data + + +@pytest.fixture() +def gridded_da_datetime(): + """Mock data of gridded time series in numpy datetime.""" + # Wrapper so fixture can be called multiple times. + # https://alysivji.github.io/pytest-fixures-with-function-arguments.html + def _gen_data(): + data = np.random.rand(60, 3, 3) + da = xr.DataArray(data, dims=["time", "lat", "lon"]) + # Monthly resolution time axis for 5 years. + da["time"] = np.arange("1990-01", "1995-01", dtype="datetime64[M]") + return da + + return _gen_data + + +@pytest.fixture() +def gridded_da_cftime(): + """Mock data of gridded time series in cftime.""" + # Wrapper so fixture can be called multiple times. + # https://alysivji.github.io/pytest-fixures-with-function-arguments.html + def _gen_data(): + data = np.random.rand(60, 3, 3) + da = xr.DataArray(data, dims=["time", "lat", "lon"]) + # Monthly resolution time axis for 5 years. 
+        da["time"] = xr.cftime_range(start="1990-01", freq="M", periods=da.time.size)
+        return da
+
+    return _gen_data
+
+
+@pytest.fixture()
+def gridded_ds_float(gridded_da_float):
+    """Mock data of a gridded time series as an xarray Dataset with float time."""
+    data = xr.Dataset()
+    data["foo"] = gridded_da_float()
+    data["bar"] = gridded_da_float()
+    return data
+
+
+@pytest.fixture()
+def gridded_ds_datetime(gridded_da_datetime):
+    """Mock data of a gridded time series as an xarray Dataset with numpy datetime."""
+    data = xr.Dataset()
+    data["foo"] = gridded_da_datetime()
+    data["bar"] = gridded_da_datetime()
+    return data
+
+
+@pytest.fixture()
+def gridded_ds_cftime(gridded_da_cftime):
+    """Mock data of a gridded time series as an xarray Dataset with cftime."""
+    data = xr.Dataset()
+    data["foo"] = gridded_da_cftime()
+    data["bar"] = gridded_da_cftime()
+    return data
+
+
+@pytest.fixture()
+def ts_monthly_da():
+    """Mock time series at monthly resolution for five years."""
+    # Wrapper so fixture can be called multiple times.
+    # https://alysivji.github.io/pytest-fixures-with-function-arguments.html
+    def _gen_data():
+        data = np.random.rand(60)
+        da = xr.DataArray(data, dims=["time"])
+        # Monthly resolution time axis for 5 years.
+        da["time"] = np.arange("1990-01", "1995-01", dtype="datetime64[M]")
+        return da
+
+    return _gen_data
+
+
+@pytest.fixture()
+def ts_annual_da():
+    """Mock time series at annual resolution for twenty years."""
+
+    def _gen_data():
+        data = np.random.rand(20)
+        da = xr.DataArray(data, dims=["time"])
+        da["time"] = xr.cftime_range("1990", freq="YS", periods=da.time.size)
+        return da
+
+    return _gen_data
+
+
+@pytest.fixture()
+def annual_all_leap():
+    """Mock annual 12-year time series with an all leap calendar."""
+    data = xr.DataArray(np.random.rand(12,), dims=["time"])
+    data["time"] = xr.cftime_range(
+        "1990", freq="YS", periods=data.time.size, calendar="all_leap"
+    )
+    return data
+
+
+@pytest.fixture()
+def annual_no_leap():
+    """Mock annual 12-year time series with a no leap calendar."""
+    data = xr.DataArray(np.random.rand(12,), dims=["time"])
+    data["time"] = xr.cftime_range(
+        "1990", freq="YS", periods=data.time.size, calendar="noleap"
+    )
+    return data
+
+
+@pytest.fixture()
+def annual_gregorian():
+    """Mock annual 12-year time series with a Gregorian calendar."""
+    data = xr.DataArray(np.random.rand(12,), dims=["time"])
+    data["time"] = xr.cftime_range(
+        "1990", freq="YS", periods=data.time.size, calendar="gregorian"
+    )
+    return data
+
+
+@pytest.fixture()
+def annual_julian():
+    """Mock annual 12-year time series with a Julian calendar."""
+    data = xr.DataArray(np.random.rand(12,), dims=["time"])
+    data["time"] = xr.cftime_range(
+        "1990", freq="YS", periods=data.time.size, calendar="julian"
+    )
+    return data
diff --git a/esmtools/tests/test_checks.py b/esmtools/tests/test_checks.py
new file mode 100644
index 0000000..667b836
--- /dev/null
+++ b/esmtools/tests/test_checks.py
@@ -0,0 +1,8 @@
+from esmtools.checks import has_missing
+
+
+def test_has_missing(gridded_da_float, gridded_da_landmask, gridded_da_missing_data):
+    """Tests that `has_missing` function works with various NaN configurations."""
+    assert not has_missing(gridded_da_float())
+    assert has_missing(gridded_da_landmask)
+    assert has_missing(gridded_da_missing_data)
diff --git a/esmtools/tests/test_stats.py b/esmtools/tests/test_stats.py
deleted file mode 100644
index 59ade32..0000000
--- a/esmtools/tests/test_stats.py
+++ /dev/null
@@ -1,80 +0,0 @@
-import numpy as np
-import 
pytest -import xarray as xr - -from esmtools.stats import corr - - -@pytest.fixture() -def gridded_ts_da(): - """Mock data of gridded time series.""" - # Wrapper so fixture can be called multiple times. - # https://alysivji.github.io/pytest-fixures-with-function-arguments.html - def _gen_data(): - data = np.random.rand(120, 10, 10) - da = xr.DataArray(data, dims=["time", "lat", "lon"]) - # Monthly resolution time axis for 10 years. - da["time"] = np.arange("1990-01", "2000-01", dtype="datetime64[M]") - return da - - return _gen_data - - -@pytest.fixture() -def ts_monthly_da(): - """Mock time series at monthly resolution for ten years.""" - # Wrapper so fixture can be called multiple times. - # https://alysivji.github.io/pytest-fixures-with-function-arguments.html - def _gen_data(): - data = np.random.rand(120) - da = xr.DataArray(data, dims=["time"]) - # Monthly resolution time axis for 10 years. - da["time"] = np.arange("1990-01", "2000-01", dtype="datetime64[M]") - return da - - return _gen_data - - -def test_corr_two_grids(gridded_ts_da): - """Tests that correlations between two grids work.""" - x = gridded_ts_da() - y = gridded_ts_da() - corrcoeff = corr(x, y, dim="time") - # check that there's no NaNs in the resulting output. - assert not corrcoeff.isnull().any() - - -def test_corr_grid_and_ts(ts_monthly_da, gridded_ts_da): - """Tests that correlation works between a grid and a time series.""" - x = ts_monthly_da() - y = gridded_ts_da() - corrcoeff = corr(x, y, dim="time") - # check that there's no NaNs in the resulting output. - assert not corrcoeff.isnull().any() - - -def test_corr_lead(gridded_ts_da): - """Tests positive lead for correlation.""" - x = gridded_ts_da() - y = gridded_ts_da() - corrcoeff = corr(x, y, dim="time", lead=3) - # check that there's no NaNs in the resulting output. - assert not corrcoeff.isnull().any() - - -def test_corr_lag(gridded_ts_da): - """Tests negative lead for correlation.""" - x = gridded_ts_da() - y = gridded_ts_da() - corrcoeff = corr(x, y, dim="time", lead=-3) - # check that there's no NaNs in the resulting output. - assert not corrcoeff.isnull().any() - - -def test_corr_return_p(gridded_ts_da): - """Tests that p-value is returned properly for correlation.""" - x = gridded_ts_da() - y = gridded_ts_da() - corrcoeff, pval = corr(x, y, dim="time", return_p=True) - # check that there's no NaNs in the resulting output. 
- assert not (corrcoeff.isnull().any()) & (pval.isnull().any()) diff --git a/esmtools/tests/test_stats_nan_policies.py b/esmtools/tests/test_stats_nan_policies.py new file mode 100644 index 0000000..dbd0e69 --- /dev/null +++ b/esmtools/tests/test_stats_nan_policies.py @@ -0,0 +1,169 @@ +import numpy as np +import pytest + +from esmtools.stats import linear_slope, linregress, polyfit, rm_poly, rm_trend + + +@pytest.mark.parametrize("nan_policy", ("drop", "omit")) +@pytest.mark.parametrize("func", (polyfit, linear_slope, linregress, rm_poly, rm_trend)) +def test_full_data_omit_nans(gridded_da_float, func, nan_policy): + """Tests that the `drop`/`omit` nan policy works with a dataset that has no + missing data.""" + data = gridded_da_float() + x, y = data["time"], data + args = ( + {"order": 1, "nan_policy": nan_policy} + if func in [polyfit, rm_poly] + else {"nan_policy": nan_policy} + ) + result = func(x, y, **args) + assert result.notnull().all() + + +@pytest.mark.parametrize("nan_policy", ("none", "propagate")) +@pytest.mark.parametrize("func", (polyfit, linear_slope, linregress, rm_poly, rm_trend)) +def test_full_data_propagate_nans(gridded_da_float, func, nan_policy): + """Tests that the `none`/`propagate` nan policy works with a dataset that has no + missing data.""" + data = gridded_da_float() + x, y = data["time"], data + args = ( + {"order": 1, "nan_policy": nan_policy} + if func in [polyfit, rm_poly] + else {"nan_policy": nan_policy} + ) + result = func(x, y, **args) + assert result.notnull().all() + + +@pytest.mark.parametrize("func", (polyfit, linear_slope, linregress, rm_poly, rm_trend)) +def test_full_data_raise_nans(gridded_da_float, func): + """Tests that the `raise` nan policy does not raise an error with a dataset that + has no missing data.""" + data = gridded_da_float() + x, y = data["time"], data + args = ( + {"order": 1, "nan_policy": "raise"} + if func in [polyfit, rm_poly] + else {"nan_policy": "raise"} + ) + result = func(x, y, **args) + assert result.notnull().all() + + +@pytest.mark.parametrize("nan_policy", ("drop", "omit")) +@pytest.mark.parametrize("func", (polyfit, linear_slope, linregress, rm_poly, rm_trend)) +def test_landmask_omit_nans(gridded_da_landmask, func, nan_policy): + """Tests that the `drop`/`omit` nan policy works with a dataset that simulates + having land masking.""" + data = gridded_da_landmask + x, y = data["time"], data + args = ( + {"order": 1, "nan_policy": nan_policy} + if func in [polyfit, rm_poly] + else {"nan_policy": nan_policy} + ) + result = func(x, y, **args) + assert result.isel(lat=0, lon=0).isnull().all() + assert result.isel(lat=1, lon=1).notnull().all() + + +@pytest.mark.parametrize("nan_policy", ("none", "propagate")) +@pytest.mark.parametrize("func", (polyfit, linear_slope, linregress, rm_poly, rm_trend)) +def test_landmask_propagate_nans(gridded_da_landmask, func, nan_policy): + """Tests that the `none`/`propagate` nan policy works with a dataset that simulates + having land masking.""" + data = gridded_da_landmask + x, y = data["time"], data + args = ( + {"order": 1, "nan_policy": nan_policy} + if func in [polyfit, rm_poly] + else {"nan_policy": nan_policy} + ) + result = func(x, y, **args) + assert result.isel(lat=0, lon=0).isnull().all() + assert result.isel(lat=1, lon=1).notnull().all() + + +@pytest.mark.parametrize("func", (polyfit, linear_slope, linregress, rm_poly, rm_trend)) +def test_landmask_raise_nans(gridded_da_landmask, func): + """Tests that error is raised for `raise` with a dataset that simulates having nan + 
masking."""
+    with pytest.raises(ValueError):
+        data = gridded_da_landmask
+        x, y = data["time"], data
+        args = (
+            {"order": 1, "nan_policy": "raise"}
+            if func in [polyfit, rm_poly]
+            else {"nan_policy": "raise"}
+        )
+        func(x, y, **args)
+
+
+@pytest.mark.parametrize("nan_policy", ("drop", "omit"))
+@pytest.mark.parametrize("func", (polyfit, linear_slope, linregress, rm_poly, rm_trend))
+def test_missing_data_omit_nans(gridded_da_missing_data, func, nan_policy):
+    """Tests that the `drop`/`omit` nan policy works with a dataset that simulates
+    missing data."""
+    data = gridded_da_missing_data
+    x, y = data["time"], data
+    args = (
+        {"order": 1, "nan_policy": nan_policy}
+        if func in [polyfit, rm_poly]
+        else {"nan_policy": nan_policy}
+    )
+    result = func(x, y, **args)
+    if func not in [rm_poly, rm_trend]:
+        assert result.notnull().all()
+    else:
+        assert result.isel(lat=0, lon=0, time=5).isnull()
+
+
+@pytest.mark.parametrize("nan_policy", ("none", "propagate"))
+@pytest.mark.parametrize("func", (polyfit, linear_slope, linregress, rm_poly, rm_trend))
+def test_missing_data_propagate_nans(gridded_da_missing_data, func, nan_policy):
+    """Tests that the `none`/`propagate` nan policy works with a dataset that simulates
+    missing data."""
+    data = gridded_da_missing_data
+    x, y = data["time"], data
+    args = (
+        {"order": 1, "nan_policy": nan_policy}
+        if func in [polyfit, rm_poly]
+        else {"nan_policy": nan_policy}
+    )
+    result = func(x, y, **args)
+    assert result.isel(lat=0, lon=0).isnull().all()
+    assert result.isel(lat=1, lon=0).isnull().all()
+    assert result.isel(lat=0, lon=2).notnull().all()
+
+
+@pytest.mark.parametrize("func", (polyfit, linear_slope, linregress, rm_poly, rm_trend))
+def test_missing_data_raise_nans(gridded_da_missing_data, func):
+    """Tests that error is raised for `raise` with a dataset that simulates missing
+    data."""
+    with pytest.raises(ValueError):
+        data = gridded_da_missing_data
+        x, y = data["time"], data
+        args = (
+            {"order": 1, "nan_policy": "raise"}
+            if func in [polyfit, rm_poly]
+            else {"nan_policy": "raise"}
+        )
+        func(x, y, **args)
+
+
+@pytest.mark.parametrize("func", (polyfit, linear_slope, linregress, rm_poly, rm_trend))
+def test_missing_independent_data_propagate_nans(gridded_da_float, func):
+    """Tests that functions still work with nans in the independent axis, in the case
+    that you're fitting to something like ENSO data with missing values. 
`polyfit` + functions break with missing data in `x` but not in `y`.""" + x = gridded_da_float() + y = gridded_da_float() + x[3, 0, 0] = np.nan + args = ( + {"order": 1, "nan_policy": "propagate"} + if func in [polyfit, rm_poly] + else {"nan_policy": "propagate"} + ) + result = func(x, y, **args) + assert result.isel(lat=0, lon=0).isnull().all() diff --git a/esmtools/tests/test_stats_poly_and_corr.py b/esmtools/tests/test_stats_poly_and_corr.py new file mode 100644 index 0000000..cb9d151 --- /dev/null +++ b/esmtools/tests/test_stats_poly_and_corr.py @@ -0,0 +1,217 @@ +import numpy as np +import numpy.polynomial.polynomial as poly +import pytest +from xarray.testing import assert_allclose + +from esmtools.stats import corr, polyfit, rm_poly, rm_trend + +TIME_TYPES = ["datetime", "cftime", "float"] + + +def _np_rm_poly(x, y, order): + coefs = poly.polyfit(x, y, order) + fit = poly.polyval(x, coefs) + return y - fit + + +@pytest.mark.parametrize("gridded", (True, False)) +@pytest.mark.parametrize("time_type", TIME_TYPES) +@pytest.mark.parametrize("order", (1, 2, 3, 4)) +def test_rm_poly_against_time( + gridded_da_datetime, gridded_da_cftime, gridded_da_float, time_type, gridded, order +): + """Tests that ``rm_poly`` works against a time index.""" + data = eval(f"gridded_da_{time_type}")() + x = data["time"] + y = data if gridded else data.isel(lat=0, lon=0) + detrended = rm_poly(x, y, order) + assert (detrended != y).all() + + +@pytest.mark.parametrize("gridded", (True, False)) +@pytest.mark.parametrize("time_type", TIME_TYPES) +def test_rm_poly_against_time_dask( + gridded_da_datetime, gridded_da_cftime, gridded_da_float, time_type, gridded +): + """Tests that ``rm_poly`` works against a time index with dask arrays.""" + data = eval(f"gridded_da_{time_type}")() + x = data["time"] + y = data if gridded else data.isel(lat=0, lon=0) + expected = rm_poly(x, y, 2) + actual = rm_poly(x.chunk(), y.chunk(), 2).compute() + assert_allclose(expected, actual) + + +@pytest.mark.parametrize("gridded", (True, False)) +@pytest.mark.parametrize("order", (1, 2, 3, 4)) +def test_rm_poly_against_data(gridded_da_float, gridded, order): + """Tests that ``rm_poly`` works against a single data series and another gridded + data series.""" + x = gridded_da_float() if gridded else gridded_da_float().isel(lat=0, lon=0) + y = gridded_da_float() + detrended = rm_poly(x, y, order) + assert (detrended != y).all() + + +@pytest.mark.parametrize("gridded", (True, False)) +def test_rm_poly_against_data_dask(gridded_da_float, gridded): + """Tests that ``rm_poly`` works against a single data series and another gridded + data series with dask arrays.""" + x = gridded_da_float() if gridded else gridded_da_float().isel(lat=0, lon=0) + y = gridded_da_float() + expected = rm_poly(x, y, 2) + actual = rm_poly(x.chunk(), y.chunk(), 2).compute() + assert_allclose(expected, actual) + + +@pytest.mark.parametrize("order", (1, 2, 3, 4)) +def test_rm_poly_validity(gridded_da_float, order): + """Tests that grid cells are different and equal to doing it manually.""" + data = gridded_da_float() + x = data["time"] + y = data + actual = rm_poly(x, y, order) + for i in range(3): + for j in range(3): + single_grid_cell = y.isel(lon=i, lat=j) + expected = _np_rm_poly(x.values, single_grid_cell.values, order) + assert (actual.isel(lon=i, lat=j).values == expected).all() + + +@pytest.mark.parametrize("order", (1, 2, 3, 4)) +def test_rm_poly_validity_dask(gridded_da_float, order): + """Tests that grid cells are different and equal to doing it manually 
with dask
+    arrays."""
+    data = gridded_da_float().chunk()
+    x = data["time"]
+    y = data
+    actual = rm_poly(x, y, order)
+    for i in range(3):
+        for j in range(3):
+            single_grid_cell = y.isel(lon=i, lat=j)
+            expected = _np_rm_poly(x.values, single_grid_cell.values, order)
+            assert (actual.isel(lon=i, lat=j).values == expected).all()
+
+
+@pytest.mark.parametrize("gridded", (True, False))
+@pytest.mark.parametrize("time_type", TIME_TYPES)
+def test_rm_trend_against_time(
+    gridded_da_datetime, gridded_da_cftime, gridded_da_float, time_type, gridded
+):
+    """Tests that ``rm_trend`` is equivalent to ``rm_poly`` with order=1 against time
+    series."""
+    data = eval(f"gridded_da_{time_type}")()
+    x = data["time"]
+    y = data if gridded else data.isel(lat=0, lon=0)
+    expected = rm_poly(x, y, 1)
+    actual = rm_trend(x, y)
+    assert_allclose(expected, actual)
+
+
+@pytest.mark.parametrize("gridded", (True, False))
+def test_rm_trend_against_data(gridded_da_float, gridded):
+    """Tests that ``rm_trend`` is equivalent to ``rm_poly`` with order=1 against data
+    series and gridded data."""
+    x = gridded_da_float() if gridded else gridded_da_float().isel(lat=0, lon=0)
+    y = gridded_da_float()
+    expected = rm_poly(x, y, 1)
+    actual = rm_trend(x, y)
+    assert_allclose(expected, actual)
+
+
+@pytest.mark.parametrize("gridded", (True, False))
+@pytest.mark.parametrize("time_type", TIME_TYPES)
+@pytest.mark.parametrize("order", (1, 2, 3, 4))
+def test_polyfit_against_time(
+    gridded_da_datetime, gridded_da_cftime, gridded_da_float, time_type, gridded, order
+):
+    """Tests that ``polyfit`` plus ``rm_poly`` equals the original time series when
+    fitting against time."""
+    data = eval(f"gridded_da_{time_type}")()
+    x = data["time"]
+    y = data if gridded else data.isel(lat=0, lon=0)
+    dt = rm_poly(x, y, order)
+    fit = polyfit(x, y, order)
+    diff = np.abs((dt + fit) - y)
+    assert (diff < 1e-15).all()
+
+
+@pytest.mark.parametrize("gridded", (True, False))
+@pytest.mark.parametrize("time_type", TIME_TYPES)
+def test_polyfit_against_time_dask(
+    gridded_da_datetime, gridded_da_cftime, gridded_da_float, time_type, gridded
+):
+    """Tests that ``polyfit`` plus ``rm_poly`` equals the original time series when
+    fitting against time with dask arrays."""
+    data = eval(f"gridded_da_{time_type}")().chunk()
+    x = data["time"]
+    y = data if gridded else data.isel(lat=0, lon=0)
+    dt = rm_poly(x, y, 2)
+    fit = polyfit(x, y, 2)
+    diff = np.abs((dt + fit) - y)
+    assert (diff.compute() < 1e-15).all()
+
+
+@pytest.mark.parametrize("gridded", (True, False))
+@pytest.mark.parametrize("order", (1, 2, 3, 4))
+def test_polyfit_against_data(gridded_da_float, gridded, order):
+    """Tests that ``polyfit`` plus ``rm_poly`` equals the original time series when
+    fitting against data series and gridded data."""
+    x = gridded_da_float() if gridded else gridded_da_float().isel(lat=0, lon=0)
+    y = gridded_da_float()
+    dt = rm_poly(x, y, order)
+    fit = polyfit(x, y, order)
+    diff = np.abs((dt + fit) - y)
+    assert (diff < 1e-15).all()
+
+
+@pytest.mark.parametrize("gridded", (True, False))
+def test_polyfit_against_data_dask(gridded_da_float, gridded):
+    """Tests that ``polyfit`` plus ``rm_poly`` equals the original time series
+    when fitting against data series and gridded data with dask arrays."""
+    x = gridded_da_float() if gridded else gridded_da_float().isel(lat=0, lon=0).chunk()
+    y = gridded_da_float()
+    dt = rm_poly(x, y, 2)
+    fit = polyfit(x, y, 2)
+    diff = np.abs((dt + fit) - y)
+    assert (diff.compute() < 1e-15).all()
+
+
+def 
test_corr_two_grids(gridded_da_float): + """Tests that correlations between two grids work.""" + x = gridded_da_float() + y = gridded_da_float() + corrcoeff = corr(x, y, dim="time") + assert not corrcoeff.isnull().any() + + +def test_corr_grid_and_ts(ts_monthly_da, gridded_da_datetime): + """Tests that correlation works between a grid and a time series.""" + x = ts_monthly_da() + y = gridded_da_datetime() + corrcoeff = corr(x, y, dim="time") + assert not corrcoeff.isnull().any() + + +def test_corr_lead(gridded_da_float): + """Tests positive lead for correlation.""" + x = gridded_da_float() + y = gridded_da_float() + corrcoeff = corr(x, y, dim="time", lead=3) + assert not corrcoeff.isnull().any() + + +def test_corr_lag(gridded_da_float): + """Tests negative lead for correlation.""" + x = gridded_da_float() + y = gridded_da_float() + corrcoeff = corr(x, y, dim="time", lead=-3) + assert not corrcoeff.isnull().any() + + +def test_corr_return_p(gridded_da_float): + """Tests that p-value is returned properly for correlation.""" + x = gridded_da_float() + y = gridded_da_float() + corrcoeff, pval = corr(x, y, dim="time", return_p=True) + assert not (corrcoeff.isnull().any()) & (pval.isnull().any()) diff --git a/esmtools/tests/test_stats_regressions.py b/esmtools/tests/test_stats_regressions.py new file mode 100644 index 0000000..5d744e3 --- /dev/null +++ b/esmtools/tests/test_stats_regressions.py @@ -0,0 +1,150 @@ +import numpy as np +import pytest +from xarray.tests import assert_allclose + +from esmtools.stats import linear_slope, linregress + +TIME_TYPES = ["datetime", "cftime", "float"] + + +@pytest.mark.parametrize("gridded", (True, False)) +@pytest.mark.parametrize("time_type", TIME_TYPES) +@pytest.mark.parametrize("func", (linear_slope, linregress)) +def test_linear_regression_time_da( + gridded_da_datetime, gridded_da_cftime, gridded_da_float, time_type, gridded, func +): + """Tests that linear slope can be computed on an in-memory DataArray with respect to + time.""" + data = eval(f"gridded_da_{time_type}")() + x = data["time"] + y = data if gridded else data.isel(lat=0, lon=0) + result = func(x, y, "time") + assert not result.isnull().any() + if func == linear_slope: + expected_shape = (3, 3) if gridded else () + elif func == linregress: + expected_shape = (3, 3, 5) if gridded else (5,) + assert result.shape == expected_shape + + +@pytest.mark.parametrize("gridded", (True, False)) +@pytest.mark.parametrize("time_type", TIME_TYPES) +@pytest.mark.parametrize("func", (linear_slope, linregress)) +def test_linear_regression_singular_data_da( + gridded_da_datetime, gridded_da_cftime, gridded_da_float, time_type, gridded, func +): + """Tests that ``linear_slope`` works between a grid or time series of data and + another singular data (e.g., a Nino index).""" + data = eval(f"gridded_da_{time_type}")() + x = data.isel(lat=0, lon=0) + y = data if gridded else data.isel(lat=0, lon=0) + result = func(x, y, "time") + assert not result.isnull().any() + if func == linear_slope: + expected_shape = (3, 3) if gridded else () + elif func == linregress: + expected_shape = (3, 3, 5) if gridded else (5,) + assert result.shape == expected_shape + + +@pytest.mark.parametrize("time_type", TIME_TYPES) +@pytest.mark.parametrize("func", (linear_slope, linregress)) +def test_linear_regression_gridded_to_gridded( + gridded_da_datetime, gridded_da_cftime, gridded_da_float, time_type, func +): + """Tests that ``linear_slope`` works between one grid of data and another grid + of data.""" + data1 = 
eval(f"gridded_da_{time_type}")() + data2 = eval(f"gridded_da_{time_type}")() + result = func(data1, data2, "time") + assert not result.isnull().any() + expected_shape = (3, 3) if func == linear_slope else (3, 3, 5) + assert result.shape == expected_shape + + +@pytest.mark.parametrize("gridded", (True, False)) +@pytest.mark.parametrize("time_type", TIME_TYPES) +@pytest.mark.parametrize("func", (linear_slope, linregress)) +def test_linear_regression_time_ds( + gridded_ds_datetime, gridded_ds_cftime, gridded_ds_float, time_type, gridded, func +): + """Tests that linear slope can be computed on an in-memory Dataset with respect to + time.""" + ds = eval(f"gridded_ds_{time_type}") + x = ds["time"] + if not gridded: + ds = ds.isel(lat=0, lon=0) + result = func(x, ds, "time") + assert len(ds.data_vars) > 1 + assert not result["foo"].isnull().any() + assert not result["bar"].isnull().any() + if func == linear_slope: + expected_shape = (3, 3) if gridded else () + elif func == linregress: + expected_shape = (3, 3, 5) if gridded else (5,) + assert result["foo"].shape == expected_shape + assert result["bar"].shape == expected_shape + + +@pytest.mark.parametrize("gridded", (True, False)) +@pytest.mark.parametrize("time_type", TIME_TYPES) +@pytest.mark.parametrize("func", (linear_slope, linregress)) +def test_linear_regression_da_dask( + gridded_da_datetime, gridded_da_cftime, gridded_da_float, time_type, gridded, func +): + """Tests that linear slope can be computed on a dask DataArray.""" + data = eval(f"gridded_da_{time_type}")().chunk() + x = data["time"] + y = data if gridded else data.isel(lat=0, lon=0) + expected = func(x.load(), y.load(), "time") + actual = func(x, y, "time").compute() + assert_allclose(expected, actual) + + +@pytest.mark.parametrize("gridded", (True, False)) +@pytest.mark.parametrize("time_type", TIME_TYPES) +@pytest.mark.parametrize("func", (linear_slope, linregress)) +def test_linear_slope_ds_dask( + gridded_ds_datetime, gridded_ds_cftime, gridded_ds_float, time_type, gridded, func +): + """Tests that linear slope can be computed on a dask Dataset.""" + ds = eval(f"gridded_ds_{time_type}").chunk() + x = ds["time"] + if not gridded: + ds = ds.isel(lat=0, lon=0) + expected = func(x.load(), ds.load(), "time") + actual = func(x, ds, "time").compute() + assert_allclose(expected, actual) + + +def test_linear_slope_time_conversion_accurate(ts_annual_da): + """Tests that computed slope with cftime/datetime conversion is accurate.""" + data = ts_annual_da() + data2 = data.copy() + # Just create numeric time, e.g. time in years. 
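+    # The slope over an integer-year axis is the reference value that the
+    # cftime-converted slope (scaled to per-year units) should closely reproduce.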
+ data2["time"] = np.arange(data2.time.size) + slope_cftime = linear_slope(data["time"], data, "time") + slope_ints = linear_slope(data2["time"], data2, "time") + diff = slope_cftime - slope_ints + assert np.abs(diff) < 1e-4 + + +def test_linear_slope_raises_error(ts_annual_da): + """Tests that error is raised for `linear_slope` when independent variable is + in y position.""" + with pytest.raises(ValueError): + data = ts_annual_da() + x = data["time"] + y = data + linear_slope(y, x, "time") + + +def test_linear_slope_linregress_same_result(ts_annual_da): + """Tests that ``linear_slope`` and ``linregress`` returns the same result + for the slope parameter.""" + data = ts_annual_da() + x = data["time"] + y = data + m1 = linear_slope(x, y, "time") + m2 = linregress(x, y, "time").sel(parameter="slope") + assert np.abs(m1.values - m2.values) < 1e-15 diff --git a/esmtools/tests/test_timeutils.py b/esmtools/tests/test_timeutils.py new file mode 100644 index 0000000..fd962db --- /dev/null +++ b/esmtools/tests/test_timeutils.py @@ -0,0 +1,63 @@ +import pandas as pd +import pytest + +import esmtools + + +def test_timeutils_accessor(annual_gregorian): + """Test that the `timeutils` accessor can be called.""" + assert annual_gregorian.timeutils._obj.notnull().all() + + +def test_annual_factor( + annual_all_leap, annual_no_leap, annual_gregorian, annual_julian +): + """Tests that the annual factor returned by timeutils is accurate.""" + assert annual_all_leap["time"].timeutils.annual_factor == 366.0 + assert annual_no_leap["time"].timeutils.annual_factor == 365.0 + assert annual_gregorian["time"].timeutils.annual_factor == 365.25 + assert annual_julian["time"].timeutils.annual_factor == 365.25 + + +def test_calendar(annual_all_leap, annual_no_leap, annual_gregorian, annual_julian): + """Tests that the calendar returned by timeutils is accurate.""" + assert annual_all_leap["time"].timeutils.calendar == "all_leap" + assert annual_no_leap["time"].timeutils.calendar == "noleap" + assert annual_gregorian["time"].timeutils.calendar == "gregorian" + assert annual_julian["time"].timeutils.calendar == "julian" + + +@pytest.mark.parametrize("frequency", ("L", "D", "AS-NOV", "W-TUE")) +def test_freq(annual_gregorian, frequency): + """Tests that the calendar frequency returned by timeutils is accurate.""" + data = annual_gregorian + data["time"] = pd.date_range("1990", freq=frequency, periods=data.time.size) + assert data["time"].timeutils.freq == frequency + + +expected_slopes = { + "L": 1 / (24 * 60 * 60 * 1e3), + "D": 1, + "AS-NOV": 365.25, + "W-TUE": 7.0, +} + + +@pytest.mark.parametrize("frequency", ("L", "D", "AS-NOV", "W-TUE")) +def test_slope_factor(annual_gregorian, frequency): + """Tests that the slope factor returned by timeutils is accurate.""" + data = annual_gregorian + data["time"] = pd.date_range("1990", freq=frequency, periods=data.time.size) + assert data["time"].timeutils.slope_factor == expected_slopes[frequency] + + +def test_return_numeric_time_datetime(gridded_da_datetime): + """Tests that datetimes are properly converted to numeric time by timeutils.""" + data = gridded_da_datetime()["time"] + assert data.timeutils.return_numeric_time().dtype == "float64" + + +def test_return_numeric_time_cftime(gridded_da_cftime): + """Tests that cftimes are properly converted to numeric time by timeutils.""" + data = gridded_da_cftime()["time"] + assert data.timeutils.return_numeric_time().dtype == "float64" diff --git a/esmtools/tests/test_utils.py b/esmtools/tests/test_utils.py new file mode 100644 
index 0000000..5339cf3 --- /dev/null +++ b/esmtools/tests/test_utils.py @@ -0,0 +1,46 @@ +import numpy as np + +from esmtools.utils import match_nans + + +def test_match_nans_no_nans(ts_annual_da): + """Tests that `match_nans` doesn't modify arrays without nans.""" + x = ts_annual_da() + y = ts_annual_da() + x, y = match_nans(x, y) + assert x.notnull().all() + assert y.notnull().all() + + +def test_match_nans_pairwise(ts_annual_da): + """Tests that `match_nans` actually matches nans pairwise.""" + x = ts_annual_da() + y = ts_annual_da() + x[3] = np.nan + y[0] = np.nan + x, y = match_nans(x, y) + assert x[3].isnull() + assert x[0].isnull() + assert y[3].isnull() + assert y[0].isnull() + + +def test_match_nans_doesnt_modify_original(ts_annual_da): + """Tests that `match_nans` does not mutate original time series.""" + x = ts_annual_da() + y = ts_annual_da() + x[3] = np.nan + y[0] = np.nan + x_mod, y_mod = match_nans(x, y) + assert x[0].notnull() + assert y[3].notnull() + + +def test_int_arrays_apply_nans(): + """Tests that match nans converts int arrays into floats when adding nans to avoid + returning an int nan (which is a crazy number).""" + x = np.array([1, 2, 3, 4, 5]).astype("int") + y = np.array([3, np.nan, 4, 5, 6]) + x, y = match_nans(x, y) + assert x.dtype == "float" + assert np.isnan(x).sum() > 0 diff --git a/esmtools/timeutils.py b/esmtools/timeutils.py new file mode 100644 index 0000000..7320d4f --- /dev/null +++ b/esmtools/timeutils.py @@ -0,0 +1,464 @@ +import cftime +import numpy as np +import pandas as pd +import xarray as xr +from xarray.core.common import contains_cftime_datetimes, is_np_datetime_like + +from .constants import DAYS_PER_MONTH, DAYS_PER_YEAR + + +@xr.register_dataarray_accessor("timeutils") +class TimeUtilAccessor: + def __init__(self, xarray_obj): + self._obj = xarray_obj + self._is_temporal = contains_datetime_like_objects(self._obj) + + @property + def annual_factor(self): + return DAYS_PER_YEAR[self.calendar] + + @property + def calendar(self): + return get_calendar(self._obj) + + @property + def freq(self): + if self.is_temporal: + return infer_freq(self._obj) + else: + return None + + @property + def is_cftime_like(self): + if (self.is_temporal) and (contains_cftime_datetimes(self._obj)): + return True + else: + return False + + @property + def is_datetime_like(self): + if (self.is_temporal) and (is_np_datetime_like(self._obj)): + return True + else: + return False + + @property + def is_temporal(self): + return self._is_temporal + + @property + def slope_factor(self): + if self.freq is None: + return 1.0 + else: + slope_factors = self.construct_slope_factors() + return slope_factors[self.freq] + + def return_numeric_time(self): + """Returns numeric time.""" + if self.is_datetime_like: + x = (self._obj - np.datetime64("1990-01-01")) / np.timedelta64(1, "D") + return x + elif self.is_cftime_like: + x = cftime.date2num( + self._obj, "days since 1990-01-01", calendar=self.calendar + ) + x = xr.DataArray(x, dims=self._obj.dims, coords=self._obj.coords) + return x + else: + raise ValueError("DataArray is not a time array of datetimes or cftimes.") + + @staticmethod + def construct_quarterly_aliases(): + quarters = ["Q", "BQ", "QS", "BQS"] + for month in [ + "JAN", + "FEB", + "MAR", + "APR", + "MAY", + "JUN", + "JUL", + "AUG", + "SEP", + "OCT", + "NOV", + "DEC", + ]: + quarters.append(f"Q-{month}") + quarters.append(f"BQ-{month}") + quarters.append(f"BQS-{month}") + quarters.append(f"QS-{month}") + return quarters + + @staticmethod + def 
construct_annual_aliases(): + years = ["A", "Y", "BA", "BY", "AS", "YS", "BAS", "BYS", "Q"] + for month in [ + "JAN", + "FEB", + "MAR", + "APR", + "MAY", + "JUN", + "JUL", + "AUG", + "SEP", + "OCT", + "NOV", + "DEC", + ]: + years.append(f"A-{month}") + years.append(f"BA-{month}") + years.append(f"BAS-{month}") + years.append(f"AS-{month}") + return years + + def construct_slope_factors(self): + years = self.construct_annual_aliases() + years = {k: self.annual_factor for k in years} + + quarters = self.construct_quarterly_aliases() + quarters = {k: self.annual_factor / 4 for k in quarters} + + months = ("M", "BM", "CBM", "MS", "BMS", "CBMS") + months = {k: self.annual_factor / 12 for k in months} + + semimonths = {k: 15 for k in ("SM", "SMS")} + + weeks = ("W", "W-SUN", "W-MON", "W-TUE", "W-WED", "W-THU", "W-FRI", "W-SAT") + weeks = {k: 7 for k in weeks} + + days = {k: 1 for k in ("B", "C", "D")} + hours = {k: 1 / 24 for k in ("BH", "H")} + mins = {k: 1 / (24 * 60) for k in ("T", "min")} + secs = {k: 1 / (24 * 60 * 60) for k in ("S")} + millisecs = {k: 1 / (24 * 60 * 60 * 1e3) for k in ("ms", "L")} + microsecs = {k: 1 / (24 * 60 * 60 * 1e6) for k in ("U", "us")} + nanosecs = {k: 1 / (24 * 60 * 60 * 1e9) for k in ("N")} + + DATETIME_FACTOR = {} + for d in ( + years, + quarters, + months, + semimonths, + weeks, + days, + hours, + mins, + secs, + millisecs, + microsecs, + nanosecs, + ): + DATETIME_FACTOR.update(d) + return DATETIME_FACTOR + + +def contains_datetime_like_objects(var): + """Check if a variable contains datetime like objects (either + np.datetime64, np.timedelta64, or cftime.datetime) + """ + return is_np_datetime_like(var.dtype) or contains_cftime_datetimes(var) + + +def get_calendar(dates): + """Attempt to pull calendar type automatically using ``cftime`` objects. + + .. note:: + This relies upon ``xarray``'s automatic conversion of time to ``cftime``. + + Args: + dates (xarray.DataArray): Dates from which to retrieve calendar. + + Returns: + str: Name of calendar in NetCDF convention. + + Raises: + ValueError: If inferred calendar is not in our list of supported calendars. + """ + if np.asarray(dates).dtype == "datetime64[ns]": + return "proleptic_gregorian" + else: + return np.asarray(dates).ravel()[0].calendar + + +def get_days_per_month(time, calendar="standard"): + """Return an array of days per month corresponding to a given calendar. + + Args: + x (xarray index): Time index from xarray object. + calendar (optional str): Calendar type. + + Returns: + ndarray: Array of number of days for each monthly time step provided. + + Raises: + ValueError: If input time index is not a CFTimeIndex or DatetimeIndex. + """ + is_time_index(time, "time index") + month_length = np.zeros(len(time), dtype=np.int) + + cal_days = DAYS_PER_MONTH[calendar] + + for i, (month, year) in enumerate(zip(time.month, time.year)): + month_length[i] = cal_days[month] + if leap_year(year, calendar=calendar): + month_length[i] += 1 + return month_length + + +def is_time_index(xobj, kind): + """ + Checks that xobj coming through is a DatetimeIndex or CFTimeIndex. + + This checks that `esmtools` is converting the DataArray to an index, + i.e. through .to_index() + """ + xtype = type(xobj).__name__ + if xtype not in ["CFTimeIndex", "DatetimeIndex"]: + raise ValueError( + f"Your {kind} object must be either an xr.CFTimeIndex or " + f"pd.DatetimeIndex." + ) + return True + + +def leap_year(year, calendar="standard"): + """Determine if year is a leap year. + + Args: + year (int): Year to assess. 
+ calendar (optional str): Calendar type. + + Returns: + bool: True if year is a leap year. + """ + leap = False + if (calendar in ["standard", "gregorian", "proleptic_gregorian", "julian"]) and ( + year % 4 == 0 + ): + leap = True + if ( + (calendar == "proleptic_gregorian") + and (year % 100 == 0) + and (year % 400 != 0) + ): + leap = False + elif ( + (calendar in ["standard", "gregorian"]) + and (year % 100 == 0) + and (year % 400 != 0) + and (year < 1583) + ): + leap = False + return leap + + +def infer_freq(index): + """NOTE: This is pulled from xarray v0.15.2, which isn't released yet. I want + to avoid making a requirement the master unreleased branch. We'll switch this + simply to `xr.infer_freq()` once it's released.""" + from xarray.core.dataarray import DataArray + + if isinstance(index, (DataArray, pd.Series)): + dtype = np.asarray(index).dtype + if dtype == "datetime64[ns]": + index = pd.DatetimeIndex(index.values) + elif dtype == "timedelta64[ns]": + index = pd.TimedeltaIndex(index.values) + else: + index = xr.CFTimeIndex(index.values) + + if isinstance(index, xr.CFTimeIndex): + inferer = _CFTimeFrequencyInferer(index) + return inferer.get_freq() + + return pd.infer_freq(index) + + +_ONE_MICRO = 1 +_ONE_MILLI = _ONE_MICRO * 1000 +_ONE_SECOND = _ONE_MILLI * 1000 +_ONE_MINUTE = 60 * _ONE_SECOND +_ONE_HOUR = 60 * _ONE_MINUTE +_ONE_DAY = 24 * _ONE_HOUR +_MONTH_ABBREVIATIONS = { + 1: "JAN", + 2: "FEB", + 3: "MAR", + 4: "APR", + 5: "MAY", + 6: "JUN", + 7: "JUL", + 8: "AUG", + 9: "SEP", + 10: "OCT", + 11: "NOV", + 12: "DEC", +} + + +class _CFTimeFrequencyInferer: + def __init__(self, index): + self.index = index + self.values = index.asi8 + + if len(index) < 3: + raise ValueError("Need at least 3 dates to infer frequency") + + self.is_monotonic = ( + self.index.is_monotonic_decreasing or self.index.is_monotonic_increasing + ) + + self._deltas = None + self._year_deltas = None + self._month_deltas = None + + def get_freq(self): + if not self.is_monotonic or not self.index.is_unique: + return None + + delta = self.deltas[0] # Smallest delta + if _is_multiple(delta, _ONE_DAY): + return self._infer_daily_rule() + # There is no possible intraday frequency with a non-unique delta + # Different from pandas: we don't need to manage DST and business offsets + # in cftime + elif not len(self.deltas) == 1: + return None + + if _is_multiple(delta, _ONE_HOUR): + return _maybe_add_count("H", delta / _ONE_HOUR) + elif _is_multiple(delta, _ONE_MINUTE): + return _maybe_add_count("T", delta / _ONE_MINUTE) + elif _is_multiple(delta, _ONE_SECOND): + return _maybe_add_count("S", delta / _ONE_SECOND) + elif _is_multiple(delta, _ONE_MILLI): + return _maybe_add_count("L", delta / _ONE_MILLI) + else: + return _maybe_add_count("U", delta / _ONE_MICRO) + + def _infer_daily_rule(self): + annual_rule = self._get_annual_rule() + if annual_rule: + nyears = self.year_deltas[0] + month = _MONTH_ABBREVIATIONS[self.index[0].month] + alias = f"{annual_rule}-{month}" + return _maybe_add_count(alias, nyears) + + quartely_rule = self._get_quartely_rule() + if quartely_rule: + nquarters = self.month_deltas[0] / 3 + mod_dict = {0: 12, 2: 11, 1: 10} + month = _MONTH_ABBREVIATIONS[mod_dict[self.index[0].month % 3]] + alias = f"{quartely_rule}-{month}" + return _maybe_add_count(alias, nquarters) + + monthly_rule = self._get_monthly_rule() + if monthly_rule: + return _maybe_add_count(monthly_rule, self.month_deltas[0]) + + if len(self.deltas) == 1: + # Daily as there is no "Weekly" offsets with CFTime + days = self.deltas[0] / 
_ONE_DAY + return _maybe_add_count("D", days) + + # CFTime has no business freq and no "week of month" (WOM) + return None + + def _get_annual_rule(self): + if len(self.year_deltas) > 1: + return None + + if len(np.unique(self.index.month)) > 1: + return None + + return {"cs": "AS", "ce": "A"}.get(month_anchor_check(self.index)) + + def _get_quartely_rule(self): + if len(self.month_deltas) > 1: + return None + + if not self.month_deltas[0] % 3 == 0: + return None + + return {"cs": "QS", "ce": "Q"}.get(month_anchor_check(self.index)) + + def _get_monthly_rule(self): + if len(self.month_deltas) > 1: + return None + + return {"cs": "MS", "ce": "M"}.get(month_anchor_check(self.index)) + + @property + def deltas(self): + """Sorted unique timedeltas as microseconds.""" + if self._deltas is None: + self._deltas = _unique_deltas(self.values) + return self._deltas + + @property + def year_deltas(self): + """Sorted unique year deltas.""" + if self._year_deltas is None: + self._year_deltas = _unique_deltas(self.index.year) + return self._year_deltas + + @property + def month_deltas(self): + """Sorted unique month deltas.""" + if self._month_deltas is None: + self._month_deltas = _unique_deltas(self.index.year * 12 + self.index.month) + return self._month_deltas + + +def _is_multiple(us, mult: int): + """Whether us is a multiple of mult""" + return us % mult == 0 + + +def _maybe_add_count(base: str, count: float): + """If count is greater than 1, add it to the base offset string""" + if count != 1: + assert count == int(count) + count = int(count) + return f"{count}{base}" + else: + return base + + +def month_anchor_check(dates): + """Return the monthly offset string. + Return "cs" if all dates are the first days of the month, + "ce" if all dates are the last day of the month, + None otherwise. + Replicated pandas._libs.tslibs.resolution.month_position_check + but without business offset handling. + """ + calendar_end = True + calendar_start = True + + for date in dates: + if calendar_start: + calendar_start &= date.day == 1 + + if calendar_end: + cal = date.day == date.daysinmonth + if calendar_end: + calendar_end &= cal + elif not calendar_start: + break + + if calendar_end: + return "ce" + elif calendar_start: + return "cs" + else: + return None + + +def _unique_deltas(arr): + """Sorted unique deltas of numpy array""" + return np.sort(np.unique(np.diff(arr))) diff --git a/esmtools/utils.py b/esmtools/utils.py index d9b4a35..5d0cc8a 100644 --- a/esmtools/utils.py +++ b/esmtools/utils.py @@ -1,3 +1,8 @@ +import numpy as np + +from .checks import has_missing + + def get_dims(da): """ Simple function to retrieve dimensions from a given dataset/datarray. @@ -5,3 +10,27 @@ def get_dims(da): list if desired for any reason. """ return list(da.dims) + + +def match_nans(x, y): + """Performs pairwise matching of nans between ``x`` and ``y``. + + Args: + x, y (ndarray or xarray object): Array-like objects to pairwise match nans over. + + Returns: + x, y (ndarray or xarray object): If either ``x`` or ``y`` has missing data, + adds nans in the same places between the two arrays. + """ + if has_missing(x) or has_missing(y): + # Need to copy to avoid mutating original objects and to avoid writeable errors + # with ``xr.apply_ufunc`` with vectorize turned on. + x, y = x.copy(), y.copy() + idx = np.logical_or(np.isnan(x), np.isnan(y)) + # NaNs cannot be added to `int` arrays. 
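+        # Cast to float first so that the np.nan assignment below is well defined.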
+ if x.dtype == "int": + x = x.astype("float") + if y.dtype == "int": + y = y.astype("float") + x[idx], y[idx] = np.nan, np.nan + return x, y diff --git a/requirements.txt b/requirements.txt index 96018ef..7f82c7f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,4 @@ numpy scipy statsmodels tqdm -xarray +xarray>=0.15.1