# Building code for NYSM +/-15 day averages
- https://docs.xarray.dev/en/stable/examples/monthly-means.html
- https://docs.xarray.dev/en/stable/generated/xarray.Dataset.rolling.html#xarray.Dataset.rolling

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the NYSM site list and one month of NYSM observations
# (the file name suggests August 2020 -- confirm).
# Timing note: one month reads in 1-2 seconds, so 6 years should take ~1-2 min.
nysm_sites = pd.read_csv(
    filepath_or_buffer="/spare11/atm533/data/nysm_sites.csv",
)
nysm_data = pd.read_csv(
    filepath_or_buffer="/spare11/atm533/data/nysm_data_202008.csv",
)


In [3]:
# 126 sites * 12 per hour (every 5 min) * 24 hours * 31 days
# len(nysm_data)

In [4]:

# Build the xarray Dataset `nysm_ds` with "station" and "time" as coordinates.

# Parse the timestamps while the data is still a DataFrame, so the "time"
# coordinate is already datetime-typed when the Dataset is created. This
# replaces the earlier approach of re-assigning nysm_ds["time"] after the
# conversion, which had to be executed twice to take effect
# (see https://stackoverflow.com/questions/62572678/xarray-coords-conversion-to-datetime64).
# NOTE(review): assumes the CSV "time" column holds timezone-naive timestamp
# strings in a single format -- confirm against the data file.
nysm_data["time"] = pd.to_datetime(nysm_data["time"])

# Set station and time as the index levels of the df so that the conversion
# to an xarray dataset makes them coordinates.
nysm_data = nysm_data.set_index(["station", "time"])

# Convert to dataset
nysm_ds = nysm_data.to_xarray()

nysm_ds

In [5]:
# Daily mean of every variable, per station.
# groupby("time.date") bins the time steps by calendar date; taking the mean
# over "time" inside each bin collapses the time steps of that date into one
# value. The result has a "date" dimension in place of "time", and the
# "station" dimension is kept.
ds_avg_by_site_date = nysm_ds.groupby("time.date").mean(dim="time")

  out /= nanlen(group_idx, array, size=size, axis=axis, fill_value=0)


In [6]:
# Display the per-date averages computed in the previous cell
ds_avg_by_site_date

In [7]:
# Centered rolling mean along the "date" dimension of the daily averages.
# The window is 5 dates (dates are days), i.e. each date plus the 2 days
# before and the 2 days after it. Once all data is available, the target of
# +/- 15 days corresponds to a window of 31.
# min_periods defaults to None, which means it equals the window size: a value
# is only produced when all 5 dates are present. With a centered window of 5
# that needs 2 terms before, the term in the middle and 2 terms after, so the
# first two dates come out as NaN.
# center=True places each date at the middle of its window rather than at the
# trailing edge. The NaNs at the first two dates are a quick way to confirm
# the centering behaves as expected.
rolling_5day_avg = ds_avg_by_site_date.rolling(date=5, center=True).mean()
rolling_5day_avg

## REFERENCE CODE ONLY
#### Ignore..


In [109]:
# Access a coordinate of the dataset
nysm_ds["station"]

# Access a data variable of the dataset
nysm_ds["temp_2m [degC]"]

In [None]:
# Size of the 2 m temperature variable along each of its dimensions
nysm_ds['temp_2m [degC]'].shape

In [29]:
# REFERENCE ONLY
# <data array>.mean(<dimension>) averages across the named dimension and
# removes it from the result. If the dimension is left blank, everything is
# averaged to give ONE mean value.
# The length of the resulting array depends on what was averaged over.

# example 1: average over stations -> one value per time step
# (despite its name, avg_temp_by_day is per observation time, not per day)
avg_temp_by_day = nysm_ds["temp_2m [degC]"].mean(dim="station")
print(avg_temp_by_day.shape)

# example 2: average over time -> one value per station
avg_temp_by_station = nysm_ds["temp_2m [degC]"].mean(dim="time")
print(avg_temp_by_station.shape)

(8929,)
(126,)


In [39]:
# Works like a SQL GROUP BY: all the data is used, but time steps that share a
# common key (here the season) are grouped together. Each group is then
# averaged over the time dimension, so "time" disappears and "station" is kept.
test1 = nysm_ds.groupby("time.season").mean(dim="time")

  out /= nanlen(group_idx, array, size=size, axis=axis, fill_value=0)


In [43]:
# Display the per-season averages computed in the previous cell
test1

In [38]:
# groupby() with no reduction applied returns a grouping object rather than a
# dataset; check its type.
test2 = nysm_ds.groupby(group="time.season")
type(test2)

xarray.core.groupby.DatasetGroupBy

In [10]:
# Keep only the time steps whose month number is 4, using a boolean mask
# built from the time coordinate.
ds_sub = nysm_ds.sel(time=(nysm_ds["time"].dt.month == 4))

In [13]:
# Date component of each value of the time coordinate (the same "time.date"
# key used by the groupby above)
nysm_ds.time.dt.date

In [99]:
# REFERENCE -- Check the rolling average for a single site, to make sure the
# values are not getting mixed up between sites.

# SUBSET DATASET: https://stackoverflow.com/questions/38846323/python-xarray-dataset-sel-select-multiple-values-along-one-dimension
# way 1: boolean mask with where(); drop=True removes the stations that do not match
y = ds_avg_by_site_date.where(
    ds_avg_by_site_date["station"] == "WOLC",
    drop=True,
)  # , "date" :'2020-08-06'})
y

# way 2 (using .sel; currently blocked by a datetime format issue)
# g = ds_avg_by_site_date.sel(ds_avg_by_site_date.date == dt.date(2020, 7, 31)) # DATETIME ISSUE
# g

# Show the temperatures for WOLC; they are averaged by hand in the cell below
# to check the rolling result.
y["temp_2m [degC]"]

In [8]:
# Hand check of the rolling mean: add up five daily WOLC temperatures copied
# from the output above (presumably the first five dates) and divide by the
# window size.
num = (
    23.8
    + 22.29583333
    + 25.55173611
    + 23.64409722
    + 20.59097222
)
num / 5

23.176527776

In [9]:
# Compare the hand-computed mean with the first non-NaN rolling value for WOLC
# rolling_5day_avg
rolling_5day_avg.where(
    rolling_5day_avg["station"] == "WOLC",
    drop=True,
)  # confirmed! the first value is 23.18