In [None]:
# Render matplotlib figures inline, directly in the notebook output.
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
import geopandas as gpd
import urllib
import xarray as xr

# In Class Practice #9: Xarray
---
In this practice, we will learn how to use `xarray` to create a `DataArray` and a `Dataset`, conduct basic data analysis, and learn about visualization!


# !!! Download the data first!!!
https://drive.google.com/drive/folders/1CgrxWaIGJjlHrv2ITiRqW4T1YybdTSip?usp=sharing

### Today, we are going to learn how to use real-world data, the ERA5 dataset!
ERA5 is short for ECMWF Reanalysis v5. <br>
ERA5 is the fifth generation ECMWF atmospheric reanalysis of the global climate covering the period from January 1940 to present. ERA5 is produced by the Copernicus Climate Change Service (C3S) at ECMWF. <br>
ERA5 provides hourly estimates of a large number of atmospheric, land and oceanic climate variables. The data cover the Earth on a 31km grid and resolve the atmosphere using 137 levels from the surface up to a height of 80km. ERA5 includes information about uncertainties for all variables at reduced spatial and temporal resolutions. <br>

# NOTE: We have a total of **5** practices in this training, including [#3.2.](#3.2.), [#5.1](#5.1.), [#5.3.1.](#5.3.1), [#6.1.](#6.1.), [#7.1.](#7.1.)

# 0. Before we dive into the real data, let's first learn how to create a data array!
First, let us learn how to generate a `DataArray` and a `Dataset`!

In [None]:
# Build a synthetic 3-D array that stands in for 365 days of daily
# precipitation over a small domain
# (latitude dimension: 5, longitude dimension: 21).
# A seeded generator is used so that the values are identical after
# every "Restart Kernel -> Run All".
rng = np.random.default_rng(seed=2023)
numpy_array = rng.random((365, 5, 21))

# Coordinates for each dimension of the array:
# one time stamp per day of 2023, and evenly spaced latitudes/longitudes
date_list = pd.date_range("2023-01-01", periods=365, freq='1D')
lat = np.linspace(40, 41, 5)
lon = np.linspace(-94, -90, 21)

In [None]:
# Build the DataArray from three ingredients:
#   1. the values: an N-D numpy array (or list)
#   2. the name of each dimension, in axis order
#   3. the coordinates of each dimension
#      (if not specified, they will be numeric, i.e., 0, 1, 2, ...)
da = xr.DataArray(
    data=numpy_array,
    coords={'time': date_list, 'lat': lat, 'lon': lon},
    dims=['time', 'lat', 'lon'],
)

In [None]:
# Display the DataArray created in the previous cell
da

## 0.1 How can we generate a Dataset?

In [None]:
# A Dataset is a collection of named DataArrays: wrap the array in a
# mapping of {variable name: DataArray}.
# Note: the name `ds` is rebound in section 1, when the ERA5 file is opened.
ds = xr.Dataset(data_vars={'precip': da})

In [None]:
# Display the one-variable Dataset created in the previous cell
ds

In [None]:
# A second synthetic variable ("air temperature") on the same
# time/lat/lon grid as the precipitation array.
# The generator is seeded so the values are reproducible on every run.
numpy_array_airT = np.random.default_rng(seed=2024).random((365, 5, 21))
da_airT = xr.DataArray(numpy_array_airT,
                       dims=['time', 'lat', 'lon'],
                       coords={'time': date_list,
                               'lat': lat,
                               'lon': lon})

In [None]:
# Combine both DataArrays into a single Dataset; each dictionary key
# becomes the name of a data variable
two_variables = {'precip': da, 'airT': da_airT}
ds_two_var = xr.Dataset(data_vars=two_variables)

In [None]:
# Display the two-variable Dataset ('precip' and 'airT')
ds_two_var

# 1. How do we read `NetCDF` files?

We use `xr.open_dataset` to open `NetCDF` files.

#### Here we take a look at the monthly averaged 2-m air temperature for year 2022.

In [None]:
# Open the ERA5 monthly averaged 2-m air temperature file for 2022.
# The path is relative, so the downloaded file must be in the current
# working directory.
# Note: this rebinds `ds`, replacing the synthetic Dataset from section 0.
ds = xr.open_dataset('e5.moda.an.sfc.128_167_2t.ll025sc.2022.conus.nc')

In [None]:
# Let's first take a look at the dataset we just opened
# (its dimensions, coordinates, data variables and attributes)
ds

## Compared with opening `csv` files using `pandas`, we do not have to specify `parse_dates`, because the date information is usually embedded in `NetCDF` files.

# 2. Data slicing

In [None]:
# How do we select one variable from the dataset?
# Index the Dataset with the variable name, just like a dictionary.
ds['VAR_2T']

In [None]:
# How can we look at the attributes (metadata) of this variable?
ds['VAR_2T'].attrs

In [None]:
# How do we get the "long_name" attribute?
# `.attrs` is indexed by attribute name.
ds['VAR_2T'].attrs['long_name']

## 2.1. Selecting certain time periods
We can directly use `.sel(time="time")`

In [None]:
# Label-based selection: pick the time step whose label is 2022-01-01
ds['VAR_2T'].sel({'time': "2022-01-01"})

Or we can use `isel(time=int)` to select the i-th time step

In [None]:
# Position-based selection: pick the first time step (index 0)
ds['VAR_2T'].isel({'time': 0})

#### Similarly, we can of course select a period of time!
#### How about we select the three summer months, JJA?

In [None]:
# Label-based range selection: both ends of the slice are included
ds['VAR_2T'].sel({'time': slice("2022-06-01", "2022-08-31")})

In [None]:
# Position-based range selection: time indices 5, 6 and 7 (the stop
# index 8 is excluded), i.e. June-August when index 0 is January
ds['VAR_2T'].isel({'time': slice(5, 8)})

## 2.2. Select a sub-region?
In many applications, we only focus on a small region in the world. 

For example, we specifically want to look at the temperatures in the Great Lakes region.

The latitude ranges from **41N to 49N**.

The longitude ranges from **93W to 74W**.

In [None]:
# Subset the Great Lakes box (41N-49N, 93W-74W).
#   1. Longitude runs from 0 to 360 in the dataset, so a western
#      longitude of x degrees is written as 360 - x.
#   2. Latitude is in descending order, so the slice goes from
#      north (49) to south (41).
ds_great_lakes = ds['VAR_2T'].sel(
    {'latitude': slice(49, 41),
     'longitude': slice(360 - 93, 360 - 74)}
)
ds_great_lakes

# 3. How do we quickly visualize `NetCDF` files?
Let's first take a look at what the January mean temperature looks like!

In [None]:
# Open a high-resolution figure, then map the first time step
# (January 2022) of the Great Lakes subset onto it
plt.figure(dpi=300)
ds_great_lakes.isel({'time': 0}).plot()

## From the plot above, we can clearly see that the Great Lakes are slightly warmer than nearby land regions in January 2022.

### 3.1. What if we want to quickly look at the temperatures for all months?

In [None]:
# Facet plot: one map per time step, wrapped into rows of 3 panels
ds['VAR_2T'].plot(col="time", col_wrap=3,
                  x='longitude', y='latitude')

## 3.2. 
Practice #1: Plot the temperature maps for all months, but with 6 columns

In [None]:
# INSERT YOUR CODE HERE FOR SECTION 3.2

# 4. How can we concatenate multi-year data?

In [None]:
# Let's first read in the data for the years before 2022 (2000-2021).
# The path is relative, so the file must be in the current working directory.
ds_pre2022 = xr.open_dataset("e5.moda.an.sfc.128_167_2t.ll025sc.2000_2021.conus.nc")

In [None]:
# Join the 2000-2021 data and the 2022 data end to end.
# The dimension to concatenate along ('time') must be given explicitly.
ds_concat = xr.concat(objs=[ds_pre2022, ds], dim='time')

In [None]:
# Display the concatenated multi-year Dataset
ds_concat

# 5. `Resample` v.s. `Groupby`

`Resample` aims to change the frequency of the dataset <br>
`Groupby` aims to put data into corresponding groups

## 5.1. 

Practice #2: Please answer the following questions

1. Given 10 years of daily temperature data, we were asked to calculate the mean temperature for each month. Should we use `Resample` or `Groupby`?

2. After we get the monthly averages, we were asked to calculate the mean monthly temperature across all 10 years. Should we use `Resample` or `Groupby`?

#### Type your answer in this markdown cell:




---

## 5.2. Syntax for `Resample`

Let's first calculate the mean seasonal temperature.

Usually, Spring includes March April May. Naturally, Summer (JJA), Fall (SON), Winter (DJF).

In [None]:
# Every season must contain all three of its months, so the data
# are truncated to run from December 2000 to November 2022.
# NOTE(review): the end bound "2022-11-01" keeps November 2022 only if
# the monthly time stamps fall on the first of the month -- confirm.
ds_concat_sel = ds_concat.sel({'time': slice("2000-12-01", "2022-11-01")})

To use `resample`, we would need to specify a `dictionary` object.


`ds.resample({"time dimension name": "frequency"})`

Commonly used values for `frequency` include: `D` - daily, `M` - monthly (spelled `ME` in pandas ≥ 2.2), `QS-DEC` - seasonally (quarters starting in December)

In [None]:
# Average the monthly data over quarters that start in December,
# i.e. the seasons DJF, MAM, JJA and SON
ds_season = ds_concat_sel.resample(time="QS-DEC").mean()

In [None]:
# Display the seasonal means (one time step per season per year)
ds_season

## 5.3. Syntax for `groupby`

Let's calculate the mean seasonal temperature across the 22 years of data!

Since now we have all data for each season, now we only need to group the data by their season.

In [None]:
# The syntax for groupby is different from the syntax for resample:
# groupby takes the labels to group on. Here each seasonal mean is
# grouped by the season of its time stamp and averaged across years.
# For other kinds of groups, we can refer to this website:
# https://docs.xarray.dev/en/stable/generated/xarray.DataArray.groupby.html

ds_season_mean = ds_season.groupby("time.season").mean()

In [None]:
# Display the multi-year mean for each season
ds_season_mean

## 5.3.1. 

Practice #3: Quick visualization of the seasonal temperatures

Plot temperatures for four seasons (2 by 2 plots)

### What do you observe about the 2-meter temperature above the Great Lakes compared to the 2-meter temperature in nearby land areas?

In [None]:
# INSERT YOUR CODE HERE

# 6. Regional average
We want to know whether there is a statistically significant trend in mean annual temperatures in the US. 

First, we calculate the mean annual temperatures. Should we use `resample` or `groupby`?

In [None]:
# "YS" (year start) and "YE" (year end) only differ in the time stamp
# attached to each annual mean: with "YS" the value for year 2000 is
# labelled "2000-01-01", with "YE" it is labelled "2000-12-31".
# Both give the same annual mean value for year 2000.
ds_annual = ds_concat.resample(time='YS').mean()

### Then we calculate the regional average temperatures 

In [None]:
# Area-weighted regional mean of the annual temperatures.
# On a regular latitude/longitude grid the cells get smaller towards
# the poles, so each grid point is weighted by cos(latitude); a plain
# .mean() over latitude/longitude would over-represent the northern rows.
# (Assumes 'latitude' is given in degrees on a regular grid.)
lat_weights = np.cos(np.deg2rad(ds_annual['latitude']))
ds_annual_mean = (
    ds_annual['VAR_2T']
    .weighted(lat_weights)
    .mean(dim=['latitude', 'longitude'])
)

In [None]:
# Quick visualization of the regional-mean annual temperature series
ds_annual_mean.plot()

## 6.1. 

Practice #4: Did you observe a statistically significant increasing trend in the mean annual air temperature?

Please conduct a hypothesis test for the slope of a linear regression (`annual mean temperature` versus `year`) at the 5% significance level (i.e., 95% confidence).

In [None]:
# INSERT YOUR CODE HERE

# 7. Can we plot the gridded dataset together with shapefiles?

In [None]:
import pyproj
import cartopy.crs as ccrs
import cartopy.feature as cfeature

In [None]:
# Average the annual means over every year, leaving one
# long-term mean temperature value per grid cell
da_annual_mean = ds_annual['VAR_2T'].mean('time')

## 7.1. `da.plot.pcolormesh()`

In [None]:
plt.figure(dpi=300)
# First, define the CRS of the target map and create a map axis with it
proj = ccrs.Mercator()
ax = plt.axes(projection=proj)

# Set the extent of the map: [lon_min, lon_max, lat_min, lat_max]
# NOTE(review): no `crs` is passed, so cartopy applies its default
#       interpretation of these numbers; pass crs=ccrs.PlateCarree()
#       if an exact lat/lon box is required -- confirm against the docs.
ax.set_extent([-135, -55, 22, 48])

# To plot a gridded dataset we use da.plot.pcolormesh(x=..., y=...).
# x and y are passed as keywords because recent xarray versions
# deprecate positional arguments for plot methods.
# NOTE: `transform` specifies the CRS of the gridded dataset
#       (usually Plate Carree if we use lat/lon coordinates)
da_annual_mean.plot.pcolormesh(x='longitude', y='latitude', ax=ax,
                               transform=ccrs.PlateCarree(),
                               zorder=1)
ax.coastlines(linewidth=0.5, color='k', zorder=2)


## We can even quickly add the country boundaries or major lakes using `cartopy.feature`

In [None]:
# Same map as in the previous cell: Mercator axis, extent given as
# [lon_min, lon_max, lat_min, lat_max], data in Plate Carree coordinates
plt.figure(dpi=300)
proj = ccrs.Mercator()
ax = plt.axes(projection=proj)
ax.set_extent([-135, -55, 22, 48])
# x and y are passed as keywords because recent xarray versions
# deprecate positional arguments for plot methods.
da_annual_mean.plot.pcolormesh(x='longitude', y='latitude', ax=ax,
                               transform=ccrs.PlateCarree(),
                               zorder=1)
ax.coastlines(linewidth=0.5, color='k', zorder=2)
#
# Here we can use 'cfeature.BORDERS' for adding country boundaries
#
ax.add_feature(cfeature.BORDERS, edgecolor='silver',
               facecolor="none", lw=0.8, zorder=3)
#
# Here we can use 'cfeature.LAKES' for adding major lakes globally
#
ax.add_feature(cfeature.LAKES, edgecolor='none',
               facecolor="dodgerblue", zorder=3)

# 7.1. 

Practice #5: Zoom in the plot above to focus on New York State

1. Use the code in above section as an example.
2. Please adjust the extent of the map to focus on New York State
3. It is ok to keep the county boundaries (Please use "transparent" facecolor, and "lightgray" edgecolor)
4. Add grid lines to show latitude/longitude coordinates

In [None]:
import matplotlib.ticker as mticker

In [None]:
# Read the GeoPackage file with geopandas; judging by its name it
# presumably holds the New York State county polygons (verify after loading).
# The path is relative, so the file must be in the current working directory.
ny = gpd.read_file('NY_counties.gpkg')

In [None]:
# INSERT YOUR CODE HERE 