# How to read data and extract information about it

### Load the modules/libraries that are needed and useful

In [2]:
import numpy as np
import xarray as xr
import pandas as pd

### In Python, # denotes a comment. Anything following a # on the same line is ignored by the Python interpreter.

In [3]:
# Open the data file: assemble its full path from directory + file name
DIR = '/nfs/spare11/env315/data/'
filename = 'era5_an_temp_reg2_daily_2021.nc'
datafile = f'{DIR}{filename}'

# Open the dataset and call .compute() on it so that `data` holds the
# computed result rather than the object returned by open_mfdataset.
data = xr.open_mfdataset(
    datafile,
    combine='by_coords',
).compute()

# A bare name on the last line of a notebook cell displays the object
data

## Study xarray and data structure

In [None]:
# Get the coordinates of the dataset, and the "lat" coordinate on its own
coords = data.coords
lat_coord = data.coords["lat"]

# Print, in order: all coordinates, the latitude coordinate via the saved
# variable, and the same coordinate via a dictionary-style lookup.
# Each printout is followed by a separator line.
for shown in (coords, lat_coord, coords['lat']):
    print(shown, '***************', sep='\n')

In [None]:
# Get the dimension information of the dataset through its two attributes

# .dims
dims = data.dims
print(dims, '***************', sep='\n')

# .sizes
sizes = data.sizes
print(sizes, '***************', sep='\n')

In [None]:
# Keep only the raw values of the latitude coordinate
lat = lat_coord.values

# Print, each followed by a separator line:
#   1. the data type of latitude
#   2. the size (shape) of latitude
#   3. the latitude values themselves
for shown in (type(lat), np.shape(lat), lat):
    print(shown, '***************', sep='\n')

## Get familiar with NumPy arrays in Python

#### A NumPy array is a grid-like data structure that holds a collection of elements, all of the same data type (e.g., numbers, strings). It provides a way to store and manipulate large amounts of data in a structured and efficient manner.

#### NumPy provides a wide range of mathematical functions that can be applied to arrays, such as element-wise operations, linear algebra, and statistical functions.

In [None]:
# index and values of NumPy array (indices start at 0)
print(lat[0]) # first value of data array "lat"
print(lat[-1]) # last value of data array "lat" (negative indices count from the end)
print(lat[len(lat) - 1]) # also last value of data array "lat": the last index is always len - 1
print(lat[0:10]) # print the first 10 values of data array "lat"
## note that the stop index of a slice is excluded: lat[0:10] returns
## lat[0] through lat[9] (the 10th value) and does not contain lat[10]

In [None]:
# Longitude: the coordinate object, then its raw values
lon_coord = data.coords["lon"]
lon = lon_coord.values

# size of longitude
print(np.shape(lon), '***************', sep='\n')

# the 10th value of longitude (index 9), preceded by its type
tenth_lon = lon[9]
print(type(tenth_lon), tenth_lon)
print('***************')

# all longitude values (original note: spacing is 2deg)
print(lon, '***************', sep='\n')

# difference between the first two longitude values
print(lon[1] - lon[0])

In [None]:
# extract plev coordinate
lev_coord = data.coords["plev"]

# extract plev values
lev = lev_coord.values

# print the size of plev
print(np.shape(lev))
print('***************')

# print the 1st and last values of plev
# (the last value is shown twice: via the negative index -1 and via the
#  explicit last index len(lev) - 1, which works for any number of levels)
print(lev[0],lev[-1],lev[len(lev) - 1])
print(type(lev[0]))
print('***************')

# print the values of plev
print(lev)
print('***************')

In [None]:
# Time: the coordinate object, then its raw values
time_coord = data.coords["time"]
times = time_coord.values

# type and size of the time array
times_type, times_shape = type(times), np.shape(times)
print(times_type, times_shape)
print('***************')

# type and value of the 1st time entry
first_time = times[0]
print(type(first_time), first_time)
print('***************')

### Sometimes we want to analyze the data only for a specific region, pressure level, or time. When we work with high-dimensional data, we often fix certain dimensions in order to analyze the others. In these cases, we need to select the subset of the data we want.

In [None]:
# select latitude of the Northern Hemisphere
# there are two ways to do this
# method 1: using a NumPy boolean mask (keeps the elements where lat >= 0)
lat_nh = lat[lat >= 0]
print(type(lat_nh),lat_nh)
print('***************')

# method 2: using xarray label-based selection
# NOTE(review): slice(90, 0) runs from north to south, which assumes the
# latitude coordinate is stored in descending order — confirm for this file
lat_nh = lat_coord.sel(lat=slice(90,0))
print(lat_nh)
print('***************')
# select latitude of 60S
# (southern latitudes are negative, so 60S is lat=-60; lat=60 would be 60N)
lat_60s = lat_coord.sel(lat=-60)
print(lat_60s)

In [None]:
# Select the continental United States (CONUS) using xarray, one
# coordinate at a time.
# NOTE(review): the longitude bounds 264.6-278.2 look narrower than the
# full CONUS extent — confirm they are the intended values.
lat_conus = lat_coord.sel(lat=slice(49, 24.5))
lon_conus = lon_coord.sel(lon=slice(264.6, 278.2))
# presumably 1000 hPa expressed in Pa — verify against the plev units
lev_1000 = lev_coord.sel(plev=100000)
time_1 = time_coord.sel(time='2021-01-23')

# Print the values of each selection; a separator line follows every
# printout except the last one.
for picked in (lat_conus, lon_conus, lev_1000):
    print(picked.values, '***************', sep='\n')
print(time_1.values)

In [None]:
# Subset the whole dataset in one call: a lat/lon box plus a single
# pressure level and a single date.
conus_indexers = {
    'lat': slice(49, 24.5),
    'lon': slice(264.6, 278.2),
    'plev': 100000,
    'time': '2021-01-23',
}
dat_conus = data.sel(**conus_indexers)

# display the selected subset
dat_conus

In [None]:
# Fix the pressure level and the date; lat and lon are left unselected
level_and_day = {'plev': 100000, 'time': '2021-01-23'}
dat_1 = data.sel(**level_and_day)

# display the selected subset
dat_1