# Import packages

In [1]:
import pandas as pd, numpy as np
import dypy.netcdf as dn
import dypy.intergrid as ig
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import ndimage
import os
import re

# Define constants

In [2]:
# Bounds of the study domain in the W-E direction (compared against the grid's "lon" values)
LON_MIN_ERA, LON_MAX_ERA = 0, 15

# Bounds of the study domain in the S-N direction (compared against the grid's "lat" values)
LAT_MIN_ERA, LAT_MAX_ERA = 42, 50

In [3]:
# Reference file whose "lon"/"lat" variables define the target grid points.
path = "/net/litho/atmosdyn/INTEXseas/cesm/cesm112_LENS/b.e112.B20TRLENS.f09_g16.ethz.001/archive/atm/hist/b.e112.B20TRLENS.f09_g16.ethz.001.cam.h2.1990-01-01-21600.nc"

lons, lats = dn.read_var(path, ["lon", "lat"])

# Positions of the grid coordinates lying inside the bounding box (bounds inclusive).
lon_inside = (lons >= LON_MIN_ERA) & (lons <= LON_MAX_ERA)
lat_inside = (lats >= LAT_MIN_ERA) & (lats <= LAT_MAX_ERA)
xindex = np.nonzero(lon_inside)[0]
yindex = np.nonzero(lat_inside)[0]
print("lons: " + str(lons[xindex]))
print("lats: " + str(lats[yindex]))

xmin = xindex.min()
xmax = xindex.max()
ymin = yindex.min()
ymax = yindex.max()

# Slice tuple covering the selected lat/lon window on the last two axes.
index = (slice(None), slice(None), slice(ymin, ymax + 1), slice(xmin, xmax + 1))

# Lower / upper corner of the bounding box, ordered (lat, lon).
lo = np.array([LAT_MIN_ERA, LON_MIN_ERA])
hi = np.array([LAT_MAX_ERA, LON_MAX_ERA])

# Every (lat, lon) combination of the selected grid coordinates, latitude varying slowest.
QUERY_POINTS = [
    [lat, lon]
    for lat in lats[yindex]
    for lon in lons[xindex]
]
# String labels: each coordinate multiplied by 100 and truncated to an integer.
QUERY_POINTS_LABELS = [
    [str(int(100 * lat)), str(int(100 * lon))]
    for lat, lon in QUERY_POINTS
]
print(QUERY_POINTS_LABELS)

lons: [ 0.    1.25  2.5   3.75  5.    6.25  7.5   8.75 10.   11.25 12.5  13.75
 15.  ]
lats: [42.87958115 43.82198953 44.76439791 45.70680628 46.64921466 47.59162304
 48.53403141 49.47643979]
[['4287', '0'], ['4287', '125'], ['4287', '250'], ['4287', '375'], ['4287', '500'], ['4287', '625'], ['4287', '750'], ['4287', '875'], ['4287', '1000'], ['4287', '1125'], ['4287', '1250'], ['4287', '1375'], ['4287', '1500'], ['4382', '0'], ['4382', '125'], ['4382', '250'], ['4382', '375'], ['4382', '500'], ['4382', '625'], ['4382', '750'], ['4382', '875'], ['4382', '1000'], ['4382', '1125'], ['4382', '1250'], ['4382', '1375'], ['4382', '1500'], ['4476', '0'], ['4476', '125'], ['4476', '250'], ['4476', '375'], ['4476', '500'], ['4476', '625'], ['4476', '750'], ['4476', '875'], ['4476', '1000'], ['4476', '1125'], ['4476', '1250'], ['4476', '1375'], ['4476', '1500'], ['4570', '0'], ['4570', '125'], ['4570', '250'], ['4570', '375'], ['4570', '500'], ['4570', '625'], ['4570', '750'], ['4570', '875'], [

# Define functions

In [4]:
def get_interpolated_variables(grid_values, index_level):
    """Interpolate one level of a gridded field onto ``QUERY_POINTS``.

    Parameters
    ----------
    grid_values
        Field indexed as ``grid_values[index_level][rows, cols]``.
    index_level
        Position along the first axis selecting the 2-D slice to use.

    Returns
    -------
    Whatever the ``ig.Intergrid`` interpolator returns when called with
    ``QUERY_POINTS``.

    Notes
    -----
    Reads the module-level ``lo``, ``hi``, ``QUERY_POINTS`` and the
    ``LAT_*_ERA`` / ``LON_*_ERA`` bounds.
    """
    # NOTE(review): the +90 / +180 offsets presumably assume a 1-degree global grid
    # whose rows start at -90 and columns at -180 -- confirm against the input files.
    row_window = slice(LAT_MIN_ERA + 90, LAT_MAX_ERA + 90 + 1)
    col_window = slice(180 + LON_MIN_ERA, 180 + LON_MAX_ERA + 1)

    level_field = grid_values[index_level]
    interpolator = ig.Intergrid(level_field[row_window, col_window], lo=lo, hi=hi, verbose=False)
    return interpolator(QUERY_POINTS)

# Recursively get all relevant file paths

In [5]:
rootdir = "/net/bio/atmosdyn/erainterim/cdf/"

# Raw string: "\w" inside a normal string literal is an invalid escape sequence.
# Compiled once here instead of once per visited directory.
regex = re.compile(r'P\w+')

relevant_file_paths = []
for root, subdirs, files in os.walk(rootdir):
    # Sorting in place makes os.walk descend in a deterministic order.
    subdirs.sort()

    # Only directories exactly two levels below rootdir are scanned
    # (presumably <year>/<month>), and the first level "1979" is skipped.
    # Using the path relative to rootdir keeps the check valid if rootdir moves.
    relative_parts = os.path.relpath(root, rootdir).split(os.sep)
    if len(relative_parts) != 2 or relative_parts[0] == "1979":
        continue

    # Keep files whose name starts with "P" followed by at least one word character.
    relevant_file_paths.extend(
        os.path.join(root, netcdf_file)
        for netcdf_file in sorted(files)
        if regex.match(netcdf_file)
    )

# Read in all relevant ERAI files

In [6]:
# Column names for the feature table: "date", then one column per query point for
# SLP, for T labelled "900", and for each of Z/T/Q/U/V at the 850, 700 and 500 labels.
feature_list_names = ["date"]
feature_list_names += [
    f"SLP_{lat_label}_{lon_label}_sealevel"
    for lat_label, lon_label in QUERY_POINTS_LABELS
]
feature_list_names += [
    f"T_{lat_label}_{lon_label}_900"
    for lat_label, lon_label in QUERY_POINTS_LABELS
]
for pressure_level in ["850", "700", "500"]:
    for variable in ['Z', 'T', 'Q', 'U', 'V']:
        feature_list_names += [
            f"{variable}_{lat_label}_{lon_label}_{pressure_level}"
            for lat_label, lon_label in QUERY_POINTS_LABELS
        ]
print(len(feature_list_names))

1769


In [None]:
def parallelize_reading():
    """Placeholder for a parallel version of the file-reading step.

    The original definition had no body, which is a syntax error when the
    cell is executed. Until it is written, calling this raises explicitly.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError("parallelize_reading is not implemented yet")

In [7]:
# Build one feature row per P file: the date string followed by the interpolated
# values, in the same order as the column names in feature_list_names.
rows_list = []
for file_path in relevant_file_paths:
    print(file_path, end="\r")  # progress line, overwritten in place
    feature_list = [file_path[-11:]]  # trailing 11 characters, parsed below with '%Y%m%d_%H'

    SLP, = dn.read_var(file_path, ['SLP'])

    # SLP gets a leading axis so that index_level=0 selects the whole field.
    feature_list.extend(get_interpolated_variables(grid_values=np.expand_dims(SLP, 0), index_level=0))

    # Switch to the corresponding Z file. Only the leading character of the file
    # name is swapped; str.replace("P", "Z") would also rewrite any other "P"
    # appearing in the directory part of the path.
    directory, p_file_name = os.path.split(file_path)
    z_file_path = os.path.join(directory, "Z" + p_file_name[1:])

    Z, T, Q, U, V = dn.read_var(z_file_path, ['Z', 'T', 'Q', 'U', 'V'])

    # T at level position 0 (the column labelled "900").
    feature_list.extend(get_interpolated_variables(grid_values=T, index_level=0))

    # Level positions 1, 3, 5 correspond to the columns labelled "850", "700", "500".
    # The loop variable is deliberately not called `index`, so the module-level
    # `index` slice tuple is not overwritten.
    for level_index in (1, 3, 5):
        for field in (Z, T, Q, U, V):
            feature_list.extend(get_interpolated_variables(grid_values=field, index_level=level_index))

    rows_list.append(feature_list)

df = pd.DataFrame(rows_list, columns=feature_list_names)
df["date"] = pd.to_datetime(df["date"], format='%Y%m%d_%H')
df.head()

/net/bio/atmosdyn/erainterim/cdf/1980/06/P19800607_00

KeyboardInterrupt: 

In [None]:
# Save the assembled feature table to disk, leaving out the DataFrame index.
df.to_csv(
    path_or_buf="/net/litho/atmosdyn2/chmony/data/MeteorologicalData/ERAI_data.csv.gz",
    index=False,
)