# Import packages

In [1]:
import pandas as pd, numpy as np
import dypy.netcdf as dn
import dypy.intergrid as ig
import os
import re
from constants import *

# Define local constants

In [2]:
# Index tuple passed to dn.read_var: the two leading axes are taken whole, the
# last two are cut to the configured latitude / longitude window (upper bounds
# inclusive, hence the +1).
# NOTE(review): the +90 / +180 offsets presumably map degrees onto indices of a
# 1-degree global grid starting at -90 / -180 -- confirm against the ERA files.
INDEX_ERA = (
    slice(None),
    slice(None),
    slice(LAT_MIN_ERA + 90, LAT_MAX_ERA + 90 + 1),
    slice(180 + LON_MIN_ERA, 180 + LON_MAX_ERA + 1),
)

# [lat, lon] corners of the window; used as lo / hi bounds of the interpolator.
SOUTH_WEST_POINT = np.array((LAT_MIN_ERA, LON_MIN_ERA))
NORTH_EAST_POINT = np.array((LAT_MAX_ERA, LON_MAX_ERA))

# Every [lat, lon] combination of LATS_CESM x LONS_CESM, latitude varying slowest.
QUERY_POINTS = [[lat, lon] for lat in LATS_CESM for lon in LONS_CESM]
# String labels used in the column names: each coordinate multiplied by 100 and
# truncated to an integer.
QUERY_POINTS_LABELS = [
    [str(int(100 * lat)), str(int(100 * lon))] for lat, lon in QUERY_POINTS
]

# Define functions

In [3]:
def get_interpolated_variables(grid_values, index_level):
    """Interpolate one slice of ``grid_values`` onto ``QUERY_POINTS``.

    Parameters
    ----------
    grid_values : indexable
        Gridded values; ``grid_values[index_level]`` is the field handed to
        the interpolator.
        NOTE(review): presumably a numpy array whose leading axis is the
        vertical level -- confirm against the callers.
    index_level : int
        Position along the leading axis of ``grid_values`` to select.

    Returns
    -------
    The result of evaluating an ``ig.Intergrid`` interpolator, bounded by
    ``SOUTH_WEST_POINT`` and ``NORTH_EAST_POINT``, at every entry of
    ``QUERY_POINTS``.
    """
    level_field = grid_values[index_level]
    interpolator = ig.Intergrid(
        level_field,
        lo=SOUTH_WEST_POINT,
        hi=NORTH_EAST_POINT,
        verbose=False,
    )
    interpolated_values = interpolator(QUERY_POINTS)
    return interpolated_values

# Recursively get all relevant file paths

In [4]:
rootdir = "/net/bio/atmosdyn/erainterim/cdf/"

# Pattern for the file names to keep: names starting with "P" followed by word
# characters. Written as a raw string because "\w" in a plain string literal is
# an invalid escape sequence (it triggers a warning on current Python versions),
# and compiled once instead of once per visited directory.
regex = re.compile(r"P\w+")

relevant_file_paths = []
for root, _subdirs, files in os.walk(rootdir):
    path_parts = root.split("/")
    # With the rootdir above, 8 "/"-separated parts means a directory two
    # levels below rootdir; part 6 is the name of the first-level directory.
    # NOTE(review): presumably the layout is <year>/<month> -- confirm.
    if len(path_parts) != 8 or path_parts[6] == "1979":
        continue
    relevant_file_paths.extend(
        os.path.join(root, netcdf_file)
        for netcdf_file in files
        if regex.match(netcdf_file)
    )

# os.walk yields directories and files in arbitrary order; sorting makes the
# order of the collected paths (and therefore of the resulting rows) reproducible.
relevant_file_paths.sort()
print(len(relevant_file_paths))

57952


# Read in all relevant ERAI files

In [5]:
# Column names of the final table. The order here has to match the order in
# which the per-file values are collected in the reading loop: date, sea-level
# pressure, "900" temperature, then Z/T/Q/U/V for each of the levels 850/700/500,
# each group holding one entry per query point.
feature_list_names = ["date"]
feature_list_names += [
    f"SLP_{query_point[0]}_{query_point[1]}_sealevel"
    for query_point in QUERY_POINTS_LABELS
]
feature_list_names += [
    f"T_{query_point[0]}_{query_point[1]}_900"
    for query_point in QUERY_POINTS_LABELS
]
for pressure_level in ("850", "700", "500"):
    for variable in ("Z", "T", "Q", "U", "V"):
        feature_list_names += [
            f"{variable}_{query_point[0]}_{query_point[1]}_{pressure_level}"
            for query_point in QUERY_POINTS_LABELS
        ]
print(len(feature_list_names))

1769


In [None]:
# One row per P file: the datetime string followed by all interpolated values,
# in the same order as the column names in feature_list_names.
rows_list = []
for file_path in relevant_file_paths:
    # Lightweight progress indicator: fraction of files processed so far
    print(len(rows_list)/len(relevant_file_paths), end="\r", flush=True)
    # Last 11 characters of the path hold the datetime string (parsed later
    # with the format "%Y%m%d_%H")
    feature_list = [file_path[-11:]]

    SLP, = dn.read_var(file_path, ['SLP'], index=INDEX_ERA)

    # SLP gets an extra leading axis so that index_level=0 selects the whole field
    feature_list.extend(get_interpolated_variables(grid_values=np.expand_dims(SLP, 0), index_level=0))

    # Switch to the corresponding Z file. Only the leading "P" of the file name
    # is swapped: str.replace("P", "Z") on the full path would also rewrite any
    # other "P" it contains, including one in a directory name.
    directory, p_file_name = os.path.split(file_path)
    z_file_path = os.path.join(directory, "Z" + p_file_name[1:])

    Z, T, Q, U, V = dn.read_var(z_file_path, ['Z', 'T', 'Q', 'U', 'V'], index=INDEX_ERA)

    # Index 0 along the leading axis feeds the T_*_900 columns
    feature_list.extend(get_interpolated_variables(grid_values=T, index_level=0))

    # Indices 1, 3 and 5 feed the *_850, *_700 and *_500 columns respectively;
    # within each level the variables follow the column order Z, T, Q, U, V.
    # NOTE(review): the index -> level mapping is taken from the column labels;
    # confirm it against the level axis of the Z files.
    for index in (1, 3, 5):
        for field in (Z, T, Q, U, V):
            feature_list.extend(get_interpolated_variables(grid_values=field, index_level=index))

    rows_list.append(feature_list)

0.02591800110436223557

In [None]:
# Assemble the collected rows into a table with the prepared column names
df = pd.DataFrame(data=rows_list, columns=feature_list_names)
# Turn the "date" strings (year, month, day, underscore, hour) into datetimes
df["date"] = pd.to_datetime(
    df["date"],
    format="%Y%m%d_%H",
)
print(df.shape)
df.head()

In [None]:
# Write recipe outputs: the full feature table as CSV without the row index.
# pandas infers gzip compression from the ".gz" suffix by default.
df.to_csv(
    "/net/litho/atmosdyn2/chmony/data/MeteorologicalData/ERAI_data.csv.gz",
    index=False,
)