# Import packages

In [1]:
import pandas as pd, numpy as np
import dypy.netcdf as dn
import dypy.intergrid as ig
import os
import re

# Custom functions
from utils_ERA import *

# Import constants from constants.py file in directory above
import sys
sys.path.append("..")
from constants import *

--- Avaliable variables ---
Base directory: BASE_DIR
Month names: MONTH_NAMES
Bounding box coordinates: LON_MIN, LON_MAX, LAT_MIN, LAT_MAX
ERAI coordinates: LONS_ERA, LATS_ERA
CESM coordinates: LONS_CESM, LATS_CESM
CESM slice: INDEX_CESM
CESM coordinates for plotting (string): LONS_CESM_STRING, LATS_CESM_STRING


# Define local constants

In [2]:
# Index expression applied when reading ERAI variables later on: the first two axes are
# kept whole, the last two are cut down to the bounding box (upper bounds inclusive).
# NOTE(review): the +90 / +180 offsets presumably map degrees onto a 1-degree global grid - confirm.
INDEX_ERA = (
    slice(None),
    slice(None),
    slice(LAT_MIN + 90, LAT_MAX + 90 + 1),
    slice(180 + LON_MIN, 180 + LON_MAX + 1),
)

# Corners of the bounding box as (lat, lon): lower left first, then upper right
SOUTH_WEST_POINT = np.array([LAT_MIN, LON_MIN])
NORTH_EAST_POINT = np.array([LAT_MAX, LON_MAX])

# Every (lat, lon) combination of the CESM coordinates; these are the interpolation targets
QUERY_POINTS = [[cesm_lat, cesm_lon] for cesm_lat in LATS_CESM for cesm_lon in LONS_CESM]

# String labels for the targets: each coordinate multiplied by 100 and truncated to an integer
QUERY_POINTS_LABELS = [
    [str(int(100*coordinate)) for coordinate in (point_lat, point_lon)]
    for point_lat, point_lon in QUERY_POINTS
]

# Recursively get all relevant file paths

In [3]:
# Base directory of the ERAI data
rootdir = "/net/bio/atmosdyn/erainterim/cdf/"

# File names that start with "P" followed by at least one word character.
# Written as a raw string so "\w" is not parsed as an (invalid) string escape,
# and compiled once instead of once per file.
P_FILE_PATTERN = re.compile(r"P\w+")

# Walk through all subdirs and only keep the P-files at the deepest level
relevant_file_paths = []
for root, subdirs, files in os.walk(rootdir):
    # Sorting in place makes os.walk descend in a deterministic order,
    # so the collected paths do not depend on filesystem listing order.
    subdirs.sort()

    path_parts = root.split("/")
    # Only directories whose path splits into exactly 8 parts contain the files of interest
    if len(path_parts) != 8:
        continue
    # Exclude the year 1979 (since no foehn data here); the year is the 7th path component
    if path_parts[6] == "1979":
        continue

    relevant_file_paths.extend(
        os.path.join(root, netcdf_file)
        for netcdf_file in sorted(files)
        if P_FILE_PATTERN.match(netcdf_file)
    )
print("Relevant P-files: ", len(relevant_file_paths))

Relevant P-files:  57952


# Read in all relevant ERAI files

In [4]:
# Build the column names up front: "date" first, then one name per combination of
# variable, query-point label (lat, lon) and pressure level.
# The order here has to match the order in which values are appended when reading the files.
feature_list_names = ["date"]
feature_list_names += [f"SLP_{lat_label}_{lon_label}_sealevel" for lat_label, lon_label in QUERY_POINTS_LABELS]
feature_list_names += [f"T_{lat_label}_{lon_label}_900" for lat_label, lon_label in QUERY_POINTS_LABELS]
for level_name in ("850", "700", "500"):
    for variable_name in ("Z", "T", "U", "V"):
        feature_list_names += [
            f"{variable_name}_{lat_label}_{lon_label}_{level_name}"
            for lat_label, lon_label in QUERY_POINTS_LABELS
        ]
print("Number of final features (excl. date): ", len(feature_list_names)-1)

Number of final features (excl. date):  1456


In [None]:
# Loop over all file paths from above (each path corresponds to one time point)
# and collect one row of interpolated features per file.
rows_list = []
n_files = len(relevant_file_paths)
for i, file_path in enumerate(relevant_file_paths):
    # Print progress as the fraction of files handled so far, overwriting the same output line
    print(i/n_files, end="\r", flush=True)

    # The last 11 characters of the path are the datetime string ("YYYYMMDD_HH");
    # it becomes the first entry of this time point's feature list
    feature_list = [file_path[-11:]]

    # Read SLP values and add to feature list.
    # NOTE(review): SLP presumably lacks the level axis the other variables have, hence the
    # added leading axis so that level index 0 can be used - confirm against utils_ERA.
    SLP, = dn.read_var(file_path, ['SLP'], index=INDEX_ERA)
    feature_list.extend(get_interpolated_variables(np.expand_dims(SLP,0), 0, SOUTH_WEST_POINT, NORTH_EAST_POINT, QUERY_POINTS))

    # Switch to the corresponding Z file (where the other variables are).
    # Only the leading "P" of the file name is swapped; a plain str.replace("P", "Z")
    # would also rewrite any "P" occurring in the directory part of the path.
    directory, p_file_name = os.path.split(file_path)
    z_file_path = os.path.join(directory, "Z" + p_file_name[1:])

    # Read other variables
    Z,T,U,V = dn.read_var(z_file_path, ['Z','T','U','V'], index=INDEX_ERA)

    # Append temperature at 900 hPa (equivalent to 0 as index level)
    feature_list.extend(get_interpolated_variables(T, 0, SOUTH_WEST_POINT, NORTH_EAST_POINT, QUERY_POINTS))

    # Level indices 1, 3 and 5 correspond to 850, 700 and 500 hPa.
    # Levels and variables (Z, T, U, V) must be visited in the same order as the feature names were built.
    for level_index in (1, 3, 5):
        for field in (Z, T, U, V):
            feature_list.extend(get_interpolated_variables(field, level_index, SOUTH_WEST_POINT, NORTH_EAST_POINT, QUERY_POINTS))

    # Add list to final list of samples
    rows_list.append(feature_list)

In [8]:
# Turn the collected rows into a dataframe with the prepared column names
df = pd.DataFrame(data=rows_list, columns=feature_list_names)

# Parse the "YYYYMMDD_HH" strings of the date column into proper datetimes
parsed_dates = pd.to_datetime(df["date"], format="%Y%m%d_%H")
df["date"] = parsed_dates

df.head()

Unnamed: 0,date,SLP_4287_0_sealevel,SLP_4287_125_sealevel,SLP_4287_250_sealevel,SLP_4287_375_sealevel,SLP_4287_500_sealevel,SLP_4287_625_sealevel,SLP_4287_750_sealevel,SLP_4287_875_sealevel,SLP_4287_1000_sealevel,...,V_4947_375_500,V_4947_500_500,V_4947_625_500,V_4947_750_500,V_4947_875_500,V_4947_1000_500,V_4947_1125_500,V_4947_1250_500,V_4947_1375_500,V_4947_1500_500
0,1980-01-01 00:00:00,1017.995239,1018.003906,1017.821594,1016.92041,1015.298279,1012.365479,1010.165161,1008.873108,1007.041077,...,-21.335085,-21.929634,-18.944633,-13.576672,-7.859432,-4.315187,-4.537302,-6.384339,-7.468239,-6.366766
1,1980-01-01 06:00:00,1012.411987,1012.941711,1013.162537,1012.466492,1011.17334,1009.057678,1007.720276,1007.450928,1007.185791,...,-15.864392,-17.260447,-18.230337,-17.349325,-14.684937,-11.736959,-9.278327,-6.926729,-5.746246,-5.830248
2,1980-01-01 12:00:00,1013.14917,1011.777222,1011.90448,1011.719299,1010.750427,1009.099365,1008.059875,1007.817688,1007.70166,...,-14.286724,-13.759618,-12.58048,-11.242802,-10.006481,-8.826975,-7.680785,-6.542734,-5.758639,-4.876877
3,1980-01-01 18:00:00,1014.345581,1013.098083,1012.252258,1011.263855,1009.9599,1008.008911,1006.892273,1006.739563,1006.628662,...,-16.41197,-14.912545,-13.090832,-11.290992,-9.406482,-7.761153,-6.514461,-5.640243,-5.237644,-5.301913
4,1980-01-02 00:00:00,1016.135559,1013.480103,1011.916931,1011.259705,1010.306091,1008.409668,1006.916138,1006.435059,1006.093689,...,-14.576132,-13.82945,-12.889864,-11.361773,-9.516212,-7.717099,-6.117799,-4.799734,-3.900637,-3.176462


In [8]:
# Write the feature table to disk below BASE_DIR, without the row index.
# The ".gz" suffix lets pandas infer gzip compression for the CSV.
output_path = os.path.join(BASE_DIR, "data", "MeteorologicalData", "ERAI_data.csv.gz")
df.to_csv(output_path, index=False)