# Import packages

In [1]:
import pandas as pd, numpy as np
import dypy.netcdf as dn
import dypy.intergrid as ig
import os
import re
from constants import *

  'Matplotlib is building the font cache using fc-list. '


# Define local constants

In [2]:
# Index tuple handed to dn.read_var: the first two axes are taken in full, the
# last two are cut to the [LAT_MIN_ERA, LAT_MAX_ERA] x [LON_MIN_ERA, LON_MAX_ERA] box.
# NOTE(review): the +90 / +180 offsets presumably map degrees onto a 1-degree
# global grid starting at -90 / -180 -- confirm against the ERA files.
INDEX_ERA = (
    slice(None),
    slice(None),
    slice(LAT_MIN_ERA + 90, LAT_MAX_ERA + 90 + 1),
    slice(180 + LON_MIN_ERA, 180 + LON_MAX_ERA + 1),
)

# Corners of the cut-out box as (lat, lon); used as interpolation bounds.
SOUTH_WEST_POINT = np.array((LAT_MIN_ERA, LON_MIN_ERA))
NORTH_EAST_POINT = np.array((LAT_MAX_ERA, LON_MAX_ERA))

# Every (lat, lon) combination of the CESM grid, latitude varying slowest.
QUERY_POINTS = [[lat, lon] for lat in LATS_CESM for lon in LONS_CESM]
# String labels for column names: coordinates scaled by 100 and truncated
# (int() drops the fractional part, it does not round).
QUERY_POINTS_LABELS = [[str(int(100*lat)), str(int(100*lon))] for lat, lon in QUERY_POINTS]

# Define functions

In [3]:
def get_interpolated_variables(grid_values, index_level):
    """Interpolate one slice of a gridded field onto QUERY_POINTS.

    Parameters
    ----------
    grid_values : array-like
        Gridded data; ``grid_values[index_level]`` is the slice that gets
        interpolated.
    index_level : int
        Index along the first axis of ``grid_values`` selecting that slice.

    Returns
    -------
    The result of evaluating an ``ig.Intergrid`` interpolator, built on the
    selected slice with bounds SOUTH_WEST_POINT .. NORTH_EAST_POINT, at every
    point in QUERY_POINTS.
    """
    interpolator = ig.Intergrid(
        grid_values[index_level],
        lo=SOUTH_WEST_POINT,
        hi=NORTH_EAST_POINT,
        verbose=False,
    )
    return interpolator(QUERY_POINTS)

# Recursively get all relevant file paths

In [4]:
def collect_relevant_file_paths(rootdir, excluded_years=("1979",), filename_pattern=r"P\w+"):
    """Return the paths of all matching files stored under ``rootdir/<year>/<month>/``.

    Parameters
    ----------
    rootdir : str
        Top-level directory; files are only taken from directories exactly two
        levels below it (year/month).
    excluded_years : iterable of str
        Names of first-level (year) directories whose files are skipped.
    filename_pattern : str
        Regular expression that a file name must match from its start.

    Returns
    -------
    list of str
        Matching file paths, in sorted directory and file-name order so that
        the result does not depend on the filesystem's listing order.
    """
    # Raw string + single compilation: '\w' in a normal string literal is an
    # invalid escape sequence, and the pattern never changes between directories.
    regex = re.compile(filename_pattern)
    found = []
    for root, subdirs, files in os.walk(rootdir):
        # Sorting in place makes os.walk descend in a deterministic order.
        subdirs.sort()
        # Depth is measured relative to rootdir, not from the absolute path,
        # so the check keeps working if rootdir is moved.
        relative_parts = os.path.relpath(root, rootdir).split(os.sep)
        if len(relative_parts) != 2 or relative_parts[0] in excluded_years:
            continue
        found.extend(
            os.path.join(root, netcdf_file)
            for netcdf_file in sorted(files)
            if regex.match(netcdf_file)
        )
    return found


rootdir = "/net/bio/atmosdyn/erainterim/cdf/"
relevant_file_paths = collect_relevant_file_paths(rootdir)
print(len(relevant_file_paths))

57952


# Read in all relevant ERAI files

In [5]:
# Column names for the final table: the date first, then one column per
# (variable, query point, level) in the same order in which the values are
# appended when the files are read.
feature_list_names = ["date"]
feature_list_names += [f"SLP_{lat_label}_{lon_label}_sealevel" for lat_label, lon_label in QUERY_POINTS_LABELS]
feature_list_names += [f"T_{lat_label}_{lon_label}_900" for lat_label, lon_label in QUERY_POINTS_LABELS]
for pressure_level in ["850", "700", "500"]:
    for variable in ['Z','T','Q','U','V']:
        feature_list_names += [
            f"{variable}_{lat_label}_{lon_label}_{pressure_level}"
            for lat_label, lon_label in QUERY_POINTS_LABELS
        ]
print(len(feature_list_names))

1769


In [6]:
# Build one row per P-file: the datetime string followed by the interpolated
# values of every variable/level at every query point.
rows_list = []
for file_path in relevant_file_paths:
    # Lightweight progress indicator: fraction of files processed so far.
    print(len(rows_list)/len(relevant_file_paths), end="\r", flush=True)
    feature_list = [file_path[-11:]] # Get datetime string (last 11 characters of the path)

    SLP, = dn.read_var(file_path, ['SLP'], index=INDEX_ERA)

    # expand_dims adds a leading axis so that index_level=0 selects SLP as read.
    feature_list.extend(get_interpolated_variables(grid_values=np.expand_dims(SLP,0), index_level=0))

    # Switch to corresponding Z file: swap only the first character of the
    # file name. A plain str.replace("P", "Z") would also rewrite any other
    # "P" occurring in the directory part of the path.
    directory, p_file_name = os.path.split(file_path)
    z_file_path = os.path.join(directory, "Z" + p_file_name[1:])

    Z,T,Q,U,V = dn.read_var(z_file_path, ['Z','T','Q','U','V'], index=INDEX_ERA)

    # Level index 0 feeds the "T_..._900" columns.
    feature_list.extend(get_interpolated_variables(grid_values=T, index_level=0))

    # Level indices 1, 3 and 5 feed the "850", "700" and "500" columns, in
    # that order; within each level the variable order is Z, T, Q, U, V.
    for index in (1, 3, 5):
        for grid_values in (Z, T, Q, U, V):
            feature_list.extend(get_interpolated_variables(grid_values=grid_values, index_level=index))

    rows_list.append(feature_list)

0.99998274434014366657

In [7]:
# Assemble the collected rows into a table and parse the file-name timestamp
# (e.g. "19800101_00") into a proper datetime column.
df = pd.DataFrame(data=rows_list, columns=feature_list_names).assign(
    date=lambda frame: pd.to_datetime(frame["date"], format="%Y%m%d_%H")
)
print(df.shape)
df.head()

(57952, 1769)


Unnamed: 0,date,SLP_4287_0_sealevel,SLP_4287_125_sealevel,SLP_4287_250_sealevel,SLP_4287_375_sealevel,SLP_4287_500_sealevel,SLP_4287_625_sealevel,SLP_4287_750_sealevel,SLP_4287_875_sealevel,SLP_4287_1000_sealevel,...,V_4947_375_500,V_4947_500_500,V_4947_625_500,V_4947_750_500,V_4947_875_500,V_4947_1000_500,V_4947_1125_500,V_4947_1250_500,V_4947_1375_500,V_4947_1500_500
0,1980-01-01 00:00:00,1017.995239,1018.003906,1017.821594,1016.92041,1015.298279,1012.365479,1010.165161,1008.873108,1007.041077,...,-21.335085,-21.929634,-18.944633,-13.576672,-7.859432,-4.315187,-4.537302,-6.384339,-7.468239,-6.366766
1,1980-01-01 06:00:00,1012.411987,1012.941711,1013.162537,1012.466492,1011.17334,1009.057678,1007.720276,1007.450928,1007.185791,...,-15.864392,-17.260447,-18.230337,-17.349325,-14.684937,-11.736959,-9.278327,-6.926729,-5.746246,-5.830248
2,1980-01-01 12:00:00,1013.14917,1011.777222,1011.90448,1011.719299,1010.750427,1009.099365,1008.059875,1007.817688,1007.70166,...,-14.286724,-13.759618,-12.58048,-11.242802,-10.006481,-8.826975,-7.680785,-6.542734,-5.758639,-4.876877
3,1980-01-01 18:00:00,1014.345581,1013.098083,1012.252258,1011.263855,1009.9599,1008.008911,1006.892273,1006.739563,1006.628662,...,-16.41197,-14.912545,-13.090832,-11.290992,-9.406482,-7.761153,-6.514461,-5.640243,-5.237644,-5.301913
4,1980-01-02 00:00:00,1016.135559,1013.480103,1011.916931,1011.259705,1010.306091,1008.409668,1006.916138,1006.435059,1006.093689,...,-14.576132,-13.82945,-12.889864,-11.361773,-9.516212,-7.717099,-6.117799,-4.799734,-3.900637,-3.176462


In [8]:
# Persist the assembled feature table as CSV, without the row index column
df.to_csv(
    path_or_buf="/net/litho/atmosdyn2/chmony/data/MeteorologicalData/ERAI_data.csv.gz",
    index=False,
)