In [1]:
# Local imports
import os
import sys
import glob

# Third-party imports
import numpy as np

In [2]:
# Directory in which the processed alternate-lag .npy files are stored
alt_lag_dir = "/gws/nopw/j04/canari/users/benhutch/alternate-lag-processed-data/"

# Print the contents of the directory
print("Contents of directory:")
print(os.listdir(alt_lag_dir))

# Print the most recent .npy file.
# os.path.join keeps the pattern valid whether or not alt_lag_dir ends in "/".
# default=None stops max() raising ValueError when there are no .npy files.
# NOTE: os.path.getctime is the last metadata-change time on Unix, not the
# creation time.
print("Most recent file:")
most_recent_file = max(
    glob.iglob(os.path.join(alt_lag_dir, "*.npy")),
    key=os.path.getctime,
    default=None,
)
print(most_recent_file if most_recent_file is not None else "No .npy files found")

Contents of directory:


In [None]:
# Build a dataframe describing every .npy file in the directory, with columns:
# "filename", "variable", "season", "region", "start_year", "end_year", "lag",
# "forecast_range", "alt_lag", "datetime", "filesize", "n_ens", "raw"
#
# Filenames are expected to look like:
#   psl_DJFM_global_1962_1980_2-9_2_1706281292.628301_alternate_lag.npy
#   psl_DJFM_global_1961_1980_2-9_2_1706281292.628301.npy
import pandas as pd

# Create a list of all the files in the directory
files = glob.glob(alt_lag_dir + "*.npy")

# Extract the filenames
filenames = [os.path.basename(file) for file in files]

# Convert the list to a dataframe
df = pd.DataFrame(filenames, columns=["filename"])

# Print the dataframe
print(df)

# Split each filename on "_" once and reuse the result for every field.
# reindex guarantees that positions 0-7 exist (filled with NaN where a
# filename has fewer fields) so the lookups below cannot raise KeyError.
filename_parts = df["filename"].str.split("_", expand=True).reindex(columns=range(8))

# Position of each field within the underscore-separated filename.
# Note that the forecast range (5) comes before the lag (6) in the filename.
field_positions = {
    "variable": 0,
    "season": 1,
    "region": 2,
    "start_year": 3,
    "end_year": 4,
    "lag": 6,
    "forecast_range": 5,
}
for column_name, position in field_positions.items():
    df[column_name] = filename_parts[position]

# Flag the files which contain the string "alternate_lag"
df["alt_lag"] = df["filename"].str.contains("alternate_lag")

# The eighth field is the Unix timestamp, e.g. 1706281292.628301
# For the raw files it still carries the ".npy" extension, so strip that.
# regex=False makes "." a literal dot regardless of the pandas version.
timestamps = filename_parts[7].str.replace(".npy", "", regex=False)

# Convert the timestamps to a more readable format.
# Cast to a numeric dtype first: passing strings together with unit= is
# deprecated in recent pandas versions.
df["datetime"] = pd.to_datetime(pd.to_numeric(timestamps), unit="s")

# Calculate the file sizes (in bytes)
df["filesize"] = [os.path.getsize(file) for file in files]

# Create a column for the n_ens members, taken from the second dimension
# of each array (presumably the ensemble-member axis - TODO confirm).
# mmap_mode="r" reads only the header, avoiding loading each array in full.
df["n_ens"] = [np.load(file, mmap_mode="r").shape[1] for file in files]

# Flag the raw files, i.e. those which are not alternate-lag files
df["raw"] = ~df["alt_lag"]

df

In [None]:
# Display the file-information dataframe
df

In [None]:
# Display the file-information dataframe again
df

In [None]:
# Print the unique options in df['forecast_range']
# Create a new dataframe where df['forecast_range'] == '2-3'


In [None]:
# Keep only the rows whose forecast range is exactly the string "2-3"
df_new = df.loc[df["forecast_range"].eq("2-3")]

In [None]:
# Display the rows selected for the chosen forecast range
df_new

In [None]:
# Display the rows selected for the chosen forecast range again
df_new

In [None]:
# List the distinct forecast ranges present, in order of first appearance
print(pd.unique(df["forecast_range"]))

In [None]:
# Write the file-information dataframe to a .csv in the current working
# directory, leaving out the index column
df.to_csv(path_or_buf="alternate_lag_file_info.csv", index=False)

In [None]:
# Parameters identifying the file to load.
# All are kept as strings because they are substituted into the filename.
var = "psl"
season = "DJFM"
region = "global"
start_year = "1964"
end_year = "2014"
lag = "4"
forecast_range = "2-5"

In [None]:
# Glob pattern for the alternate-lag file; the "*" stands in for the
# timestamp part of the filename
filename = f"{var}_{season}_{region}_{start_year}_{end_year}_{forecast_range}_{lag}_*_alternate_lag.npy"

# Show the pattern
print("Filename:", filename, sep="\n")

# Show the files in the directory that match the pattern
print("Matching files:", glob.glob(alt_lag_dir + filename), sep="\n")

In [None]:
# Find the alternate-lag files matching the pattern.
# glob returns matches in arbitrary order, so sort them to make the choice
# of file reproducible.
matching_files = sorted(glob.glob(alt_lag_dir + filename))

# Fail with a clear message rather than an IndexError if nothing matches
if not matching_files:
    raise FileNotFoundError(f"No files in {alt_lag_dir} match {filename}")

# Load the last match. With two matches this is the same element the
# original index [1] selected from a sorted list, and it also works when
# only one file matches.
# NOTE(review): if only the timestamp differs between matches (same number
# of digits), the last match is the most recent file - confirm this is the
# one wanted.
print("Loading file:")
print(matching_files[-1])
data = np.load(matching_files[-1])

# Print the shape of the data
print("Shape of data:")
print(data.shape)

# Print the data
print("Data:")
print(data)

In [None]:
# Set up the glob pattern for the raw (non-alternate-lag) file
# Like this: psl_DJFM_global_1961_1980_2-9_2_1706281292.628301.npy
# The start year is a wildcard here, as is the timestamp.
filename = f"{var}_{season}_{region}_*_{end_year}_{forecast_range}_{lag}_*.npy"

# Print the filename
print("Filename:")
print(filename)

# Print the files that match the filename
matching_files = glob.glob(alt_lag_dir + filename)
print("Matching files:")
print(matching_files)

# Keep only the files which don't have the "alternate_lag" string in their
# name. Only the basename is checked, so the directory path cannot affect
# the result. Sort so that the choice of file is reproducible.
raw_files = sorted(
    path for path in matching_files
    if "alternate_lag" not in os.path.basename(path)
)

# Fail loudly if there is no raw file. Otherwise `data` would silently keep
# whatever an earlier cell assigned to it.
if not raw_files:
    raise FileNotFoundError(
        f"No raw (non-alternate-lag) files in {alt_lag_dir} match {filename}"
    )

# Load a single file (the last in sorted order) instead of loading every
# match and keeping whichever happened to come last
print("Loading file:")
print(raw_files[-1])
data = np.load(raw_files[-1])

In [None]:
# Report the dimensions of the loaded array
print("Shape of data:", data.shape, sep="\n")

# Show the array contents
print("Data:", data, sep="\n")