### Importing packages and data_utils.py

In [13]:
!git clone https://github.com/leap-stc/ClimSim.git
%cd ClimSim
!pip install .

Cloning into 'ClimSim'...
remote: Enumerating objects: 4506, done.[K
remote: Counting objects: 100% (811/811), done.[K
remote: Compressing objects: 100% (357/357), done.[K
remote: Total 4506 (delta 461), reused 718 (delta 441), pack-reused 3695[K
Receiving objects: 100% (4506/4506), 140.16 MiB | 29.50 MiB/s, done.
Resolving deltas: 100% (1554/1554), done.
Updating files: 100% (2068/2068), done.
/content/ClimSim/ClimSim
Processing /content/ClimSim/ClimSim
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: climsim-utils
  Building wheel for climsim-utils (setup.py) ... [?25l[?25hdone
  Created wheel for climsim-utils: filename=climsim_utils-0.0.1-py3-none-any.whl size=14489 sha256=c9bee0a7f2a4b1216959209039d3cf7a0fa8b37e0faf1ad6d31743f4e016b8c9
  Stored in directory: /tmp/pip-ephem-wheel-cache-nl0lsm0r/wheels/07/2a/cc/eb42675edeed67be576a3d1f3b55c8050f115a21cfe7e71e99
Successfully built climsim-utils
Installing collected packages: climsim-u

In [14]:
import tensorflow as tf
print(tf.__version__)
import itertools
import os
import xarray as xr

2.14.0


In [15]:
from climsim_utils.data_utils import *

### Instantiating class

In [16]:
# Relative locations of the normalization files and of the grid description.
norm_path = './preprocessing/normalizations/'
grid_path = '../ClimSim/grid_info/ClimSim_low-res_grid-info.nc'

# Grid description first, then the three input statistics, then the output scaling.
grid_info = xr.open_dataset(grid_path)
input_mean, input_max, input_min = (
    xr.open_dataset(norm_path + 'inputs/input_mean.nc'),
    xr.open_dataset(norm_path + 'inputs/input_max.nc'),
    xr.open_dataset(norm_path + 'inputs/input_min.nc'),
)
output_scale = xr.open_dataset(norm_path + 'outputs/output_scale.nc')

# Helper object used by every later cell; it receives all opened datasets.
data = data_utils(
    grid_info=grid_info,
    input_mean=input_mean,
    input_max=input_max,
    input_min=input_min,
    output_scale=output_scale,
)

In [17]:
# Point the helper at '../ClimSim/e3smdata', the same directory the download cell saves files into.
data.data_path='../ClimSim/e3smdata'

In [18]:
def expand_pattern(pattern):
    """Yield every string obtained by expanding the ``[...]`` groups of ``pattern``.

    Each bracket group stands for "exactly one of these characters"; every
    character inside the brackets is taken literally (``[1-3]`` means the three
    characters ``1``, ``-`` and ``3``, not a range). The combinations are
    produced in ``itertools.product`` order, i.e. the right-most group varies
    fastest. A pattern without bracket groups is yielded unchanged.

    Parameters
    ----------
    pattern : str
        Text that may contain one or more ``[chars]`` groups.

    Yields
    ------
    str
        One fully expanded string per combination of choices.
    """
    # Imported locally so the function does not depend on names leaked by a
    # star import elsewhere in the notebook.
    import itertools
    import re

    # With a single capture group, re.split returns alternating pieces:
    # literal text at even indices, bracket contents at odd indices.
    pieces = re.split(r'\[([^\]]+)\]', pattern)
    literals = pieces[0::2]
    choices = [list(chars) for chars in pieces[1::2]]

    for combination in itertools.product(*choices):
        # Assemble in one pass so that an inserted character is never
        # re-interpreted as part of a bracket group or as a regex
        # replacement template.
        parts = [literals[0]]
        for chosen, literal in zip(combination, literals[1:]):
            parts.append(chosen)
            parts.append(literal)
        yield ''.join(parts)

def generate_filenames_with_times(pattern, end_time=85200):
    """Build the list of ``E3SM-MMF.mli.<date>-<time>.nc`` names for a date pattern.

    Parameters
    ----------
    pattern : str
        Date pattern containing exactly one ``'-*'``; the text before it is
        expanded with ``expand_pattern``. Any other number of ``'-*'``
        occurrences makes the tuple unpacking raise ``ValueError``.
    end_time : int, optional
        Largest time offset to consider; offsets run from 0 to ``end_time``
        inclusive in steps of 1200 (presumably seconds within a day - confirm).

    Returns
    -------
    list of str
        One file name per (expanded date, time offset) pair, dates outermost,
        with the offset zero-padded to five digits.
    """
    date_part, _suffix = pattern.split('-*')
    offsets = range(0, end_time + 1, 1200)

    return [
        f"E3SM-MMF.mli.{date}-{offset:05}.nc"
        for date in expand_pattern(date_part)
        for offset in offsets
    ]

# Date pattern to expand; change the characters inside the brackets to select other dates.
# The trailing '-*' is required by generate_filenames_with_times.
pattern = "000[2]-0[1]-0[2]-*"
file_names = generate_filenames_with_times(pattern, 85200)


# Preview the first five generated file names.
file_names[:5]


['E3SM-MMF.mli.0002-01-02-00000.nc',
 'E3SM-MMF.mli.0002-01-02-01200.nc',
 'E3SM-MMF.mli.0002-01-02-02400.nc',
 'E3SM-MMF.mli.0002-01-02-03600.nc',
 'E3SM-MMF.mli.0002-01-02-04800.nc']

In [19]:
import re  # imported explicitly instead of relying on names leaked by a star import
import requests

# Directory to save files
save_dir = "../ClimSim/e3smdata"

# Ensure save directory exists
os.makedirs(save_dir, exist_ok=True)

# Root URL of the dataset; files are fetched from train/<year>-<month>/ below it.
base_url = "https://huggingface.co/datasets/LEAP/ClimSim_low-res/resolve/main/"

# Download loop: only the first ten generated file names are fetched.
for file_name in file_names[:10]:
    # The year and month embedded in the file name select the remote sub-directory.
    match = re.search(r'\.(\d{4})-(\d{2})-', file_name)
    if not match:
        print(f"Cannot determine year and month for {file_name}")
        continue
    year, month = match.groups()
    url = f"{base_url}train/{year}-{month}/{file_name}"

    target_path = os.path.join(save_dir, file_name)
    # Write to a temporary name first so an interrupted transfer never leaves
    # a truncated file under the final name.
    partial_path = target_path + ".part"

    try:
        # timeout: requests waits indefinitely by default.
        # stream: write the body chunk by chunk instead of holding it in memory.
        with requests.get(url, stream=True, timeout=60) as response:
            if response.status_code != 200:
                print(f"Failed to download {file_name}")
                continue
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        os.replace(partial_path, target_path)
    except requests.RequestException:
        # A network problem with one file should not abort the remaining downloads.
        print(f"Failed to download {file_name}")
        if os.path.exists(partial_path):
            os.remove(partial_path)

print("Download complete.")

Download complete.


In [20]:
# Display the base URL used by the download cell (sanity check).
base_url

'https://huggingface.co/datasets/LEAP/ClimSim_low-res/resolve/main/'

In [21]:

# directory_path = '../ClimSim/e3smdata'


# for filename in os.listdir(directory_path):
#     # connect path
#     file_path = os.path.join(directory_path, filename)

#     # check files
#     if os.path.isfile(file_path):
#         print(filename)


### Create training data

In [22]:
# import os; os.makedirs("sampledata", exist_ok=True)
# os.makedirs("e3smdata", exist_ok=True)


In [23]:
# Build the training split with the data_utils helper and save it as numpy files.
# NOTE(review): the recorded run of this cell ended with "UnknownError"; the cause
# is not visible in this notebook. Check that the files matched by the patterns
# below are actually present under data.data_path and that the save location
# exists - TODO confirm.

# set inputs and outputs to V1 subset
data.set_to_v1_vars()

# set regular expressions for selecting training data
data.set_regexps(data_split = 'train',
                 regexps = ['E3SM-MMF.mli.000[1234567]-*-*-*.nc', # years 1 through 7
                            'E3SM-MMF.mli.0008-01-*-*.nc']) # first month of year 8
# set temporal subsampling (presumably keeps every 7th matched file - confirm in data_utils)
data.set_stride_sample(data_split = 'train', stride_sample = 7)
# create list of files to extract data from

data.set_filelist(data_split = 'train')

# Optional inspection of the first selected file (kept disabled):
# first_file = data.get_filelist('train')[0]
# ds = data.get_xrdata(first_file)
# print(list(ds.variables))

# save numpy files of training data
# NOTE(review): how save_path is combined with the output file names is defined in
# data_utils.save_as_npy and is not visible here - verify the resulting location.
data.save_as_npy(data_split = 'train', save_path = '../ClimSim/sampledata')

UnknownError: ignored

### Create validation data

In [None]:
# Build the validation split with the same sequence of calls as the training split.
# set regular expressions for selecting validation data
data.set_regexps(data_split = 'val',
                 regexps = ['E3SM-MMF.mli.0008-0[23456789]-*-*.nc', # months 2 through 9 of year 8
                            'E3SM-MMF.mli.0008-1[012]-*-*.nc', # months 10 through 12 of year 8
                            'E3SM-MMF.mli.0009-01-*-*.nc']) # first month of year 9
# set temporal subsampling (same stride of 7 as the training split)
data.set_stride_sample(data_split = 'val', stride_sample = 7)
# create list of files to extract data from
data.set_filelist(data_split = 'val')
# save numpy files of validation data
# NOTE(review): save_path is empty here; presumably the files are written relative
# to the current working directory - confirm in data_utils.save_as_npy.
data.save_as_npy(data_split = 'val', save_path = '')

### Create scoring data

In [None]:
# Build the scoring split; it uses the same file patterns as the validation split
# but a different stride.
# set regular expressions for selecting scoring data (stride of 6 is needed for daily averaging)
data.set_regexps(data_split = 'scoring',
                 regexps = ['E3SM-MMF.mli.0008-0[23456789]-*-*.nc', # months 2 through 9 of year 8
                            'E3SM-MMF.mli.0008-1[012]-*-*.nc', # months 10 through 12 of year 8
                            'E3SM-MMF.mli.0009-01-*-*.nc']) # first month of year 9
# set temporal subsampling
data.set_stride_sample(data_split = 'scoring', stride_sample = 6)
# create list of files to extract data from
data.set_filelist(data_split = 'scoring')
# save numpy files of scoring data
# NOTE(review): save_path is empty here; presumably the files are written relative
# to the current working directory - confirm in data_utils.save_as_npy.
data.save_as_npy(data_split = 'scoring', save_path = '')

In [12]:
pip install datasets

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, huggingface-hub, datasets
Successfully installed datasets-2.1

In [None]:
pip install netcdf4 h5netcdf


In [None]:
pip install netcdf4 h5netcdf
