In [2]:
import pandas as pd
import numpy as np
import requests
import xml.etree.ElementTree as ET
from netCDF4 import Dataset

# 1. Load data

## A. Get the checksums

In [5]:
# This file contains a list of file URLs and their corresponding checksums
df = pd.read_csv('./gddp-cmip6-thredds-fileserver.csv')
df.head()

Unnamed: 0,fileMD5,fileUrl
0,e34060e7b1abf84d71d61ea21da77d8e,https://ds.nccs.nasa.gov/thredds2/fileServer/...
1,72de82165e04bf28814b493300c7e305,https://ds.nccs.nasa.gov/thredds2/fileServer/...
2,9d94ea3cab21637ddd267fe5a0a5afd3,https://ds.nccs.nasa.gov/thredds2/fileServer/...
3,1d3b4f2ab48e85e694ca2a4270aff80a,https://ds.nccs.nasa.gov/thredds2/fileServer/...
4,37c78ade0ba210f8a4d80fda03fd3397,https://ds.nccs.nasa.gov/thredds2/fileServer/...


## B. Load the data for GISS simulations.

In [11]:
# Download GISS catalog to an XML file
var_name = 'tasmax'
catalog_url = f'https://ds.nccs.nasa.gov/thredds/catalog/AMES/NEX/GDDP-CMIP6/GISS-E2-1-G/ssp245/r1i1p1f2/{var_name}/catalog.xml'
resp = requests.get(catalog_url)
with open(f'./{var_name}_catalog.xml', 'wb') as f:
    f.write(resp.content)

In [107]:
# Parse dataset URLs from the XML file
dataset_urls = []
total_storage_req = 0.0
tree = ET.parse('tasmax_catalog.xml')
root = tree.getroot()

xmlns = "http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0"
ns = {
    "ns": xmlns
}
for item in tree.getroot().findall('ns:dataset/', ns):
    if 'dataset' in item.tag:
        dataset_url = item.attrib['urlPath']
        dataset_urls.append(dataset_url)

        data_size = item.find('ns:dataSize', ns)
        total_storage_req += float(data_size.text)

print(f"Found {len(dataset_urls)} dataset URLs. Total storage required: {total_storage_req} MB.")
print(dataset_urls)


Found 86 dataset URLs. Total storage required: 21090.09999999999 MB.
['AMES/NEX/GDDP-CMIP6/GISS-E2-1-G/ssp245/r1i1p1f2/tas/tas_day_GISS-E2-1-G_ssp245_r1i1p1f2_gn_2100.nc', 'AMES/NEX/GDDP-CMIP6/GISS-E2-1-G/ssp245/r1i1p1f2/tas/tas_day_GISS-E2-1-G_ssp245_r1i1p1f2_gn_2099.nc', 'AMES/NEX/GDDP-CMIP6/GISS-E2-1-G/ssp245/r1i1p1f2/tas/tas_day_GISS-E2-1-G_ssp245_r1i1p1f2_gn_2098.nc', 'AMES/NEX/GDDP-CMIP6/GISS-E2-1-G/ssp245/r1i1p1f2/tas/tas_day_GISS-E2-1-G_ssp245_r1i1p1f2_gn_2097.nc', 'AMES/NEX/GDDP-CMIP6/GISS-E2-1-G/ssp245/r1i1p1f2/tas/tas_day_GISS-E2-1-G_ssp245_r1i1p1f2_gn_2096.nc', 'AMES/NEX/GDDP-CMIP6/GISS-E2-1-G/ssp245/r1i1p1f2/tas/tas_day_GISS-E2-1-G_ssp245_r1i1p1f2_gn_2095.nc', 'AMES/NEX/GDDP-CMIP6/GISS-E2-1-G/ssp245/r1i1p1f2/tas/tas_day_GISS-E2-1-G_ssp245_r1i1p1f2_gn_2094.nc', 'AMES/NEX/GDDP-CMIP6/GISS-E2-1-G/ssp245/r1i1p1f2/tas/tas_day_GISS-E2-1-G_ssp245_r1i1p1f2_gn_2093.nc', 'AMES/NEX/GDDP-CMIP6/GISS-E2-1-G/ssp245/r1i1p1f2/tas/tas_day_GISS-E2-1-G_ssp245_r1i1p1f2_gn_2092.nc', 'AMES/NEX/GD

## C. Download the data from the specified URLs

This is the fun part! For now, I'll focus on just the years that are multiples of 5.

In [70]:
desired_years = [2000 + x * 5 for x in range(0, 15)] # 2000, 2005, ..., 2065, 2070
print(len(desired_years))

15


In [106]:
result = {}
for url in dataset_urls:
    year = int(url[-7:-3])
    if year in desired_years:
        # Download the file.
        full_url =  'https://ds.nccs.nasa.gov/thredds2/dodsC/' + url
        data = Dataset(full_url)
        coords = (32.9412, -97.1342)
        lat = coords[0]
        lon = coords[1] if coords[1] > 0 else 360 + coords[1]
        jj = np.argmin((data['lat'][:]-lat)**2)
        ii = np.argmin((data['lon'][:]-lon)**2)

        # Note: this function is slow if you compute the full average rather than sampling a day.
        # Also not sure if the full average makes more sense than sampling a single day.
        total = 0.0
        sample_days = range(0, 365, 60)
        for day in sample_days:
            total += data['tasmax'][day, jj, ii]
        result[year] = total / len(sample_days) - 273.15

        # Get data for day 200.
        # result[year] = data['tasmax'][200, jj, ii] - 273.15

        # break

print(result)

# url = 'https://ds.nccs.nasa.gov/thredds2/dodsC/AMES/NEX/GDDP-CMIP6/GISS-E2-1-G/ssp585/r1i1p1f2/tasmin/tasmin_day_GISS-E2-1-G_ssp585_r1i1p1f2_gn_2099.nc'
# data = Dataset(url)
# print(data)

{2070: 28.702604457310315, 2065: 25.531958879743343, 2060: 24.3005092075893, 2055: 26.99310128348219, 2050: 25.773265729631703, 2045: 26.600470842633968, 2040: 27.094349888392856, 2035: 29.627361188616078, 2030: 22.973670305524593, 2025: 25.881781877790218, 2020: 24.01805158342637, 2015: 27.59547903878348}
