In [1]:
import numpy as np
import pandas as pd
import xarray as xr
import sys
import os
import os.path
import io
import itertools as it
from multiprocessing import Pool, cpu_count
from typing import *

## Raw data download helper scripts

Helper code for automating the download of raw data directly from KNMI. The data must have been recently generated and published on the KNMI server. Download links expire after roughly 3 days and must then be regenerated manually on the KNMI site.

In [2]:
# Coordinates of the grid points: 2.5-degree spacing in both directions,
# starting at -88.75 (latitude) and 1.25 (longitude).
lats = np.arange(-88.75, 90.0, 2.5)
lons = np.arange(1.25, 360.0, 2.5)
N_lat = len(lats)
N_lon = len(lons)
print('{} lat x {} lon'.format(N_lat, N_lon))

72 lat x 144 lon


In [None]:
import requests
import tqdm
import time

# Size in bytes that a completely downloaded grid point file is expected to have;
# used to decide whether an existing file can be skipped.
expected_file_size = 44720
varname = "pr"
experiment = "rcp26"
model = "CanESM2"
dirname = 'raw_data/{}/{}/{}'.format(experiment, model, varname)
# exist_ok=True creates the directory tree only when needed, without the
# check-then-create race of a separate os.path.isdir() test.
os.makedirs(dirname, exist_ok=True)
# URI templates; %LON% and %LAT% are replaced per grid point in download_grid_point.
# link_uri_base is the page requested first, dl_uri_base is the data file itself.
link_uri_base = "https://climexp.knmi.nl/getindices.cgi?WMO=data/gridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_%LON%_%LAT%_n&STATION=cmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info&TYPE=i&id=someone@somewhere&NPERYEAR=12"
dl_uri_base = "https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_%LON%_%LAT%_n.dat"
def download_grid_point(lat: float, lon: float):
    """
    Download the data file for one grid point into the raw data directory.

    The target path is built from the module-level `experiment`, `model` and
    `varname` settings. A file that already exists with exactly
    `expected_file_size` bytes is left alone; any other existing file is
    overwritten.

    The page at `link_uri_base` is requested first and the data file at
    `dl_uri_base` after a short pause. If the data request does not return
    status 200, nothing is written, so an error response never ends up in a
    .dat file and a later run will retry this grid point.

    lat : latitude of the grid point
    lon : longitude of the grid point
    """
    file_name = '{}_{}_{}_{}_{}.dat'.format(experiment, model, varname, lon, lat)
    output_file = './raw_data/{}/{}/{}/{}'.format(experiment, model, varname, file_name)
    if os.path.isfile(output_file):
        if os.path.getsize(output_file) == expected_file_size:
            print('{} already exists; skipping...'.format(output_file))
            return
        # any other size (smaller or larger) means the file is not usable
        print('{} not of expected size, overwriting...'.format(output_file))
    lon_str = '{:07.2f}'.format(lon)
    # use weird KNMI formatting pattern for negative latitudes > -10
    if -10.0 < lat < 0.0:
        lat_str = "0{:.2f}".format(lat)
    else:
        lat_str = "{:06.2f}".format(lat)
    data_uri = dl_uri_base.replace("%LON%", lon_str).replace("%LAT%", lat_str)
    link_uri = link_uri_base.replace("%LON%", lon_str).replace("%LAT%", lat_str)
    resp = requests.get(link_uri)
    if resp.status_code != 200:
        # best effort: report and still try to fetch the data file
        print('bad status code {} requesting page {}'.format(resp.status_code, link_uri))
    time.sleep(0.300) # wait for server to generate data file
    print('Downloading data for coordinate {},{} from {}'.format(lon, lat, data_uri))
    resp = requests.get(data_uri)
    if resp.status_code != 200:
        print('bad status code {} requesting file {}'.format(resp.status_code, data_uri))
        # do not store the body of an error response as if it were data
        return
    with open(output_file, 'wb') as f:
        f.write(resp.content)

def _download_grid_point_args(coords):
    """Unpack a (lat, lon) tuple for download_grid_point so it can be used with Pool.imap_unordered."""
    return download_grid_point(*coords)

N_threads = 4
grid_points = list(it.product(lats, lons))
# starmap() only returns once every download has finished, so iterating its
# result with tqdm never showed live progress. imap_unordered() yields each
# result as soon as a worker completes it. The with-block also makes sure the
# worker processes are shut down afterwards.
with Pool(N_threads) as pool:
    for _ in tqdm.tqdm(pool.imap_unordered(_download_grid_point_args, grid_points), total=len(grid_points)):
        pass

# Verify the download: one file per grid point, each of the expected size.
downloaded_files = [f for f in os.listdir(dirname) if not f.startswith('.')]
assert len(downloaded_files) == len(grid_points), \
    'expected {} files in {}, found {}'.format(len(grid_points), dirname, len(downloaded_files))
for file in downloaded_files:
    assert os.path.getsize(os.path.join(dirname, file)) == expected_file_size, \
        '{} does not have the expected size'.format(file)

Downloading data for coordinate 1.25,-88.75 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0001.25_-88.75_n.dat
Downloading data for coordinate 181.25,-56.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0181.25_-56.25_n.dat
Downloading data for coordinate 1.25,-66.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0001.25_-66.25_n.dat
Downloading data for coordinate 181.25,-78.75 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0181.25_-78.75_n.dat
Downloading data for coordinate 3.75,-88.75 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0003.75_-88.75_n.dat
Downloading data for coordinate 183.75,-56.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0183.75_-56.25_n.dat
Downloading data for coordinate 183.75,-78.75 from https://climexp.knmi.nl/d

Downloading data for coordinate 213.75,-56.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0213.75_-56.25_n.dat
Downloading data for coordinate 33.75,-66.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0033.75_-66.25_n.dat
Downloading data for coordinate 36.25,-88.75 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0036.25_-88.75_n.dat
Downloading data for coordinate 216.25,-78.75 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0216.25_-78.75_n.dat
Downloading data for coordinate 216.25,-56.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0216.25_-56.25_n.dat
Downloading data for coordinate 36.25,-66.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0036.25_-66.25_n.dat
Downloading data for coordinate 38.75,-88.75 from https://climexp.knmi.nl

Downloading data for coordinate 246.25,-56.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0246.25_-56.25_n.dat
Downloading data for coordinate 68.75,-66.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0068.75_-66.25_n.dat
Downloading data for coordinate 248.75,-78.75 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0248.75_-78.75_n.dat
Downloading data for coordinate 71.25,-88.75 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0071.25_-88.75_n.dat
Downloading data for coordinate 248.75,-56.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0248.75_-56.25_n.dat
Downloading data for coordinate 71.25,-66.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0071.25_-66.25_n.dat
Downloading data for coordinate 251.25,-78.75 from https://climexp.knmi.n

Downloading data for coordinate 278.75,-56.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0278.75_-56.25_n.dat
Downloading data for coordinate 101.25,-66.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0101.25_-66.25_n.dat
Downloading data for coordinate 283.75,-78.75 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0283.75_-78.75_n.dat
Downloading data for coordinate 281.25,-56.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0281.25_-56.25_n.dat
Downloading data for coordinate 106.25,-88.75 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0106.25_-88.75_n.dat
Downloading data for coordinate 103.75,-66.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0103.75_-66.25_n.dat
Downloading data for coordinate 286.25,-78.75 from https://climexp.knm

Downloading data for coordinate 311.25,-56.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0311.25_-56.25_n.dat
Downloading data for coordinate 316.25,-78.75 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0316.25_-78.75_n.dat
Downloading data for coordinate 136.25,-66.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0136.25_-66.25_n.dat
Downloading data for coordinate 138.75,-88.75 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0138.75_-88.75_n.dat
Downloading data for coordinate 313.75,-56.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0313.75_-56.25_n.dat
Downloading data for coordinate 318.75,-78.75 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0318.75_-78.75_n.dat
Downloading data for coordinate 138.75,-66.25 from https://climexp.knm

Downloading data for coordinate 343.75,-56.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0343.75_-56.25_n.dat
Downloading data for coordinate 168.75,-66.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0168.75_-66.25_n.dat
Downloading data for coordinate 171.25,-88.75 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0171.25_-88.75_n.dat
Downloading data for coordinate 351.25,-78.75 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0351.25_-78.75_n.dat
Downloading data for coordinate 346.25,-56.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0346.25_-56.25_n.dat
Downloading data for coordinate 171.25,-66.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0171.25_-66.25_n.dat
Downloading data for coordinate 173.75,-88.75 from https://climexp.knm

Downloading data for coordinate 203.75,-88.75 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0203.75_-88.75_n.dat
Downloading data for coordinate 18.75,-53.75 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0018.75_-53.75_n.dat
Downloading data for coordinate 23.75,-76.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0023.75_-76.25_n.dat
Downloading data for coordinate 203.75,-66.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0203.75_-66.25_n.dat
Downloading data for coordinate 206.25,-88.75 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0206.25_-88.75_n.dat
Downloading data for coordinate 26.25,-76.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0026.25_-76.25_n.dat
Downloading data for coordinate 21.25,-53.75 from https://climexp.knmi.nl

Downloading data for coordinate 51.25,-53.75 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0051.25_-53.75_n.dat
Downloading data for coordinate 236.25,-66.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0236.25_-66.25_n.dat
Downloading data for coordinate 238.75,-88.75 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0238.75_-88.75_n.dat
Downloading data for coordinate 58.75,-76.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0058.75_-76.25_n.dat
Downloading data for coordinate 53.75,-53.75 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0053.75_-53.75_n.dat
Downloading data for coordinate 238.75,-66.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0238.75_-66.25_n.dat
Downloading data for coordinate 241.25,-88.75 from https://climexp.knmi.n

Downloading data for coordinate 88.75,-76.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0088.75_-76.25_n.dat
Downloading data for coordinate 271.25,-66.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0271.25_-66.25_n.dat
Downloading data for coordinate 273.75,-88.75 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0273.75_-88.75_n.dat
Downloading data for coordinate 91.25,-76.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0091.25_-76.25_n.dat
Downloading data for coordinate 86.25,-53.75 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0086.25_-53.75_n.dat
Downloading data for coordinate 273.75,-66.25 from https://climexp.knmi.nl/data/igridcmip5_pr_Amon_mod_rcp26.3.someone@somewhere.info_0273.75_-66.25_n.dat
Downloading data for coordinate 276.25,-88.75 from https://climexp.knmi.n

## Preprocessing for raw data files

1. Download, parse, and save raw data

In [4]:
# Directory (relative to the notebook) that holds the raw downloaded files.
raw_data_dir = 'raw_data'
# Variable names for which per-variable datasets are built.
var_names = [
    'tas',
    'tasmin',
    'tasmax',
    'pr',
    'pme',
    'evspsbl',
]
# Column labels for the twelve monthly values of each parsed data row.
months = [
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
]

In [5]:
import os.path

# create data/ directory (no error if it already exists; fails early if
# 'data' exists but is not a directory)
os.makedirs('data', exist_ok=True)

def parse_model_file(filename: str) -> xr.DataArray:
    """
    Parse a per-model text file into a (years x months) DataArray.

    The file is expected to start with header lines beginning with '#'.
    Header lines of the form 'key :: value' are collected as metadata and
    attached to the result as attrs; other header lines are ignored. The
    remaining lines are whitespace-separated rows whose first column is the
    year and whose other columns are labelled with the module-level `months`.

    The array is named '<model_id>_<realization>' using the header metadata.

    Raises ValueError if the file has no data rows after the header, and
    KeyError if the header lacks 'model_id' or 'realization'.
    """
    def parse_header(lines: List[str]) -> Tuple[Dict[str, str], int]:
        """Return the header metadata and the index of the first non-header line."""
        metadata = dict()
        for (i, line) in enumerate(lines):
            # stop at end of header
            if not line.startswith('#'):
                return metadata, i
            # skip header lines that are not in key-value format
            if '::' not in line:
                continue
            # split on the first '::' only so a value may itself contain '::'
            key, value = line.replace('#', '').split('::', 1)
            metadata[key.strip()] = value.strip()
        # every line was a header line: there is no data section
        return metadata, len(lines)
    with open(filename) as f:
        lines = f.readlines()
    metadata, i = parse_header(lines)
    if i >= len(lines):
        raise ValueError('{} contains no data rows'.format(filename))
    csv_str = "".join(lines[i:])
    # sep=r'\s+' is the non-deprecated equivalent of delim_whitespace=True
    df = pd.read_csv(io.StringIO(csv_str), sep=r'\s+', header=None)
    years = df[0]
    df = df.drop(columns=[0])
    name = '{}_{}'.format(metadata['model_id'], metadata['realization'])
    xdarr = xr.DataArray(df, coords=[years, months], dims=['years', 'months'], attrs=metadata, name=name)
    return xdarr
    
def parse_model_grid_point_file(filename: str) -> Tuple[xr.DataArray, float, float]:
    """
    Parse the data file of a single grid point.

    The first line must contain an '=' followed by the longitude and latitude
    separated by whitespace. The first two lines are treated as header; all
    following lines are whitespace-separated rows whose first column is the
    year and whose other columns are labelled with the module-level `months`.

    Returns the (years x months) DataArray, named '<lon>_<lat>', together with
    the longitude and the latitude (in that order).

    Raises ValueError if the file has no data rows after the two header lines.
    """
    def parse_header(lines: List[str]) -> Tuple[float, float]:
        # parse first line (lon, lat coords, in that order)
        lon_str, lat_str = lines[0].split('=')[1].strip().split()
        return float(lon_str), float(lat_str)

    line_start = 2 ## 0,1 header lines
    with open(filename) as f:
        lines = f.readlines()
    if len(lines) <= line_start:
        raise ValueError('{} contains no data rows'.format(filename))
    lon, lat = parse_header(lines)
    csv_str = "".join(lines[line_start:])
    # sep=r'\s+' is the non-deprecated equivalent of delim_whitespace=True
    df = pd.read_csv(io.StringIO(csv_str), sep=r'\s+', header=None)
    years = df[0]
    df = df.drop(columns=[0])
    name = '{}_{}'.format(lon, lat)
    xdarr = xr.DataArray(df, coords=[years, months], dims=['years', 'months'], name=name)
    return xdarr, lon, lat

2. Collect and organize raw data to construct per-model datasets

In [6]:
# Utility functions

def center_monthly_means(xdarr: xr.DataArray) -> xr.DataArray:
    """
    Subtract the mean over the 'years' dimension from every value.

    The attrs of the input array are re-attached to the centered result.
    """
    original_attrs = xdarr.attrs
    centered = xdarr - xdarr.mean(dim='years', keep_attrs=True)
    centered.attrs = original_attrs
    return centered

def flatten_months(xdarr: xr.DataArray) -> xr.DataArray:
    """
    Stack the 'years' and 'months' dimensions of a 2-D array into a single
    'time' dimension and return the resulting 1-D array.
    """
    n_rows, n_cols = xdarr.shape
    stacked = xdarr.stack(time=('years','months'))
    # sanity checks: the result is 1-D and no element was lost
    assert len(stacked.shape) == 1
    assert stacked.shape[0] == n_rows * n_cols
    return stacked

In [7]:
def create_grid_data(exper: str, model: str, var: str):
    """
    Build a (time, lat, lon) DataArray from the grid point files of one
    experiment / model / variable.

    Every non-hidden file in ./<raw_data_dir>/<exper>/<model>/<var> is parsed,
    centered with center_monthly_means and flattened with flatten_months. The
    series are then arranged on the module-level `lats` x `lons` grid.

    Raises KeyError if a grid point of `lats` x `lons` has no file, and
    ValueError if the files do not all have the same number of time steps or
    no file was parsed at all.
    """
    dirname = './{}/{}/{}/{}'.format(raw_data_dir, exper, model, var)
    grid_data_index = dict()
    time_coords = None
    print('parsing grid data files...')
    for file in filter(lambda f: not f.startswith('.'), os.listdir(dirname)):
        grid_point_arr, lon, lat = parse_model_grid_point_file(os.path.join(dirname, file))
        grid_point_arr = center_monthly_means(grid_point_arr)
        grid_point_arr = flatten_months(grid_point_arr)
        if time_coords is None:
            time_coords = grid_point_arr.coords['time']
        elif len(grid_point_arr) != len(time_coords):
            # a ragged collection could not be reshaped into a regular grid below
            raise ValueError('{} has {} time steps, expected {}'.format(file, len(grid_point_arr), len(time_coords)))
        grid_data_index[(str(lat), str(lon))] = grid_point_arr.values
    print('collecting grid data...')
    missing = [(lat, lon) for lat in lats for lon in lons if (str(lat), str(lon)) not in grid_data_index]
    if missing:
        raise KeyError('{} grid point(s) missing in {}, first missing (lat, lon): {}'.format(len(missing), dirname, missing[0]))
    if time_coords is None:
        raise ValueError('no grid data files found in {}'.format(dirname))
    grid_data_arr = np.array([grid_data_index[(str(lat), str(lon))] for lat in lats for lon in lons])
    print('building data array...')
    # rows are ordered lat-major, so reshape to (lat, lon, time) and move time to the front
    grid_data_arr = grid_data_arr.reshape((N_lat, N_lon, len(time_coords)))
    grid_data_arr = np.transpose(grid_data_arr, (2, 0, 1))
    assert grid_data_arr.shape == (len(time_coords), N_lat, N_lon)
    return xr.DataArray(grid_data_arr, coords={'time': time_coords, 'lat': lats, 'lon': lons}, dims=['time','lat','lon'])

In [8]:
def create_agg_data_per_var(var_names):
    """
    For each variable name, parse every non-hidden file in
    ./<raw_data_dir>/<var> with parse_model_file and write the arrays,
    keyed by their name, as one dataset to ./data/<var>.nc.
    """
    for var in var_names:
        var_dir = './{}/{}'.format(raw_data_dir, var)
        visible_files = [f for f in os.listdir(var_dir) if not f.startswith('.')]
        parsed = (parse_model_file('{}/{}'.format(var_dir, f)) for f in visible_files)
        models = {arr.name: arr for arr in parsed}
        xr.Dataset(models).to_netcdf("./data/{}.nc".format(var))

In [9]:
experiment = "rcp26"
varname = "tas"
models = ['MPI-ESM-LR'] #['CCSM4', 'CESM1-CAM5', 'CanESM2', 'CNRM-CM5', 'MIROC5', 'MPI-ESM-LR']
for model in models:
    dirname = "./data/{0}/{1}/".format(experiment, model)
    filename = "{0}/{1}_{2}_grid.nc".format(dirname, experiment, model)
    # create the output directory only when needed, without a separate isdir() check
    os.makedirs(dirname, exist_ok=True)
    xdarr = create_grid_data(experiment, model, varname)
    # name the data variable after varname so it stays correct when varname is changed
    ds = xr.Dataset({varname: xdarr}).reset_index('time')
    print(ds)
    ds.to_netcdf(filename)
    # re-open the written file as a readability check and release the handle right away
    xr.open_dataset(filename).close()

parsing grid data files...
collecting grid data...
building data array...
<xarray.Dataset>
Dimensions:  (lat: 72, lon: 144, time: 2880)
Coordinates:
  * lat      (lat) float64 -88.75 -86.25 -83.75 -81.25 ... 83.75 86.25 88.75
  * lon      (lon) float64 1.25 3.75 6.25 8.75 11.25 ... 351.2 353.8 356.2 358.8
    years    (time) int64 1861 1861 1861 1861 1861 ... 2100 2100 2100 2100 2100
    months   (time) object 'Jan' 'Feb' 'Mar' 'Apr' ... 'Sep' 'Oct' 'Nov' 'Dec'
Dimensions without coordinates: time
Data variables:
    tas      (time, lat, lon) float64 0.04984 0.03539 0.02184 ... 3.804 3.894


In [66]:
n_time_steps = len(xdarr.coords['time'])
# The 'time' MultiIndex holds (year, month abbreviation) pairs. Month labels are
# strings, so they are mapped to their 0-based position in `months` before
# computing a fractional year (Jan -> year + 1/12, ..., Dec -> year + 1).
new_time_index = xr.DataArray(
    [y + (months.index(m) + 1) / 12 for (y, m) in xdarr.indexes['time']],
    dims=['time'],
)
assert len(new_time_index) == n_time_steps
print(xdarr.coords['time'])
# Drop the MultiIndex and attach the fractional years as the new 'time' coordinate.
print(xdarr.reset_index('time', drop=True).assign_coords(time=new_time_index))

<xarray.DataArray 'time' (time: 2880)>
array([(1861, 'Jan'), (1861, 'Feb'), (1861, 'Mar'), ..., (2100, 'Oct'),
       (2100, 'Nov'), (2100, 'Dec')], dtype=object)
Coordinates:
  * time     (time) MultiIndex
  - years    (time) int64 1861 1861 1861 1861 1861 ... 1863 1863 1863 1863 1863
  - months   (time) object 'Jan' 'Feb' 'Mar' 'Apr' ... 'Mar' 'Apr' 'May' 'Jun'
<xarray.DataArray (time: 2880, lat: 72, lon: 144)>
array([[[ 4.984500e-02,  3.539417e-02, ...,  6.735958e-02,  6.064958e-02],
        [ 2.517675e-01,  1.811475e-01, ...,  2.654975e-01,  2.798563e-01],
        ...,
        [ 1.725181e+00,  1.797913e+00, ...,  1.512133e+00,  1.629881e+00],
        [ 1.813504e-01,  1.649842e-01, ...,  1.489450e-01,  1.758250e-01]],

       [[-1.274082e+00, -1.231269e+00, ..., -1.357010e+00, -1.315867e+00],
        [ 1.820671e-01,  2.898679e-01, ..., -7.035500e-02,  6.193542e-02],
        ...,
        [-7.221208e+00, -7.289993e+00, ..., -7.077008e+00, -7.150156e+00],
        [-5.880786e+00, -5.907

In [11]:
# Open the per-variable datasets written by create_agg_data_per_var.
var_datasets = dict()
for var in var_names:
    xds = xr.open_dataset('data/{}.nc'.format(var))
    var_datasets[var] = xds

# Models that are present for every variable. The running set starts from the
# first dataset and is only ever narrowed; testing for "empty" to detect the
# first iteration would silently restart from a later dataset whenever the
# intersection became empty.
common_models = None
for var, ds in var_datasets.items():
    model_keys = set(ds.data_vars.keys())
    common_models = model_keys if common_models is None else common_models & model_keys
if not common_models:
    raise ValueError('no model is available for all of: {}'.format(', '.join(var_names)))
model_names = sorted(common_models)

var_data = dict()
for var, ds in var_datasets.items():
    xs = []
    print('processing model data for {}'.format(var))
    for model in model_names:
        xdarr = ds.data_vars[model]
        # fill NaNs
        xdarr = xdarr.ffill(dim='years')
        xdarr = xdarr.bfill(dim='years')
        xdarr = center_monthly_means(xdarr)
        xdarr = flatten_months(xdarr)
        xs.append(xdarr)
    print('building data array for {}'.format(var))
    var_dr = xr.DataArray(xs, [('models', model_names),('time', xs[0].indexes['time'])])
    var_data[var] = var_dr

print('building dataset for all variables')
model_time_var_ds = xr.Dataset(var_data)
print(model_time_var_ds)
model_time_var_ds = model_time_var_ds.reset_index('time')
print(model_time_var_ds)
model_time_var_ds.to_netcdf('./data/{}.nc'.format('models_all_vars_vs_time'))

processing model data for tas
building data array for tas
processing model data for tasmin
building data array for tasmin
processing model data for tasmax
building data array for tasmax
processing model data for pr
building data array for pr
processing model data for pme
building data array for pme
processing model data for evspsbl
building data array for evspsbl
building dataset for all variables
<xarray.Dataset>
Dimensions:  (models: 40, time: 2880)
Coordinates:
  * models   (models) <U16 'CCSM4_1' 'CCSM4_2' ... 'MRI-CGCM3_1' 'NorESM1-M_1'
  * time     (time) MultiIndex
  - years    (time) int64 1861 1861 1861 1861 1861 ... 1863 1863 1863 1863 1863
  - months   (time) object 'Jan' 'Feb' 'Mar' 'Apr' ... 'Mar' 'Apr' 'May' 'Jun'
Data variables:
    tas      (models, time) float64 -1.232 -0.9981 -0.3392 ... 1.091 1.176
    tasmin   (models, time) float64 -0.8222 -3.135 -0.5128 ... 1.486 0.5544
    tasmax   (models, time) float64 -0.5729 -2.731 -0.5294 ... 1.633 -0.1004
    pr       (mode

In [5]:
from sklearn.decomposition import PCA
from sklearn.manifold import SpectralEmbedding, TSNE
import matplotlib.pyplot as plt

In [6]:
from dtw import dtw, accelerated_dtw
from typing import Callable
from itertools import product

def _pardtw(params):
    """
    Pool worker: unpack one (x_i, x_j, metric) task, run accelerated_dtw on it
    and return only the first of its four results.
    """
    series_a, series_b, metric_name = params
    distance, _cost, _acc_cost, _path = accelerated_dtw(series_a, series_b, metric_name)
    return distance

def pdtw(X, metric: str, verbose: bool = False, n_procs: int = 4) -> np.ndarray:
    """
    Returns the matrix of pairwise DTW distances between the entries of a
    3-D tensor X, where the second dim of X is time.
    X : data tensor; X[i] is passed to DTW as one series
    metric : metric name to use for DTW (see scipy cdist)
    verbose : if True, print how many pairs are computed
    n_procs : number of worker processes (defaults to the previous fixed value of 4)

    The result has shape (n, n) with n = X.shape[0]; entry [i, j] is the DTW
    distance between X[i] and X[j]. All n*n ordered pairs are evaluated.
    """
    # unpacking also checks that X is 3-dimensional
    n, _, _ = X.shape
    tasks = [(X[i], X[j], metric) for i in range(n) for j in range(n)]
    if verbose:
        print('computing {} pairwise DTW distances using {} processes'.format(len(tasks), n_procs))
    # the with-block shuts the worker processes down once the map has finished
    with Pool(n_procs) as pool:
        results = pool.map(_pardtw, tasks)
    return np.array(results).reshape((n,n))

In [8]:
# Load the combined dataset and turn its data variables into one array with a
# 'variable' dimension, ordered as (models, time, variable).
ds = xr.open_dataset('data/models_all_vars_vs_time.nc')
axis_order = ('models', 'time', 'variable')
X_ds = ds.to_array().transpose(*axis_order)
print(X_ds)

<xarray.DataArray (models: 40, time: 2880, variable: 6)>
array([[[ -1.232390e+00,  -8.221712e-01, ...,  -1.762584e-07,  -3.691904e-07],
        [ -9.980754e-01,  -3.134700e+00, ...,  -3.496395e-06,   2.935993e-07],
        ..., 
        [  1.093070e+00,   9.386808e-01, ...,  -3.006784e-06,   2.310148e-06],
        [  1.049638e+00,   1.553081e+00, ...,   5.685991e-06,   2.558547e-06]],

       [[ -6.226550e-01,  -2.455808e-01, ...,   1.469825e-06,   5.744220e-07],
        [ -8.767850e-01,   5.197417e-01, ...,  -7.336508e-06,  -3.874350e-08],
        ..., 
        [  9.840863e-01,   1.147870e+00, ...,  -1.667757e-06,   2.476872e-07],
        [  3.382746e-01,   7.926292e-01, ...,  -5.932489e-06,   3.007603e-06]],

       ..., 
       [[ -1.374901e+00,  -1.929253e+00, ...,   1.195963e-05,  -6.273888e-06],
        [ -5.095788e-01,  -3.281883e+00, ...,   7.633096e-06,   1.095362e-06],
        ..., 
        [  4.450033e-01,   8.567608e-01, ...,  -1.969318e-06,   2.737182e-06],
        [  5.74

In [None]:
# Pairwise DTW distances between all models.
D_x = pdtw(X_ds, 'euclidean')
print(D_x.shape)
print(D_x)
# np.save takes the target file first and the array second. Store the distance
# matrix (not the input data) in the notebook's relative data/ directory, like
# every other output of this notebook.
np.save('./data/dtw.npy', D_x)