# Time Analysis

* Time analysis of longrunmip data
* Number of models for each experiment.
This notebook contains an exploration of the timesteps in the longrunmip data.
This notebook also contains a time override. The models in my study all need to have the same timestamps - years since initialisation. The easiest way to achieve this is to override all timestamps and add new ones back in, depending on whether the data is annual or monthly (ann or mon in the file name). 

In [1]:
import os, sys, warnings, cftime
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot 
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')
sys.path.append(os.path.join(os.getcwd(), 'Documents', 'PhD'))
import constants
sys.path.append(constants.MODULE_DIR)
from importlib import reload
from time import perf_counter
from typing import List
import re
import xarray_class_accessors as xca
import utils
import open_ds

from pprint import pprint

# Files

In [2]:
def remove_other_variables(ds, variable):
    """
    Reduce a dataset to the single data variable called ``variable``.

    If the dataset has no variable with that name, the function tries to work
    out which existing variable is the intended one, renames it to
    ``variable`` and then selects it:

    * exactly one data variable present -> that one is renamed;
    * several present -> the known auxiliary variables ('time_bnds',
      'height', 'TAREA') are ignored and the single remaining one is renamed.

    Args:
        ds: Dataset-like object supporting ``ds[[name]]``, ``ds.data_vars``
            and ``ds.rename({old: new})``.
        variable: Name the retained data variable must have.

    Returns:
        The dataset containing only ``variable``.

    Raises:
        ValueError: If the variable to rename cannot be identified
            unambiguously.
    """
    try:
        return ds[[variable]]
    except KeyError:
        # The expected name is missing, so fall back to guessing which
        # variable is the real one.
        print('| Variable not found in dataset checing for other possible names')
        present = np.array(ds.data_vars)

        if len(present) == 1:
            # A lone variable has to be the one we are after.
            current_name = present[0]
        else:
            auxiliary = ['time_bnds', 'height', 'TAREA']
            possible_name = present[~np.isin(present, auxiliary)]

            if len(possible_name) != 1:
                raise ValueError(f'Cant automate name change {possible_name=}')

            current_name = possible_name[0]

        renamed = ds.rename({current_name: variable})
        return renamed[[variable]]

In [3]:
def refactor_longronmip_netcdf(ds:xr.Dataset, variable:str, months: List[int] = None,
                               fname: str = None) -> xr.Dataset:
    '''
    Refactor a longrunmip netCDF dataset by modifying its dimensions and time coordinates.

    Steps, in order:
      1. Pass the dataset through ``open_ds.refactor_dims``.
      2. Replace the time coordinate with the output of ``open_ds.make_new_time``,
         using a monthly ('M') or yearly ('Y') frequency inferred from the file name.
      3. For monthly data only: optionally keep just the requested months, then
         resample to yearly values (mean or sum depending on ``variable``).

    Args:
        ds: The xarray dataset to be refactored.
        variable: The variable that is being used. This decides whether the yearly
            resampling of monthly data is a mean or a sum.
        months: A list of month values to subset the dataset. Ignored for annual files.
        fname: Name of the file ``ds`` was read from. Only used to infer the
            frequency: a name containing 'mon' inside a word (e.g. 'tas_mon_...')
            is treated as monthly, anything else as annual. If omitted, the
            notebook-global ``fname`` is used, which keeps existing calls working.

    Returns:
        The refactored xarray dataset.

    Raises:
        TypeError: If no file name is available, or if the data is monthly and
            ``variable`` is in neither the mean nor the sum list.
    '''
    MEAN_VARIABLES = ['tas', 'tos', 'sic', 'netTOA', 'surf']
    SUM_VARIABLES = ['pr']

    if fname is None:
        # Backward compatibility: earlier versions silently relied on a global
        # `fname` set by the calling loop. Prefer passing it explicitly.
        fname = globals().get('fname')
    if fname is None:
        raise TypeError('fname is required to infer the file frequency (monthly or annual)')

    print('\n| Refactoring dims |', end='')
    ds = open_ds.refactor_dims(ds)

    # Monthly files carry 'mon' in their name; everything else is treated as annual.
    freq = 'M' if re.search(r'\w+mon\w+', fname) else 'Y'
    print(f' | freq={freq}', end='')
    ds['time'] = open_ds.make_new_time(ds, freq=freq, debug=False)

    if months and freq == 'M':
        print(f' | Subsetting for months {months}')
        print(f' | Original length = {len(ds.time.values)}')
        # Keep only the timesteps that fall in the requested months.
        ds = ds.where(ds.time.dt.month.isin(months), drop=True)
        print(f' | New length = {len(ds.time.values)}')

    if freq == 'M':
        print(' | Resampling with resample=True')
        print(f' | Original length = {len(ds.time.values)}', end='')
        if variable in MEAN_VARIABLES:
            print(' - Resampling with MEAN')
            ds = ds.resample(time='Y').mean()
        elif variable in SUM_VARIABLES:
            print(' - +Resampling with SUM')
            ds = ds.resample(time='Y').sum()
        else:
            raise TypeError('Variable is not in list of variables. Please define to method')
        print(f' | New length = {len(ds.time.values)}')

    return ds

In [4]:
# Show what is available directly under the longrunmip root directory.
os.listdir(constants.LONGRUNMIP_DIR)

['tas',
 'surf',
 'netTOA',
 'sic_aso',
 'sic_fma',
 'zarr_test',
 'landesea_masks',
 'tos',
 'pr',
 '.ipynb_checkpoints',
 'psl',
 'sic']

In [5]:
# Variable to process in this run. Input files are read from the variable's
# 'regrid' sub-directory.
variable = 'surf' # Completed: pr, tas, sic, surf, tos, psl, netTOA
ROOT_DIR = os.path.join(constants.LONGRUNMIP_DIR, variable, 'regrid')

In [6]:
# All input files found for the chosen variable.
files = os.listdir(ROOT_DIR)
utils.pprint_list(files)

length = 17
0. surf_ann_HadCM3L_control_1000_g025.nc
1. surf_ann_CCSM3_control_1530_g025.nc


In [7]:
# Optional seasonal subset. Each group name maps to the calendar months it
# covers; None means "no subsetting".
MONTH_GROUP_MAPPING = {
    'fma': [2,3,4],
    'aso': [8, 9, 10],
    None: None,
}
month_group = None  # e.g. 'aso'
months = MONTH_GROUP_MAPPING[month_group]
print(months)

None


In [8]:
# Output location for the re-timestamped files of this variable.
OUTPUT_DIR = os.path.join(constants.LONGRUNMIP_DIR, variable, 'regrid_retimestamped')
utils.mkdir_no_error(OUTPUT_DIR)
# When a month group is selected, write into a sub-directory named after it.
# The parent directory is created first (above), then the sub-directory.
if month_group:
    OUTPUT_DIR = os.path.join(OUTPUT_DIR, month_group)
    utils.mkdir_no_error(OUTPUT_DIR)
OUTPUT_DIR

'/g/data/w40/ab2313/PhD/longrunmip/surf/regrid_retimestamped'

In [9]:
# Report what is already present in the output directory.
utils.pprint_list(os.listdir(OUTPUT_DIR), num_start_items=0)

length = 17



In [20]:
# Restrict the inputs to the abrupt4x and control experiments, sorted by name.
files = np.sort([
    name for name in files
    if any(experiment in name for experiment in ('abrupt4x', 'control'))
])
utils.pprint_list(files, num_start_items=len(files))

length = 17
0. surf_ann_CCSM3_abrupt4x_2130_g025.nc
1. surf_ann_CCSM3_control_1530_g025.nc
2. surf_ann_CESM104_abrupt4x_5900_g025.nc
3. surf_ann_CESM104_control_1000_g025.nc
4. surf_ann_CNRMCM61_abrupt4x_1850_g025.nc
5. surf_ann_CNRMCM61_control_2000_g025.nc
6. surf_ann_ECHAM5MPIOM_control_100_g025.nc
7. surf_ann_FAMOUS_abrupt4x_3000_g025.nc
8. surf_ann_FAMOUS_control_3000_g025.nc
9. surf_ann_GFDLCM3_control_5200_g025.nc
10. surf_ann_GFDLESM2M_control_1340_g025.nc
11. surf_ann_HadCM3L_abrupt4x_1000_g025.nc
12. surf_ann_HadCM3L_control_1000_g025.nc
13. surf_ann_MPIESM11_abrupt4x_4520_g025.nc
14. surf_ann_MPIESM11_control_2000_g025.nc
15. surf_ann_MPIESM12_abrupt4x_1000_g025.nc
16. surf_ann_MPIESM12_control_1500_g025.nc


In [21]:
# Special files that may have failed.
# These are checked in http://localhost:8888/notebooks/Documents/PhD/longrumip_00_07_data_opening_check.ipynb
# 'tas_mon_CNRMCM61_control_2000_g025.nc', 'tas_mon_IPSLCM5A_abrupt4x_1000_g025.nc' 
# 'surf_ann_CESM104_abrupt4x_5900_g025.nc', 'surf_ann_CESM104_abrupt4x_5900_g025.nc'
# files = ['sic_mon_CESM104_abrupt4x_5900_g02.nc']

In [22]:
# Files already written to the output directory by earlier runs.
already_completed = os.listdir(OUTPUT_DIR)
utils.pprint_list(already_completed, num_end_items=2)

length = 17
0. surf_ann_HadCM3L_control_1000_g025.nc
1. surf_ann_CCSM3_control_1530_g025.nc
...
-1. surf_ann_FAMOUS_abrupt4x_3000_g025.nc
-2. surf_ann_CNRMCM61_abrupt4x_1850_g025.nc


In [23]:
# Work list: abrupt4x/control inputs that have no output file yet.
files_to_complete = [
    name for name in files
    if name not in already_completed
    and ('abrupt4x' in name or 'control' in name)
]
utils.pprint_list(files_to_complete, num_start_items=5)

length = 0



In [24]:
# Manual override of the list computed above: re-process only this single file.
# NOTE(review): remove or comment out this cell to process every pending file.
files_to_complete = ['surf_ann_CESM104_control_1000_g025.nc']

In [25]:
# Chunking passed to xr.open_dataset: time is kept as a single chunk (-1),
# lat and lon are split into blocks of 36 and 12 points.
# Integer division keeps the sizes as ints; `/` would give the floats 36.0 and 12.0.
chunks = {'time': -1, 'lat': 72 // 2, 'lon': 144 // 12}

In [26]:
# Re-timestamp every pending file and write the result to OUTPUT_DIR.
# Failures are collected in `error_log` (file name -> exception) so that one
# bad file does not stop the remaining ones from being processed.
error_log = {}
for i, fname in enumerate(files_to_complete):
    # NOTE: refactor_longronmip_netcdf looks up the notebook-global `fname` to
    # infer the file frequency, so the loop variable must keep this name.

    t1 = perf_counter()
    print(f'{i+1}: - {fname}', end='')
    fpath = os.path.join(ROOT_DIR, fname)

    try:
        try:
            ds = xr.open_dataset(fpath, use_cftime=True, chunks=chunks)
            ds = remove_other_variables(ds, variable)
            ds = refactor_longronmip_netcdf(ds, variable=variable, months=months)
        except ValueError:
            # First attempt failed (presumably the time axis could not be
            # decoded - TODO confirm). Retry without decoding times; the time
            # coordinate is replaced during refactoring.
            ds = xr.open_dataset(fpath, decode_times=False, chunks=chunks)
            ds = remove_other_variables(ds, variable)
            ds = refactor_longronmip_netcdf(ds, variable=variable, months=months)
    except (ValueError, OSError, KeyError, FileNotFoundError) as e:
        # Also covers errors raised by the retry above.
        print(f'\nERROR: {e}\n')
        error_log[fname] = e
        # Skip the write: `ds` is either undefined or still holds the previous
        # file's data, which must not be saved under this file's name.
        continue

    try:
        ds.to_netcdf(os.path.join(OUTPUT_DIR, fname))
        t2 = perf_counter()
        print(f'| complete ({t2-t1})')
    except PermissionError as e:
        print(f'\nERROR: {e}\n')
        error_log[fname] = e

1: - surf_ann_CESM104_control_1000_g025.nc
| Refactoring dims | | freq=Y| complete (4.100634130649269)


In [None]:
# Files that failed in the loop above, mapped to the exception that was raised.
error_log

In [None]:
# Final check of what the output directory now contains.
file_check = os.listdir(OUTPUT_DIR)
utils.pprint_list(file_check)

Failed files <br> 
<b> Pr </b>
* <s> 'pr_mon_CESM104_abrupt4x_5900_g025.nc' </s>

<b> tas </b>
* 'tas_mon_GISSE2R_control_5225_g025.nc'

<b> surf </b>

length = 1 <br>
 0. <s> surf_ann_CESM104_control_1000_g025.nc </s>
 
<b> tos </b>

length = 1 <br>
0. <s> tos_ann_CESM104_control_1000_g025.nc </s>

<b> netTOA </b>




<b> sic </b>

