In [5]:
import os
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt

from dateutil.relativedelta import relativedelta
from my_functions import read_obsfcstana_extend_datetime

In [6]:
expt_name = 'LS_DAv8_M36'

start_date = datetime(2020, 1, 2)
end_date = datetime(2020, 1, 6)

start_date_str = start_date.strftime('%Y%m%d')
end_date_str = end_date.strftime('%Y%m%d')

In [7]:
# Initialize arrays
max_tilenum = 112573
max_speciesnum = 13

obs_cnt  = np.zeros((max_tilenum + 1, max_speciesnum + 1))
obs_sum  = np.zeros((max_tilenum + 1, max_speciesnum + 1))
obs2_sum = np.zeros((max_tilenum + 1, max_speciesnum + 1))
fcst_sum  = np.zeros((max_tilenum + 1, max_speciesnum + 1))
fcst2_sum = np.zeros((max_tilenum + 1, max_speciesnum + 1))
ana_sum  = np.zeros((max_tilenum + 1, max_speciesnum + 1))
ana2_sum = np.zeros((max_tilenum + 1, max_speciesnum + 1))
omf_sum  = np.zeros((max_tilenum + 1, max_speciesnum + 1))
omf2_sum = np.zeros((max_tilenum + 1, max_speciesnum + 1))
oma_sum  = np.zeros((max_tilenum + 1, max_speciesnum + 1))
oma2_sum = np.zeros((max_tilenum + 1, max_speciesnum + 1))

In [8]:
# Calculate the daily statistics in observation space

# Define the path directory
path = f'/Users/amfox/Desktop/GEOSldas_diagnostics/test_data/land_sweeper/{expt_name}/output/SMAP_EASEv2_M36_GLOBAL/ana/ens_avg'

# Define the common file name start
file_name_start = f'{expt_name}.ens_avg.ldas_ObsFcstAna.'

# Define the print flag
printflag = False

# Loop over the dates
current_date = start_date

while current_date <= end_date:
    # Define the file name for the current date
    file_name = file_name_start + '202001' # current_date.strftime('%Y%m%d')
    
    # Call the read_obsfcstana function for the current file
    date_time, species, tilenum, lon, lat, obs, obsvar, fcst, fcstvar, ana, anavar = read_obsfcstana_extend_datetime(path, file_name, printflag)

    # Increment the current date by one day
    current_date += timedelta(days=1) 

    # Convert to list of datetime objects
    datetime_list = [
        datetime(
            int(entry['year'][0]),
            int(entry['month'][0]),
            int(entry['day'][0]),
            int(entry['hour'][0]),
            int(entry['min'][0]),
            int(entry['sec'][0])
        )
        for entry in date_time
    ]

    # Convert to numpy array of datetime objects
    datetime_array = np.array(datetime_list)

    # Calculate the difference between the observation and forecast and observation and analysis
    omf = obs - fcst
    oma = obs - ana 

    # Find unique species values and their number
    unique_species, counts = np.unique(species, return_counts=True)
    num_unique_species = len(unique_species)

    # Find unique tilenum values
    unique_tilenum = np.unique(tilenum)

    # Find the number of unique tilenum values
    num_unique_tilenum = len(unique_tilenum)

    # Print the number of unique tilenum values
    print(f"Number of unique tilenum values: {num_unique_tilenum}")

    # Sort the arrays based on tilenum
    sort_indices = np.argsort(tilenum)
    sorted_tilenum = tilenum[sort_indices]
    sorted_species = species[sort_indices]
    sorted_obs = obs[sort_indices]
    sorted_fcst = fcst[sort_indices]
    sorted_ana = ana[sort_indices]
    sorted_omf = omf[sort_indices]
    sorted_oma = oma[sort_indices]
    sorted_datetime_array = datetime_array[sort_indices]

    # Find the unique tilenum values and their counts
    unique_tilenum, counts = np.unique(sorted_tilenum, return_counts=True)

    # Calculate the indices where the groups should be split
    split_indices = np.cumsum(counts)[:-1]

    # Split the sorted arrays based on the split indices
    tilenum_tile = np.split(sorted_tilenum, split_indices)
    species_tile = np.split(sorted_species, split_indices)
    obs_tile = np.split(sorted_obs, split_indices)
    fcst_tile = np.split(sorted_fcst, split_indices)
    ana_tile = np.split(sorted_ana, split_indices)
    omf_tile = np.split(sorted_omf, split_indices)
    oma_tile = np.split(sorted_oma, split_indices)
    datetime_tile = np.split(sorted_datetime_array, split_indices)

    # Loop over the unique tiles

    for i in range(num_unique_tilenum):
        tc = int(tilenum_tile[i][0])  # Current tile number

        # Create a dictionary to store indices for each species in the current tile
        species_indices_dict = {sc: np.where(species_tile[i] == sc)[0] for sc in unique_species}

        for sc in unique_species:
            species_indices = species_indices_dict[sc]

            if len(species_indices) > 0:
                sc = int(sc)  # Current species number
                obs_cnt[tc, sc] += len(species_indices)
                obs_sum[tc, sc] += np.sum(obs_tile[i][species_indices])
                obs2_sum[tc, sc] += np.sum(obs_tile[i][species_indices]**2)
                fcst_sum[tc, sc] += np.sum(fcst_tile[i][species_indices])
                fcst2_sum[tc, sc] += np.sum(fcst_tile[i][species_indices]**2)
                ana_sum[tc, sc] += np.sum(ana_tile[i][species_indices])
                ana2_sum[tc, sc] += np.sum(ana_tile[i][species_indices]**2)
                omf_sum[tc, sc] += np.sum(omf_tile[i][species_indices])
                omf2_sum[tc, sc] += np.sum(omf_tile[i][species_indices]**2)
                oma_sum[tc, sc] += np.sum(oma_tile[i][species_indices])
                oma2_sum[tc, sc] += np.sum(oma_tile[i][species_indices]**2)

    current_date += relativedelta(months=1)


Number of unique tilenum values: 104973


In [9]:
# Calculate the mean and standard deviation in observation space

obs_mean = np.zeros_like(obs_cnt)
obs_var = np.zeros_like(obs_cnt)
obs_std = np.zeros_like(obs_cnt)
fcst_mean = np.zeros_like(obs_cnt)
fcst_var = np.zeros_like(obs_cnt)
fcst_std = np.zeros_like(obs_cnt)
ana_mean = np.zeros_like(obs_cnt)
ana_var = np.zeros_like(obs_cnt)
ana_std = np.zeros_like(obs_cnt)
omf_mean = np.zeros_like(obs_cnt)
omf_var = np.zeros_like(obs_cnt)
omf_std = np.zeros_like(obs_cnt)
oma_mean = np.zeros_like(obs_cnt)
oma_var = np.zeros_like(obs_cnt)
oma_std = np.zeros_like(obs_cnt)

# Avoid division by zero
valid_mask = obs_cnt > 1

# Calculate the mean only for valid entries
obs_mean[valid_mask] = obs_sum[valid_mask] / obs_cnt[valid_mask]
fcst_mean[valid_mask] = fcst_sum[valid_mask] / obs_cnt[valid_mask]
ana_mean[valid_mask] = ana_sum[valid_mask] / obs_cnt[valid_mask]
omf_mean[valid_mask] = omf_sum[valid_mask] / obs_cnt[valid_mask]
oma_mean[valid_mask] = oma_sum[valid_mask] / obs_cnt[valid_mask]

# Calculate variance using the MATLAB approach for valid entries
obs_var[valid_mask] = (obs2_sum[valid_mask] - obs_cnt[valid_mask] * obs_mean[valid_mask]**2) / (obs_cnt[valid_mask] - 1)
fcst_var[valid_mask] = (fcst2_sum[valid_mask] - obs_cnt[valid_mask] * fcst_mean[valid_mask]**2) / (obs_cnt[valid_mask] - 1)
ana_var[valid_mask] = (ana2_sum[valid_mask] - obs_cnt[valid_mask] * ana_mean[valid_mask]**2) / (obs_cnt[valid_mask] - 1)
omf_var[valid_mask] = (omf2_sum[valid_mask] - obs_cnt[valid_mask] * omf_mean[valid_mask]**2) / (obs_cnt[valid_mask] - 1)
oma_var[valid_mask] = (oma2_sum[valid_mask] - obs_cnt[valid_mask] * oma_mean[valid_mask]**2) / (obs_cnt[valid_mask] - 1)

# Calculate the standard deviation
obs_std[valid_mask] = np.sqrt(obs_var[valid_mask])
fcst_std[valid_mask] = np.sqrt(fcst_var[valid_mask])
ana_std[valid_mask] = np.sqrt(ana_var[valid_mask])
omf_std[valid_mask] = np.sqrt(omf_var[valid_mask])
oma_std[valid_mask] = np.sqrt(oma_var[valid_mask])


In [10]:
# Save all output into one file using experiment name, start and end date
output_filename = f"{expt_name}_{start_date_str}_{end_date_str}_obsfcstana_stats.npz"

np.savez(
    output_filename,
    obs_mean=obs_mean,
    obs_std=obs_std,
    fcst_mean=fcst_mean,
    fcst_std=fcst_std,
    ana_mean=ana_mean,
    ana_std=ana_std,
    omf_mean=omf_mean,
    omf_std=omf_std,
    oma_mean=oma_mean,
    oma_std=oma_std
)

print(f"Output saved to {output_filename}")

Output saved to LS_DAv8_M36_20200102_20200106_obsfcstana_stats.npz
