## Input Parameters

In [1]:
# Point filters applied when building the list of points to process
# An empty list disables that filter, so this keeps NLCD 82 points for all MGRS grid zones
point_nlcd_keep_list = [82]
point_mgrs_keep_list = []

# The minimum number of months in the target year
min_month_count = 3

# Overwrite the summary stats file every time this block is run
# The context manager closes the file even if one of the writes raises
output_txt = 'summary_stats_nlcd82.txt'
with open(output_txt, 'w') as output_f:
    output_f.write('MGRS: '+ ', '.join(point_mgrs_keep_list) + '\n')
    output_f.write('NLCD: '+ ', '.join(map(str, point_nlcd_keep_list)) + '\n\n')

## Other Parameters

In [2]:
# Folders holding the per-zone data CSV files and the per-zone points CSV files
data_folder = 'data'
points_folder = 'points'

# File name for the combined points CSV
points_csv = 'gap_fill_test_points.csv'

# Target years for the statistics (2017 through 2024)
# 2016 is excluded since there is not a full prior year to interpolate from
# 2024 is included even though 2025 is not complete
stats_years = [year for year in range(2017, 2025)]

# Points were only built for NLCD 2024
nlcd_years = [2024]

# MGRS grid zones to process, grouped by longitude zone number
# 12R and 16U are left out since they are too small
mgrs_zones = (
    ['10S', '10T', '10U']
    + ['11S', '11T', '11U']
    + ['12S', '12T', '12U']
    + ['13R', '13S', '13T', '13U']
    + ['14R', '14S', '14T', '14U']
    + ['15R', '15S', '15T', '15U']
    + ['16R', '16S', '16T']
    + ['17R', '17S', '17T']
    + ['18S', '18T']
    + ['19T']
)

# Calendar months, January (1) through December (12)
months = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

# TODO: Add support for setting a minimum number of months in the year
#   and minimum number of months in the growing season
# min_month_count = 6
# min_gs_month_count = 3


## Python Imports

In [3]:
from datetime import datetime
import math
import os
import pprint
import random

#import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
#import seaborn as sns
import sklearn.metrics
import sklearn.linear_model

from whittaker_eilers import WhittakerSmoother

## Read the point CSV files

In [4]:
# Build one points dataframe from the per-zone, per-NLCD-year points CSV files
overwrite_flag = False

# Candidate CSV paths, ordered by NLCD year and then by MGRS grid zone
points_csv_paths = [
    os.path.join(points_folder, f'points_{mgrs_zone}_{nlcd_year}.csv')
    for nlcd_year in nlcd_years
    for mgrs_zone in mgrs_zones
]

# Read every CSV that exists and stack them into a single dataframe
points_df_list = [
    pd.read_csv(csv_path, index_col=None, header=0)
    for csv_path in points_csv_paths
    if os.path.isfile(csv_path)
]
points_df = pd.concat(points_df_list, axis=0, ignore_index=True)
print(f'Points: {len(points_df.index)}')

# The mgrs_zone value will eventually be added to the csv files
# For now take it from the first three characters of the tile name
points_df['mgrs_zone'] = points_df['mgrs_tile'].str.slice(0, 3)

# Build a unique point ID of the form <TILE>_nlcd<NN>_<NN>, where the last
# field is a running count within each (tile, NLCD class) group
group_index = points_df.groupby(['mgrs_tile', 'nlcd']).cumcount()
points_df['point_id'] = (
    points_df['mgrs_tile'].str.upper()
    + '_nlcd' + points_df['nlcd'].astype(str).str.zfill(2)
    + '_' + group_index.astype(str).str.zfill(2)
)

# Round the lat and lon to 8 decimal places (probably should be 6)
for coord_col in ['latitude', 'longitude']:
    points_df[coord_col] = points_df[coord_col].round(8)

# # Write to CSV
# if not os.path.isfile(points_csv) or overwrite_flag:
#     print('Writing points csv')
#     points_df.to_csv(points_csv, index=False)


Points: 17084


## Read the data CSV files

This block may take a little while to execute

In [5]:
# Read the zone CSV files and split them into one dataframe per point
# data_df_dict maps point_id -> dataframe indexed by date (sorted ascending)
print('Reading mgrs data csv files')
data_df_dict = {}
for mgrs_zone in mgrs_zones:
    # print(mgrs_zone)
    data_csv = os.path.join(data_folder, f'data_{mgrs_zone}.csv')
    if not os.path.isfile(data_csv):
        # Zones without a data file are silently skipped
        continue

    mgrs_df = pd.read_csv(data_csv, index_col=None, header=0)

    # Set MGRS value to upper case
    # (at some point change this in all the data CSV files)
    mgrs_df['mgrs_tile'] = mgrs_df['mgrs_tile'].str.upper()
    mgrs_df['mgrs_zone'] = mgrs_df['mgrs_zone'].str.upper()

    # Compute the ET fraction
    mgrs_df['etf'] = mgrs_df['et'] / mgrs_df['eto']

    # Get the year and month for computing climos
    mgrs_df['date'] = pd.to_datetime(mgrs_df['date'])
    mgrs_df['year'] = mgrs_df['date'].dt.year
    mgrs_df['month'] = mgrs_df['date'].dt.month

    # Confirm that specific NLCD categories are not included
    # TODO: This probably isn't needed and switch to a check instead of masking
    mgrs_df = mgrs_df[~mgrs_df['nlcd'].isin([11, 12, 21, 22, 23])]

    # Save a date-indexed dataframe for each point
    # A single groupby pass replaces re-scanning the whole zone dataframe once per point;
    #   sort=False keeps the points in order of first appearance in the file
    # Rows with a missing point_id are not assigned to any point (groupby drops NaN keys)
    for point_id, site_df in mgrs_df.groupby('point_id', sort=False):
        data_df_dict[point_id] = site_df.set_index('date', drop=True).sort_index()
    del mgrs_df

print('\nDone')

Reading mgrs data csv files

Done


## Compute the ETf climos

In [6]:
# Largest ETf observed at each site
# Assuming it is okay to make this for the full period of record
print('\nComputing maximum ETf')
etf_max_dict = {
    point_id: float(site_df['etf'].max())
    for point_id, site_df in data_df_dict.items()
}

# Monthly climatology for each site (one row per calendar month)
# A month's ETf mean/median and ET mean are blanked out unless at least
#   "month_climo_count_min" ETf values went into them
print('\nComputing monthly climatologies')
month_climo_count_min = 2
climo_aggregations = {
    'etf': ('etf', 'mean'),
    'etf_median': ('etf', 'median'),
    'count': ('etf', 'count'),
    'et': ('et', 'mean'),
    'eto': ('eto', 'mean'),
}
month_climo_dict = {}
for point_id, site_df in data_df_dict.items():
    month_climo = site_df.groupby(['month']).agg(**climo_aggregations)
    too_few_values = month_climo['count'] < month_climo_count_min
    month_climo.loc[too_few_values, ['etf', 'etf_median', 'et']] = np.nan
    month_climo_dict[point_id] = month_climo

# # Compute climos for each target year that have the target year values excluded
# # CGM - This might be worth testing more but doesn't seem worth the time to generate
# print('\nComputing monthly climatologies for target years')
# month_climo_dict = {}
# for point_id in data_df_dict.keys():
#     if point_id not in month_climo_dict.keys():
#         month_climo_dict[point_id] = {}
#     for year in stats_years:
#         month_climo_dict[point_id][year] = (
#             data_df_dict[point_id][data_df_dict[point_id].year != year]
#             .groupby(['month'])
#             .agg(
#                 etf=('etf', 'mean'), 
#                 etf_median=('etf', 'median'), 
#                 count=('etf', 'count'), 
#                 et=('et', 'mean'), 
#                 eto=('eto', 'mean'),
#             )
#         )

# TODO: Join the climo values to the data dictionaries
#   It might be faster to join the climo data here instead of looking up in the function

print('\nDone')


Computing maximum ETf

Computing monthly climatologies

Done


## Functions for computing filled values and summary statistics

In [7]:
def generate_windows(
        point_id_list,
        months=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
        years=[2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024],
        exclude_months_without_climo=True
):
    """Yield a three year window of data for each usable point / target year pair

    Reads the module level "data_df_dict", "month_climo_dict" and
    "min_month_count" variables.

    Parameters
    ----------
    point_id_list : iterable
        Keys of data_df_dict to process.
    months : list of int
        Calendar months that are candidates in the target year.
    years : list of int
        Target years.
    exclude_months_without_climo : bool
        If True, the 'etf', 'et' and 'count' values are set to NaN in every
        window row whose month has a NaN ETf climatology for the point.

    Yields
    ------
    tuple
        (point_id, year, window_df, year_month_mask), where window_df is a
        copy of the point rows for year-1 through year+1 and year_month_mask
        is a boolean array flagging the window rows that fall in the target
        year and in one of the target months.

    """
    for point_id in point_id_list:
        # Assume the data df dictionary exists in the global scope
        site_df = data_df_dict[point_id]
        site_years = site_df.index.year

        for year in years:
            # The year before and after the target year are included
            #   so that there are images to interpolate and fill from
            in_window = (site_years >= (year - 1)) & (site_years <= (year + 1))
            window_df = site_df[in_window].copy()

            # Blank out the rows for months that do not have a climo value
            # TODO: Test out adding the climo values to the window_df here
            #   instead of in the compute_filled_values() function below
            if exclude_months_without_climo:
                climo_etf = pd.merge(
                    window_df[['month']], month_climo_dict[point_id]['etf'], how="left", on="month"
                )['etf']
                window_df.loc[climo_etf.isna().values, ['etf', 'et', 'count']] = np.nan

            window_years = window_df.index.year
            year_mask = window_years == year
            year_month_mask = year_mask & window_df.index.month.isin(months)
            window_etf = window_df['etf']

            # Skip if there are not enough months with data in the target year
            if window_etf[year_mask].count() < min_month_count:
                continue

            # Skip if none of the target months in the target year have data
            if window_etf[year_month_mask].isna().all():
                continue

            # Skip if there is no data in the previous or the next year to interpolate from
            if (window_etf[window_years == year - 1].isna().all() or
                    window_etf[window_years == year + 1].isna().all()):
                continue

            yield point_id, year, window_df, year_month_mask


# Names of the gap filling methods to compare against the original values
# These match the filled value keys built in compute_filled_values()
comparison_cols = [
    # Linear interpolation and the monthly climatologies
    'interpolate',
    'climo_mean',
    'climo_median',
    # Neighbor anomaly approach
    'conor',
    # Blends of the interpolated and climatology values
    'interp_clim_a',
    'interp_clim_b',
    'interp_clim_c',
    # Whittaker smoothing (the suffix is the lambda value)
    'whit_a_0p50',
    'whit_a_0p20',
    'whit_a_0p10',
    'whit_a_0p05',
    'whit_a_0p01',
]


def compute_filled_values(window_df, tgt_indices, point_id):
    """Blank out the target rows and estimate their ETf with each gap filling method

    Reads the module level "month_climo_dict" and "etf_max_dict" variables.

    Parameters
    ----------
    window_df : pandas.DataFrame
        Window of data for a single point with 'etf', 'et' and 'count' columns.
        Modified in place: the target rows are set to NaN and a 'temp'
        column is added.
    tgt_indices : pandas.Index
        Index labels of the window_df rows to drop and then fill.
        The labels need a ".month" attribute for the climatology lookups.
    point_id : str
        Point ID in the form <mgrs>_nlcd<NN>_<NN>; used as the key into the
        module level dictionaries and parsed for the 'mgrs' and 'nlcd' values.

    Returns
    -------
    list of dict
        One dictionary per target index with the original value and the
        filled value from each method.  The list is empty if no row in the
        window has a non-zero weight.  The 'conor' value is NaN if there is
        no valid ETf value before or after the target row.

    """
    # Get a copy of the target value before clearing
    original_etf = window_df.loc[tgt_indices, 'etf'].values

    # Set the target row values to NaN
    window_df.loc[tgt_indices, ['etf', 'et', 'count']] = np.nan

    # Setup the Whittaker Smoothing for the full dataframe outside of the index loop
    # The smoothing function needs all nans filled with a value
    # The fill value is not important as long as the weight value is set to 0
    window_df['temp'] = window_df['etf'].copy()
    window_df.loc[np.isnan(window_df['temp']), 'temp'] = -1
    etf = window_df['temp'].values

    # Default weights with 1 for data and 0 for missing values
    weight_a = window_df['count'].clip(lower=1, upper=1).fillna(0)
    # Force a weight of 0 on every row without an ETf value so that the -1 placeholder
    #   can never influence the smoother, even if the row has a count value
    weight_a = weight_a.where(window_df['etf'].notna(), 0)
    if not weight_a.any():
        print(f'{point_id} - all weights 0, skipping')
        return []

    # CGM - I tested out building the smoother once and then updating lambda,
    #   but it didn't seem any faster
    whit_lambdas = {
        'whit_a_0p50': 0.5,
        'whit_a_0p20': 0.2,
        'whit_a_0p10': 0.1,
        'whit_a_0p05': 0.05,
        'whit_a_0p01': 0.01,
    }
    whit_smoothed = {
        name: WhittakerSmoother(
            lmbda=lmbda, order=2, data_length=len(weight_a), weights=weight_a
        ).smooth(etf)
        for name, lmbda in whit_lambdas.items()
    }

    # CGM - I was testing out trying different weights but it didn't seem to change the values at all
    # # Compute weights based on the the scene count value
    # # Set count 0 images to a weight of 0
    # weight = window_df['count'].clip(lower=0, upper=1).fillna(0)

    # # Compute weights based on the the scene count value
    # # Set counts of 0 to a weight of 0.5 and all other to 1
    # weight = window_df['count'].add(1).clip(upper=2).divide(2).fillna(0)

    # # Compute weights based on the scene count value
    # # Set count weights as: 0 -> 0, 1 -> 0.5, 2+ -> 1
    # weight = window_df['count'].fillna(0).clip(upper=2).divide(2)

    # These do not change between targets, so only build them once
    etf_interp = window_df['etf'].interpolate(method='linear')
    site_climo = month_climo_dict[point_id]
    etf_max = etf_max_dict[point_id]
    tgt_positions = window_df.index.get_indexer(tgt_indices)

    # Process each target index separately
    values = []
    for i, (tgt_index, tgt_i) in enumerate(zip(tgt_indices, tgt_positions)):

        interp_value = etf_interp.loc[tgt_index]

        # Climos for all years
        climo_mean = site_climo.loc[tgt_index.month, 'etf']
        climo_count = site_climo.loc[tgt_index.month, 'count']
        climo_median = site_climo.loc[tgt_index.month, 'etf_median']
        # # Climos with the target year excluded (not sure if this matters)
        # climo_mean = month_climo_dict[point_id][tgt_index.year].loc[tgt_index.month, 'etf']
        # climo_count = month_climo_dict[point_id][tgt_index.year].loc[tgt_index.month, 'count']
        # climo_median = month_climo_dict[point_id][tgt_index.year].loc[tgt_index.month, 'etf_median']

        # Compute various combinations of averaging the climo and interpolate values
        # Simple mean
        interp_clim_a = (climo_mean + interp_value) / 2
        # Simple mean with the median climo
        interp_clim_c = (climo_median + interp_value) / 2
        # Weight the climo based on the number of months in the climo?
        climo_months = 10
        interp_clim_b = (climo_mean * climo_count + interp_value * climo_months) / (climo_count + climo_months)

        # Conor's Approach
        # Add the weighted anomalies (value minus climo) of the nearest valid rows
        #   before and after the target to the target month climo
        # The weights decay exponentially with the distance in rows from the target
        prev_index = window_df['etf'].iloc[:tgt_i].last_valid_index()
        next_index = window_df['etf'].iloc[tgt_i+1:].first_valid_index()
        if prev_index is None or next_index is None:
            # There is no valid neighbor on one side, so the value can't be computed
            conor = np.nan
        else:
            prev_i = window_df.index.get_loc(prev_index)
            next_i = window_df.index.get_loc(next_index)
            w_prev = 0.5 * math.exp(1 - (tgt_i - prev_i))
            w_next = 0.5 * math.exp(1 - (next_i - tgt_i))
            value_prev = window_df['etf'].iloc[prev_i]
            value_next = window_df['etf'].iloc[next_i]
            climo_prev = site_climo.loc[prev_index.month, 'etf']
            climo_next = site_climo.loc[next_index.month, 'etf']
            conor = w_prev * (value_prev - climo_prev) + w_next * (value_next - climo_next) + climo_mean

        filled = {
            'index': tgt_index,
            'point_id': point_id,
            'mgrs': point_id.split('_')[0],
            'nlcd': int(point_id.split('_')[1][4:6]),
            'original': original_etf[i],
            # Filled values
            'interpolate': interp_value,
            'climo_mean': climo_mean,
            'climo_median': climo_median,
            'conor': conor,
            'interp_clim_a': interp_clim_a,
            'interp_clim_b': interp_clim_b,
            'interp_clim_c': interp_clim_c,
        }
        # Limit the smoothed values to the range 0 to the maximum ETf for the site
        for name, smoothed in whit_smoothed.items():
            filled[name] = min(max(smoothed[tgt_i], 0), etf_max)
        values.append(filled)

    return values


def comparison_stats(df, x_col='original', y_cols=[], title='', print_flag=True, write_flag=True):
    """Build a table of error and regression statistics for each filled value column

    For each column in y_cols the rows where that column is NaN are dropped and
    the column is compared against x_col.  The table has the RMSE, MAE, mean
    bias (y - x), the slope, intercept and score of a linear regression of y
    on x, and the number of rows used.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe with the x_col and y_cols columns.
    x_col : str
        Reference column.
    y_cols : list of str
        Columns to compare against the reference column.
    title : str
        First line of the table.
    print_flag : bool
        If True, print the table.
    write_flag : bool
        If True, append the table to the module level "output_txt" file.

    """
    # TODO: Build the format strings based on the number of parameters instead of hardcoding
    lines = [
        title,
        '  {:>16s} {:>8s} {:>8s} {:>8s} {:>8s} {:>8s} {:>8s} {:>8s}'.format(
            'method', 'rmse', 'mae', 'mbe', 'm', 'b', 'r2', 'n'
        ),
    ]

    for y_col in y_cols:
        # Only the rows with a value for this method are used
        valid_df = df.loc[df[y_col].notna()]
        reference = valid_df[x_col]
        filled = valid_df[y_col]
        predictors = valid_df[[x_col]]

        regression = sklearn.linear_model.LinearRegression().fit(predictors, filled)

        rmse = sklearn.metrics.root_mean_squared_error(reference, filled)
        mae = sklearn.metrics.mean_absolute_error(reference, filled)
        mbe = (filled - reference).mean()
        slope = regression.coef_[0]
        intercept = regression.intercept_
        score = regression.score(predictors, filled)
        # This count doesn't seem to change even when there are NaN values in the dataframe
        n = filled.count()

        lines.append(
            '  {:>16s} {:8.4f} {:8.4f} {:8.4f} {:8.4f} {:8.4f} {:8.4f} {:8d}'.format(
                y_col, rmse, mae, mbe, slope, intercept, score, n
            )
        )

    table = '\n'.join(lines)
    if print_flag:
        print(table)
    if write_flag:
        # Two trailing newlines leave a blank line between tables in the file
        with open(output_txt, 'a') as output_f:
            output_f.write(table + '\n\n')


## Filter to the target MGRS and NLCD classes

In [8]:
# Keep the points that are in the target NLCD classes and MGRS grid zones
# An empty keep list turns that filter off
# The point IDs are structured as <mgrs tile>_nlcd<NN>_<NN>
point_id_list = [
    p for p in data_df_dict.keys()
    if (not point_nlcd_keep_list or int(p.split('_')[1][4:6]) in point_nlcd_keep_list)
    and (not point_mgrs_keep_list or p.split('_')[0][0:3] in point_mgrs_keep_list)
]

print(f'Points: {len(point_id_list)}')

Points: 1577


## Randomly drop one datapoint in each year

In [9]:
# Candidate months: the full calendar year
months = list(range(1, 13))

output_list = []
for point_id, year, window_df, year_month_mask in generate_windows(point_id_list, months=months):
    # Candidate rows are in the target year/months and have an ETf value
    tgt_mask = window_df['etf'].notna() & year_month_mask
    if tgt_mask.any():
        # Drop one random candidate and fill it with each method
        tgt_indices = window_df[tgt_mask].sample(n=1).index
        output_list += compute_filled_values(window_df, tgt_indices, point_id)
    else:
        print(f'{point_id} - {year} - no unmasked months, skipping')

# One row per dropped datapoint
output_df = pd.DataFrame(output_list)
comparison_stats(
    output_df,
    title='Randomly drop one datapoint in each year',
    x_col='original',
    y_cols=comparison_cols,
    print_flag=True,
    write_flag=True,
)

Randomly drop one datapoint in each year
            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.1386   0.1067   0.0010   0.7421   0.1597   0.7395    12616
        climo_mean   0.1512   0.1155   0.0014   0.6887   0.1929   0.6899    12616
      climo_median   0.1565   0.1108   0.0026   0.7260   0.1712   0.6723    12616
             conor   0.1180   0.0903  -0.0003   0.8403   0.0979   0.8123    12616
     interp_clim_a   0.1276   0.0992   0.0012   0.7154   0.1763   0.7857    12616
     interp_clim_b   0.1280   0.0994   0.0015   0.7154   0.1766   0.7840    12616
     interp_clim_c   0.1283   0.0974   0.0018   0.7341   0.1654   0.7795    12616
       whit_a_0p50   0.1457   0.1117   0.0006   0.7474   0.1560   0.7138    12616
       whit_a_0p20   0.1407   0.1071   0.0004   0.7837   0.1335   0.7347    12616
       whit_a_0p10   0.1391   0.1055   0.0002   0.8026   0.1217   0.7426    12616
       whit_a_0p05   0.1387   0.1049   0.0001   0.8149   

## Randomly drop a single datapoint from the "growing" season (Apr-Sept)

In [10]:
# Candidate months: April through September
months = list(range(4, 10))

output_list = []
for point_id, year, window_df, year_month_mask in generate_windows(point_id_list, months=months):
    # Candidate rows are in the target year/months and have an ETf value
    tgt_mask = window_df['etf'].notna() & year_month_mask
    if tgt_mask.any():
        # Drop one random candidate and fill it with each method
        tgt_indices = window_df[tgt_mask].sample(n=1).index
        output_list += compute_filled_values(window_df, tgt_indices, point_id)
    else:
        print(f'{point_id} - {year} - no unmasked months, skipping')

# One row per dropped datapoint
output_df = pd.DataFrame(output_list)
comparison_stats(
    output_df,
    title='Randomly drop a single datapoint from the "growing" season (Apr-Sept)',
    x_col='original',
    y_cols=comparison_cols,
    print_flag=True,
    write_flag=True,
)

Randomly drop a single datapoint from the "growing" season (Apr-Sept)
            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.1368   0.1064  -0.0119   0.7400   0.1668   0.7629    12616
        climo_mean   0.1549   0.1173   0.0028   0.6921   0.2144   0.6931    12616
      climo_median   0.1609   0.1124   0.0081   0.7360   0.1895   0.6749    12616
             conor   0.1137   0.0869   0.0008   0.8545   0.1008   0.8349    12616
     interp_clim_a   0.1277   0.1002  -0.0045   0.7160   0.1906   0.8004    12616
     interp_clim_b   0.1277   0.1002  -0.0045   0.7161   0.1905   0.8005    12616
     interp_clim_c   0.1281   0.0975  -0.0019   0.7380   0.1782   0.7938    12616
       whit_a_0p50   0.1448   0.1123  -0.0070   0.7330   0.1764   0.7321    12616
       whit_a_0p20   0.1374   0.1056  -0.0043   0.7769   0.1490   0.7591    12616
       whit_a_0p10   0.1344   0.1028  -0.0032   0.8007   0.1337   0.7703    12616
       whit_a_0p05   0.1331 

## Randomly drop a single datapoint from the "non-growing" season (Oct-Mar)

In [11]:
# Candidate months: October through March
# Both ends of the season are taken from the same target calendar year
months = [10, 11, 12, 1, 2, 3]

output_list = []
for point_id, year, window_df, year_month_mask in generate_windows(point_id_list, months=months):
    # Candidate rows are in the target year/months and have an ETf value
    tgt_mask = window_df['etf'].notna() & year_month_mask
    if tgt_mask.any():
        # Drop one random candidate and fill it with each method
        tgt_indices = window_df[tgt_mask].sample(n=1).index
        output_list += compute_filled_values(window_df, tgt_indices, point_id)
    else:
        print(f'{point_id} - {year} - no unmasked months, skipping')

# One row per dropped datapoint
output_df = pd.DataFrame(output_list)
comparison_stats(
    output_df,
    title='Randomly drop a single datapoint from the "non-growing" season (Oct-Mar)',
    x_col='original',
    y_cols=comparison_cols,
    print_flag=True,
    write_flag=True,
)

Randomly drop a single datapoint from the "non-growing" season (Oct-Mar)
            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.1431   0.1087   0.0202   0.7203   0.1664   0.6261    12582
        climo_mean   0.1483   0.1157  -0.0003   0.5760   0.2215   0.5747    12582
      climo_median   0.1525   0.1112  -0.0035   0.5997   0.2059   0.5541    12582
             conor   0.1236   0.0952  -0.0016   0.7560   0.1260   0.7078    12582
     interp_clim_a   0.1289   0.1000   0.0100   0.6482   0.1940   0.6823    12582
     interp_clim_b   0.1300   0.1006   0.0107   0.6492   0.1942   0.6765    12582
     interp_clim_c   0.1298   0.0989   0.0083   0.6600   0.1861   0.6762    12582
       whit_a_0p50   0.1483   0.1123   0.0124   0.7296   0.1538   0.6039    12582
       whit_a_0p20   0.1465   0.1109   0.0077   0.7498   0.1386   0.6156    12582
       whit_a_0p10   0.1466   0.1110   0.0057   0.7598   0.1313   0.6177    12582
       whit_a_0p05   0.14

## Randomly drop a datapoint from the winter (Dec-Feb)

In [12]:
# Candidate months: December, January and February
# All three are taken from the same target calendar year
months = [12, 1, 2]

output_list = []
for point_id, year, window_df, year_month_mask in generate_windows(point_id_list, months=months):
    # Candidate rows are in the target year/months and have an ETf value
    tgt_mask = window_df['etf'].notna() & year_month_mask
    if tgt_mask.any():
        # Drop one random candidate and fill it with each method
        tgt_indices = window_df[tgt_mask].sample(n=1).index
        output_list += compute_filled_values(window_df, tgt_indices, point_id)
    else:
        print(f'{point_id} - {year} - no unmasked months, skipping')

# One row per dropped datapoint
output_df = pd.DataFrame(output_list)
comparison_stats(
    output_df,
    title='Randomly drop a datapoint from the winter (Dec-Feb)',
    x_col='original',
    y_cols=comparison_cols,
    print_flag=True,
    write_flag=True,
)

Randomly drop a datapoint from the winter (Dec-Feb)
            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.1426   0.1075   0.0178   0.6776   0.1720   0.6005    11022
        climo_mean   0.1453   0.1136   0.0036   0.5661   0.2111   0.5685    11022
      climo_median   0.1494   0.1083  -0.0017   0.5870   0.1959   0.5467    11022
             conor   0.1247   0.0955   0.0011   0.7475   0.1218   0.6873    11022
     interp_clim_a   0.1282   0.0989   0.0107   0.6218   0.1916   0.6696    11022
     interp_clim_b   0.1304   0.0998   0.0121   0.6228   0.1924   0.6570    11022
     interp_clim_c   0.1290   0.0974   0.0081   0.6323   0.1839   0.6623    11022
       whit_a_0p50   0.1442   0.1085   0.0042   0.6851   0.1549   0.5903    11022
       whit_a_0p20   0.1450   0.1088   0.0020   0.7061   0.1426   0.5922    11022
       whit_a_0p10   0.1467   0.1099   0.0016   0.7167   0.1371   0.5882    11022
       whit_a_0p05   0.1486   0.1113   0.0015 

## Randomly drop a single datapoint that is next to an existing missing data point 

In [13]:
# Candidate months: the full calendar year
months = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

output_list = []
for point_id, year, window_df, year_month_mask in generate_windows(point_id_list, months=months):

    # Skip the year if there are no NaN values
    if not window_df.loc[year_month_mask, 'etf'].isna().any():
        # print(f'{point_id} - {year} - no missing data points, skipping')
        continue

    # For the target year, pick a random month that is next to a missing/masked month but has data
    # fill_value=False keeps the shifted masks boolean; the value shifted in at each
    #   end of the window is treated as "not missing"
    nan_mask = window_df['etf'].isna()
    tgt_mask = (
        (nan_mask | nan_mask.shift(1, fill_value=False) | nan_mask.shift(-1, fill_value=False))
        & window_df['etf'].notna() & year_month_mask
    )
    if not tgt_mask.any():
        # Only names that are defined in this loop are used in the message
        print(f'{point_id} - {year} - no unmasked months, skipping')
        continue

    # Drop one random candidate and fill it with each method
    tgt_indices = window_df.loc[tgt_mask].sample(n=1).index
    output_list.extend(compute_filled_values(window_df, tgt_indices, point_id))

output_df = pd.DataFrame(output_list)
comparison_stats(
    output_df, x_col='original', y_cols=comparison_cols,
    title='Randomly drop a single datapoint that is next to an existing missing data point',
    print_flag=True, write_flag=True
)

Randomly drop a single datapoint that is next to an existing missing data point
            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.1920   0.1503   0.0285   0.5585   0.2798   0.4185     5431
        climo_mean   0.1629   0.1266  -0.0105   0.5345   0.2545   0.5392     5431
      climo_median   0.1692   0.1223  -0.0141   0.5498   0.2421   0.5077     5431
             conor   0.1491   0.1160  -0.0105   0.6217   0.2048   0.6143     5431
     interp_clim_a   0.1588   0.1245   0.0090   0.5465   0.2671   0.5622     5431
     interp_clim_b   0.1620   0.1266   0.0112   0.5423   0.2717   0.5446     5431
     interp_clim_c   0.1603   0.1238   0.0072   0.5542   0.2610   0.5530     5431
       whit_a_0p50   0.2030   0.1579   0.0048   0.6005   0.2322   0.3923     5431
       whit_a_0p20   0.2023   0.1570  -0.0021   0.6322   0.2072   0.4088     5431
       whit_a_0p10   0.2032   0.1575  -0.0047   0.6484   0.1954   0.4136     5431
       whit_a_0p05

## Randomly drop a two month gap during the year

But only check the filled value in one month of the gap

In [14]:
# Number of consecutive months to drop and the candidate months for the start of the gap
dropped_months = 2
months = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

output_list = []
for point_id, year, window_df, year_month_mask in generate_windows(point_id_list, months=months):

    # For the target year, identify all the gap windows with at least 1 month of data
    # A row can start a gap if it or one of the following rows in the gap has data
    # This approach is assuming there will be 2 or more months in the gap
    # fill_value=False keeps the shifted masks boolean at the end of the window
    gap_mask = window_df['etf'].notna()
    for i in range(dropped_months-1):
        gap_mask = gap_mask | window_df['etf'].notna().shift(-(i+1), fill_value=False)

    tgt_mask = year_month_mask & gap_mask
    if not tgt_mask.any():
        print(f'{point_id} - {year} - no unmasked months, skipping')
        continue

    # Pick a random start for the gap
    tgt_indices = window_df.loc[tgt_mask].sample(n=1).index

    # Add an extra month index for each dropped month
    # The DateOffset does the month stepping, so the index does not need a frequency
    for i in range(dropped_months-1):
        tgt_indices = tgt_indices.append(pd.DatetimeIndex([tgt_indices[-1] + pd.DateOffset(months=1)]))

    values = compute_filled_values(window_df, tgt_indices, point_id)

    # Only keep values dictionaries that had data originally
    values = [v for v in values if not np.isnan(v['original'])]

    # compute_filled_values() returns an empty list when it skips the window
    #   and random.sample() raises a ValueError on an empty list
    if not values:
        print(f'{point_id} - {year} - no filled values, skipping')
        continue

    # Only keep 1 of the filled values from the window
    output_list.extend(random.sample(values, 1))

    del window_df, tgt_mask, tgt_indices, gap_mask

output_df = pd.DataFrame(output_list)
comparison_stats(
    output_df, x_col='original', y_cols=comparison_cols,
    title='Randomly drop a two month gap during the year',
    print_flag=True, write_flag=True
)

Randomly drop a two month gap during the year
            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.1702   0.1317   0.0000   0.6501   0.2146   0.6034    12616
        climo_mean   0.1513   0.1164  -0.0024   0.6845   0.1910   0.6840    12616
      climo_median   0.1565   0.1118  -0.0014   0.7226   0.1687   0.6664    12616
             conor   0.1282   0.0986  -0.0036   0.7731   0.1356   0.7734    12616
     interp_clim_a   0.1403   0.1101  -0.0012   0.6673   0.2028   0.7345    12616
     interp_clim_b   0.1412   0.1108  -0.0007   0.6663   0.2040   0.7304    12616
     interp_clim_c   0.1402   0.1081  -0.0007   0.6863   0.1917   0.7312    12616
       whit_a_0p50   0.1778   0.1366  -0.0008   0.6825   0.1939   0.5813    12616
       whit_a_0p20   0.1749   0.1337  -0.0015   0.7199   0.1703   0.6010    12616
       whit_a_0p10   0.1746   0.1332  -0.0019   0.7398   0.1577   0.6077    12616
       whit_a_0p05   0.1753   0.1336  -0.0022   0.75

## Randomly drop a two month gap during the growing season (Apr-Sept)

In [15]:
# Number of consecutive months to drop and the candidate months for the start of the gap
dropped_months = 2
months = [4, 5, 6, 7, 8, 9]

output_list = []
for point_id, year, window_df, year_month_mask in generate_windows(point_id_list, months=months):

    # For the target year, identify all the gap windows with at least 1 month of data
    # A row can start a gap if it or one of the following rows in the gap has data
    # This approach is assuming there will be 2 or more months in the gap
    # fill_value=False keeps the shifted masks boolean at the end of the window
    gap_mask = window_df['etf'].notna()
    for i in range(dropped_months-1):
        gap_mask = gap_mask | window_df['etf'].notna().shift(-(i+1), fill_value=False)

    tgt_mask = year_month_mask & gap_mask
    if not tgt_mask.any():
        print(f'{point_id} - {year} - no unmasked months, skipping')
        continue

    # Pick a random start for the gap
    tgt_indices = window_df.loc[tgt_mask].sample(n=1).index

    # Add an extra month index for each dropped month
    # The DateOffset does the month stepping, so the index does not need a frequency
    for i in range(dropped_months-1):
        tgt_indices = tgt_indices.append(pd.DatetimeIndex([tgt_indices[-1] + pd.DateOffset(months=1)]))

    values = compute_filled_values(window_df, tgt_indices, point_id)

    # Only keep values dictionaries that had data originally
    values = [v for v in values if not np.isnan(v['original'])]

    # compute_filled_values() returns an empty list when it skips the window
    #   and random.sample() raises a ValueError on an empty list
    if not values:
        print(f'{point_id} - {year} - no filled values, skipping')
        continue

    # Only keep 1 of the filled values from the window
    output_list.extend(random.sample(values, 1))

    del window_df, tgt_mask, tgt_indices, gap_mask

output_df = pd.DataFrame(output_list)
comparison_stats(
    output_df, x_col='original', y_cols=comparison_cols,
    title='Randomly drop a two month gap during the growing season (Apr-Sept)',
    print_flag=True, write_flag=True
)

Randomly drop a two month gap during the growing season (Apr-Sept)
            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.1741   0.1353  -0.0203   0.6437   0.2235   0.6206    12616
        climo_mean   0.1544   0.1170   0.0036   0.6921   0.2142   0.6971    12616
      climo_median   0.1600   0.1119   0.0092   0.7347   0.1907   0.6798    12616
             conor   0.1293   0.0985   0.0027   0.7831   0.1511   0.7873    12616
     interp_clim_a   0.1431   0.1127  -0.0084   0.6679   0.2189   0.7490    12616
     interp_clim_b   0.1433   0.1128  -0.0084   0.6677   0.2190   0.7485    12616
     interp_clim_c   0.1427   0.1099  -0.0056   0.6892   0.2071   0.7456    12616
       whit_a_0p50   0.1815   0.1404  -0.0078   0.6602   0.2247   0.5899    12616
       whit_a_0p20   0.1759   0.1350  -0.0046   0.7047   0.1974   0.6188    12616
       whit_a_0p10   0.1737   0.1326  -0.0036   0.7297   0.1813   0.6317    12616
       whit_a_0p05   0.1729   0

## Randomly drop a two month gap during the non-growing season (Oct-Mar)

In [16]:
# Number of consecutive months to drop and the candidate months for the start of the gap
dropped_months = 2
months = [10, 11, 12, 1, 2, 3]

output_list = []
for point_id, year, window_df, year_month_mask in generate_windows(point_id_list, months=months):

    # For the target year, identify all the gap windows with at least 1 month of data
    # A row can start a gap if it or one of the following rows in the gap has data
    # This approach is assuming there will be 2 or more months in the gap
    # fill_value=False keeps the shifted masks boolean at the end of the window
    gap_mask = window_df['etf'].notna()
    for i in range(dropped_months-1):
        gap_mask = gap_mask | window_df['etf'].notna().shift(-(i+1), fill_value=False)

    tgt_mask = year_month_mask & gap_mask
    if not tgt_mask.any():
        print(f'{point_id} - {year} - no unmasked months, skipping')
        continue

    # Pick a random start for the gap
    tgt_indices = window_df.loc[tgt_mask].sample(n=1).index

    # Add an extra month index for each dropped month
    # The DateOffset does the month stepping, so the index does not need a frequency
    for i in range(dropped_months-1):
        tgt_indices = tgt_indices.append(pd.DatetimeIndex([tgt_indices[-1] + pd.DateOffset(months=1)]))

    values = compute_filled_values(window_df, tgt_indices, point_id)

    # Only keep values dictionaries that had data originally
    values = [v for v in values if not np.isnan(v['original'])]

    # compute_filled_values() returns an empty list when it skips the window
    #   and random.sample() raises a ValueError on an empty list
    if not values:
        print(f'{point_id} - {year} - no filled values, skipping')
        continue

    # Only keep 1 of the filled values from the window
    output_list.extend(random.sample(values, 1))

    del window_df, tgt_mask, tgt_indices, gap_mask

output_df = pd.DataFrame(output_list)
comparison_stats(
    output_df, x_col='original', y_cols=comparison_cols,
    title='Randomly drop a two month gap during the non-growing season (Oct-Mar)',
    print_flag=True, write_flag=True
)

Randomly drop a two month gap during the non-growing season (Oct-Mar)
            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.1666   0.1276   0.0352   0.6678   0.2082   0.5313    12582
        climo_mean   0.1491   0.1157   0.0011   0.5706   0.2248   0.5778    12582
      climo_median   0.1532   0.1110  -0.0025   0.5950   0.2085   0.5568    12582
             conor   0.1302   0.1005  -0.0001   0.6863   0.1633   0.6783    12582
     interp_clim_a   0.1389   0.1080   0.0181   0.6192   0.2165   0.6408    12582
     interp_clim_b   0.1411   0.1092   0.0195   0.6187   0.2181   0.6296    12582
     interp_clim_c   0.1395   0.1067   0.0164   0.6314   0.2084   0.6357    12582
       whit_a_0p50   0.1739   0.1324   0.0186   0.6878   0.1813   0.5014    12582
       whit_a_0p20   0.1738   0.1318   0.0134   0.7131   0.1629   0.5104    12582
       whit_a_0p10   0.1752   0.1328   0.0114   0.7265   0.1539   0.5107    12582
       whit_a_0p05   0.1772 

## Randomly drop a three month gap during the year

In [17]:
# Experiment: for every window, hide a run of `dropped_months` consecutive
# months whose first month is drawn from `months`, fill the gap with each
# method, and keep one originally-observed month per window for scoring.
dropped_months = 3
months = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

output_list = []
for point_id, year, window_df, year_month_mask in generate_windows(point_id_list, months=months):

    # A row is a valid gap start when it, or any of the following
    # (dropped_months - 1) rows, has an 'etf' value, so the gap always hides
    # at least one observation.
    # fill_value=False keeps the shifted mask boolean; rows shifted in past
    # the end of the window count as "no data".
    has_data = window_df['etf'].notna()
    gap_mask = has_data
    for offset in range(1, dropped_months):
        gap_mask = gap_mask | has_data.shift(-offset, fill_value=False)

    tgt_mask = year_month_mask & gap_mask
    if not tgt_mask.any():
        print(f'{point_id} - {year} - no unmasked months, skipping')
        continue

    # Randomly pick the first month of the gap, then extend it one calendar
    # month at a time. The months are added explicitly with DateOffset, so no
    # index frequency is needed (the 'ms' alias is milliseconds; month start
    # is 'MS').
    # NOTE(review): only the gap start is restricted by tgt_mask; the appended
    # months (and therefore the value kept below) can fall outside the rows
    # selected by year_month_mask - confirm this is intended.
    tgt_indices = window_df.loc[tgt_mask].sample(n=1).index
    for _ in range(dropped_months - 1):
        tgt_indices = tgt_indices.append(pd.DatetimeIndex([tgt_indices[-1] + pd.DateOffset(months=1)]))

    values = compute_filled_values(window_df, tgt_indices, point_id)

    # Only keep values dictionaries that had data originally
    values = [v for v in values if not np.isnan(v['original'])]
    if not values:
        # random.sample() raises ValueError on an empty population
        print(f'{point_id} - {year} - no original values in the gap, skipping')
        continue

    # Only keep 1 of the filled values from the window
    output_list.extend(random.sample(values, 1))

    del window_df, tgt_mask, tgt_indices, gap_mask

output_df = pd.DataFrame(output_list)
comparison_stats(
    output_df, x_col='original', y_cols=comparison_cols,
    title='Randomly drop a three month gap during the year',
    print_flag=True, write_flag=True
)

Randomly drop a three month gap during the year
            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.1959   0.1513   0.0041   0.5845   0.2571   0.4934    12616
        climo_mean   0.1523   0.1166  -0.0001   0.6805   0.1945   0.6834    12616
      climo_median   0.1579   0.1122   0.0014   0.7172   0.1736   0.6642    12616
             conor   0.1366   0.1045  -0.0008   0.7439   0.1551   0.7454    12616
     interp_clim_a   0.1518   0.1194   0.0020   0.6325   0.2258   0.6904    12616
     interp_clim_b   0.1532   0.1202   0.0029   0.6303   0.2280   0.6840    12616
     interp_clim_c   0.1517   0.1175   0.0028   0.6508   0.2154   0.6879    12616
       whit_a_0p50   0.2057   0.1579   0.0028   0.6405   0.2218   0.4780    12616
       whit_a_0p20   0.2055   0.1570   0.0025   0.6766   0.1995   0.4926    12616
       whit_a_0p10   0.2066   0.1575   0.0025   0.6953   0.1880   0.4970    12616
       whit_a_0p05   0.2081   0.1586   0.0024   0.

## Randomly drop a three month gap during the growing season (Apr-Sept)

In [18]:
# Experiment: for every window, hide a run of `dropped_months` consecutive
# months whose first month is drawn from `months`, fill the gap with each
# method, and keep one originally-observed month per window for scoring.
dropped_months = 3
months = [4, 5, 6, 7, 8, 9]

output_list = []
for point_id, year, window_df, year_month_mask in generate_windows(point_id_list, months=months):

    # A row is a valid gap start when it, or any of the following
    # (dropped_months - 1) rows, has an 'etf' value, so the gap always hides
    # at least one observation.
    # fill_value=False keeps the shifted mask boolean; rows shifted in past
    # the end of the window count as "no data".
    has_data = window_df['etf'].notna()
    gap_mask = has_data
    for offset in range(1, dropped_months):
        gap_mask = gap_mask | has_data.shift(-offset, fill_value=False)

    tgt_mask = year_month_mask & gap_mask
    if not tgt_mask.any():
        print(f'{point_id} - {year} - no unmasked months, skipping')
        continue

    # Randomly pick the first month of the gap, then extend it one calendar
    # month at a time. The months are added explicitly with DateOffset, so no
    # index frequency is needed (the 'ms' alias is milliseconds; month start
    # is 'MS').
    # NOTE(review): only the gap start is restricted by tgt_mask; the appended
    # months (and therefore the value kept below) can fall outside the rows
    # selected by year_month_mask - confirm this is intended.
    tgt_indices = window_df.loc[tgt_mask].sample(n=1).index
    for _ in range(dropped_months - 1):
        tgt_indices = tgt_indices.append(pd.DatetimeIndex([tgt_indices[-1] + pd.DateOffset(months=1)]))

    values = compute_filled_values(window_df, tgt_indices, point_id)

    # Only keep values dictionaries that had data originally
    values = [v for v in values if not np.isnan(v['original'])]
    if not values:
        # random.sample() raises ValueError on an empty population
        print(f'{point_id} - {year} - no original values in the gap, skipping')
        continue

    # Only keep 1 of the filled values from the window
    output_list.extend(random.sample(values, 1))

    del window_df, tgt_mask, tgt_indices, gap_mask

output_df = pd.DataFrame(output_list)
comparison_stats(
    output_df, x_col='original', y_cols=comparison_cols,
    title='Randomly drop a three month gap during the growing season (Apr-Sept)',
    print_flag=True, write_flag=True
)

Randomly drop a three month gap during the growing season (Apr-Sept)
            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.2057   0.1601  -0.0279   0.5629   0.2687   0.4841    12616
        climo_mean   0.1518   0.1150   0.0023   0.6989   0.2067   0.7068    12616
      climo_median   0.1571   0.1098   0.0075   0.7418   0.1827   0.6905    12616
             conor   0.1350   0.1030   0.0022   0.7580   0.1664   0.7682    12616
     interp_clim_a   0.1551   0.1233  -0.0128   0.6309   0.2377   0.7034    12616
     interp_clim_b   0.1554   0.1235  -0.0127   0.6304   0.2380   0.7022    12616
     interp_clim_c   0.1539   0.1202  -0.0102   0.6523   0.2257   0.7035    12616
       whit_a_0p50   0.2159   0.1666  -0.0087   0.5976   0.2643   0.4536    12616
       whit_a_0p20   0.2134   0.1637  -0.0035   0.6381   0.2420   0.4759    12616
       whit_a_0p10   0.2126   0.1626  -0.0015   0.6609   0.2286   0.4870    12616
       whit_a_0p05   0.2126  

## Randomly drop a three month gap during the non-growing season (Oct-Mar)

In [19]:
# Experiment: for every window, hide a run of `dropped_months` consecutive
# months whose first month is drawn from `months`, fill the gap with each
# method, and keep one originally-observed month per window for scoring.
dropped_months = 3
months = [10, 11, 12, 1, 2, 3]

output_list = []
for point_id, year, window_df, year_month_mask in generate_windows(point_id_list, months=months):

    # A row is a valid gap start when it, or any of the following
    # (dropped_months - 1) rows, has an 'etf' value, so the gap always hides
    # at least one observation.
    # fill_value=False keeps the shifted mask boolean; rows shifted in past
    # the end of the window count as "no data".
    has_data = window_df['etf'].notna()
    gap_mask = has_data
    for offset in range(1, dropped_months):
        gap_mask = gap_mask | has_data.shift(-offset, fill_value=False)

    tgt_mask = year_month_mask & gap_mask
    if not tgt_mask.any():
        print(f'{point_id} - {year} - no unmasked months, skipping')
        continue

    # Randomly pick the first month of the gap, then extend it one calendar
    # month at a time. The months are added explicitly with DateOffset, so no
    # index frequency is needed (the 'ms' alias is milliseconds; month start
    # is 'MS').
    # NOTE(review): only the gap start is restricted by tgt_mask; the appended
    # months (and therefore the value kept below) can fall outside the rows
    # selected by year_month_mask - confirm this is intended.
    tgt_indices = window_df.loc[tgt_mask].sample(n=1).index
    for _ in range(dropped_months - 1):
        tgt_indices = tgt_indices.append(pd.DatetimeIndex([tgt_indices[-1] + pd.DateOffset(months=1)]))

    values = compute_filled_values(window_df, tgt_indices, point_id)

    # Only keep values dictionaries that had data originally
    values = [v for v in values if not np.isnan(v['original'])]
    if not values:
        # random.sample() raises ValueError on an empty population
        print(f'{point_id} - {year} - no original values in the gap, skipping')
        continue

    # Only keep 1 of the filled values from the window
    output_list.extend(random.sample(values, 1))

    del window_df, tgt_mask, tgt_indices, gap_mask

output_df = pd.DataFrame(output_list)
comparison_stats(
    output_df, x_col='original', y_cols=comparison_cols,
    title='Randomly drop a three month gap during the non-growing season (Oct-Mar)',
    print_flag=True, write_flag=True
)

Randomly drop a three month gap during the non-growing season (Oct-Mar)
            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.1824   0.1403   0.0435   0.6295   0.2392   0.4679    12582
        climo_mean   0.1500   0.1164   0.0002   0.5764   0.2240   0.5789    12582
      climo_median   0.1541   0.1114  -0.0035   0.6050   0.2051   0.5592    12582
             conor   0.1344   0.1037  -0.0003   0.6641   0.1771   0.6618    12582
     interp_clim_a   0.1454   0.1131   0.0219   0.6029   0.2316   0.6129    12582
     interp_clim_b   0.1480   0.1147   0.0235   0.6007   0.2344   0.6003    12582
     interp_clim_c   0.1457   0.1117   0.0200   0.6172   0.2221   0.6098    12582
       whit_a_0p50   0.1934   0.1479   0.0211   0.6626   0.1992   0.4314    12582
       whit_a_0p20   0.1951   0.1489   0.0140   0.6883   0.1786   0.4363    12582
       whit_a_0p10   0.1978   0.1509   0.0110   0.7016   0.1686   0.4343    12582
       whit_a_0p05   0.200

## Randomly drop a four month gap during the year

In [20]:
# Experiment: for every window, hide a run of `dropped_months` consecutive
# months whose first month is drawn from `months`, fill the gap with each
# method, and keep one originally-observed month per window for scoring.
dropped_months = 4
months = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

output_list = []
for point_id, year, window_df, year_month_mask in generate_windows(point_id_list, months=months):

    # A row is a valid gap start when it, or any of the following
    # (dropped_months - 1) rows, has an 'etf' value, so the gap always hides
    # at least one observation.
    # fill_value=False keeps the shifted mask boolean; rows shifted in past
    # the end of the window count as "no data".
    has_data = window_df['etf'].notna()
    gap_mask = has_data
    for offset in range(1, dropped_months):
        gap_mask = gap_mask | has_data.shift(-offset, fill_value=False)

    tgt_mask = year_month_mask & gap_mask
    if not tgt_mask.any():
        print(f'{point_id} - {year} - no unmasked months, skipping')
        continue

    # Randomly pick the first month of the gap, then extend it one calendar
    # month at a time. The months are added explicitly with DateOffset, so no
    # index frequency is needed (the 'ms' alias is milliseconds; month start
    # is 'MS').
    # NOTE(review): only the gap start is restricted by tgt_mask; the appended
    # months (and therefore the value kept below) can fall outside the rows
    # selected by year_month_mask - confirm this is intended.
    tgt_indices = window_df.loc[tgt_mask].sample(n=1).index
    for _ in range(dropped_months - 1):
        tgt_indices = tgt_indices.append(pd.DatetimeIndex([tgt_indices[-1] + pd.DateOffset(months=1)]))

    values = compute_filled_values(window_df, tgt_indices, point_id)

    # Only keep values dictionaries that had data originally
    values = [v for v in values if not np.isnan(v['original'])]
    if not values:
        # random.sample() raises ValueError on an empty population
        print(f'{point_id} - {year} - no original values in the gap, skipping')
        continue

    # Only keep 1 of the filled values from the window
    output_list.extend(random.sample(values, 1))

    del window_df, tgt_mask, tgt_indices, gap_mask

output_df = pd.DataFrame(output_list)
comparison_stats(
    output_df, x_col='original', y_cols=comparison_cols,
    title='Randomly drop a four month gap during the year',
    print_flag=True, write_flag=True
)

Randomly drop a four month gap during the year
            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.2160   0.1672   0.0033   0.5188   0.2957   0.3949    12616
        climo_mean   0.1526   0.1169   0.0012   0.6787   0.1964   0.6783    12616
      climo_median   0.1576   0.1118   0.0019   0.7168   0.1740   0.6619    12616
             conor   0.1407   0.1077  -0.0005   0.7256   0.1663   0.7267    12616
     interp_clim_a   0.1605   0.1263   0.0022   0.5988   0.2460   0.6482    12616
     interp_clim_b   0.1621   0.1274   0.0032   0.5961   0.2486   0.6405    12616
     interp_clim_c   0.1599   0.1239   0.0026   0.6178   0.2348   0.6486    12616
       whit_a_0p50   0.2318   0.1775   0.0025   0.5834   0.2556   0.3746    12616
       whit_a_0p20   0.2346   0.1789   0.0015   0.6186   0.2333   0.3837    12616
       whit_a_0p10   0.2370   0.1804   0.0009   0.6370   0.2215   0.3866    12616
       whit_a_0p05   0.2394   0.1821   0.0005   0.6

## Randomly drop a four month gap during the growing season (Apr-Sept)

In [21]:
# Experiment: for every window, hide a run of `dropped_months` consecutive
# months whose first month is drawn from `months`, fill the gap with each
# method, and keep one originally-observed month per window for scoring.
dropped_months = 4
months = [4, 5, 6, 7, 8, 9]

output_list = []
for point_id, year, window_df, year_month_mask in generate_windows(point_id_list, months=months):

    # A row is a valid gap start when it, or any of the following
    # (dropped_months - 1) rows, has an 'etf' value, so the gap always hides
    # at least one observation.
    # fill_value=False keeps the shifted mask boolean; rows shifted in past
    # the end of the window count as "no data".
    has_data = window_df['etf'].notna()
    gap_mask = has_data
    for offset in range(1, dropped_months):
        gap_mask = gap_mask | has_data.shift(-offset, fill_value=False)

    tgt_mask = year_month_mask & gap_mask
    if not tgt_mask.any():
        print(f'{point_id} - {year} - no unmasked months, skipping')
        continue

    # Randomly pick the first month of the gap, then extend it one calendar
    # month at a time. The months are added explicitly with DateOffset, so no
    # index frequency is needed (the 'ms' alias is milliseconds; month start
    # is 'MS').
    # NOTE(review): only the gap start is restricted by tgt_mask; the appended
    # months (and therefore the value kept below) can fall outside the rows
    # selected by year_month_mask - confirm this is intended.
    tgt_indices = window_df.loc[tgt_mask].sample(n=1).index
    for _ in range(dropped_months - 1):
        tgt_indices = tgt_indices.append(pd.DatetimeIndex([tgt_indices[-1] + pd.DateOffset(months=1)]))

    values = compute_filled_values(window_df, tgt_indices, point_id)

    # Only keep values dictionaries that had data originally
    values = [v for v in values if not np.isnan(v['original'])]
    if not values:
        # random.sample() raises ValueError on an empty population
        print(f'{point_id} - {year} - no original values in the gap, skipping')
        continue

    # Only keep 1 of the filled values from the window
    output_list.extend(random.sample(values, 1))

    del window_df, tgt_mask, tgt_indices, gap_mask

output_df = pd.DataFrame(output_list)
comparison_stats(
    output_df, x_col='original', y_cols=comparison_cols,
    title='Randomly drop a four month gap during the growing season (Apr-Sept)',
    print_flag=True, write_flag=True
)

Randomly drop a four month gap during the growing season (Apr-Sept)
            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.2298   0.1766  -0.0382   0.5032   0.2938   0.3924    12616
        climo_mean   0.1521   0.1156   0.0025   0.7074   0.1980   0.7123    12616
      climo_median   0.1573   0.1104   0.0080   0.7476   0.1767   0.6965    12616
             conor   0.1393   0.1060   0.0021   0.7520   0.1679   0.7587    12616
     interp_clim_a   0.1650   0.1303  -0.0179   0.6053   0.2459   0.6718    12616
     interp_clim_b   0.1656   0.1307  -0.0176   0.6039   0.2471   0.6686    12616
     interp_clim_c   0.1637   0.1273  -0.0151   0.6254   0.2353   0.6727    12616
       whit_a_0p50   0.2449   0.1861  -0.0164   0.5511   0.2837   0.3592    12616
       whit_a_0p20   0.2481   0.1880  -0.0104   0.5847   0.2672   0.3660    12616
       whit_a_0p10   0.2503   0.1894  -0.0080   0.6026   0.2576   0.3688    12616
       whit_a_0p05   0.2524   

## Randomly drop a four month gap during the non-growing season (Oct-Mar)

In [22]:
# Experiment: for every window, hide a run of `dropped_months` consecutive
# months whose first month is drawn from `months`, fill the gap with each
# method, and keep one originally-observed month per window for scoring.
dropped_months = 4
months = [10, 11, 12, 1, 2, 3]

output_list = []
for point_id, year, window_df, year_month_mask in generate_windows(point_id_list, months=months):

    # A row is a valid gap start when it, or any of the following
    # (dropped_months - 1) rows, has an 'etf' value, so the gap always hides
    # at least one observation.
    # fill_value=False keeps the shifted mask boolean; rows shifted in past
    # the end of the window count as "no data".
    has_data = window_df['etf'].notna()
    gap_mask = has_data
    for offset in range(1, dropped_months):
        gap_mask = gap_mask | has_data.shift(-offset, fill_value=False)

    tgt_mask = year_month_mask & gap_mask
    if not tgt_mask.any():
        print(f'{point_id} - {year} - no unmasked months, skipping')
        continue

    # Randomly pick the first month of the gap, then extend it one calendar
    # month at a time. The months are added explicitly with DateOffset, so no
    # index frequency is needed (the 'ms' alias is milliseconds; month start
    # is 'MS').
    # NOTE(review): only the gap start is restricted by tgt_mask; the appended
    # months (and therefore the value kept below) can fall outside the rows
    # selected by year_month_mask - confirm this is intended.
    tgt_indices = window_df.loc[tgt_mask].sample(n=1).index
    for _ in range(dropped_months - 1):
        tgt_indices = tgt_indices.append(pd.DatetimeIndex([tgt_indices[-1] + pd.DateOffset(months=1)]))

    values = compute_filled_values(window_df, tgt_indices, point_id)

    # Only keep values dictionaries that had data originally
    values = [v for v in values if not np.isnan(v['original'])]
    if not values:
        # random.sample() raises ValueError on an empty population
        print(f'{point_id} - {year} - no original values in the gap, skipping')
        continue

    # Only keep 1 of the filled values from the window
    output_list.extend(random.sample(values, 1))

    del window_df, tgt_mask, tgt_indices, gap_mask

output_df = pd.DataFrame(output_list)
comparison_stats(
    output_df, x_col='original', y_cols=comparison_cols,
    title='Randomly drop a four month gap during the non-growing season (Oct-Mar)',
    print_flag=True, write_flag=True
)

Randomly drop a four month gap during the non-growing season (Oct-Mar)
            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.2030   0.1572   0.0507   0.5814   0.2777   0.4045    12582
        climo_mean   0.1517   0.1178   0.0001   0.5901   0.2223   0.5990    12582
      climo_median   0.1564   0.1129  -0.0027   0.6204   0.2031   0.5771    12582
             conor   0.1401   0.1084  -0.0010   0.6506   0.1884   0.6581    12582
     interp_clim_a   0.1555   0.1217   0.0254   0.5857   0.2500   0.5902    12582
     interp_clim_b   0.1581   0.1234   0.0271   0.5823   0.2536   0.5772    12582
     interp_clim_c   0.1557   0.1202   0.0240   0.6009   0.2404   0.5880    12582
       whit_a_0p50   0.2192   0.1681   0.0277   0.6278   0.2295   0.3652    12582
       whit_a_0p20   0.2219   0.1701   0.0187   0.6516   0.2076   0.3676    12582
       whit_a_0p10   0.2245   0.1721   0.0144   0.6633   0.1969   0.3663    12582
       whit_a_0p05   0.2272

## Randomly drop a six month gap during the year

In [23]:
# Experiment: for every window, hide a run of `dropped_months` consecutive
# months whose first month is drawn from `months`, fill the gap with each
# method, and keep one originally-observed month per window for scoring.
dropped_months = 6
months = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

output_list = []
for point_id, year, window_df, year_month_mask in generate_windows(point_id_list, months=months):

    # A row is a valid gap start when it, or any of the following
    # (dropped_months - 1) rows, has an 'etf' value, so the gap always hides
    # at least one observation.
    # fill_value=False keeps the shifted mask boolean; rows shifted in past
    # the end of the window count as "no data".
    has_data = window_df['etf'].notna()
    gap_mask = has_data
    for offset in range(1, dropped_months):
        gap_mask = gap_mask | has_data.shift(-offset, fill_value=False)

    tgt_mask = year_month_mask & gap_mask
    if not tgt_mask.any():
        print(f'{point_id} - {year} - no unmasked months, skipping')
        continue

    # Randomly pick the first month of the gap, then extend it one calendar
    # month at a time. The months are added explicitly with DateOffset, so no
    # index frequency is needed (the 'ms' alias is milliseconds; month start
    # is 'MS').
    # NOTE(review): only the gap start is restricted by tgt_mask; the appended
    # months (and therefore the value kept below) can fall outside the rows
    # selected by year_month_mask - confirm this is intended.
    tgt_indices = window_df.loc[tgt_mask].sample(n=1).index
    for _ in range(dropped_months - 1):
        tgt_indices = tgt_indices.append(pd.DatetimeIndex([tgt_indices[-1] + pd.DateOffset(months=1)]))

    values = compute_filled_values(window_df, tgt_indices, point_id)

    # Only keep values dictionaries that had data originally
    values = [v for v in values if not np.isnan(v['original'])]
    if not values:
        # random.sample() raises ValueError on an empty population
        print(f'{point_id} - {year} - no original values in the gap, skipping')
        continue

    # Only keep 1 of the filled values from the window
    output_list.extend(random.sample(values, 1))

    del window_df, tgt_mask, tgt_indices, gap_mask

output_df = pd.DataFrame(output_list)
comparison_stats(
    output_df, x_col='original', y_cols=comparison_cols,
    title='Randomly drop a six month gap during the year',
    print_flag=True, write_flag=True
)

Randomly drop a six month gap during the year
            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.2452   0.1918   0.0032   0.4383   0.3455   0.2718    12616
        climo_mean   0.1512   0.1158   0.0002   0.6792   0.1957   0.6842    12616
      climo_median   0.1568   0.1114   0.0018   0.7162   0.1748   0.6647    12616
             conor   0.1437   0.1097  -0.0001   0.7111   0.1759   0.7149    12616
     interp_clim_a   0.1718   0.1366   0.0017   0.5587   0.2706   0.5948    12616
     interp_clim_b   0.1737   0.1381   0.0030   0.5553   0.2740   0.5848    12616
     interp_clim_c   0.1711   0.1344   0.0025   0.5773   0.2601   0.5965    12616
       whit_a_0p50   0.2709   0.2084   0.0057   0.5026   0.3088   0.2481    12616
       whit_a_0p20   0.2799   0.2151   0.0058   0.5318   0.2912   0.2470    12616
       whit_a_0p10   0.2854   0.2195   0.0054   0.5455   0.2823   0.2448    12616
       whit_a_0p05   0.2897   0.2230   0.0048   0.55

## Randomly drop a six month gap during the growing season (Apr-Sept)

In [24]:
# Experiment: for every window, hide a run of `dropped_months` consecutive
# months whose first month is drawn from `months`, fill the gap with each
# method, and keep one originally-observed month per window for scoring.
dropped_months = 6
months = [4, 5, 6, 7, 8, 9]

output_list = []
for point_id, year, window_df, year_month_mask in generate_windows(point_id_list, months=months):

    # A row is a valid gap start when it, or any of the following
    # (dropped_months - 1) rows, has an 'etf' value, so the gap always hides
    # at least one observation.
    # fill_value=False keeps the shifted mask boolean; rows shifted in past
    # the end of the window count as "no data".
    has_data = window_df['etf'].notna()
    gap_mask = has_data
    for offset in range(1, dropped_months):
        gap_mask = gap_mask | has_data.shift(-offset, fill_value=False)

    tgt_mask = year_month_mask & gap_mask
    if not tgt_mask.any():
        print(f'{point_id} - {year} - no unmasked months, skipping')
        continue

    # Randomly pick the first month of the gap, then extend it one calendar
    # month at a time. The months are added explicitly with DateOffset, so no
    # index frequency is needed (the 'ms' alias is milliseconds; month start
    # is 'MS').
    # NOTE(review): only the gap start is restricted by tgt_mask; the appended
    # months (and therefore the value kept below) can fall outside the rows
    # selected by year_month_mask - confirm this is intended.
    tgt_indices = window_df.loc[tgt_mask].sample(n=1).index
    for _ in range(dropped_months - 1):
        tgt_indices = tgt_indices.append(pd.DatetimeIndex([tgt_indices[-1] + pd.DateOffset(months=1)]))

    values = compute_filled_values(window_df, tgt_indices, point_id)

    # Only keep values dictionaries that had data originally
    values = [v for v in values if not np.isnan(v['original'])]
    if not values:
        # random.sample() raises ValueError on an empty population
        print(f'{point_id} - {year} - no original values in the gap, skipping')
        continue

    # Only keep 1 of the filled values from the window
    output_list.extend(random.sample(values, 1))

    del window_df, tgt_mask, tgt_indices, gap_mask

output_df = pd.DataFrame(output_list)
comparison_stats(
    output_df, x_col='original', y_cols=comparison_cols,
    title='Randomly drop a six month gap during the growing season (Apr-Sept)',
    print_flag=True, write_flag=True
)

Randomly drop a six month gap during the growing season (Apr-Sept)
            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.2581   0.2008  -0.0395   0.4030   0.3429   0.2576    12616
        climo_mean   0.1501   0.1141   0.0005   0.7143   0.1835   0.7145    12616
      climo_median   0.1552   0.1095   0.0038   0.7513   0.1631   0.6989    12616
             conor   0.1420   0.1081   0.0004   0.7406   0.1665   0.7444    12616
     interp_clim_a   0.1752   0.1393  -0.0195   0.5586   0.2632   0.6222    12616
     interp_clim_b   0.1767   0.1403  -0.0185   0.5552   0.2664   0.6143    12616
     interp_clim_c   0.1739   0.1368  -0.0179   0.5771   0.2530   0.6245    12616
       whit_a_0p50   0.2833   0.2165  -0.0273   0.4542   0.3224   0.2252    12616
       whit_a_0p20   0.2929   0.2236  -0.0222   0.4785   0.3119   0.2206    12616
       whit_a_0p10   0.2990   0.2284  -0.0203   0.4901   0.3064   0.2168    12616
       whit_a_0p05   0.3037   0

## Randomly drop a six month gap during the non-growing season (Oct-Mar)

In [25]:
# Experiment: for every window, hide a run of `dropped_months` consecutive
# months whose first month is drawn from `months`, fill the gap with each
# method, and keep one originally-observed month per window for scoring.
dropped_months = 6
months = [10, 11, 12, 1, 2, 3]

output_list = []
for point_id, year, window_df, year_month_mask in generate_windows(point_id_list, months=months):

    # A row is a valid gap start when it, or any of the following
    # (dropped_months - 1) rows, has an 'etf' value, so the gap always hides
    # at least one observation.
    # fill_value=False keeps the shifted mask boolean; rows shifted in past
    # the end of the window count as "no data".
    has_data = window_df['etf'].notna()
    gap_mask = has_data
    for offset in range(1, dropped_months):
        gap_mask = gap_mask | has_data.shift(-offset, fill_value=False)

    tgt_mask = year_month_mask & gap_mask
    if not tgt_mask.any():
        print(f'{point_id} - {year} - no unmasked months, skipping')
        continue

    # Randomly pick the first month of the gap, then extend it one calendar
    # month at a time. The months are added explicitly with DateOffset, so no
    # index frequency is needed (the 'ms' alias is milliseconds; month start
    # is 'MS').
    # NOTE(review): only the gap start is restricted by tgt_mask; the appended
    # months (and therefore the value kept below) can fall outside the rows
    # selected by year_month_mask - confirm this is intended.
    tgt_indices = window_df.loc[tgt_mask].sample(n=1).index
    for _ in range(dropped_months - 1):
        tgt_indices = tgt_indices.append(pd.DatetimeIndex([tgt_indices[-1] + pd.DateOffset(months=1)]))

    values = compute_filled_values(window_df, tgt_indices, point_id)

    # Only keep values dictionaries that had data originally
    values = [v for v in values if not np.isnan(v['original'])]
    if not values:
        # random.sample() raises ValueError on an empty population
        print(f'{point_id} - {year} - no original values in the gap, skipping')
        continue

    # Only keep 1 of the filled values from the window
    output_list.extend(random.sample(values, 1))

    del window_df, tgt_mask, tgt_indices, gap_mask

output_df = pd.DataFrame(output_list)
comparison_stats(
    output_df, x_col='original', y_cols=comparison_cols,
    title='Randomly drop a six month gap during the non-growing season (Oct-Mar)',
    print_flag=True, write_flag=True
)

Randomly drop a six month gap during the non-growing season (Oct-Mar)
            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.2278   0.1782   0.0447   0.5158   0.3257   0.3339    12582
        climo_mean   0.1530   0.1180   0.0003   0.6393   0.2096   0.6412    12582
      climo_median   0.1582   0.1133  -0.0010   0.6765   0.1868   0.6213    12582
             conor   0.1444   0.1112   0.0002   0.6767   0.1878   0.6805    12582
     interp_clim_a   0.1662   0.1311   0.0225   0.5776   0.2676   0.5842    12582
     interp_clim_b   0.1688   0.1328   0.0243   0.5737   0.2716   0.5723    12582
     interp_clim_c   0.1658   0.1292   0.0219   0.5961   0.2562   0.5861    12582
       whit_a_0p50   0.2532   0.1953   0.0321   0.5949   0.2672   0.3058    12582
       whit_a_0p20   0.2621   0.2017   0.0264   0.6241   0.2445   0.3017    12582
       whit_a_0p10   0.2676   0.2058   0.0234   0.6373   0.2339   0.2977    12582
       whit_a_0p05   0.2718 

### Old plotting function code

In [26]:
# sns.pairplot(
#     output_df[['original'] + comparison_cols], corner=True, kind='reg',
#     plot_kws={'scatter_kws': {'s': 2, 'alpha': 0.2}, 'line_kws': {'color': 'red'}},
# )

In [27]:
# sns.pairplot(output_df[['original'] + comparison_cols], corner=True, kind='hist')

In [28]:
# sns.pairplot(output_df[['original'] + comparison_cols], corner=True, kind='kde')

In [29]:
# g = sns.pairplot(
#     output_df, x_vars = ['original'], y_vars = comparison_cols, kind='reg', 
#     plot_kws={'scatter_kws': {'s': 1, 'alpha': 0.1}, 'line_kws': {'color': 'red'}},
# )
# for ax in g.axes.flatten():
#     if ax:
#         ax.set_xlim(0, 1.2)
#         ax.set_ylim(0, 1.2)
#         ax.set_aspect('equal')
#         ax.set_xticks([0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.2])
#         ax.axline((0, 0), slope=1, color='gray', linestyle='--')  # linewidth=1
# plt.show()

In [30]:
# sns.pairplot(
#     output_df[['original'] + comparison_cols], corner=True, kind='reg',
#     plot_kws={'scatter_kws': {'s': 2, 'alpha': 0.1}, 'line_kws': {'color': 'red'}},
# )