## Input Parameters

In [1]:
# Point filters: an empty list disables the filter, so points from every
# NLCD class / MGRS grid zone are kept
point_nlcd_keep_list = []
point_mgrs_keep_list = []

# Active selection: all NLCD classes, restricted to MGRS grid zone 12T
point_nlcd_keep_list = [
    31,
    41, 42, 43,
    52,
    71,
    81, 82,
    90, 95,
]
point_mgrs_keep_list = ['12T']

# # Alternative selection: NLCD 82 points for all MGRS grid zones
# point_nlcd_keep_list = [82]
# point_mgrs_keep_list = []

# Minimum number of months with data required in the target year
min_month_count = 3

## Other Parameters

In [2]:
# Folders holding the per MGRS grid zone data CSV files and points CSV files
data_folder = 'data'
points_folder = 'points'

# File name for the combined points CSV
points_csv = 'gap_fill_test_points.csv'

# 2016 is left out of the statistics since there is not a full prior year to interpolate from
# 2024 is included even though 2025 is not complete
stats_years = [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]

# Points were only built for NLCD 2024
nlcd_years = [2024]

# All MGRS grid zones except 12R and 16U, which are too small
mgrs_zones = [
    '10S', '10T', '10U',
    '11S', '11T', '11U',
    '12S', '12T', '12U',
    '13R', '13S', '13T', '13U',
    '14R', '14S', '14T', '14U',
    '15R', '15S', '15T', '15U',
    '16R', '16S', '16T',
    '17R', '17S', '17T',
    '18S', '18T',
    '19T',
    # '12R', '16U'
]

months = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

# TODO: Add support for setting a minimum number of months in the year
#   and minimum number of months in the growing season
#   (min_month_count is currently assigned with the input parameters)
# min_month_count = 6
# min_gs_month_count = 3


## Python Imports

In [3]:
from datetime import datetime
import os
import pprint
import random

#import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
#import seaborn as sns
import sklearn.metrics
import sklearn.linear_model

from whittaker_eilers import WhittakerSmoother

## Read the point CSV files

In [4]:
# Build a single points dataframe from the per MGRS grid zone points CSV files

# Only referenced by the commented-out CSV write at the end of this cell
overwrite_flag = False

# Build each candidate path once (NLCD year is the outer loop, MGRS grid zone the inner loop)
points_csv_paths = [
    os.path.join(points_folder, f'points_{mgrs_zone}_{nlcd_year}.csv')
    for nlcd_year in nlcd_years
    for mgrs_zone in mgrs_zones
]

# Read the separate points CSV files, skipping zone/year combinations without a file
points_df_list = [
    pd.read_csv(csv_path, index_col=None, header=0)
    for csv_path in points_csv_paths
    if os.path.isfile(csv_path)
]

# pd.concat() raises an unhelpful "No objects to concatenate" error on an empty list,
#   so fail early with a message that points at the actual problem
if not points_df_list:
    raise FileNotFoundError(f'No points CSV files were found in folder "{points_folder}"')

points_df = pd.concat(points_df_list, axis=0, ignore_index=True)
print(f'Points: {len(points_df.index)}')

# The mgrs_zone value will eventually be added to the csv files
# Until then it is taken from the first three characters of the MGRS tile
points_df['mgrs_zone'] = points_df['mgrs_tile'].str.slice(0, 3)

# Add a unique ID to the points dataframe
# The ID is "<MGRS tile>_nlcd<NLCD class>_<counter within the tile/class group>"
points_df['index_group'] = points_df.groupby(['mgrs_tile', 'nlcd']).cumcount()
points_df['point_id'] = (
    points_df["mgrs_tile"].str.upper() + '_' +
    'nlcd' + points_df["nlcd"].astype(str).str.zfill(2) + '_' +
    points_df["index_group"].astype(str).str.zfill(2)
)
del points_df['index_group']

# Round the lat and lon to 8 decimal places (probably should be 6)
points_df['latitude'] = points_df['latitude'].round(8)
points_df['longitude'] = points_df['longitude'].round(8)

# # Write to CSV
# if not os.path.isfile(points_csv) or overwrite_flag:
#     print('Writing points csv')
#     points_df.to_csv(points_csv, index=False)


Points: 17084


## Read the data CSV files

This block may take a little while to execute

In [5]:
# Read the data CSV for each MGRS grid zone and split it into one dataframe per point
print('Reading mgrs data csv files')

# NLCD classes that should not be present in the data
# TODO: This probably isn't needed and switch to a check instead of masking
nlcd_skip_list = [11, 12, 21, 22, 23]

data_df_dict = {}
for mgrs_zone in mgrs_zones:
    # print(mgrs_zone)
    data_csv_path = os.path.join(data_folder, f'data_{mgrs_zone}.csv')
    if not os.path.isfile(data_csv_path):
        continue

    mgrs_df = pd.read_csv(data_csv_path, index_col=None, header=0)

    # Set MGRS value to upper case
    # (at some point change this in all the data CSV files)
    mgrs_df['mgrs_tile'] = mgrs_df['mgrs_tile'].str.upper()
    mgrs_df['mgrs_zone'] = mgrs_df['mgrs_zone'].str.upper()

    # Compute the ET fraction
    mgrs_df['etf'] = mgrs_df['et'] / mgrs_df['eto']

    # Get the year and month (the month is used for computing climos)
    mgrs_df['date'] = pd.to_datetime(mgrs_df['date'])
    mgrs_df['year'] = mgrs_df['date'].dt.year
    mgrs_df['month'] = mgrs_df['date'].dt.month

    # Drop all of the skipped NLCD classes with a single mask
    mgrs_df = mgrs_df[~mgrs_df['nlcd'].isin(nlcd_skip_list)]

    # Save a date indexed dataframe for each point
    # A single groupby pass replaces rescanning the full zone dataframe once per point;
    #   sort=False keeps the points in order of first appearance.
    # Rows with a missing point_id are not assigned to any group.
    for point_id, site_df in mgrs_df.groupby('point_id', sort=False):
        data_df_dict[point_id] = site_df.set_index('date', drop=True).sort_index()
    del mgrs_df

print('\nDone')

Reading mgrs data csv files

Done


## Compute the ETf climos

In [6]:
# Maximum ETf for each site
# The full period of record is assumed to be okay to use for this
print('\nComputing maximum ETf')
etf_max_dict = {}
for point_id, point_df in data_df_dict.items():
    etf_max_dict[point_id] = float(point_df['etf'].max())

# Monthly climatologies for each site
# A climo value is only kept if the month has at least "n" values
print('\nComputing monthly climatologies')
month_climo_count_min = 2
month_climo_dict = {}
for point_id, point_df in data_df_dict.items():
    site_climo = point_df.groupby(['month']).agg(
        etf=('etf', 'mean'),
        etf_median=('etf', 'median'),
        count=('etf', 'count'),
        et=('et', 'mean'),
        eto=('eto', 'mean'),
    )
    # The eto climo and the count are intentionally left in place for the sparse months
    sparse_months = site_climo['count'] < month_climo_count_min
    site_climo.loc[sparse_months, ['etf', 'etf_median', 'et']] = np.nan
    month_climo_dict[point_id] = site_climo
    del site_climo, sparse_months
del point_df

# # Compute climos for each target year that have the target year values excluded
# # CGM - This might be worth testing more but doesn't seem worth the time to generate
# print('\nComputing monthly climatologies for target years')
# month_climo_dict = {}
# for point_id in data_df_dict.keys():
#     if point_id not in month_climo_dict.keys():
#         month_climo_dict[point_id] = {}
#     for year in stats_years:
#         month_climo_dict[point_id][year] = (
#             data_df_dict[point_id][data_df_dict[point_id].year != year]
#             .groupby(['month'])
#             .agg(
#                 etf=('etf', 'mean'), 
#                 etf_median=('etf', 'median'), 
#                 count=('etf', 'count'), 
#                 et=('et', 'mean'), 
#                 eto=('eto', 'mean'),
#             )
#         )

# TODO: Join the climo values to the data dictionaries
#   It might be faster to join the climo data here instead of looking up in the function

print('\nDone')


Computing maximum ETf

Computing monthly climatologies

Done


## Functions for computing filled values and summary statistics

In [7]:
def generate_windows(
        point_id_list, 
        months=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], 
        years=[2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024], 
        exclude_months_without_climo=True
):
    """Yield a three year window dataframe for each point and target year

    The site dataframes, monthly climos, and minimum month count are read from
    the global data_df_dict, month_climo_dict, and min_month_count variables.

    Parameters
    ----------
    point_id_list : iterable
        Point IDs (keys of data_df_dict and month_climo_dict) to process.
    months : list of int
        Months of the target year that are flagged in the yielded mask.
    years : list of int
        Target years.
    exclude_months_without_climo : bool
        If True, set 'etf', 'et', and 'count' to NaN in the window for every
        row whose month has a NaN 'etf' climo.

    Yields
    ------
    point_id
    year : int
    window_df : pandas.DataFrame
        Copy of the site rows for the previous, target, and next year.
    year_month_mask : boolean array
        True for the window rows that are in the target year and target months.

    Notes
    -----
    A point/year combination is skipped when the target year has fewer than
    min_month_count 'etf' values, when all of the target months are NaN, or
    when the previous or next year has no 'etf' values at all.

    """
    for point_id in point_id_list:
        # Assume the data df dictionary exists in the global scope
        site_df = data_df_dict[point_id]
        site_years = site_df.index.year

        for year in years:
            # Three year window so that there are images to interpolate and fill from
            in_window = (site_years >= (year - 1)) & (site_years <= (year + 1))
            window_df = site_df[in_window].copy()

            # TODO: Test out adding the climo values to the window_df here
            #   instead of in the compute_filled_values() function below
            if exclude_months_without_climo:
                # Look up the climo for each window row and blank the rows without one
                climo_df = pd.merge(
                    window_df[['month']], month_climo_dict[point_id]['etf'],
                    how="left", on="month",
                )
                window_df.loc[climo_df['etf'].isna().values, ['etf', 'et', 'count']] = np.nan

            window_years = window_df.index.year
            year_mask = window_years == year
            year_month_mask = year_mask & window_df.index.month.isin(months)

            # Not enough months with data in the target year
            if window_df.loc[year_mask, 'etf'].count() < min_month_count:
                continue

            # No data in any of the target months
            if window_df.loc[year_month_mask, 'etf'].isna().all():
                continue

            # No data in the previous year to interpolate from
            if window_df.loc[window_years == (year - 1), 'etf'].isna().all():
                continue

            # No data in the next year to interpolate from
            if window_df.loc[window_years == (year + 1), 'etf'].isna().all():
                continue

            yield point_id, year, window_df, year_month_mask


# Filled value columns that are compared against the original value
comparison_cols = [
    # Linear interpolation and monthly climos
    'interpolate',
    'climo_mean',
    'climo_median',
    # Blends of the interpolated and climo values
    'interp_clim_a',
    'interp_clim_b',
    'interp_clim_c',
    # Whittaker smoothing for each lambda value
    'whit_a_0p50',
    'whit_a_0p20',
    'whit_a_0p10',
    'whit_a_0p05',
    'whit_a_0p01',
]


def compute_filled_values(window_df, tgt_indices, point_id):
    """Mask the target rows and estimate their ETf with each gap filling method

    The monthly climos and the ETf maximum for the point are read from the
    global month_climo_dict and etf_max_dict variables.

    Parameters
    ----------
    window_df : pandas.DataFrame
        Date indexed window with 'etf', 'et', and 'count' columns.
        Modified in place: the target rows are set to NaN in those columns and
        a 'temp' column (ETf with NaN replaced by -1) is added.
    tgt_indices : pandas.DatetimeIndex
        Index values of the rows to mask and fill.
    point_id : str
        Point ID formatted as "<mgrs>_nlcd<class>_<counter>".

    Returns
    -------
    list of dict
        One dictionary per target index with the 'index', 'point_id', 'mgrs',
        'nlcd', and 'original' values followed by one filled value per method.
        An empty list is returned if no row is left with a non-zero weight.

    """
    # Get a copy of the target values before clearing
    original_etf = window_df.loc[tgt_indices, 'etf'].to_numpy(copy=True)

    # Set the target row values to NaN
    window_df.loc[tgt_indices, ['etf', 'et', 'count']] = np.nan

    # Setup the Whittaker Smoothing for the full dataframe outside of the index loop
    # The smoothing function needs all nans filled with a value
    # The fill value is not important as long as the weight value is set to 0
    etf_is_valid = window_df['etf'].notna()
    window_df['temp'] = window_df['etf'].fillna(-1)
    etf = window_df['temp'].values

    # Default weights with 1 for data and 0 for missing values
    # The weight is forced to 0 wherever ETf is NaN so that the -1 placeholder can never
    #   influence the smoother, even if a row has a count value without an ETf value
    weight_a = window_df['count'].clip(lower=1, upper=1).fillna(0).where(etf_is_valid, 0)
    if not weight_a.any():
        # Only names that exist in this scope can be reported here
        print(f'{point_id} - all weights 0, skipping')
        return []

    # CGM - I tested out building the smoother once and then updating lambda,
    #   but it didn't seem any faster
    whit_lambdas = {
        'whit_a_0p50': 0.5,
        'whit_a_0p20': 0.2,
        'whit_a_0p10': 0.1,
        'whit_a_0p05': 0.05,
        'whit_a_0p01': 0.01,
    }
    whit_smoothed = {
        name: WhittakerSmoother(
            lmbda=lmbda, order=2, data_length=len(weight_a), weights=weight_a
        ).smooth(etf)
        for name, lmbda in whit_lambdas.items()
    }

    # CGM - I was testing out trying different weights but it didn't seem to change the values at all
    # # Compute weights based on the the scene count value
    # # Set count 0 images to a weight of 0
    # weight = window_df['count'].clip(lower=0, upper=1).fillna(0)

    # # Compute weights based on the the scene count value
    # # Set counts of 0 to a weight of 0.5 and all other to 1    
    # weight = window_df['count'].add(1).clip(upper=2).divide(2).fillna(0)

    # # Compute weights based on the scene count value
    # # Set count weights as: 0 -> 0, 1 -> 0.5, 2+ -> 1
    # weight = window_df['count'].fillna(0).clip(upper=2).divide(2)

    # The values below do not change between targets so compute them once
    interp_etf = window_df['etf'].interpolate(method='linear')
    site_climo = month_climo_dict[point_id]
    etf_max = etf_max_dict[point_id]
    point_id_parts = point_id.split('_')
    mgrs = point_id_parts[0]
    nlcd = int(point_id_parts[1][4:6])

    # Fixed weight given to the interpolated value in the count weighted blend
    climo_months = 10

    # Process each target index separately
    values = []
    tgt_positions = window_df.index.get_indexer(tgt_indices)
    for i, (tgt_index, tgt_i) in enumerate(zip(tgt_indices, tgt_positions)):

        interp_value = interp_etf.loc[tgt_index]

        # Climos for all years
        climo_mean = site_climo.loc[tgt_index.month, 'etf']
        climo_count = site_climo.loc[tgt_index.month, 'count']
        climo_median = site_climo.loc[tgt_index.month, 'etf_median']
        # # Climos with the target year excluded (not sure if this matters)
        # climo_mean = month_climo_dict[point_id][tgt_index.year].loc[tgt_index.month, 'etf']
        # climo_count = month_climo_dict[point_id][tgt_index.year].loc[tgt_index.month, 'count']
        # climo_median = month_climo_dict[point_id][tgt_index.year].loc[tgt_index.month, 'etf_median']

        # Compute various combinations of averaging the climo and interpolate values
        # Simple mean
        interp_clim_a = (climo_mean + interp_value) / 2
        # Simple mean with the median climo
        interp_clim_c = (climo_median + interp_value) / 2
        # Weight the climo based on the number of months in the climo?
        interp_clim_b = (climo_mean * climo_count + interp_value * climo_months) / (climo_count + climo_months)

        filled = {
            'index': tgt_index,
            'point_id': point_id,
            'mgrs': mgrs,
            'nlcd': nlcd,
            'original': original_etf[i],
            # Filled values
            'interpolate': interp_value,
            'climo_mean': climo_mean,
            'climo_median': climo_median,
            'interp_clim_a': interp_clim_a,
            'interp_clim_b': interp_clim_b,
            'interp_clim_c': interp_clim_c,
        }
        # Clamp the smoothed values to the range [0, site maximum ETf]
        for name, smoothed in whit_smoothed.items():
            filled[name] = min(max(smoothed[tgt_i], 0), etf_max)
        values.append(filled)

    return values


def comparison_stats(df, x_col='original', y_cols=[]):
    """Print a table of error and regression statistics for each filled value column

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe with the reference column and the filled value columns.
    x_col : str
        Name of the reference (original value) column.
    y_cols : list of str
        Names of the filled value columns to compare against the reference.

    Returns
    -------
    None
        One row is printed per column with the RMSE, MAE, mean bias error
        (filled minus reference), the slope, intercept, and score of a linear
        regression of the filled value on the reference, and the row count.

    """
    # TODO: Build the format strings based on the number of parameters instead of hardcoding
    print('  {:>16s} {:>8s} {:>8s} {:>8s} {:>8s} {:>8s} {:>8s} {:>8s}'.format(
        'method', 'rmse', 'mae', 'mbe', 'm', 'b', 'r2', 'n'
    ))
    for y_col in y_cols:
        # Remove rows with a NaN in either column before computing statistics
        # (the sklearn functions raise an exception on NaN values)
        stat_df = df[df[x_col].notna() & df[y_col].notna()]

        if stat_df.empty:
            # The statistics can't be computed without data, so report an empty row
            print('  {:>16s} {:>8s} {:>8s} {:>8s} {:>8s} {:>8s} {:>8s} {:8d}'.format(
                y_col, 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 0
            ))
            continue

        model = sklearn.linear_model.LinearRegression()
        model.fit(stat_df[[x_col]], stat_df[y_col])

        print('  {:>16s} {:8.4f} {:8.4f} {:8.4f} {:8.4f} {:8.4f} {:8.4f} {:8d}'.format(
            y_col,
            sklearn.metrics.root_mean_squared_error(stat_df[x_col], stat_df[y_col]),
            sklearn.metrics.mean_absolute_error(stat_df[x_col], stat_df[y_col]),
            # Mean bias error is computed as filled minus reference
            np.mean(stat_df[y_col] - stat_df[x_col]),
            # np.mean(stat_df[x_col] - stat_df[y_col]),
            # sklearn.metrics.r2_score(stat_df[x_col], stat_df[y_col]),
            model.coef_[0],
            model.intercept_, 
            model.score(stat_df[[x_col]], stat_df[y_col]),
            # Number of rows with both a reference and a filled value
            len(stat_df),
        ))


## Filter to the target MGRS and NLCD classes

In [8]:
# Keep the points that are in the target NLCD classes and MGRS grid zones
# An empty keep list turns that filter off
# The NLCD class and MGRS grid zone are parsed from the point ID,
#   which is formatted as "<mgrs tile>_nlcd<class>_<counter>"
point_id_list = [
    point_id
    for point_id in data_df_dict.keys()
    if (not point_nlcd_keep_list or int(point_id.split('_')[1][4:6]) in point_nlcd_keep_list)
    and (not point_mgrs_keep_list or point_id.split('_')[0][0:3] in point_mgrs_keep_list)
]

print(f'Points: {len(point_id_list)}')

Points: 883


## Randomly drop one datapoint in each year

In [9]:
# Candidate months for the dropped value (all months of the year)
months = list(range(1, 13))

output_list = []
windows = generate_windows(point_id_list, months=months)
for point_id, year, window_df, year_month_mask in windows:
    # Target candidates are the rows in the target year/months that have data
    has_data = window_df['etf'].notna()
    tgt_mask = has_data & year_month_mask

    if tgt_mask.any():
        # Mask one randomly selected candidate and fill it with each method
        tgt_indices = window_df[tgt_mask].sample(n=1).index
        output_list += compute_filled_values(window_df, tgt_indices, point_id)
    else:
        print(f'{point_id} - {year} - no unmasked months, skipping')

output_df = pd.DataFrame(output_list)
comparison_stats(output_df, x_col='original', y_cols=comparison_cols)


            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.1299   0.0979   0.0031   0.7968   0.1155   0.7718     7035
        climo_mean   0.1232   0.0924  -0.0000   0.7955   0.1131   0.7940     7035
      climo_median   0.1278   0.0897  -0.0041   0.8145   0.0985   0.7802     7035
     interp_clim_a   0.1111   0.0844   0.0016   0.7961   0.1143   0.8343     7035
     interp_clim_b   0.1125   0.0853   0.0022   0.7950   0.1156   0.8297     7035
     interp_clim_c   0.1122   0.0836  -0.0005   0.8057   0.1070   0.8299     7035
       whit_a_0p50   0.1321   0.0992   0.0030   0.8141   0.1058   0.7663     7035
       whit_a_0p20   0.1317   0.0984   0.0026   0.8362   0.0932   0.7704     7035
       whit_a_0p10   0.1326   0.0989   0.0023   0.8470   0.0869   0.7694     7035
       whit_a_0p05   0.1339   0.0998   0.0020   0.8540   0.0828   0.7666     7035
       whit_a_0p01   0.1363   0.1015   0.0017   0.8608   0.0786   0.7611     7035


## Randomly drop a single datapoint from the "growing" season (Apr-Sept)

In [10]:
# Candidate months for the dropped value (April through September)
months = list(range(4, 10))

output_list = []
windows = generate_windows(point_id_list, months=months)
for point_id, year, window_df, year_month_mask in windows:
    # Target candidates are the rows in the target year/months that have data
    has_data = window_df['etf'].notna()
    tgt_mask = has_data & year_month_mask

    if tgt_mask.any():
        # Mask one randomly selected candidate and fill it with each method
        tgt_indices = window_df[tgt_mask].sample(n=1).index
        output_list += compute_filled_values(window_df, tgt_indices, point_id)
    else:
        print(f'{point_id} - {year} - no unmasked months, skipping')

output_df = pd.DataFrame(output_list)
comparison_stats(output_df, x_col='original', y_cols=comparison_cols)

            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.1136   0.0864  -0.0089   0.8290   0.0935   0.8372     7035
        climo_mean   0.1181   0.0878  -0.0014   0.8164   0.1086   0.8229     7035
      climo_median   0.1215   0.0850  -0.0033   0.8380   0.0937   0.8134     7035
     interp_clim_a   0.1011   0.0767  -0.0052   0.8227   0.1010   0.8736     7035
     interp_clim_b   0.1014   0.0770  -0.0053   0.8224   0.1011   0.8727     7035
     interp_clim_c   0.1017   0.0758  -0.0061   0.8335   0.0936   0.8708     7035
       whit_a_0p50   0.1168   0.0887  -0.0039   0.8416   0.0910   0.8271     7035
       whit_a_0p20   0.1151   0.0870  -0.0020   0.8621   0.0806   0.8329     7035
       whit_a_0p10   0.1152   0.0870  -0.0015   0.8721   0.0751   0.8334     7035
       whit_a_0p05   0.1159   0.0875  -0.0013   0.8786   0.0714   0.8320     7035
       whit_a_0p01   0.1176   0.0888  -0.0013   0.8849   0.0676   0.8282     7035


## Randomly drop a single datapoint from the "non-growing" season (Oct-Mar)

In [11]:
# Candidate months for the dropped value (October through March)
months = [10, 11, 12] + [1, 2, 3]

output_list = []
windows = generate_windows(point_id_list, months=months)
for point_id, year, window_df, year_month_mask in windows:
    # Target candidates are the rows in the target year/months that have data
    has_data = window_df['etf'].notna()
    tgt_mask = has_data & year_month_mask

    if tgt_mask.any():
        # Mask one randomly selected candidate and fill it with each method
        tgt_indices = window_df[tgt_mask].sample(n=1).index
        output_list += compute_filled_values(window_df, tgt_indices, point_id)
    else:
        print(f'{point_id} - {year} - no unmasked months, skipping')

output_df = pd.DataFrame(output_list)
comparison_stats(output_df, x_col='original', y_cols=comparison_cols)

            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.1576   0.1212   0.0297   0.6968   0.1684   0.5848     6910
        climo_mean   0.1339   0.1028   0.0029   0.6712   0.1534   0.6735     6910
      climo_median   0.1379   0.0982  -0.0038   0.6919   0.1371   0.6560     6910
     interp_clim_a   0.1292   0.0998   0.0163   0.6840   0.1609   0.7013     6910
     interp_clim_b   0.1331   0.1023   0.0184   0.6807   0.1645   0.6834     6910
     interp_clim_c   0.1297   0.0984   0.0129   0.6944   0.1528   0.6967     6910
       whit_a_0p50   0.1570   0.1199   0.0157   0.7044   0.1510   0.5814     6910
       whit_a_0p20   0.1583   0.1205   0.0097   0.7172   0.1391   0.5784     6910
       whit_a_0p10   0.1606   0.1221   0.0073   0.7245   0.1334   0.5719     6910
       whit_a_0p05   0.1631   0.1239   0.0061   0.7295   0.1298   0.5644     6910
       whit_a_0p01   0.1669   0.1266   0.0050   0.7348   0.1263   0.5529     6910


## Randomly drop a datapoint from the winter (Dec-Feb)

In [12]:
# Candidate months for the dropped value (December through February)
months = [12] + [1, 2]

output_list = []
windows = generate_windows(point_id_list, months=months)
for point_id, year, window_df, year_month_mask in windows:
    # Target candidates are the rows in the target year/months that have data
    has_data = window_df['etf'].notna()
    tgt_mask = has_data & year_month_mask

    if tgt_mask.any():
        # Mask one randomly selected candidate and fill it with each method
        tgt_indices = window_df[tgt_mask].sample(n=1).index
        output_list += compute_filled_values(window_df, tgt_indices, point_id)
    else:
        print(f'{point_id} - {year} - no unmasked months, skipping')

output_df = pd.DataFrame(output_list)
comparison_stats(output_df, x_col='original', y_cols=comparison_cols)

            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.1712   0.1301   0.0324   0.6272   0.1821   0.5369     3699
        climo_mean   0.1407   0.1085   0.0059   0.6607   0.1422   0.6659     3699
      climo_median   0.1452   0.0998  -0.0043   0.6859   0.1219   0.6461     3699
     interp_clim_a   0.1390   0.1071   0.0191   0.6439   0.1622   0.6812     3699
     interp_clim_b   0.1492   0.1141   0.0248   0.6365   0.1708   0.6338     3699
     interp_clim_c   0.1393   0.1042   0.0140   0.6565   0.1520   0.6755     3699
       whit_a_0p50   0.1707   0.1287   0.0231   0.6423   0.1668   0.5367     3699
       whit_a_0p20   0.1723   0.1287   0.0185   0.6679   0.1519   0.5359     3699
       whit_a_0p10   0.1747   0.1300   0.0165   0.6817   0.1443   0.5311     3699
       whit_a_0p05   0.1773   0.1317   0.0154   0.6910   0.1395   0.5249     3699
       whit_a_0p01   0.1812   0.1345   0.0145   0.7001   0.1349   0.5149     3699


## Randomly drop a single datapoint that is next to an existing missing data point 

In [13]:
months = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

output_list = []
for point_id, year, window_df, year_month_mask in generate_windows(point_id_list, months=months):        

    # Skip the year if there are no NaN values
    if not window_df.loc[year_month_mask, 'etf'].isna().any():
        # print(f'{point_id} - {year} - no missing data points, skipping')
        continue
    
    # For the target year, pick a random month that is next to a missing/masked month but has data
    # fill_value=False keeps the shifted masks boolean (no NaN at the window edges),
    #   so the first/last rows are simply treated as not having a missing neighbor
    nan_mask = window_df['etf'].isna()
    neighbor_nan_mask = (
        nan_mask.shift(1, fill_value=False) | nan_mask.shift(-1, fill_value=False)
    )
    tgt_mask = neighbor_nan_mask & window_df['etf'].notna() & year_month_mask
    if not tgt_mask.any():
        # Only the point ID and year are available here (there is no loop counter in this cell)
        print(f'{point_id} - {year} - no unmasked months, skipping')
        continue
            
    tgt_indices = window_df.loc[tgt_mask].sample(n=1).index
    output_list.extend(compute_filled_values(window_df, tgt_indices, point_id))
    
output_df = pd.DataFrame(output_list)
comparison_stats(output_df, x_col='original', y_cols=comparison_cols)

            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.1770   0.1367   0.0265   0.6716   0.2002   0.5671     6394
        climo_mean   0.1406   0.1075  -0.0042   0.7006   0.1542   0.7082     6394
      climo_median   0.1464   0.1038  -0.0097   0.7132   0.1420   0.6857     6394
     interp_clim_a   0.1425   0.1097   0.0112   0.6861   0.1772   0.7024     6394
     interp_clim_b   0.1477   0.1134   0.0137   0.6776   0.1842   0.6806     6394
     interp_clim_c   0.1441   0.1089   0.0084   0.6924   0.1711   0.6944     6394
       whit_a_0p50   0.1792   0.1377   0.0208   0.7095   0.1745   0.5675     6394
       whit_a_0p20   0.1811   0.1386   0.0148   0.7270   0.1592   0.5650     6394
       whit_a_0p10   0.1840   0.1408   0.0113   0.7346   0.1517   0.5579     6394
       whit_a_0p05   0.1870   0.1433   0.0089   0.7389   0.1470   0.5496     6394
       whit_a_0p01   0.1916   0.1471   0.0063   0.7422   0.1426   0.5368     6394


## Randomly drop a two month gap during the year

But only check the filled value in one month of the gap

In [14]:
dropped_months = 2
months = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

output_list = []
for point_id, year, window_df, year_month_mask in generate_windows(point_id_list, months=months):        

    # For the target year, identify all the gap windows with at least 1 month of data
    # A row can start a gap if it or one of the following (dropped_months - 1) rows has data
    # This approach is assuming there will be 2 or more months in the gap
    # fill_value=False keeps the shifted masks boolean (no NaN at the end of the window)
    has_data = window_df['etf'].notna()
    gap_mask = has_data.copy()
    for i in range(dropped_months-1):
        gap_mask = gap_mask | has_data.shift(-(i+1), fill_value=False)
        
    tgt_mask = year_month_mask & gap_mask
    if not tgt_mask.any():
        print(f'{point_id} - {year} - no unmasked months, skipping')
        continue
            
    # Randomly select the first month of the gap
    tgt_indices = window_df.loc[tgt_mask].sample(n=1).index

    # Add an extra month index for each dropped month
    # The next month is computed with a DateOffset, so the index frequency doesn't need to be set
    # NOTE: assumes window_df has a row for each of the following months - TODO confirm
    for i in range(dropped_months-1):
        tgt_indices = tgt_indices.append(pd.DatetimeIndex([tgt_indices[-1] + pd.DateOffset(months=1)]))

    values = compute_filled_values(window_df, tgt_indices, point_id)

    # Only keep values dictionaries that had data originally
    values = [v for v in values if not np.isnan(v['original'])]

    # Only keep 1 of the filled values from the window
    # The list is empty if the filled values could not be computed,
    #   and random.sample() raises a ValueError on an empty list
    if values:
        output_list.extend(random.sample(values, 1))

    del window_df, tgt_mask, tgt_indices, gap_mask, has_data
    
output_df = pd.DataFrame(output_list)
comparison_stats(output_df, x_col='original', y_cols=comparison_cols)

            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.1560   0.1187   0.0058   0.7273   0.1550   0.6724     7035
        climo_mean   0.1274   0.0955  -0.0001   0.7836   0.1183   0.7782     7035
      climo_median   0.1323   0.0925  -0.0042   0.8030   0.1036   0.7630     7035
     interp_clim_a   0.1245   0.0948   0.0028   0.7555   0.1367   0.7899     7035
     interp_clim_b   0.1274   0.0966   0.0038   0.7511   0.1400   0.7795     7035
     interp_clim_c   0.1253   0.0937   0.0008   0.7652   0.1293   0.7860     7035
       whit_a_0p50   0.1584   0.1196   0.0051   0.7674   0.1324   0.6712     7035
       whit_a_0p20   0.1599   0.1202   0.0043   0.7893   0.1196   0.6715     7035
       whit_a_0p10   0.1622   0.1219   0.0039   0.7998   0.1134   0.6672     7035
       whit_a_0p05   0.1646   0.1238   0.0035   0.8063   0.1095   0.6616     7035
       whit_a_0p01   0.1682   0.1267   0.0032   0.8124   0.1058   0.6525     7035


## Randomly drop a two month gap during the growing season (Apr-Sept)

In [15]:
dropped_months = 2
months = [4, 5, 6, 7, 8, 9]

output_list = []
for point_id, year, window_df, year_month_mask in generate_windows(point_id_list, months=months):        

    # For the target year, identify all the gap windows with at least 1 month of data
    # A row can start a gap if it or one of the following (dropped_months - 1) rows has data
    # This approach is assuming there will be 2 or more months in the gap
    # fill_value=False keeps the shifted masks boolean (no NaN at the end of the window)
    has_data = window_df['etf'].notna()
    gap_mask = has_data.copy()
    for i in range(dropped_months-1):
        gap_mask = gap_mask | has_data.shift(-(i+1), fill_value=False)
        
    tgt_mask = year_month_mask & gap_mask
    if not tgt_mask.any():
        print(f'{point_id} - {year} - no unmasked months, skipping')
        continue
            
    # Randomly select the first month of the gap
    tgt_indices = window_df.loc[tgt_mask].sample(n=1).index

    # Add an extra month index for each dropped month
    # The next month is computed with a DateOffset, so the index frequency doesn't need to be set
    # NOTE: assumes window_df has a row for each of the following months - TODO confirm
    for i in range(dropped_months-1):
        tgt_indices = tgt_indices.append(pd.DatetimeIndex([tgt_indices[-1] + pd.DateOffset(months=1)]))
    
    values = compute_filled_values(window_df, tgt_indices, point_id)

    # Only keep values dictionaries that had data originally
    values = [v for v in values if not np.isnan(v['original'])]

    # Only keep 1 of the filled values from the window
    # The list is empty if the filled values could not be computed,
    #   and random.sample() raises a ValueError on an empty list
    if values:
        output_list.extend(random.sample(values, 1))

    del window_df, tgt_mask, tgt_indices, gap_mask, has_data
    
output_df = pd.DataFrame(output_list)
comparison_stats(output_df, x_col='original', y_cols=comparison_cols)

            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.1388   0.1061  -0.0131   0.7751   0.1188   0.7584     7035
        climo_mean   0.1151   0.0863   0.0003   0.8306   0.0996   0.8320     7035
      climo_median   0.1191   0.0839  -0.0020   0.8519   0.0849   0.8214     7035
     interp_clim_a   0.1095   0.0839  -0.0064   0.8029   0.1092   0.8512     7035
     interp_clim_b   0.1101   0.0843  -0.0066   0.8019   0.1096   0.8495     7035
     interp_clim_c   0.1101   0.0832  -0.0075   0.8135   0.1018   0.8484     7035
       whit_a_0p50   0.1429   0.1083  -0.0058   0.8148   0.1028   0.7477     7035
       whit_a_0p20   0.1428   0.1080  -0.0037   0.8384   0.0911   0.7517     7035
       whit_a_0p10   0.1439   0.1087  -0.0028   0.8497   0.0853   0.7507     7035
       whit_a_0p05   0.1455   0.1098  -0.0023   0.8568   0.0816   0.7478     7035
       whit_a_0p01   0.1481   0.1118  -0.0018   0.8638   0.0781   0.7420     7035


## Randomly drop a two month gap during the non-growing season

In [16]:
dropped_months = 2
months = [10, 11, 12, 1, 2, 3]

output_list = []
for point_id, year, window_df, year_month_mask in generate_windows(point_id_list, months=months):        

    # For the target year, identify all the gap windows with at least 1 month of data
    # A row can start a gap if it or one of the following (dropped_months - 1) rows has data
    # This approach is assuming there will be 2 or more months in the gap
    # fill_value=False keeps the shifted masks boolean (no NaN at the end of the window)
    has_data = window_df['etf'].notna()
    gap_mask = has_data.copy()
    for i in range(dropped_months-1):
        gap_mask = gap_mask | has_data.shift(-(i+1), fill_value=False)
        
    tgt_mask = year_month_mask & gap_mask
    if not tgt_mask.any():
        print(f'{point_id} - {year} - no unmasked months, skipping')
        continue
            
    # Randomly select the first month of the gap
    tgt_indices = window_df.loc[tgt_mask].sample(n=1).index

    # Add an extra month index for each dropped month
    # The next month is computed with a DateOffset, so the index frequency doesn't need to be set
    # NOTE: assumes window_df has a row for each of the following months - TODO confirm
    for i in range(dropped_months-1):
        tgt_indices = tgt_indices.append(pd.DatetimeIndex([tgt_indices[-1] + pd.DateOffset(months=1)]))
    
    values = compute_filled_values(window_df, tgt_indices, point_id)

    # Only keep values dictionaries that had data originally
    values = [v for v in values if not np.isnan(v['original'])]

    # Only keep 1 of the filled values from the window
    # The list is empty if the filled values could not be computed,
    #   and random.sample() raises a ValueError on an empty list
    if values:
        output_list.extend(random.sample(values, 1))

    del window_df, tgt_mask, tgt_indices, gap_mask, has_data
    
output_df = pd.DataFrame(output_list)
comparison_stats(output_df, x_col='original', y_cols=comparison_cols)

            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.1793   0.1388   0.0409   0.6633   0.2011   0.5102     6910
        climo_mean   0.1379   0.1061  -0.0014   0.6602   0.1602   0.6660     6910
      climo_median   0.1427   0.1015  -0.0084   0.6791   0.1442   0.6452     6910
     interp_clim_a   0.1406   0.1093   0.0197   0.6618   0.1806   0.6593     6910
     interp_clim_b   0.1465   0.1133   0.0232   0.6552   0.1872   0.6331     6910
     interp_clim_c   0.1412   0.1077   0.0162   0.6712   0.1726   0.6545     6910
       whit_a_0p50   0.1794   0.1376   0.0246   0.6871   0.1734   0.5083     6910
       whit_a_0p20   0.1820   0.1389   0.0170   0.7006   0.1594   0.5018     6910
       whit_a_0p10   0.1855   0.1414   0.0135   0.7068   0.1529   0.4922     6910
       whit_a_0p05   0.1890   0.1441   0.0112   0.7104   0.1490   0.4823     6910
       whit_a_0p01   0.1942   0.1481   0.0090   0.7137   0.1452   0.4679     6910


## Randomly drop a three month gap during the year

In [17]:
# Simulate a gap of `dropped_months` consecutive months whose first month is
# one of `months` (here: any month of the year), then compare each filled
# value against the original value for one randomly chosen month in the gap.
dropped_months = 3
months = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

output_list = []
for point_id, year, window_df, year_month_mask in generate_windows(point_id_list, months=months):

    # For the target year, identify all the gap windows with at least 1 month of data
    # A month is a valid gap start if it, or any of the following
    #   (dropped_months - 1) months, has an etf value
    # fill_value=False keeps the shifted mask boolean (no NaN/object dtype)
    has_data = window_df['etf'].notna()
    gap_mask = has_data
    for i in range(1, dropped_months):
        gap_mask = gap_mask | has_data.shift(-i, fill_value=False)

    tgt_mask = year_month_mask & gap_mask
    if not tgt_mask.any():
        print(f'{point_id} - {year} - no unmasked months, skipping')
        continue

    # Randomly pick the first month of the gap
    tgt_indices = window_df.loc[tgt_mask].sample(n=1).index

    # Add an extra month index for each dropped month
    # The step is done explicitly with DateOffset, so no index freq is needed
    for i in range(dropped_months-1):
        tgt_indices = tgt_indices.append(pd.DatetimeIndex([tgt_indices[-1] + pd.DateOffset(months=1)]))

    values = compute_filled_values(window_df, tgt_indices, point_id)

    # Only keep values dictionaries that had data originally
    values = [v for v in values if not np.isnan(v['original'])]
    if not values:
        # random.sample() raises ValueError on an empty population
        print(f'{point_id} - {year} - no original values in the gap, skipping')
        continue

    # Only keep 1 of the filled values from the window
    output_list.extend(random.sample(values, 1))

    del window_df, tgt_mask, tgt_indices, gap_mask, has_data

output_df = pd.DataFrame(output_list)
comparison_stats(output_df, x_col='original', y_cols=comparison_cols)

            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.1752   0.1347   0.0023   0.6819   0.1775   0.5976     7035
        climo_mean   0.1253   0.0939  -0.0022   0.7883   0.1144   0.7882     7035
      climo_median   0.1299   0.0905  -0.0063   0.8100   0.0984   0.7743     7035
     interp_clim_a   0.1306   0.1000   0.0001   0.7351   0.1460   0.7713     7035
     interp_clim_b   0.1351   0.1027   0.0015   0.7280   0.1513   0.7547     7035
     interp_clim_c   0.1312   0.0990  -0.0020   0.7460   0.1380   0.7684     7035
       whit_a_0p50   0.1796   0.1364   0.0034   0.7383   0.1476   0.5977     7035
       whit_a_0p20   0.1826   0.1383   0.0024   0.7625   0.1333   0.5964     7035
       whit_a_0p10   0.1857   0.1408   0.0017   0.7733   0.1265   0.5908     7035
       whit_a_0p05   0.1888   0.1431   0.0011   0.7798   0.1224   0.5844     7035
       whit_a_0p01   0.1930   0.1465   0.0004   0.7855   0.1185   0.5747     7035


## Randomly drop a three month gap during the growing season (Apr-Sept)

In [18]:
# Simulate a gap of `dropped_months` consecutive months whose first month is
# one of `months` (here: April through September), then compare each filled
# value against the original value for one randomly chosen month in the gap.
dropped_months = 3
months = [4, 5, 6, 7, 8, 9]

output_list = []
for point_id, year, window_df, year_month_mask in generate_windows(point_id_list, months=months):

    # For the target year, identify all the gap windows with at least 1 month of data
    # A month is a valid gap start if it, or any of the following
    #   (dropped_months - 1) months, has an etf value
    # fill_value=False keeps the shifted mask boolean (no NaN/object dtype)
    has_data = window_df['etf'].notna()
    gap_mask = has_data
    for i in range(1, dropped_months):
        gap_mask = gap_mask | has_data.shift(-i, fill_value=False)

    tgt_mask = year_month_mask & gap_mask
    if not tgt_mask.any():
        print(f'{point_id} - {year} - no unmasked months, skipping')
        continue

    # Randomly pick the first month of the gap
    tgt_indices = window_df.loc[tgt_mask].sample(n=1).index

    # Add an extra month index for each dropped month
    # The step is done explicitly with DateOffset, so no index freq is needed
    for i in range(dropped_months-1):
        tgt_indices = tgt_indices.append(pd.DatetimeIndex([tgt_indices[-1] + pd.DateOffset(months=1)]))

    values = compute_filled_values(window_df, tgt_indices, point_id)

    # Only keep values dictionaries that had data originally
    values = [v for v in values if not np.isnan(v['original'])]
    if not values:
        # random.sample() raises ValueError on an empty population
        print(f'{point_id} - {year} - no original values in the gap, skipping')
        continue

    # Only keep 1 of the filled values from the window
    output_list.extend(random.sample(values, 1))

    del window_df, tgt_mask, tgt_indices, gap_mask, has_data

output_df = pd.DataFrame(output_list)
comparison_stats(output_df, x_col='original', y_cols=comparison_cols)

            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.1640   0.1243  -0.0148   0.7062   0.1536   0.6675     7035
        climo_mean   0.1167   0.0874  -0.0001   0.8231   0.1013   0.8293     7035
      climo_median   0.1201   0.0847  -0.0026   0.8439   0.0869   0.8200     7035
     interp_clim_a   0.1209   0.0927  -0.0074   0.7647   0.1274   0.8214     7035
     interp_clim_b   0.1217   0.0932  -0.0074   0.7634   0.1281   0.8186     7035
     interp_clim_c   0.1208   0.0917  -0.0087   0.7751   0.1202   0.8203     7035
       whit_a_0p50   0.1696   0.1263  -0.0094   0.7650   0.1253   0.6577     7035
       whit_a_0p20   0.1717   0.1276  -0.0086   0.7907   0.1113   0.6580     7035
       whit_a_0p10   0.1745   0.1297  -0.0084   0.8026   0.1047   0.6532     7035
       whit_a_0p05   0.1772   0.1319  -0.0083   0.8098   0.1007   0.6476     7035
       whit_a_0p01   0.1811   0.1352  -0.0084   0.8164   0.0969   0.6388     7035


## Randomly drop a three month gap during the non-growing season (Oct-Mar)

In [19]:
# Simulate a gap of `dropped_months` consecutive months whose first month is
# one of `months` (here: October through March), then compare each filled
# value against the original value for one randomly chosen month in the gap.
dropped_months = 3
months = [10, 11, 12, 1, 2, 3]

output_list = []
for point_id, year, window_df, year_month_mask in generate_windows(point_id_list, months=months):

    # For the target year, identify all the gap windows with at least 1 month of data
    # A month is a valid gap start if it, or any of the following
    #   (dropped_months - 1) months, has an etf value
    # fill_value=False keeps the shifted mask boolean (no NaN/object dtype)
    has_data = window_df['etf'].notna()
    gap_mask = has_data
    for i in range(1, dropped_months):
        gap_mask = gap_mask | has_data.shift(-i, fill_value=False)

    tgt_mask = year_month_mask & gap_mask
    if not tgt_mask.any():
        print(f'{point_id} - {year} - no unmasked months, skipping')
        continue

    # Randomly pick the first month of the gap
    tgt_indices = window_df.loc[tgt_mask].sample(n=1).index

    # Add an extra month index for each dropped month
    # The step is done explicitly with DateOffset, so no index freq is needed
    for i in range(dropped_months-1):
        tgt_indices = tgt_indices.append(pd.DatetimeIndex([tgt_indices[-1] + pd.DateOffset(months=1)]))

    values = compute_filled_values(window_df, tgt_indices, point_id)

    # Only keep values dictionaries that had data originally
    values = [v for v in values if not np.isnan(v['original'])]
    if not values:
        # random.sample() raises ValueError on an empty population
        print(f'{point_id} - {year} - no original values in the gap, skipping')
        continue

    # Only keep 1 of the filled values from the window
    output_list.extend(random.sample(values, 1))

    del window_df, tgt_mask, tgt_indices, gap_mask, has_data

output_df = pd.DataFrame(output_list)
comparison_stats(output_df, x_col='original', y_cols=comparison_cols)

            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.1923   0.1483   0.0395   0.6421   0.2202   0.4933     6910
        climo_mean   0.1381   0.1062  -0.0011   0.6968   0.1520   0.7031     6910
      climo_median   0.1436   0.1015  -0.0065   0.7158   0.1370   0.6816     6910
     interp_clim_a   0.1461   0.1133   0.0192   0.6694   0.1861   0.6733     6910
     interp_clim_b   0.1540   0.1186   0.0232   0.6567   0.1966   0.6395     6910
     interp_clim_c   0.1469   0.1120   0.0165   0.6789   0.1786   0.6683     6910
       whit_a_0p50   0.1941   0.1482   0.0303   0.6830   0.1903   0.4972     6910
       whit_a_0p20   0.1980   0.1505   0.0256   0.6970   0.1786   0.4888     6910
       whit_a_0p10   0.2019   0.1531   0.0232   0.7025   0.1734   0.4785     6910
       whit_a_0p05   0.2057   0.1556   0.0216   0.7052   0.1704   0.4684     6910
       whit_a_0p01   0.2108   0.1594   0.0198   0.7071   0.1677   0.4544     6910


## Randomly drop a four month gap during the year

In [20]:
# Simulate a gap of `dropped_months` consecutive months whose first month is
# one of `months` (here: any month of the year), then compare each filled
# value against the original value for one randomly chosen month in the gap.
dropped_months = 4
months = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

output_list = []
for point_id, year, window_df, year_month_mask in generate_windows(point_id_list, months=months):

    # For the target year, identify all the gap windows with at least 1 month of data
    # A month is a valid gap start if it, or any of the following
    #   (dropped_months - 1) months, has an etf value
    # fill_value=False keeps the shifted mask boolean (no NaN/object dtype)
    has_data = window_df['etf'].notna()
    gap_mask = has_data
    for i in range(1, dropped_months):
        gap_mask = gap_mask | has_data.shift(-i, fill_value=False)

    tgt_mask = year_month_mask & gap_mask
    if not tgt_mask.any():
        print(f'{point_id} - {year} - no unmasked months, skipping')
        continue

    # Randomly pick the first month of the gap
    tgt_indices = window_df.loc[tgt_mask].sample(n=1).index

    # Add an extra month index for each dropped month
    # The step is done explicitly with DateOffset, so no index freq is needed
    for i in range(dropped_months-1):
        tgt_indices = tgt_indices.append(pd.DatetimeIndex([tgt_indices[-1] + pd.DateOffset(months=1)]))

    values = compute_filled_values(window_df, tgt_indices, point_id)

    # Only keep values dictionaries that had data originally
    values = [v for v in values if not np.isnan(v['original'])]
    if not values:
        # random.sample() raises ValueError on an empty population
        print(f'{point_id} - {year} - no original values in the gap, skipping')
        continue

    # Only keep 1 of the filled values from the window
    output_list.extend(random.sample(values, 1))

    del window_df, tgt_mask, tgt_indices, gap_mask, has_data

output_df = pd.DataFrame(output_list)
comparison_stats(output_df, x_col='original', y_cols=comparison_cols)

            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.1960   0.1501   0.0061   0.6246   0.2123   0.5097     7035
        climo_mean   0.1283   0.0961  -0.0013   0.7732   0.1233   0.7787     7035
      climo_median   0.1334   0.0928  -0.0059   0.7902   0.1093   0.7622     7035
     interp_clim_a   0.1410   0.1078   0.0024   0.6989   0.1678   0.7346     7035
     interp_clim_b   0.1458   0.1107   0.0043   0.6918   0.1735   0.7153     7035
     interp_clim_c   0.1417   0.1068   0.0001   0.7074   0.1608   0.7309     7035
       whit_a_0p50   0.2053   0.1548   0.0070   0.6845   0.1803   0.5010     7035
       whit_a_0p20   0.2090   0.1568   0.0061   0.7104   0.1652   0.5010     7035
       whit_a_0p10   0.2127   0.1595   0.0055   0.7217   0.1583   0.4955     7035
       whit_a_0p05   0.2162   0.1623   0.0050   0.7280   0.1544   0.4888     7035
       whit_a_0p01   0.2213   0.1665   0.0044   0.7331   0.1510   0.4782     7035


## Randomly drop a four month gap during the growing season (Apr-Sept)

In [21]:
# Simulate a gap of `dropped_months` consecutive months whose first month is
# one of `months` (here: April through September), then compare each filled
# value against the original value for one randomly chosen month in the gap.
dropped_months = 4
months = [4, 5, 6, 7, 8, 9]

output_list = []
for point_id, year, window_df, year_month_mask in generate_windows(point_id_list, months=months):

    # For the target year, identify all the gap windows with at least 1 month of data
    # A month is a valid gap start if it, or any of the following
    #   (dropped_months - 1) months, has an etf value
    # fill_value=False keeps the shifted mask boolean (no NaN/object dtype)
    has_data = window_df['etf'].notna()
    gap_mask = has_data
    for i in range(1, dropped_months):
        gap_mask = gap_mask | has_data.shift(-i, fill_value=False)

    tgt_mask = year_month_mask & gap_mask
    if not tgt_mask.any():
        print(f'{point_id} - {year} - no unmasked months, skipping')
        continue

    # Randomly pick the first month of the gap
    tgt_indices = window_df.loc[tgt_mask].sample(n=1).index

    # Add an extra month index for each dropped month
    # The step is done explicitly with DateOffset, so no index freq is needed
    for i in range(dropped_months-1):
        tgt_indices = tgt_indices.append(pd.DatetimeIndex([tgt_indices[-1] + pd.DateOffset(months=1)]))

    values = compute_filled_values(window_df, tgt_indices, point_id)

    # Only keep values dictionaries that had data originally
    values = [v for v in values if not np.isnan(v['original'])]
    if not values:
        # random.sample() raises ValueError on an empty population
        print(f'{point_id} - {year} - no original values in the gap, skipping')
        continue

    # Only keep 1 of the filled values from the window
    output_list.extend(random.sample(values, 1))

    del window_df, tgt_mask, tgt_indices, gap_mask, has_data

output_df = pd.DataFrame(output_list)
comparison_stats(output_df, x_col='original', y_cols=comparison_cols)

            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.1893   0.1449  -0.0008   0.6363   0.2003   0.5512     7035
        climo_mean   0.1157   0.0870   0.0024   0.8228   0.1003   0.8276     7035
      climo_median   0.1187   0.0836  -0.0009   0.8429   0.0860   0.8190     7035
     interp_clim_a   0.1304   0.1003   0.0008   0.7295   0.1503   0.7846     7035
     interp_clim_b   0.1326   0.1016   0.0014   0.7261   0.1528   0.7768     7035
     interp_clim_c   0.1299   0.0990  -0.0009   0.7396   0.1431   0.7852     7035
       whit_a_0p50   0.2004   0.1501  -0.0005   0.7063   0.1618   0.5363     7035
       whit_a_0p20   0.2030   0.1523  -0.0023   0.7311   0.1463   0.5381     7035
       whit_a_0p10   0.2065   0.1555  -0.0034   0.7412   0.1396   0.5324     7035
       whit_a_0p05   0.2100   0.1587  -0.0043   0.7464   0.1359   0.5250     7035
       whit_a_0p01   0.2151   0.1630  -0.0051   0.7502   0.1330   0.5131     7035


## Randomly drop a four month gap during the non-growing season (Oct-Mar)

In [22]:
# Simulate a gap of `dropped_months` consecutive months whose first month is
# one of `months` (here: October through March), then compare each filled
# value against the original value for one randomly chosen month in the gap.
dropped_months = 4
months = [10, 11, 12, 1, 2, 3]

output_list = []
for point_id, year, window_df, year_month_mask in generate_windows(point_id_list, months=months):

    # For the target year, identify all the gap windows with at least 1 month of data
    # A month is a valid gap start if it, or any of the following
    #   (dropped_months - 1) months, has an etf value
    # fill_value=False keeps the shifted mask boolean (no NaN/object dtype)
    has_data = window_df['etf'].notna()
    gap_mask = has_data
    for i in range(1, dropped_months):
        gap_mask = gap_mask | has_data.shift(-i, fill_value=False)

    tgt_mask = year_month_mask & gap_mask
    if not tgt_mask.any():
        print(f'{point_id} - {year} - no unmasked months, skipping')
        continue

    # Randomly pick the first month of the gap
    tgt_indices = window_df.loc[tgt_mask].sample(n=1).index

    # Add an extra month index for each dropped month
    # The step is done explicitly with DateOffset, so no index freq is needed
    for i in range(dropped_months-1):
        tgt_indices = tgt_indices.append(pd.DatetimeIndex([tgt_indices[-1] + pd.DateOffset(months=1)]))

    values = compute_filled_values(window_df, tgt_indices, point_id)

    # Only keep values dictionaries that had data originally
    values = [v for v in values if not np.isnan(v['original'])]
    if not values:
        # random.sample() raises ValueError on an empty population
        print(f'{point_id} - {year} - no original values in the gap, skipping')
        continue

    # Only keep 1 of the filled values from the window
    output_list.extend(random.sample(values, 1))

    del window_df, tgt_mask, tgt_indices, gap_mask, has_data

output_df = pd.DataFrame(output_list)
comparison_stats(output_df, x_col='original', y_cols=comparison_cols)

            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.1927   0.1485   0.0209   0.6495   0.2069   0.5086     6910
        climo_mean   0.1375   0.1050  -0.0007   0.7241   0.1457   0.7270     6910
      climo_median   0.1428   0.1008  -0.0060   0.7422   0.1308   0.7075     6910
     interp_clim_a   0.1453   0.1125   0.0101   0.6868   0.1763   0.6967     6910
     interp_clim_b   0.1524   0.1172   0.0133   0.6766   0.1849   0.6672     6910
     interp_clim_c   0.1465   0.1117   0.0075   0.6958   0.1688   0.6907     6910
       whit_a_0p50   0.1956   0.1491   0.0186   0.7042   0.1755   0.5185     6910
       whit_a_0p20   0.1996   0.1516   0.0162   0.7245   0.1624   0.5141     6910
       whit_a_0p10   0.2037   0.1546   0.0148   0.7337   0.1561   0.5062     6910
       whit_a_0p05   0.2074   0.1575   0.0138   0.7389   0.1523   0.4978     6910
       whit_a_0p01   0.2126   0.1616   0.0126   0.7429   0.1490   0.4854     6910


## Randomly drop a six month gap during the year

In [23]:
# Simulate a gap of `dropped_months` consecutive months whose first month is
# one of `months` (here: any month of the year), then compare each filled
# value against the original value for one randomly chosen month in the gap.
dropped_months = 6
months = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

output_list = []
for point_id, year, window_df, year_month_mask in generate_windows(point_id_list, months=months):

    # For the target year, identify all the gap windows with at least 1 month of data
    # A month is a valid gap start if it, or any of the following
    #   (dropped_months - 1) months, has an etf value
    # fill_value=False keeps the shifted mask boolean (no NaN/object dtype)
    has_data = window_df['etf'].notna()
    gap_mask = has_data
    for i in range(1, dropped_months):
        gap_mask = gap_mask | has_data.shift(-i, fill_value=False)

    tgt_mask = year_month_mask & gap_mask
    if not tgt_mask.any():
        print(f'{point_id} - {year} - no unmasked months, skipping')
        continue

    # Randomly pick the first month of the gap
    tgt_indices = window_df.loc[tgt_mask].sample(n=1).index

    # Add an extra month index for each dropped month
    # The step is done explicitly with DateOffset, so no index freq is needed
    for i in range(dropped_months-1):
        tgt_indices = tgt_indices.append(pd.DatetimeIndex([tgt_indices[-1] + pd.DateOffset(months=1)]))

    values = compute_filled_values(window_df, tgt_indices, point_id)

    # Only keep values dictionaries that had data originally
    values = [v for v in values if not np.isnan(v['original'])]
    if not values:
        # random.sample() raises ValueError on an empty population
        print(f'{point_id} - {year} - no original values in the gap, skipping')
        continue

    # Only keep 1 of the filled values from the window
    output_list.extend(random.sample(values, 1))

    del window_df, tgt_mask, tgt_indices, gap_mask, has_data

output_df = pd.DataFrame(output_list)
comparison_stats(output_df, x_col='original', y_cols=comparison_cols)

            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.2189   0.1704   0.0047   0.5714   0.2391   0.4103     7035
        climo_mean   0.1235   0.0934  -0.0005   0.7881   0.1154   0.7922     7035
      climo_median   0.1277   0.0897  -0.0045   0.8110   0.0989   0.7794     7035
     interp_clim_a   0.1466   0.1144   0.0021   0.6798   0.1773   0.7084     7035
     interp_clim_b   0.1520   0.1181   0.0041   0.6708   0.1841   0.6857     7035
     interp_clim_c   0.1464   0.1133   0.0001   0.6912   0.1690   0.7082     7035
       whit_a_0p50   0.2436   0.1852   0.0025   0.6156   0.2127   0.3644     7035
       whit_a_0p20   0.2480   0.1878   0.0005   0.6367   0.1992   0.3646     7035
       whit_a_0p10   0.2524   0.1911  -0.0008   0.6454   0.1931   0.3593     7035
       whit_a_0p05   0.2566   0.1944  -0.0019   0.6499   0.1895   0.3528     7035
       whit_a_0p01   0.2624   0.1992  -0.0031   0.6530   0.1867   0.3427     7035


## Randomly drop a six month gap during the growing season (Apr-Sept)

In [24]:
# Simulate a gap of `dropped_months` consecutive months whose first month is
# one of `months` (here: April through September), then compare each filled
# value against the original value for one randomly chosen month in the gap.
dropped_months = 6
months = [4, 5, 6, 7, 8, 9]

output_list = []
for point_id, year, window_df, year_month_mask in generate_windows(point_id_list, months=months):

    # For the target year, identify all the gap windows with at least 1 month of data
    # A month is a valid gap start if it, or any of the following
    #   (dropped_months - 1) months, has an etf value
    # fill_value=False keeps the shifted mask boolean (no NaN/object dtype)
    has_data = window_df['etf'].notna()
    gap_mask = has_data
    for i in range(1, dropped_months):
        gap_mask = gap_mask | has_data.shift(-i, fill_value=False)

    tgt_mask = year_month_mask & gap_mask
    if not tgt_mask.any():
        print(f'{point_id} - {year} - no unmasked months, skipping')
        continue

    # Randomly pick the first month of the gap
    tgt_indices = window_df.loc[tgt_mask].sample(n=1).index

    # Add an extra month index for each dropped month
    # The step is done explicitly with DateOffset, so no index freq is needed
    for i in range(dropped_months-1):
        tgt_indices = tgt_indices.append(pd.DatetimeIndex([tgt_indices[-1] + pd.DateOffset(months=1)]))

    values = compute_filled_values(window_df, tgt_indices, point_id)

    # Only keep values dictionaries that had data originally
    values = [v for v in values if not np.isnan(v['original'])]
    if not values:
        # random.sample() raises ValueError on an empty population
        print(f'{point_id} - {year} - no original values in the gap, skipping')
        continue

    # Only keep 1 of the filled values from the window
    output_list.extend(random.sample(values, 1))

    del window_df, tgt_mask, tgt_indices, gap_mask, has_data

output_df = pd.DataFrame(output_list)
comparison_stats(output_df, x_col='original', y_cols=comparison_cols)

            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.2265   0.1766   0.0277   0.5490   0.2684   0.3897     7035
        climo_mean   0.1182   0.0890  -0.0010   0.8036   0.1038   0.8136     7035
      climo_median   0.1221   0.0863  -0.0052   0.8247   0.0884   0.8017     7035
     interp_clim_a   0.1473   0.1150   0.0134   0.6763   0.1861   0.7146     7035
     interp_clim_b   0.1520   0.1180   0.0155   0.6680   0.1927   0.6956     7035
     interp_clim_c   0.1467   0.1139   0.0113   0.6868   0.1784   0.7151     7035
       whit_a_0p50   0.2619   0.1999   0.0202   0.5864   0.2409   0.3171     7035
       whit_a_0p20   0.2661   0.2024   0.0156   0.6090   0.2243   0.3193     7035
       whit_a_0p10   0.2708   0.2059   0.0135   0.6179   0.2174   0.3146     7035
       whit_a_0p05   0.2752   0.2096   0.0122   0.6223   0.2137   0.3087     7035
       whit_a_0p01   0.2808   0.2144   0.0111   0.6251   0.2112   0.3000     7035


## Randomly drop a six month gap during the non-growing season (Oct-Mar)

In [25]:
# Simulate a gap of `dropped_months` consecutive months whose first month is
# one of `months` (here: October through March), then compare each filled
# value against the original value for one randomly chosen month in the gap.
dropped_months = 6
months = [10, 11, 12, 1, 2, 3]

output_list = []
for point_id, year, window_df, year_month_mask in generate_windows(point_id_list, months=months):

    # For the target year, identify all the gap windows with at least 1 month of data
    # A month is a valid gap start if it, or any of the following
    #   (dropped_months - 1) months, has an etf value
    # fill_value=False keeps the shifted mask boolean (no NaN/object dtype)
    has_data = window_df['etf'].notna()
    gap_mask = has_data
    for i in range(1, dropped_months):
        gap_mask = gap_mask | has_data.shift(-i, fill_value=False)

    tgt_mask = year_month_mask & gap_mask
    if not tgt_mask.any():
        print(f'{point_id} - {year} - no unmasked months, skipping')
        continue

    # Randomly pick the first month of the gap
    tgt_indices = window_df.loc[tgt_mask].sample(n=1).index

    # Add an extra month index for each dropped month
    # The step is done explicitly with DateOffset, so no index freq is needed
    for i in range(dropped_months-1):
        tgt_indices = tgt_indices.append(pd.DatetimeIndex([tgt_indices[-1] + pd.DateOffset(months=1)]))

    values = compute_filled_values(window_df, tgt_indices, point_id)

    # Only keep values dictionaries that had data originally
    values = [v for v in values if not np.isnan(v['original'])]
    if not values:
        # random.sample() raises ValueError on an empty population
        print(f'{point_id} - {year} - no original values in the gap, skipping')
        continue

    # Only keep 1 of the filled values from the window
    output_list.extend(random.sample(values, 1))

    del window_df, tgt_mask, tgt_indices, gap_mask, has_data

output_df = pd.DataFrame(output_list)
comparison_stats(output_df, x_col='original', y_cols=comparison_cols)

            method     rmse      mae      mbe        m        b       r2        n
       interpolate   0.2121   0.1642  -0.0293   0.6031   0.1959   0.4467     6910
        climo_mean   0.1287   0.0982  -0.0012   0.7729   0.1277   0.7717     6910
      climo_median   0.1328   0.0940  -0.0055   0.7958   0.1103   0.7590     6910
     interp_clim_a   0.1482   0.1139  -0.0152   0.6880   0.1618   0.7006     6910
     interp_clim_b   0.1549   0.1185  -0.0137   0.6763   0.1699   0.6719     6910
     interp_clim_c   0.1489   0.1133  -0.0174   0.6995   0.1531   0.6988     6910
       whit_a_0p50   0.2214   0.1691  -0.0256   0.6564   0.1693   0.4400     6910
       whit_a_0p20   0.2258   0.1716  -0.0239   0.6780   0.1588   0.4375     6910
       whit_a_0p10   0.2304   0.1749  -0.0237   0.6871   0.1538   0.4299     6910
       whit_a_0p05   0.2349   0.1782  -0.0240   0.6921   0.1507   0.4215     6910
       whit_a_0p01   0.2410   0.1830  -0.0247   0.6957   0.1479   0.4089     6910


### Old plotting function code

In [26]:
# sns.pairplot(
#     output_df[['original'] + comparison_cols], corner=True, kind='reg',
#     plot_kws={'scatter_kws': {'s': 2, 'alpha': 0.2}, 'line_kws': {'color': 'red'}},
# )

In [27]:
# sns.pairplot(output_df[['original'] + comparison_cols], corner=True, kind='hist')

In [28]:
# sns.pairplot(output_df[['original'] + comparison_cols], corner=True, kind='kde')

In [29]:
# g = sns.pairplot(
#     output_df, x_vars = ['original'], y_vars = comparison_cols, kind='reg', 
#     plot_kws={'scatter_kws': {'s': 1, 'alpha': 0.1}, 'line_kws': {'color': 'red'}},
# )
# for ax in g.axes.flatten():
#     if ax:
#         ax.set_xlim(0, 1.2)
#         ax.set_ylim(0, 1.2)
#         ax.set_aspect('equal')
#         ax.set_xticks([0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.2])
#         ax.axline((0, 0), slope=1, color='gray', linestyle='--')  # linewidth=1
# plt.show()

In [30]:
# sns.pairplot(
#     output_df[['original'] + comparison_cols], corner=True, kind='reg',
#     plot_kws={'scatter_kws': {'s': 2, 'alpha': 0.1}, 'line_kws': {'color': 'red'}},
# )